예제 #1
0
def extractSelectionResponse(respList):
    """
    Pull the chosen options out of an in-text selection item (reading
    assessments).

    Each entry of ``respList`` is inspected through its ``"val"`` key; an
    entry counts as chosen when that value equals the string ``"true"``.
    The positions of the chosen entries are reported 1-based, as strings.

    :param respList: the json structure with responses; an iterable of
        mappings that each carry a "val" key
    :return: list of the 1-based positions (as strings) of the chosen
        options, e.g. ["1", "3"]; if anything raises while extracting, the
        error is logged and ``respList`` itself is returned unchanged
    """
    try:
        chosen = []
        # position is the 1-based option number; option tells whether it is chosen
        for position, option in enumerate(respList, start=1):
            if option["val"] == "true":
                chosen.append(str(position))
        return chosen
    except Exception as err:
        logger.error("extractSelectionResponse:")
        logger.exception(err)
        # second log entry carrying the formatted traceback text
        logger.error('Uncaught exception in worker process:\n%s',
                     traceback.format_exc())
        # fall back to handing the raw input back to the caller
        return respList
예제 #2
0
def reconByConfig(df, config):
    """
    Auxiliary function that reconstructs responses under one configuration.

    The observable data frame ``df`` (expected to be pre-filtered to the events of interest) is grouped by
    ``config["byVars"]``; ``config["dispatcher"]`` (the reconstruction function) is applied to every group and
    also receives the whole configuration through its ``config`` keyword; the index is then reset and the
    result is piped through ``config["postprocessor"]`` to clean up the output.

    :param df: the observable data frame, typically containing a single event type
    :param config: the configuration dict; must contain "byVars", "dispatcher" and "postprocessor"
    :return: the reconstructed data frame, or an empty data frame if any processing step raises
    :raises AssertionError: if one of the required configuration keys is missing
    """
    # the three required entries, checked in the same order as they are used
    for requiredKey in ("byVars", "dispatcher", "postprocessor"):
        assert requiredKey in config

    try:
        grouped = df.groupby(config["byVars"])
        dispatched = grouped.apply(config["dispatcher"], config=config)
        return dispatched.reset_index().pipe(config["postprocessor"])
    except Exception as err:
        logger.error("reconByConfig:")
        logger.exception(err)
        # errors are reported as "no data" rather than propagated
        return pd.DataFrame()
예제 #3
0
def extractMCResponse(respList, maxNumberOfOptions=10):
    """
    Extract multiple-choice responses, converting every selected option through ``num2alpha``.

    The input is of the following format:

    [{u'val': u'true', u'key': u'4'}]

    The "key" of every entry is handed to ``num2alpha`` (defined elsewhere in this module). According to the
    original documentation the example above yields ["D"], i.e. options come back as letters (and as numbers
    when out of range) -- that mapping lives in ``num2alpha`` and is not visible here.

    If there is no response (empty input), [] is returned.
    If the conversion raises, the error is logged and ``respList`` is returned unchanged.

    :param respList: the json structure with responses
    :param maxNumberOfOptions: optional max number of options for MC; defaults to 10. It is accepted for
        interface compatibility but is not used by this function's body.
    :return: list with one converted option per entry of ``respList``
    """
    try:
        letters = []
        for resp in respList:
            letters.append(num2alpha(resp["key"]))
        return letters
    except Exception as err:
        logger.error("extractMCResponse:")
        logger.exception(err)
        # second log entry carrying the formatted traceback text
        logger.error('Uncaught exception in worker process:\n%s',
                     traceback.format_exc())
        # fall back to handing the raw input back to the caller
        return respList
예제 #4
0
def postProcessSBTTextSelectionResp(dfSBTTextSelectionResp):
    """
    Auxiliary function to post-process TextSelection responses for SBTs. The input df is the output of
    reconSBTTextSelection() with reset_index() applied.

    The column read here is "ReconstructedAnswer"; each of its cells is expected to hold a list of dicts,
    presumably shaped like
    [{'ReconstructedAnswer': 'selection-22', 'ResponseComponentId': 'item-SelectExamples'}]
    (the earlier documentation named the inner key 'ExtractedAnswer', while the code below groups the melted
    data on "ResponseComponentId" and reads "ReconstructedAnswer" -- TODO confirm against
    reconSBTTextSelection()).

    Processing is a 2-level groupby():
      1. per ["BookletNumber", "BlockCode"], the lists in "ReconstructedAnswer" are concatenated (via sum())
         and turned into a data frame, giving one row per individual selection;
      2. per ["BookletNumber", "BlockCode", "ResponseComponentId"], the selections are sorted and collected
         into one list, e.g. ['selection-22', 'selection-23', 'selection-27'].

    If the input has no "ResponseComponentId" column, one is first added to the input frame (in place) by
    applying ``truncate`` to its "ControlId" column; when that is not possible None is returned.

    :param dfSBTTextSelectionResp: the output from reconSBTTextSelection()
    :return: a df with columns ["BookletNumber","BlockCode","ResponseComponentId","ReconstructedAnswer"];
            None if the input is None or empty, or if an error occurs.
    """
    if dfSBTTextSelectionResp is None:
        logger.error(
            "postProcessSBTTextSelectionResp: dfSBTTextSelectionResp is None")
        return None

    if "ResponseComponentId" not in dfSBTTextSelectionResp.columns:
        try:
            controlIds = dfSBTTextSelectionResp["ControlId"]
            dfSBTTextSelectionResp["ResponseComponentId"] = controlIds.apply(
                truncate)
        except:
            return None

    # nothing to condense
    if dfSBTTextSelectionResp.shape[0] <= 0:
        return None

    try:
        # first, melt the data to multiple rows per ResponseComponentId
        perBlock = dfSBTTextSelectionResp.groupby(
            ["BookletNumber", "BlockCode"])
        melted = perBlock.apply(
            lambda grp: pd.DataFrame(grp["ReconstructedAnswer"].sum()))
        melted = melted.reset_index()

        # now recast to one row per ResponseComponentId, with responses in a sorted list
        perComponent = melted.groupby(
            ["BookletNumber", "BlockCode", "ResponseComponentId"])
        condensed = perComponent.apply(
            lambda grp: grp["ReconstructedAnswer"].sort_values().tolist())
        return condensed.rename("ReconstructedAnswer").reset_index()
    except Exception as err:
        logger.error("postProcessSBTTextSelectionResp:")
        logger.exception(err)
        logger.debug(dfSBTTextSelectionResp)
        return None
예제 #5
0
def extractTextResponse(respList):
    """
    Extract responses from Text item types in SBTs, keeping only the "val" of each entry.

    Example:
        [{'key': "Explain message of xxxx",
          'val': 'XXXXX'}]
        -->
        [{'val': 'XXXXX'}]

    :param respList: the json structure with responses; an iterable of mappings that each carry a "val" key
    :return: list of single-key dicts ``{'val': ...}``, one per entry; if anything raises while extracting,
        the error is logged and ``respList`` itself is returned unchanged
    """
    try:
        stripped = []
        for entry in respList:
            stripped.append({'val': entry["val"]})
        return stripped
    except Exception as err:
        logger.error("extractTextResponse:")
        logger.exception(err)
        # second log entry carrying the formatted traceback text
        logger.error('Uncaught exception in worker process:\n%s',
                     traceback.format_exc())
        # fall back to handing the raw input back to the caller
        return respList
예제 #6
0
def extractSbtResponseXML(itemResult, headerDict):
    """Given an XML node "itemResult", return one response record per ``responseDatum`` element.

    Each record combines the student-level fields taken from ``headerDict``, the item-level attributes read
    from ``itemResult`` and the scene/component/type/content of one ``responseDatum`` found under
    ``responseVariable/candidateResponse/value/taskState/responseData``.

    :param itemResult: an xml.etree node that is the itemResult
    :param headerDict: a dictionary with student-level information; the keys "BookletNumber", "Form",
        "Year", "SubjectCode" and "Grade" are read
    :return: a list of dicts (possibly empty), or None if anything raises while parsing
    """
    records = []
    try:
        blockCode = itemResult.get("blockCode")
        itemAccNum = itemResult.get("accessionNumber")
        # sometimes response data is stored under a different AccNum than the ItemAccNum;
        # fall back to the item's own number when "respondedIn" is missing or empty
        accNum = itemResult.get("respondedIn") or itemAccNum
        itemType = itemResult.get("itemType")

        dataNode = itemResult.find(
            'responseVariable/candidateResponse/value/taskState/responseData')
        for datum in dataNode.iter('responseDatum'):
            sceneId = datum.find("sceneId").text
            componentId = datum.find("responseComponentId").text
            responseType = datum.find("responseType").text

            # content is optional; when present it is a sequence of key/value pairs
            contentNode = datum.find("content")
            pairs = []
            if contentNode is not None:
                pairs = [{"key": pair.find("key").text,
                          "val": pair.find("value").text}
                         for pair in contentNode.iter('pair')]

            # student-level fields first, then item-level, then datum-level
            record = {field: headerDict[field]
                      for field in ('BookletNumber', 'Form', 'Year',
                                    'SubjectCode', 'Grade')}
            record['BlockCode'] = blockCode
            record['AccessionNumber'] = accNum
            record['ItemAccessionNumber'] = itemAccNum
            record['ItemTypeCode'] = itemType
            record['ChildItemAccessionNumber'] = sceneId
            record['ChildItemType'] = responseType
            record['ResponseComponentId'] = componentId
            record['Response'] = pairs
            records.append(record)
    except Exception as err:
        logger.error(
            "extractSbtResponseXML: Error parsing the SBT XML itemResult")
        logger.exception(err)
        return None

    return records
예제 #7
0
def extractTextSelectionResponse(respList):
    """
    Extract responses from TextSelection item types in SBTs.

    The "val" of every entry is prefixed with "selection-".

    Example:
        [{u'val': u'3', u'key': u'selectedUnit1'}] --> ['selection-3']

    :param respList: the json structure with responses; an iterable of mappings that each carry a "val" key
    :return: list of "selection-<val>" strings, one per entry; if anything raises while extracting (for
        instance a non-string "val"), the error is logged and ``respList`` itself is returned unchanged
    """
    try:
        selections = []
        for resp in respList:
            selections.append("selection-" + resp["val"])
        return selections
    except Exception as err:
        logger.error("extractTextSelectionResponse:")
        logger.exception(err)
        # second log entry carrying the formatted traceback text
        logger.error('Uncaught exception in worker process:\n%s',
                     traceback.format_exc())
        # fall back to handing the raw input back to the caller
        return respList
예제 #8
0
def parseSbtXML(source, keepResponseData=False):
    """
    Parse the SBT XML of one student and one block into a Pandas data frame.

    The 2019 SBTs add a new ``stateDiff`` field; it is optional here, so XML
    without it still parses (the "StateDiff" column is then None).

    One row is produced per ``observableDatum`` element. With
    ``keepResponseData`` the ``responseDatum`` elements are parsed as well
    and outer-merged onto the observables on "SceneId"; if either side is
    empty the other one is returned on its own.

    :param source: the XML string or an already parsed XML node
    :param keepResponseData: False by default, otherwise combines both
        observable and response data
    :return: a data frame, or None if the booklet-level information cannot
        be read or the data frame(s) cannot be built
    """
    # If source is already an XML node, parsing fails and the node is used
    # as the root directly. An unparsable string ends up as the "root" too
    # and is rejected by the booklet-level lookup right below.
    try:
        root = ET.fromstring(source, parser=ET.XMLParser())
    except Exception:
        root = source

    df = None
    # get top level basic info
    try:
        bookletId = root.findtext('bookletId')
        taskId = root.findtext('taskId')
        blockId = root.findtext('blockId')
        accommodations = root.findtext('accommodations')
        extendedTimeFactor = root.findtext('extendedTimeFactor')
    except Exception as e:
        warnings.warn(
            "ParseSbtXML: XML contains incomplete Booklet level information")
        logger.error(
            "ParseSbtXML: XML contains incomplete Booklet level information")
        logger.exception(e)
        return None

    # get observable data
    observableMatrix = []
    for observableDatum in root.iter('observableDatum'):
        stateDiff = observableDatum.findtext('stateDiff')
        if stateDiff is not None:
            # stateDiff is absent in pre-2019 data; only strip newlines
            # when the element is actually there
            stateDiff = stateDiff.replace("\n", "")

        # Reset per datum so that a datum without <content> does not
        # inherit the content of the previous one.
        ct = None
        for content in observableDatum.iter('content'):
            # ct is stored as a string (it is re-parsed as JSON later on);
            # if several content elements exist the last one wins
            ct = str(parseXMLContentDatum(content))

        observableMatrix.append({
            'BookletNumber': bookletId,
            'BlockId': blockId,
            'TaskId': taskId,
            'Accomodations': accommodations,
            'ExtendedTimeFactor': extendedTimeFactor,
            'SceneId': observableDatum.findtext('sceneId'),
            'ControlId': observableDatum.findtext('controlId'),
            'Label': observableDatum.findtext('eventType'),
            'EventTime': observableDatum.findtext('timestamp'),
            'StateDiff': stateDiff,
            'ExtendedInfo': ct
        })

    # turn the data into a data frame
    if not keepResponseData:
        try:
            df = pd.DataFrame.from_dict(observableMatrix)
        except Exception as e:
            warnings.warn(
                "ParseSbtXML: cannot turn Observable XML into a data frame")
            logger.error(
                "ParseSbtXML: cannot turn Observable XML into a data frame")
            logger.exception(e)
            return None
        return df

    # get response data
    responseMatrix = []
    for responseDatum in root.iter('responseDatum'):
        # same per-datum reset as for the observables above
        ct = None
        for content in responseDatum.iter('content'):
            ct = str(parseXMLContentDatum(content))
        responseMatrix.append({
            'SceneId': responseDatum.findtext('sceneId'),
            'ResponseComponentId': responseDatum.findtext('responseComponentId'),
            'ResponseType': responseDatum.findtext('responseType'),
            'ResponseContent': ct
        })

    try:
        dfObs = pd.DataFrame.from_dict(observableMatrix)
        dfResp = pd.DataFrame.from_dict(responseMatrix)
        if dfResp.empty:
            df = dfObs
        elif dfObs.empty:
            df = dfResp
        else:
            # NOTE(review): the original author questioned whether an
            # "outer" merge on SceneId is really the right join here.
            df = pd.merge(dfObs, dfResp, how='outer', on='SceneId')
    except Exception as e:
        warnings.warn("ParseSbtXML: cannot turn XML into a data frame")
        logger.error("ParseSbtXML: cannot turn XML into a data frame")
        logger.exception(e)
        return None

    return df
예제 #9
0
def parseSbtObservableXML(
    source,
    bl="current",
    bc="current",
    unicodeJunkChar="@"
):  # bl stands for bookletnumber; bc stands for blockcode
    """
    Parse the SBT xml string from SQL Response data table for each block to get Observable data frame.

    This function parses the XML string stored in the SQL Response Data Table for SBT and similar
    black-box components, which keep their own observable data and export
    an XML along with the response data. We have to export this data from
    the responseData SQL database as an XML file. This function takes an
    individual XML per student per block, and returns a Pandas data frame
    with one row per ``observableDatum`` element, sorted by "EventTime".

    :param source: the XML string or an already parsed XML node; a node is used as-is, without any
        warning
    :param bl: booklet number, only used to label warnings and log messages
    :param bc: block code, only used to label warnings and log messages
    :param unicodeJunkChar: the character or string to replace any hexadecimal character reference
        with; defaults to "@"
    :return: a data frame of observables, or None if the XML cannot be parsed, lacks the booklet-level
        elements, holds no observable data, or cannot be converted to a data frame
    """
    logger.debug('From parseSbtobservableXML')

    # Label shared by all messages below; format() also copes with
    # booklet numbers / block codes that are not strings.
    where = "BlockCode {} BookletNumber {}".format(bc, bl)

    if isinstance(source, str):
        try:
            # first, replace all hexadecimal character references by unicodeJunkChar
            # NOTE(review): the pattern does not include the terminating ';',
            # so "&#x41;" becomes "@;" -- kept as is; confirm whether that is intended.
            cleaned = re.sub(r"\&\#x[0-9a-fA-F]+", unicodeJunkChar, source)
            root = ET.fromstring(cleaned)
        except Exception as e:
            msg = where + " XML contains incomplete Booklet level information"
            warnings.warn(msg)
            logger.error(msg)
            logger.exception(e)
            # an unparsable string can never provide the booklet-level elements
            return None
    else:
        # not a string: assume it is an XML node and skip the parsing
        root = source

    observableList = []
    bookletDict = dict()
    # processing top-level xml elements
    try:
        bookletDict["BookletNumber"] = root.find("bookletId").text
        bookletDict["BlockCode"] = root.find("blockId").text
        bookletDict["ItemTypeCode"] = "SBT"
        bookletDict["AccessionNumber"] = root.find("taskId").text
    except Exception as e:
        msg = where + " XML contains incomplete Booklet level information"
        warnings.warn(msg)
        logger.error(msg)
        logger.exception(e)
        return None

    # SBTs actually embed their own XML data; loop through it and save each datum as a row
    for observableDatum in root.iter('observableDatum'):
        # every row starts from a copy of the booklet-level fields
        obsDict = bookletDict.copy()
        obsDict["SceneId"] = observableDatum.findtext('sceneId')
        obsDict["ControlId"] = observableDatum.findtext('controlId')
        obsDict["Label"] = observableDatum.findtext('eventType')
        obsDict["EventTime"] = observableDatum.findtext('timestamp')
        obsDict["ExtendedInfo"] = parseXMLContentDatum(
            observableDatum.find("content"))
        observableList.append(obsDict)

    # if no actual data records, exit with a warning and return None
    if len(observableList) == 0:
        msg = where + " XML contains no data"
        warnings.warn(msg)
        logger.warning(msg)
        return None

    # We have data. Now we create a data frame and parse the ExtendedInfo.
    try:
        df = pd.DataFrame.from_dict(observableList)
        # parse extended info for SBT items
        idx = df["ItemTypeCode"].isin(["SBT", "ReadingNonSbt"])
        df.loc[idx,
               "extInfo"] = df.loc[idx,
                                   "ExtendedInfo"].pipe(parseJSONObservables)
        df = df.sort_values("EventTime")
    except Exception as e:
        warnings.warn("XML data cannot be converted to a data frame")
        logger.error("XML data cannot be converted to a data frame")
        logger.exception(e)
        return None
    return df
예제 #10
0
def xvalBooklets(dfResp, dfObsResp, configObsList, configRespList):
    """
    Cross-validates records for a booklet using data from ready-made data frames. Returns a data frame containing
    extracted responses from the response data table and the reconstructed responses from the observable
    data, for selected item types that the x-val algorithm currently handles.

    Steps, in order:
      1. reconstruct responses from ``dfObsResp`` with reconByConfig(), once per entry of ``configObsList``;
      2. outer-merge the result onto ``dfResp`` by ["BookletNumber", "BlockCode", "ResponseComponentId"];
      3. extract responses with parseItemResponses(), once per entry of ``configRespList``;
      4. compare "ReconstructedAnswer" and "ExtractedAnswer" as sets of strings and record the outcome
         in a new "matched" column (True / False / None).

    :param dfResp: a data frame of response data, from which we extract the responses for each item
    :param dfObsResp: a data frame of observable data, from which we reconstruct responses for each item
    :param configObsList: list containing configurations for processing observables
    :param configRespList: list containing configurations for processing responses
    :return: a data frame that matches the extracted and reconstructed responses, or None if
        reconstruction, merging, extraction or comparison raises
    :raises AssertionError: if ``configObsList`` is empty, an input is not a data frame, or the two
        data frames share no BookletNumber
    """

    # NOTE(review): `&` binds tighter than `>`, so this is evaluated as
    # `len(configObsList) > (0 & (...))`, i.e. it only checks that the list is non-empty; the
    # "itemtypeColumn" membership test never influences the outcome.
    assert (len(configObsList) > 0 & ("itemtypeColumn" in configObsList))
    assert (isinstance(dfResp, pd.DataFrame))
    assert (isinstance(dfObsResp, pd.DataFrame))
    # make sure there are overlapping subjects
    subjlist = list(
        set(dfResp.BookletNumber.unique()).intersection(
            set(dfObsResp.BookletNumber.unique())))
    assert (len(subjlist) > 0)

    ##################
    # recon answers using the configObsList
    # Join the observable data back again

    try:
        # one reconstructed frame per configuration, stacked on top of each other
        dfObs = pd.concat(
            [reconByConfig(dfObsResp, config=c) for c in configObsList])
        # keep only the columns needed for the comparison (skipped when nothing was reconstructed)
        if dfObs.shape[0] > 0:
            dfObs = dfObs.loc[:, [
                'BlockCode', 'BookletNumber', "AccessionNumber",
                'ResponseComponentId', 'ReconstructedAnswer', 'ResponseHistory'
            ]]
    except Exception as e:
        logger.error("xvalBooklets: Error reconstructing responses")
        logger.exception(e)
        return None

    ##################
    # Merge recorded and reconstructed responses

    try:
        # outer join: rows that exist on only one side are kept as well
        dfCompare = pd.merge(
            dfResp,
            dfObs,
            how="outer",
            on=["BookletNumber", "BlockCode", "ResponseComponentId"])
    except Exception as e:
        logger.error(
            "xvalBooklets: Error merging response and observable data")
        logger.exception(e)
        return None

    # Need to transform the extracted responses by the `childItemType`, because `ItemTypeCode` is too gross.
    dfCompare.loc[dfCompare.ItemTypeCode.isin(["MCSS", "BQMCSS"]),
                  "ChildItemType"] = "MCSS"
    dfCompare.loc[dfCompare.ItemTypeCode.isin(["MCMS", "BQMCMS"]),
                  "ChildItemType"] = "MCMS"

    # ## Extract and transform responses to prepare for comparisons
    try:
        # one extracted frame per response configuration, stacked on top of each other
        dfCompare = pd.concat(
            [parseItemResponses(dfCompare, config=c) for c in configRespList])
    except Exception as e:
        logger.error("xvalBooklets: Error extracting responses")
        logger.exception(e)
        return None

    # ## Comparison and discrepancies

    # first, take care of a special case in BQMCMS and BQChoices, where one can add free text as "response"
    idx = dfCompare.ItemTypeCode.isin([
        "BQMCSS", "BQMCMS", "BQChoices"
    ]) & dfCompare["ExtractedAnswer"].notnull()
    # drop the literal entry 'response' from those answer lists
    # NOTE(review): the exclusion list names 'response' twice; the duplicate has no effect.
    dfCompare.loc[idx, "ExtractedAnswer"] = dfCompare.loc[idx, "ExtractedAnswer"] \
        .apply(lambda l: [i for i in l if i not in ['response', 'response']])

    # discrepancies
    try:
        # we take a shortcut here, converting responses to a set of string-values
        # if the response is not a list (e.g. None), then the result is not a set, but a None
        setReconAnswer = dfCompare.loc[:, "ReconstructedAnswer"]\
            .apply(lambda respList: set([str(i) for i in respList]) if isinstance(respList,list) else None)
        setExtraAnswer = dfCompare.loc[:, "ExtractedAnswer"]\
            .apply(lambda respList: set([str(i) for i in respList]) if isinstance(respList,list) else None)

        dfCompare.loc[:, "matched"] = None
        # matched==True iff neither is None and the sets (of strings) are equal
        # (the original author notes that None != None in this element-wise comparison -- TODO confirm)
        idx = setReconAnswer == setExtraAnswer
        dfCompare.loc[idx, "matched"] = True
        # matched==False iff the 2 sets were not equal, or one of them is None, but if both are None, we ignore
        idx = (setReconAnswer != setExtraAnswer)
        dfCompare.loc[idx, "matched"] = False
        # both sides missing: reset to None, overriding the False assigned just above
        dfCompare.loc[setReconAnswer.isnull() & setExtraAnswer.isnull(),
                      "matched"] = None
        # if the extracted response is an empty list and nothing was reconstructed,
        # the response is treated as missing; comparison is True
        idx = dfCompare["ReconstructedAnswer"].isnull() & (
            dfCompare["ExtractedAnswer"].apply(lambda l: l == []))
        #        dfCompare.loc[idx, "matched"] = None
        dfCompare.loc[idx, "matched"] = True
    except Exception as e:
        logger.error(
            "xvalBooklets: Error comparing extracted and reconstructed responses"
        )
        logger.exception(e)
        return None

    return dfCompare
예제 #11
0
def reconSBTItemResponses(df, config=None):
    """Parse SBT process data, reconstruct responses using an array of functions

    Rows of ``df`` are selected by event label (``config["itemtypeColumn"]``), grouped by
    ``config["accnumColumn"]`` and handed to the handler registered for that label in
    ``config["handlers"]``. Labels that share the same handler are processed together in one pass.
    Each handler is called as ``handler(group, accnum=<accnum column name>, itemtype=<itemtype column name>)``.

    :param df: the input data frame
    :type df: Pandas data frame

    :param config: optional configuration object with the keys "itemtypeColumn", "accnumColumn",
        "outputColumn" and "handlers" (a dict mapping event label to handler function). When None,
        a built-in default is used: columns "Label"/"ControlId", output column "ReconAnswer" and
        handlers for "select.drop", "text.blur" and "select.choose".
    :type config: object or None

    :returns: df with two columns, named after ``config["accnumColumn"]`` and
        ``config["outputColumn"]``; None if the input is not a data frame, a configured column is
        missing, no row carries a handled label, or the results cannot be combined
    :rtype: Pandas data frame or None

    """

    # Resolve the default configuration BEFORE validating: the validation
    # below subscripts config, so a None config used to fail there and the
    # default was never reached.
    if config is None:
        config = {
            "itemtypeColumn": "Label",
            "accnumColumn": "ControlId",
            "outputColumn": "ReconAnswer",
            "handlers": {
                "select.drop": reconSBTSelectDrop,
                "text.blur": reconSBTText,
                "select.choose": reconSBTSelectChoice
            }
        }

    # Validate with explicit checks (not assert, which is stripped under -O);
    # any problem with the inputs means "nothing to reconstruct".
    try:
        itemtypeColumn = config["itemtypeColumn"]
        accnumColumn = config["accnumColumn"]
        inputsOk = (isinstance(df, pd.DataFrame)
                    and itemtypeColumn in df.columns
                    and accnumColumn in df.columns)
    except Exception:
        inputsOk = False
    if not inputsOk:
        return None

    handlers = config["handlers"]

    # make sure we have relevant events, else return None
    if df.loc[df[itemtypeColumn].isin(list(handlers.keys()))].shape[0] == 0:
        return None

    # invert the handler table to get `parser: [list of labels]`
    labelsByParser = {}
    for label, parser in handlers.items():
        labelsByParser.setdefault(parser, []).append(label)

    # loop through all parsers and do the conversion
    # TODO: consider ways to parallelize the process, e.g., using dask
    alldata = []
    for parser, eventList in labelsByParser.items():
        idx = df.loc[:, itemtypeColumn].isin(eventList)
        tmp = df.loc[idx, :]\
            .groupby(accnumColumn)\
            .apply(parser, accnum=accnumColumn, itemtype=itemtypeColumn)
        if tmp.shape[0] > 0:
            alldata.append(tmp)

    # concat data; an empty alldata makes pd.concat raise, which is reported as None
    try:
        res = pd.concat(alldata).reset_index()
        res.columns = [accnumColumn, config["outputColumn"]]
    except Exception as e:
        logger.error("reconSBTItemResponses: Returning None due to errors")
        logger.exception(e)
        return None
    return res
예제 #12
0
def parseSbtResponseXML(
    source,
    bl="current",
    bc="current",
    unicodeJunkChar="@"
):  # bl stands for bookletnumber; bc stands for blockcode
    """
    Parse the SBT xml string from SQL Response data table for each block and get Response Data.

    This function parses the XML string stored in the SQL Response Data Table for SBT and similar
    black-box components, which keep their own observable data and export
    an XML along with the response data. We have to export this data from
    the responseData SQL database as an XML file. This function takes an
    individual XML per student per block, and returns a Pandas data frame
    with one row per ``responseDatum`` element; the "Response" column holds the datum's content as a
    list of ``{"key": ..., "val": ...}`` dicts.

    :param source: the XML string or an already parsed XML node; a node is used as-is, without any
        warning
    :param bl: booklet number, only used to label warnings and log messages
    :param bc: block code, only used to label warnings and log messages
    :param unicodeJunkChar: the character or string to replace any hexadecimal character reference
        with; defaults to "@"
    :return: a data frame of response data, or None if the XML cannot be parsed, lacks the
        booklet-level elements, holds no response data, or cannot be converted to a data frame
    """

    # Label shared by all messages below; format() also copes with
    # booklet numbers / block codes that are not strings.
    where = "BlockCode {} BookletNumber {}".format(bc, bl)

    if isinstance(source, str):
        try:
            # first, replace all hexadecimal character references by unicodeJunkChar
            # NOTE(review): the pattern does not include the terminating ';',
            # so "&#x41;" becomes "@;" -- kept as is; confirm whether that is intended.
            cleaned = re.sub(r"\&\#x[0-9a-fA-F]+", unicodeJunkChar, source)
            root = ET.fromstring(cleaned)
        except Exception as e:
            msg = where + " XML contains incomplete Booklet level information"
            warnings.warn(msg)
            logger.error(msg)
            logger.exception(e)
            # an unparsable string can never provide the booklet-level elements
            return None
    else:
        # not a string: assume it is an XML node and skip the parsing
        root = source

    responseList = []
    bookletDict = dict()
    # processing top-level xml elements
    try:
        bookletDict["BookletNumber"] = root.find("bookletId").text
        bookletDict["BlockCode"] = root.find("blockId").text
        bookletDict["ItemTypeCode"] = "SBT"
        bookletDict["AccessionNumber"] = root.find("taskId").text
    except Exception as e:
        msg = where + " XML contains incomplete Booklet level information"
        warnings.warn(msg)
        logger.error(msg)
        logger.exception(e)
        return None

    # SBTs actually embed their own XML data; loop through it and save each datum as a row
    for responseDatum in root.iter('responseDatum'):
        # every row starts from a copy of the booklet-level fields
        respDict = bookletDict.copy()
        respDict["SceneId"] = responseDatum.findtext('sceneId')
        respDict["responseComponentId"] = responseDatum.findtext(
            'responseComponentId')
        respDict["responseType"] = responseDatum.findtext('responseType')
        # content is optional; when present it is a sequence of key/value pairs
        content = responseDatum.find("content")
        ct = []
        if content is not None:
            for pair in content.iter('pair'):
                k = pair.find("key").text
                v = pair.find("value").text
                ct.append({"key": k, "val": v})
        respDict["Response"] = ct
        responseList.append(respDict)

    # if no actual data records, exit with a warning and return None
    if len(responseList) == 0:
        msg = where + " XML contains no data"
        warnings.warn(msg)
        logger.warning(msg)
        return None

    # We have data. Now we create a data frame.
    try:
        df = pd.DataFrame.from_dict(responseList)
    except Exception as e:
        warnings.warn("XML data cannot be converted to a data frame")
        logger.error("XML data cannot be converted to a data frame")
        logger.exception(e)
        return None

    return df
예제 #13
0
def parseIctXML(source, keepResponseData=False):
    """
    Parse the 2015 ICT observables xml string for each block.

    The 2015 Science ICT follows a precursor of the SBT data format, where most of the
    observable events are saved in the Response data table, as a "response" associated
    with an AccNum. This function takes the XML string of that session, and returns a
    data frame following the standard format of the process data log.

    Note that there are several fields that are unused, e.g., "Label", because they
    are used in the eNAEP-based process data logs (which deal with response-related
    events). We need to merge the two sources of logs to obtain a complete process
    data log.

    Every ``observableDatum`` element yields one row carrying the booklet-level
    fields plus its own ``sceneId``, ``controlId``, ``eventType`` and ``timestamp``.
    ``ExtendedInfo`` is the NFKD-normalized text of the datum's last ``content``
    element, or None when the datum has no ``content`` element at all.

    When ``keepResponseData`` is true, every ``responseDatum`` element is parsed as
    well and the two tables are outer-merged on ``SceneId``; if either table is
    empty, the other one is returned unmerged.

    :param source: the XML string or a XML node
    :param keepResponseData: false by default, otherwise combines both obs and res data.
    :return: a data frame or None (None if the booklet-level fields cannot be read
        or the rows cannot be turned into a data frame)
    """

    # If `source` cannot be parsed as XML text we assume it already is a node.
    # `except Exception` (rather than a bare except) keeps KeyboardInterrupt and
    # SystemExit propagating.
    try:
        root = ET.fromstring(source, parser=ET.XMLParser())
    except Exception:
        # not a string
        root = source

    df = None
    # get top level basic info
    try:
        bookletId = root.findtext('bookletId')
        taskId = root.findtext('taskId')
        blockId = root.findtext('blockId')
        accommodations = root.findtext('accommodations')
        extendedTimeFactor = root.findtext('extendedTimeFactor')
    except Exception as e:
        # e.g. `root` is an unparsable string and therefore has no findtext()
        warnings.warn("ParseIctXML: XML contains incomplete Booklet level information")
        logger.error("ParseIctXML: XML contains incomplete Booklet level information")
        logger.exception(e)
        return None

    # get observable data
    observableMatrix = []
    for observableDatum in root.iter('observableDatum'):
        # Reset per datum: otherwise a datum without <content> would silently
        # inherit the content of the previous datum.
        ct = None
        # get content in json format; if there are several <content> nodes the last wins
        for content in observableDatum.iter('content'):
            # ct needs to be a string; it is re-parsed as JSON later on.
            # NOTE: an empty <content/> has text None, so this yields the string "None".
            ct = unicodedata.normalize('NFKD', str(content.text))

        observableMatrix.append({'BookletNumber': bookletId,
                                 'BlockId': blockId,
                                 'TaskId': taskId,
                                 'Accomodations': accommodations,
                                 'ExtendedTimeFactor': extendedTimeFactor,
                                 'SceneId': observableDatum.findtext('sceneId'),
                                 'ControlId': observableDatum.findtext('controlId'),
                                 'Label': observableDatum.findtext('eventType'),
                                 'EventTime': observableDatum.findtext('timestamp'),
                                 'ExtendedInfo': ct})

    # turn the data into a data frame
    if not keepResponseData:
        try:
            df = pd.DataFrame(observableMatrix)
        except Exception as e:
            warnings.warn("ParseIctXML: cannot turn Observable XML into a data frame")
            logger.error("ParseIctXML: cannot turn Observable XML into a data frame")
            logger.exception(e)
            return None

    else:
        # get response data
        responseMatrix = []
        for responseDatum in root.iter('responseDatum'):
            # Same per-datum reset as above, for the same reason.
            ct = None
            for content in responseDatum.iter('content'):
                # ct needs to be a string; it is re-parsed as JSON later on.
                ct = str(parseXMLContentDatum(content))
            responseMatrix.append({'SceneId': responseDatum.findtext('sceneId'),
                                   'ResponseComponentId': responseDatum.findtext(
                                       'responseComponentId'),
                                   'ResponseType': responseDatum.findtext('responseType'),
                                   'ResponseContent': ct})

        try:
            # first create observable dataframe
            dfObs = pd.DataFrame(observableMatrix)
            # then create response dataframe
            dfResp = pd.DataFrame(responseMatrix)
            if dfResp.empty:
                df = dfObs
            elif dfObs.empty:
                df = dfResp
            else:
                # GF: Wait, does "outer" really work here?
                # TODO: We don't want to back-fill the response for each scene
                # If we ever do this, we want the ResponseContent to reflect the
                # state of the responses at this point.
                df = pd.merge(dfObs, dfResp, how='outer', on='SceneId')
        except Exception as e:
            warnings.warn("ParseIctXML: cannot turn XML into a data frame")
            logger.error("ParseIctXML: cannot turn XML into a data frame")
            logger.exception(e)
            return None

    return df
예제 #14
0
def reconSBTTextSelection(itemLog, accnum="ControlId", itemtype="Label"):
    """
    Reconstruct the response of one text-selection unit from its log.

    Only the LAST ``select.toggle`` row of ``itemLog`` is looked at. Its
    ``extInfo`` entry is expected to be a mapping such as
    ``{'to': 'true', 'from': 'false'}``:

    - if ``extInfo["to"]`` is not ``"true"`` (the unit ended up unselected), an
      empty list is returned;
    - otherwise the control id is split on the first of ``"_Selection"`` /
      ``"_String"`` that it contains. A clean two-part split gives
      ``ResponseComponentId`` = the part before the delimiter and
      ``ReconstructedAnswer`` = the part after it; any other split falls back to
      using the whole control id for both fields;
    - if the control id contains neither delimiter, the control id itself is the
      ``ResponseComponentId`` and the answer is ``"<controlId>_Selected"``.

    Because each call handles a single control id, collecting the selections that
    belong to the same ``ResponseComponentId`` into one list is left to a
    post-processing step outside this function.

    :param itemLog: a data frame containing the log of a single TextSelection select.toggle item;
        must have the columns ``extInfo`` and ``ItemTypeCode``
    :param accnum: the column name that identifies items
    :param itemtype: the column name that identifies the item type
    :return: a list with zero or one dict holding ``ResponseComponentId`` and
        ``ReconstructedAnswer``; on any exception a single dict whose two values are
        ``"ERROR_reconSBTTextSelection"``

    """
    assert isinstance(itemLog, pd.DataFrame)
    assert "extInfo" in itemLog.columns
    # the log must cover exactly one item ...
    assert itemLog[accnum].nunique() == 1
    # ... of exactly one item type
    assert itemLog["ItemTypeCode"].nunique() == 1

    try:
        toggleRows = itemLog.loc[itemLog[itemtype] == "select.toggle"]
        # raises (and lands in the handler below) when there is no toggle row
        finalToggle = toggleRows.iloc[-1]
        finalState = finalToggle["extInfo"]["to"]
        unitId = finalToggle[accnum]

        # the unit was left unselected: nothing to report
        if finalState != "true":
            return []

        # first delimiter present in the control id, if any
        delimiter = next(
            (d for d in ("_Selection", "_String") if d in unitId), None)

        if delimiter is None:
            # no delimiter: the control itself is what got selected
            return [{
                "ResponseComponentId": "{}".format(unitId),
                "ReconstructedAnswer": "{}_Selected".format(unitId)
            }]

        pieces = unitId.split(delimiter)
        if len(pieces) == 2:
            componentId, answer = pieces
        else:
            # unexpected shape: report the raw control id in both fields
            componentId = answer = unitId
        return [{
            "ResponseComponentId": "{}".format(componentId),
            "ReconstructedAnswer": "{}".format(answer)
        }]
    except Exception as e:
        logger.error("reconSBTTextSelection:")
        logger.exception(e)
        return [{
            "ResponseComponentId": "ERROR_reconSBTTextSelection",
            "ReconstructedAnswer": "ERROR_reconSBTTextSelection"
        }]
예제 #15
0
def parsePearsonResponseXML(source):
    """
    Parse Pearson response XMLs, using XPath.

    Naming following the SQL:
    ```
        [ItemResponse].[ItemResponseId],
        Subject.SubjectCode,
        Assessment.AssessedGroupId as Grade,
        Student.BookletNumber,
        [Block].BlockCode,
        Item.AccessionNumber,
        ItemType.ItemTypeCode,
        [ItemResponse].[Response],
        [ItemResponse].[IsAnswered]
    ```

    We are adding a few new columns:

    - `ChildItemAccessionNumber`: native for some eNAEP; for SBT type,
    ```
    'ChildItemAccessionNumber': sceneId
    ```
    - `ChildItemType`: native for some eNAEP; for SBT type,
    ```
    'ChildItemType': responseType
    ```
    - `ResponseComponentId`: native for SBT style data; for eNAEP, it is a combination of AccNum and childAccNum

    ```
    'ResponseComponentId': "item-{}".format(accessionNumber) \
            if childItemAccessionNumber is None else \
            "item-{}-{}".format(accessionNumber, childItemAccessionNumber),
    ```

    Booklet-level fields are read from ``./context`` and the attributes of
    ``./testResult``; if any of them cannot be read the function logs the error
    and returns None. Each ``itemResult`` is then handled on its own: items of
    type ``SBT`` / ``ReadingNonSBT`` are delegated to ``extractSbtResponseXML``,
    all others are read as key/value pairs. An item that fails to parse is
    logged and skipped, it does not abort the whole booklet.

    :param source: the XML string or a XML node
    :return: a data frame (possibly empty) or None
    """

    # If `source` cannot be parsed as XML text we assume it already is a node.
    # `except Exception` (rather than a bare except) keeps KeyboardInterrupt and
    # SystemExit propagating.
    try:
        root = ET.fromstring(source)
    except Exception:
        # not a string
        root = source

    # get top level basic info, using proper xpath
    try:
        testResult = root.find('./testResult')
        headerDict = {
            'BookletNumber': root.find('./context/bookletNumber').text,
            'Form': root.find('./context/assignedForm').text,
            'Year': testResult.get("assessmentYear"),
            'SubjectCode': testResult.get("subjectName"),
            'Grade': testResult.get("assessedGroup")
        }
    except Exception as e:
        # a missing node makes find() return None, so .text / .get() raise
        logger.error(
            "parseResponseXML: XML contains incomplete Booklet level information"
        )
        logger.exception(e)
        return None

    responseMatrix = []

    for itemResult in root.iter('itemResult'):

        if itemResult.get("itemType") in ["SBT", "ReadingNonSBT"]:
            # SBT-style items carry their own embedded response data
            try:
                responseMatrix += extractSbtResponseXML(itemResult, headerDict)
            except Exception as e:
                logger.error(
                    "parseResponseXML: Unable to parse SBT responseData")
                logger.exception(e)
            continue

        # regular eNAEP types
        try:
            itemAccessionNumber = itemResult.get("accessionNumber")
            # sometimes response data is stored under a different AccNum than the ItemAccNum
            accessionNumber = itemResult.get("respondedIn") or itemAccessionNumber
            childItemAccessionNumber = itemResult.get(
                "childItemAccessionNumber")

            if childItemAccessionNumber is None:
                responseComponentId = "item-{}".format(accessionNumber)
            else:
                responseComponentId = "item-{}-{}".format(
                    accessionNumber, childItemAccessionNumber)

            # now the key/value pairs
            content = itemResult.find(
                "responseVariable/candidateResponse/value/content")
            ct = []
            if content is not None:
                for pair in content.iter('pair'):
                    ct.append({
                        "key": pair.find("key").text,
                        "val": pair.find("value").text
                    })

            # start from the booklet-level fields so every row agrees with headerDict
            row = dict(headerDict)
            row.update({
                'BlockCode': itemResult.get("blockCode"),
                'AccessionNumber': accessionNumber,
                'ItemAccessionNumber': itemAccessionNumber,
                'ItemTypeCode': itemResult.get("itemType"),
                'ChildItemAccessionNumber': childItemAccessionNumber,
                'ChildItemType': itemResult.get("childItemType"),
                'ResponseComponentId': responseComponentId,
                'Response': ct
            })
            responseMatrix.append(row)
        except Exception as e:
            # e.g. a <pair> without <key> or <value>; drop this item only
            logger.error(
                "parseResponseXML: Unable to parse eNAEP response content")
            logger.exception(e)
            continue

    return pd.DataFrame(responseMatrix)