Exemplo n.º 1
0
 def test_cet():
     """cElementTree"""
     # NOTE(review): docstring left verbatim; a harness may read __doc__ as
     # a label -- not visible from here, confirm before rewording.
     # Build one <tr> per entry of the module-level ``table`` and one <td>
     # per value of that entry.
     root = cet.Element('table')
     for record in table:
         row_elem = cet.SubElement(root, 'tr')
         for value in record.values():
             cell = cet.SubElement(row_elem, 'td')
             cell.text = str(value)
     # Serialize the finished tree; the result is intentionally discarded.
     cet.tostring(root)
Exemplo n.º 2
0
 def test_cet():
     """cElementTree"""
     # NOTE(review): docstring left verbatim; a harness may read __doc__ as
     # a label -- not visible from here, confirm before rewording.
     # Build one <tr> per entry of the module-level ``table`` and one <td>
     # per value of that entry.
     root = cet.Element("table")
     for record in table:
         row_elem = cet.SubElement(root, "tr")
         for value in record.values():
             cell = cet.SubElement(row_elem, "td")
             cell.text = str(value)
     # Serialize the finished tree; the result is intentionally discarded.
     cet.tostring(root)
Exemplo n.º 3
0
 def test_kid_et():
     """Kid template + cElementTree"""
     # NOTE(review): docstring left verbatim; a harness may read __doc__ as
     # a label -- not visible from here, confirm before rewording.
     # Build the <table> tree: one <tr> per entry of the module-level
     # ``table``, one <td> per value of that entry.
     root = cet.Element("table")
     for record in table:
         row_elem = cet.SubElement(root, "tr")
         for value in record.values():
             cell = cet.SubElement(row_elem, "td")
             cell.text = str(value)
     # Hand the tree to the Kid template and serialize it as HTML; the
     # serialized output is discarded.
     kid_tmpl2.table = root
     kid_tmpl2.serialize(output="html")
Exemplo n.º 4
0
 def test_kid_et():
     """Kid template + cElementTree"""
     # NOTE(review): docstring left verbatim; a harness may read __doc__ as
     # a label -- not visible from here, confirm before rewording.
     # Build the <table> tree: one <tr> per entry of the module-level
     # ``table``, one <td> per value of that entry.
     root = cet.Element('table')
     for record in table:
         row_elem = cet.SubElement(root, 'tr')
         for value in record.values():
             cell = cet.SubElement(row_elem, 'td')
             cell.text = str(value)
     # Hand the tree to the Kid template and serialize it as HTML; the
     # serialized output is discarded.
     kid_tmpl2.table = root
     kid_tmpl2.serialize(output='html')
Exemplo n.º 5
0
def addTokensToTree(tokens, element):
    """Append one <token> child to ``element`` for every item of ``tokens``.

    Each token object must expose ``id``, ``text``, ``pos`` and
    ``charOffset``; they are copied to the attributes "id", "text", "POS"
    and "charOffset" of the new child. Nothing is returned.
    """
    for tok in tokens:
        node = ElementTree.SubElement(element, "token")
        node.set("id", tok.id)
        node.set("text", tok.text)
        node.set("POS", tok.pos)
        node.set("charOffset", tok.charOffset)
Exemplo n.º 6
0
 def _node2xmlfields(self,noderecord):
     ''' fields in a node are written to xml fields; output is sorted according to grammar
     '''
     if 'BOTSID' not in noderecord:
         raise botslib.OutMessageError(_(u'No field "BOTSID" in xml-output in: "$record"'),record=noderecord)
     #first generate the xml-'record'
     attributedict = {}
     recordtag = noderecord['BOTSID']
     attributemarker = recordtag + self.ta_info['attributemarker'] 
     for key,value in noderecord.items():    #find the attributes for the xml-record, put these in attributedict
         if key.startswith(attributemarker):
             attributedict[key[len(attributemarker):]] = value
     xmlrecord = ET.Element(recordtag,attributedict) #make the xml ET node
     if 'BOTSCONTENT' in noderecord:
         xmlrecord.text = noderecord['BOTSCONTENT']
         del noderecord['BOTSCONTENT']
     for key in attributedict.keys():  #remove used fields
         del noderecord[attributemarker+key]
     del noderecord['BOTSID']    #remove 'record' tag
     #generate xml-'fields' in xml-'record'; not sorted
     noderecordcopy = noderecord.copy()
     for key,value in noderecordcopy.items():
         if key not in noderecord or self.ta_info['attributemarker'] in key: #if field not in outmessage: skip
             continue
         attributedict = {}
         attributemarker = key + self.ta_info['attributemarker'] 
         for key2,value2 in noderecord.items():
             if key2.startswith(attributemarker):
                 attributedict[key2[len(attributemarker):]] = value2
         ET.SubElement(xmlrecord, key,attributedict).text=value    #add xml element to xml record
         for key2 in attributedict.keys():  #remove used fields
             del noderecord[attributemarker+key2]
         del noderecord[key]    #remove xml entity tag
     return xmlrecord
Exemplo n.º 7
0
def insertElements(corpus, specAnn):
    """Attach annotation spans to the sentences that fully contain them.

    For every <document> found under ``corpus`` the spans stored in
    ``specAnn`` under the document's ``origId`` are compared with the
    ``charOffset`` of each of its <sentence> elements. A span lying
    completely inside a sentence is removed from ``specAnn[docId]``, gets a
    sentence-relative "charOffset" attribute in place of its "offset"
    attribute, and is appended to the sentence's ``analyses/entities``
    element. Both container elements are created when they are missing.

    :param corpus: element searched (with ``iter``) for <document> elements
    :param specAnn: dict mapping a document origId to a list of span
        elements; the lists are modified in place
    :raises AssertionError: if a document has no entry in ``specAnn``, or a
        span's "text" differs from the sentence text at its offset
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(
                sentence.get("charOffset"))
            # Compare against None: an Element without children is falsy,
            # so a truthiness test would add a duplicate container next to
            # an existing but still empty one.
            analyses = sentence.find("analyses")
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            container = analyses.find("entities")
            if container is None:
                container = ET.SubElement(analyses, "entities")
            # Iterate over a copy, because matched spans are removed from
            # the list while looping.
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if not Range.overlap(offset, sentOffset):
                    continue
                # Skip spans that overlap the sentence but extend beyond
                # either of its boundaries.
                if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                    continue
                specAnn[docId].remove(span)
                # Convert the span offset to a sentence-relative one.
                charOffset = (offset[0] - sentOffset[0],
                              offset[1] - sentOffset[0])
                matchingText = sentence.get(
                    "text")[charOffset[0]:charOffset[1]]
                spanText = span.get("text")
                assert matchingText == spanText, (matchingText, spanText,
                                                  charOffset)
                span.set("charOffset",
                         "-".join([str(x) for x in charOffset]))
                # "--" would appear if the end offset were negative.
                assert "--" not in span.get("charOffset"), [
                    str(x) for x in charOffset
                ]
                del span.attrib["offset"]
                container.append(span)
Exemplo n.º 8
0
def addTokenization(tokenization, sentence, sentenceId):
    """Create a new, empty tokenization element for ``sentence`` and return it.

    The element is added under ``sentenceanalyses/tokenizations``. Two XML
    layouts are supported:

    * new style -- at least one existing <tokenization> carries a
      "tokenizer" attribute: a <tokenization tokenizer="..."> is added;
    * old style -- otherwise: an element whose tag is ``tokenization``
      itself is added.

    :param tokenization: name of the tokenization to add
    :param sentence: the sentence element
    :param sentenceId: identifier used only in error messages
    :raises AssertionError: if <tokenizations> is missing or a tokenization
        of that name already exists
    """
    toks = sentence.find("sentenceanalyses/tokenizations")
    # Compare against None: an Element without children is falsy, so a
    # plain truth test would reject an existing but empty <tokenizations>.
    assert toks is not None, "Missing <tokenizations> in sentence %s" % sentenceId

    # assume new-style XML format if there's at least one <tokenization> with
    # a "tokenizer" attribute. Also check duplicates.
    isNew = False
    # iter() replaces getiterator(), which no longer exists in Python 3.9+.
    for t in toks.iter("tokenization"):
        existing = t.get("tokenizer")
        if existing is not None:
            assert existing != tokenization, "Split tokenization '%s' already exists in sentence %s!" % (tokenization, sentenceId)
            isNew = True

    if isNew:
        newTok = ElementTree.SubElement(toks, "tokenization")
        newTok.attrib["tokenizer"] = tokenization
    else:
        assert toks.find(tokenization) is None, "Split tokenization '%s' already exists in sentence %s!" % (tokenization, sentenceId)
        newTok = ElementTree.SubElement(toks, tokenization)

    return newTok
Exemplo n.º 9
0
def addParse(parse, tokenization, sentence, sentenceId):
    """Create a new, empty parse element for ``sentence`` and return it.

    The element is added under ``sentenceanalyses/parses``. Two XML layouts
    are supported:

    * new style -- at least one existing <parse> carries a "parser"
      attribute: a <parse parser="..." tokenizer="..."> is added;
    * old style -- otherwise: an element whose tag is ``parse`` itself is
      added (``tokenization`` is not used in this case).

    :param parse: name of the parse to add
    :param tokenization: name of the tokenization the parse is based on
    :param sentence: the sentence element
    :param sentenceId: identifier used only in error messages
    :raises AssertionError: if <parses> is missing or a parse of that name
        already exists
    """
    parses = sentence.find("sentenceanalyses/parses")
    # Compare against None: an Element without children is falsy, so a
    # plain truth test would reject an existing but empty <parses>.
    assert parses is not None, "Missing <parses> in sentence %s" % sentenceId

    # assume new-style if we have at least one <parse> with a "parser"
    # attribute. Also check that a parse under the given name isn't there
    # already
    isNew = False
    # iter() replaces getiterator(), which no longer exists in Python 3.9+.
    for p in parses.iter("parse"):
        existing = p.get("parser")
        if existing is not None:
            assert existing != parse, "New parse '%s' already exists in sentence %s!" % (parse, sentenceId)
            isNew = True

    if isNew:
        newParse = ElementTree.SubElement(parses, "parse")
        newParse.attrib["parser"] = parse
        newParse.attrib["tokenizer"] = tokenization
    else:
        # check for overlap; the message reports the ``parse`` argument
        # rather than an unrelated global
        assert parses.find(parse) is None, "New parse '%s' already exists in sentence %s!" % (parse, sentenceId)
        newParse = ElementTree.SubElement(parses, parse)

    return newParse
Exemplo n.º 10
0
def makePath(element, tagList):
    """Walk (and where necessary create) a chain of nested child elements.

    Starting at ``element``, each tag name of ``tagList`` selects the first
    direct child carrying that tag; when no such child exists a new one is
    appended. The search for the next tag continues inside the child just
    selected.

    Returns the list of selected/created elements, one per tag, in order.
    """
    path = []
    parent = element
    for tag in tagList:
        # first direct child with a matching tag, if any
        node = next((child for child in parent if child.tag == tag), None)
        if node is None:
            node = ElementTree.SubElement(parent, tag)
        path.append(node)
        parent = node
    return path
Exemplo n.º 11
0
def updateXML(root, removeAnalyses=True):
    """Update an interaction XML tree in place and return it.

    For every <sentence> of every <document> directly under ``root``:

    * an existing <sentenceanalyses> child is counted and, when
      ``removeAnalyses`` is true, removed;
    * an artificial "charOffset" is set so that the sentences of a
      document line up one after another, separated by one character;
    * entity offsets are rewritten from (begin, end) to (begin, end + 1),
      after checking that the entity text matches the sentence text;
    * every <pair> is removed, and pairs whose "interaction" attribute is
      "True" are re-added as <interaction> elements of type "PPI".

    A summary of the collected counts is printed to stderr.
    """
    counts = defaultdict(int)
    for document in root.findall("document"):
        counts["documents"] += 1
        nextSentenceStart = 0
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            # Original parses: count them, drop them if requested
            oldAnalyses = sentence.find("sentenceanalyses")
            if oldAnalyses is not None:
                counts["analyses"] += 1
                if removeAnalyses:
                    counts["removed-analyses"] += 1
                    sentence.remove(oldAnalyses)
            # Artificial sentence offset, so that the sentences can be exported as a single document
            text = sentence.get("text")
            textLength = len(text)
            sentence.set("charOffset", Range.tuplesToCharOffset((nextSentenceStart, nextSentenceStart + textLength)))
            # Entity offsets: old format (begin,end) becomes (begin,end+1)
            for entity in sentence.findall("entity"):
                counts["entities"] += 1
                offsets = [(span[0], span[1] + 1) for span in Range.charOffsetToTuples(entity.get("charOffset"))]
                entityText = entity.get("text")
                for begin, end in offsets:
                    counts["entity-offsets"] += 1
                    entitySpan = text[begin:end]
                    spanLength = end - begin
                    # Consume the part of the entity text covered by this span
                    offsetText = entityText[:spanLength].strip()
                    entityText = entityText[spanLength:].strip()
                    assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), text)
                entity.set("charOffset", Range.tuplesToCharOffset(offsets))
            # Positive pairs become interaction elements, all pairs are removed
            interactionCount = 0
            for pair in sentence.findall("pair"):
                counts["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") != "True":
                    continue
                del pair.attrib["interaction"]
                idPrefix = pair.get("id").rsplit(".", 1)[0]
                pair.set("id", idPrefix + ".i" + str(interactionCount))
                pair.set("type", "PPI")
                ET.SubElement(sentence, "interaction", pair.attrib)
                interactionCount += 1
                counts["interactions"] += 1
            nextSentenceStart += textLength + 1
    print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
    return root
Exemplo n.º 12
0
 def _node2xmlfields(self, noderecord):
     ''' fields in a node are written to xml fields; output is sorted according to grammar.
         noderecord['BOTSID'] is the tag of the xml-'record';
         noderecord['BOTSCONTENT'], if present, becomes its text.
         A key of the form <tag><attributemarker><name> is an attribute <name> of
         the element <tag> (either the xml-'record' or one of its fields).
         Child elements are written in the order of self.defmessage.recorddefs[recordtag].
         noderecord is consumed: every key that is used is deleted from it.
         Returns the xml element.
     '''
     #first generate the xml-'record'
     recordtag = noderecord['BOTSID']
     #attributemarker is a marker in the fieldname used to find out if field is an attribute of either xml-'record' or xml-element
     marker = self.ta_info['attributemarker']
     recordprefix = recordtag + marker
     #find attributes belonging to xml-'record' and store in attributedict
     attributedict = {}
     for key, value in noderecord.items():
         if key.startswith(recordprefix):
             attributedict[key[len(recordprefix):]] = value
     xmlrecord = ET.Element(recordtag, attributedict)  #make the xml ET node
     if 'BOTSCONTENT' in noderecord:  #BOTSCONTENT is used to store the value/text of the xml-record itself.
         xmlrecord.text = noderecord['BOTSCONTENT']
         del noderecord['BOTSCONTENT']
     for key in attributedict:  #remove used fields
         del noderecord[recordprefix + key]
     del noderecord['BOTSID']  #remove 'record' tag
     #generate xml-'fields' in xml-'record'; sort these by looping over records definition
     for field_def in self.defmessage.recorddefs[recordtag]:  #loop over fields in 'record'
         fieldname = field_def[ID]
         if fieldname not in noderecord:  #if field not in outmessage: skip
             continue
         fieldprefix = fieldname + marker
         attributedict = {}
         for key, value in noderecord.items():
             if key.startswith(fieldprefix):
                 #no debug print here: library code should not write to stdout
                 attributedict[key[len(fieldprefix):]] = value
         ET.SubElement(xmlrecord, fieldname, attributedict).text = noderecord[fieldname]  #add xml element to xml record
         for key in attributedict:  #remove used fields
             del noderecord[fieldprefix + key]
         del noderecord[fieldname]  #remove xml entity tag
     return xmlrecord
Exemplo n.º 13
0
def convertChemProt(inDirs=None,
                    setNames=None,
                    outPath=None,
                    goldTestSet=True,
                    downloadDir=None,
                    extractDir=None,
                    redownload=False,
                    debug=False):
    """Convert ChemProt TSV files into an Interaction XML corpus.

    Each input directory is scanned for ``.tsv`` files whose names contain
    "_abstracts", "_entities" or "_relations". The files are grouped into
    datasets by the part of the file name before the last underscore
    (ignoring "_gs"), and converted into <document>, <entity> and
    <interaction> elements under a single <corpus source="CP17"> element.
    Progress and final counts are printed to stderr.

    :param inDirs: list of input directories; when None the TRAIN, DEVEL and
        TEST (or TEST_GOLD) sets with a configured ``Settings.URL`` entry are
        fetched with ``downloadFile``
    :param setNames: optional dict mapping a dataset id taken from the file
        name to the id stored in the "set" attribute
    :param outPath: when given, the corpus is written there with
        ``ETUtils.write``
    :param goldTestSet: when downloading, use "TEST_GOLD" instead of "TEST"
    :param downloadDir: passed on to ``downloadFile``
    :param extractDir: base directory for extraction; a temporary directory
        is created when it is None
    :param redownload: passed on to ``downloadFile``
    :param debug: when true, the temporary directory is not removed
    :return: an ``ET.ElementTree`` wrapping the corpus element
    :raises AssertionError: on duplicate data files, duplicate entity ids,
        unexpected entity types or malformed relation rows
    """
    tempDir = None
    if inDirs is None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir is None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            url = Settings.URL["CP17_" + setName]
            if url is not None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir,
                                                 setName.lower())
                inDirs.append(
                    downloadFile(url, downloadDir, currentExtractDir,
                                 redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    filetypes = ["_abstracts", "_entities", "_relations"]
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in os.listdir(inDir):
            if not (filename.endswith(".tsv")
                    and any(x in filename for x in filetypes)):
                continue
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames is not None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            # Each data type may occur only once per dataset
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(
            dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()
    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source": corpusName})
    counts = defaultdict(int)
    docById = {}  # document origId -> document element
    entityById = {}  # document origId -> {entity origId -> entity element}
    entitiesByDoc = {}  # document origId -> set of all entity ids seen
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        # Snapshot, so that the per-dataset increase can be reported below
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f,
                    delimiter="\t",
                    fieldnames=["id", "title", "abstract"],
                    quoting=csv.QUOTE_NONE):
                document = ET.Element(
                    "document", {
                        "id": corpusName + ".d" + str(counts["documents"]),
                        "origId": row["id"],
                        "set": dataSetId
                    })
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset",
                             Range.tuplesToCharOffset((0, len(row["title"]))))
                origId = document.get("origId")
                if origId in docById:
                    # A repeated document must be identical to the first one
                    known = docById[origId]
                    assert document.get("text") == known.get("text")
                    assert document.get("titleOffset") == known.get(
                        "titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[origId] = document
                    counts["documents"] += 1
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f,
                    delimiter="\t",
                    fieldnames=["docId", "id", "type", "begin", "end", "text"],
                    quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(
                        document, "entity", {
                            "id":
                            document.get("id") + ".e" +
                            str(len(document.findall("entity")))
                        })
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    # "GENE-Y"/"GENE-N" become type "GENE"; the suffix is
                    # kept in the "normalized" attribute
                    entity.set("type", row["type"].split("-")[0])
                    entity.set(
                        "normalized",
                        "True" if row["type"].endswith("-Y") else "False")
                    entity.set(
                        "charOffset",
                        Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", row[
                        "docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f,
                                             delimiter="\t",
                                             fieldnames=[
                                                 "docId", "group", "groupEval",
                                                 "type", "arg1", "arg2"
                                             ],
                                             quoting=csv.QUOTE_NONE):
                    # Strip the "Arg1:" / "Arg2:" prefixes
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId +
                                                             ":")
                        row["arg" + argId] = row["arg" + argId][5:]
                    document = docById[row["docId"]]
                    # A document without any aligned entity has no entry in
                    # entityById; treat its relations as interaction errors
                    # instead of failing with a KeyError.
                    docEntities = entityById.get(row["docId"], {})
                    e1 = docEntities.get(row["arg1"])
                    e2 = docEntities.get(row["arg2"])
                    if e1 is not None and e2 is not None:
                        interaction = ET.SubElement(
                            document, "interaction", {
                                "id":
                                document.get("id") + ".i" +
                                str(len(document.findall("interaction")))
                            })
                        interaction.set("directed", "True")
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set(
                            "evaluated",
                            "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        counts["interaction-error"] += 1
                        docsWithErrors.add(row["docId"])
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        # Report only the counters that grew while processing this dataset
        print >> sys.stderr, "dataset", dataSetId, {
            x: counts[x] - prevCounts.get(x, 0)
            for x in counts if counts[x] - prevCounts.get(x, 0) > 0
        }
    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    if tempDir is not None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath is not None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)
Exemplo n.º 14
0
def eventsToNewXML(events):
    """Build a <corpus source="Static Relations"> element from ``events``.

    ``events`` maps a sentence id to a list of event dicts. Every sentence
    id that has at least one event yields one <document> holding a single
    <sentence>. Each event contributes an <interaction> between the entity
    described by event["entity"] (type "Entity") and the one described by
    event["namedEntity"] (type "Protein"); both values are tab-separated
    "<offset>\\t<text>" strings. Within a sentence, entities sharing the
    same offset string are created only once. Entities are appended to the
    sentence before the interactions.

    Returns the corpus element.
    """

    def entityFor(event, field, offsetKey, sentence, entities, entityByOffset, isName, entityType):
        # Return the entity registered under offsetKey, creating it first if needed
        if offsetKey in entityByOffset:
            return entityByOffset[offsetKey]
        entity = ET.Element("entity")
        entity.set("text", event[field].split("\t")[1].strip())
        entity.set("id", sentence.get("id")+".e"+str(len(entities)))
        offset = getOffset(offsetKey)
        # The end offset is inclusive, hence the +1 when slicing
        assert sentence.get("text")[offset[0]:offset[1]+1] == entity.get("text"), (event, sentence.get("text"), entity.get("text"))
        entity.set("charOffset", str(offset[0]) + "-" + str(offset[1]))
        entity.set("isName", isName)
        entity.set("type", entityType)
        entities.append(entity)
        entityByOffset[offsetKey] = entity
        return entity

    xml = ET.Element("corpus")
    xml.set("source", "Static Relations")
    docCount = 0
    sentenceById = {}
    for sentenceId in sorted(events.keys()):
        entities = []
        interactions = []
        entityByOffset = {}
        for event in events[sentenceId]:
            if sentenceId in sentenceById:
                sentence = sentenceById[sentenceId]
                assert sentence.get("text") == event["text"], (sentence.get("text"), event["text"])
            else:
                # First event of this sentence: create its document and sentence
                docId = "SR.d"+str(docCount)
                document = ET.SubElement(xml, "document")
                document.set("id", docId)
                document.set("origId", sentenceId)
                document.set("set", event["dataSet"])
                sentence = ET.SubElement(document, "sentence")
                sentence.set("id", docId+".s"+str(docCount))
                sentence.set("origId", sentenceId)
                sentence.set("text", event["text"])
                sentence.set("charOffset", "0-"+str(len(event["text"])-1))
                docCount += 1
                sentenceById[sentenceId] = sentence
            # Entities, keyed by their raw offset string
            e1Offset = event["entity"].split("\t")[0]
            e2Offset = event["namedEntity"].split("\t")[0]
            e1 = entityFor(event, "entity", e1Offset, sentence, entities, entityByOffset, "False", "Entity")
            e2 = entityFor(event, "namedEntity", e2Offset, sentence, entities, entityByOffset, "True", "Protein")
            # Interaction connecting the two entities
            interaction = ET.Element("interaction")
            interaction.set("id", sentence.get("id")+".i"+str(len(interactions)))
            interaction.set("origId", event["id"])
            interaction.set("type", event["eventType"])
            interaction.set("e1", e1.get("id"))
            interaction.set("e2", e2.get("id"))
            interactions.append(interaction)
        # Attach all entities first, then all interactions
        for element in entities:
            sentence.append(element)
        for element in interactions:
            sentence.append(element)
    return xml
Exemplo n.º 15
0
def string_variables_dict_to_xml_element(string_variables_dict):
    '''
    Convert string variables dictionary to xml element

    @author: Canh Duong <*****@*****.**>

    :param string_variables_dict: dict mapping a variable name to its
        description. For every variable the keys 'original_text', 'value',
        'default', 'context' and 'context_list' are read; for every entry
        of 'context_list' the keys 'name' and 'synonyms' are read. Other
        keys (e.g. 'name', 'help', 'select') are ignored: the "name"
        attribute comes from the dictionary key, and "select" is derived
        from 'context'.

        string_variables_dict = {
            'string0': {
                'name': 'string0',
                'original_text': 'Calculate',
                'default': 'Calculate',
                'value': 'Calculate',
                'context': 'context1',
                'context_list':
                    {
                        'context0': {
                            'name': "Context 1 of Calculate",
                            'help': "Synonym set 1",
                            'synonyms': ['Calculate', 'Compute'],
                            'select': 'true',
                        },
                        'context1': {
                            'name': "Context 2 of Calculate",
                            'help': "Synonym set 2",
                            'synonyms': ['Find', 'Figure out', 'Estimate'],
                            'select': 'false',
                        }
                    }
            }
        }

    :return: string_variables_elem - xml element. For the input above:

        <string_variables>
            <string_variable name="string0" original_text="Calculate" value="Calculate" default="Calculate">
                <context_list>
                    <context name="Context 1 of Calculate" select="false">
                        <option>Calculate</option>
                        <option>Compute</option>
                    </context>
                    <context name="Context 2 of Calculate" select="true">
                        <option>Find</option>
                        <option>Figure out</option>
                        <option>Estimate</option>
                    </context>
                </context_list>
            </string_variable>
        </string_variables>

    :raises KeyError: if one of the keys that are read is missing
    '''
    string_variables_elem = ET.Element('string_variables')
    # items() instead of iteritems(): works on both Python 2 and Python 3
    for var_name, string_var in string_variables_dict.items():
        string_variable_elem = ET.SubElement(string_variables_elem,
                                             'string_variable')
        string_variable_elem.set('name', var_name)
        string_variable_elem.set('original_text', string_var['original_text'])
        string_variable_elem.set('value', string_var['value'])
        string_variable_elem.set('default', string_var['default'])

        context_list_elem = ET.SubElement(string_variable_elem, 'context_list')
        for context_id, context in string_var['context_list'].items():
            context_elem = ET.SubElement(context_list_elem, 'context')
            context_elem.set('name', context['name'])
            # Only the context named by string_var['context'] is selected;
            # the 'select' value stored in the context itself is not used.
            is_selected = context_id == string_var['context']
            context_elem.set('select', 'true' if is_selected else 'false')
            # One <option> per synonym
            for word in context['synonyms']:
                option = ET.SubElement(context_elem, 'option')
                option.text = word

    return string_variables_elem
Exemplo n.º 16
0
def convert_data_from_dict_to_xml(data):
    '''
    Convert the Basic Template fields (a dictionary) to the XML string that
    is shown in the Advanced Editor after a studio submission.

    Keys read from ``data`` (in this order):

        1. 'question_template' -- text of the <description> element
        2. 'answer_template'   -- newline-separated "name = expression" lines
        3. 'variables'         -- mapping: variable key -> {attribute: value}
        4. 'string_variables'  -- passed to string_variables_dict_to_xml_element()
        5. 'image_url'         -- value of the 'link' attribute of <image_url>

    :param data -- a dictionary of fields supported for raw editor
    :return: the serialized <problem> element. Children are emitted in this
        order (attribute values below are only illustrative):

    <problem>
        <description>Given a = [a] and b = [b]. Calculate the [sum], [difference] of a and b. </description>
        <answer_templates>
            <answer sum = "[a] + [b]" difference = "[a] - [b]">Teacher's answer</answer>
        </answer_templates>
        <variables>
            <variable name="a" min="1" max="200" type="integer"/>
            <variable name="b" min="1.0" max="20.5" type="float" decimal_places="2"/>
        </variables>
        <string_variables>...</string_variables>
        <images>
            <image_url link="http://example.com/image1"/>
        </images>
    </problem>

    :raises JsonHandlerError: (400) when the answer template is empty, or
        when one of its non-empty lines does not contain '='.
    '''
    print("## CALLING FUNCTION convert_data_from_dict_to_xml() ##")
    print("Input data: {}".format(data))

    # init the root element: problem
    problem = ET.Element('problem')

    # convert question template
    description = ET.SubElement(problem, 'description')
    description.text = data['question_template']

    # Convert answer template string to dictionary,
    # then build xml attributes from it
    field_answer_template = data['answer_template']
    if not field_answer_template:
        raise JsonHandlerError(400, "Answer template must not be empty")

    answer_template_dict = {}
    for answer_line in field_answer_template.split('\n'):
        # ignore empty lines of the answer template
        if not answer_line:
            continue
        if '=' not in answer_line:
            raise JsonHandlerError(
                400,
                "Unsupported answer format. Answer template must contains '=' character: {}"
                .format(answer_line))
        # Split on the FIRST '=' only, so that an expression which itself
        # contains '=' (e.g. "check = [a] == [b]") is kept in full.
        attrib_key, attrib_value = answer_line.split('=', 1)
        # Remove surrounding white spaces from both the name and the expression
        answer_template_dict[attrib_key.strip()] = attrib_value.strip()

    # Answer template xml elements
    answer_templates = ET.SubElement(problem, 'answer_templates')
    answer = ET.SubElement(answer_templates, 'answer')
    # The text does not depend on the attributes, so set it exactly once.
    answer.text = "Teacher's answer"
    # .items() instead of .iteritems(): same result on Python 2, and it
    # also exists on Python 3.
    for attrib_key, attrib_value in answer_template_dict.items():
        answer.set(attrib_key, attrib_value)

    # Convert numeric variables
    field_variables = data['variables']
    variables_elem = ET.SubElement(problem, 'variables')
    # NOTE(review): the dictionary key is not written to the XML; each
    # attributes dict presumably carries its own 'name' entry -- confirm
    # against the caller.
    for attributes in field_variables.values():
        variable_elem = ET.SubElement(variables_elem, 'variable')
        for attribute, value in attributes.items():
            variable_elem.set(attribute, value)

    # Convert string variables dictionary to an xml element and add it to
    # the end of the problem's children
    string_variables_elem = string_variables_dict_to_xml_element(
        data['string_variables'])
    problem.append(string_variables_elem)

    # convert image
    field_image_url = data['image_url']
    images = ET.SubElement(problem, 'images')
    image_url = ET.SubElement(images, 'image_url')
    image_url.set('link', field_image_url)

    # indent() is defined elsewhere in this file; its return value is the
    # element that gets serialized.
    indented_problem = indent(problem)

    xml_string = ET.tostring(indented_problem)
    print("## End FUNCTION convert_data_from_dict_to_xml() ##")

    return xml_string
Exemplo n.º 17
0
 def process(self, directed, negatives):
     """Build and return the ``document`` element for this sentence.

     The ``<e1>``/``<e2>`` markup is removed from ``self.text`` and the
     two entities obtained from ``self._getEntity`` are appended to
     ``self.entities``. The returned document holds one ``sentence``
     element containing the entities followed by the interaction
     element(s) produced by ``self._getInteraction``.

     directed  -- when true, up to two interactions are written
                  (e1->e2 and e2->e1); otherwise a single one.
     negatives -- "REVERSE_POS" builds both direction types with
                  ``self.mergeType``; "INCLUDE" keeps the "neg"
                  direction; with any other value the "neg"
                  direction is left out.
     """
     # Pull each tagged entity out of the text, then drop its markup.
     for marker in ("e1", "e2"):
         entity = self._getEntity(self.text, marker)
         self.entities.append(entity)
         without_open = self.text.replace("<%s>" % marker, "")
         self.text = without_open.replace("</%s>" % marker, "")

     # Each entity's offsets must address its own text in the cleaned string.
     for entity in self.entities:
         begin, end = map(int, entity.get("charOffset").split("-"))
         covered = self.text[begin:end]
         assert entity.get("text") == covered, (
             entity.get("text"), self.text, covered, [begin, end])
     assert len(self.entities) == 2
     by_role = {"e1": self.entities[0], "e2": self.entities[1]}
     for role, element in by_role.items():  # e1 must be e1, e2 must be e2
         assert element.get("id").endswith("." + role)

     # Document and sentence elements.
     docElem = ET.Element("document")
     docElem.set("id", self.corpusId + ".d" + self.origId)
     docElem.set("set", self.setName)
     sentence_attrs = {
         "id": self.id,
         "charOffset": "0-" + str(len(self.text)),
         "text": self.text,
         "origId": self.origId,
         "relation": self.relation,
     }
     if self.comment not in (None, ""):
         sentence_attrs["comment"] = self.comment
     sentElem = ET.SubElement(docElem, "sentence", sentence_attrs)
     for entity in self.entities:
         sentElem.append(entity)

     # "Other" has no direction: a single interaction with empty from/to.
     if self.relation == "Other":
         sentElem.append(
             self._getInteraction(self.relation, "e1", "e2", directed, 0,
                                  by_role, "", ""))
         return docElem

     # The relation looks like "Type(eX,eY)".
     relType, rest = self.relation.strip(")").split("(")
     relFrom, relTo = rest.split(",")
     if not (relFrom == "e2" and relTo == "e1"):
         assert relFrom == "e1" and relTo == "e2"
         unmerged_types = (relType, "neg")
     else:
         unmerged_types = ("neg", relType)

     # Interaction type for each direction.
     if negatives == "REVERSE_POS":
         forwardType = self.mergeType(relType, relFrom, relTo)
         reverseType = self.mergeType(relType, relTo, relFrom)
     else:
         forwardType, reverseType = unmerged_types

     if not directed:
         sentElem.append(
             self._getInteraction(self.relation, "e1", "e2", directed, 0,
                                  by_role, relFrom, relTo))
         return docElem

     # One interaction per direction; "neg" ones only on request.
     candidates = ((forwardType, "e1", "e2", 0),
                   (reverseType, "e2", "e1", 1))
     for interaction_type, source, target, index in candidates:
         if interaction_type != "neg" or negatives == "INCLUDE":
             sentElem.append(
                 self._getInteraction(interaction_type, source, target,
                                      directed, index, by_role, source,
                                      target))
     return docElem
Exemplo n.º 18
0
        # make a mapping from original to split token ids. Note that for
        # split tokens, only the last of the split parts will be stored.
        tokenIdMap = {}
        for t in split:
            tokenIdMap[t.origId] = t.id

        # make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(options.newparse, options.splittokenization, sentence, sId)

        depSeqId = 1
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.attrib["t1"]   = tokenIdMap[t1]
            dep.attrib["t2"]   = tokenIdMap[t2]
            dep.attrib["type"] = dType
            dep.attrib["id"]   = "split_%d" % depSeqId
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [tok for tok in split if tok.splitFrom is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.attrib["t1"]   = t.id
            dep.attrib["t2"]   = t.splitFrom
            dep.attrib["type"] = splitDepName

    indent(root)
    tree.write(sys.stdout)