Example No. 1
def writeUTF8(rootElement,out):
    indent(rootElement)
    if isinstance(out,str):
        f=open(out,"wt")
        print >> f, '<?xml version="1.0" encoding="UTF-8"?>'
        ET.ElementTree(rootElement).write(f,"utf-8")
        f.close()
    else:
        print >> out, '<?xml version="1.0" encoding="UTF-8"?>'
        ET.ElementTree(rootElement).write(out,"utf-8")
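For comparison, a minimal Python 3 sketch (not taken from the example's codebase) of the same output step: modern xml.etree.ElementTree can emit the UTF-8 declaration itself through the xml_declaration argument, so the hand-written prolog line is only needed on very old versions. File and element names below are illustrative.

import xml.etree.ElementTree as ET

root = ET.Element("corpus", {"source": "demo"})
ET.SubElement(root, "document", {"id": "d0"}).text = "hello"
# write() adds the <?xml ... encoding='utf-8'?> declaration itself
ET.ElementTree(root).write("out.xml", encoding="utf-8", xml_declaration=True)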
Example No. 2
def processLines(lines,
                 setName,
                 usedIds,
                 directed=True,
                 negatives="INCLUDE",
                 tree=None,
                 corpusId="SE10T8"):
    if tree == None:
        corpus = ET.Element("corpus", {"source": corpusId})
        tree = ET.ElementTree(corpus)
    else:
        corpus = tree.getroot()
    sentence = None
    for line in lines:
        line = line.strip()
        if sentence == None:
            assert line[0].isdigit(), line
            origId, line = line.split("\t")
            sentence = Sentence(origId,
                                line.strip().strip("\""), corpusId, usedIds,
                                setName)
        else:
            if line.startswith("Comment:"):
                sentence.comment = line.split(":", 1)[-1].strip()
            elif line != "":
                sentence.relation = line
            else:
                assert sentence != None
                corpus.append(
                    sentence.process(directed=directed, negatives=negatives))
                sentence = None
    return tree
Example No. 3
    def initfromfile(self):
        filename = botslib.abspathdata(self.ta_info['filename'])
        self.ta_info['attributemarker'] = '__'
        parser = ET.XMLParser()
        etree = ET.ElementTree()   #ElementTree: lexes, parses, makes etree; etree is quite similar to bots-node trees but conversion is needed
        etreeroot = etree.parse(filename, parser)
        self.root = self._etree2botstree(etreeroot)  #convert etree to bots-nodes-tree
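As a side note, a minimal sketch (with an assumed file name) of the same parse step in a single call: ET.parse accepts an optional XMLParser and returns the ElementTree directly, so the empty ElementTree() plus parse() pair above can be collapsed.

import xml.etree.ElementTree as ET

# parse the file with an explicit parser and get the root element
tree = ET.parse("input.xml", ET.XMLParser())
root = tree.getroot()
print(root.tag)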
Example No. 4
def _test():
    msfile = os.path.join(os.path.abspath(os.curdir), 'modulesets', moduleset)
    tree = ElementTree.ElementTree(file=msfile)
    elem = tree.getroot()

    handlers = Handlers(msfile)
    handlers.handle(elem, None)

    def filefunc(package):
        # return a relative path to the bitbake .bb which will be written
        src_uri = package.getVar('SRC_URI', 1)
        filename = package.getVar('PN', 1) + '.bb'
        if not src_uri:
            return filename
        else:
            substr = src_uri[src_uri.find('xorg/'):]
            subdirlist = substr.split('/')[:2]
            subdir = '-'.join(subdirlist)
            return os.path.join(subdir, filename)

    emitter = Emitter(filefunc)
    for package in handlers.packages:
        template = emitter.filefunc(package) + '.in'
        if os.path.exists(template):
            print("%s exists, emitting based on template" % template)
            emitter.write(package, template)
        else:
            print("%s does not exist, emitting non-templated" % template)
            emitter.write(package)
Example No. 5
def serialize(elem, encoding=None):
    import StringIO
    file = StringIO.StringIO()
    tree = ElementTree.ElementTree(elem)
    if encoding:
        tree.write(file, encoding)
    else:
        tree.write(file)
    return file.getvalue()
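A rough Python 3 equivalent of serialize(), for reference only: ET.tostring covers the plain case, and io.BytesIO replaces the Python 2 StringIO when an encoded write() with a declaration is wanted. The element built here is illustrative.

import io
import xml.etree.ElementTree as ET

elem = ET.Element("key", {"name": "demo"})
text = ET.tostring(elem, encoding="unicode")       # str, no XML declaration
buf = io.BytesIO()
ET.ElementTree(elem).write(buf, encoding="utf-8", xml_declaration=True)
data = buf.getvalue()                              # bytes, with declaration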
Example No. 6
    def envelopewrite(self, node):
        ''' write envelope for XML messages'''
        self._initwrite()
        self.normalisetree(node)
        xmltree = ET.ElementTree(self._node2xml(node))
        root = xmltree.getroot()
        ETI.include(root)
        self._xmlcorewrite(xmltree, root)
        self._closewrite()
Example No. 7
def saveKeys():
    if not _KEYFILE:
        raise Exception, 'No _KEYFILE'

    tree = ElementTree.ElementTree(ElementTree.Element('Keys'))
    for x in _KEYS:
        e = ElementTree.Element('key')
        e.text = x
        tree.getroot().append(e)
    tree.write(_KEYFILE)
Example No. 8
def write(rootElement, filename):
    if isinstance(rootElement, ElementTree.ElementTree):
        rootElement = rootElement.getroot()
    indent(rootElement)
    if filename.endswith(".gz"):
        out = GzipFile(filename, "wt")
    else:
        out = open(filename, "wt")
    print >> out, '<?xml version="1.0" encoding="UTF-8"?>'
    ElementTree.ElementTree(rootElement).write(out, "utf-8")
    out.close()
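A minimal sketch of the gzip branch on current Python (file name illustrative, not taken from the example): gzip.open in binary mode pairs with tree.write(), which produces the same UTF-8 output as the plain-file case, declaration included.

import gzip
import xml.etree.ElementTree as ET

root = ET.Element("corpus")
# write the compressed document; tree.write emits UTF-8 bytes into the gzip stream
with gzip.open("corpus.xml.gz", "wb") as out:
    ET.ElementTree(root).write(out, encoding="utf-8", xml_declaration=True)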
Example No. 9
    def include(self, element, parent):
        href = element.attrib.get('href')
        fullhref = os.path.join(self.msdirname, href)
        tree = ElementTree.ElementTree(file=fullhref)
        elem = tree.getroot()

        # Append the children of the newly included root element to the parent
        # element, and manually handle() them, as the currently running
        # iteration isn't going to hit them.
        for child in elem:
            self.handle(child, elem)
            parent.append(child)
Example No. 10
def make_keyfiles():
    x = ElementTree.Element('Keys')
    y = ElementTree.Element('key')
    y["name"] = KEY[0]
    y.text = KEY[1]
    x.append(y)
    y = ElementTree.Element('key')
    y.text = 'foo'
    x.append(y)
    b = ElementTree.ElementTree(x)
    b.write('tests/xml/keys')
    b.write('tests/xml/keys3')
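A tiny sketch of ElementTree attribute handling, for reference: attributes live in the element's .attrib mapping and are read and written with get()/set(), while indexing the element itself addresses child elements, not attributes. Names and values here are illustrative.

import xml.etree.ElementTree as ET

key = ET.Element("key")
key.set("name", "primary")        # equivalent to key.attrib["name"] = "primary"
key.text = "abc123"
assert key.get("name") == "primary"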
Example No. 11
    def _write(self, node):
        ''' write normal XML messages (no envelope)'''
        xmltree = ET.ElementTree(self._node2xml(node))
        f = botslib.opendata(self.ta_info['filename'], "wb")
        if self.ta_info['indented']:
            indentstring = '\n'
        else:
            indentstring = ''

        #xml prolog: always used.*********************************
        #syntax parameter controls if standalone is used within the prolog.
        #in ET 1.3.0: if standalone is to be used, the ET-generated prolog should be suppressed - explicit parameter
        #in ET 1.2.6: always generates a prolog if encoding != utf-8/ascii. SO: cannot use standalone for encoding != utf-8/ascii
        if ET.VERSION not in ['1.2.6', '1.0.6'] or self.ta_info['charset'] in [
                'us-ascii', 'utf-8'
        ]:
            if self.ta_info['standalone']:
                standalonestring = 'standalone="%s" ' % (
                    self.ta_info['standalone'])
            else:
                standalonestring = ''
            PI = ET.ProcessingInstruction(
                'xml', 'version="%s" encoding="%s" %s' %
                (self.ta_info['version'], self.ta_info['charset'],
                 standalonestring))
            f.write(
                ET.tostring(PI) + indentstring
            )  #do not use encoding here; gives a double xml prolog, possibly because ET.ElementTree.write is used again by write()

        #doctype /DTD **************************************
        if self.ta_info['DOCTYPE']:
            f.write('<!DOCTYPE %s>' % (self.ta_info['DOCTYPE']) + indentstring)

        #processing instructions (other than prolog) ************
        if self.ta_info['processing_instructions']:
            for pi in self.ta_info['processing_instructions']:
                PI = ET.ProcessingInstruction(pi[0], pi[1])
                f.write(
                    ET.tostring(PI) + indentstring
                )  #do not use encoding here; gives a double xml prolog, possibly because ET.ElementTree.write is used again by write()

        #indent the xml elements
        if self.ta_info['indented']:
            root = xmltree.getroot()
            self.botsindent(root)

        if ET.VERSION <= '1.2.6':
            xmltree.write(f, encoding=self.ta_info['charset'])
        else:
            xmltree.write(f,
                          encoding=self.ta_info['charset'],
                          xml_declaration=False)
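For reference, the standard-library writer still has no standalone argument, so when a standalone declaration is required the prolog has to be written by hand much as above. A minimal sketch under that assumption (file name, encoding and standalone value are illustrative):

import xml.etree.ElementTree as ET

root = ET.Element("message")
tree = ET.ElementTree(root)
with open("out.xml", "wb") as f:
    # hand-written prolog, then suppress the writer's own declaration
    f.write(b'<?xml version="1.0" encoding="utf-8" standalone="yes"?>\n')
    tree.write(f, encoding="utf-8", xml_declaration=False)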
Example No. 12
def init(label, fileoutput):

    conarypk = ConaryPk()

    cli = conarypk.cli
    cfg = conarypk.cfg
    log.info("Attempting to retrieve repository data for %s" % label)
    try:
        pkgs = getPackagesFromLabel(cfg, cli, label)
        troves = conarypk.repos.getTroves(pkgs, withFiles=False)
        nodes = generate_xml(troves, label)
        cElementTree.ElementTree(nodes).write(fileoutput)
        log.info("Successfully wrote XML data for label %s into file %s" %
                 (label, fileoutput))
    except:
        log.error("Failed to gather data from the repository")
Example No. 13
def write(rootElement, filename):
    if isinstance(rootElement, ElementTree.ElementTree):
        rootElement = rootElement.getroot()
    indent(rootElement)
    # Create intermediate paths if needed
    if os.path.dirname(filename) != "" and not os.path.exists(
            os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    # Open the output file
    if filename.endswith(".gz"):
        out = GzipFile(filename, "wt")
    else:
        out = open(filename, "wt")
    print >> out, '<?xml version="1.0" encoding="UTF-8"?>'
    ElementTree.ElementTree(rootElement).write(out, "utf-8")
    out.close()
    # Fix newlines inside attributes
    encodeNewlines(filename)
Example No. 14
def loadKeys(fname=None, force_=False):
    global _KEYS, _KEYFILE
    if not fname:
        fname = os.path.expanduser('~/.isbndbkeys')
    if not os.path.exists(fname):
        a = ElementTree.Element('Keys')
        ElementTree.ElementTree(a).write(fname)
    _KEYFILE = fname
    _KEYS = dict()
    tree = ElementTree.parse(fname)
    for x in tree.findall('key'):
        try:
            _KEYS[x.get("name")] = Key(x.text, x.get("name"))
        except:
            if force_:
                _KEYS[x.get("name")] = Key(x.text, x.get("name"), force_=True)
            else:
                pass
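A minimal sketch of the read-back pattern used above (file name illustrative): parse() returns an ElementTree, and findall("key") matches the direct <key> children of the root.

import xml.etree.ElementTree as ET

tree = ET.parse("keys.xml")
# map each key's name attribute to its text content
keys = {k.get("name"): k.text for k in tree.findall("key")}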
Example No. 15
def convert(srFiles, xmlFileName, outdir, corpusName, idByNorText=False):
    print >> sys.stderr, "Loading Static Relations"
    events = {}
    for srFile in srFiles:
        readEventsFromSR(srFile[0], srFile[1], events, idByNorText=idByNorText)
    
    if xmlFileName != None:
        xmlEvents = {}
        dataSets = {}
        srTexts = {} # original, unnormalized sentence texts from the SR corpus
        eventsToXML(events, xmlEvents, dataSets, srTexts)
        
        print >> sys.stderr, "Loading XML"
        xml = ETUtils.ETFromObj(xmlFileName)
        print >> sys.stderr, "Inserting XML events"
        insertEvents(xmlEvents, dataSets, srTexts, xml, corpusName)
        ETUtils.write(xml, outdir+corpusName+"-srevents.xml")
        # update pre-existing parses
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-mccc-preparsed", tokenization=None, output=outdir+corpusName+"-heads.xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, corpusName + "-", ".xml", [("devel", "train")])
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(outdir+corpusName + "-devel.xml", outdir + corpusName + "-stformat-devel", outputTag="rel", task=2, debug=True, validate=False)
        STConvert.toSTFormat(outdir+corpusName + "-train.xml", outdir + corpusName + "-stformat-train", outputTag="rel", task=2, debug=True, validate=False)
    else:
        xml = eventsToNewXML(events)
        xmlTree = ET.ElementTree(xml)
        ETUtils.write(xml, outdir+corpusName+"-srevents.xml")
        xml = xmlTree
        # Parse
        bigfileName = outdir+corpusName
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName="PARSED_TEXT", parseName="McClosky", requireEntities=True, timeout=60)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName+"-stanford.xml")
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outDir, "SRNE-", ".xml")    
Example No. 16
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    
    bigfileName = os.path.join(outDir, "DDI")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]
    
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    if True:
        documents, docById, docCounts = loadDocs(trainUnified, outDir, tempdir)
        
        sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
        datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
        for i in range(0, len(sortedDocCounts)-3, 4):
            for j in [0,1]:
                docById[sortedDocCounts[i+j][0]].set("set", "train")
                datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
                datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
            docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
            docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
            datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
            datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
            datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
        for document in documents: # leftover documents that did not divide evenly go to the train set
            if document.get("set") == None:
                document.set("set", "train")
        
        print datasetCounts
        for key in datasetCounts.keys():
            if datasetCounts[key][1] != 0:
                print key, datasetCounts[key][0] / float(datasetCounts[key][1])
            else:
                print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
        
        if testUnified != None:
            testDocuments, testDocById, testDocCounts = loadDocs(testUnified, tempdir)
            for document in testDocuments:
                document.set("set", "test")
            documents = documents + testDocuments
        
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DrugDDI")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    #sys.exit()
        
    if False:
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=True, timeout=10)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName+"-stanford.xml")
    
        #if True:
        #xml = bigfileName + "-stanford.xml"        
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
        #InteractionXML.DivideSets.processCorpus(oldXML, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #if "devel" in [x[0] for x in datasets]:
    #    print >> sys.stderr, "Creating empty devel set"
    #    deletionRules = {"interaction":{},"entity":{"isName":"False"}}
    #    InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    #return xml
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example No. 17
    def _write(self, node):
        ''' write normal XML messages (no envelope)'''
        xmltree = ET.ElementTree(self._node2xml(node))
        root = xmltree.getroot()
        self._xmlcorewrite(xmltree, root)
Example No. 18
        try:
            data = data.encode(encoding)
        except UnicodeDecodeError:
            pass

        # Guess who we're getting this from?
        data = data.replace('\x00', '')
        try:
            parser.feed(data)
        except Exception, error:
            if dump_invalid_data:
                print error, repr(data)
            parser.close()
            raise
        else:
            return ET.ElementTree(parser.close())

    def get_device_xml(self):
        """
        Get the main device descriptor xml file.
        """
        xml = """<root>
    <specVersion>
        <major>1</major>
        <minor>0</minor>
    </specVersion>
    <device>
        <deviceType>urn:schemas-upnp-org:device:MediaRenderer:1</deviceType>
        <friendlyName>{friendly_name}</friendlyName>
        <manufacturer>{manufacturer}</manufacturer>
        <manufacturerURL>{manufacturer_url}</manufacturerURL>
Example No. 19
def convertDDI(outDir,
               downloadDir=None,
               redownload=False,
               makeIntermediateFiles=True,
               debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir,
                                      downloadDir)

    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"

    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(),
                             key=lambda (k, v): (v, k),
                             reverse=True)
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set(
            "set",
            "train")  #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set(
            "set",
            "devel")  #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][
            0]  #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][
            1]  #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][
            0]  #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][
            1]  #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents:  # leftover documents that did not divide evenly go to the train set
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in [
            'DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334',
            'DrugDDI.d337', 'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354',
            'DrugDDI.d373', 'DrugDDI.d379', 'DrugDDI.d383', 'DrugDDI.d388',
            'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 'DrugDDI.d409',
            'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
            'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452',
            'DrugDDI.d462', 'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474',
            'DrugDDI.d480', 'DrugDDI.d482', 'DrugDDI.d485', 'DrugDDI.d492',
            'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 'DrugDDI.d500',
            'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
            'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552',
            'DrugDDI.d554', 'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570',
            'DrugDDI.d578'
    ]:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"),
                                            testDocById[key].get("origId"),
                                            sorted(docById.keys()),
                                            sorted(testDocById.keys()),
                                            sorted(overlappingIds))
        docById.update(testDocById)

    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(trainMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(testMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"],
                                      os.path.join(Settings.DATAPATH,
                                                   "TEES-parses"),
                                      downloadDir,
                                      redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH,
                                     "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml,
                              splitTarget,
                              tokenization=None,
                              output=None,
                              removeExisting=True)

    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")

    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example No. 20
def convertChemProt(inDirs=None,
                    setNames=None,
                    outPath=None,
                    goldTestSet=True,
                    downloadDir=None,
                    extractDir=None,
                    redownload=False,
                    debug=False):
    tempDir = None
    if inDirs == None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir == None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            if Settings.URL["CP17_" + setName] != None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir,
                                                 setName.lower())
                inDirs.append(
                    downloadFile(Settings.URL["CP17_" + setName], downloadDir,
                                 currentExtractDir, redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv")
                    and any([x in filename for x in filetypes])):
                continue
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames != None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(
            dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()
    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source": corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}
    entitiesByDoc = {}
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f,
                    delimiter="\t",
                    fieldnames=["id", "title", "abstract"],
                    quoting=csv.QUOTE_NONE):
                document = ET.Element(
                    "document", {
                        "id": corpusName + ".d" + str(counts["documents"]),
                        "origId": row["id"],
                        "set": dataSetId
                    })
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset",
                             Range.tuplesToCharOffset((0, len(row["title"]))))
                if document.get("origId") in docById:
                    assert document.get("text") == docById[document.get(
                        "origId")].get("text")
                    assert document.get("titleOffset") == docById[document.get(
                        "origId")].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[document.get("origId")] = document
                    counts["documents"] += 1
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f,
                    delimiter="\t",
                    fieldnames=["docId", "id", "type", "begin", "end", "text"],
                    quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(
                        document, "entity", {
                            "id":
                            document.get("id") + ".e" +
                            str(len([x for x in document.findall("entity")]))
                        })
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set(
                        "normalized",
                        "True" if row["type"].endswith("-Y") else "False")
                    entity.set(
                        "charOffset",
                        Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", row[
                        "docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f,
                                             delimiter="\t",
                                             fieldnames=[
                                                 "docId", "group", "groupEval",
                                                 "type", "arg1", "arg2"
                                             ],
                                             quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId +
                                                             ":")
                        row["arg" + argId] = row["arg" + argId][5:]
                    document = docById[row["docId"]]
                    e1 = entityById[row["docId"]].get(row["arg1"])
                    e2 = entityById[row["docId"]].get(row["arg2"])
                    if e1 != None and e2 != None:
                        interaction = ET.SubElement(
                            document, "interaction", {
                                "id":
                                document.get("id") + ".i" + str(
                                    len([
                                        x for x in document.findall(
                                            "interaction")
                                    ]))
                            })
                        interaction.set("directed", "True")
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set(
                            "evaluated",
                            "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        counts["interaction-error"] += 1
                        docsWithErrors.add(row["docId"])
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        print >> sys.stderr, "dataset", dataSetId, {
            x: counts[x] - prevCounts.get(x, 0)
            for x in counts if counts[x] - prevCounts.get(x, 0) > 0
        }
    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    if tempDir != None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath != None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)
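Stripped of the corpus-specific bookkeeping, the element-building pattern used throughout this converter is plain ET.SubElement nesting; a minimal sketch with illustrative ids and offsets:

import xml.etree.ElementTree as ET

corpus = ET.Element("corpus", {"source": "CP17"})
document = ET.SubElement(corpus, "document", {"id": "CP17.d0", "set": "train"})
entity = ET.SubElement(document, "entity", {"id": "CP17.d0.e0", "type": "CHEMICAL"})
entity.set("charOffset", "0-9")
tree = ET.ElementTree(corpus)   # ready to be written out, e.g. via tree.write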
Example No. 21
            elem.tail = i


def parse_xml(data, encoding="utf-8", dump_invalid_data=False):
    try:
        p = ET.XMLParser(encoding=encoding)
    except exceptions.TypeError:
        p = ET.XMLParser()

    # my version of twisted.web returns page_infos as a dictionary in
    # the second item of the data list
    if isinstance(data, (list, tuple)):
        data, _ = data

    try:
        data = data.encode(encoding)
    except UnicodeDecodeError:
        pass

    # Guess who we're getting this from?
    data = data.replace('\x00', '')
    try:
        p.feed(data)
    except Exception, error:
        if dump_invalid_data:
            print error, repr(data)
        p.close()
        raise Exception, error
    else:
        return ET.ElementTree(p.close())
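A minimal sketch of the feed()/close() protocol that parse_xml relies on: data can be pushed into an XMLParser in chunks, and close() returns the root element, which is then wrapped in an ElementTree. The document fed here is illustrative.

import xml.etree.ElementTree as ET

parser = ET.XMLParser()
parser.feed("<root><child/>")
parser.feed("</root>")
tree = ET.ElementTree(parser.close())   # close() hands back the root element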
Example No. 22
    print >> sys.stderr, 'Error: Module', module, 'or package', package, 'not found in CPAN.'
    sys.exit(1)

modules.sort()

if package_version:
    version = package_version
location = path

#print >>sys.stderr, 'We found package %s with version %s with modules:' % (package, version)

ppath = path.split('/')
mnemo = ppath[2]

### Find specific author in CPAN authors list
tree = ElementTree.ElementTree(file=os.path.join(tmppath, '00whois.xml'))
root = tree.getroot()
for elem in root.getiterator('{http://www.cpan.org/xmlns/whois}cpanid'):
    if mnemo == elem.find('{http://www.cpan.org/xmlns/whois}id').text:
        try:
            authorel = elem.find('{http://www.cpan.org/xmlns/whois}fullname')
            emailel = elem.find('{http://www.cpan.org/xmlns/whois}email')
            email = emailel.text.replace('@', '$').replace('.', ',').replace(
                ' at ', '$').replace(' dot ',
                                     ',').replace(' [at] ',
                                                  '$').replace('_dot_', ',')
            author = "%s <%s>" % (authorel.text, email)
            authors.append(author.encode('utf8', 'replace'))
        except:
            pass
        break
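A short sketch of the namespace handling in this last example: the Clark-notation tags can also be expressed through a prefix map passed to find(), which reads a little less noisily. The namespace URI comes from the snippet; the file name is illustrative.

import xml.etree.ElementTree as ET

NS = {"whois": "http://www.cpan.org/xmlns/whois"}
tree = ET.parse("00whois.xml")
for cpanid in tree.getroot().iter("{http://www.cpan.org/xmlns/whois}cpanid"):
    ident = cpanid.find("whois:id", NS)
    if ident is not None:
        print(ident.text)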