def buildExamplesForDocuments(self,
                                  documentSentences,
                                  output,
                                  idFileTag=None):
        examples = []
        counter = ProgressCounter(len(documentSentences), "Build examples")

        #calculatePredictedRange(self, sentences)

        outfile = open(output, "wt")
        exampleCount = 0
        for document in documentSentences:
            counter.update(
                1,
                "Building examples (" + document[0].sentence.get("id") + "): ")
            examples = self.buildExamples(document)
            exampleCount += len(examples)
            #examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
        outfile.close()

        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag != None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
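Nearly every snippet on this page follows the same ProgressCounter idiom: construct the counter with the number of items and a task name, call update() once per item with a short status string, and print summary counts to stderr at the end. The sketch below only distills that pattern as inferred from the listed examples (Python 2, like the surrounding code); processItems and its items argument are hypothetical, not part of any listed file.

import sys
from Utils.ProgressCounter import ProgressCounter

def processItems(items):
    # Total item count plus a task name; the examples also show
    # ProgressCounter(None, ...) for open-ended counts.
    counter = ProgressCounter(len(items), "Process items")
    counter.showMilliseconds = True  # optional, used by the sentence-graph examples
    for item in items:
        counter.update(1, "Processing (" + str(item) + "): ")
        # ... per-item work would go here ...
    print >> sys.stderr, "Processed", len(items), "items"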
Example no. 2
def buildExamples(exampleBuilder, sentences, options):
    print >> sys.stderr, "Defining predicted value range:",
    sentenceElements = []
    for sentence in sentences:
        sentenceElements.append(sentence[0].sentenceElement)
    exampleBuilder.definePredictedValueRange(sentenceElements, "entity")
    print >> sys.stderr, exampleBuilder.getPredictedValueRange()
    
    examples = []
    if hasattr(exampleBuilder, "styles") and "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")
    for sentence in sentences:
        counter.update(1, "Building examples ("+sentence[0].getSentenceId()+"): ")
        sentence[1] = exampleBuilder.buildExamples(sentence[0])
        examples.extend(sentence[1])
    print >> sys.stderr, "Examples built:", len(examples)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Preprocessing examples:"
    examples = exampleBuilder.preProcessExamples(examples)
    # Save examples
#    if options.output != None:
#        print >> sys.stderr, "Saving examples to", options.output + "/examples.txt"
#        commentLines = []
#        commentLines.append("Input file: " + options.input)
#        commentLines.append("Example builder: " + options.exampleBuilder)
#        commentLines.append("Features:")
#        commentLines.extend(exampleBuilder.featureSet.toStrings())
#        Example.writeExamples(examples, options.output + "/examples.txt", commentLines)
    #examples = filterFeatures(exampleBuilder.featureSet, examples)
    #Example.normalizeFeatureVectors(examples)
    return examples
Example no. 3
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example no. 4
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
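The rules argument is constructed elsewhere; from the split("|") above it appears to map the element types to be deleted to attribute names with pipe-separated allowed values. A hypothetical illustration of calling this variant (the concrete type and attribute names below are placeholders, not taken from the source):

# Hypothetical rules mapping: element type -> attribute -> "value1|value2".
rules = {"interaction": {"type": "Binding|Phosphorylation"},
         "entity": {"type": "Protein"}}
processCorpus("corpus.xml", "filtered.xml", rules)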
Example no. 5
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token closest
    to the root for the subtree of the dependency parse spanned by the text of the element.
    
    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @param verbose: boolean
    """
    counts = [0,0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            continue # without both a parse and a tokenization the head cannot be determined
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
Example no. 6
    def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
        examples = []
        counter = ProgressCounter(len(sentences), "Build examples")

        if append:
            outfile = open(output, "at")
        else:
            outfile = open(output, "wt")
        exampleCount = 0
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = [None]
            if goldSentences != None:
                goldSentence = goldSentences[i]
            counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = self.buildExamples(sentence[0], goldSentence[0], append=append)
            exampleCount += len(examples)
            examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
        outfile.close()

        print >>sys.stderr, "Examples built:", exampleCount
        print >>sys.stderr, "Features:", len(self.featureSet.getNames())
        # IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        # ENDIF
        # Save Ids
        if idFileTag != None:
            print >>sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
Example no. 7
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".",1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for k in sorted(rules.keys()):
        countsByType[k] = 0
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example no. 8
 def prepareDocuments(self,
                      corpusRoot,
                      files,
                      conllFormat=None,
                      counts=None):
     print >> sys.stderr, "Generating document elements from the parses"
     docNames = sorted(files.keys())
     corpusName = corpusRoot.get("source", "CORPUS")
     #parseExtensions = set(["ptb", "conll", "conllx", "conllu"])
     counter = ProgressCounter(len(docNames), "Document Generation")
     for i in range(len(docNames)):
         docName = docNames[i]
         counter.update(
             1, "Making document element for document '" + str(docName) +
             "': ")
         #filePaths = files[docName]
         extensions = sorted(files[docName].keys())
         sentObjs = self.readParse(extensions[0],
                                   files[docName][extensions[0]],
                                   conllFormat)
         sentTexts = []
         for sentObj in sentObjs:
             if "tokens" in sentObj:
                 sentTexts.append(" ".join(
                     [x["text"] for x in sentObj["tokens"]]))
         docText = " ".join(sentTexts)
         ET.SubElement(corpusRoot,
                       "document",
                       id=corpusName + ".d" + str(i),
                       origId=docName,
                       text=docText)
     return [x for x in corpusRoot.findall("document")]
Example no. 9
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            if type(rules[eType][attrRule]) in types.StringTypes:
                rules[eType][attrRule] = rules[eType][attrRule].split("|")

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example no. 10
def processCorpora(EvaluatorClass, fromCorpus, toCorpus, target, classSets,
                   negativeClassId, entityMatchFunction):
    entityExamples = []
    entityPredictions = []
    interactionExamples = []
    interactionPredictions = []
    eventExamples = []
    eventPredictions = []
    falseEntity = defaultdict(lambda: defaultdict(int))
    counter = ProgressCounter(len(fromCorpus.sentences), "Corpus Processing")
    # Loop through the sentences and collect all predictions
    toCorpusSentences = None
    if toCorpus != None:
        toCorpusSentences = toCorpus.documentSentences
    for i in range(len(fromCorpus.documentSentences)):
        if len(fromCorpus.documentSentences[i]) > 0:
            counter.update(
                len(fromCorpus.documentSentences[i]),
                fromCorpus.documentSentences[i][0].sentence.get("id").rsplit(
                    ".", 1)[0])
        if toCorpusSentences != None:
            newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(
                fromCorpus.documentSentences[i], toCorpusSentences[i], target,
                classSets, negativeClassId, entityMatchFunction)
        else:
            newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(
                fromCorpus.documentSentences[i], None, target, classSets,
                negativeClassId, entityMatchFunction)
        entityExamples.extend(newEntityExPred[0])
        entityPredictions.extend(newEntityExPred[1])
        interactionExamples.extend(newInteractionExPred[0])
        interactionPredictions.extend(newInteractionExPred[1])
        eventExamples.extend(newEventExPred[0])
        eventPredictions.extend(newEventExPred[1])
        for k, v in sentFalseEntity.iteritems():
            falseEntity[k][0] += v[0]
            falseEntity[k][1] += v[1]

    # Process the predictions with an evaluator and print the results
    evaluator = None
    if len(entityPredictions) > 0:
        evaluator = EvaluatorClass(entityExamples,
                                   entityPredictions,
                                   classSet=classSets["entity"])
        print evaluator.toStringConcise(title="Entities")
    if len(interactionPredictions) > 0:
        evaluator = EvaluatorClass(interactionExamples,
                                   interactionPredictions,
                                   classSet=classSets["interaction"])
        print evaluator.toStringConcise(title="Interactions")
        #print "Interactions (fp ent->fp int, fn-ent->fn-int )"
        #for key in sorted(falseEntity.keys()):
        #    print "", key, falseEntity[key][0], "/", falseEntity[key][1]
    if len(eventPredictions) > 0:
        evaluator = EvaluatorClass(eventExamples,
                                   eventPredictions,
                                   classSet=classSets["entity"])
        print evaluator.toStringConcise(title="Events")
    return evaluator
Example no. 11
 def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
     corpusTree, corpusRoot = self.getCorpus(input)
     if not os.path.exists(parseDir):
         raise Exception("Cannot find parse input '" + str(parseDir) + "'")
     if not os.path.isdir(parseDir):
         raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
     if extensions == None:
         extensions = self.allExt
     elif isinstance(extensions, basestring):
         extensions = extensions.split(",")
     extensions = [x for x in extensions if x in self.allExt]
     unescapeFormats = self.getUnescapeFormats(unescapeFormats)
     if docMatchKeys == None:
         docMatchKeys = ["origId", "pmid", "id"]
     elif isinstance(docMatchKeys, basestring):
         docMatchKeys = docMatchKeys.split(",")
     print >> sys.stderr, "Inserting parses from file types:", extensions
     counts = defaultdict(int)
     files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
     typeCounts = {x:defaultdict(int) for x in extensions}
     # Make document elements if needed
     documents = [x for x in corpusRoot.findall("document")]
     if len(documents) == 0:
         typeCounts["document-generation"] = defaultdict(int)
         documents = self.prepareDocuments(corpusRoot, files)
     counter = ProgressCounter(len(files), "Parse Insertion")
     # Insert parses and make sentence elements if needed
     typeCounts["sentence-splitting"] = defaultdict(int)
     print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
     for document in documents:
         counts["document"] += 1
         matchFound = False
         for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
             if docMatchValue in files:
                 if matchFound:
                     raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                 matchFound = True
                 counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                 counts["document-match"] += 1
                 for ext in extensions:
                     if ext not in files[docMatchValue]:
                         continue
                     counts[ext + "-match"] += 1
                     sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                     self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
         if not matchFound:
             counts["document-no-match"] += 1
     if len(typeCounts["sentence-splitting"]) > 0:
         print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
     print >> sys.stderr, "Counts", dict(counts)
     for ext in extensions:
         if len(typeCounts[ext]) > 0:
             print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Example no. 12
    def run(
        cls,
        fileIn,
        fileOut=None,
        tokenization="split-Charniak-Lease",
        entityOffsetKey="charOffset",
        includeNeg=False,
        stem=False,
    ):
        """Builds the master gazzeteer.
        fileIn: a string (ending with .xml or .xml.gz), an open input stream, an ElementTree or an Element
        fileOut: a string or None. If given, the resulting gazzetteer will be written out
        tokenization: name of the tokenization to be used

        Produces a dictionary with...
        """

        print >>sys.stderr, "Building gazetteer"

        gztr = {}  # key: token value: dictionary (key: className, value count)
        root = ETUtils.ETFromObj(fileIn)
        if not ET.iselement(root):
            assert isinstance(root, ET.ElementTree)
            root = root.getroot()
        sentences = []
        for sNode in root.getiterator("sentence"):
            sentences.append(sNode)
        counter = ProgressCounter(len(sentences), "Build gazetteer")
        for sNode in sentences:
            counter.update(1, "Adding to gazetteer sentence " + sNode.get("id") + ", ")
            for tokenizationNode in sNode.getiterator("tokenization"):
                if tokenizationNode.get("tokenizer") == tokenization:
                    break
            else:
                assert False, "Did not find %s tokenization" % tokenization
            tClasses = tokClasses(tokenizationNode, sNode, entityOffsetKey)
            assert len(tClasses) == len(tokenizationNode)
            for tokIdx, tokNode in enumerate(tokenizationNode):
                gsClass = tClasses[tokIdx]
                b, e = charOffStr2tuple(tokNode.get("charOffset"))
                tokNodeTxt = tokTxt(b, e, sNode, stem).lower()
                tokDict = gztr.setdefault(tokNodeTxt, {})
                tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                # for multi-part texts, add collapsed and last token versions
                if tokNodeTxt.find("-") != -1:
                    # collapsed
                    text = tokNodeTxt.replace("-", "")
                    if text != "":
                        tokDict = gztr.setdefault(text, {})
                        tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                    # last part
                    text = tokNodeTxt.rsplit("-", 1)[-1]
                    if text != "":
                        tokDict = gztr.setdefault(text, {})
                        tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
        if fileOut:
            Gazetteer.saveGztr(gztr, fileOut, includeNeg)
        return gztr
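A hedged invocation of this classmethod, assuming the enclosing class is named Gazetteer (as the saveGztr call above suggests) and using placeholder file paths:

gztr = Gazetteer.run("corpus.xml.gz", fileOut="gazetteer.txt",
                     tokenization="split-Charniak-Lease", includeNeg=False)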
Example no. 13
def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.
    """
    import cElementTreeUtils as ETUtils
    import sys
    sys.path.append("..")
    from Utils.ProgressCounter import ProgressCounter
    from InteractionXML.CorpusElements import CorpusElements
    
    # Corpus may be in file or not
    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()
    # Use CorpusElements-class to access xml-tree
    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"
    # Make sentence graphs
    duplicateInteractionEdgesRemoved = 0
    sentences = []
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
        # No tokens means no sentence; likewise, no dependencies means no sentence.
        # Let's not remove them though, so that we don't lose sentences from the input.
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0: 
            #corpusElements.sentences.remove(sentence)
            sentence.sentenceGraph = None
            continue
        for pair in sentence.pairs:
            # gif-xml defines two closely related element types, interactions and
            # pairs. Pairs are like interactions, but they can also be negative (if
            # interaction-attribute == False). Sometimes pair-elements have been
            # (incorrectly) used without this attribute. To work around these issues
            # we take all pair-elements that define interaction and add them to
            # the interaction-element list.
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair) # add to interaction-elements
                if pair.get("type") == None: # type-attribute must be explicitly defined
                    pair.set("type", "undefined")
        # Construct the basic SentenceGraph (only syntactic information)
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        # Add semantic information, i.e. the interactions
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        
        graph.parseElement = sentence.parseElement
        
        #graph.mapEntityHints()
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
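A short usage sketch for this loader; "McCC" is a parse name that appears in other examples on this page, and the corpus path is a placeholder:

corpusElements = loadCorpus("corpus.xml", parse="McCC", tokenization=None)
graphs = [s.sentenceGraph for s in corpusElements.sentences if s.sentenceGraph != None]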
Example no. 14
    def run(cls,
            fileIn,
            fileOut=None,
            tokenization="split-Charniak-Lease",
            entityOffsetKey="charOffset",
            includeNeg=False,
            stem=False):
        """Builds the master gazzeteer.
        fileIn: a string (ending with .xml or .xml.gz), an open input stream, an ElementTree or an Element
        fileOut: a string or None. If given, the resulting gazzetteer will be written out
        tokenization: name of the tokenization to be used

        Produces a dictionary with...
        """

        print >> sys.stderr, "Building gazetteer"

        gztr = {}  #key: token value: dictionary (key: className, value count)
        root = ETUtils.ETFromObj(fileIn)
        if not ET.iselement(root):
            assert isinstance(root, ET.ElementTree)
            root = root.getroot()
        sentences = []
        for sNode in root.getiterator("sentence"):
            sentences.append(sNode)
        counter = ProgressCounter(len(sentences), "Build gazetteer")
        for sNode in sentences:
            counter.update(
                1, "Adding to gazetteer sentence " + sNode.get("id") + ", ")
            for tokenizationNode in sNode.getiterator("tokenization"):
                if tokenizationNode.get("tokenizer") == tokenization:
                    break
            else:
                assert False, "Did not find %s tokenization" % tokenization
            tClasses = tokClasses(tokenizationNode, sNode, entityOffsetKey)
            assert len(tClasses) == len(tokenizationNode)
            for tokIdx, tokNode in enumerate(tokenizationNode):
                gsClass = tClasses[tokIdx]
                b, e = charOffStr2tuple(tokNode.get("charOffset"))
                tokNodeTxt = tokTxt(b, e, sNode, stem).lower()
                tokDict = gztr.setdefault(tokNodeTxt, {})
                tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                # for multi-part texts, add collapsed and last token versions
                if tokNodeTxt.find("-") != -1:
                    # collapsed
                    text = tokNodeTxt.replace("-", "")
                    if text != "":
                        tokDict = gztr.setdefault(text, {})
                        tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                    # last part
                    text = tokNodeTxt.rsplit("-", 1)[-1]
                    if text != "":
                        tokDict = gztr.setdefault(text, {})
                        tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
        if fileOut:
            Gazetteer.saveGztr(gztr, fileOut, includeNeg)
        return gztr
Example no. 15
def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose: print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
        
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0    
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
        
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]
    
    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
Example no. 16
def processCorpora(EvaluatorClass, fromCorpus, toCorpus, target, classSets, negativeClassId, entityMatchFunction, errorMatrix=False, verbose=False):
    counts = defaultdict(int)
    entityExamples = []
    entityPredictions = []
    interactionExamples = []
    interactionPredictions = []
    eventExamples = []
    eventPredictions = []
    falseEntity = defaultdict(lambda: defaultdict(int))
    if not verbose:
        counter = ProgressCounter(len(fromCorpus.sentences), "Corpus Processing")
    # Loop through the sentences and collect all predictions
    toCorpusSentences = None
    if toCorpus != None:
        toCorpusSentences = toCorpus.documentSentences
    for i in range(len(fromCorpus.documentSentences)):
        if len(fromCorpus.documentSentences[i]) > 0 and not verbose:
            counter.update(len(fromCorpus.documentSentences[i]), fromCorpus.documentSentences[i][0].sentence.get("id").rsplit(".", 1)[0])
        if toCorpusSentences != None:
            newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(fromCorpus.documentSentences[i], toCorpusSentences[i], target, classSets, negativeClassId, entityMatchFunction, verbose=verbose, counts=counts)
        else:
            newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(fromCorpus.documentSentences[i], None, target, classSets, negativeClassId, entityMatchFunction, verbose=verbose, counts=counts)
        entityExamples.extend(newEntityExPred[0])
        entityPredictions.extend(newEntityExPred[1])
        interactionExamples.extend(newInteractionExPred[0])
        interactionPredictions.extend(newInteractionExPred[1])
        eventExamples.extend(newEventExPred[0])
        eventPredictions.extend(newEventExPred[1])
        for k,v in sentFalseEntity.iteritems():
            falseEntity[k][0] += v[0]
            falseEntity[k][1] += v[1]
    
    # Process the predictions with an evaluator and print the results
    evaluator = None
    if len(entityPredictions) > 0:
        evaluator = EvaluatorClass(entityExamples, entityPredictions, classSet=classSets["entity"])
        print evaluator.toStringConcise(title="Entities")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
    if len(interactionPredictions) > 0:
        evaluator = EvaluatorClass(interactionExamples, interactionPredictions, classSet=classSets["interaction"])
        print evaluator.toStringConcise(title="Interactions")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
        #print "Interactions (fp ent->fp int, fn-ent->fn-int )"
        #for key in sorted(falseEntity.keys()):
        #    print "", key, falseEntity[key][0], "/", falseEntity[key][1]
    if len(eventPredictions) > 0:
        evaluator = EvaluatorClass(eventExamples, eventPredictions, classSet=classSets["entity"])
        print evaluator.toStringConcise(title="Events")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
    return evaluator
Example no. 17
def compareToBinary(complexSentencesById, classifications, exampleBuilder, options):
    # Load corpus and make sentence graphs
    print >> sys.stderr, "Calculating performance on binary corpus"
    classificationsBySentence = {}
    for classification in classifications:
        example = classification[0][0]
        sentenceId = example[0].rsplit(".",1)[0]
        sentenceOrigId = complexSentencesById[sentenceId].sentence.attrib["origId"]
        if not classificationsBySentence.has_key(sentenceOrigId):
            classificationsBySentence[sentenceOrigId] = []
        classificationsBySentence[sentenceOrigId].append(classification)
    
    print >> sys.stderr, "Loading Binary corpus"
    binaryCorpusElements = loadCorpus(options.binaryCorpus)
    binaryClassifications = []
    counter = ProgressCounter(len(binaryCorpusElements.sentences), "Build binary classifications")
    for binarySentence in binaryCorpusElements.sentences:
        counter.update(1, "Building binary classifications ("+binarySentence.sentence.attrib["id"]+"): ")
        if(classificationsBySentence.has_key(binarySentence.sentence.attrib["origId"])):
            complexClassificationGraph = NX.XGraph(multiedges = multiedges)
            for token in binarySentence.sentenceGraph.tokens:
                complexClassificationGraph.add_node(token)
            for classification in classificationsBySentence[binarySentence.sentence.attrib["origId"]]:
                if classification[1] > 0:
                    example = classification[0][0]       
                    t1 = example[3]["t1"]
                    t2 = example[3]["t2"]
                    t1Binary = None
                    for token in binarySentence.sentenceGraph.tokens:
                        if token.attrib["charOffset"] == t1.attrib["charOffset"]:
                            t1Binary = token
                    t2Binary = None
                    for token in binarySentence.sentenceGraph.tokens:
                        if token.attrib["charOffset"] == t2.attrib["charOffset"]:
                            t2Binary = token
                    assert(t1Binary != None and t2Binary != None)
                    complexClassificationGraph.add_edge(t1Binary, t2Binary)
            paths = NX.all_pairs_shortest_path(complexClassificationGraph, cutoff=999)
            for pair in binarySentence.pairs:
                t1 = binarySentence.sentenceGraph.entityHeadTokenByEntity[pair.attrib["e1"]]
                t2 = binarySentence.sentenceGraph.entityHeadTokenByEntity[pair.attrib["e2"]]
                assert(pair.attrib["interaction"] == "True" or pair.attrib["interaction"] == "False")
                if pair.attrib["interaction"] == "True":
                    pairClass = 1
                else:
                    pairClass = -1
                extra = {"xtype":"edge","type":"i","t1":t1,"t2":t2}
                if paths.has_key(t1) and paths[t1].has_key(t2):
                    binaryClassifications.append( [[pair.attrib["id"], pairClass, None, extra], 1, "binary"] )
                else:
                    binaryClassifications.append( [[pair.attrib["id"], pairClass, None, extra], -1, "binary"] )
    print >> sys.stderr, "Evaluating binary classifications"
    evaluation = Evaluation(binaryClassifications, classSet=exampleBuilder.classSet)
    print >> sys.stderr, evaluation.toStringConcise()
    if options.output != None:
        evaluation.saveCSV(options.output + "/binary_comparison_results.csv")                    
Example no. 18
def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.
    """
    import Utils.ElementTreeUtils as ETUtils
    import sys
    from Utils.ProgressCounter import ProgressCounter
    from Utils.InteractionXML.CorpusElements import CorpusElements
    
    # Corpus may be in file or not
    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()
    # Use CorpusElements-class to access xml-tree
    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"
    # Make sentence graphs
    duplicateInteractionEdgesRemoved = 0
    sentences = []
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
        # No tokens means no sentence; likewise, no dependencies means no sentence.
        # Let's not remove them though, so that we don't lose sentences from the input.
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0: 
            #corpusElements.sentences.remove(sentence)
            sentence.sentenceGraph = None
            continue
        for pair in sentence.pairs:
            # gif-xml defines two closely related element types, interactions and
            # pairs. Pairs are like interactions, but they can also be negative (if
            # interaction-attribute == False). Sometimes pair-elements have been
            # (incorrectly) used without this attribute. To work around these issues
            # we take all pair-elements that define interaction and add them to
            # the interaction-element list.
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair) # add to interaction-elements
                if pair.get("type") == None: # type-attribute must be explicitly defined
                    pair.set("type", "undefined")
        # Construct the basic SentenceGraph (only syntactic information)
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        # Add semantic information, i.e. the interactions
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        
        graph.parseElement = sentence.parseElement
        
        #graph.mapEntityHints()
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
Example no. 19
def processCorpus(input, output, wordVectorPath, tokenizerName="McCC", max_rank_mem=100000, max_rank=10000000):
    print >> sys.stderr, "Making vocabulary"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    vocabulary = {"indices":{}, "vectors":[]}
    
    print >> sys.stderr, "Loading word vectors from", wordVectorPath
    print >> sys.stderr, "max_rank_mem", max_rank_mem
    print >> sys.stderr, "max_rank", max_rank
    max_rank_mem = int(max_rank_mem)
    max_rank = int(max_rank)
    wv = WV.load(wordVectorPath, max_rank_mem, max_rank)
    dimVector = wv.vectors.shape[1]
    print >> sys.stderr, "WordVector length", dimVector
    #addVector("[out]", wv.w_to_normv("and").tolist(), vocabulary) #addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    #addVector("[OoV]", wv.w_to_normv("and").tolist(), vocabulary) #addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    counts = defaultdict(int)
    for document in documents:
        counter.update()
        counts["document"] += 1
        for sentence in document.findall("sentence"):
            counts["sentence"] += 1
            tokenization = IXMLUtils.getTokenizationElement(sentence, tokenizerName)
            if tokenization != None:
                counts["tokenization"] += 1
                for token in tokenization.findall("token"):
                    counts["token"] += 1
                    text = token.get("text")
                    if text not in vocabulary["indices"]:
                        counts["token-unique"] += 1
                        vector = wv.w_to_normv(token.get("text").lower())
                        if vector is not None:
                            counts["vector"] += 1
                            vector = vector.tolist() + [0.0, 0.0]
                            addVector(text, vector, vocabulary)
                        else:
                            counts["no-vector"] += 1           
    
    print >> sys.stderr, "Counts:", dict(counts)
    
    if output != None:
        print >> sys.stderr, "Writing vectors to", output + "-vectors.json.gz"
        with gzip.open(output + "-vectors.json.gz", "wt") as f:
            json.dump(vocabulary, f)
        print >> sys.stderr, "Writing indices to", output + "-indices.json.gz"
        with gzip.open(output + "-indices.json.gz", "wt") as f:
            json.dump({"indices":vocabulary["indices"], "vectors":None}, f)
    return vocabulary
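A hedged call to this vocabulary builder; the corpus and word-vector paths are placeholders, and "McCC" is the tokenizer name used in other examples here. With output set to "vocabulary" it would write vocabulary-vectors.json.gz and vocabulary-indices.json.gz as above.

vocab = processCorpus("corpus.xml", "vocabulary", "wordvectors.bin",
                      tokenizerName="McCC")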
Example no. 20
 def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True):
     # Create intermediate paths if needed
     if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
         os.makedirs(os.path.dirname(output))
     # Open output file
     openStyle = "wt"
     if append:
         #print "Appending examples"
         openStyle = "at"
     if output.endswith(".gz"):
         outfile = gzip.open(output, openStyle)
     else:
         outfile = open(output, openStyle)
     
     # Build examples
     self.exampleCount = 0
     if type(input) in types.StringTypes:
         self.elementCounts = self.getElementCounts(input)
         if self.elementCounts["sentences"] > 0:
             self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
         else:
             self.elementCounts = None
             self.progress = ProgressCounter(None, "Build examples")
     else:
         self.elementCounts = None
         self.progress = ProgressCounter(None, "Build examples")
     
     self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))
     
     inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization)            
     
     #goldIterator = []
     if gold != None:
         goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization)
         for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
             assert inputSentences != None
             assert goldSentences != None
             self.processDocument(inputSentences, goldSentences, outfile)
     else:
         for inputSentences in inputIterator:
             self.processDocument(inputSentences, None, outfile)
     outfile.close()
     self.progress.endUpdate()
     
     # Show statistics
     print >> sys.stderr, "Examples built:", self.exampleCount
     print >> sys.stderr, "Features:", len(self.featureSet.getNames())
     print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
     if self.exampleStats.getExampleCount() > 0:
         self.exampleStats.printStats()
 
     # Save Ids
     if allowNewIds:
         self.saveIds()
Example no. 21
def compareDocuments(documentMap, targetFiles, options):
    documentIds = sorted(documentMap.keys())
    counter = ProgressCounter(len(documentIds))
    stats = {}
    eventStats = {
        "Start Events": 0,
        "End Events": 0,
        "False Positive Trigger": 0
    }  #,
    #"Cause FN":0,
    #"Cause FP":0,
    #"Theme FN":0,
    #"Theme FP":0}
    for docId in documentIds:
        counter.update(1, "Processing: ")  # document " + str(docId) + ": " )
        for fileName in sorted(documentMap[docId]):
            extension = fileName.split(".", 1)[-1]
            addStat(stats, extension, "source")
            if os.path.exists(os.path.join(options.output, fileName)):
                addStat(stats, extension, "target")
                if extension == "txt" or extension == "a1":
                    if compareByLine(fileName, options):
                        addStat(stats, extension, "identical")
                    else:
                        addStat(stats, extension, "different")
                        if options.verbose:
                            print >> sys.stderr, " ...in comparing", fileName
                elif extension == "a2.t1":
                    if compareA2T1Files(fileName, options, eventStats):
                        addStat(stats, extension, "identical")
                    else:
                        addStat(stats, extension, "different")
                        if options.verbose:
                            print >> sys.stderr, " ...in comparing", fileName
    print >> sys.stderr, "Files (source, target, identical, different):"
    for key in sorted(stats.keys()):
        print >> sys.stderr, " " + key + ":" + (10 - len(key)) * " " + "\t",
        for value in stats[key]:
            print >> sys.stderr, "\t" + str(value),
        print >> sys.stderr
    print >> sys.stderr, "Event stats:"
    for key in sorted(eventStats.keys()):
        print >> sys.stderr, " " + key + ": " + str(eventStats[key])
    print >> sys.stderr, "Event extraction:"
    eventsSource = eventStats["Start Events"]
    events0 = 0
    if eventStats.has_key("Error Level 0"):
        events0 = eventStats["Error Level 0"]
    if eventsSource == 0:
        percent = 0
    else:
        percent = (100.0 * events0 / eventsSource)
    print >> sys.stderr, " Exact:", events0, "/", eventsSource, "(%.2f" % percent + " %)"
Example no. 22
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")                
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"    
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
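A hedged call for the non-iterative branch; the paths are placeholders and the parse name follows conventions seen elsewhere on this page:

findHeads("corpus.xml", parse="McCC", tokenization=None,
          output="corpus-heads.xml", removeExisting=True, iterate=False)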
Example no. 23
def processCorpus(input, outDir, stem, tail, mergedSets=[]):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key(None):
                countsByType[None] = 0
            countsByType[docSet] += 1
            continue
        if not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
        
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0    
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
        
    print >> sys.stderr, "New Sets"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]
    
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    
    print >> sys.stderr, "Writing output files to directory", outDir
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
Example no. 24
def compareDocuments(documentMap, targetFiles, options):
    documentIds = sorted(documentMap.keys())
    counter = ProgressCounter(len(documentIds))
    stats = {}
    eventStats = {"Start Events":0, 
                  "End Events":0,
                  "False Positive Trigger":0}#,
                  #"Cause FN":0,
                  #"Cause FP":0,
                  #"Theme FN":0,
                  #"Theme FP":0}
    for docId in documentIds:
        counter.update(1, "Processing: ")# document " + str(docId) + ": " )
        for fileName in sorted(documentMap[docId]):
            extension = fileName.split(".",1)[-1]
            addStat(stats, extension, "source")
            if os.path.exists(os.path.join(options.output, fileName)):
                addStat(stats, extension, "target")
                if extension == "txt" or extension == "a1":
                    if compareByLine(fileName, options):
                        addStat(stats, extension, "identical")
                    else:
                        addStat(stats, extension, "different")
                        if options.verbose: print >> sys.stderr, " ...in comparing", fileName
                elif extension == "a2.t1":
                    if compareA2T1Files(fileName, options, eventStats):
                        addStat(stats, extension, "identical")
                    else:
                        addStat(stats, extension, "different")
                        if options.verbose: print >> sys.stderr, " ...in comparing", fileName
    print >> sys.stderr, "Files (source, target, identical, different):"
    for key in sorted(stats.keys()):
        print >> sys.stderr, " " + key + ":" + (10-len(key)) * " " + "\t",
        for value in stats[key]:
            print >> sys.stderr, "\t" + str(value),
        print >> sys.stderr
    print >> sys.stderr, "Event stats:"
    for key in sorted(eventStats.keys()):
        print >> sys.stderr, " " + key + ": " + str(eventStats[key])
    print >> sys.stderr, "Event extraction:"
    eventsSource = eventStats["Start Events"]
    events0 = 0
    if eventStats.has_key("Error Level 0"):
        events0 = eventStats["Error Level 0"]
    if eventsSource == 0:
        percent = 0
    else:
        percent = (100.0 * events0 / eventsSource)
    print >> sys.stderr, " Exact:", events0, "/", eventsSource, "(%.2f" % percent + " %)"
Example no. 25
 def build(cls, input, output, parse, tokenization=None, includeNeg=False):
     p = PathGazetteer(includeNeg)
     sentences = cls.getSentences(input, parse, tokenization)
     
     counter = ProgressCounter(len(sentences), "Build path gazetteer")
     for sentence in sentences:
         counter.update(1, "Building path gazetteer ("+sentence[0].getSentenceId()+"): ")
         p.processSentence(sentence[0])
     p.calculateFractions()
     
     f = open(output, "wt")
     for key in sorted(p.gazetteer.keys()):
         v = p.gazetteer[key]
         f.write(key + " " + str(v[0]) + " " + str(v[1]) + " " + str(v[2]) + " " + str(v[3]) + "\n")
     f.close()
Example no. 26
def processCorpus(input, attrs=["text"]):
    print attrs
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    interactors = {}
    for document in documents:
        entDict = {}
        for entity in document.getiterator("entity"):
            entDict[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entDict[interaction.get("e1")]
            e2 = entDict[interaction.get("e2")]
            # form identifier tuples
            e1Tuple = []
            for attr in attrs:
                e1Tuple.append(e1.get(attr))
            e1Tuple = tuple(e1Tuple)
            e2Tuple = []
            for attr in attrs:
                e2Tuple.append(e2.get(attr))
            e2Tuple = tuple(e2Tuple)
            interactors = [e1Tuple, e2Tuple]
            #interactors.sort()
            print interactors
Example no. 27
def process(input, output=None, preprocess=True, debug=False):
    """
    Run MetaMap.
    """    
    counter = ProgressCounter(id="MetaMap")
    
    # Create working directory
    workdir = tempfile.mkdtemp()
    
    outWriter = None
    if output != None:
        outWriter = ETUtils.ETWriter(output)
    
    # Loop iteratively over elements
    skip = False
    for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if event == "start": # element start message, element may not be fully read yet
            if element.tag == "sentence":
                sentence = element
                counter.update(1, "Processing MetaMap ("+sentence.get("id")+"): ")
                # Run metamap for the sentence element
            elif element.tag == "metamap": # skip the metamap element to remove the original one
                skip = True
            if not skip and output != None:
                outWriter.begin(element)
        
        elif event == "end": # element is fully read in memory
            if not skip and output != None:
                outWriter.end(element)

            if element.tag == "metamap":
                skip = False # write elements again after this one
                if preprocess:
                    element = convert(element, sentence)
                outWriter.write(element) # insert the new metamap element into the output stream
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        outWriter.close()
        ETUtils.encodeNewlines(output)

    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        shutil.rmtree(workdir)

    return output
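# Hedged usage sketch (not part of the original function): the corpus file
# names are illustrative, and convert() is assumed to be defined elsewhere in
# this module, as the loop above already relies on it.
if __name__ == "__main__":
    process("corpus.xml", output="corpus-metamap.xml", preprocess=True, debug=False)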
Exemplo n.º 28
0
def readARFF(filename):
    featureSet = IdSet(1)
    classSet = IdSet(0)
    f = open(filename,"rt")
    inData = False
    lines = f.readlines()
    counter = ProgressCounter(len(lines),"ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        if len(line) == 0 or line[0] == "%":
            continue
        elif line[0] == "@":
            #print line
            category = line.split()[0].lower()
            if category == "@attribute":
                category, name, type = line.split()
                assert(not inData)
                if name.lower() == "class":
                    name = name.lower()
                    classNames = type[1:-1].split(",")
                    assert(len(classNames)==2)
                    classSet.defineId(classNames[0].strip(),1)
                    classSet.defineId(classNames[1].strip(),-1)
                featureSet.getId(name)
            elif category.lower() == "@relation":
                assert(not inData)
            elif category == "@data":
                inData = True
        else:
            assert(inData)
            count = 1
            features = {}
            for column in line.split(","):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    classId = classSet.getId(column, False)
                    assert(classId != None)
                count += 1
            exampleCount = str(len(examples))
            exampleId = "BreastCancer.d" + exampleCount + ".s0.x0"
            examples.append([exampleId,classId,features,{}])
                    
    return examples
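# Hedged usage sketch (not part of the original function): the file name is
# illustrative; the printout simply restates the layout readARFF builds above,
# i.e. [exampleId, classId (+1 or -1), {featureId: value}, {}].
if __name__ == "__main__":
    arffExamples = readARFF("breast-cancer.arff")
    print "Read", len(arffExamples), "examples"
    if len(arffExamples) > 0:
        print "First example:", arffExamples[0]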
Exemplo n.º 29
0
    def build(cls, input, output, parse, tokenization=None, includeNeg=False):
        p = PathGazetteer(includeNeg)
        sentences = cls.getSentences(input, parse, tokenization)

        counter = ProgressCounter(len(sentences), "Build path gazetteer")
        for sentence in sentences:
            counter.update(
                1, "Building path gazetteer (" + sentence[0].getSentenceId() +
                "): ")
            p.processSentence(sentence[0])
        p.calculateFractions()

        f = open(output, "wt")
        for key in sorted(p.gazetteer.keys()):
            v = p.gazetteer[key]
            f.write(key + " " + str(v[0]) + " " + str(v[1]) + " " + str(v[2]) +
                    " " + str(v[3]) + "\n")
        f.close()
Exemplo n.º 30
0
 def prepareDocuments(self, corpusRoot, files, conllFormat=None, counts=None):
     print >> sys.stderr, "Generating document elements from the parses"
     docNames = sorted(files.keys())
     corpusName = corpusRoot.get("source", "CORPUS")
     #parseExtensions = set(["ptb", "conll", "conllx", "conllu"])
     counter = ProgressCounter(len(docNames), "Document Generation")
     for i in range(len(docNames)):
         docName = docNames[i]
         counter.update(1, "Making document element for document '" + str(docName) + "': ")
         #filePaths = files[docName]
         extensions = sorted(files[docName].keys())
         sentObjs = self.readParse(extensions[0], files[docName][extensions[0]], conllFormat)
         sentTexts = []
         for sentObj in sentObjs:
             if "tokens" in sentObj:
                 sentTexts.append(" ".join([x["text"] for x in sentObj["tokens"]]))
         docText = " ".join(sentTexts)
         ET.SubElement(corpusRoot, "document", id=corpusName + ".d" + str(i), origId=docName, text=docText)
     return [x for x in corpusRoot.findall("document")]
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    outFile = open(outFile, "wt")
    addCount = 0

    f = open(exampleFile)
    numExamples = sum([1 for line in f])
    f.close()
    counter = ProgressCounter(numExamples, "Polynomize examples", step=0)

    weightFeatureIds = {}
    for weightFeature in weightFeatures:
        wId = idSet.getId(weightFeature, False)
        if wId == None:
            sys.exit("Weight vector feature", weightFeature, "not in id file")
        weightFeatureIds[weightFeature] = wId

    print "Polynomizing", exampleFile
    exampleCache = []
    for example in ExampleUtils.readExamples(exampleFile):
        counter.update(1, "Processing example (" + example[0] + "): ")
        features = example[2]
        for i in range(len(weightFeatures) - 1):
            wI = weightFeatures[i]
            wIid = weightFeatureIds[wI]
            if not features.has_key(wIid):
                continue
            for j in range(i + 1, len(weightFeatures)):
                wJ = weightFeatures[j]
                wJid = weightFeatureIds[wJ]
                if not features.has_key(wJid):
                    continue
                # Make polynomial feature
                features[idSet.getId(wI + "_AND_" + wJ)] = 1
                addCount += 1
        exampleCache.append(example)
        if len(exampleCache) > 50:
            ExampleUtils.appendExamples(exampleCache, outFile)
            exampleCache = []
    ExampleUtils.appendExamples(exampleCache, outFile)
    outFile.close()
    print "Added", addCount, "polynomial features"
Exemplo n.º 33
0
    def buildExamplesForSentences(self,
                                  sentences,
                                  goldSentences,
                                  output,
                                  idFileTag=None,
                                  append=False):
        examples = []
        counter = ProgressCounter(len(sentences), "Build examples")

        if append:
            outfile = open(output, "at")
        else:
            outfile = open(output, "wt")
        exampleCount = 0
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = [None]
            if goldSentences != None:
                goldSentence = goldSentences[i]
            counter.update(
                1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = self.buildExamples(sentence[0],
                                          goldSentence[0],
                                          append=append)
            exampleCount += len(examples)
            examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
        outfile.close()

        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag != None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
Exemplo n.º 34
0
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity":[0,0], "interaction":[0,0], "pair":[0,0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
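# Hedged usage sketch (not part of the original function): file names are
# illustrative.
if __name__ == "__main__":
    splitMergedElements("corpus.xml", "corpus-split.xml")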
Exemplo n.º 35
0
def buildExamples(exampleBuilder, sentences, options):
    print >> sys.stderr, "Defining predicted value range:",
    sentenceElements = []
    for sentence in sentences:
        sentenceElements.append(sentence[0].sentenceElement)
    exampleBuilder.definePredictedValueRange(sentenceElements, "entity")
    print >> sys.stderr, exampleBuilder.getPredictedValueRange()

    examples = []
    if hasattr(exampleBuilder,
               "styles") and "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")
    for sentence in sentences:
        counter.update(
            1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        sentence[1] = exampleBuilder.buildExamples(sentence[0])
        examples.extend(sentence[1])
    print >> sys.stderr, "Examples built:", len(examples)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Preprocessing examples:"
    examples = exampleBuilder.preProcessExamples(examples)
    # Save examples
    #    if options.output != None:
    #        print >> sys.stderr, "Saving examples to", options.output + "/examples.txt"
    #        commentLines = []
    #        commentLines.append("Input file: " + options.input)
    #        commentLines.append("Example builder: " + options.exampleBuilder)
    #        commentLines.append("Features:")
    #        commentLines.extend(exampleBuilder.featureSet.toStrings())
    #        Example.writeExamples(examples, options.output + "/examples.txt", commentLines)
    #examples = filterFeatures(exampleBuilder.featureSet, examples)
    #Example.normalizeFeatureVectors(examples)
    return examples
Exemplo n.º 36
0
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][
            0], "created", countsByType[k][1]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Exemplo n.º 37
0
def buildExamples(exampleBuilder, sentences, outfilename):
    timer = Timer()
    examples = []
    if "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")
    
    calculatePredictedRange(exampleBuilder, sentences)
    
    outfile = open(outfilename, "wt")
    exampleCount = 0
    for sentence in sentences:
        counter.update(1, "Building examples ("+sentence[0].getSentenceId()+"): ")
        examples = exampleBuilder.buildExamples(sentence[0])
        exampleCount += len(examples)
        examples = exampleBuilder.preProcessExamples(examples)
        Example.appendExamples(examples, outfile)
    outfile.close()

    print >> sys.stderr, "Examples built:", str(exampleCount)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
Exemplo n.º 38
0
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token closest
    to the root for the subtree of the dependency parse spanned by the text of the element.
    
    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @param verbose: boolean
    """
    counts = [0, 0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(
            sentence, "sentenceanalyses/tokenizations/tokenization",
            {"tokenizer": tokenization})
        parseElement = ETUtils.getElementByAttrib(
            sentence, "sentenceanalyses/parses/parse", {"parser": parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get(
                "id"), "missing parse or tokenization"
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(
            tokens,
            parseElement.findall("dependency"),
            sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
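# Hedged sketch (not from the original source): a simplified version of the
# head selection described in the docstring above, standing in for the
# getEntityHeadToken call. It assumes entity and token charOffsets are plain
# "begin-end" strings and that tokenHeadScores maps token elements to integer
# scores where a higher score means closer to the dependency root.
def pickHeadTokenSketch(entityOffset, tokens, tokenHeadScores):
    begin, end = [int(x) for x in entityOffset.split("-")]
    best = None
    for token in tokens:
        tokBegin, tokEnd = [int(x) for x in token.get("charOffset").split("-")]
        if tokEnd >= begin and tokBegin <= end: # token overlaps the entity span
            if best == None or tokenHeadScores[token] > tokenHeadScores[best]:
                best = token
    return best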
Exemplo n.º 39
0
def waitForProcess(process, numCorpusSentences, measureByGap, outputFile, counterName, updateMessage, timeout=None):
    """
    Waits for a process to finish, and tracks the number of entities it writes
    to it's outputfile. If writing a sentence takes longer than the timeout, 
    the process is considered stalled and is killed.
    """
    maxStartupTime = 600 # Give extra time for the process to start up (even if it creates immediately an empty output file)
    counter = ProgressCounter(numCorpusSentences, counterName)
    counter.showMilliseconds = True
    prevNumSentences = 0 # Number of output sentences on previous check
    finalCheckLeft = True # Make one final check to update counters
    processStatus = None # When None, process not finished
    prevTime = time.time()
    startTime = time.time()
    # Wait until process is finished and periodically check it's progress.
    while processStatus == None or finalCheckLeft:
        if processStatus != None: # Extra loop to let counters finish
            finalCheckLeft = False # Done only once
        if os.path.exists(outputFile[0]): # Output file has already appeared on disk
            # Measure number of sentences in output file
            numSentences = 0
            f = codecs.open(outputFile[0], "rt", **outputFile[1])
            for line in f:
                if measureByGap:
                    if line.strip() == "":
                        numSentences += 1
                else:
                    numSentences += 1
            f.close()
            # Update status
            if numSentences - prevNumSentences != 0: # Process has progressed
                counter.update(numSentences - prevNumSentences, updateMessage + ": ")
            if finalCheckLeft: # This is a normal loop, not the final check
                # Startuptime hasn't yet passed or process has made progress
                if time.time() - startTime < maxStartupTime or numSentences - prevNumSentences != 0:
                #if prevNumSentences == 0 or numSentences - prevNumSentences != 0:
                    prevTime = time.time() # reset timeout
                else: # Nothing happened on this update, check whether process hung
                    elapsedTime = time.time() - prevTime
                    if timeout != None and elapsedTime > timeout:
                        print >> sys.stderr, "Process timed out (" + str(elapsedTime) + " vs. " + str(timeout) + ")"
                        print >> sys.stderr, "Killing process"
                        process.kill()
                prevNumSentences = numSentences
                time.sleep(1)
        else: # Output file doesn't exist yet
            prevTime = time.time() # reset counter if output file hasn't been created
        processStatus = process.poll() # Get process status, None == still running
    
    counter.markFinished() # If we get this far, don't show the error message even if process didn't finish
    return (numSentences, numCorpusSentences)
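# Hedged usage sketch (not part of the original function): "externaltool" and
# the file names are placeholders for whatever command actually produces the
# sentence-per-line output; outputFile is the (path, codecs.open keyword
# arguments) pair the function above expects.
if __name__ == "__main__":
    import subprocess
    proc = subprocess.Popen(["externaltool", "input.txt"],
                            stdout=open("parsed.txt", "wt"))
    waitForProcess(proc, numCorpusSentences=100, measureByGap=False,
                   outputFile=("parsed.txt", {"encoding": "utf-8"}),
                   counterName="ExternalTool", updateMessage="Parsing",
                   timeout=600)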
Exemplo n.º 40
0
def buildExamples(exampleBuilder, sentences, outfilename):
    timer = Timer()
    examples = []
    if "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")

    calculatePredictedRange(exampleBuilder, sentences)

    outfile = open(outfilename, "wt")
    exampleCount = 0
    for sentence in sentences:
        counter.update(
            1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        examples = exampleBuilder.buildExamples(sentence[0])
        exampleCount += len(examples)
        examples = exampleBuilder.preProcessExamples(examples)
        Example.appendExamples(examples, outfile)
    outfile.close()

    print >> sys.stderr, "Examples built:", str(exampleCount)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
Exemplo n.º 41
0
def mainFunc(input,
             output=None,
             parseName="McCC",
             tokenizationName=None,
             newParseName=None,
             newTokenizationName=None,
             logFileName=None,
             removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None
    #if input.endswith(".gz"):
    #    inFile = gzip.GzipFile(input)
    #else:
    #    inFile = open(input)
    tree = ETUtils.ETFromObj(input)

    if tokenizationName == None:
        tokenizationName = parseName

    #tree = ElementTree.parse(inFile)
    root = tree.getroot()

    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names (" + sId + "): ")

        tok = getTokenization(tokenizationName,
                              sentence,
                              sId,
                              remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue

        assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (
            tokenizationName, sId)

        parse = getParse(parseName,
                         tokenizationName,
                         sentence,
                         sId,
                         remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (
            parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # Default names
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))
        #splittok.set("split-")

        # make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                head = t.head
                # traverse
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head

                # should match (nah, punctuation problems)
                # assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
                tokenIdMap[t.origId] = head.id
            else:
                # only allow overwrite of existing entry if the current token
                # is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        depSeqId = 0  #1
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [tok for tok in split if tok.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1

        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

            # debugging
            #print >> sys.stderr, "NEW DEP IN", sId

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    #indent(root)
    if logFile != None:
        logFile.close()

    # debugging
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
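# Hedged usage sketch (not part of the original function): the corpus file
# names and the log file name are illustrative; parseName defaults to "McCC"
# as in the signature above.
if __name__ == "__main__":
    mainFunc("corpus.xml", output="corpus-split-names.xml",
             logFileName="protein-name-splitter.log")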
Exemplo n.º 42
0
    
    defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
    optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
    optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    
    #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
    variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))
    
    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    variantFeatureSet = IdSet()
    variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))
    
    counter = ProgressCounter(len(variantExamples))
    for example in variantExamples:
        counter.update()
        example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
        newFeatures = {}
        for k,v in example[2].iteritems():
            newFeatures[ invariantFeatureSet.getId(variantFeatureSet.getName(k)) ] = v
        example[2] = newFeatures
        
    ExampleUtils.writeExamples(variantExamples, os.path.join(options.variant, "realignedExamples.txt"))
Exemplo n.º 43
0
 def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None):
     #print >> sys.stderr, "Writing output to Interaction XML"
     corpus = self.loadCorpus(corpus, parse, tokenization)
     if goldCorpus != None:
         goldCorpus = self.loadCorpus(goldCorpus, parse, tokenization)
     examples, predictions = self.loadExamples(examples, predictions)
     
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     classIds = None
     if classSet != None:
         classIds = classSet.getIds()
         
     #counter = ProgressCounter(len(corpus.sentences), "Write Examples")
             
     exampleQueue = [] # One sentence's examples
     predictionsByExample = {}
     currentMajorId = None
     prevMajorIds = set()
     processedSentenceIds = set()
     xType = None
     
     count = 0
     for example in examples:
         count += 1
     assert count > 0
     progress = ProgressCounter(count, "Write Examples")
     
     for example, prediction in itertools.izip_longest(examples, predictions):
         assert example != None
         assert prediction != None
         majorId, minorId = example[0].rsplit(".x", 1)
         #if currentMajorId == "GENIA.d114.s9": print "Start"
         if majorId != currentMajorId: # new sentence
             if currentMajorId != None:
                 #if currentMajorId == "GENIA.d114.s9": print "JAA"
                 processedSentenceIds.add(currentMajorId)
                 sentenceObject = corpus.sentencesById[currentMajorId]
                 goldSentence = None
                 if goldCorpus != None:
                     goldSentence = goldCorpus.sentencesById[currentMajorId]
                 self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
                 progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
             exampleQueue = []
             predictionsByExample = {}
             prevMajorIds.add(currentMajorId)
             assert majorId not in prevMajorIds, majorId
             currentMajorId = majorId 
         exampleQueue.append(example) # queue example
         predictionsByExample[example[0]] = prediction
         assert example[3]["xtype"] == self.xType, str(example[3]["xtype"]) + "/" + str(self.xType)
     
     # Process what is still in queue
     if currentMajorId != None:
         processedSentenceIds.add(currentMajorId)
         sentenceObject = corpus.sentencesById[currentMajorId]
         goldSentence = None
         if goldCorpus != None:
             goldSentence = goldCorpus.sentencesById[currentMajorId]
         self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
         progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
         exampleQueue = []
         predictionsByExample = {}
     
     # Process sentences with no examples (e.g. to clear interactions)
     for sentenceId in sorted(corpus.sentencesById.keys()):
         if sentenceId not in processedSentenceIds:
             sentenceObject = corpus.sentencesById[sentenceId]
             goldSentence = None
             if goldCorpus != None:
                 goldSentence = goldCorpus.sentencesById[sentenceId]
             self.writeXMLSentence([], {}, sentenceObject, classSet, classIds, goldSentence=goldSentence)
     
     # Print statistics
     if len(self.counts) > 0:
         print >> sys.stderr, self.counts
         self.counts = defaultdict(int)
 
     # Write corpus
     if outputFile != None:
         print >> sys.stderr, "Writing corpus to", outputFile
         ETUtils.write(corpus.rootElement, outputFile)
     return corpus.tree
Exemplo n.º 44
0
def processCorpus(inputCorpus, outputPath, task=1, outputIsA2File=False, verbose=True, strengths=False):
    if outputIsA2File:
        a2File = open(outputPath, "wt")
        if len(inputCorpus.documents) > 1:
            print >> sys.stderr, "Warning: Input file has more than one document, a2-file events will have overlapping ids"
            
    
    if verbose: counter = ProgressCounter(len(inputCorpus.documents), "Document")
    # Each document is written to an output file
    for document in inputCorpus.documents:
        docSentence = document.find("sentence")
        if docSentence == None:
            if verbose: counter.update(1, "Processing empty document")
            continue
        documentId = docSentence.get("origId")
        if documentId == None:
            documentId = document.get("origId")
        else:
            documentId = documentId.rsplit(".", 1)[0]
        if verbose: counter.update(1, "Processing document " + document.get("id") + " (origId " + documentId + "): ")
        
        # Write a1 file
        if outputIsA2File: 
            outputFile = None
        else:
            outputFile = codecs.open(os.path.join(outputPath,documentId + ".a1"), "wt", "utf-8")
            #outputFile = open(os.path.join(outputPath,documentId + ".a1"), "wt")
        namedEntityTriggerIds = writeProteins(document, inputCorpus, outputFile)
        if not outputIsA2File:
            outputFile.close()

        # Write a2.t1 file
        if task == 1:
            strengthFile = None
            if outputIsA2File: 
                outputFile = a2File
            else:
                outputFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t1"), "wt", "utf-8")
                #strengthFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t1.scores"), "wt", "utf-8")
                #outputFile = open(os.path.join(outputPath,documentId + ".a2.t1"), "wt")
            events, entityMap = getEvents(document, inputCorpus, 1)
            #print "EVENTS-FINAL", events, "\nENTITY_MAP", entityMap
            triggerIds = copy.copy(namedEntityTriggerIds)
            writeEventTriggers(document, inputCorpus, outputFile, events, triggerIds, 1, strengths=strengthFile)
            writeEvents(document, inputCorpus, outputFile, events, entityMap, triggerIds, strengths=strengthFile)
            #outputFile.close()
        # Write a2.t12 file
        elif task == 2:
            strengthFile = None
            if outputIsA2File: 
                outputFile = a2File
            else:
                outputFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t12"), "wt", "utf-8")
                #strengthFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t12.scores"), "wt", "utf-8")
                #outputFile = open(os.path.join(outputPath,documentId + ".a2.t12"), "wt")
            events, entityMap = getEvents(document, inputCorpus, 2)
            triggerIds = copy.copy(namedEntityTriggerIds)
            writeEventTriggers(document, inputCorpus, outputFile, events, triggerIds, 2, strengths=strengthFile)
            writeEvents(document, inputCorpus, outputFile, events, entityMap, triggerIds, strengths=strengthFile)
            #outputFile.close()
        # Write a2.t123 file
        elif task == 3:
            strengthFile = None
            if outputIsA2File: 
                outputFile = a2File
            else:
                outputFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t123"), "wt", "utf-8")
                #strengthFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t123.scores"), "wt", "utf-8")
                #outputFile = open(os.path.join(outputPath,documentId + ".a2.t123"), "wt")
            events, entityMap = getEvents(document, inputCorpus, 2)
            triggerIds = copy.copy(namedEntityTriggerIds)
            writeEventTriggers(document, inputCorpus, outputFile, events, triggerIds, 2, strengths=strengthFile)
            writeEvents(document, inputCorpus, outputFile, events, entityMap, triggerIds, True, strengths=strengthFile)
            #outputFile.close()
        if not outputIsA2File: 
            outputFile.close()
            
            # Write txt file
            outputFile = codecs.open(os.path.join(outputPath,documentId + ".txt"), "wt", "utf-8")
            #outputFile = open(os.path.join(outputPath,documentId + ".txt"), "wt")
            writeDocumentText(document, outputFile)
            outputFile.close()
    
    if outputIsA2File:
        a2File.close()
Exemplo n.º 45
0
class ExampleBuilder:
    """
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g. SVMs.
    An example builder writes three files, an example-file (in extended Joachim's
    SVM format) and .class_names and .feature_names files, which contain the names
    for the class and feature id-numbers. An example builder can also be given
    pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.
    """
    structureAnalyzer = None
    def __init__(self, classSet=None, featureSet=None):
        if (type(classSet) == types.StringType):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if (type(featureSet) == types.StringType):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        #self.idFileTag = None
        self.classIdFilename = None
        self.featureIdFilename = None

        self.styles = {}
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def hasStyle(self, style):
        return style in self.styles and not self.styles[style]

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        # Initialize
        if self._defaultParameters == None:
            self._defaultParameters = {}
        if self._parameterValueLimits == None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({},
                                             defaults,
                                             valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits != None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        return Utils.Parameters.get(parameters,
                                    defaults=self._defaultParameters,
                                    valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        self.features[self.featureSet.getId(self.featureTag + name)] = value

    def getElementCounts(self, filename):
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents": 0, "sentences": 0}
        for line in f:
            if "<document" in line:
                counts["documents"] += 1
            elif "<sentence" in line:
                counts["sentences"] += 1
        f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        if self.classIdFilename != None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename != None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self,
                      input,
                      output,
                      gold=None,
                      append=False,
                      allowNewIds=True,
                      structureAnalyzer=None):
        # Create intermediate paths if needed
        if os.path.dirname(output) != "" and not os.path.exists(
                os.path.dirname(output)):
            os.makedirs(os.path.dirname(output))
        # Open output file
        openStyle = "wt"
        if append:
            #print "Appending examples"
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)

        # Build examples
        self.exampleCount = 0
        if type(input) in types.StringTypes:  # Entered here - Mu
            self.elementCounts = self.getElementCounts(input)
            if self.elementCounts["sentences"] > 0:  # Entered here, 1448 - Mu
                self.progress = ProgressCounter(
                    self.elementCounts["sentences"], "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")
        # pdb.set_trace()

        # This line generates log below:(getSentences function generates the first 2 lines)
        # Making sentence graphs (GE09.d149.s5): 100.00 % (0:0:1.113)
        # Skipped 381 duplicate interaction edges in SentenceGraphs
        # Defining predicted value range: None - Mu
        self.calculatePredictedRange(
            self.getSentences(input, self.parse, self.tokenization)
        )  # self.parse: mccc; self.tokenization: None

        removeIntersentenceInteractions = True
        if "keep_intersentence" in self.styles and self.styles[
                "keep_intersentence"]:
            print >> sys.stderr, "Keeping intersentence interactions for input corpus"
            removeIntersentenceInteractions = False  # this is True  - Mu
        inputIterator = getCorpusIterator(
            input,
            None,
            self.parse,
            self.tokenization,
            removeIntersentenceInteractions=removeIntersentenceInteractions)

        # pdb.set_trace()
        #goldIterator = []
        if gold != None:  # Entered here - Mu
            removeGoldIntersentenceInteractions = True
            if "keep_intersentence_gold" in self.styles and self.styles[
                    "keep_intersentence_gold"]:
                print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                removeGoldIntersentenceInteractions = False  # this is False - Mu
            goldIterator = getCorpusIterator(
                gold,
                None,
                self.parse,
                self.tokenization,
                removeIntersentenceInteractions=
                removeGoldIntersentenceInteractions)
            for inputSentences, goldSentences in itertools.izip_longest(
                    inputIterator, goldIterator, fillvalue=None):
                assert inputSentences != None
                assert goldSentences != None
                # pdb.set_trace()
                # see the documentation of function processSentence() in this script
                # inputSentences[1].sentence is the unmerged version
                # inputSentences[1].sentenceGraph is the merged version, meaning that when generating sentenceGraph,
                # duplicated intereactions are removed(actually skipped, not added to the graph, but not really removed) - Mu
                self.processDocument(inputSentences,
                                     goldSentences,
                                     outfile,
                                     structureAnalyzer=structureAnalyzer)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences,
                                     None,
                                     outfile,
                                     structureAnalyzer=structureAnalyzer)
        outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Classes:", len(self.classSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(
            self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self,
                        sentences,
                        goldSentences,
                        outfile,
                        structureAnalyzer=None):
        #calculatePredictedRange(self, sentences)
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = None
            if goldSentences != None:
                goldSentence = goldSentences[i]
            self.progress.update(
                1, "Building examples (" + sentence.sentence.get("id") + "): ")
            self.processSentence(sentence,
                                 outfile,
                                 goldSentence,
                                 structureAnalyzer=structureAnalyzer)

    def processSentence(self,
                        sentence,
                        outfile,
                        goldSentence=None,
                        structureAnalyzer=None):
        '''
        sentence: Utils.InteractionXML.SentenceElements.SentenceElements instance
        sentence.sentence: Element 'sentence' in the xml file
        '''
        # pdb.set_trace()
        # Process filtering rules
        # does NOT entered here since self.styles["sentenceLimit"] is None - Mu
        if "sentenceLimit" in self.styles and self.styles[
                "sentenceLimit"]:  # Rules for limiting which sentences to process
            # Get the rule list
            limitRules = self.styles["sentenceLimit"]
            if type(limitRules) in types.StringTypes:
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rule are of the form "attr.value" where "attr" is the name
                    # of the attribute to match, and "value" a substring within
                    # that attribute
                    if rule.startswith(sentAttr +
                                       "."):  # rule matches the attribute
                        value = rule.split(
                            ".", 1)[-1]  # get the value part of the rule
                        if value not in sentenceElement.get(
                                sentAttr
                        ):  # rule value must be a substring of the attribute value
                            return  # discard all sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph != None:
            goldGraph = None
            if goldSentence != None:
                goldGraph = goldSentence.sentenceGraph
            # c, sentenceGraph_return, argCombinations_return = self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)
            # self.exampleCount += c
            self.exampleCount += self.buildExamplesFromGraph(
                sentence.sentenceGraph,
                outfile,
                goldGraph,
                structureAnalyzer=structureAnalyzer)
        # return sentenceGraph_return, argCombinations_return

    @classmethod
    def run(cls,
            input,
            output,
            parse,
            tokenization,
            style,
            classIds=None,
            featureIds=None,
            gold=None,
            append=False,
            allowNewIds=True,
            structureAnalyzer=None,
            debug=False):
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, "  input:", input
        if gold != None:
            print >> sys.stderr, "  gold:", gold
        print >> sys.stderr, "  output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, "  add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, "  style:", style
        if tokenization == None:
            print >> sys.stderr, "  parse:", parse
        else:
            print >> sys.stderr, "  parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(
            classIds, featureIds, allowNewIds)  #cls.getIdSets(idFileTag)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        #builder.idFileTag = idFileTag
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        builder.processCorpus(input,
                              output,
                              gold,
                              append=append,
                              allowNewIds=allowNewIds,
                              structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        pass

    def getPredictedValueRange(self):
        return None

    @classmethod
    def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
        # Class ids
        #print classIds
        #print featureIds
        if classIds != None and os.path.exists(classIds):
            print >> sys.stderr, "Using predefined class names from", classIds
            classSet = IdSet(allowNewIds=allowNewIds)
            classSet.load(classIds)
        else:
            print >> sys.stderr, "No predefined class names"
            classSet = None
        # Feature ids
        if featureIds != None and os.path.exists(featureIds):
            print >> sys.stderr, "Using predefined feature names from", featureIds
            featureSet = IdSet(allowNewIds=allowNewIds)
            featureSet.load(featureIds)
        else:
            print >> sys.stderr, "No predefined feature names"
            featureSet = None
        return classSet, featureSet


#        if idFileTag != None and os.path.exists(idFileTag + ".feature_names.gz") and os.path.exists(idFileTag + ".class_names"):
#            print >> sys.stderr, "Using predefined class and feature names"
#            featureSet = IdSet()
#            featureSet.load(idFileTag + ".feature_names.gz")
#            classSet = IdSet()
#            classSet.load(idFileTag + ".class_names")
#            return classSet, featureSet
#        else:
#            print >> sys.stderr, "No predefined class or feature-names"
#            if idFileTag != None:
#                assert(not os.path.exists(idFileTag + ".feature_names.gz")), idFileTag
#                assert(not os.path.exists(idFileTag + ".class_names")), idFileTag
#            return None, None

    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        # pdb.set_trace()
        # input is the path to the corpus xml file
        if type(input) != types.ListType:  # Program entered here - Mu
            # Load corpus and make sentence graphs
            # pdb.set_trace()
            corpusElements = Core.SentenceGraph.loadCorpus(
                input, parse, tokenization, removeNameInfo=removeNameInfo)
            sentences = []
            for sentence in corpusElements.sentences:
                if sentence.sentenceGraph != None:  # required for event detection
                    sentences.append([sentence.sentenceGraph, None])
            return sentences
        else:  # assume input is already a list of sentences
            assert (removeNameInfo == False)
            return input

    def calculatePredictedRange(self, sentences):
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = []
        for sentence in sentences:
            sentenceElements.append(sentence[0].sentenceElement)
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
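# Hedged sketch (not from the original source): a minimal ExampleBuilder
# subclass, only to illustrate the contract described in the class docstring
# above. The class name, the "neg" class label and the "bias" feature are
# illustrative; a real builder would derive examples from the sentence graph
# and must return how many examples it wrote.
class SentenceCountExampleBuilderSketch(ExampleBuilder):
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        self.features = {}
        self.setFeature("bias", 1) # stored as {featureSet.getId("bias"): 1}
        example = (sentenceGraph.getSentenceId() + ".x0",
                   self.classSet.getId("neg"), self.features, {})
        ExampleUtils.appendExamples([example], outfile)
        return 1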
Exemplo n.º 46
0
    def processCorpus(self,
                      input,
                      output,
                      gold=None,
                      append=False,
                      allowNewIds=True,
                      structureAnalyzer=None):
        # Create intermediate paths if needed
        if os.path.dirname(output) != "" and not os.path.exists(
                os.path.dirname(output)):
            os.makedirs(os.path.dirname(output))
        # Open output file
        openStyle = "wt"
        if append:
            #print "Appending examples"
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)

        # Build examples
        self.exampleCount = 0
        if type(input) in types.StringTypes:  # Entered here - Mu
            self.elementCounts = self.getElementCounts(input)
            if self.elementCounts["sentences"] > 0:  # Entered here, 1448 - Mu
                self.progress = ProgressCounter(
                    self.elementCounts["sentences"], "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")
        # pdb.set_trace()

        # This line generates log below:(getSentences function generates the first 2 lines)
        # Making sentence graphs (GE09.d149.s5): 100.00 % (0:0:1.113)
        # Skipped 381 duplicate interaction edges in SentenceGraphs
        # Defining predicted value range: None - Mu
        self.calculatePredictedRange(
            self.getSentences(input, self.parse, self.tokenization)
        )  # self.parse: mccc; self.tokenization: None

        removeIntersentenceInteractions = True
        if "keep_intersentence" in self.styles and self.styles[
                "keep_intersentence"]:
            print >> sys.stderr, "Keeping intersentence interactions for input corpus"
            removeIntersentenceInteractions = False  # this is True  - Mu
        inputIterator = getCorpusIterator(
            input,
            None,
            self.parse,
            self.tokenization,
            removeIntersentenceInteractions=removeIntersentenceInteractions)

        # pdb.set_trace()
        #goldIterator = []
        if gold != None:  # Entered here - Mu
            removeGoldIntersentenceInteractions = True
            if "keep_intersentence_gold" in self.styles and self.styles[
                    "keep_intersentence_gold"]:
                print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                removeGoldIntersentenceInteractions = False  # this is False - Mu
            goldIterator = getCorpusIterator(
                gold,
                None,
                self.parse,
                self.tokenization,
                removeIntersentenceInteractions=
                removeGoldIntersentenceInteractions)
            for inputSentences, goldSentences in itertools.izip_longest(
                    inputIterator, goldIterator, fillvalue=None):
                assert inputSentences != None
                assert goldSentences != None
                # pdb.set_trace()
                # see the documentation of function processSentence() in this script
                # inputSentences[1].sentence is the unmerged version
                # inputSentences[1].sentenceGraph is the merged version, meaning that when generating sentenceGraph,
                # duplicated intereactions are removed(actually skipped, not added to the graph, but not really removed) - Mu
                self.processDocument(inputSentences,
                                     goldSentences,
                                     outfile,
                                     structureAnalyzer=structureAnalyzer)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences,
                                     None,
                                     outfile,
                                     structureAnalyzer=structureAnalyzer)
        outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Classes:", len(self.classSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(
            self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()
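
A note on the paired corpus iteration above: itertools.izip_longest pads the shorter iterator with the fill value, so the two asserts turn any length mismatch between the input and gold corpora into an immediate failure instead of a silently truncated run. A minimal, self-contained sketch of that pattern (Python 2; the iterables here are placeholders, not the real corpus iterators):

import itertools

def iterateInLockstep(inputDocs, goldDocs):
    # Pair up the two document streams; izip_longest fills the shorter one
    # with None, and the asserts make a length mismatch fail loudly.
    for inputSentences, goldSentences in itertools.izip_longest(
            inputDocs, goldDocs, fillvalue=None):
        assert inputSentences is not None, "gold corpus has extra documents"
        assert goldSentences is not None, "input corpus has extra documents"
        yield inputSentences, goldSentences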
Exemplo n.º 47
0
def combine(inputA,
            inputB,
            inputGold,
            outPath=None,
            mode="OR",
            skip=None,
            logPath="AUTO"):
    assert mode in ("AND", "OR")
    if skip != None and isinstance(skip, basestring):
        skip = set(skip.split(","))
    if skip != None:
        print "Skipping interaction types:", skip
    if logPath == "AUTO":
        if outPath != None:
            logPath = outPath.rstrip("/").rstrip("\\") + "-log.txt"
        else:
            logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    print "Loading the Interaction XML files"
    print "Loading A from", inputA
    a = ETUtils.ETFromObj(inputA)
    print "Loading B from", inputB
    b = ETUtils.ETFromObj(inputB)
    gold = None
    if inputGold:
        print "Loading gold from", inputGold
        gold = ETUtils.ETFromObj(inputGold)
    print "Copying a as template"
    template = copy.deepcopy(a)
    print "Calculating confidence score ranges"
    scoreRanges = {}
    scoreRanges["a"] = getScoreRange(a, skip)
    scoreRanges["b"] = getScoreRange(b, skip)
    print scoreRanges
    print "Combining"
    counts = defaultdict(int)
    counts["skipped"] = defaultdict(int)
    counter = ProgressCounter(len([x for x in a.findall("document")]),
                              "Combine")
    for docA, docB, docGold, docTemplate in itertools.izip_longest(
            *[x.findall("document") for x in (a, b, gold, template)]):
        counter.update()
        assert len(
            set([x.get("id")
                 for x in (docA, docB, docGold, docTemplate)])) == 1
        for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[
                x.findall("sentence")
                for x in (docA, docB, docGold, docTemplate)
        ]):
            assert len(
                set([
                    x.get("id") for x in (sentA, sentB, sentGold, sentTemplate)
                ])) == 1
            interactions = getInteractions(sentA, sentB, sentGold, skip,
                                           counts["skipped"])
            for interaction in sentTemplate.findall("interaction"):
                sentTemplate.remove(interaction)
            analyses = sentTemplate.find("analyses")
            if analyses is not None:
                sentTemplate.remove(analyses)
            for key in interactions:
                interaction = getCombinedInteraction(interactions[key], mode,
                                                     counts, scoreRanges)
                if interaction != None:
                    sentTemplate.append(copy.deepcopy(interaction))
            if analyses is not None:
                sentTemplate.append(analyses)
    counts["skipped"] = dict(counts["skipped"])
    print "Counts:", dict(counts)
    if gold != None:
        print "****** Evaluating A ******"
        evaluateChemProt(
            a, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC")
        print "****** Evaluating B ******"
        evaluateChemProt(
            b, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC")
        print "****** Evaluating Combined ******"
        evaluateChemProt(
            template, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC")
    if outPath != None:
        print "Writing output to", outPath
        if outPath.endswith(".tsv"):
            Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath)
        else:
            ETUtils.write(template, outPath)
    if logPath != None:
        Stream.closeLog(logPath)
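
For reference, combine() can be driven directly with two Interaction XML prediction files and an optional gold file; the paths below are placeholders, not files from this project:

# Hypothetical invocation; all file paths are placeholders.
combine("predictions-a.xml.gz",
        "predictions-b.xml.gz",
        "gold.xml.gz",
        outPath="combined.xml.gz",
        mode="OR",       # "AND" or "OR": how interactions from A and B are merged
        skip=None,       # or a comma-separated string of interaction types to ignore
        logPath="AUTO")  # writes <outPath>-log.txt next to the output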
Exemplo n.º 48
0
def makeSentences(input, tokenizationPath, output=None, removeText=False):
    """
    Divide text in the "text" attributes of document and section 
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    print >> sys.stderr, "Inserting tokenizations from", tokenizationPath
    if tokenizationPath.find(".tar.gz") != -1:
        tarFilePath, tokenizationPath = tokenizationPath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if tokenizationPath[0] == "/":
            tokenizationPath = tokenizationPath[1:]
    else:
        tarFile = None
    
    docCount = 0
    docsWithSentences = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "Sentence Splitting")
    for document in sourceElements:
        docCount += 1
        counter.update(1, "Splitting Documents ("+document.get("id")+"/" + document.get("pmid") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        if document.find("sentence") == None: # no existing sentence split                
            text = document.get("text")
            if text == None or text.strip() == "":
                continue
            
            newFile = os.path.join(tokenizationPath, document.get("pmid") + ".tok")
            f = openFile(newFile, tarFile)
            if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
                oldFile = os.path.join(tokenizationPath, document.get("pmid") + ".tokenized")
                f = openFile(oldFile, tarFile)
                if f == None: # no tokenization found
                    continue
            sentencesCreated += alignSentences(document, f.readlines())
            f.close()
    
            # Remove original text
            if removeText:
                del document["text"]
            # Move elements from document element to sentences
            moveElements(document)
            docsWithSentences += 1
        else:
            docsWithSentences += 1
    
    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, docsWithSentences, "/", docCount, "documents have sentences"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
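
openFile() is defined elsewhere in this module; from its use above it has to return a readable file object, look inside the optional tar archive when one is given, and return None when the tokenization file is missing. A rough sketch under those assumptions (not the project's actual implementation):

import os
import tarfile

def openFile(path, tarFile=None):
    # Sketch only: read 'path' from the given tar archive if one is provided,
    # otherwise from the filesystem; return None when the file cannot be found.
    if tarFile is not None:
        try:
            return tarFile.extractfile(tarFile.getmember(path))
        except KeyError:
            return None
    if os.path.exists(path):
        return open(path, "rt")
    return None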
Exemplo n.º 49
0
    def processCorpus(self,
                      input,
                      output,
                      gold=None,
                      append=False,
                      allowNewIds=True,
                      structureAnalyzer=None):
        # Create intermediate paths if needed
        if os.path.dirname(output) != "" and not os.path.exists(
                os.path.dirname(output)):
            os.makedirs(os.path.dirname(output))
        # Open output file
        openStyle = "wt"
        if append:
            #print "Appending examples"
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)

        # Build examples
        self.exampleCount = 0
        if type(input) in types.StringTypes:
            self.elementCounts = self.getElementCounts(input)
            if self.elementCounts["sentences"] > 0:
                self.progress = ProgressCounter(
                    self.elementCounts["sentences"], "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")

        self.calculatePredictedRange(
            self.getSentences(input, self.parse, self.tokenization))

        removeIntersentenceInteractions = True
        if "keep_intersentence" in self.styles and self.styles[
                "keep_intersentence"]:
            print >> sys.stderr, "Keeping intersentence interactions for input corpus"
            removeIntersentenceInteractions = False
        inputIterator = getCorpusIterator(
            input,
            None,
            self.parse,
            self.tokenization,
            removeIntersentenceInteractions=removeIntersentenceInteractions)

        #goldIterator = []
        if gold != None:
            removeGoldIntersentenceInteractions = True
            if "keep_intersentence_gold" in self.styles and self.styles[
                    "keep_intersentence_gold"]:
                print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                removeGoldIntersentenceInteractions = False
            goldIterator = getCorpusIterator(
                gold,
                None,
                self.parse,
                self.tokenization,
                removeIntersentenceInteractions=
                removeGoldIntersentenceInteractions)
            for inputSentences, goldSentences in itertools.izip_longest(
                    inputIterator, goldIterator, fillvalue=None):
                assert inputSentences != None
                assert goldSentences != None
                self.processDocument(inputSentences,
                                     goldSentences,
                                     outfile,
                                     structureAnalyzer=structureAnalyzer)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences,
                                     None,
                                     outfile,
                                     structureAnalyzer=structureAnalyzer)
        outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(
            self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()
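
The output handling at the top of processCorpus() (gzip versus plain file, overwrite versus append) is a pattern shared by several example builders and could be factored into a small helper; the function below is a sketch, not code from the repository:

import gzip

def openExampleFile(path, append=False):
    # Mirrors the logic above: append or overwrite, with transparent gzip
    # compression when the target path ends in ".gz".
    openStyle = "at" if append else "wt"
    if path.endswith(".gz"):
        return gzip.open(path, openStyle)
    return open(path, openStyle)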
def makeSentences(input,
                  output=None,
                  removeText=False,
                  postProcess=True,
                  debug=False):
    """
    Run GENIA Sentence Splitter
    
    Divide text in the "text" attributes of document and section 
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.
    """
    global sentenceSplitterDir

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    emptySentenceCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")
                      ] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Create working directory
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents (" + document.get("id") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount)
        assert document.find("sentence") == None
        text = document.get("text")
        if text == None or text.strip() == "":
            continue
        #print type(text)
        # Write text to workfile
        #workdir = tempfile.mkdtemp()
        workfile = codecs.open(
            os.path.join(workdir, "sentence-splitter-input.txt" + docTag),
            "wt", "utf-8")
        # From http://themoritzfamily.com/python-encodings-and-unicode.html
        # "You have to be careful with the codecs module. Whatever you pass to it must be a Unicode
        # object otherwise it will try to automatically decode the byte stream as ASCII"
        # However, the unicode errors here were simply due to STTools reading unicode ST-format as ASCII,
        # thus creating an ASCII interaction XML, which then triggered here the unicode error. So, at this
        # point we should be able to safely write(text), as the output file is unicode, and reading with
        # the correct codec is taken care of earlier in the pipeline.
        workfile.write(text)  #.encode("utf-8"))
        workfile.close()
        # Run sentence splitter
        assert os.path.exists(
            Settings.GENIA_SENTENCE_SPLITTER_DIR +
            "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [
            Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh",
            os.path.join(workdir, "sentence-splitter-input.txt" + docTag),
            os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
            Settings.RUBY_PATH
        ]
        #p = subprocess.call(args)
        p = subprocess.Popen(args,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr
        #print "stdout<", p.stdout.readlines(), ">"
        #print "stderr<", p.stderr.readlines(), ">"
        if postProcess:
            postProcessorPath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "geniass-postproc.pl")
            assert os.path.exists(postProcessorPath), postProcessorPath
            ppIn = codecs.open(
                os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
                "rt", "utf-8")
            ppOut = codecs.open(
                os.path.join(
                    workdir,
                    "sentence-splitter-output-postprocessed.txt" + docTag),
                "wt", "utf-8")
            perlReturnValue = subprocess.call(["perl", postProcessorPath],
                                              stdin=ppIn,
                                              stdout=ppOut)
            assert perlReturnValue == 0, perlReturnValue
            ppIn.close()
            ppOut.close()
            # Read split sentences
            workfile = codecs.open(
                os.path.join(
                    workdir,
                    "sentence-splitter-output-postprocessed.txt" + docTag),
                "rt", "utf-8")
        else:
            workfile = codecs.open(
                os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
                "rt", "utf-8")
        start = 0  # sentences are consecutively aligned to the text for charOffsets
        sentenceCount = 0
        #text = text.replace("\n", " ") # should stop sentence splitter from crashing.
        #text = text.replace("  ", " ") # should stop sentence splitter from crashing.
        #alignmentText = text.replace("\n", " ").replace("\r", " ")
        #docTokens = reWhiteSpace.split(text)
        docIndex = 0
        sentenceBeginIndex = -1
        prevSentence = None
        prevEndIndex = None
        #emptySentenceCount = 0
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip()  # The text of the sentence
            if sText == "":
                emptySentenceCount += 1
                continue

            for i in range(len(sText)):
                if sText[i].isspace():
                    assert sText[i] not in ["\n", "\r"]
                    continue
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"
                                          ] and sentenceBeginIndex != -1:
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex,
                                                    docIndex, prevSentence,
                                                    prevEndIndex)
                        prevSentence.set("id",
                                         docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True")
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex - 1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                assert sText[i] == text[docIndex], (
                    text, sText, prevText, sText[i:i + 10],
                    text[docIndex:docIndex + 10], (i, docIndex),
                    sentenceBeginIndex)  # tokens[i].isspace() == False
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
                prevText = sText
            if sentenceBeginIndex != -1:
                prevSentence = makeSentence(text, sentenceBeginIndex, docIndex,
                                            prevSentence, prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex - 1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        # Add possible tail for last sentence
        if prevEndIndex < len(text) - 1 and prevSentence != None:
            assert prevSentence.get("tail") == None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex + 1:])

        #if emptySentenceCount > 0:
        #    print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences in", document.get("id")
        # Remove original text
        if removeText:
            del document.attrib["text"]
        # Move elements from document element to sentences
        moveElements(document)
        docCount += 1

    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"
    if emptySentenceCount > 0:
        print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences"

    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        # Remove work directory
        shutil.rmtree(workdir)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
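
The character-walking loop above is easier to see in isolation: each sentence returned by the splitter is matched character by character against the original document text, skipping whitespace on the document side, which yields character offsets that are guaranteed to agree with the source text. A simplified, standalone sketch of that alignment idea (without the redivide-on-newline handling):

def alignToText(text, splitSentences):
    # Return (begin, end) character offsets (inclusive) of each non-empty
    # split sentence within the original text.
    offsets = []
    docIndex = 0
    for sText in splitSentences:
        sText = sText.strip()
        if sText == "":
            continue
        begin = None
        for char in sText:
            if char.isspace():
                continue
            while text[docIndex].isspace():  # skip whitespace in the document
                docIndex += 1
            assert char == text[docIndex], (char, text[docIndex], docIndex)
            if begin is None:
                begin = docIndex
            docIndex += 1
        offsets.append((begin, docIndex - 1))
    return offsets

# For example, alignToText("Hello world. Foo bar.", ["Hello world.", "Foo bar."])
# returns [(0, 11), (13, 20)].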
                         "--tokenization",
                         default=None,
                         dest="tokenization",
                         help="Tokenization element name")
    optparser.add_option("-p",
                         "--parse",
                         default=None,
                         dest="parse",
                         help="Parse element name")
    (options, args) = optparser.parse_args()

    print >> sys.stderr, "Loading input file", options.input
    corpusElements = SentenceGraph.loadCorpus(options.input, options.parse,
                                              options.tokenization)

    counter = ProgressCounter(len(corpusElements.sentences),
                              "Resolving chains")
    tags = ["e1", "e2"]
    for sentence in corpusElements.sentences:
        counter.update(
            1,
            "Resolving chains for (" + sentence.sentence.attrib["id"] + "): ")
        identityChainDict = {}
        tokenHeadScores = sentence.sentenceGraph.getTokenHeadScores()
        for interaction in sentence.interactions:
            if interaction.attrib["type"] == "identity":
                e1 = sentence.entitiesById[interaction.attrib["e1"]]
                e2 = sentence.entitiesById[interaction.attrib["e2"]]
                t1 = sentence.sentenceGraph.entityHeadTokenByEntity[e1]
                t2 = sentence.sentenceGraph.entityHeadTokenByEntity[e2]
                if tokenHeadScores[t2] > tokenHeadScores[t1]:
                    identityChainDict[
Exemplo n.º 52
0
def makeSentences(input,
                  tokenizationPath,
                  output=None,
                  removeText=False,
                  escDict={},
                  ignoreErrors=False):
    """
    Divide text in the "text" attributes of document and section 
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting tokenizations from", tokenizationPath
    assert os.path.exists(tokenizationPath)
    if tokenizationPath.find(".tar.gz") != -1:
        tarFilePath, tokenizationPath = tokenizationPath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if tokenizationPath[0] == "/":
            tokenizationPath = tokenizationPath[1:]
    else:
        tarFile = None

    docCount = 0
    docsWithSentences = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")
                      ] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "Sentence Splitting")
    for document in sourceElements:
        docCount += 1
        origId = document.get("pmid")
        if origId == None:
            origId = document.get("origId")
        origId = str(origId)
        counter.update(
            1, "Splitting Documents (" + document.get("id") + "/" + origId +
            "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        if document.find("sentence") == None:  # no existing sentence split
            text = document.get("text")
            if text == None or text.strip() == "":
                continue

            newFile = os.path.join(tokenizationPath, origId + ".tok")
            f = openFile(newFile, tarFile)
            if f == None:  # file with BioNLP'11 extension not found, try BioNLP'09 extension
                oldFile = os.path.join(tokenizationPath, origId + ".tokenized")
                f = openFile(oldFile, tarFile)
                if f == None:  # no tokenization found
                    continue
            sentencesCreated += alignSentences(document,
                                               f.readlines(),
                                               escDict,
                                               ignoreErrors=ignoreErrors)
            f.close()

            # Remove original text
            if removeText:
                del document.attrib["text"]
            # Move elements from document element to sentences
            moveElements(document)
            docsWithSentences += 1
        else:
            docsWithSentences += 1

    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, docsWithSentences, "/", docCount, "documents have sentences"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
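
As the ".tar.gz" handling above shows, tokenizationPath may point either at a plain directory of .tok/.tokenized files or at a path inside a tar archive, written as the archive name followed by the member path. A hypothetical call, with placeholder paths only:

# Placeholder paths for illustration.
makeSentences("corpus.xml",
              "tokenizations.tar.gz/tokenised",   # member path inside the archive
              output="corpus-sentences.xml",
              removeText=False,
              ignoreErrors=False)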
Exemplo n.º 53
0
def findHeads(input,
              parse,
              tokenization=None,
              output=None,
              removeExisting=True,
              iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(
                input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence,
                                                    sentence.tokens,
                                                    sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(
                len(sentences),
                "Finding heads (" + sentences[-1].sentence.get("id") + "): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)

        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
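
findHeads() can be run either in streaming mode (iterate=True) or by loading the whole corpus at once; a hypothetical call on a parsed corpus, with placeholder paths:

# Placeholder paths; "McCC" is the parse name used elsewhere in these examples.
findHeads("corpus-parsed.xml",
          parse="McCC",
          tokenization=None,
          output="corpus-heads.xml",
          removeExisting=True,
          iterate=False)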
Exemplo n.º 54
0
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False):
    """
    Run GENIA Sentence Splitter
    
    Divide text in the "text" attributes of document and section 
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.
    """
    global sentenceSplitterDir
    
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Create working directory
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents ("+document.get("id")+"): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount)
        assert document.find("sentence") == None
        text = document.get("text")
        if text == None or text.strip() == "":
            continue
        #print type(text)
        # Write text to workfile
        #workdir = tempfile.mkdtemp()
        workfile = codecs.open(os.path.join(workdir, "sentence-splitter-input.txt"+docTag), "wt", "utf-8")
        # From http://themoritzfamily.com/python-encodings-and-unicode.html
        # "You have to be careful with the codecs module. Whatever you pass to it must be a Unicode 
        # object otherwise it will try to automatically decode the byte stream as ASCII"
        # However, the unicode errors here were simply due to STTools reading unicode ST-format as ASCII,
        # thus creating an ASCII interaction XML, which then triggered here the unicode error. So, at this
        # point we should be able to safely write(text), as the output file is unicode, and reading with
        # the correct codec is taken care of earlier in the pipeline.
        workfile.write(text) #.encode("utf-8"))
        workfile.close()
        # Run sentence splitter
        assert os.path.exists(Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh", os.path.join(workdir, "sentence-splitter-input.txt"+docTag), os.path.join(workdir, "sentence-splitter-output.txt"+docTag), Settings.RUBY_PATH]
        #p = subprocess.call(args)
        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr
        #print "stdout<", p.stdout.readlines(), ">"
        #print "stderr<", p.stderr.readlines(), ">"
        if postProcess:
            ppIn = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
            ppOut = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "wt", "utf-8")
            subprocess.call(os.path.join(Settings.GENIA_SENTENCE_SPLITTER_DIR, "geniass-postproc.pl"), stdin=ppIn, stdout=ppOut)
            ppIn.close()
            ppOut.close()
            # Read split sentences
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "rt", "utf-8")
        else:
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
        start = 0 # sentences are consecutively aligned to the text for charOffsets
        sentenceCount = 0
        #text = text.replace("\n", " ") # should stop sentence splitter from crashing.
        #text = text.replace("  ", " ") # should stop sentence splitter from crashing.
        #alignmentText = text.replace("\n", " ").replace("\r", " ")
        #docTokens = reWhiteSpace.split(text)
        docIndex = 0
        sentenceBeginIndex = -1
        prevSentence = None
        prevEndIndex = None
        emptySentenceCount = 0
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip() # The text of the sentence
            if sText == "":
                emptySentenceCount += 1
                continue

            for i in range(len(sText)):
                if sText[i].isspace():
                    assert sText[i] not in ["\n", "\r"]
                    continue
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"] and sentenceBeginIndex != -1:
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex, docIndex-1, prevSentence, prevEndIndex)
                        prevSentence.set("id", docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True")
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex-1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                assert sText[i] == text[docIndex], (text, sText, prevText, sText[i:i+10], text[docIndex:docIndex+10], (i, docIndex), sentenceBeginIndex) # tokens[i].isspace() == False
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
                prevText = sText
            if sentenceBeginIndex != -1:
                prevSentence = makeSentence(text, sentenceBeginIndex, docIndex-1, prevSentence, prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex-1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        # Add possible tail for last sentence
        if prevEndIndex < len(text) - 1 and prevSentence != None:
            assert prevSentence.get("tail") == None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex+1:])
            
        if emptySentenceCount > 0:
            print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences in", document.get("id") 
        # Remove original text
        if removeText:
            del document.attrib["text"]
        # Move elements from document element to sentences
        moveElements(document)
        docCount += 1
    
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"
    
    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        # Remove work directory
        shutil.rmtree(workdir)
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
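
Both splitter variants shown here run the external run_geniass.sh script through subprocess and only echo stderr when it differs from the splitter's usual status message. That check can be factored into a small helper; the sketch below (Python 2, like the surrounding code) reuses the expected-stderr string from the code above and is not part of the repository:

import subprocess
import sys

EXPECTED_STDERR = 'Extracting events.roading model file.\nstart classification.\n'

def runAndReport(args):
    # Run an external command; forward stdout, and forward stderr only when it
    # is not the splitter's normal status output.
    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if stdout != "":
        sys.stderr.write(stdout)
    if stderr != EXPECTED_STDERR:
        sys.stderr.write(stderr)
    return p.returncode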
Exemplo n.º 55
0
 def insertParses(self,
                  parseDir,
                  input,
                  output=None,
                  parseName="McCC",
                  extensions=None,
                  subDirs=None,
                  debug=False,
                  skipParsed=False,
                  docMatchKeys=None,
                  conllFormat=None,
                  splitting=True,
                  unescapeFormats="AUTO",
                  tokenMerging=True,
                  extMap=None,
                  sdFailedFormat="empty",
                  origIdType=None,
                  posTags=None):
     corpusTree, corpusRoot = self.getCorpus(input)
     if not os.path.exists(parseDir):
         raise Exception("Cannot find parse input '" + str(parseDir) + "'")
     if not os.path.isdir(parseDir):
         raise Exception("Parse input '" + str(parseDir) +
                         "' is not a directory")
     if extensions == None:
         extensions = self.allExt
     elif isinstance(extensions, basestring):
         extensions = extensions.split(",")
     extensions = [x for x in extensions if x in self.allExt]
     unescapeFormats = self.getUnescapeFormats(unescapeFormats)
     if docMatchKeys == None:
         docMatchKeys = ["origId", "pmid", "id"]
     elif isinstance(docMatchKeys, basestring):
         docMatchKeys = docMatchKeys.split(",")
     print >> sys.stderr, "Inserting parses from file types:", extensions
     counts = defaultdict(int)
     files = self.getParseFiles(parseDir,
                                extensions,
                                subDirs,
                                counts,
                                extMap=extMap,
                                origIdType=origIdType)
     typeCounts = {x: defaultdict(int) for x in extensions}
     # Make document elements if needed
     documents = [x for x in corpusRoot.findall("document")]
     if len(documents) == 0:
         typeCounts["document-generation"] = defaultdict(int)
         documents = self.prepareDocuments(corpusRoot, files)
     counter = ProgressCounter(len(files), "Parse Insertion")
     # Insert parses and make sentence elements if needed
     typeCounts["sentence-splitting"] = defaultdict(int)
     print >> sys.stderr, "Inserting parses for", len(
         files), "out of total", len(documents), "documents"
     for document in documents:
         counts["document"] += 1
         matchFound = False
         for docMatchValue in [
                 document.get(x) for x in docMatchKeys
                 if document.get(x) != None
         ]:
             if docMatchValue in files:
                 if matchFound:
                     raise Exception(
                         "Multiple matching parses for document " +
                         str(document.attrib) + " using keys " +
                         str(docMatchKeys))
                 matchFound = True
                 counter.update(
                     1, "Inserting parses for (" + document.get("id") +
                     "/" + str(docMatchValue) + "): ")
                 counts["document-match"] += 1
                 for ext in extensions:
                     if ext not in files[docMatchValue]:
                         continue
                     counts[ext + "-match"] += 1
                     sentences = [
                         x for x in self.getSentences(document,
                                                      skipParsed=skipParsed)
                     ]
                     self.insertParse(document,
                                      sentences,
                                      ext,
                                      files[docMatchValue][ext],
                                      parseName,
                                      splitting,
                                      typeCounts,
                                      conllFormat,
                                      unescapeFormats=unescapeFormats,
                                      tokenMerging=tokenMerging,
                                      sdFailedFormat=sdFailedFormat,
                                      posTags=posTags)
         if not matchFound:
             counts["document-no-match"] += 1
     if len(typeCounts["sentence-splitting"]) > 0:
         print >> sys.stderr, "Sentence Splitting Counts", dict(
             typeCounts["sentence-splitting"])
     print >> sys.stderr, "Counts", dict(counts)
     for ext in extensions:
         if len(typeCounts[ext]) > 0:
             print >> sys.stderr, "Counts for type '" + ext + "':", dict(
                 typeCounts[ext])
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
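
insertParses() is a method, so it needs an instance of the class it belongs to (the class itself is not shown in this excerpt); a hypothetical call with placeholder paths:

# 'parseImporter' stands for an instance of the (unshown) class that owns
# insertParses(); the paths are placeholders.
parseImporter.insertParses("parses/",
                           "corpus-sentences.xml",
                           output="corpus-parsed.xml",
                           parseName="McCC")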