def buildExamplesForDocuments(self, documentSentences, output, idFileTag=None):
    """Build classification examples for every document and write them to 'output'.

    documentSentences: list of documents, each a list of sentence wrappers
    whose first element carries the sentence XML element.
    output: path of the example file to (over)write.
    idFileTag: if given, class and feature name files are saved as
    idFileTag + ".class_names" / ".feature_names".
    """
    counter = ProgressCounter(len(documentSentences), "Build examples")
    outfile = open(output, "wt")
    exampleCount = 0
    try:
        for document in documentSentences:
            counter.update(1, "Building examples (" + document[0].sentence.get("id") + "): ")
            examples = self.buildExamples(document)
            exampleCount += len(examples)
            ExampleUtils.appendExamples(examples, outfile)
    finally:
        # Close the output file even if example building fails
        # (fix: the handle was previously leaked on exception).
        outfile.close()
    print >> sys.stderr, "Examples built:", exampleCount
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    #IF LOCAL
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()
    #ENDIF
    # Save Ids
    if idFileTag != None:
        print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
        self.classSet.write(idFileTag + ".class_names")
        print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
        self.featureSet.write(idFileTag + ".feature_names")
def buildExamples(exampleBuilder, sentences, options):
    """Build and preprocess classification examples for a list of sentences.

    Each item of 'sentences' is a pair-like object: slot 0 wraps the sentence,
    slot 1 receives the examples built for it. Returns the combined,
    preprocessed example list.
    """
    # First let the builder determine its predicted value range from all sentences.
    print >> sys.stderr, "Defining predicted value range:",
    sentenceElements = [s[0].sentenceElement for s in sentences]
    exampleBuilder.definePredictedValueRange(sentenceElements, "entity")
    print >> sys.stderr, exampleBuilder.getPredictedValueRange()
    # The graph_kernel style gets an extra ProgressCounter argument.
    if hasattr(exampleBuilder, "styles") and "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")
    examples = []
    for sentence in sentences:
        counter.update(1, "Building examples ("+sentence[0].getSentenceId()+"): ")
        built = exampleBuilder.buildExamples(sentence[0])
        sentence[1] = built
        examples.extend(built)
    print >> sys.stderr, "Examples built:", len(examples)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Preprocessing examples:"
    examples = exampleBuilder.preProcessExamples(examples)
    return examples
def fixAltOffsets(input, output=None):
    """Convert entity altOffset values from document-level to sentence-level coordinates.

    For every entity with an altOffset, subtracts the start offset of its
    sentence. Writes the corpus to 'output' if given; returns the corpus tree.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            # Shift every offset pair left by the sentence start position.
            shifted = [(begin - sentOffset[0], end - sentOffset[0]) for (begin, end) in altOffsets]
            entity.set("altOffset", Range.tuplesToCharOffset(shifted))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processCorpus(inputFilename, outputFilename, rules):
    """Delete elements from a corpus according to 'rules' and report counts.

    rules: {elementType: {attrName: "v1|v2|..."}} -- pipe-separated value
    strings are split into lists before processing. Writes the result to
    outputFilename if given; returns the corpus tree.
    """
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    # Split pipe-separated attribute values. Guard with isinstance so a rule
    # whose values are already a list (e.g. pre-split by the caller) does not
    # crash on .split() (fix: the split was previously unconditional).
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            if isinstance(rules[eType][attrRule], basestring):
                rules[eType][attrRule] = rules[eType][attrRule].split("|")
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token closest
    to the root for the subtree of the dependency parse spanned by the text of the element.

    @param corpus: corpus XML root (iterated for sentence elements)
    @param parse: name of the parse to use
    @param tokenization: name of the tokenization to use
    @return: [count of heads set, 0] (second slot is unused here)
    """
    counts = [0,0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            # Fix: previously fell through and crashed on tokElement.findall(...)
            # for sentences without a parse or tokenization.
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
    """Build classification examples for each sentence and write them to 'output'.

    sentences / goldSentences are parallel lists; goldSentences may be None,
    in which case no gold sentence is passed to the builder.
    append: open the output file in append mode instead of overwriting.
    idFileTag: if given, class and feature name files are saved as
    idFileTag + ".class_names" / ".feature_names".
    """
    counter = ProgressCounter(len(sentences), "Build examples")
    if append:
        outfile = open(output, "at")
    else:
        outfile = open(output, "wt")
    exampleCount = 0
    try:
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = [None]
            if goldSentences != None:
                goldSentence = goldSentences[i]
            counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = self.buildExamples(sentence[0], goldSentence[0], append=append)
            exampleCount += len(examples)
            examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
    finally:
        # Close the output file even if example building fails
        # (fix: the handle was previously leaked on exception).
        outfile.close()
    print >>sys.stderr, "Examples built:", exampleCount
    print >>sys.stderr, "Features:", len(self.featureSet.getNames())
    # IF LOCAL
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()
    # ENDIF
    # Save Ids
    if idFileTag != None:
        print >>sys.stderr, "Saving class names to", idFileTag + ".class_names"
        self.classSet.write(idFileTag + ".class_names")
        print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
        self.featureSet.write(idFileTag + ".feature_names")
def processCorpus(inputFilename, outputFilename, rules):
    """Remove elements matching 'rules' from a corpus and report per-rule counts.

    Accepts a gzip-compressed input file when the name ends in '.gz'.
    Writes the result to outputFilename if given; returns the corpus tree.
    """
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".",1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    # One zero-initialized counter per rule key.
    countsByType = dict((ruleKey, 0) for ruleKey in sorted(rules.keys()))
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for key in sorted(countsByType.keys()):
        print >> sys.stderr, " " + key + ":", countsByType[key]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def prepareDocuments(self, corpusRoot, files, conllFormat=None, counts=None):
    """Generate a <document> element under corpusRoot for each parsed document.

    files: {docName: {extension: path}}. The document text is rebuilt by
    joining the token texts of the first available parse file.
    Returns the list of document elements now present in corpusRoot.
    """
    print >> sys.stderr, "Generating document elements from the parses"
    docNames = sorted(files.keys())
    corpusName = corpusRoot.get("source", "CORPUS")
    counter = ProgressCounter(len(docNames), "Document Generation")
    for docIndex, docName in enumerate(docNames):
        counter.update(1, "Making document element for document '" + str(docName) + "': ")
        extensions = sorted(files[docName].keys())
        # Read the sentences from the first extension's parse file.
        sentObjs = self.readParse(extensions[0], files[docName][extensions[0]], conllFormat)
        sentTexts = []
        for sentObj in sentObjs:
            if "tokens" in sentObj:
                sentTexts.append(" ".join([x["text"] for x in sentObj["tokens"]]))
        docText = " ".join(sentTexts)
        ET.SubElement(corpusRoot, "document", id=corpusName + ".d" + str(docIndex), origId=docName, text=docText)
    return [x for x in corpusRoot.findall("document")]
def processCorpus(inputFilename, outputFilename, rules):
    """Delete elements from a corpus according to 'rules' and report counts.

    rules: {elementType: {attrName: "v1|v2|..." or [values]}} -- string
    values are split on '|' into lists before processing. Writes the result
    to outputFilename if given; returns the corpus tree.
    """
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            # isinstance(..., basestring) instead of 'type(...) in
            # types.StringTypes': idiomatic and also covers str/unicode
            # subclasses, which an exact type() comparison misses.
            if isinstance(rules[eType][attrRule], basestring):
                rules[eType][attrRule] = rules[eType][attrRule].split("|")
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def processCorpora(EvaluatorClass, fromCorpus, toCorpus, target, classSets, negativeClassId, entityMatchFunction): entityExamples = [] entityPredictions = [] interactionExamples = [] interactionPredictions = [] eventExamples = [] eventPredictions = [] falseEntity = defaultdict(lambda: defaultdict(int)) counter = ProgressCounter(len(fromCorpus.sentences), "Corpus Processing") # Loop through the sentences and collect all predictions toCorpusSentences = None if toCorpus != None: toCorpusSentences = toCorpus.documentSentences for i in range(len(fromCorpus.documentSentences)): if len(fromCorpus.documentSentences[i]) > 0: counter.update( len(fromCorpus.documentSentences[i]), fromCorpus.documentSentences[i][0].sentence.get("id").rsplit( ".", 1)[0]) if toCorpusSentences != None: newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument( fromCorpus.documentSentences[i], toCorpusSentences[i], target, classSets, negativeClassId, entityMatchFunction) else: newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument( fromCorpus.documentSentences[i], None, target, classSets, negativeClassId, entityMatchFunction) entityExamples.extend(newEntityExPred[0]) entityPredictions.extend(newEntityExPred[1]) interactionExamples.extend(newInteractionExPred[0]) interactionPredictions.extend(newInteractionExPred[1]) eventExamples.extend(newEventExPred[0]) eventPredictions.extend(newEventExPred[1]) for k, v in sentFalseEntity.iteritems(): falseEntity[k][0] += v[0] falseEntity[k][1] += v[1] # Process the predictions with an evaluator and print the results evaluator = None if len(entityPredictions) > 0: evaluator = EvaluatorClass(entityExamples, entityPredictions, classSet=classSets["entity"]) print evaluator.toStringConcise(title="Entities") if len(interactionPredictions) > 0: evaluator = EvaluatorClass(interactionExamples, interactionPredictions, classSet=classSets["interaction"]) print evaluator.toStringConcise(title="Interactions") 
#print "Interactions (fp ent->fp int, fn-ent->fn-int )" #for key in sorted(falseEntity.keys()): # print "", key, falseEntity[key][0], "/", falseEntity[key][1] if len(eventPredictions) > 0: evaluator = EvaluatorClass(eventExamples, eventPredictions, classSet=classSets["entity"]) print evaluator.toStringConcise(title="Events") return evaluator
def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
    """Insert parses from the files in directory 'parseDir' into corpus 'input'.

    Documents are matched to parse files through the attribute values named
    in docMatchKeys (default: origId, pmid, id). If the corpus contains no
    document elements, they are first generated from the parse files.
    Writes the result to 'output' if given and returns the corpus ElementTree.
    """
    corpusTree, corpusRoot = self.getCorpus(input)
    if not os.path.exists(parseDir):
        raise Exception("Cannot find parse input '" + str(parseDir) + "'")
    if not os.path.isdir(parseDir):
        raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
    # Normalize the extension filter to a list restricted to known extensions.
    if extensions == None:
        extensions = self.allExt
    elif isinstance(extensions, basestring):
        extensions = extensions.split(",")
    extensions = [x for x in extensions if x in self.allExt]
    unescapeFormats = self.getUnescapeFormats(unescapeFormats)
    # Normalize the document matching keys to a list.
    if docMatchKeys == None:
        docMatchKeys = ["origId", "pmid", "id"]
    elif isinstance(docMatchKeys, basestring):
        docMatchKeys = docMatchKeys.split(",")
    print >> sys.stderr, "Inserting parses from file types:", extensions
    counts = defaultdict(int)
    files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
    # Per-extension counters for the insertion statistics printed at the end.
    typeCounts = {x:defaultdict(int) for x in extensions}
    # Make document elements if needed
    documents = [x for x in corpusRoot.findall("document")]
    if len(documents) == 0:
        typeCounts["document-generation"] = defaultdict(int)
        documents = self.prepareDocuments(corpusRoot, files)
    counter = ProgressCounter(len(files), "Parse Insertion")
    # Insert parses and make sentence elements if needed
    typeCounts["sentence-splitting"] = defaultdict(int)
    print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
    for document in documents:
        counts["document"] += 1
        matchFound = False
        # Try each identifying attribute in order; a document must match at
        # most one parse file set, otherwise the match is ambiguous.
        for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
            if docMatchValue in files:
                if matchFound:
                    raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                matchFound = True
                counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                counts["document-match"] += 1
                # Insert every available parse file type for this document.
                for ext in extensions:
                    if ext not in files[docMatchValue]:
                        continue
                    counts[ext + "-match"] += 1
                    sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                    self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
        if not matchFound:
            counts["document-no-match"] += 1
    if len(typeCounts["sentence-splitting"]) > 0:
        print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
    print >> sys.stderr, "Counts", dict(counts)
    for ext in extensions:
        if len(typeCounts[ext]) > 0:
            print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def run(
    cls,
    fileIn,
    fileOut=None,
    tokenization="split-Charniak-Lease",
    entityOffsetKey="charOffset",
    includeNeg=False,
    stem=False,
):
    """Builds the master gazzeteer.

    fileIn: a string (ending with .xml or .xml.gz), an open input stream, an ElementTree or an Element
    fileOut: a string or None. If given, the resulting gazzetteer will be written out
    tokenization: name of the tokenization to be used

    Produces a dictionary mapping token text to per-class counts.
    """
    print >>sys.stderr, "Building gazetteer"
    gztr = {}  # token text -> {className: count}
    root = ETUtils.ETFromObj(fileIn)
    if not ET.iselement(root):
        assert isinstance(root, ET.ElementTree)
        root = root.getroot()
    sentences = [sNode for sNode in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Build gazetteer")
    for sNode in sentences:
        counter.update(1, "Adding to gazetteer sentence " + sNode.get("id") + ", ")
        # Locate the requested tokenization; fail loudly if it is missing.
        for tokenizationNode in sNode.getiterator("tokenization"):
            if tokenizationNode.get("tokenizer") == tokenization:
                break
        else:
            assert False, "Did not find %s tokenization" % tokenization
        tClasses = tokClasses(tokenizationNode, sNode, entityOffsetKey)
        assert len(tClasses) == len(tokenizationNode)
        for tokIdx, tokNode in enumerate(tokenizationNode):
            gsClass = tClasses[tokIdx]
            b, e = charOffStr2tuple(tokNode.get("charOffset"))
            lowered = tokTxt(b, e, sNode, stem).lower()
            baseCounts = gztr.setdefault(lowered, {})
            baseCounts[gsClass] = baseCounts.get(gsClass, 0) + 1
            # Hyphenated tokens also contribute a collapsed and a last-part form.
            if "-" in lowered:
                for variant in (lowered.replace("-", ""), lowered.rsplit("-", 1)[-1]):
                    if variant != "":
                        variantCounts = gztr.setdefault(variant, {})
                        variantCounts[gsClass] = variantCounts.get(gsClass, 0) + 1
    if fileOut:
        Gazetteer.saveGztr(gztr, fileOut, includeNeg)
    return gztr
def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.
    """
    import cElementTreeUtils as ETUtils
    import sys
    sys.path.append("..")
    from Utils.ProgressCounter import ProgressCounter
    from InteractionXML.CorpusElements import CorpusElements
    # The corpus may arrive as a file path or as an already-loaded object.
    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()
    # Access the xml-tree through the CorpusElements wrapper.
    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"
    # Build a SentenceGraph for every sentence that has tokens and dependencies.
    duplicateInteractionEdgesRemoved = 0
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
        # A sentence without tokens or dependencies cannot get a graph. Keep it
        # in the corpus (so no input sentences are lost) but mark it graphless.
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
            sentence.sentenceGraph = None
            continue
        # gif-xml pairs are interactions that may also be negative; pairs that
        # declare interaction="True" (or omit the attribute, a legacy quirk)
        # are folded into the interaction list.
        for pair in sentence.pairs:
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair)
                if pair.get("type") == None:
                    pair.set("type", "undefined")
        # Syntactic structure first, then the semantic annotation on top.
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        graph.parseElement = sentence.parseElement
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
def run(cls, fileIn, fileOut=None, tokenization="split-Charniak-Lease", entityOffsetKey="charOffset", includeNeg=False, stem=False):
    """Builds the master gazzeteer.

    fileIn: a string (ending with .xml or .xml.gz), an open input stream, an ElementTree or an Element
    fileOut: a string or None. If given, the resulting gazzetteer will be written out
    tokenization: name of the tokenization to be used

    Produces a dictionary mapping token text to per-class counts.
    """
    print >> sys.stderr, "Building gazetteer"
    gztr = {}  # maps token text -> {className: occurrence count}
    root = ETUtils.ETFromObj(fileIn)
    if not ET.iselement(root):
        assert isinstance(root, ET.ElementTree)
        root = root.getroot()
    sentences = list(root.getiterator("sentence"))
    counter = ProgressCounter(len(sentences), "Build gazetteer")

    def record(text, gsClass):
        # Count one occurrence of gsClass for the given token text.
        classCounts = gztr.setdefault(text, {})
        classCounts[gsClass] = classCounts.get(gsClass, 0) + 1

    for sNode in sentences:
        counter.update(1, "Adding to gazetteer sentence " + sNode.get("id") + ", ")
        # Find the requested tokenization; fail loudly if it is absent.
        for tokenizationNode in sNode.getiterator("tokenization"):
            if tokenizationNode.get("tokenizer") == tokenization:
                break
        else:
            assert False, "Did not find %s tokenization" % tokenization
        tClasses = tokClasses(tokenizationNode, sNode, entityOffsetKey)
        assert len(tClasses) == len(tokenizationNode)
        for tokIdx, tokNode in enumerate(tokenizationNode):
            gsClass = tClasses[tokIdx]
            b, e = charOffStr2tuple(tokNode.get("charOffset"))
            tokText = tokTxt(b, e, sNode, stem).lower()
            record(tokText, gsClass)
            # Multi-part (hyphenated) texts also register collapsed and
            # last-token variants.
            if "-" in tokText:
                collapsed = tokText.replace("-", "")
                if collapsed != "":
                    record(collapsed, gsClass)
                lastPart = tokText.rsplit("-", 1)[-1]
                if lastPart != "":
                    record(lastPart, gsClass)
    if fileOut:
        Gazetteer.saveGztr(gztr, fileOut, includeNeg)
    return gztr
def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    """Split a corpus into one output file per document 'set' attribute.

    input: corpus file/object; outDir: output directory (if 'stem' is None,
    outDir's basename becomes the stem and its dirname the directory).
    mergedSets: list of set-name groups to additionally combine into
    "-and-"-joined corpora. saveCombined: also write the unsplit corpus.
    """
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose:
                print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            # dict.has_key() is deprecated; use the 'in' operator instead.
            if "No set" not in countsByType:
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif docSet not in newCorpora:
            # First document of this set: create a corpus element that copies
            # the root attributes of the original corpus.
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if tag not in newCorpora:
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + str(k) + ":", countsByType[k]
    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
def processCorpora(EvaluatorClass, fromCorpus, toCorpus, target, classSets, negativeClassId, entityMatchFunction, errorMatrix=False, verbose=False): counts = defaultdict(int) entityExamples = [] entityPredictions = [] interactionExamples = [] interactionPredictions = [] eventExamples = [] eventPredictions = [] falseEntity = defaultdict(lambda: defaultdict(int)) if not verbose: counter = ProgressCounter(len(fromCorpus.sentences), "Corpus Processing") # Loop through the sentences and collect all predictions toCorpusSentences = None if toCorpus != None: toCorpusSentences = toCorpus.documentSentences for i in range(len(fromCorpus.documentSentences)): if len(fromCorpus.documentSentences[i]) > 0 and not verbose: counter.update(len(fromCorpus.documentSentences[i]), fromCorpus.documentSentences[i][0].sentence.get("id").rsplit(".", 1)[0]) if toCorpusSentences != None: newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(fromCorpus.documentSentences[i], toCorpusSentences[i], target, classSets, negativeClassId, entityMatchFunction, verbose=verbose, counts=counts) else: newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(fromCorpus.documentSentences[i], None, target, classSets, negativeClassId, entityMatchFunction, verbose=verbose, counts=counts) entityExamples.extend(newEntityExPred[0]) entityPredictions.extend(newEntityExPred[1]) interactionExamples.extend(newInteractionExPred[0]) interactionPredictions.extend(newInteractionExPred[1]) eventExamples.extend(newEventExPred[0]) eventPredictions.extend(newEventExPred[1]) for k,v in sentFalseEntity.iteritems(): falseEntity[k][0] += v[0] falseEntity[k][1] += v[1] # Process the predictions with an evaluator and print the results evaluator = None if len(entityPredictions) > 0: evaluator = EvaluatorClass(entityExamples, entityPredictions, classSet=classSets["entity"]) print evaluator.toStringConcise(title="Entities") if errorMatrix: print 
evaluator.matrixToString() print evaluator.matrixToString(True) if len(interactionPredictions) > 0: evaluator = EvaluatorClass(interactionExamples, interactionPredictions, classSet=classSets["interaction"]) print evaluator.toStringConcise(title="Interactions") if errorMatrix: print evaluator.matrixToString() print evaluator.matrixToString(True) #print "Interactions (fp ent->fp int, fn-ent->fn-int )" #for key in sorted(falseEntity.keys()): # print "", key, falseEntity[key][0], "/", falseEntity[key][1] if len(eventPredictions) > 0: evaluator = EvaluatorClass(eventExamples, eventPredictions, classSet=classSets["entity"]) print evaluator.toStringConcise(title="Events") if errorMatrix: print evaluator.matrixToString() print evaluator.matrixToString(True) return evaluator
def compareToBinary(complexSentencesById, classifications, exampleBuilder, options):
    """Evaluate pairwise classifications projected onto a binary corpus.

    Groups the complex-corpus classifications by sentence origId, rebuilds a
    graph of positively classified token pairs for each matching binary
    sentence, and labels each binary pair positive when a path connects its
    two entity head tokens. Prints the resulting evaluation and optionally
    saves it as CSV.
    """
    # Load corpus and make sentence graphs
    print >> sys.stderr, "Calculating performance on binary corpus"
    # Group classifications by the origId of their source sentence.
    classificationsBySentence = {}
    for classification in classifications:
        example = classification[0][0]
        sentenceId = example[0].rsplit(".", 1)[0]
        sentenceOrigId = complexSentencesById[sentenceId].sentence.attrib["origId"]
        classificationsBySentence.setdefault(sentenceOrigId, []).append(classification)
    print >> sys.stderr, "Loading Binary corpus"
    binaryCorpusElements = loadCorpus(options.binaryCorpus)
    binaryClassifications = []
    counter = ProgressCounter(len(binaryCorpusElements.sentences), "Build binary classifications")
    for binarySentence in binaryCorpusElements.sentences:
        counter.update(1, "Building binary classifications ("+binarySentence.sentence.attrib["id"]+"): ")
        if binarySentence.sentence.attrib["origId"] in classificationsBySentence:
            # NOTE(review): 'multiedges' is not defined in this function --
            # presumably a module-level flag; verify before relying on this path.
            complexClassificationGraph = NX.XGraph(multiedges = multiedges)
            for token in binarySentence.sentenceGraph.tokens:
                complexClassificationGraph.add_node(token)
            # Add an edge for every positively classified pair, matching
            # complex-corpus tokens to binary-corpus tokens by charOffset.
            for classification in classificationsBySentence[binarySentence.sentence.attrib["origId"]]:
                if classification[1] > 0:
                    example = classification[0][0]
                    t1 = example[3]["t1"]
                    t2 = example[3]["t2"]
                    t1Binary = None
                    for token in binarySentence.sentenceGraph.tokens:
                        if token.attrib["charOffset"] == t1.attrib["charOffset"]:
                            t1Binary = token
                    t2Binary = None
                    for token in binarySentence.sentenceGraph.tokens:
                        if token.attrib["charOffset"] == t2.attrib["charOffset"]:
                            t2Binary = token
                    assert(t1Binary != None and t2Binary != None)
                    complexClassificationGraph.add_edge(t1Binary, t2Binary)
            paths = NX.all_pairs_shortest_path(complexClassificationGraph, cutoff=999)
            # A binary pair is predicted positive iff its head tokens are connected.
            for pair in binarySentence.pairs:
                t1 = binarySentence.sentenceGraph.entityHeadTokenByEntity[pair.attrib["e1"]]
                t2 = binarySentence.sentenceGraph.entityHeadTokenByEntity[pair.attrib["e2"]]
                assert(pair.attrib["interaction"] == "True" or pair.attrib["interaction"] == "False")
                if pair.attrib["interaction"] == "True":
                    pairClass = 1
                else:
                    pairClass = -1
                extra = {"xtype":"edge","type":"i","t1":t1,"t2":t2}
                if t1 in paths and t2 in paths[t1]:
                    binaryClassifications.append([[pair.attrib["id"], pairClass, None, extra], 1, "binary"])
                else:
                    binaryClassifications.append([[pair.attrib["id"], pairClass, None, extra], -1, "binary"])
    print >> sys.stderr, "Evaluating binary classifications"
    # Fix: the original evaluated the undefined name 'predictions'; the
    # classifications collected above are what must be evaluated here.
    evaluation = Evaluation(binaryClassifications, classSet=exampleBuilder.classSet)
    print >> sys.stderr, evaluation.toStringConcise()
    if options.output != None:
        evaluation.saveCSV(options.output + "/binary_comparison_results.csv")
def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.
    """
    import Utils.ElementTreeUtils as ETUtils
    import sys
    from Utils.ProgressCounter import ProgressCounter
    from Utils.InteractionXML.CorpusElements import CorpusElements
    # The corpus may be given as a file path or as an already-loaded object.
    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()
    # Wrap the xml-tree in CorpusElements for structured access.
    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"
    # Attach a SentenceGraph to every sentence that has both tokens and dependencies.
    duplicateInteractionEdgesRemoved = 0
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
        # Sentences without tokens or dependencies get no graph, but are kept
        # in the corpus so no input sentences are lost.
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
            sentence.sentenceGraph = None
            continue
        # gif-xml pairs are interaction-like elements that may be negative;
        # pairs declaring interaction="True" (or, due to legacy data, omitting
        # the attribute) are folded into the interaction list.
        for pair in sentence.pairs:
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair)
                if pair.get("type") == None:
                    pair.set("type", "undefined")
        # Build the syntactic graph, then map the semantic annotation onto it.
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        graph.parseElement = sentence.parseElement
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
def processCorpus(input, output, wordVectorPath, tokenizerName="McCC", max_rank_mem=100000, max_rank=10000000):
    """
    Build a word-vector vocabulary for the tokens of a corpus.

    input: corpus file or ElementTree object
    output: output path stem; writes <output>-vectors.json.gz and
            <output>-indices.json.gz when not None
    wordVectorPath: path to the word-vector model loaded with WV.load
    tokenizerName: name of the tokenization whose tokens are indexed
    max_rank_mem / max_rank: rank limits passed to WV.load
    Returns the vocabulary dict {"indices": {word: index}, "vectors": [...]}.
    Each stored vector is the normalized word vector plus two extra flag
    dimensions; the special entries "[out]" and "[OoV]" use zero vectors
    with one flag dimension set.
    """
    print >> sys.stderr, "Making vocabulary"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    vocabulary = {"indices":{}, "vectors":[]}

    print >> sys.stderr, "Loading word vectors from", wordVectorPath
    print >> sys.stderr, "max_rank_mem", max_rank_mem
    print >> sys.stderr, "max_rank", max_rank
    # ranks may arrive as strings from the command line; normalize to int
    max_rank_mem = int(max_rank_mem)
    max_rank = int(max_rank)
    wv = WV.load(wordVectorPath, max_rank_mem, max_rank)
    dimVector = wv.vectors.shape[1]
    print >> sys.stderr, "WordVector length", dimVector
    #addVector("[out]", wv.w_to_normv("and").tolist(), vocabulary)
    #addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    #addVector("[OoV]", wv.w_to_normv("and").tolist(), vocabulary)
    #addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    counts = defaultdict(int)
    for document in documents:
        counter.update()
        counts["document"] += 1
        for sentence in document.findall("sentence"):
            counts["sentence"] += 1
            tokenization = IXMLUtils.getTokenizationElement(sentence, tokenizerName)
            if tokenization != None:
                counts["tokenization"] += 1
                for token in tokenization.findall("token"):
                    counts["token"] += 1
                    text = token.get("text")
                    if text not in vocabulary["indices"]:
                        counts["token-unique"] += 1
                        # lookup is lowercased, but the vocabulary key keeps
                        # the original casing of the token text
                        vector = wv.w_to_normv(token.get("text").lower())
                        if vector is not None:
                            counts["vector"] += 1
                            vector = vector.tolist() + [0.0, 0.0]
                            addVector(text, vector, vocabulary)
                        else:
                            counts["no-vector"] += 1
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing vectors to", output + "-vectors.json.gz"
        with gzip.open(output + "-vectors.json.gz", "wt") as f:
            json.dump(vocabulary, f)
        print >> sys.stderr, "Writing indices to", output + "-indices.json.gz"
        with gzip.open(output + "-indices.json.gz", "wt") as f:
            # indices-only file: same mapping, vectors omitted
            json.dump({"indices":vocabulary["indices"], "vectors":None}, f)
    return vocabulary
def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True): # Create intermediate paths if needed if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)): os.makedirs(os.path.dirname(output)) # Open output file openStyle = "wt" if append: #print "Appending examples" openStyle = "at" if output.endswith(".gz"): outfile = gzip.open(output, openStyle) else: outfile = open(output, openStyle) # Build examples self.exampleCount = 0 if type(input) in types.StringTypes: self.elementCounts = self.getElementCounts(input) if self.elementCounts["sentences"] > 0: self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples") else: self.elementCounts = None self.progress = ProgressCounter(None, "Build examples") else: self.elementCounts = None self.progress = ProgressCounter(None, "Build examples") self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization)) inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization) #goldIterator = [] if gold != None: goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization) for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None): assert inputSentences != None assert goldSentences != None self.processDocument(inputSentences, goldSentences, outfile) else: for inputSentences in inputIterator: self.processDocument(inputSentences, None, outfile) outfile.close() self.progress.endUpdate() # Show statistics print >> sys.stderr, "Examples built:", self.exampleCount print >> sys.stderr, "Features:", len(self.featureSet.getNames()) print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles)) if self.exampleStats.getExampleCount() > 0: self.exampleStats.printStats() # Save Ids if allowNewIds: self.saveIds()
def compareDocuments(documentMap, targetFiles, options): documentIds = sorted(documentMap.keys()) counter = ProgressCounter(len(documentIds)) stats = {} eventStats = { "Start Events": 0, "End Events": 0, "False Positive Trigger": 0 } #, #"Cause FN":0, #"Cause FP":0, #"Theme FN":0, #"Theme FP":0} for docId in documentIds: counter.update(1, "Processing: ") # document " + str(docId) + ": " ) for fileName in sorted(documentMap[docId]): extension = fileName.split(".", 1)[-1] addStat(stats, extension, "source") if os.path.exists(os.path.join(options.output, fileName)): addStat(stats, extension, "target") if extension == "txt" or extension == "a1": if compareByLine(fileName, options): addStat(stats, extension, "identical") else: addStat(stats, extension, "different") if options.verbose: print >> sys.stderr, " ...in comparing", fileName elif extension == "a2.t1": if compareA2T1Files(fileName, options, eventStats): addStat(stats, extension, "identical") else: addStat(stats, extension, "different") if options.verbose: print >> sys.stderr, " ...in comparing", fileName print >> sys.stderr, "Files (source, target, identical, different):" for key in sorted(stats.keys()): print >> sys.stderr, " " + key + ":" + (10 - len(key)) * " " + "\t", for value in stats[key]: print >> sys.stderr, "\t" + str(value), print >> sys.stderr print >> sys.stderr, "Event stats:" for key in sorted(eventStats.keys()): print >> sys.stderr, " " + key + ": " + str(eventStats[key]) print >> sys.stderr, "Event extraction:" eventsSource = eventStats["Start Events"] events0 = 0 if eventStats.has_key("Error Level 0"): events0 = eventStats["Error Level 0"] if eventsSource == 0: percent = 0 else: percent = (100.0 * events0 / eventsSource) print >> sys.stderr, " Exact:", events0, "/", eventsSource, "(%.2f" % percent + " %)"
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    """
    Determine the head-token offset for entities in a corpus.

    input: corpus file or ElementTree object
    parse / tokenization: analysis names used to build sentence graphs
    output: optional output path
    removeExisting: delete existing headOffset attributes first, so they
                    are recalculated
    iterate: stream the corpus sentence-batch by sentence-batch instead of
             loading it fully into memory
    Returns the corpus ElementTree (non-iterating branch) or None from the
    streaming branch's implicit return.
    """
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                # Building the SentenceGraph recalculates head offsets as a
                # side effect of mapping the interactions.
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            # BUGFIX: the input was redundantly re-parsed here with a second
            # ETUtils.ETFromObj(input) call; the tree loaded above is reused.
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)

        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
def processCorpus(input, outDir, stem, tail, mergedSets=[]): newCorpora = {} print >> sys.stderr, "Loading corpus file", input corpusRoot = ETUtils.ETFromObj(input).getroot() documents = corpusRoot.findall("document") counter = ProgressCounter(len(documents), "Documents") countsByType = {} for document in documents: counter.update() docSet = document.get("set") if docSet == None: print >> sys.stderr, "Warning, no set defined for document", document.get("id") if not countsByType.has_key(None): countsByType[None] = 0 countsByType[docSet] += 1 continue if not newCorpora.has_key(docSet): newCorpora[docSet] = ET.Element("corpus") for k, v in corpusRoot.attrib.iteritems(): newCorpora[docSet].set(k, v) countsByType[docSet] = 0 newCorpora[docSet].append(document) countsByType[docSet] += 1 # Make merged sets for mergedSet in mergedSets: tag = "-and-".join(sorted(mergedSet)) if not newCorpora.has_key(tag): newCorpora[tag] = ET.Element("corpus") for k, v in corpusRoot.attrib.iteritems(): newCorpora[tag].set(k, v) countsByType[tag] = 0 for componentSet in mergedSet: for element in newCorpora[componentSet].findall("document"): newCorpora[tag].append(element) countsByType[tag] += 1 print >> sys.stderr, "New Sets" for k in sorted(countsByType.keys()): print >> sys.stderr, " " + str(k) + ":", countsByType[k] if not os.path.exists(outDir): os.makedirs(outDir) print >> sys.stderr, "Writing output files to directory", outDir for docSet in sorted(newCorpora.keys()): outFilename = os.path.join(outDir, stem + docSet + tail) print >> sys.stderr, "Writing set", docSet, "to", outFilename ETUtils.write(newCorpora[docSet], outFilename)
def compareDocuments(documentMap, targetFiles, options): documentIds = sorted(documentMap.keys()) counter = ProgressCounter(len(documentIds)) stats = {} eventStats = {"Start Events":0, "End Events":0, "False Positive Trigger":0}#, #"Cause FN":0, #"Cause FP":0, #"Theme FN":0, #"Theme FP":0} for docId in documentIds: counter.update(1, "Processing: ")# document " + str(docId) + ": " ) for fileName in sorted(documentMap[docId]): extension = fileName.split(".",1)[-1] addStat(stats, extension, "source") if os.path.exists(os.path.join(options.output, fileName)): addStat(stats, extension, "target") if extension == "txt" or extension == "a1": if compareByLine(fileName, options): addStat(stats, extension, "identical") else: addStat(stats, extension, "different") if options.verbose: print >> sys.stderr, " ...in comparing", fileName elif extension == "a2.t1": if compareA2T1Files(fileName, options, eventStats): addStat(stats, extension, "identical") else: addStat(stats, extension, "different") if options.verbose: print >> sys.stderr, " ...in comparing", fileName print >> sys.stderr, "Files (source, target, identical, different):" for key in sorted(stats.keys()): print >> sys.stderr, " " + key + ":" + (10-len(key)) * " " + "\t", for value in stats[key]: print >> sys.stderr, "\t" + str(value), print >> sys.stderr print >> sys.stderr, "Event stats:" for key in sorted(eventStats.keys()): print >> sys.stderr, " " + key + ": " + str(eventStats[key]) print >> sys.stderr, "Event extraction:" eventsSource = eventStats["Start Events"] events0 = 0 if eventStats.has_key("Error Level 0"): events0 = eventStats["Error Level 0"] if eventsSource == 0: percent = 0 else: percent = (100.0 * events0 / eventsSource) print >> sys.stderr, " Exact:", events0, "/", eventsSource, "(%.2f" % percent + " %)"
def build(cls, input, output, parse, tokenization=None, includeNeg=False): p = PathGazetteer(includeNeg) sentences = cls.getSentences(input, parse, tokenization) counter = ProgressCounter(len(sentences), "Build path gazetteer") for sentence in sentences: counter.update(1, "Building path gazetteer ("+sentence[0].getSentenceId()+"): ") p.processSentence(sentence[0]) p.calculateFractions() f = open(output, "wt") for key in sorted(p.gazetteer.keys()): v = p.gazetteer[key] f.write(key + " " + str(v[0]) + " " + str(v[1]) + " " + str(v[2]) + " " + str(v[3]) + "\n") f.close()
def processCorpus(input, attrs=["text"]): print attrs print >> sys.stderr, "Loading corpus file", input corpusRoot = ETUtils.ETFromObj(input).getroot() documents = corpusRoot.findall("document") counter = ProgressCounter(len(documents), "Documents") countsByType = {} interactors = {} for document in documents: entDict = {} for entity in document.getiterator("entity"): entDict[entity.get("id")] = entity for interaction in document.getiterator("interaction"): e1 = entDict[interaction.get("e1")] e2 = entDict[interaction.get("e2")] # form identifier tuples e1Tuple = [] for attr in attrs: e1Tuple.append(e1.get(attr)) e1Tuple = tuple(e1Tuple) e2Tuple = [] for attr in attrs: e2Tuple.append(e2.get(attr)) e2Tuple = tuple(e2Tuple) interactors = [e1Tuple, e2Tuple] #interactors.sort() print interactors
def process(input, output=None, preprocess=True, debug=False): """ Run MetaMap. """ counter = ProgressCounter(id="MetaMap") # Create working directory workdir = tempfile.mkdtemp() outWriter = None if output != None: outWriter = ETUtils.ETWriter(output) # Loop iteratively over elements skip = False for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")): if event == "start": # element start message, element may not be fully read yet if element.tag == "sentence": sentence = element counter.update(1, "Processing MetaMap ("+sentence.get("id")+"): ") # Run metamap for the sentence element elif element.tag == "metamap": # skip the metamap element to remove the original one skip = True if not skip and output != None: outWriter.begin(element) elif event == "end": # element is fully read in memory if not skip and output != None: outWriter.end(element) if element.tag == "metamap": skip = False # write elements again after this one if preprocess: element = convert(element, sentence) outWriter.write(element) # insert the new metamap element into the output stream if output != None: print >> sys.stderr, "Writing output to", output outWriter.close() ETUtils.encodeNewlines(output) if debug: print >> sys.stderr, "Work directory preserved for debugging at", workdir else: shutil.rmtree(workdir) return output
def readARFF(filename): featureSet = IdSet(1) classSet = IdSet(0) f = open(filename,"rt") inData = False lines = f.readlines() counter = ProgressCounter(len(lines),"ARFFLine") examples = [] for line in lines: counter.update(string="Processing line " + str(counter.current + 1) + ": ") line = line.strip() if len(line) == 0 or line[0] == "%": continue elif line[0] == "@": #print line category = line.split()[0].lower() if category == "@attribute": category, name, type = line.split() assert(not inData) if name.lower() == "class": name = name.lower() classNames = type[1:-1].split(",") assert(len(classNames)==2) classSet.defineId(classNames[0].strip(),1) classSet.defineId(classNames[1].strip(),-1) featureSet.getId(name) elif category.lower() == "@relation": assert(not inData) elif category == "@data": inData = True else: assert(inData) count = 1 features = {} for column in line.split(","): if featureSet.getName(count) != "class": features[count] = float(column) else: classId = classSet.getId(column, False) assert(classId != None) count += 1 exampleCount = str(len(examples)) exampleId = "BreastCancer.d" + exampleCount + ".s0.x0" examples.append([exampleId,classId,features,{}]) return examples
def build(cls, input, output, parse, tokenization=None, includeNeg=False): p = PathGazetteer(includeNeg) sentences = cls.getSentences(input, parse, tokenization) counter = ProgressCounter(len(sentences), "Build path gazetteer") for sentence in sentences: counter.update( 1, "Building path gazetteer (" + sentence[0].getSentenceId() + "): ") p.processSentence(sentence[0]) p.calculateFractions() f = open(output, "wt") for key in sorted(p.gazetteer.keys()): v = p.gazetteer[key] f.write(key + " " + str(v[0]) + " " + str(v[1]) + " " + str(v[2]) + " " + str(v[3]) + "\n") f.close()
def prepareDocuments(self, corpusRoot, files, conllFormat=None, counts=None): print >> sys.stderr, "Generating document elements from the parses" docNames = sorted(files.keys()) corpusName = corpusRoot.get("source", "CORPUS") #parseExtensions = set(["ptb", "conll", "conllx", "conllu"]) counter = ProgressCounter(len(docNames), "Document Generation") for i in range(len(docNames)): docName = docNames[i] counter.update(1, "Making document element for document '" + str(docName) + "': ") #filePaths = files[docName] extensions = sorted(files[docName].keys()) sentObjs = self.readParse(extensions[0], files[docName][extensions[0]], conllFormat) sentTexts = [] for sentObj in sentObjs: if "tokens" in sentObj: sentTexts.append(" ".join([x["text"] for x in sentObj["tokens"]])) docText = " ".join(sentTexts) ET.SubElement(corpusRoot, "document", id=corpusName + ".d" + str(i), origId=docName, text=docText) return [x for x in corpusRoot.findall("document")]
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet): outFile = open(outFile, "wt") addCount = 0 f = open(exampleFile) numExamples = sum([1 for line in f]) f.close() counter = ProgressCounter(numExamples, "Polynomize examples", step=0) weightFeatureIds = {} for weightFeature in weightFeatures: wId = idSet.getId(weightFeature, False) if wId == None: sys.exit("Weight vector feature", weightFeature, "not in id file") weightFeatureIds[weightFeature] = wId print "Polynomizing", exampleFile exampleCache = [] for example in ExampleUtils.readExamples(exampleFile): counter.update(1, "Processing example (" + example[0] + "): ") features = example[2] for i in range(len(weightFeatures) - 1): wI = weightFeatures[i] wIid = weightFeatureIds[wI] if not features.has_key(wIid): continue for j in range(i + 1, len(weightFeatures)): wJ = weightFeatures[j] wJid = weightFeatureIds[wJ] if not features.has_key(wJid): continue # Make polynomial feature features[idSet.getId(wI + "_AND_" + wJ)] = 1 addCount += 1 exampleCache.append(example) if len(exampleCache) > 50: ExampleUtils.appendExamples(exampleCache, outFile) exampleCache = [] ExampleUtils.appendExamples(exampleCache, outFile) outFile.close() print "Added", addCount, "polynomial features"
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet): outFile = open(outFile, "wt") addCount = 0 f = open(exampleFile) numExamples = sum([1 for line in f]) f.close() counter = ProgressCounter(numExamples, "Polynomize examples", step=0) weightFeatureIds = {} for weightFeature in weightFeatures: wId = idSet.getId(weightFeature, False) if wId == None: sys.exit("Weight vector feature", weightFeature, "not in id file") weightFeatureIds[weightFeature] = wId print "Polynomizing", exampleFile exampleCache = [] for example in ExampleUtils.readExamples(exampleFile): counter.update(1, "Processing example ("+example[0]+"): ") features = example[2] for i in range(len(weightFeatures)-1): wI = weightFeatures[i] wIid = weightFeatureIds[wI] if not features.has_key(wIid): continue for j in range(i + 1, len(weightFeatures)): wJ = weightFeatures[j] wJid = weightFeatureIds[wJ] if not features.has_key(wJid): continue # Make polynomial feature features[idSet.getId(wI + "_AND_" + wJ)] = 1 addCount += 1 exampleCache.append(example) if len(exampleCache) > 50: ExampleUtils.appendExamples(exampleCache, outFile) exampleCache = [] ExampleUtils.appendExamples(exampleCache, outFile) outFile.close() print "Added", addCount, "polynomial features"
def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False): examples = [] counter = ProgressCounter(len(sentences), "Build examples") if append: outfile = open(output, "at") else: outfile = open(output, "wt") exampleCount = 0 for i in range(len(sentences)): sentence = sentences[i] goldSentence = [None] if goldSentences != None: goldSentence = goldSentences[i] counter.update( 1, "Building examples (" + sentence[0].getSentenceId() + "): ") examples = self.buildExamples(sentence[0], goldSentence[0], append=append) exampleCount += len(examples) examples = self.preProcessExamples(examples) ExampleUtils.appendExamples(examples, outfile) outfile.close() print >> sys.stderr, "Examples built:", exampleCount print >> sys.stderr, "Features:", len(self.featureSet.getNames()) #IF LOCAL if self.exampleStats.getExampleCount() > 0: self.exampleStats.printStats() #ENDIF # Save Ids if idFileTag != None: print >> sys.stderr, "Saving class names to", idFileTag + ".class_names" self.classSet.write(idFileTag + ".class_names") print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names" self.featureSet.write(idFileTag + ".feature_names")
def splitMergedElements(inputFilename, outputFilename=None): print >> sys.stderr, "##### Split elements with merged types #####" print >> sys.stderr, "Loading corpus", inputFilename corpusTree = ETUtils.ETFromObj(inputFilename) corpusRoot = corpusTree.getroot() documents = corpusRoot.findall("document") counter = ProgressCounter(len(documents), "Documents") countsByType = {"entity":[0,0], "interaction":[0,0], "pair":[0,0]} for document in documents: counter.update() for sentence in document.findall("sentence"): processSentence(sentence, countsByType) print >> sys.stderr, "Results" for k in sorted(countsByType.keys()): print >> sys.stderr, " " + k + ": removed", countsByType[k][0], "created", countsByType[k][1] if outputFilename != None: print >> sys.stderr, "Writing output to", outputFilename ETUtils.write(corpusRoot, outputFilename) return corpusTree
def buildExamples(exampleBuilder, sentences, options): print >> sys.stderr, "Defining predicted value range:", sentenceElements = [] for sentence in sentences: sentenceElements.append(sentence[0].sentenceElement) exampleBuilder.definePredictedValueRange(sentenceElements, "entity") print >> sys.stderr, exampleBuilder.getPredictedValueRange() examples = [] if hasattr(exampleBuilder, "styles") and "graph_kernel" in exampleBuilder.styles: counter = ProgressCounter(len(sentences), "Build examples", 0) else: counter = ProgressCounter(len(sentences), "Build examples") for sentence in sentences: counter.update( 1, "Building examples (" + sentence[0].getSentenceId() + "): ") sentence[1] = exampleBuilder.buildExamples(sentence[0]) examples.extend(sentence[1]) print >> sys.stderr, "Examples built:", len(examples) print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames()) print >> sys.stderr, "Preprocessing examples:" examples = exampleBuilder.preProcessExamples(examples) # Save examples # if options.output != None: # print >> sys.stderr, "Saving examples to", options.output + "/examples.txt" # commentLines = [] # commentLines.append("Input file: " + options.input) # commentLines.append("Example builder: " + options.exampleBuilder) # commentLines.append("Features:") # commentLines.extend(exampleBuilder.featureSet.toStrings()) # Example.writeExamples(examples, options.output + "/examples.txt", commentLines) #examples = filterFeatures(exampleBuilder.featureSet, examples) #Example.normalizeFeatureVectors(examples) return examples
def splitMergedElements(inputFilename, outputFilename=None): print >> sys.stderr, "##### Split elements with merged types #####" print >> sys.stderr, "Loading corpus", inputFilename corpusTree = ETUtils.ETFromObj(inputFilename) corpusRoot = corpusTree.getroot() documents = corpusRoot.findall("document") counter = ProgressCounter(len(documents), "Documents") countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]} for document in documents: counter.update() for sentence in document.findall("sentence"): processSentence(sentence, countsByType) print >> sys.stderr, "Results" for k in sorted(countsByType.keys()): print >> sys.stderr, " " + k + ": removed", countsByType[k][ 0], "created", countsByType[k][1] if outputFilename != None: print >> sys.stderr, "Writing output to", outputFilename ETUtils.write(corpusRoot, outputFilename) return corpusTree
def buildExamples(exampleBuilder, sentences, outfilename): timer = Timer() examples = [] if "graph_kernel" in exampleBuilder.styles: counter = ProgressCounter(len(sentences), "Build examples", 0) else: counter = ProgressCounter(len(sentences), "Build examples") calculatePredictedRange(exampleBuilder, sentences) outfile = open(outfilename, "wt") exampleCount = 0 for sentence in sentences: counter.update(1, "Building examples ("+sentence[0].getSentenceId()+"): ") examples = exampleBuilder.buildExamples(sentence[0]) exampleCount += len(examples) examples = exampleBuilder.preProcessExamples(examples) Example.appendExamples(examples, outfile) outfile.close() print >> sys.stderr, "Examples built:", str(exampleCount) print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames()) print >> sys.stderr, "Elapsed", timer.toString()
def findHeadsSyntactic(corpus, parse, tokenization): """ Determine the head token for a named entity or trigger. The head token is the token closest to the root for the subtree of the dependency parse spanned by the text of the element. @param entityElement: a semantic node (trigger or named entity) @type entityElement: cElementTree.Element @param verbose: Print selected head tokens on screen @param verbose: boolean """ counts = [0, 0] sentences = [x for x in corpus.getiterator("sentence")] counter = ProgressCounter(len(sentences), "SYNTAX") for sentence in sentences: counter.update() tokElement = ETUtils.getElementByAttrib( sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer": tokenization}) parseElement = ETUtils.getElementByAttrib( sentence, "sentenceanalyses/parses/parse", {"parser": parse}) if tokElement == None or parseElement == None: print >> sys.stderr, "Warning, sentence", sentence.get( "id"), "missing parse or tokenization" tokens = tokElement.findall("token") tokenHeadScores = getTokenHeadScores( tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id")) for entity in sentence.findall("entity"): if entity.get("headOffset") == None: headToken = getEntityHeadToken(entity, tokens, tokenHeadScores) # The ElementTree entity-element is modified by setting the headOffset attribute entity.set("headOffset", headToken.get("charOffset")) entity.set("headMethod", "Syntax") entity.set("headString", headToken.get("text")) counts[0] += 1 return counts
def waitForProcess(process, numCorpusSentences, measureByGap, outputFile, counterName, updateMessage, timeout=None):
    """
    Waits for a process to finish, and tracks the number of entities
    it writes to it's outputfile. If writing a sentence takes longer
    than the timeout, the process is considered stalled and is killed.

    process: the subprocess.Popen-like object being monitored
    numCorpusSentences: expected total, used for the progress counter
    measureByGap: count blank lines as sentence separators instead of
                  counting every line
    outputFile: (path, codecs-open keyword dict) for the process's output
    counterName / updateMessage: progress-counter labels
    timeout: seconds without progress before the process is killed
    Returns (numSentences, numCorpusSentences).
    """
    maxStartupTime = 600 # Give extra time for the process to start up (even if it creates immediately an empty output file)
    counter = ProgressCounter(numCorpusSentences, counterName)
    counter.showMilliseconds = True
    prevNumSentences = 0 # Number of output sentences on previous check
    # BUGFIX: numSentences was unbound at the final return if the output
    # file never appeared on disk; initialize it here.
    numSentences = 0
    finalCheckLeft = True # Make one final check to update counters
    processStatus = None # When None, process not finished
    prevTime = time.time()
    startTime = time.time()
    # Wait until process is finished and periodically check it's progress.
    while processStatus == None or finalCheckLeft:
        if processStatus != None: # Extra loop to let counters finish
            finalCheckLeft = False # Done only once
        if os.path.exists(outputFile[0]): # Output file has already appeared on disk
            # Measure number of sentences in output file
            numSentences = 0
            f = codecs.open(outputFile[0], "rt", **outputFile[1])
            for line in f:
                if measureByGap:
                    if line.strip() == "":
                        numSentences += 1
                else:
                    numSentences += 1
            f.close()
            # Update status
            if numSentences - prevNumSentences != 0: # Process has progressed
                counter.update(numSentences - prevNumSentences, updateMessage + ": ")
            if finalCheckLeft: # This is a normal loop, not the final check
                # Startuptime hasn't yet passed or process has made progress
                if time.time() - startTime < maxStartupTime or numSentences - prevNumSentences != 0:
                    #if prevNumSentences == 0 or numSentences - prevNumSentences != 0:
                    prevTime = time.time() # reset timeout
                else: # Nothing happened on this update, check whether process hung
                    elapsedTime = time.time() - prevTime
                    if timeout != None and elapsedTime > timeout:
                        print >> sys.stderr, "Process timed out (" + str(elapsedTime) + " vs. " + str(timeout) + ")"
                        print >> sys.stderr, "Killing process"
                        process.kill()
                prevNumSentences = numSentences
                time.sleep(1)
        else: # Output file doesn't exist yet
            prevTime = time.time() # reset counter if output file hasn't been created
        processStatus = process.poll() # Get process status, None == still running
    counter.markFinished() # If we get this far, don't show the error message even if process didn't finish
    return (numSentences, numCorpusSentences)
def buildExamples(exampleBuilder, sentences, outfilename): timer = Timer() examples = [] if "graph_kernel" in exampleBuilder.styles: counter = ProgressCounter(len(sentences), "Build examples", 0) else: counter = ProgressCounter(len(sentences), "Build examples") calculatePredictedRange(exampleBuilder, sentences) outfile = open(outfilename, "wt") exampleCount = 0 for sentence in sentences: counter.update( 1, "Building examples (" + sentence[0].getSentenceId() + "): ") examples = exampleBuilder.buildExamples(sentence[0]) exampleCount += len(examples) examples = exampleBuilder.preProcessExamples(examples) Example.appendExamples(examples, outfile) outfile.close() print >> sys.stderr, "Examples built:", str(exampleCount) print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames()) print >> sys.stderr, "Elapsed", timer.toString()
def mainFunc(input, output=None, parseName="McCC", tokenizationName=None, newParseName=None, newTokenizationName=None, logFileName=None, removeOld=True):
    """
    Protein Name Splitter: split multi-protein tokens in a corpus and add a
    new tokenization and a rewritten parse that refer to the split tokens.

    input: corpus file or ElementTree object
    output: optional output path
    parseName / tokenizationName: source analyses (tokenization defaults to
                                  the parse name)
    newParseName / newTokenizationName: names for the generated analyses;
        default to the source names when removeOld, otherwise "split-" + name
    logFileName: optional log file passed to splitTokens
    removeOld: remove the source analyses while reading them
    Returns the corpus ElementTree.
    """
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None
    #if input.endswith(".gz"):
    #    inFile = gzip.GzipFile(input)
    #else:
    #    inFile = open(input)
    tree = ETUtils.ETFromObj(input)

    if tokenizationName == None:
        tokenizationName = parseName

    #tree = ElementTree.parse(inFile)
    root = tree.getroot()

    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names (" + sId + "): ")

        tok = getTokenization(tokenizationName, sentence, sId, remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue
        # NOTE(review): this assert is unreachable — the continue above
        # already handles tok == None
        assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (tokenizationName, sId)

        parse = getParse(parseName, tokenizationName, sentence, sId, remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # Default names
        # NOTE(review): these defaults are resolved inside the sentence loop,
        # so the first sentence's analysis names fix the values for the rest.
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))
        #splittok.set("split-")

        # make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                head = t.head
                # traverse
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head

                # should match (nah, punctuation problems)
                # assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
                tokenIdMap[t.origId] = head.id
            else:
                # only allow overwrite of existing entry if the current token
                # is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        depSeqId = 0 #1
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [tok for tok in split if tok.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1

        # phrase elements are carried over unchanged
        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

        # debugging
        #print >> sys.stderr, "NEW DEP IN", sId

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    #indent(root)
    if logFile != None:
        logFile.close()

    # debugging
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml" optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.") optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE") optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE") (options, args) = optparser.parse_args() #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt")) variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples")) invariantFeatureSet = IdSet() invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt")) invariantClassSet = IdSet() invariantClassSet.load(os.path.join(options.invariant, "class_names.txt")) variantFeatureSet = IdSet() variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names")) variantClassSet = IdSet() variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names")) counter = ProgressCounter(len(variantExamples)) for example in variantExamples: counter.update() example[1] = invariantClassSet.getId(variantClassSet.getName(example[1])) newFeatures = {} for k,v in example[2].iteritems(): newFeatures[ invariantFeatureSet.getId(variantFeatureSet.getName(k)) ] = v example[2] = newFeatures ExampleUtils.writeExamples(variantExamples, os.path.join(options.variant, "realignedExamples.txt"))
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None):
    """Write classified examples back into an Interaction XML corpus.

    examples/predictions may be filenames or already-loaded sequences (resolved
    by self.loadExamples). Examples are grouped per sentence (the example id's
    prefix before ".x") and handed to self.writeXMLSentence; sentences with no
    examples are processed afterwards with empty queues (e.g. to clear
    interactions). Returns the annotated corpus ElementTree.
    """
    corpus = self.loadCorpus(corpus, parse, tokenization)
    if goldCorpus != None:
        # BUGFIX: previously this reloaded "corpus", so the "gold" sentences
        # were the predicted sentences themselves; load the actual gold corpus.
        goldCorpus = self.loadCorpus(goldCorpus, parse, tokenization)
    examples, predictions = self.loadExamples(examples, predictions)

    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet != None:
        classIds = classSet.getIds()

    exampleQueue = [] # One sentence's examples
    predictionsByExample = {}
    currentMajorId = None
    prevMajorIds = set()
    processedSentenceIds = set()
    xType = None

    # Count the examples for the progress counter. NOTE(review): "examples"
    # is iterated twice, so it must be a sequence, not a one-shot iterator.
    count = 0
    for example in examples:
        count += 1
    assert count > 0
    progress = ProgressCounter(count, "Write Examples")

    for example, prediction in itertools.izip_longest(examples, predictions):
        assert example != None
        assert prediction != None
        majorId, minorId = example[0].rsplit(".x", 1) # sentence id + example index
        if majorId != currentMajorId: # new sentence
            if currentMajorId != None:
                # Flush the previous sentence's queued examples
                processedSentenceIds.add(currentMajorId)
                sentenceObject = corpus.sentencesById[currentMajorId]
                goldSentence = None
                if goldCorpus != None:
                    goldSentence = goldCorpus.sentencesById[currentMajorId]
                self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
                progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
                exampleQueue = []
                predictionsByExample = {}
                prevMajorIds.add(currentMajorId)
            # Examples must arrive grouped per sentence
            assert majorId not in prevMajorIds, majorId
            currentMajorId = majorId
        exampleQueue.append(example) # queue example
        predictionsByExample[example[0]] = prediction
        assert example[3]["xtype"] == self.xType, str(example[3]["xtype"]) + "/" + str(self.xType)

    # Process what is still in queue
    if currentMajorId != None:
        processedSentenceIds.add(currentMajorId)
        sentenceObject = corpus.sentencesById[currentMajorId]
        goldSentence = None
        if goldCorpus != None:
            goldSentence = goldCorpus.sentencesById[currentMajorId]
        self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
        progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
        exampleQueue = []
        predictionsByExample = {}

    # Process sentences with no examples (e.g. to clear interactions)
    for sentenceId in sorted(corpus.sentencesById.keys()):
        if sentenceId not in processedSentenceIds:
            sentenceObject = corpus.sentencesById[sentenceId]
            goldSentence = None
            if goldCorpus != None:
                # BUGFIX: previously indexed with the stale currentMajorId
                # instead of the sentence actually being processed.
                goldSentence = goldCorpus.sentencesById[sentenceId]
            self.writeXMLSentence([], {}, sentenceObject, classSet, classIds, goldSentence=goldSentence)

    # Print statistics
    if len(self.counts) > 0:
        print >> sys.stderr, self.counts
        self.counts = defaultdict(int)

    # Write corpus
    if outputFile != None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpus.rootElement, outputFile)
    return corpus.tree
def processCorpus(inputCorpus, outputPath, task=1, outputIsA2File=False, verbose=True, strengths=False):
    """Write an Interaction XML corpus out in BioNLP Shared Task format.

    Each document produces a .a1 protein file, a task-dependent .a2.t1 /
    .a2.t12 / .a2.t123 event file, and a .txt text file in outputPath; when
    outputIsA2File is True all event annotations go to the single file
    outputPath instead. "strengths" is currently unused (the score-file
    opens are commented out below).
    """
    if outputIsA2File:
        a2File = open(outputPath, "wt")
        if len(inputCorpus.documents) > 1:
            print >> sys.stderr, "Warning: Input file has more than one document, a2-file events will have overlapping ids"
    if verbose:
        counter = ProgressCounter(len(inputCorpus.documents), "Document")
    # Each document is written to an output file
    for document in inputCorpus.documents:
        docSentence = document.find("sentence")
        if docSentence == None:
            # BUGFIX: this update was unguarded although "counter" only exists
            # when verbose is True (NameError for verbose=False).
            if verbose:
                counter.update(1, "Processing empty document")
            continue
        # Derive the external document id from the first sentence's origId
        documentId = docSentence.get("origId")
        if documentId == None:
            documentId = document.get("origId")
        else:
            documentId = documentId.rsplit(".", 1)[0]
        if verbose:
            counter.update(1, "Processing document " + document.get("id") + " (origId " + documentId + "): ")

        # Write a1 file (named entities / proteins)
        if outputIsA2File:
            outputFile = None
        else:
            outputFile = codecs.open(os.path.join(outputPath, documentId + ".a1"), "wt", "utf-8")
        namedEntityTriggerIds = writeProteins(document, inputCorpus, outputFile)
        if not outputIsA2File:
            outputFile.close()

        # Write the task-specific a2 file
        if task == 1:
            strengthFile = None
            if outputIsA2File:
                outputFile = a2File
            else:
                outputFile = codecs.open(os.path.join(outputPath, documentId + ".a2.t1"), "wt", "utf-8")
            #strengthFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t1.scores"), "wt", "utf-8")
            events, entityMap = getEvents(document, inputCorpus, 1)
            triggerIds = copy.copy(namedEntityTriggerIds)
            writeEventTriggers(document, inputCorpus, outputFile, events, triggerIds, 1, strengths=strengthFile)
            writeEvents(document, inputCorpus, outputFile, events, entityMap, triggerIds, strengths=strengthFile)
        elif task == 2:
            strengthFile = None
            if outputIsA2File:
                outputFile = a2File
            else:
                outputFile = codecs.open(os.path.join(outputPath, documentId + ".a2.t12"), "wt", "utf-8")
            #strengthFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t12.scores"), "wt", "utf-8")
            events, entityMap = getEvents(document, inputCorpus, 2)
            triggerIds = copy.copy(namedEntityTriggerIds)
            writeEventTriggers(document, inputCorpus, outputFile, events, triggerIds, 2, strengths=strengthFile)
            writeEvents(document, inputCorpus, outputFile, events, entityMap, triggerIds, strengths=strengthFile)
        elif task == 3:
            strengthFile = None
            if outputIsA2File:
                outputFile = a2File
            else:
                outputFile = codecs.open(os.path.join(outputPath, documentId + ".a2.t123"), "wt", "utf-8")
            #strengthFile = codecs.open(os.path.join(outputPath,documentId + ".a2.t123.scores"), "wt", "utf-8")
            # NOTE(review): task 3 extracts with task level 2 and only changes
            # the writeEvents flag -- confirm this is intentional.
            events, entityMap = getEvents(document, inputCorpus, 2)
            triggerIds = copy.copy(namedEntityTriggerIds)
            writeEventTriggers(document, inputCorpus, outputFile, events, triggerIds, 2, strengths=strengthFile)
            writeEvents(document, inputCorpus, outputFile, events, entityMap, triggerIds, True, strengths=strengthFile)
        if not outputIsA2File:
            outputFile.close()

        # Write txt file
        outputFile = codecs.open(os.path.join(outputPath, documentId + ".txt"), "wt", "utf-8")
        writeDocumentText(document, outputFile)
        outputFile.close()
    if outputIsA2File:
        a2File.close()
class ExampleBuilder:
    # Optional shared StructureAnalyzer instance (assigned externally).
    structureAnalyzer = None
    """
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g. SVMs.
    An example builder writes three files, an example-file (in extended Joachim's
    SVM format) and .class_names and .feature_names files, which contain the names
    for the class and feature id-numbers. An example builder can also be given
    pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.
    """
    # NOTE(review): the string above follows an assignment, so at runtime it is
    # a no-op expression statement, not the class docstring.

    def __init__(self, classSet=None, featureSet=None):
        # classSet/featureSet may be IdSet instances or filenames to load from.
        if (type(classSet) == types.StringType):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if (type(featureSet) == types.StringType):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""  # prefix prepended to feature names in setFeature
        self.exampleStats = ExampleStats()
        self.parse = None  # name of the parse to use (e.g. "mccc")
        self.tokenization = None  # name of the tokenization (None = default)
        #self.idFileTag = None
        self.classIdFilename = None  # where saveIds() writes class names
        self.featureIdFilename = None  # where saveIds() writes feature names

        self.styles = {}  # style parameters controlling example generation
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def hasStyle(self, style):
        # NOTE(review): this returns True only when the style is present AND
        # its value is falsy -- looks inverted; confirm against callers.
        return style in self.styles and not self.styles[style]

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        """Merge subclass-declared style defaults/limits into this instance."""
        # Initialize
        if self._defaultParameters == None:
            self._defaultParameters = {}
        if self._parameterValueLimits == None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({}, defaults, valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits != None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        """Validate "parameters" against the declared defaults and limits."""
        return Utils.Parameters.get(parameters, defaults=self._defaultParameters, valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        # Registers the (tagged) feature name and stores its value for the
        # example currently being built (self.features set by subclasses).
        self.features[self.featureSet.getId(self.featureTag + name)] = value

    def getElementCounts(self, filename):
        """Count <document> and <sentence> tags in an (optionally gzipped)
        Interaction XML file by scanning it line by line."""
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents": 0, "sentences": 0}
        for line in f:
            if "<document" in line:
                counts["documents"] += 1
            elif "<sentence" in line:
                counts["sentences"] += 1
        f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        """Persist the class and feature name/id mappings, if filenames are set."""
        if self.classIdFilename != None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename != None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """Build examples for a whole corpus and write them to "output".

        input/gold may be filenames or pre-loaded sentence lists; documents are
        streamed through getCorpusIterator and handed to processDocument.
        """
        # Create intermediate paths if needed
        if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
            os.makedirs(os.path.dirname(output))
        # Open output file
        openStyle = "wt"
        if append:
            #print "Appending examples"
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)

        # Build examples
        self.exampleCount = 0
        if type(input) in types.StringTypes:  # Entered here - Mu
            self.elementCounts = self.getElementCounts(input)
            if self.elementCounts["sentences"] > 0:  # Entered here, 1448 - Mu
                self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")

        # pdb.set_trace()
        # This line generates log below:(getSentences function generates the first 2 lines)
        #   Making sentence graphs (GE09.d149.s5): 100.00 % (0:0:1.113)
        #   Skipped 381 duplicate interaction edges in SentenceGraphs
        #   Defining predicted value range: None - Mu
        self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))  # self.parse: mccc; self.tokenization: None

        removeIntersentenceInteractions = True
        if "keep_intersentence" in self.styles and self.styles["keep_intersentence"]:
            print >> sys.stderr, "Keeping intersentence interactions for input corpus"
            removeIntersentenceInteractions = False  # this is True - Mu
        inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)

        # pdb.set_trace()
        #goldIterator = []
        if gold != None:  # Entered here - Mu
            removeGoldIntersentenceInteractions = True
            if "keep_intersentence_gold" in self.styles and self.styles["keep_intersentence_gold"]:
                print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                removeGoldIntersentenceInteractions = False  # this is False - Mu
            goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
            # Walk input and gold in lockstep; izip_longest + asserts verify
            # that both corpora contain the same number of documents.
            for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
                assert inputSentences != None
                assert goldSentences != None
                # pdb.set_trace()
                # see the documentation of function processSentence() in this script
                # inputSentences[1].sentence is the unmerged version
                # inputSentences[1].sentenceGraph is the merged version, meaning that when generating sentenceGraph,
                # duplicated intereactions are removed(actually skipped, not added to the graph, but not really removed) - Mu
                self.processDocument(inputSentences, goldSentences, outfile, structureAnalyzer=structureAnalyzer)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences, None, outfile, structureAnalyzer=structureAnalyzer)
        outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Classes:", len(self.classSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """Build examples for one document's sentences (goldSentences, when
        given, is assumed index-aligned with sentences)."""
        #calculatePredictedRange(self, sentences)
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = None
            if goldSentences != None:
                goldSentence = goldSentences[i]
            self.progress.update(1, "Building examples (" + sentence.sentence.get("id") + "): ")
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)

    def processSentence(self, sentence, outfile, goldSentence=None, structureAnalyzer=None):
        '''
        sentence: Utils.InteractionXML.SentenceElements.SentenceElements instance
        sentence.sentence: Element 'sentence' in the xml file
        '''
        # pdb.set_trace()
        # Process filtering rules
        # does NOT entered here since self.styles["sentenceLimit"] is None - Mu
        if "sentenceLimit" in self.styles and self.styles["sentenceLimit"]:  # Rules for limiting which sentences to process
            # Get the rule list
            limitRules = self.styles["sentenceLimit"]
            if type(limitRules) in types.StringTypes:
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rule are of the form "attr.value" where "attr" is the name
                    # of the attribute to match, and "value" a substring within
                    # that attribute
                    if rule.startswith(sentAttr + "."):  # rule matches the attribute
                        value = rule.split(".", 1)[-1]  # get the value part of the rule
                        if value not in sentenceElement.get(sentAttr):  # rule value must be a substring of the attribute value
                            return  # discard all sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph != None:
            goldGraph = None
            if goldSentence != None:
                goldGraph = goldSentence.sentenceGraph
            # c, sentenceGraph_return, argCombinations_return = self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)
            # self.exampleCount += c
            self.exampleCount += self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)
            # return sentenceGraph_return, argCombinations_return

    @classmethod
    def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False, allowNewIds=True, structureAnalyzer=None, debug=False):
        """Convenience entry point: construct a builder of class "cls" with the
        given id sets and style, then run processCorpus. Returns the builder."""
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, " input:", input
        if gold != None:
            print >> sys.stderr, " gold:", gold
        print >> sys.stderr, " output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, " add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, " style:", style
        if tokenization == None:
            print >> sys.stderr, " parse:", parse
        else:
            print >> sys.stderr, " parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds)  #cls.getIdSets(idFileTag)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        #builder.idFileTag = idFileTag
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds, structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        # Subclasses must implement: write examples for one sentence graph and
        # return the number of examples written.
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        # Optional hook for regression-style builders; no-op by default.
        pass

    def getPredictedValueRange(self):
        # Counterpart of definePredictedValueRange; None means "no range".
        return None

    @classmethod
    def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
        """Load predefined class/feature IdSets from files, when the files
        exist; otherwise return None for that set.

        NOTE(review): decorated @classmethod but the first parameter is named
        "self" -- it actually receives the class object.
        """
        # Class ids
        #print classIds
        #print featureIds
        if classIds != None and os.path.exists(classIds):
            print >> sys.stderr, "Using predefined class names from", classIds
            classSet = IdSet(allowNewIds=allowNewIds)
            classSet.load(classIds)
        else:
            print >> sys.stderr, "No predefined class names"
            classSet = None
        # Feature ids
        if featureIds != None and os.path.exists(featureIds):
            print >> sys.stderr, "Using predefined feature names from", featureIds
            featureSet = IdSet(allowNewIds=allowNewIds)
            featureSet.load(featureIds)
        else:
            print >> sys.stderr, "No predefined feature names"
            featureSet = None
        return classSet, featureSet

    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        """Load the corpus (unless "input" is already a sentence list) and
        return [[sentenceGraph, None], ...] for sentences that have a graph."""
        # pdb.set_trace()
        # input is the path to the corpus xml file
        if type(input) != types.ListType:  # Program entered here - Mu
            # Load corpus and make sentence graphs
            # pdb.set_trace()
            corpusElements = Core.SentenceGraph.loadCorpus(input, parse, tokenization, removeNameInfo=removeNameInfo)
            sentences = []
            for sentence in corpusElements.sentences:
                if sentence.sentenceGraph != None:  # required for event detection
                    sentences.append([sentence.sentenceGraph, None])
            return sentences
        else:  # assume input is already a list of sentences
            assert (removeNameInfo == False)
            return input

    def calculatePredictedRange(self, sentences):
        """Feed every sentence's sentenceElement to definePredictedValueRange
        (a no-op unless a subclass overrides it)."""
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = []
        for sentence in sentences:
            sentenceElements.append(sentence[0].sentenceElement)
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
    """Build examples for a whole corpus and write them to "output".

    input/gold may be filenames or pre-loaded sentence lists; documents are
    streamed through getCorpusIterator and handed to processDocument. When
    "gold" is given, input and gold are walked in lockstep and must contain
    the same number of documents.
    """
    # Create intermediate paths if needed
    if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
        os.makedirs(os.path.dirname(output))
    # Open output file
    openStyle = "wt"
    if append:
        #print "Appending examples"
        openStyle = "at"
    if output.endswith(".gz"):
        outfile = gzip.open(output, openStyle)
    else:
        outfile = open(output, openStyle)

    # Build examples
    self.exampleCount = 0
    if type(input) in types.StringTypes:  # Entered here - Mu
        # Pre-count sentences so the progress counter has a total
        self.elementCounts = self.getElementCounts(input)
        if self.elementCounts["sentences"] > 0:  # Entered here, 1448 - Mu
            self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")
    else:
        self.elementCounts = None
        self.progress = ProgressCounter(None, "Build examples")

    # pdb.set_trace()
    # This line generates log below:(getSentences function generates the first 2 lines)
    #   Making sentence graphs (GE09.d149.s5): 100.00 % (0:0:1.113)
    #   Skipped 381 duplicate interaction edges in SentenceGraphs
    #   Defining predicted value range: None - Mu
    self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))  # self.parse: mccc; self.tokenization: None

    removeIntersentenceInteractions = True
    if "keep_intersentence" in self.styles and self.styles["keep_intersentence"]:
        print >> sys.stderr, "Keeping intersentence interactions for input corpus"
        removeIntersentenceInteractions = False  # this is True - Mu
    inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)

    # pdb.set_trace()
    #goldIterator = []
    if gold != None:  # Entered here - Mu
        removeGoldIntersentenceInteractions = True
        if "keep_intersentence_gold" in self.styles and self.styles["keep_intersentence_gold"]:
            print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
            removeGoldIntersentenceInteractions = False  # this is False - Mu
        goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
        # izip_longest + the asserts below verify both corpora have the same
        # number of documents.
        for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
            assert inputSentences != None
            assert goldSentences != None
            # pdb.set_trace()
            # see the documentation of function processSentence() in this script
            # inputSentences[1].sentence is the unmerged version
            # inputSentences[1].sentenceGraph is the merged version, meaning that when generating sentenceGraph,
            # duplicated intereactions are removed(actually skipped, not added to the graph, but not really removed) - Mu
            self.processDocument(inputSentences, goldSentences, outfile, structureAnalyzer=structureAnalyzer)
    else:
        for inputSentences in inputIterator:
            self.processDocument(inputSentences, None, outfile, structureAnalyzer=structureAnalyzer)
    outfile.close()
    self.progress.endUpdate()

    # Show statistics
    print >> sys.stderr, "Examples built:", self.exampleCount
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    print >> sys.stderr, "Classes:", len(self.classSet.getNames())
    print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()

    # Save Ids
    if allowNewIds:
        self.saveIds()
def combine(inputA, inputB, inputGold, outPath=None, mode="OR", skip=None, logPath="AUTO"): assert options.mode in ("AND", "OR") if skip != None and isinstance(skip, basestring): skip = set(skip.split(",")) if skip != None: print "Skipping interaction types:", skip if logPath == "AUTO": if outPath != None: logPath = os.path.join( outPath.rstrip("/").rstrip("\\") + "-log.txt") else: logPath = None if logPath != None: if not os.path.exists(os.path.dirname(logPath)): os.makedirs(os.path.dirname(logPath)) Stream.openLog(logPath) print "Loading the Interaction XML files" print "Loading A from", inputA a = ETUtils.ETFromObj(inputA) print "Loading B from", inputB b = ETUtils.ETFromObj(inputB) gold = None if inputGold: print "Loading gold from", inputGold gold = ETUtils.ETFromObj(inputGold) if inputGold else None print "Copying a as template" template = copy.deepcopy(a) print "Calculating confidence score ranges" scoreRanges = {} scoreRanges["a"] = getScoreRange(a, skip) scoreRanges["b"] = getScoreRange(b, skip) print scoreRanges print "Combining" counts = defaultdict(int) counts["skipped"] = defaultdict(int) counter = ProgressCounter(len([x for x in a.findall("document")]), "Combine") for docA, docB, docGold, docTemplate in itertools.izip_longest( *[x.findall("document") for x in (a, b, gold, template)]): counter.update() assert len( set([x.get("id") for x in (docA, docB, docGold, docTemplate)])) == 1 for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[ x.findall("sentence") for x in (docA, docB, docGold, docTemplate) ]): assert len( set([ x.get("id") for x in (sentA, sentB, sentGold, sentTemplate) ])) == 1 interactions = getInteractions(sentA, sentB, sentGold, skip, counts["skipped"]) for interaction in sentTemplate.findall("interaction"): sentTemplate.remove(interaction) analyses = sentTemplate.find("analyses") if analyses: sentTemplate.remove(analyses) for key in interactions: interaction = getCombinedInteraction(interactions[key], mode, counts, 
scoreRanges) if interaction != None: sentTemplate.append(copy.deepcopy(interaction)) if analyses: sentTemplate.append(analyses) counts["skipped"] = dict(counts["skipped"]) print "Counts:", dict(counts) if gold != None: print "****** Evaluating A ******" evaluateChemProt( a, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC") print "****** Evaluating B ******" evaluateChemProt( b, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC") print "****** Evaluating Combined ******" evaluateChemProt( template, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC") if outPath != None: print "Writing output to", outPath if outPath.endswith(".tsv"): Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath) else: ETUtils.write(template, outPath) if logPath != None: Stream.closeLog(logPath)
def makeSentences(input, tokenizationPath, output=None, removeText=False):
    """
    Divide text in the "text" attributes of document and section elements into
    sentence elements, aligned against pre-tokenized files found under
    tokenizationPath (a directory, or a path inside a .tar.gz archive). The
    sentence elements are inserted into their respective parent elements.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting tokenizations from", tokenizationPath
    if tokenizationPath.find(".tar.gz") != -1:
        # Path points inside a tar archive: split into archive path + member prefix
        tarFilePath, tokenizationPath = tokenizationPath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if tokenizationPath[0] == "/":
            tokenizationPath = tokenizationPath[1:]
    else:
        tarFile = None

    docCount = 0
    docsWithSentences = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "Sentence Splitting")
    for document in sourceElements:
        docCount += 1
        # NOTE(review): assumes every document has a "pmid" attribute; a
        # missing pmid would make this concatenation fail.
        counter.update(1, "Splitting Documents ("+document.get("id")+"/" + document.get("pmid") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        if document.find("sentence") == None: # no existing sentence split
            text = document.get("text")
            if text == None or text.strip() == "":
                continue

            newFile = os.path.join(tokenizationPath, document.get("pmid") + ".tok")
            f = openFile(newFile, tarFile)
            if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
                oldFile = os.path.join(tokenizationPath, document.get("pmid") + ".tokenized")
                # BUGFIX: previously openFile(newFile, oldFile) -- it retried
                # the '11 filename and passed the '09 path as the tar handle.
                f = openFile(oldFile, tarFile)
                if f == None: # no tokenization found
                    continue
            sentencesCreated += alignSentences(document, f.readlines())
            f.close()

            # Remove original text
            if removeText:
                # BUGFIX: was "del document['text']" -- ElementTree elements do
                # not support string-keyed deletion; remove the attribute.
                del document.attrib["text"]
            # Move elements from document element to sentences
            moveElements(document)
            docsWithSentences += 1
        else:
            docsWithSentences += 1

    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, docsWithSentences, "/", docCount, "documents have sentences"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
    """Build examples for a whole corpus and write them to "output".

    input/gold may be filenames or pre-loaded sentence lists; documents are
    streamed through getCorpusIterator and handed to processDocument. When
    "gold" is given, input and gold are walked in lockstep and must contain
    the same number of documents.
    """
    # Create intermediate paths if needed
    if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
        os.makedirs(os.path.dirname(output))
    # Open output file
    openStyle = "wt"
    if append:
        #print "Appending examples"
        openStyle = "at"
    if output.endswith(".gz"):
        outfile = gzip.open(output, openStyle)
    else:
        outfile = open(output, openStyle)

    # Build examples
    self.exampleCount = 0
    if type(input) in types.StringTypes:
        # Pre-count sentences so the progress counter has a total
        self.elementCounts = self.getElementCounts(input)
        if self.elementCounts["sentences"] > 0:
            self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")
    else:
        self.elementCounts = None
        self.progress = ProgressCounter(None, "Build examples")

    # Optional hook for regression-style builders (no-op by default)
    self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))

    removeIntersentenceInteractions = True
    if "keep_intersentence" in self.styles and self.styles["keep_intersentence"]:
        print >> sys.stderr, "Keeping intersentence interactions for input corpus"
        removeIntersentenceInteractions = False
    inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)

    #goldIterator = []
    if gold != None:
        removeGoldIntersentenceInteractions = True
        if "keep_intersentence_gold" in self.styles and self.styles["keep_intersentence_gold"]:
            print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
            removeGoldIntersentenceInteractions = False
        goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
        # izip_longest + the asserts below verify both corpora have the same
        # number of documents.
        for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
            assert inputSentences != None
            assert goldSentences != None
            self.processDocument(inputSentences, goldSentences, outfile, structureAnalyzer=structureAnalyzer)
    else:
        for inputSentences in inputIterator:
            self.processDocument(inputSentences, None, outfile, structureAnalyzer=structureAnalyzer)
    outfile.close()
    self.progress.endUpdate()

    # Show statistics
    print >> sys.stderr, "Examples built:", self.exampleCount
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()

    # Save Ids
    if allowNewIds:
        self.saveIds()
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False):
    """
    Run GENIA Sentence Splitter

    Divide text in the "text" attributes of document and section elements
    into sentence elements. These sentence elements are inserted into their
    respective parent elements. Each split sentence is character-aligned back
    to the original text; sentences spanning a newline in the original are
    re-divided at the newline (marked with attribute redevided="True").
    """
    global sentenceSplitterDir

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    emptySentenceCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Create working directory
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents (" + document.get("id") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount)
        assert document.find("sentence") == None  # must not already be split
        text = document.get("text")
        if text == None or text.strip() == "":
            continue

        # Write text to workfile. The input must be unicode so the utf-8
        # codecs writer does not attempt an implicit ASCII decode.
        workfile = codecs.open(os.path.join(workdir, "sentence-splitter-input.txt" + docTag), "wt", "utf-8")
        workfile.write(text)
        workfile.close()

        # Run sentence splitter
        assert os.path.exists(Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh",
                os.path.join(workdir, "sentence-splitter-input.txt" + docTag),
                os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
                Settings.RUBY_PATH]
        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        # Only echo stderr when it is not the splitter's normal chatter
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr

        if postProcess:
            # Pipe the splitter output through the geniass post-processor
            postProcessorPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "geniass-postproc.pl")
            assert os.path.exists(postProcessorPath), postProcessorPath
            ppIn = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt" + docTag), "rt", "utf-8")
            ppOut = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt" + docTag), "wt", "utf-8")
            perlReturnValue = subprocess.call(["perl", postProcessorPath], stdin=ppIn, stdout=ppOut)
            assert perlReturnValue == 0, perlReturnValue
            ppIn.close()
            ppOut.close()
            # Read split sentences
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt" + docTag), "rt", "utf-8")
        else:
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt" + docTag), "rt", "utf-8")

        # Align each split sentence back to the original text, character by
        # character, skipping whitespace on both sides.
        start = 0 # sentences are consecutively aligned to the text for charOffsets
        sentenceCount = 0
        docIndex = 0                # cursor in the original document text
        sentenceBeginIndex = -1     # start offset of the sentence being aligned (-1 = none open)
        prevSentence = None
        prevEndIndex = None
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip()  # The text of the sentence
            if sText == "":
                emptySentenceCount += 1
                continue
            for i in range(len(sText)):
                if sText[i].isspace():
                    assert sText[i] not in ["\n", "\r"]
                    continue
                # Advance past whitespace in the original text; a newline
                # inside an open sentence forces a re-division at that point.
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"] and sentenceBeginIndex != -1:
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex, docIndex, prevSentence, prevEndIndex)
                        prevSentence.set("id", docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True")
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex - 1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                assert sText[i] == text[docIndex], (text, sText, prevText, sText[i:i + 10], text[docIndex:docIndex + 10], (i, docIndex), sentenceBeginIndex)
                # tokens[i].isspace() == False
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
            prevText = sText
            # Flush the sentence that was just aligned
            if sentenceBeginIndex != -1:
                prevSentence = makeSentence(text, sentenceBeginIndex, docIndex, prevSentence, prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex - 1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        # BUGFIX: the splitter-output handle was never closed (leaked one
        # file descriptor per document).
        workfile.close()

        # Add possible tail for last sentence
        if prevEndIndex < len(text) - 1 and prevSentence != None:
            assert prevSentence.get("tail") == None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex + 1:])

        # Remove original text
        if removeText:
            # BUGFIX: was "del document['text']" -- ElementTree elements do not
            # support string-keyed deletion; remove the attribute instead.
            del document.attrib["text"]
        # Move elements from document element to sentences
        moveElements(document)
        docCount += 1

    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"
    if emptySentenceCount > 0:
        print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences"
    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        # Remove work directory
        shutil.rmtree(workdir)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
"--tokenization", default=None, dest="tokenization", help="Tokenization element name") optparser.add_option("-p", "--parse", default=None, dest="parse", help="Parse element name") (options, args) = optparser.parse_args() print >> sys.stderr, "Loading input file", options.input corpusElements = SentenceGraph.loadCorpus(options.input, options.parse, options.tokenization) counter = ProgressCounter(len(corpusElements.sentences), "Resolving chains") tags = ["e1", "e2"] for sentence in corpusElements.sentences: counter.update( 1, "Resolving chains for (" + sentence.sentence.attrib["id"] + "): ") identityChainDict = {} tokenHeadScores = sentence.sentenceGraph.getTokenHeadScores() for interaction in sentence.interactions: if interaction.attrib["type"] == "identity": e1 = sentence.entitiesById[interaction.attrib["e1"]] e2 = sentence.entitiesById[interaction.attrib["e2"]] t1 = sentence.sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentence.sentenceGraph.entityHeadTokenByEntity[e2] if tokenHeadScores[t2] > tokenHeadScores[t1]: identityChainDict[
def makeSentences(input, tokenizationPath, output=None, removeText=False, escDict=None, ignoreErrors=False):
    """
    Divide text in the "text" attributes of document and section
    elements into sentence elements, using pre-existing per-document
    tokenization files. The created sentence elements are inserted
    into their respective parent elements.

    @param input: interaction-XML corpus (file path or ElementTree object)
    @param tokenizationPath: directory — or a path of the form archive.tar.gz/dir —
        containing per-document tokenization files (BioNLP'11 ".tok" or
        BioNLP'09 ".tokenized")
    @param output: optional path the processed corpus is written to
    @param removeText: if True, remove the "text" attribute from each processed document
    @param escDict: optional escape mapping, passed through to alignSentences
    @param ignoreErrors: passed through to alignSentences
    @return: the processed corpus as an ElementTree-style object
    """
    if escDict == None: # BUGFIX: was a mutable default argument (escDict={})
        escDict = {}
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    print >> sys.stderr, "Inserting tokenizations from", tokenizationPath
    assert os.path.exists(tokenizationPath)
    if tokenizationPath.find(".tar.gz") != -1:
        # Tokenizations live inside a tar.gz archive: split into the archive
        # file path and the directory path inside the archive
        tarFilePath, tokenizationPath = tokenizationPath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if tokenizationPath[0] == "/":
            tokenizationPath = tokenizationPath[1:]
    else:
        tarFile = None
    docCount = 0
    docsWithSentences = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "Sentence Splitting")
    for document in sourceElements:
        docCount += 1
        origId = document.get("pmid")
        if origId == None:
            origId = document.get("origId")
        origId = str(origId)
        counter.update(1, "Splitting Documents (" + document.get("id") + "/" + origId + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        if document.find("sentence") == None: # no existing sentence split
            text = document.get("text")
            if text == None or text.strip() == "":
                continue
            newFile = os.path.join(tokenizationPath, origId + ".tok")
            f = openFile(newFile, tarFile)
            if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
                oldFile = os.path.join(tokenizationPath, origId + ".tokenized")
                f = openFile(oldFile, tarFile)
                if f == None: # no tokenization found
                    continue
            sentencesCreated += alignSentences(document, f.readlines(), escDict, ignoreErrors=ignoreErrors)
            f.close()
            # Remove original text
            if removeText:
                # BUGFIX: was 'del document["text"]' — ElementTree Elements support
                # only integer/slice deletion; attributes must go through .attrib
                del document.attrib["text"]
            # Move elements from document element to sentences
            moveElements(document)
            docsWithSentences += 1
        else:
            docsWithSentences += 1
    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, docsWithSentences, "/", docCount, "documents have sentences"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    """
    Determine the head-token offsets of entities in a corpus.

    SentenceGraph construction calculates head offsets and adds them to
    entities that are missing them; this function drives that process,
    optionally removing pre-existing "headOffset" attributes first.

    @param input: interaction-XML corpus (file path or ElementTree object)
    @param parse: parse element name used for head detection
    @param tokenization: tokenization element name (optional)
    @param output: optional output path (in iterate mode it is handed to the
        corpus iterator; otherwise written at the end)
    @param removeExisting: if True, delete existing "headOffset" attributes first
    @param iterate: if True, stream the corpus batch-by-batch instead of
        loading it fully into memory (returns None in that mode)
    @return: the corpus object in non-iterate mode, None in iterate mode
    """
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                # Building the graph (re)computes head offsets for the entities
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads (" + sentences[-1].sentence.get("id") + "): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            # FIX: the original re-parsed the input here with a second
            # ETUtils.ETFromObj(input) call; removed as redundant — 'xml'
            # above already holds the same corpus.
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False):
    """
    Run the GENIA Sentence Splitter on an interaction-XML corpus
    (legacy variant of this function; see the sibling implementation).

    Divide text in the "text" attributes of document and section
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.

    @param input: interaction-XML corpus (file path or ElementTree object)
    @param output: optional path the processed corpus is written to
    @param removeText: if True, remove the "text" attribute from each processed document
    @param postProcess: if True, run the geniass-postproc.pl post-processor on the splitter output
    @param debug: if True, preserve the temporary work directory for inspection
    @return: the processed corpus as an ElementTree-style object
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Create working directory
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents (" + document.get("id") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount)
        assert document.find("sentence") == None # documents must not be split already
        text = document.get("text")
        if text == None or text.strip() == "":
            continue
        # Write text to workfile
        workfile = codecs.open(os.path.join(workdir, "sentence-splitter-input.txt" + docTag), "wt", "utf-8")
        # From http://themoritzfamily.com/python-encodings-and-unicode.html
        # "You have to be careful with the codecs module. Whatever you pass to it must be a Unicode
        # object otherwise it will try to automatically decode the byte stream as ASCII"
        # However, the unicode errors here were simply due to STTools reading unicode ST-format as ASCII,
        # thus creating an ASCII interaction XML, which then triggered here the unicode error.
        # So, at this point we should be able to safely write(text), as the output file is unicode,
        # and reading with the correct codec is taken care of earlier in the pipeline.
        workfile.write(text)
        workfile.close()
        # Run sentence splitter
        assert os.path.exists(Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh",
                os.path.join(workdir, "sentence-splitter-input.txt" + docTag),
                os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
                Settings.RUBY_PATH]
        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        # The splitter always emits this fixed stderr message; echo only unexpected output
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr
        if postProcess:
            ppIn = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt" + docTag), "rt", "utf-8")
            ppOut = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt" + docTag), "wt", "utf-8")
            # BUGFIX: the .pl script was executed directly, which requires it to be
            # marked executable with a valid shebang; invoke it through perl instead
            # (consistent with the newer variant of this function) and check the result
            perlReturnValue = subprocess.call(["perl", os.path.join(Settings.GENIA_SENTENCE_SPLITTER_DIR, "geniass-postproc.pl")], stdin=ppIn, stdout=ppOut)
            assert perlReturnValue == 0, perlReturnValue
            ppIn.close()
            ppOut.close()
            # Read split sentences
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt" + docTag), "rt", "utf-8")
        else:
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt" + docTag), "rt", "utf-8")
        # Sentences are consecutively aligned to the document text for charOffsets
        sentenceCount = 0
        docIndex = 0 # current position in the original document text
        sentenceBeginIndex = -1 # document-text offset where the open sentence started; -1 = no sentence open
        prevSentence = None
        prevEndIndex = None
        emptySentenceCount = 0
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip() # The text of the sentence
            if sText == "":
                emptySentenceCount += 1
                continue
            for i in range(len(sText)):
                if sText[i].isspace():
                    assert sText[i] not in ["\n", "\r"]
                    continue
                # Skip whitespace in the document text. A line break inside an open
                # sentence means the splitter joined text across a newline, so the
                # sentence is redivided at that point.
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"] and sentenceBeginIndex != -1:
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex, docIndex - 1, prevSentence, prevEndIndex)
                        prevSentence.set("id", docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True")
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex - 1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                # Non-whitespace characters must match between splitter output and document text
                assert sText[i] == text[docIndex], (text, sText, prevText, sText[i:i + 10], text[docIndex:docIndex + 10], (i, docIndex), sentenceBeginIndex)
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
            prevText = sText
            if sentenceBeginIndex != -1: # close the sentence for this splitter-output line
                prevSentence = makeSentence(text, sentenceBeginIndex, docIndex - 1, prevSentence, prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex - 1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        workfile.close() # was left open in the original (resource leak)
        # Add possible tail for last sentence
        if prevEndIndex < len(text) - 1 and prevSentence != None:
            assert prevSentence.get("tail") == None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex + 1:])
        if emptySentenceCount > 0:
            print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences in", document.get("id")
        # Remove original text
        if removeText:
            # BUGFIX: was 'del document["text"]' — ElementTree Elements support only
            # integer/slice deletion (of child elements); attributes live in .attrib
            del document.attrib["text"]
        # Move elements from document element to sentences
        moveElements(document)
        docCount += 1
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"
    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        # Remove work directory
        shutil.rmtree(workdir)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
    """
    Insert parses from a directory of per-document parse files into a corpus.

    Documents are matched to parse files by the attribute values named in
    docMatchKeys (default: origId, pmid, id). For each matched document, every
    recognized file extension found for it is inserted via self.insertParse.

    @param parseDir: directory containing the parse files (must exist and be a directory)
    @param input: the corpus to insert parses into (handed to self.getCorpus)
    @param output: optional path the processed corpus is written to
    @param parseName: name of the parse element to create (default "McCC")
    @param extensions: file extensions to process; None = self.allExt, or a
        comma-separated string / list (filtered against self.allExt)
    @param docMatchKeys: document attributes used for matching; None for the
        default list, or a comma-separated string / list
    @param conllFormat, splitting, unescapeFormats, tokenMerging, sdFailedFormat,
        posTags, extMap, origIdType, subDirs, skipParsed, debug: passed through to
        the helper methods (semantics defined there — not visible in this chunk)
    @raise Exception: if parseDir is missing or not a directory, or if a document
        matches parse files under more than one match-key value
    @return: the corpus tree returned by self.getCorpus
    """
    corpusTree, corpusRoot = self.getCorpus(input)
    # Validate the parse input location before doing any work
    if not os.path.exists(parseDir):
        raise Exception("Cannot find parse input '" + str(parseDir) + "'")
    if not os.path.isdir(parseDir):
        raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
    # Normalize 'extensions' to a list restricted to the known extensions
    if extensions == None:
        extensions = self.allExt
    elif isinstance(extensions, basestring):
        extensions = extensions.split(",")
    extensions = [x for x in extensions if x in self.allExt]
    unescapeFormats = self.getUnescapeFormats(unescapeFormats)
    # Normalize 'docMatchKeys' to a list of attribute names
    if docMatchKeys == None:
        docMatchKeys = ["origId", "pmid", "id"]
    elif isinstance(docMatchKeys, basestring):
        docMatchKeys = docMatchKeys.split(",")
    print >> sys.stderr, "Inserting parses from file types:", extensions
    counts = defaultdict(int)
    files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
    # Per-extension statistics, filled in by self.insertParse
    typeCounts = {x: defaultdict(int) for x in extensions}
    # Make document elements if needed
    documents = [x for x in corpusRoot.findall("document")]
    if len(documents) == 0:
        typeCounts["document-generation"] = defaultdict(int)
        documents = self.prepareDocuments(corpusRoot, files)
    counter = ProgressCounter(len(files), "Parse Insertion")
    # Insert parses and make sentence elements if needed
    typeCounts["sentence-splitting"] = defaultdict(int)
    print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
    for document in documents:
        counts["document"] += 1
        matchFound = False
        # Try each match-key attribute value that the document actually has
        for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
            if docMatchValue in files:
                # A document may match parse files under only one key value
                if matchFound:
                    raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                matchFound = True
                counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                counts["document-match"] += 1
                for ext in extensions:
                    if ext not in files[docMatchValue]:
                        continue
                    counts[ext + "-match"] += 1
                    sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                    self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
        if not matchFound:
            counts["document-no-match"] += 1
    # Report statistics
    if len(typeCounts["sentence-splitting"]) > 0:
        print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
    print >> sys.stderr, "Counts", dict(counts)
    for ext in extensions:
        if len(typeCounts[ext]) > 0:
            print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree