def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        # Streaming mode: process the corpus one document at a time and accumulate statistics
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType:
                origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType:
                origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        # In-memory mode: load the whole corpus, merge duplicates and optionally write it out
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
def mergeAll(input, output=None, debug=False):
    corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
    mergeDuplicateEntities(corpusElements, debug)
    mergeDuplicateInteractions(corpusElements, debug)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusElements.rootElement, output)
    return corpusElements
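# Usage sketch (an illustration, not part of the original module): this assumes the
# TEES-style modules used above (sys, collections.defaultdict, CorpusElements,
# SentenceElements, ETUtils) are importable, and that "corpus.xml" is a hypothetical
# interaction-XML input file. mergeAll deduplicates entities and interactions and
# optionally writes the cleaned corpus back out.
if __name__ == "__main__":
    # In-memory mode: returns the loaded CorpusElements after merging.
    mergedCorpus = mergeAll("corpus.xml", output="corpus-merged.xml", debug=False)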
print >> sys.stderr, "Psyco not installed" optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.") #optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE") optparser.add_option("-s", "--source", default=None, dest="source", help="Corpus in analysis format", metavar="FILE") optparser.add_option("-r", "--target", default=None, dest="target", help="Corpus in analysis format", metavar="FILE") #optparser.add_option("-o", "--output", default=None, dest="output", help="Corpus in analysis format", metavar="FILE") optparser.add_option("-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization element name") optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse element name") (options, args) = optparser.parse_args() assert(options.source != None) assert(options.target != None) #assert(options.output != None) print >> sys.stderr, "Loading source:", sourceElements = CorpusElements.loadCorpus(options.source, options.parse, options.tokenization) print >> sys.stderr, "Loading target:", targetElements = CorpusElements.loadCorpus(options.target, options.parse, options.tokenization) parseCopied = None tokenizationCopied = None print >> sys.stderr, "Mapping sentences" origIdToSentences = {} for sourceSentence in sourceElements.sentences: origIdToSentences[sourceSentence.sentence.get("origId")] = [sourceSentence, None] for targetSentence in targetElements.sentences: assert origIdToSentences.has_key(targetSentence.sentence.get("origId")), targetSentence.sentence.get("origId") origIdToSentences[targetSentence.sentence.get("origId")][1] = targetSentence print >> sys.stderr, "Comparing sentences" count = 0 for key in sorted(origIdToSentences.keys()): sourceSentence = origIdToSentences[key][0]
def copyParse(input, source, output, parse, tokenization):
    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    print >> sys.stderr, "Loading source:",
    sourceElements = CorpusElements.loadCorpus(source, parse, tokenization)
    # Index source sentences by their text so input sentences can be matched to them
    sourceSentencesByText = {}
    for sentence in sourceElements.sentences:
        sentenceText = sentence.sentence.get("text")
        #assert not sourceSentencesByText.has_key(sentenceText)
        if sourceSentencesByText.has_key(sentenceText):
            print >> sys.stderr, "Duplicate text", sentence.sentence.get("id"), sourceSentencesByText[sentenceText].sentence.get("id")
        sourceSentencesByText[sentenceText] = sentence
    parsesCopied = [0, 0] # [copied, seen]
    tokenizationsCopied = [0, 0] # [copied, seen]
    for sentence in inputRoot.getiterator("sentence"):
        parsesCopied[1] += 1
        tokenizationsCopied[1] += 1
        #sourceSentence = sourceElements.sentencesByOrigId[sentence.attrib["origId"]]
        if not sourceSentencesByText.has_key(sentence.get("text")):
            print >> sys.stderr, "Warning, no text found for sentence", sentence.get("id")
            continue
        sourceSentence = sourceSentencesByText[sentence.get("text")]
        # Create analyses element (if needed)
        targetAnalysesElement = sentence.find("sentenceanalyses")
        if targetAnalysesElement == None:
            targetAnalysesElement = ET.Element("sentenceanalyses")
            sentence.append(targetAnalysesElement)
        # Create parses element (if needed)
        targetParsesElement = targetAnalysesElement.find("parses")
        if targetParsesElement == None:
            targetParsesElement = ET.Element("parses")
            targetAnalysesElement.append(targetParsesElement)
        # Check whether the parse already exists in the target
        targetParseElements = targetParsesElement.findall("parse")
        newParse = None
        for parseElement in targetParseElements:
            if parseElement.get("parser") == parse:
                newParse = parseElement
                break
        # Copy the parse from the source if the target doesn't have it
        if newParse == None and sourceSentence.parseElement != None:
            targetParsesElement.append(sourceSentence.parseElement)
            newParse = sourceSentence.parseElement # remember it so its tokenizer can be matched below
            parsesCopied[0] += 1
        # Create tokenizations element (if needed)
        targetTokenizationsElement = targetAnalysesElement.find("tokenizations")
        if targetTokenizationsElement == None:
            targetTokenizationsElement = ET.Element("tokenizations")
            targetAnalysesElement.append(targetTokenizationsElement)
        # Check whether the tokenization already exists in the target
        targetTokenizationElements = targetTokenizationsElement.findall("tokenization")
        newTokenization = None
        for tokenizationElement in targetTokenizationElements:
            # Guard against newParse being None, which would otherwise crash here
            if newParse != None and tokenizationElement.attrib["tokenizer"] == newParse.attrib["tokenizer"]:
                newTokenization = tokenizationElement
                break
        # Copy the tokenization from the source if the target doesn't have it
        if newTokenization == None and sourceSentence.tokenizationElement != None:
            targetTokenizationsElement.append(sourceSentence.tokenizationElement)
            tokenizationsCopied[0] += 1
    print >> sys.stderr, "Copied parse elements", parsesCopied
    print >> sys.stderr, "Copied tokenization elements", tokenizationsCopied
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(inputTree, output)
    return inputTree
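# Usage sketch (an illustration, not part of the original module): copyParse matches
# input sentences to source sentences by their text and copies parse and tokenization
# elements into the input corpus. The file names below are hypothetical; the
# parse/tokenization names follow the "split-McClosky" defaults used by the option
# parser above.
if __name__ == "__main__":
    copyParse("target-corpus.xml", "parsed-corpus.xml", "target-with-parses.xml",
              "split-McClosky", "split-McClosky")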
"--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization element name") optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse element name") (options, args) = optparser.parse_args() assert (options.source != None) assert (options.target != None) #assert(options.output != None) print >> sys.stderr, "Loading source:", sourceElements = CorpusElements.loadCorpus(options.source, options.parse, options.tokenization) print >> sys.stderr, "Loading target:", targetElements = CorpusElements.loadCorpus(options.target, options.parse, options.tokenization) parseCopied = None tokenizationCopied = None print >> sys.stderr, "Mapping sentences" origIdToSentences = {} for sourceSentence in sourceElements.sentences: origIdToSentences[sourceSentence.sentence.get("origId")] = [ sourceSentence, None ] for targetSentence in targetElements.sentences: assert origIdToSentences.has_key(targetSentence.sentence.get( "origId")), targetSentence.sentence.get("origId") origIdToSentences[targetSentence.sentence.get(