def __init__(
    self,
    paragraph,
    noCitesParagraph,
    noNumbersParagraph,
    markedParagraph,
    sentence,
    noCitesSentence,
    noNumbersSentence,
    markedSentence,
    citesPerParagraph,
    citesPerSentence,
):
    """Build the sentence/paragraph contexts and derive the "pre context".

    The five sentence-level and five paragraph-level arguments are passed
    unchanged to ``Context`` (text, no-cites text, no-numbers text, marked
    text, cites count -- in that order).  Only ``markedSentence`` is
    inspected here: it is searched for ``parser.MARKED_CITATION_MARKER`` to
    find the text that leads up to the citation.

    Attributes set:
        sentenceContext, paragraphContext: the two ``Context`` objects.
        preContext: stripped text associated with the marker, or "" when
            neither pattern below matches.
        preContextUnigrams, preContextBigrams: results of
            ``util.getNonStopNgrams(preContext, 1)`` and ``(..., 2)``.
    """
    self.sentenceContext = Context(
        sentence, noCitesSentence, noNumbersSentence, markedSentence, citesPerSentence
    )
    self.paragraphContext = Context(
        paragraph, noCitesParagraph, noNumbersParagraph, markedParagraph, citesPerParagraph
    )

    # The marker is arbitrary text, so escape it once before embedding it
    # in either pattern.
    marker = re.escape(parser.MARKED_CITATION_MARKER)

    # Raw string literals keep the backslashes for the regex engine without
    # relying on Python passing unknown escape sequences through.
    #
    # First choice: the marker sits inside a parenthesised span; take the
    # whole span (minus the marker itself).
    # NOTE(review): the class after the marker excludes "(" rather than ")",
    # so the match can run past the first closing parenthesis up to the last
    # ")" before the next "(".  Kept as-is; confirm this is intended.
    parenthesised = re.search(r"\(([^\)]*{}[^\(]*)\)".format(marker), markedSentence)

    self.preContext = ""
    if parenthesised:
        self.preContext = (
            parenthesised.group(1).replace(parser.MARKED_CITATION_MARKER, " ").strip()
        )
    else:
        # Fallback: the run of characters directly before the marker that
        # contains none of  [ ] . , ; : ! " ? -
        leading = re.search(r'([^\[\]\.,;:!"\?\-]+){}'.format(marker), markedSentence)
        if leading:
            self.preContext = leading.group(1).strip()

    self.preContextUnigrams = util.getNonStopNgrams(self.preContext, 1)
    self.preContextBigrams = util.getNonStopNgrams(self.preContext, 2)
def __init__(self, parseStructure, root=True):
    """Populate a paper from its parsed structure.

    parseStructure: mapping produced by the parser.  For the root paper the
        paper's own record is under ``parseStructure["root"]`` and its cited
        papers under ``parseStructure["references"]``; for a reference the
        mapping is the record itself.
    root: True for the paper being analysed, False for a paper it cites.

    Every paper gets the metadata fields, ``fullText`` and ``pdfPath``.
    Only the root paper additionally gets the citation-related text, its
    pre-processed ``citations`` and the ``references`` dict.
    """
    self.root = root

    # Pick the record once instead of branching on `root` for every field.
    record = parseStructure["root"] if root else parseStructure
    meta = record["meta"]

    self.title = meta["title"]
    self.authors = meta["authors"]
    self.terms = meta["terms"]
    self.categories = meta["categories"]
    self.abstract = meta["abstract"]

    self.fullText = record["fullText"]
    self.pdfPath = record["pdfPath"]

    # Everything below exists only on the root paper.
    if not root:
        return

    self.noCitationsText = record["noCitationsText"]
    self.noNumbersText = record["noNumbersText"]
    self.citationKey = record["citationKey"]
    self.citations = record["citations"]

    # Attach derived features to each citation, for both context sizes.
    for cite in self.citations:
        sentenceText = cite.sentenceContext.noCitations
        paragraphText = cite.paragraphContext.noCitations

        # Capitalised words with stop words removed.
        cite.sentenceProperNouns = util.removeStopwords(util.getCapitalWords(sentenceText))
        cite.paragraphProperNouns = util.removeStopwords(util.getCapitalWords(paragraphText))

        # Bigrams over the citation-free text.
        cite.sentenceBigrams = util.getNonStopNgrams(sentenceText, 2)
        cite.paragraphBigrams = util.getNonStopNgrams(paragraphText, 2)

        # Fold the unigrams that util.importantUnigrams picks out of the
        # bigrams into the proper-noun collections.
        cite.sentenceProperNouns.update(util.importantUnigrams(cite.sentenceBigrams))
        cite.paragraphProperNouns.update(util.importantUnigrams(cite.paragraphBigrams))

    # A dict rather than a list: reference keys start at 1 and some may be
    # missing altogether.
    self.references = {}
    for refKey, refStructure in parseStructure["references"].items():
        self.references[refKey] = Paper(refStructure, False)
def __init__(self, paper):
    """Compute and cache abstract-derived words and bigrams per reference.

    Results are stored on the class, in ``BaseAbstractMethod.abstractWords``
    and ``BaseAbstractMethod.abstractBigrams``, keyed by ``paper.title``.
    If an entry for this title already exists the constructor returns
    immediately without recomputing.

    paper: object exposing ``title`` and ``references``; ``references`` maps
        a reference key to an object with an ``abstract`` attribute.
    """
    super(BaseAbstractMethod, self).__init__()

    # `in` replaces dict.has_key, which no longer exists on Python 3.
    if paper.title in BaseAbstractMethod.abstractWords:
        return

    paperAbstractWords = {}
    paperAbstractBigrams = {}
    for key, reference in paper.references.items():
        # Capitalised words only.  An earlier variant (all non-stopword
        # tokens of the abstract) was tried and disabled in the original.
        paperAbstractWords[key] = util.getCapitalWords(reference.abstract)
        paperAbstractBigrams[key] = util.getNonStopNgrams(reference.abstract, 2)

    # NOTE(review): presumably keeps, per reference, only the items not
    # shared with other references -- confirm against util.uniqueSets.
    paperAbstractWords = util.uniqueSets(paperAbstractWords)
    paperAbstractBigrams = util.uniqueSets(paperAbstractBigrams)

    # Promote a stemmed word to the unigram collection when its occurrence
    # count is at least 25% of the number of bigrams for that reference.
    for referenceKey, bigrams in paperAbstractBigrams.items():
        counts = {}
        for bigram in bigrams:
            # Bigrams are "-"-joined word pairs.
            for word in bigram.split('-'):
                stem = util.STEMMER.stem(word)
                counts[stem] = counts.get(stem, 0) + 1

        # `counts` is empty whenever `bigrams` is, so the division below
        # never sees a zero denominator.
        for stem, count in counts.items():
            if float(count) / len(bigrams) >= 0.25:
                paperAbstractWords[referenceKey].add(stem)

    if DEBUG:
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print("ABSTRACT:")
        for ref, nouns in paperAbstractWords.items():
            print("{0}\n\tWords -- {1}\n\tBigrams -- {2}".format(ref, nouns, paperAbstractBigrams[ref]))

    BaseAbstractMethod.abstractWords[paper.title] = paperAbstractWords
    BaseAbstractMethod.abstractBigrams[paper.title] = paperAbstractBigrams