def stem(*args):
    """
    .. function:: stem(text1, text2,...) -> text

    Does stemming according to the Porter algorithm.

    Examples:

    >>> sql("stem 'cutting and creating'")
    stem('cutting and creating')
    ----------------------------
    cut and creat
    >>> sql("stem ceci est en français cutting")
    stem('ceci est en français cutting')
    ------------------------------------
    ceci est en françai cut
    """
    out = []
    for i in args:
        o = i.lower()
        o = o.strip()
        o = o.split(" ")
        for k in o:
            if len(k) > 0:
                out.append(porter.stem(k))
    return " ".join(out)
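# stem() assumes a module-level `porter` object exposing .stem(word). A minimal
# sketch of one way to provide it (an assumption, not necessarily this module's
# actual setup) is NLTK's Porter implementation, whose .stem() signature
# matches the calls above:
#
#   from nltk.stem.porter import PorterStemmer
#   porter = PorterStemmer()
#   stem('cutting and creating')   # -> 'cut and creat'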
def _import_dfr(self, dfr_dir):
    citations = self._import_dfr_metadata(dfr_dir)
    wordcounts_dir = os.path.join(dfr_dir, "wordcounts")
    for doi in citations.keys():
        try:
            this_text = ''
            csv_path = os.path.join(wordcounts_dir, "wordcounts_" + doi.replace('/', '_') + ".CSV")
            for rowdict in self.parse_csv(csv_path):
                word = rowdict["WORDCOUNTS"]
                if word in self.stopwords:
                    continue
                if self.stemming:
                    prestem = word
                    if word not in self.stemmed:
                        self.stemmed[prestem] = stem(prestem)
                    word = self.stemmed[prestem]
                count = int(rowdict["WEIGHT"])
                # reconstruct the document text by repeating each word `count` times
                this_text += (word + u' ') * count
            # skip documents whose reconstructed text is too short to be useful
            if len(this_text) < 20:
                continue
            yield doi, this_text
        except Exception:
            # a bare `except:` would also swallow KeyboardInterrupt; log and move on
            logging.error(doi)
            logging.error(traceback.format_exc())
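# For reference, a DfR wordcounts file is assumed here to be a two-column CSV
# whose header supplies the keys parse_csv yields per row, e.g. (values
# illustrative):
#
#   WORDCOUNTS,WEIGHT
#   evolution,12
#   species,9
#
# so rowdict["WORDCOUNTS"] is the token and rowdict["WEIGHT"] its count.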
def get_wiki_pmi_coherence(topics, numterms=NUM_TERMS):
    """
    Coherence score from (Newman, 2010, "Automatic Evaluation of Topic Models")
    """
    # TODO make sure the terms are already stemmed
    dbase = db(WIKI_COCC_DB)
    if not dbase.check_table_existence('co_occ'):
        return {}
    scores = {}
    rtime = time()
    tid_dict = {}   # keep terms and co-occurrence counts in memory for caching
    cocc_dict = {}
    for i in xrange(len(topics)):
        scores[topics[i].id] = []
        print 'Determining wikipedia PMI coherence for topic %i of %i; last topic took %0.1fs' % (i, len(topics), time() - rtime)
        rtime = time()

        # prep the top numterms terms
        titles = []
        topics[i].get_terms(numterms)
        for j in xrange(numterms):
            # TODO make sure stemming is handled consistently
            titles.append(stem(topics[i].get_term(j).title))
            if not tid_dict.has_key(titles[-1]):
                res = dbase.get_wiki_occ(titles[-1])
                if res == []:
                    # don't include terms that are not in the database
                    # TODO better way to handle this? note that numterms is
                    # decremented here and the change persists across topics
                    del titles[-1]
                    numterms -= 1
                    continue
                tid_dict[titles[-1]] = [res[0], res[1]]  # res[0] is the term_id, res[1] is the occurrence count

        for m in xrange(1, numterms):
            tid1 = tid_dict[titles[m]][0]
            t1_occ = tid_dict[titles[m]][1]
            for l in xrange(0, m):  # xrange goes to m-1
                tid2 = tid_dict[titles[l]][0]
                t2_occ = tid_dict[titles[l]][1]
                min_tid = min(tid1, tid2)
                max_tid = max(tid1, tid2)

                # see if we already found the given co-occurrence
                db_cocc_lookup = True
                if cocc_dict.has_key(min_tid):
                    if cocc_dict[min_tid].has_key(max_tid):
                        db_cocc_lookup = False
                else:
                    cocc_dict[min_tid] = {}

                if db_cocc_lookup:
                    cocc_dict[min_tid][max_tid] = dbase.get_wiki_cocc(tid1, tid2, min(t1_occ, t2_occ))
                co_occs = cocc_dict[min_tid][max_tid]

                numer = (co_occs + 1) * WIKI_NUM_ABST  # +1 is for smoothing
                denom = t1_occ * t2_occ
                scores[topics[i].id].append(log(float(numer) / denom))
    return scores
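# The score appended for each term pair above is smoothed pointwise mutual
# information (PMI) over Wikipedia abstracts. A minimal standalone sketch of
# the same formula, assuming n_abstracts plays the role of WIKI_NUM_ABST:
from math import log

def pmi(cooc, occ1, occ2, n_abstracts):
    # PMI = log( P(w1,w2) / (P(w1) * P(w2)) )
    #     = log( ((cooc + 1) / N) / ((occ1 / N) * (occ2 / N)) )
    #     = log( (cooc + 1) * N / (occ1 * occ2) )   with +1 smoothing, as above
    return log(float(cooc + 1) * n_abstracts / (occ1 * occ2))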
def groupNounsWithCounts(self, nounsWithCounts):
    ret = []
    stemToNouns = {}  # { <stem> : { <noun> : True } }

    # populate stemToNouns: bucket every noun under its stem
    for noun in nounsWithCounts.keys():
        currStem = stem(noun)
        if currStem not in stemToNouns:
            stemToNouns[currStem] = {}
        if noun not in stemToNouns[currStem]:
            stemToNouns[currStem][noun] = True

    # populate ret: merge counts and sentences of nouns that share a stem
    for currStem in stemToNouns.keys():
        nounToTrue = stemToNouns[currStem]
        currObj = {
            'values': [],
            'positive': {},
            'negative': {},
        }
        for noun in sorted(nounToTrue.keys()):
            currObj['values'].append(noun)
            positiveCountDict = nounsWithCounts[noun]['positive']
            negativeCountDict = nounsWithCounts[noun]['negative']
            for neighbor in positiveCountDict.keys():
                if neighbor not in currObj['positive']:
                    currObj['positive'][neighbor] = {'count': 0, 'sentences': []}
                currObj['positive'][neighbor]['count'] += positiveCountDict[neighbor]['count']
                currObj['positive'][neighbor]['sentences'].extend(positiveCountDict[neighbor]['sentences'])
            for neighbor in negativeCountDict.keys():
                if neighbor not in currObj['negative']:
                    currObj['negative'][neighbor] = {'count': 0, 'sentences': []}
                currObj['negative'][neighbor]['count'] += negativeCountDict[neighbor]['count']
                currObj['negative'][neighbor]['sentences'].extend(negativeCountDict[neighbor]['sentences'])
        ret.append(currObj)
    return ret
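# Illustrative input/output shape for groupNounsWithCounts (the noun and
# neighbor strings are made up). Both nouns stem to 'cut', so they merge:
#
#   nounsWithCounts = {
#       'cut':  {'positive': {'clean': {'count': 2, 'sentences': ['...']}}, 'negative': {}},
#       'cuts': {'positive': {'clean': {'count': 1, 'sentences': ['...']}}, 'negative': {}},
#   }
#
#   groupNounsWithCounts(nounsWithCounts) ->
#   [{'values': ['cut', 'cuts'],
#     'positive': {'clean': {'count': 3, 'sentences': ['...', '...']}},
#     'negative': {}}]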
def stem_en(*args):
    """
    .. function:: stem_en(text1, text2,...) -> text

    Detects if the input is in English, and only then does the Porter
    stemming; otherwise it returns the input arguments concatenated.

    Examples:

    >>> sql("stem_en 'cutting and creating'")
    stem_en('cutting and creating')
    -------------------------------
    cut and creat
    >>> sql("stem_en ceci est en français cutting")
    stem_en('ceci est en français cutting')
    ----------------------------------------
    ceci est en français cutting
    """
    jargs = "".join(args)
    if detectlang(*args) != "english":
        return jargs
    out = []
    for i in args:
        o = i.lower()
        o = o.strip()
        o = o.split(" ")
        for k in o:
            if len(k) > 0:
                out.append(porter.stem(k))
    return " ".join(out)
def _import_files(self):
    if self.stemming:
        self.stemmed = {}  # cache: raw word -> stemmed form
    self.docs = []
    with codecs.open(self.texts_file, 'w', encoding='utf-8') as f:
        for filename in self.files:
            with codecs.open(filename, 'r', encoding='utf-8') as input_file:
                text = input_file.read()
                # lowercase and strip everything except word characters and spaces
                text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE)
            if self.stemming:
                newtext = u''
                for word in text.split():
                    if word not in self.stemmed:
                        self.stemmed[word] = stem(word)
                    newtext += self.stemmed[word] + u' '
                text = newtext
            f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n')
            self.docs.append(filename)
        if self.dfr:
            for doi, text in self._import_dfr(self.dfr_dir):
                f.write(u'\t'.join([doi, self.metadata[doi]["label"], text]) + u'\n')
                self.docs.append(doi)
    self.doc_count = len(self.docs)
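# Each line written to texts_file above is tab-separated, one document per
# line (values illustrative):
#
#   <doc id or filename>\t<label>\t<normalized, optionally stemmed text>\n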
def parse_folder(self, folder):
    """
    Parses the various datatypes in the folder and writes the lda-c format to file.
    """
    # obtain list of all pdfs (TODO add heterogeneous file types)
    pdflist = os.popen("find %s -name '*.pdf' -type f" % folder)
    pdflist = [x.strip() for x in pdflist.readlines()]
    self.pdf_list.extend(pdflist)
    toparsetexts = []
    if len(pdflist):
        print '--- beginning pdf to text conversion ---'
        for pdf in pdflist:
            doctitle = self._obtain_clean_title(pdf)
            txtname = self.textdir + '/%s.txt' % doctitle
            # TODO: figure out and print which documents did not convert
            cmd = 'pdftotext %s %s' % (pdf, txtname)
            os.system(cmd)
            toparsetexts.append(txtname)
            self.rawtextfiles.append(txtname)
        print '--- finished pdf to text conversion ---'

    print '---adding text to corpus---'
    # add text files included in folder
    txtlist = os.popen("find %s -name '*.txt' -type f" % folder)
    txtlist = [x.strip() for x in txtlist]
    for txtf in txtlist:
        doctitle = self._obtain_clean_title(txtf)
        txtname = self.textdir + '/%s.txt' % doctitle
        try:
            os.system('ln -s %s %s' % (txtf, txtname))
        except IOError:
            print 'Warning: will not include %s, could not parse text file' % txtf
            continue
        toparsetexts.append(txtname)
        self.rawtextfiles.append(txtname)  # TODO: fix code repetition with parsing pdfs

    # now add all of the new texts to the corpus
    cfile = self.open_corpus()
    if self.usepara:
        # make a directory for each of the individual paragraphs
        if not os.path.exists(self.paradir):
            os.makedirs(self.paradir)
    else:
        # make a link to the textdir with the same name as the individual paragraph directory
        if not os.path.exists(self.paradir):
            os.system('ln -s %s %s' % (self.textdir, self.paradir))

    # initialize the database to keep track of term-doc occurrences
    dbase = db(self.corpus_db)
    if not self.parsed_data:
        dbase.add_table('term_doc_pair(id INTEGER PRIMARY KEY, term INTEGER, doc INTEGER)')
        if self.make_stem_db:
            dbase.add_table('termid_to_prestem(id INTEGER PRIMARY KEY, prestem VARCHAR)')

    # add the data to the corpus
    for tfile in toparsetexts:
        title = tfile.split('/')[-1].split('.')[0].replace('-', ' ')
        wordcounts = dict()
        prestem_dic = dict()
        try:
            infile = open(tfile, 'r')
        except IOError:
            print 'WARNING: could not find %s, will not include' % tfile
            continue
        useparanum = 1
        totparanum = 1
        for paraline in infile:
            totparanum += 1
            words = paraline.split()
            for wrd in words:
                wrd = self.parse_word(wrd)
                if wrd == '':
                    continue
                prestem = wrd
                if self.dostem:
                    wrd = stem(wrd)
                if wordcounts.has_key(wrd):
                    wordcounts[wrd] += 1
                else:
                    wordcounts[wrd] = 1
                # keep track of the unstemmed forms of new words for later reference.
                # TODO this currently keeps the unstemmed form of the first encounter
                # of a stemmed word: perhaps make more general?
                if self.make_stem_db and not self.vocab.has_key(wrd):
                    prestem_dic[wrd] = prestem
            if self.usepara:
                if sum(wordcounts.values()) > self.minwords:
                    self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                    usetitle = title + ' [P%d]' % useparanum
                    self.titles.append(usetitle)
                    if not isinstance(usetitle, unicode):
                        usetitle = unicode(usetitle)
                    self.write_document(os.path.join(self.paradir, slugify(usetitle)), paraline)
                    useparanum += 1
                # reset the per-paragraph counters whether or not the paragraph was kept
                wordcounts = dict()
                prestem_dic = dict()
        infile.close()
        if not self.usepara:
            if sum(wordcounts.values()) > self.minwords:
                self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                self.titles.append(title)
    cfile.close()
    dbase.commit()
    if not self.parsed_data:
        dbase.add_index('term_doc_pair_idx1 ON term_doc_pair(term)')
        dbase.add_index('term_doc_pair_idx2 ON term_doc_pair(doc)')
        dbase.commit()
    print '--- finished adding text to corpus ---'
    print
    self.parsed_data = True
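# For reference, the lda-c corpus format that write_doc_line is assumed to
# emit is one document per line: the number of unique terms, followed by
# termid:count pairs, e.g. a document with three unique terms:
#
#   3 0:2 5:1 9:4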
def textacknowledgmentsstem(txt, span=10, maxlen=3, pattern=r'(?:support)|(?:thank)|(?:research)|(?:\bwork\b)|(?:\bgrant\b)|(?:project)|(?:scienc)|(?:\bfund\b)|(?:nation)|(?:author)|(?:foundat)|(?:\bprogram\b)|(?:\bhelp\b)|(?:univers)|(?:paper)|(?:technolog)|(?:partial)|(?:comment)|(?:develop)|(?:acknowledg)|(?:review)|(?:provid)|(?:grate)|(?:\bcenter\b)|(?:studi)|(?:discuss)|(?:particip)|(?:ministri)|(?:contribut)|(?:european)|(?:system)|(?:comput)|(?:number)|(?:valuabl)|(?:educ)|(?:council)|(?:award)|(?:contract)|(?:inform)|(?:institut)'):
    """
    .. function:: textacknowledgmentsstem(text, span = 10, maxlen = 3, pattern = ...)

    Returns the "Acknowledgments" section of documents. To find it, it searches
    for parts of the document that have a high density of pattern matches.

    .. parameters:: txt, span, maxlen, pattern
       txt: input text.
       span: the size, in words, of the chunks into which txt is split.
       maxlen: the size of the scrolling window over the text in which the
           density is calculated.
       pattern: regular expression that is matched against the chunks of the
           text. By default the pattern matches stemmed words that commonly
           occur in acknowledgment sections, so as to extract sections that
           look like acknowledgments.

    Examples:

    >>> sql("select textacknowledgmentsstem('')")
    textacknowledgmentsstem('')
    ---------------------------
    <BLANKLINE>
    """
    exp = re.sub('\r\n', '\n', txt)
    exp = reduce_spaces.sub(' ', strip_remove_newlines.sub('', exp))
    # short texts are returned unchanged
    if exp.count(' ') < span * 10:
        return exp
    acknowledgments = []
    origwords = exp.split(' ')
    words = exp.lower().split(' ')
    stemed = []
    for k in words:
        if len(k) > 0:
            try:
                stemed.append(porter.stem(k))
            except Exception:
                stemed.append(k)
    spanedorigtext = [' '.join(origwords[i:i + span]) for i in range(0, len(origwords), span)]
    spanedstemtext = [' '.join(stemed[i:i + span]) for i in range(0, len(stemed), span)]
    results = []
    densities = []
    # zero-pad both ends so the scrolling window is centred on each chunk
    for i in xrange(maxlen / 2):
        results.append(0)
    for chunk in spanedstemtext:
        results.append(sum(1 for m in re.finditer(pattern, chunk)))
    for i in xrange(maxlen / 2):
        results.append(0)
    for i in xrange(maxlen / 2, len(results) - maxlen / 2):
        densities.append(sum(results[i - maxlen / 2:i - maxlen / 2 + maxlen]) * 1.0 / maxlen)
    threshold = 1
    current = 0
    for chunk in spanedorigtext:
        if len(chunk) > 10:
            if densities[current] > threshold:
                acknowledgments.append(chunk)
        current += 1
    return '\n'.join(acknowledgments)
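# A toy trace of the density computation above, with one match count per
# chunk and maxlen=3, to show how the centred moving average works:
#
#   matches per chunk:  [0, 0, 2, 3, 1, 0]
#   zero-padded:        [0, 0, 0, 2, 3, 1, 0, 0]
#   window averages:    [0.0, 0.67, 1.67, 2.0, 1.33, 0.33]
#
# Chunks whose density exceeds the threshold of 1 (here the third and fourth)
# are kept as acknowledgment text.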