def get_wiki_pmi_coherence(topics, numterms=NUM_TERMS):  # TODO make sure the terms are already stemmed
    """ Coherence score from (Newman, 2010 Automatic Evaluation of Topic Models) """
    dbase = db(WIKI_COCC_DB)
    if not dbase.check_table_existence('co_occ'):
        return {}
    scores = {}
    rtime = time()
    tid_dict = {}   # keep terms and co-occurrence counts in memory for caching
    cocc_dict = {}
    for i in xrange(len(topics)):
        scores[topics[i].id] = []
        print 'Determining Wikipedia PMI coherence for topic %i of %i; last topic took %0.1fs' % (i, len(topics), time() - rtime)
        rtime = time()

        # prep the top numterms terms
        titles = []
        topics[i].get_terms(numterms)
        for j in xrange(numterms):  # TODO make sure stemming is handled consistently
            titles.append(stem(topics[i].get_term(j).title))
            if not tid_dict.has_key(titles[-1]):
                res = dbase.get_wiki_occ(titles[-1])
                if res == []:  # don't include terms that are not in the database TODO better way to handle this?
                    del titles[-1]
                    numterms -= 1
                    continue
                tid_dict[titles[-1]] = [res[0], res[1]]  # res[0] is the term_id, res[1] is the occurrence count

        for m in xrange(1, numterms):
            tid1 = tid_dict[titles[m]][0]
            t1_occ = tid_dict[titles[m]][1]
            for l in xrange(0, m):  # [x]range goes to m-1
                tid2 = tid_dict[titles[l]][0]
                t2_occ = tid_dict[titles[l]][1]
                min_tid = min(tid1, tid2)
                max_tid = max(tid1, tid2)

                # see if we already found the given co-occurrence
                db_cocc_lookup = True
                if cocc_dict.has_key(min_tid):
                    if cocc_dict[min_tid].has_key(max_tid):
                        db_cocc_lookup = False
                else:
                    cocc_dict[min_tid] = {}
                if db_cocc_lookup:
                    cocc_dict[min_tid][max_tid] = dbase.get_wiki_cocc(tid1, tid2, min(t1_occ, t2_occ))
                co_occs = cocc_dict[min_tid][max_tid]

                numer = (co_occs + 1) * WIKI_NUM_ABST  # +1 is for smoothing
                denom = t1_occ * t2_occ
                scores[topics[i].id].append(log(float(numer) / denom))
    return scores
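# Illustrative sketch (not part of the module): the quantity appended above for each term
# pair is a smoothed pointwise mutual information estimated over Wikipedia abstracts,
#   PMI(w1, w2) ~= log( ((cooc + 1) * N) / (occ1 * occ2) ),
# where N is the number of abstracts (WIKI_NUM_ABST), occ1/occ2 are single-term abstract
# counts, and cooc is the joint count. The helper below is a hypothetical standalone
# version of that per-pair score; its name and signature are assumptions, not the module's API.
from math import log

def _pairwise_pmi(cooc, occ1, occ2, num_docs):
    """Smoothed PMI for one term pair, matching the numerator/denominator used above."""
    return log(float((cooc + 1) * num_docs) / (occ1 * occ2))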
def init_rel_db(self):
    """ Initialize the relationship (TMA) database by creating the appropriate tables """
    self.dbase = db(self.params['outdir'] + '/tma.sqlite')
    self.dbase.add_table("doc_doc (id INTEGER PRIMARY KEY, doc_a INTEGER, doc_b INTEGER, score FLOAT)")
    self.dbase.add_table("doc_topic (id INTEGER PRIMARY KEY, doc INTEGER, topic INTEGER, score FLOAT)")
    self.dbase.add_table("topics (id INTEGER PRIMARY KEY, title VARCHAR(100), score FLOAT)")
    self.dbase.add_table("topic_term (id INTEGER PRIMARY KEY, topic INTEGER, term INTEGER, score FLOAT)")
    self.dbase.add_table("topic_topic (id INTEGER PRIMARY KEY, topic_a INTEGER, topic_b INTEGER, score FLOAT)")
    self.dbase.add_table("doc_term (id INTEGER PRIMARY KEY, doc INTEGER, term INTEGER, score FLOAT)")
    self.dbase.add_table("terms (id INTEGER PRIMARY KEY, title VARCHAR(100), count INTEGER)")
    self.dbase.add_table("docs (id INTEGER PRIMARY KEY, title VARCHAR(100))")
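# Illustrative sketch (an assumption about downstream use, not part of the module): once
# init_rel_db has created tma.sqlite, the relationship tables can be queried directly with
# the standard sqlite3 module, e.g. to list the highest-scoring topics for a document id.
import sqlite3

def _top_topics_for_doc(db_path, doc_id, limit=5):
    """Return (topic title, score) pairs for one document, best first (hypothetical helper)."""
    conn = sqlite3.connect(db_path)
    rows = conn.execute(
        "SELECT topics.title, doc_topic.score "
        "FROM doc_topic JOIN topics ON doc_topic.topic = topics.id "
        "WHERE doc_topic.doc = ? ORDER BY doc_topic.score DESC LIMIT ?",
        (doc_id, limit)).fetchall()
    conn.close()
    return rows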
def get_bing_coherence_dict(terms_dict, corpus_dbloc, numtitles=50):
    """ Coherence from (Newman, 2011 Automatic...) (search index with Bing) """
    dbase = db(corpus_dbloc)
    # do we have a de-stemming table?
    destem = dbase.check_table_existence("termid_to_prestem")
    bing = SBing(BING_API_KEY)
    scores = {}
    # TODO store more metadata so we can click through and see more of the analysis
    # (e.g. which terms appeared in titles, frequency, co-occurrence, which titles we were working with, etc.)
    print 'Querying Bing...'
    for i, key in enumerate(terms_dict):
        terms = terms_dict[key]
        topic_terms = []
        for trm in terms:
            if destem:
                trm_title = dbase.get_prestem(trm[1])[0][0]  # TODO what if the stemmed term doesn't exist for some reason?
            else:
                trm_title = trm[0]
            topic_terms.append(trm_title)
        topic_terms.sort()  # sort for linear overlapping scans with search titles
        search_qry = ' '.join(topic_terms)
        topic_terms = map(stem, topic_terms)  # TODO make stemming optional on match?
        print '-topic %i of %i: %s' % (i, len(terms_dict), search_qry),

        tmatches = 0
        for j in xrange(0, numtitles, 50):
            try:
                json_response = bing.search(qry=search_qry, top=50, skip=j)
            except HTTPError:
                print 'Error accessing Bing -- make sure your API key is correct'  # TODO propagate this message to the display
                return {}
            responses = json_response['d']['results']
            title_terms = map(lambda resp: resp['Title'].strip().lower().split(), responses)  # TODO make case sensitive if desired; TODO make stemming optional
            title_terms = [item for sublist in title_terms for item in sublist]  # flatten the list of lists
            title_terms = map(stem, title_terms)
            title_terms.sort()
            tle_n = 0
            top_n = 0
            # presorting the lists allows linear scans
            while tle_n < len(title_terms) and top_n < len(topic_terms):
                cval = cmp(title_terms[tle_n], topic_terms[top_n])
                if cval == 0:     # match
                    tmatches += 1
                    tle_n += 1
                elif cval == -1:  # title term < topic term
                    tle_n += 1
                else:             # title term > topic term
                    top_n += 1
        print ': %i' % tmatches
        scores[key] = tmatches
    return scores
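# Illustrative sketch (hypothetical helper, not part of the module): the inner while loop
# above is a two-pointer scan over two pre-sorted lists, counting how many search-result
# title terms match any of the topic terms. A standalone version of the same idea:
def _count_sorted_matches(title_terms, topic_terms):
    """Count title terms that equal some topic term; both lists must be sorted."""
    matches = tle_n = top_n = 0
    while tle_n < len(title_terms) and top_n < len(topic_terms):
        if title_terms[tle_n] == topic_terms[top_n]:
            matches += 1
            tle_n += 1   # consume the title term, keep the topic term for repeats
        elif title_terms[tle_n] < topic_terms[top_n]:
            tle_n += 1   # title term is too small, advance it
        else:
            top_n += 1   # topic term is too small, advance it
    return matches

# e.g. _count_sorted_matches(['cat', 'cat', 'dog', 'fish'], ['cat', 'dog']) -> 3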
def get_topic_coherence_scores(topics, corpus_dbloc, numterms=NUM_TERMS):
    """ Coherence from (Mimno, 2011 Topic Coherence...) """
    dbase = db(corpus_dbloc)
    scores = {}
    for i in xrange(len(topics)):
        scores[topics[i].id] = []
        topics[i].get_terms(numterms)  # prep the top numterms terms
        for m in xrange(1, numterms):
            for l in xrange(m):  # [x]range goes from 0 to m-1
                # TODO: could optimize the intersection by sorting the sqlite query
                dl_set = set(dbase.get_doc_occ(topics[i].get_term(l).id))
                dm_set = set(dbase.get_doc_occ(topics[i].get_term(m).id))
                dl = len(dl_set)
                dml = len(dl_set.intersection(dm_set))
                scores[topics[i].id].append(log(float(dml + 1) / dl))
    del dbase
    return scores
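# Illustrative note (not part of the module): the quantity appended above is the Mimno et
# al. (2011) topic-coherence term for one pair of topic words,
#   log( (D(w_l, w_m) + 1) / D(w_l) ),
# where D(w_l) is the number of documents containing w_l and D(w_l, w_m) is the number of
# documents containing both; summing scores[topic_id] gives the coherence for that topic.
# A hypothetical standalone version of the pairwise score:
from math import log

def _coherence_pair_score(docs_with_wl, docs_with_both):
    """One pairwise term of the Mimno 2011 coherence sum (+1 smoothing in the numerator)."""
    return log((docs_with_both + 1.0) / docs_with_wl)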
def parse_folder(self, folder):
    """ parses the various datatypes in the folder and writes the lda-c format to file """
    # obtain list of all pdfs (TODO add heterogeneous file types)
    pdflist = os.popen("find %s -name '*.pdf' -type f" % folder)
    pdflist = pdflist.readlines()
    pdflist = map(lambda x: x.strip(), pdflist)
    self.pdf_list.extend(pdflist)
    toparsetexts = []
    if len(pdflist):
        print '--- beginning pdf to text conversion ---'
        for pdf in pdflist:
            doctitle = self._obtain_clean_title(pdf)
            txtname = self.textdir + '/%s.txt' % doctitle
            cmd = 'pdftotext %s %s' % (pdf, txtname)  # TODO: figure out and print which documents did not convert
            os.system(cmd)
            toparsetexts.append(txtname)
            self.rawtextfiles.append(txtname)
        print '--- finished pdf to text conversion ---'

    print '--- adding text to corpus ---'
    # add textual data
    txtlist = os.popen("find %s -name '*.txt' -type f" % folder)  # add text files included in folder
    txtlist = map(lambda x: x.strip(), txtlist)
    for txtf in txtlist:
        doctitle = self._obtain_clean_title(txtf)
        txtname = self.textdir + '/%s.txt' % doctitle
        try:
            os.system('ln -s %s %s' % (txtf, txtname))
        except IOError:
            print 'Warning: will not include %s, could not parse text file' % txtf
            continue
        toparsetexts.append(txtname)
        self.rawtextfiles.append(txtname)  # TODO: fix code repetition with parsing pdfs

    # now add all of the new texts to the corpus
    cfile = self.open_corpus()
    if self.usepara:
        # make a directory for each of the individual paragraphs
        if not os.path.exists(self.paradir):
            os.makedirs(self.paradir)
    else:
        # make a link to the textdir with the same name as the individual paragraph directory
        if not os.path.exists(self.paradir):
            os.system('ln -s %s %s' % (self.textdir, self.paradir))

    # initialize the database to keep track of term-doc occurrences
    dbase = db(self.corpus_db)
    if not self.parsed_data:
        dbase.add_table('term_doc_pair(id INTEGER PRIMARY KEY, term INTEGER, doc INTEGER)')
        if self.make_stem_db:
            dbase.add_table('termid_to_prestem(id INTEGER PRIMARY KEY, prestem VARCHAR)')

    # add the data to the corpus
    for tfile in toparsetexts:
        title = tfile.split('/')[-1].split('.')[0].replace('-', ' ')
        wordcounts = dict()
        prestem_dic = dict()
        try:
            infile = open(tfile, 'r')
        except IOError:
            print 'WARNING: could not find %s, will not include' % tfile
            continue
        useparanum = 1
        totparanum = 1
        for paraline in infile:
            totparanum += 1
            words = paraline.split()
            for wrd in words:
                wrd = self.parse_word(wrd)
                if wrd == '':
                    continue
                else:
                    prestem = wrd
                    if self.dostem:
                        wrd = stem(wrd)
                    if wordcounts.has_key(wrd):
                        wordcounts[wrd] += 1
                    else:
                        wordcounts[wrd] = 1
                    # keep track of the unstemmed forms of new words for later reference.
                    # TODO this currently keeps the unstemmed form of the first encounter of a stemmed word: perhaps make more general?
                    if self.make_stem_db and not self.vocab.has_key(wrd):
                        prestem_dic[wrd] = prestem
            if self.usepara:
                if sum(wordcounts.values()) > self.minwords:
                    self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                    usetitle = title + ' [P%d]' % useparanum
                    self.titles.append(usetitle)
                    if not isinstance(usetitle, unicode):
                        usetitle = unicode(usetitle)
                    self.write_document(os.path.join(self.paradir, slugify(usetitle)), paraline)
                    useparanum += 1
                wordcounts = dict()
                prestem_dic = dict()
        infile.close()
        if not self.usepara:
            if sum(wordcounts.values()) > self.minwords:
                self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                self.titles.append(title)
    cfile.close()
    dbase.commit()
    if not self.parsed_data:
        dbase.add_index('term_doc_pair_idx1 ON term_doc_pair(term)')
        dbase.add_index('term_doc_pair_idx2 ON term_doc_pair(doc)')
        dbase.commit()
    print '--- finished adding text to corpus ---'
    print
    self.parsed_data = True
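# Illustrative sketch (an assumption inferred from the docstring and from how tfidf_clean
# parses the corpus file below): each document is written one line per document in LDA-C form,
#   <num_unique_terms> <term_id>:<count> <term_id>:<count> ...
# A minimal standalone formatter for a {term_id: count} mapping might look like this
# (write_doc_line above additionally records term-doc pairs in the sqlite database):
def _format_ldac_line(wordcounts):
    """Format one document's {term_id: count} dict as an LDA-C corpus line (hypothetical helper)."""
    pairs = ' '.join('%d:%d' % (tid, ct) for tid, ct in sorted(wordcounts.iteritems()))
    return '%d %s' % (len(wordcounts), pairs)

# e.g. _format_ldac_line({0: 3, 7: 1}) -> '2 0:3 7:1'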
def tfidf_clean(self, top_k_terms=5000, min_df=5):
    """
    Use tf-idf to clean the corpus: take the top tf-idf score of each term and retain the top_k_terms terms.

    Warning: by default tfidf_clean changes the corpus's corpusfile to the cleaned version
    and moves the original version to {{original_name}}-pre_tfidf

    @param top_k_terms: keep the top_k_terms terms by tf-idf rank
    @param min_df: minimum document frequency for the terms
    """
    if not self.corpus_used:
        print "WARNING: You must first parse some data before calling tfidf_clean"
        return False
    orig_corpusfile = self.corpusfile + '-pre_tfidf'
    shutil.move(self.corpusfile, orig_corpusfile)

    # first obtain tf-idf scores for all terms
    tf_list = [0] * self.vocabct
    df_list = [0] * self.vocabct
    tfidf_list = [0] * self.vocabct
    for doc in open(orig_corpusfile, 'r'):
        cts = doc.strip().split()[1:]  # remove the term count
        term_ct_pairs = map(lambda x: x.split(':'), cts)
        doc_len = sum(map(lambda x: int(x[1]), term_ct_pairs))
        for pair in term_ct_pairs:
            trm = int(pair[0])
            tf = float(pair[1]) / doc_len
            df_list[trm] += 1
            if tf > tf_list[trm]:
                tf_list[trm] = tf
    # calculate tf-idf scores
    for i in xrange(self.vocabct):
        tfidf_list[i] = tf_list[i] * log10(float(self.docct) / df_list[i])
    # determine the minimum tf-idf score
    srt_tfidf = sorted(tfidf_list, reverse=True)
    min_score = srt_tfidf[top_k_terms]

    # rewrite the corpus to file, only allowing terms whose max tf-idf score exceeds the minimum
    old_to_new_dict = dict()
    self.vocabct = 0
    self.wordct = 0
    writefile = open(self.corpusfile, 'w')
    for doc in open(orig_corpusfile, 'r'):
        writeline = ''
        cts = doc.strip().split()[1:]
        term_ct_pairs = map(lambda x: x.split(':'), cts)
        doc_term_ct = 0
        for tc_pair in term_ct_pairs:
            tid = int(tc_pair[0])
            if tfidf_list[tid] < min_score or df_list[tid] < min_df:
                continue
            if not old_to_new_dict.has_key(tid):
                old_to_new_dict[tid] = self.vocabct
                self.vocabct += 1
            self.wordct += int(tc_pair[1])
            writeline += str(old_to_new_dict[tid]) + ':' + tc_pair[1] + ' '
            doc_term_ct += 1
        writeline = str(doc_term_ct) + " " + writeline
        writefile.write(writeline + '\n')
    writefile.close()
    remove_ct = len(tfidf_list) - len(old_to_new_dict)
    print 'Processing removed %i of %i terms, keeping %i terms. Min TF-IDF score is: %0.4f' % (remove_ct, len(tfidf_list), len(old_to_new_dict), min_score)

    # update the appropriate databases  TODO: perhaps wait to form the databases for efficiency
    dbase = db(self.corpus_db)
    if self.make_stem_db:
        oldid_to_prestem = dbase.fetch('SELECT * FROM termid_to_prestem')
        dbase.execute('DROP TABLE termid_to_prestem')
        dbase.add_table('termid_to_prestem(id INTEGER PRIMARY KEY, prestem VARCHAR)')
        id_prestem_list = []
        for op_item in map(list, oldid_to_prestem):
            if old_to_new_dict.has_key(op_item[0]):
                op_item[0] = old_to_new_dict[op_item[0]]
                id_prestem_list.append(op_item)
        dbase.executemany('INSERT INTO termid_to_prestem(id, prestem) VALUES(?,?)', id_prestem_list)
    dbase.execute('SELECT * FROM term_doc_pair')
    term_doc_items = []
    for item in dbase.cur:
        if old_to_new_dict.has_key(item[1]):
            item = list(item)
            item[1] = old_to_new_dict[item[1]]
            term_doc_items.append(item[1:])
    dbase.execute('DROP TABLE term_doc_pair')
    dbase.add_table('term_doc_pair(id INTEGER PRIMARY KEY, term INTEGER, doc INTEGER)')
    dbase.executemany('INSERT INTO term_doc_pair(term, doc) VALUES(?,?)', term_doc_items)
    dbase.add_index('term_doc_pair_idx1 ON term_doc_pair(term)')
    dbase.add_index('term_doc_pair_idx2 ON term_doc_pair(doc)')
    del dbase
    del term_doc_items

    # update corpus vocab
    oldid_to_term = dict((v, k) for k, v in self.vocab.iteritems())
    self.vocab = {}
    for k, v in old_to_new_dict.iteritems():
        self.vocab[oldid_to_term[k]] = v
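# Illustrative note (not part of the module): the ranking score used above is, per term i,
#   tfidf_i = max_d tf(i, d) * log10(D / df_i),
# i.e. the term's best within-document frequency times its inverse document frequency over
# D documents; a term is kept only if tfidf_i reaches the top_k_terms cutoff and df_i >= min_df.
# A hypothetical standalone version of the same score:
from math import log10

def _max_tfidf(max_tf, doc_freq, num_docs):
    """Score a term by its highest per-document tf times idf, as in tfidf_clean."""
    return max_tf * log10(float(num_docs) / doc_freq)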