def main():
    """Pick a random paper from a bibliography and list the papers most similar to it.

    The bibliography named by ``sys.argv[1]`` is loaded with ``btparse.load``.
    A random entry that has both an ``'abstract'`` and a ``'title'`` becomes
    the query.  Its tokens tagged ``NN`` by ``nltk.pos_tag`` are stemmed and
    counted into a unit-length term-count vector.  Every other entry that has
    an abstract and a title is scored by the cosine similarity between that
    vector and its own noun counts over the same vocabulary.  The best-scoring
    entries (at most ten) are printed, best first.

    Returns None.  If no entry has both fields a message is printed and the
    function returns without scoring anything.
    """
    import string  # local import: only needed for the strip set below

    stem = nltk.stem.LancasterStemmer()
    noun_tags = ("NN",)  # a real tuple; the bare ("NN") was a substring test
    strip_set = string.punctuation + string.whitespace

    def cleanword(w):
        # The previous version called w.strip(w), which removes every
        # character of the word and always produced ''.  Strip surrounding
        # punctuation/whitespace instead, then lowercase and stem.
        return stem.stem(w.strip(strip_set).lower())

    def has_text(entry):
        # Both fields are concatenated below, so require both of them.
        keys = entry.keys()
        return 'abstract' in keys and 'title' in keys

    def noun_stems(entry):
        # Cleaned stems of every NN-tagged token in abstract + title.
        tokens = nltk.wordpunct_tokenize(entry['abstract'] + " " + entry['title'])
        return [cleanword(tok) for tok, tag in nltk.pos_tag(tokens)
                if tag in noun_tags]

    bib = btparse.load(sys.argv[1])

    # Draw the query uniformly from the usable entries.  Re-drawing until a
    # usable entry turns up would never terminate when there is none.
    candidates = [i for i, entry in enumerate(bib) if has_text(entry)]
    if not candidates:
        print("no entries with both an abstract and a title in %s" % sys.argv[1])
        return
    aid = candidates[np.random.randint(len(candidates))]

    # Query vocabulary: stem -> position in q_val.  The dict makes lookups
    # O(1) and guarantees a repeated stem increments its own counter.
    q_index = {}
    q_val = []
    for w in noun_stems(bib[aid]):
        # Skip very short stems, ignored words and anything containing a
        # backslash (presumably LaTeX markup -- confirm with the data).
        if len(w) > 2 and w not in ignore_list and '\\' not in w:
            if w not in q_index:
                q_index[w] = len(q_val)
                q_val.append(0)
            q_val[q_index[w]] += 1
    q_val = np.array(q_val, dtype=float)
    if len(q_val):
        q_val /= np.sqrt(np.dot(q_val, q_val))  # normalise to unit length

    prob = np.zeros(len(bib))
    if pytools:
        progress = pytools.ProgressBar("Analysing", len(bib))
        progress.draw()
    for ind, entry in enumerate(bib):
        if ind != aid and has_text(entry):
            r_val = np.zeros(len(q_val))
            for w in noun_stems(entry):
                pos = q_index.get(w)
                if pos is not None:
                    r_val[pos] += 1
            mod = np.dot(r_val, r_val)
            if mod > 0:
                # cosine similarity: q_val already has unit length
                prob[ind] = np.dot(r_val / np.sqrt(mod), q_val)
        if pytools:
            progress.progress()
    if pytools:
        print("")

    # sort based on probability (best first)
    inds_sort = np.argsort(prob)[::-1]
    # Single-argument print(...) parses identically as a Python 2 statement.
    print('similar papers to:\n\t%s\n\t\tby: %s\n'
          % (bib[aid]['title'], bib[aid]['author']))
    # Slice rather than range(10) so short bibliographies do not overrun.
    for rank, best in enumerate(inds_sort[:10]):
        print('%3d.\t%s\n\t\tby: %s\n\t\tid = %3d, prob = %f\n'
              % (rank + 1, bib[best]['title'], bib[best]['author'],
                 best, prob[best]))
if item.has_key("abstract") and item.has_key("title"): text = nltk.wordpunct_tokenize(item["abstract"] + " " + item["title"]) for word in [x[0] for x in nltk.pos_tag(text) if x[1] in ("NN")]: word = stem.stem(word.strip(strip_chars).lower()) if len(word)>1 and word not in ignore_list: try: wordvector[word] += 1 except KeyError: wordvector[word] = 1 if pytools: progress.progress() if pytools: print "" sortedwordvector = sorted(wordvector.iteritems(), key=operator.itemgetter(1)) return sortedwordvector[-numkeep:] if __name__ == "__main__": if len(sys.argv) < 2: print "Usage: ./%s input.bib output.dat [numkeep]"%sys.argv[0] exit(127) elif len(sys.argv) == 4: numkeep = int(sys.argv[3]) else: numkeep = 0 bib = btparse.load(sys.argv[1]) globalWordVector = getGlobalWordVector(bib, numkeep=numkeep) cPickle.dump([x[0] for x in globalWordVector],open(sys.argv[2],"w+")) print "Top 50 words:" for word, count in globalWordVector[-50:]: print "%-15s %d"%(word,count)