def serial_topics(abstracts, num):
    '''
    Serial computation of topic models for all abstracts.

    Builds a gensim dictionary and bag-of-words corpus from every abstract's
    'cleantext', writes them to 'abstracts.dict' and 'abstracts.mm' (relative
    paths), fits one LSA and one LDA model with ``num`` topics on the TF-IDF
    weighted corpus, and stores each abstract's topic vectors on the abstract.

    Args:
        abstracts: sequence of objects exposing Get(key) and Set(key, value).
            Get('cleantext') is handed to corpora.Dictionary as one document
            (presumably a list of tokens -- confirm against the loader).
        num: number of topics; forwarded to Lsa.serial and Lda.serial and
            stored on every abstract under 'numtopics'.

    Returns:
        None. Every abstract is updated in place with the keys 'lsa', 'lda'
        (each a defaultdict(float) mapping entry[0] -> entry[1] of the model
        output) and 'numtopics'.
    '''
    # prepare dictionary and corpora for topic modeling
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    dictionary = corpora.Dictionary(docs)
    dictionary.save('abstracts.dict')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('abstracts.mm', corpus)

    # use gensim tfidf to transform
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # load lsa and lda models
    lsaModel = Lsa.serial(corpus_tfidf, dictionary, num)
    ldaModel = Lda.serial(corpus_tfidf, dictionary, num)

    # store lda and lsa representation in all abstracts
    # corpus was built from abstracts one-to-one, so the two can be walked in
    # lockstep instead of being indexed by position.
    for abstract, bow in zip(abstracts, corpus):
        # weight the document once and feed the same vector to both models
        weighted = tfidf[bow]
        lsaVec = lsaModel[weighted]
        ldaVec = ldaModel[weighted]

        lsaVector = defaultdict(float)
        for entry in lsaVec:
            lsaVector[entry[0]] = entry[1]

        ldaVector = defaultdict(float)
        for entry in ldaVec:
            ldaVector[entry[0]] = entry[1]

        abstract.Set('lsa', lsaVector)
        abstract.Set('lda', ldaVector)
        abstract.Set('numtopics', num)
def process(filename, numtopics=10):
    '''
    Serial processing of abstracts, for evaluation purposes.

    Steps, in order:
      1. read stop words from 'stopwords.txt' (one per line);
      2. call Process.load and Process.create_dict, which presumably fill
         the ``abstracts`` and ``dictionary`` lists in place (both start
         empty here and are read afterwards);
      3. drop from every abstract's 'cleantext' the words that are not in
         the dictionary;
      4. store 'bow', 'bownum' and 'bigram' on every abstract and count, per
         key, the number of abstracts it occurs in;
      5. run Process.serial_tfidf for 'bow' and for 'bigram';
      6. build a gensim dictionary/corpus (saved to 'abstracts.dict' and
         'abstracts.mm'), fit LSA and LDA models and store 'lsa', 'lda' and
         'numtopics' on every abstract.

    Args:
        filename: forwarded unchanged to Process.load.
        numtopics: number of topics for the LSA and LDA models. Defaults to
            10, the value that used to be hard-coded. The LDA priors are
            derived from it as alpha=50.0/numtopics and eta=2.0/numtopics,
            so it must be non-zero.

    Returns:
        The list of abstracts with all of the keys above set.
    '''
    abstracts = []
    dictionary = []

    # load stop words
    # NOTE(review): 'rU' is the Python 2 universal-newline mode and is kept
    # because this file targets Python 2; the 'U' flag was removed from
    # open() in Python 3.11.
    stops = set()
    stop_file = 'stopwords.txt'
    with open(stop_file, 'rU') as stopFile:
        for row in stopFile:
            stops.add(row.replace('\n', ''))

    dictlist = Process.load(filename, abstracts, stops)

    # create dictionary
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary
    # Membership is tested once per token of every abstract, so look words up
    # in a set built once instead of scanning the list each time.
    # NOTE(review): assumes dictionary entries are hashable tokens (strings).
    vocabulary = set(dictionary)
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext')
                   if word in vocabulary]
        abstract.Set('cleantext', abstext)

    dictlength = len(dictionary)
    bigramdict = []
    termbow = defaultdict(float)     # key -> number of abstracts with it in 'bow'
    termbigram = defaultdict(float)  # key -> number of abstracts with it in 'bigram'
    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow:
            termbow[ind] += 1.0

        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary,
                                                   bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram:
            termbigram[pair] += 1.0

    # create dict of tfidf
    # NOTE(review): len(bigramdict) is passed on the 'bow' call and not on the
    # 'bigram' call; kept exactly as found -- confirm against the signature
    # of Process.serial_tfidf that this is intended.
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    ##### TOPICS
    # prepare dictionary and corpora for topic modeling
    # (separate name: this is a gensim Dictionary, not the word list above)
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    topic_dictionary = corpora.Dictionary(docs)
    topic_dictionary.save('abstracts.dict')
    corpus = [topic_dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('abstracts.mm', corpus)

    # use gensim tfidf to transform
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # load lsa and lda models
    lsaModel = Lsa.serial(corpus_tfidf, topic_dictionary, numtopics)
    ldaModel = Lda.serial(corpus_tfidf, topic_dictionary, numtopics,
                          alpha=50.0/numtopics, eta=2.0/numtopics)

    # store lda and lsa representation in all abstracts
    # corpus was built from abstracts one-to-one, so walk them in lockstep.
    for abstract, bow in zip(abstracts, corpus):
        # weight the document once and feed the same vector to both models
        weighted = tfidf[bow]
        lsaVec = lsaModel[weighted]
        ldaVec = ldaModel[weighted]

        lsaVector = defaultdict(float)
        for entry in lsaVec:
            lsaVector[entry[0]] = entry[1]

        ldaVector = defaultdict(float)
        for entry in ldaVec:
            ldaVector[entry[0]] = entry[1]

        abstract.Set('lsa', lsaVector)
        abstract.Set('lda', ldaVector)
        abstract.Set('numtopics', numtopics)

    return abstracts