import numpy

import bf      # local helper module: brand/make lists
import gsLib   # local helper module: text clean-up


def corrBrands(lda, brandListFileName=r".\wordlists\brands.txt"):
    # keep single-word brand names only
    brands = [b for b in bf.getMakes(brandListFileName) if b.find(' ') == -1]
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]

    # drop brands whose token is not in the LDA dictionary
    bad_brands = []
    for (i, brt) in enumerate(br_tokens):
        try:
            ID = lda.id2word.token2id[brt]
            print brands[i], lda.id2word.dfs[ID]
        except KeyError:
            print 'no ' + brt + ' in dict'
            bad_brands.append(brands[i])

    brands = sorted(list(set(brands) - set(bad_brands)))
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]
    br_ids = [lda.id2word.token2id[brt] for brt in br_tokens]

    # normalise each topic's word weights, then score each brand pair by the
    # probability mass the two brands share across topics
    topics = lda.state.get_lambda()
    topics = [topic / topic.sum() for topic in topics]
    l = len(brands)
    sims = numpy.zeros((l, l))
    for i in xrange(l):
        for j in xrange(l):
            sims[i, j] = sum([t[br_ids[i]] * t[br_ids[j]] for t in topics])
    return (sims, brands)
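# Minimal usage sketch for corrBrands (added for illustration): the model path below is
# hypothetical, and only corrBrands itself comes from the code above. For each retained
# brand it prints the most similar other brand under the topic-overlap score.
import gensim

lda_model = gensim.models.ldamodel.LdaModel.load(r".\models\PricesStemmed20passes_20topics.lda")  # hypothetical path
sims, kept_brands = corrBrands(lda_model)
for i, b in enumerate(kept_brands):
    row = sims[i].copy()
    row[i] = -1.0          # ignore self-similarity on the diagonal
    j = row.argmax()
    print b, '->', kept_brands[j], row[j]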
import gensim

# authPostCount (author -> number of posts) is built in an earlier step
len([c for c in authPostCount.values() if c > 5])  # 53,068 authors with more than 5 posts

s = r"Z:\ermunds\results\1 prices paid\5-6-2013\PricesStemmed20passes_20topics.dict"
dict1 = gensim.corpora.dictionary.Dictionary().load(s)

# dump (document frequency, token) pairs, most frequent first
tupl = []
for ID in dict1.keys():
    tupl.append((dict1.dfs[ID], dict1[ID]))
tupl = sorted(tupl, reverse=True)
with open('wordCounts', 'w') as f:
    for t in tupl:
        f.write(str(t) + '\n')

# document frequency of each brand token in the dictionary
lst = []
for b in bf.getMakes():
    token = gslib.wordCleanUp(gslib.textCleanUp(b))
    try:
        ID = dict1.token2id[token]
        fr = dict1.dfs[ID]
        print b, fr, token
        lst.append((fr, b))
    except KeyError:
        print b, 'fail', token
lst = sorted(lst)

fname = 'brand_mentions_count.txt'
with open(fname, 'w') as outfile:
    for t in lst:
        outfile.write(t[1] + ":" + str(t[0]) + '\n')
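# Sketch of reading the brand counts back in, assuming the "brand:count" format written
# above; the top-10 cutoff is only for illustration.
with open('brand_mentions_count.txt') as infile:
    pairs = [line.strip().rsplit(':', 1) for line in infile if line.strip()]
top = sorted(((int(c), b) for b, c in pairs), reverse=True)
for c, b in top[:10]:
    print b, c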
s = r"Z:\ermunds\results\1 prices paid\5-6-2013\PricesStemmed20passes_20topics.dict" dict1 = gensim.corpora.dictionary.Dictionary().load(s) tupl = [] for ID in dict1.keys(): tupl.append((dict1.dfs[ID],dict1[ID])) tupl=sorted(tupl,reverse=True) with open('wordCounts','w') as f: for t in tupl: f.write(str(t)+'\n') lst= [] for b in bf.getMakes(): token = gslib.wordCleanUp(gslib.textCleanUp(b)) try: ID=dict1.token2id[token] fr=dict1.dfs[ID] print b,fr,token lst.append((fr,b)) except KeyError: print b,'fail',token lst = sorted(lst) fname = 'brand_mentions_count.txt' with open(fname,'w') as outfile: for t in lst: outfile.write(t[1]+":"+str(t[0])+'\n')