def main(indir=r"Z:\ermunds\results\2005 20t unbranded",
         modelName="2005 20topics",
         world_list_file_name="adjectives.txt"):
    """Compute brand-vs-word similarity matrices for one trained LDA model
    and dump them to an Excel workbook in *indir*.

    Parameters
    ----------
    indir : str
        Directory holding the trained model artefacts.
    modelName : str
        Base name of the dictionary/model files.
    world_list_file_name : str
        Text file with one adjective per line (parameter name kept for
        backward compatibility; "word_list_file_name" was likely intended).
    """
    dirs = gslib.LDAdirs(modelName, indir)
    dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)
    # FIX: load() is a classmethod returning the persisted model; the
    # original built a throwaway LdaModel(id2word=dict1) instance whose
    # state was discarded by the load call.
    lda = gensim.models.ldamodel.LdaModel.load(dirs.modelFname)

    brands = lists.BrandsClustered_1
    adjs = words_from_file(world_list_file_name)
    adjs.extend(brands)  # include brands among the words as a sanity check

    # Keep only tokens that actually occur in the model vocabulary.
    brs = pruneWordsList(brands, lda)
    wrds = pruneWordsList(adjs, lda)

    sims = LDAscalarProd(lda, brs["IDs"], wrds["IDs"])
    wordsWeights = LDAweights(lda, wrds["IDs"])
    brandsWeights = LDAweights(lda, brs["IDs"])
    # Cosine-style normalisation of the raw scalar products.
    sims_norm = sims / numpy.sqrt(numpy.outer(brandsWeights, wordsWeights))

    sims_norm_df = pandas.DataFrame(sims_norm, index=brs.index, columns=wrds.index)
    sims_df = pandas.DataFrame(sims, index=brs.index, columns=wrds.index)
    stats = pandas.Series([indir, modelName], index=['indir', 'modelName'])
    stats_df = pandas.DataFrame(stats)

    writer = pandas.ExcelWriter(os.path.join(indir, modelName + '.xlsx'))
    sims_norm_df.to_excel(writer, sheet_name='cosine distance')
    sims_df.to_excel(writer, sheet_name='raw scalar products')
    brs.to_excel(writer, sheet_name='brands')
    wrds.to_excel(writer, sheet_name='words')
    stats_df.to_excel(writer, sheet_name='stats')
    writer.save()
def total_purge(words): cols = list() for year in range(2002, 2013): print year modelDir = 'Z:\\ermunds\\results\\%d 20topics' % year modelName = '%d 20topics' % year dirs = gslib.LDAdirs(modelName, modelDir) (dict1, _, lda) = gslib.loadStuff(dirs) words_df = getLikes.pruneWordsList(words, lda) col = words_df["Counts"] col.name = str(year) cols.append(col) stats = pandas.DataFrame(cols) return stats
def get_divs(words, brands, indir, modelName, topics_marginal_probs):
    """Symmetric KL divergence between brand and word topic profiles.

    Returns (divergence DataFrame, pruned brands frame, pruned words frame).
    """
    model_dirs = gslib.LDAdirs(modelName, indir)
    _, _, lda = gslib.loadStuff(model_dirs)
    pruned_brands = pruneWordsList(brands, lda)
    pruned_words = pruneWordsList(words, lda)
    # p(topic | word) table, weighted by the marginal topic probabilities.
    topic_posteriors = ptopic_given_word(lda, topics_marginal_probs)
    divergences = LDA_simmetricKLdiv(topic_posteriors,
                                     pruned_brands["IDs"],
                                     pruned_words["IDs"])
    divs_df = pandas.DataFrame(divergences,
                               index=pruned_brands.index,
                               columns=pruned_words.index)
    return (divs_df, pruned_brands, pruned_words)
def get_likes(words, brands, indir, modelName):
    """Cosine-normalised scalar products between brand and word vectors.

    Returns (similarity DataFrame, pruned brands frame, pruned words frame).
    """
    model_dirs = gslib.LDAdirs(modelName, indir)
    _, _, lda = gslib.loadStuff(model_dirs)
    pruned_brands = pruneWordsList(brands, lda)
    pruned_words = pruneWordsList(words, lda)
    word_topic_probs = pword_given_topic(lda)
    raw_sims = LDAscalarProd(word_topic_probs,
                             pruned_brands["IDs"],
                             pruned_words["IDs"])
    word_weights = LDAweights(lda, pruned_words["IDs"])
    brand_weights = LDAweights(lda, pruned_brands["IDs"])
    # Normalise by the geometric mean of the two weight vectors (cosine-like).
    normalised = raw_sims / numpy.sqrt(numpy.outer(brand_weights, word_weights))
    sims_df = pandas.DataFrame(normalised,
                               index=pruned_brands.index,
                               columns=pruned_words.index)
    return (sims_df, pruned_brands, pruned_words)
def main():
    """Re-sort the brand similarity matrix into the clustered brand order
    and save a tweaked CSV plus a PNG heatmap."""
    dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all unbranded threads 2",
                         modelName="unbranded220topics")
    CSVin = "simsN_posts"
    CSVout = "simsNtweaked"
    suffix = ''
    figName = 'heatmap_from_posts_no whitening' + suffix
    sims, brands = mp.loadCSV(dirs, CSVin)
    # FIX: take a copy — the original aliased the module-level list and the
    # `del` below silently mutated BrandsClustered_1 for every other user.
    nbrands = list(BrandsClustered_1)
    # caps bug of may 14
    del nbrands[nbrands.index('mercedes-benz')]
    # Map each clustered brand to its row index in the loaded matrix.
    idx = numpy.zeros(len(nbrands), dtype=int)
    for i, b in enumerate(nbrands):
        idx[i] = brands.index(b)
    # (a block of dead manual-sorting experiments was removed here)
    (sims, nbrands) = select(sims, brands, idx)
    mp.saveCSV(dirs, CSVout, nbrands, sims)
    draw.main(dirs, CSVout, figName)
def main(
        outdir=r'Z:\ermunds\results\2005 20t unbranded',
        num_passes=2,
        n_repeat=10,
        num_topics=20,
        threadChoseStr='',
        modelTag='2005+',
        time_low_cutoff=time.strptime("1 Jan 2005", "%d %b %Y"),
        time_hi_cutoff=time.strptime("1 Jan 2006", "%d %b %Y"),
):
    """End-to-end LDA training pipeline: collect posts, build a dictionary
    and corpus, then train and periodically checkpoint an LDA model.

    time_low_cutoff / time_hi_cutoff: posts are chosen strictly between
    these two dates (struct_time defaults are evaluated once at import
    time, which is fine since struct_time is immutable).
    threadChoseStr: filter thread/topic names by this phrase.
    Returns the model name used for the saved files.
    """
    dTr = bf.notMain(threadChoseStr)
    modelName = modelTag + str(num_topics) + 'topics'
    dirs = gslib.LDAdirs(modelName, outdir)
    # NOTE(review): pickle into a text-mode append handle — works on py2/
    # Unix, but should be 'ab' for binary safety; confirm before porting.
    with open(dirs.dataFileName, 'a') as file1:
        pickle.dump(dTr, file1)
    ## setup logging to file and console
    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(dirs.logFileName)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)-12s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)
    ## get threads, extract post texts and save to single file
    # 7min per 1GB
    logging.log(logging.INFO, "building doc list")
    lineCounter = 0
    with open(dirs.allDocsFileName, 'a') as docDumpFile:
        for Trlist in dTr.values():
            for Tr in Trlist:
                for p in Tr.getPosts():
                    # keep only posts strictly inside the date window
                    if (p.msgTime > time_low_cutoff) and (p.msgTime < time_hi_cutoff):
                        doc = gslib.textCleanUp(
                            p.msgTitle) + gslib.textCleanUp(p.msgText)
                        lineCounter += 1
                        print(doc, file=docDumpFile)
    logging.log(logging.INFO, "total {} docs ".format(lineCounter))
    #build dict 1.5H/GB
    dict1 = gslib.build_dict(dirs)
    dict1.save(dirs.dictFileName)
    #dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)
    #pipe docfile to gensim corpus
    #fixme - corpusAdapter missing a len() property
    corpus = gslib.corpusAdapter(dirs.allDocsFileName, id2word=dict1)
    gensim.corpora.MmCorpus.serialize(fname=dirs.corpusFname,
                                      corpus=corpus, id2word=dict1)
    mm = gensim.corpora.MmCorpus(dirs.corpusFname)
    ## run the LDA (2h per update on 2M posts)
    # first runs a small step and then update 9 times saving results to
    # disk every time
    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dict1,
                                          num_topics=num_topics,
                                          update_every=0,
                                          passes=num_passes)
    lda.save(dirs.modelFname + "_0")
    # NOTE(review): loop nesting reconstructed from a collapsed source
    # line — verify that the topic logging belongs inside this loop.
    for i in xrange(n_repeat - 1):
        lda.update(mm)
        # save intermediate result
        lda.save(dirs.modelFname + "_" + str(i + 1))
        for t in lda.show_topics(-1):
            logging.info(str('all topics here') + t)
    lda.save(dirs.modelFname)
    # detach handlers so repeated calls do not duplicate log lines
    logger.removeHandler(ch)
    logger.removeHandler(fh)
    return modelName
# -*- coding: utf-8 -*- """ Created on Tue May 14 23:23:59 2013 @author: Vasya """ import gensim import logging import genSimLDAlib as gslib #import someBrandFiltering as bf dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\all unbranded threads 2", modelName="unbranded220topics") logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S', filename=dirs.logFileName, filemode='a') console = logging.StreamHandler() console.setLevel(logging.INFO) formatter = logging.Formatter( '%(asctime)s %(name)-12s %(levelname)-8s %(message)s') console.setFormatter(formatter) logging.getLogger('').addHandler(console) logging.info("ading more from beh.py") dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName) mm = gensim.corpora.MmCorpus(dirs.corpusFname)
"""
# NOTE(review): the quotes above close a module docstring whose opening
# lies outside this chunk.
import getLikes
import numpy as np
import genSimLDAlib as gslib
import mess_with_sims

# Model location for the 2005 run.
indir = r"Z:\ermunds\results\2005 20topics"
modelName = "2005+20topics"
dirs = gslib.LDAdirs(modelName, indir)
(dict1, _, lda) = gslib.loadStuff(dirs)

# Use the clustered brand list as the word set.
words = mess_with_sims.BrandsClustered_1
#words = lda.id2word.values()
wordsClean = getLikes.pruneWordsList(words, lda)
weights = getLikes.LDAweights(lda, wordsClean["IDs"])

import matplotlib.pyplot as plt
# Scatter-plot inputs: corpus counts vs sqrt of the LDA weights.
yy = wordsClean.Counts
xx = np.sqrt(weights)
labels = wordsClean.index
def compute_and_save(modelName="201220topics",
                     LDAdir=r"Z:\ermunds\results\2012 20topics"):
    """Compute the corpus-wide marginal topic distribution for one model,
    write it to <LDAdir>/topics_marginal.csv, and return the array."""
    model_dirs = gslib.LDAdirs(indir=LDAdir, modelName=modelName)
    _, corpus, lda = gslib.loadStuff(model_dirs)
    marginals = marginal_topic_distribution(lda, corpus)
    out_path = os.path.join(LDAdir, "topics_marginal.csv")
    np.savetxt(out_path, marginals, delimiter=",")
    return marginals
# -*- coding: utf-8 -*- """ Created on Tue May 14 23:52:27 2013 @author: Vasya """ import gensim import logging import genSimLDAlib as gslib #import someBrandFiltering as bf num_topics = 100 num_passes = 2 dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\sink", modelName="unbranded220topics") logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S', filename=dirs.logFileName, filemode='a') console = logging.StreamHandler() console.setLevel(logging.INFO) formatter = logging.Formatter( '%(asctime)s %(name)-12s %(levelname)-8s %(message)s') console.setFormatter(formatter) logging.getLogger('').addHandler(console) logging.info("adding more from beh2.py") dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)
def main(BAVfile, sheet_name, LDAdir, modelName):
    """Compare BAV survey percentages against LDA-derived brand/word
    similarities and export everything to one Excel workbook.

    Parameters
    ----------
    BAVfile : str
        Path to the BAV Excel workbook.
    sheet_name : str
        Sheet to read from *BAVfile*.
    LDAdir, modelName : str
        Location and base name of the trained LDA model.

    Returns (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo).
    """
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0,
                                na_values=['NA'])
    #Hack!!! Total_Prefer_pct is the last useless col
    #idx_first = BAV_raw.columns.get_loc('Brand_Asset_C')+1
    idx_first = 1
    # Keep only columns of the form "<Word>_pct".
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]
    BAV_filtered = BAV_raw[good_cols]
    BAV_filtered.columns = map(lambda x: x.split("_")[0], BAV_filtered.columns)
    # filter brands - depends on the dictionary creation:
    # if '-' goes to space this will work; if '-' is dropped then it will not
    BAV_filtered = try_drop(BAV_filtered, 'General Motors (GM)')
    BAV_filtered = try_drop(BAV_filtered, 'Ford Motor Company')
    BAV_filtered = try_drop(BAV_filtered, 'Smart (car)')
    BAV_filtered = try_drop(BAV_filtered, 'Mini Cooper')
    BAV_filtered = rename_row(BAV_filtered, 'Mercedes-Benz', 'Mercedes')
    BAV_filtered = rename_row(BAV_filtered, 'Mitsubishi Vehicles', 'Mitsubishi')
    BAV_filtered = rename_row(BAV_filtered, 'Rolls-Royce', 'Royce')
    BAV_filtered = rename_row(BAV_filtered, 'Aston Martin', 'Aston')
    BAV_filtered = rename_row(BAV_filtered, 'Alfa Romeo', 'Romeo')
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]
    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)
    # Align the BAV table to the LDA matrix's column/row order.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.ix[LDA_df.index]
    dirs = gslib.LDAdirs(modelName, LDAdir)
    (dict1, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index
    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # FIX: was `writer.save` — a bare attribute access that never called
    # save(), so the workbook was never written to disk.
    writer.save()
    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
# -*- coding: utf-8 -*- """ Created on Mon May 13 09:28:54 2013 @author: Vasya """ import ldaModel2bradsSims_direct as mp import sims_csv_plotter import genSimLDAlib as gsLib import gensim #dirs = gsLib.LDAdirs(modelName = 'PricesStemmed20passes_20topics',indir = r"Z:\ermunds\results\1 prices paid\5-6-2013") dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all branded threads", modelName="All2passes_20topics") dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName) lda = gensim.models.ldamodel.LdaModel(id2word=dict1).load(dirs.modelFname) simsR, brands = mp.corrBrands(lda) sims = mp.normalize(simsR) ##mp.saveCSV(dirs,'simsN',brands,sims) #sims,brands= mp.loadCSV(dirs,"simsN") ##sims_csv_plotter.main(dirs,CSVin="simsN",figName='fromTopics') brands[brands.index('mercedes-benz')] = 'mercedesbenz' mp.likes(sims, brands, brands[0]) print 'tada' mp.likes2(lda, brands, word='luxury') # looks like sims cvs
# -*- coding: utf-8 -*- """ Created on Fri May 17 14:31:58 2013 @author: Vasya """ #import gensim import genSimLDAlib as gslib import mess_with_sims as sims import numpy dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\2012 20topics", modelName="201220topics") docsfilename = dirs.allDocsFileName (dict1, mm, lda) = gslib.loadStuff(dirs) brands = sims.BrandsClustered_1 # decompose a post into topics and ptint them gslib.make_sense(1, lda, mm, docsfilename) # guess the topic of concept list consepts = ['cheap', 'ugly', 'unrelaible'] consepts = ['young', 'trendy', 'fast', 'macho'] # fail consepts = ['green', 'environment', 'sustainable', 'hybrid'] #n75 consepts = ['reliable', 'safe'] # n8 consepts = 'air hot heat cool exhaust system fan coolant temp blow'.split( ) # n5 ws, IDl, ID2index = gslib.world_list2IDs(dict1, consepts, tokenizef=gslib.wordCleanUp)
# NOTE(review): `modelDir`, `modelName` and the imports (np, pd, os,
# getLikes, gslib) are defined earlier in this file, outside this chunk.
topicsPs = np.genfromtxt(os.path.join(modelDir, 'topics_marginal.csv'))
words = getLikes.words_from_file(r"Z:\ermunds\adjectives.txt")
brands = getLikes.words_from_file(r"Z:\ermunds\brands.txt")
# KL divergences and cosine-style similarities for the same word/brand sets.
(divs, _, _) = getLikes.get_divs(words, brands, indir=modelDir,
                                 modelName=modelName,
                                 topics_marginal_probs=topicsPs)
(sims, b, w) = getLikes.get_likes(words, brands, indir=modelDir,
                                  modelName=modelName)
dirs = gslib.LDAdirs(modelName, modelDir)
(dict1, _, lda) = gslib.loadStuff(dirs)
brands_df = getLikes.pruneWordsList(brands, lda)
words_df = getLikes.pruneWordsList(words, lda)
# p(topic | word) table, restricted to the pruned brand+word IDs.
probs = getLikes.ptopic_given_word(lda, topicsPs)
probs_df = pd.DataFrame(probs, columns=lda.id2word.values())
alls = pd.concat([brands_df["IDs"], words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index
# Export to Excel.
writer = pd.ExcelWriter(os.path.join(modelDir, modelName + '_new.xlsx'))
sims.to_excel(writer, sheet_name='cosine distance')
divs.to_excel(writer, sheet_name='KL divs')
b.to_excel(writer, sheet_name='brands')
# NOTE(review): chunk ends here — remaining sheets and writer.save()
# presumably follow outside this chunk.
# NOTE(review): the next two statements are the tail of a function whose
# `def` line lies outside this chunk; their indentation is assumed.
    fig.savefig(figfname)
    plt.show()


def plotSims(sims, brands, dirs, figName='heatmap'):
    """Render *sims* as a brand-labelled heatmap and save it as
    <dirs.indir>\\<figName>.png (also shown interactively)."""
    fig = plt.figure(figsize=(11, 9))
    # axes placement gets ruined by the colorbar; fixed by set_position below
    ax = fig.add_axes([.1, .1, .8, .8])
    imgplot = ax.imshow(sims, interpolation='none')
    idx = xrange(len(brands))
    ax.set_yticks(idx)
    # rows labelled as (brand, row index) pairs
    ax.set_yticklabels(zip(brands, idx))
    ax.set_xticks(idx)
    ax.set_xticklabels(brands, rotation=90)
    plt.colorbar(imgplot)
    ax.set_position([.1, .2, .6, .6])
    figfname = dirs.indir + '\\' + figName + '.png'
    fig.savefig(figfname)
    plt.show()


if __name__ == '__main__':
    dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\sink",
                         modelName="All2passes_20topics")
    main(dirs)