def main(BAVfile, sheet_name, LDAdir, modelName):
    """Build the BAV-vs-LDA comparison workbook.

    Loads the BAV survey sheet, keeps only the per-word percentage
    columns (``<Word>_pct``), reconciles brand names with the LDA
    dictionary, computes the likeness and symmetric-KL divergence
    matrices, and writes everything to
    ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Parameters
    ----------
    BAVfile : str -- path to the BAV Excel workbook
    sheet_name : str -- sheet to read from BAVfile
    LDAdir : str -- directory holding the trained LDA model files
    modelName : str -- model file-name stem inside LDAdir

    Returns
    -------
    tuple -- (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
    """
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0,
                                na_values=['NA'])
    # HACK: Total_Prefer_pct is the last useless column; skip column 0.
    #idx_first = BAV_raw.columns.get_loc('Brand_Asset_C')+1
    idx_first = 1
    # Keep only columns shaped exactly like "<Word>_pct".
    good_cols = [col for col in BAV_raw.columns[idx_first:]
                 if len(col.split("_")) == 2 and col.endswith('pct')]
    BAV_filtered = BAV_raw[good_cols]
    # Strip the "_pct" suffix so columns become plain attribute words.
    BAV_filtered.columns = [c.split("_")[0] for c in BAV_filtered.columns]
    # Filter brands -- depends on the dictionary creation way:
    # i.e. if '-' goes to space this will work; if '-' is dropped it will not.
    BAV_filtered = try_drop(BAV_filtered, 'General Motors (GM)')
    BAV_filtered = try_drop(BAV_filtered, 'Ford Motor Company')
    BAV_filtered = try_drop(BAV_filtered, 'Smart (car)')
    BAV_filtered = try_drop(BAV_filtered, 'Mini Cooper')
    # Collapse multi-token brand names to the single token the
    # dictionary is expected to contain.
    BAV_filtered = rename_row(BAV_filtered, 'Mercedes-Benz', 'Mercedes')
    BAV_filtered = rename_row(BAV_filtered, 'Mitsubishi Vehicles', 'Mitsubishi')
    BAV_filtered = rename_row(BAV_filtered, 'Rolls-Royce', 'Royce')
    BAV_filtered = rename_row(BAV_filtered, 'Aston Martin', 'Aston')
    BAV_filtered = rename_row(BAV_filtered, 'Alfa Romeo', 'Romeo')
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]
    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)
    # Align the BAV table with the brands/words that survived pruning.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    # FIX: .loc replaces the deprecated/removed .ix (label-based rows).
    BAV_filtered = BAV_filtered.loc[LDA_df.index]
    dirs = gslib.LDAdirs(modelName, LDAdir)
    (dict1, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index
    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # BUG FIX: was `writer.save` (bare attribute access, a no-op) --
    # the workbook was never flushed to disk.
    writer.save()
    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
def get_divs(words, brands, indir, modelName, topics_marginal_probs):
    """Symmetric KL divergences between brand and word topic profiles.

    Loads the named LDA model from `indir`, prunes `brands` and `words`
    against its dictionary, and returns a (brands x words) divergence
    DataFrame together with both pruned info tables.
    """
    model_paths = gslib.LDAdirs(modelName, indir)
    _, _, lda = gslib.loadStuff(model_paths)
    brand_info = pruneWordsList(brands, lda)
    word_info = pruneWordsList(words, lda)
    topic_probs = ptopic_given_word(lda, topics_marginal_probs)
    div_matrix = LDA_simmetricKLdiv(topic_probs,
                                    brand_info["IDs"],
                                    word_info["IDs"])
    div_df = pandas.DataFrame(div_matrix,
                              index=brand_info.index,
                              columns=word_info.index)
    return (div_df, brand_info, word_info)
def total_purge(words):
    """Tabulate per-year counts of `words` across the 2002-2012 models.

    For each yearly 20-topic model under Z:\\ermunds\\results, prunes
    `words` against that model's dictionary and collects the "Counts"
    column; returns one DataFrame with a row per year.
    """
    yearly_counts = []
    for yr in range(2002, 2013):
        print(yr)  # progress marker, one line per model year
        model_name = '%d 20topics' % yr
        model_dir = 'Z:\\ermunds\\results\\%d 20topics' % yr
        paths = gslib.LDAdirs(model_name, model_dir)
        _, _, lda = gslib.loadStuff(paths)
        pruned = getLikes.pruneWordsList(words, lda)
        counts = pruned["Counts"]
        counts.name = str(yr)
        yearly_counts.append(counts)
    return pandas.DataFrame(yearly_counts)
def get_divs(words, brands, indir, modelName, topics_marginal_probs):
    """Return (divs_df, brands_df, words_df) for the given LDA model.

    divs_df holds the symmetric KL divergence between the topic
    distribution of every pruned brand and every pruned word.
    """
    paths = gslib.LDAdirs(modelName, indir)
    dict1, _, lda = gslib.loadStuff(paths)
    pruned_brands = pruneWordsList(brands, lda)
    pruned_words = pruneWordsList(words, lda)
    topic_given_word = ptopic_given_word(lda, topics_marginal_probs)
    kl = LDA_simmetricKLdiv(topic_given_word,
                            pruned_brands["IDs"],
                            pruned_words["IDs"])
    return (pandas.DataFrame(kl,
                             index=pruned_brands.index,
                             columns=pruned_words.index),
            pruned_brands,
            pruned_words)
def get_likes(words, brands, indir, modelName):
    """Normalized brand-x-word likeness matrix from an LDA model.

    Scalar products of topic profiles, normalized by the geometric mean
    of each side's LDA weight (a cosine-style similarity). Returns the
    similarity DataFrame plus both pruned info tables.
    """
    model_paths = gslib.LDAdirs(modelName, indir)
    _, _, lda = gslib.loadStuff(model_paths)
    brand_info = pruneWordsList(brands, lda)
    word_info = pruneWordsList(words, lda)
    topic_word_probs = pword_given_topic(lda)
    raw_sims = LDAscalarProd(topic_word_probs,
                             brand_info["IDs"],
                             word_info["IDs"])
    word_weights = LDAweights(lda, word_info["IDs"])
    brand_weights = LDAweights(lda, brand_info["IDs"])
    norm = numpy.sqrt(numpy.outer(brand_weights, word_weights))
    likeness = pandas.DataFrame(raw_sims / norm,
                                index=brand_info.index,
                                columns=word_info.index)
    return (likeness, brand_info, word_info)
def get_likes(words, brands, indir, modelName):
    """Compute the weight-normalized similarity of brands to words.

    Returns (sims_df, brands_df, words_df) where sims_df[b, w] is the
    topic-space scalar product of brand b and word w divided by
    sqrt(weight(b) * weight(w)).
    """
    loaded = gslib.loadStuff(gslib.LDAdirs(modelName, indir))
    lda = loaded[2]
    pruned_brands = pruneWordsList(brands, lda)
    pruned_words = pruneWordsList(words, lda)
    p_word_topic = pword_given_topic(lda)
    dot_products = LDAscalarProd(p_word_topic,
                                 pruned_brands["IDs"],
                                 pruned_words["IDs"])
    w_words = LDAweights(lda, pruned_words["IDs"])
    w_brands = LDAweights(lda, pruned_brands["IDs"])
    normalized = dot_products / numpy.sqrt(numpy.outer(w_brands, w_words))
    return (pandas.DataFrame(normalized,
                             index=pruned_brands.index,
                             columns=pruned_words.index),
            pruned_brands,
            pruned_words)
def total_purge(words):
    """Word-count table across yearly LDA models (2002..2012).

    Each row of the returned DataFrame is one year's "Counts" series
    for `words` pruned against that year's 20-topic model dictionary.
    """
    rows = []
    for year in range(2002, 2013):
        print(year)  # progress: which yearly model is loading
        name = '%d 20topics' % year
        directory = 'Z:\\ermunds\\results\\%d 20topics' % year
        _, _, lda = gslib.loadStuff(gslib.LDAdirs(name, directory))
        year_counts = getLikes.pruneWordsList(words, lda)["Counts"]
        year_counts.name = str(year)
        rows.append(year_counts)
    return pandas.DataFrame(rows)
# -*- coding: utf-8 -*- """ Created on Fri May 17 14:31:58 2013 @author: Vasya """ #import gensim import genSimLDAlib as gslib import mess_with_sims as sims import numpy dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\2012 20topics", modelName="201220topics") docsfilename=dirs.allDocsFileName (dict1,mm,lda)=gslib.loadStuff(dirs) brands = sims.BrandsClustered_1 # decompose a post into topics and ptint them gslib.make_sense(1,lda,mm,docsfilename) # guess the topic of concept list consepts= ['cheap','ugly','unrelaible'] consepts= ['young','trendy','fast','macho'] # fail consepts= ['green','environment','sustainable','hybrid'] #n75 consepts= ['reliable','safe'] # n8 consepts= 'air hot heat cool exhaust system fan coolant temp blow'.split() # n5 ws,IDl,ID2index = gslib.world_list2IDs(dict1,consepts,tokenizef=gslib.wordCleanUp) for t,p in lda[ [(t,1) for t in IDl] ]: print '__with prob:{}% is N{}: {}'.format(int(p*100),t, ' '.join([w for _,w in lda.show_topic(t,10)]))
# Exploratory script: scatter plot of word counts vs. sqrt(LDA weight)
# for the clustered brand list, under the 2005 20-topic model.
import getLikes
import numpy as np
import genSimLDAlib as gslib
import mess_with_sims

indir=r"Z:\ermunds\results\2005 20topics"
modelName="2005+20topics"
dirs = gslib.LDAdirs(modelName,indir)
(dict1,_,lda)=gslib.loadStuff(dirs)

# Words to inspect: the clustered brand names (alternative: the whole
# model vocabulary, see commented line below).
words = mess_with_sims.BrandsClustered_1
#words = lda.id2word.values()

# presumably filters `words` down to those present in the model
# dictionary, keeping their IDs/Counts -- confirm in getLikes.
wordsClean = getLikes.pruneWordsList(words,lda)
weights = getLikes.LDAweights(lda,wordsClean["IDs"])

import matplotlib.pyplot as plt
yy = wordsClean.Counts
xx = np.sqrt(weights)
labels = wordsClean.index   # NOTE: computed but not used in the plot call
plt.scatter(xx,yy, marker='.')
def compute_and_save(modelName="201220topics",
                     LDAdir=r"Z:\ermunds\results\2012 20topics"):
    """Compute the model's marginal topic distribution and persist it.

    Writes the vector to ``<LDAdir>/topics_marginal.csv`` and also
    returns it.
    """
    model_paths = gslib.LDAdirs(indir=LDAdir, modelName=modelName)
    _, corpus, lda = gslib.loadStuff(model_paths)
    marginal = marginal_topic_distribution(lda, corpus)
    out_path = os.path.join(LDAdir, "topics_marginal.csv")
    np.savetxt(out_path, marginal, delimiter=",")
    return marginal
def main(BAVfile, sheet_name, LDAdir, modelName):
    """Compare BAV survey percentages against LDA-derived similarities.

    Reads the BAV sheet, keeps the ``<Word>_pct`` columns, reconciles
    brand names with the LDA dictionary, computes likeness and
    symmetric-KL matrices, and saves a multi-sheet workbook to
    ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Parameters
    ----------
    BAVfile : str -- path to the BAV Excel workbook
    sheet_name : str -- sheet to read from BAVfile
    LDAdir : str -- directory holding the trained LDA model files
    modelName : str -- model file-name stem inside LDAdir

    Returns
    -------
    tuple -- (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
    """
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0,
                                na_values=['NA'])
    # HACK: Total_Prefer_pct is the last useless column; skip column 0.
    #idx_first = BAV_raw.columns.get_loc('Brand_Asset_C')+1
    idx_first = 1
    # Keep only columns shaped exactly like "<Word>_pct".
    good_cols = [col for col in BAV_raw.columns[idx_first:]
                 if len(col.split("_")) == 2 and col.endswith('pct')]
    BAV_filtered = BAV_raw[good_cols]
    # Strip the "_pct" suffix so columns become plain attribute words.
    BAV_filtered.columns = [c.split("_")[0] for c in BAV_filtered.columns]
    # Filter brands -- depends on the dictionary creation way:
    # i.e. if '-' goes to space this will work; if '-' is dropped it will not.
    BAV_filtered = try_drop(BAV_filtered, 'General Motors (GM)')
    BAV_filtered = try_drop(BAV_filtered, 'Ford Motor Company')
    BAV_filtered = try_drop(BAV_filtered, 'Smart (car)')
    BAV_filtered = try_drop(BAV_filtered, 'Mini Cooper')
    # Collapse multi-token brand names to the single token the
    # dictionary is expected to contain.
    BAV_filtered = rename_row(BAV_filtered, 'Mercedes-Benz', 'Mercedes')
    BAV_filtered = rename_row(BAV_filtered, 'Mitsubishi Vehicles', 'Mitsubishi')
    BAV_filtered = rename_row(BAV_filtered, 'Rolls-Royce', 'Royce')
    BAV_filtered = rename_row(BAV_filtered, 'Aston Martin', 'Aston')
    BAV_filtered = rename_row(BAV_filtered, 'Alfa Romeo', 'Romeo')
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]
    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)
    # Align the BAV table with the brands/words that survived pruning.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    # FIX: .loc replaces the deprecated/removed .ix (label-based rows).
    BAV_filtered = BAV_filtered.loc[LDA_df.index]
    dirs = gslib.LDAdirs(modelName, LDAdir)
    (dict1, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index
    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # BUG FIX: was `writer.save` (bare attribute access, a no-op) --
    # the workbook was never flushed to disk.
    writer.save()
    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)