Example #1
def main(BAVfile,sheet_name,LDAdir,modelName):
    BAV_raw= pandas.read_excel(BAVfile, sheet_name, index_col=0, na_values=['NA'])
    #Hack!!! Total_Prefer_pct is the last useless column
    #idx_first = BAV_raw.columns.get_loc('Brand_Asset_C')+1
    idx_first = 1
    good_cols = [col for col in BAV_raw.columns[idx_first:] if len(col.split("_"))==2 and col.endswith('pct')]
    
    BAV_filtered = BAV_raw[good_cols]
    BAV_filtered.columns = map(lambda x: x.split("_")[0],BAV_filtered.columns)
    
    # filter brands - depends on how the dictionary was created,
    # i.e. if '-' is mapped to a space this will work,
    # if '-' is dropped then it will not
    
    BAV_filtered = try_drop(BAV_filtered,'General Motors (GM)')
    BAV_filtered = try_drop(BAV_filtered,'Ford Motor Company')
    BAV_filtered = try_drop(BAV_filtered,'Smart (car)')
    BAV_filtered = try_drop(BAV_filtered,'Mini Cooper')
    
    
    BAV_filtered= rename_row(BAV_filtered,'Mercedes-Benz','Mercedes')
    BAV_filtered = rename_row(BAV_filtered,'Mitsubishi Vehicles','Mitsubishi')
    BAV_filtered = rename_row(BAV_filtered,'Rolls-Royce','Royce')
    BAV_filtered = rename_row(BAV_filtered,'Aston Martin','Aston')
    BAV_filtered = rename_row(BAV_filtered,'Alfa Romeo','Romeo')
    
    
    
    words=  [w.encode() for w in BAV_filtered.columns]
    brands= [b.encode() for b in BAV_filtered.index]
    
    topicsPs = np.genfromtxt(os.path.join(LDAdir,'topics_marginal.csv'))
    (LDA_df,BrandsInfo,WordsInfo) = getLikes.get_likes(words=words,brands=brands,indir=LDAdir, modelName=modelName)
    (divs,_,_) = getLikes.get_divs (words,brands,indir=LDAdir, modelName=modelName ,topics_marginal_probs=topicsPs)
    
    
    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.ix[LDA_df.index]
    
    dirs = gslib.LDAdirs(modelName,LDAdir)
    (dict1,_,lda)=gslib.loadStuff(dirs)  
    probs = getLikes.ptopic_given_word(lda,topicsPs)
    probs_df =  pandas.DataFrame(probs, columns=lda.id2word.values())
    alls = pandas.concat([ BrandsInfo["IDs"] ,WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index
    
    writer = pandas.ExcelWriter(os.path.join(LDAdir,modelName+'_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs') 
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    
    x.to_excel(writer, sheet_name='p_topic_given_word')
    writer.save()
    return (LDA_df,BAV_filtered,divs,BrandsInfo,WordsInfo)
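# A minimal usage sketch for main() above.  The workbook path and sheet name
# are hypothetical placeholders; the model directory and model name are taken
# from the other examples in this listing.
results = main(BAVfile=r"Z:\ermunds\data\BAV_brands.xlsx",   # hypothetical path
               sheet_name="Automotive",                      # hypothetical sheet
               LDAdir=r"Z:\ermunds\results\2012 20topics",
               modelName="201220topics")
(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo) = results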
Example #2
def get_divs(words,brands,indir, modelName,topics_marginal_probs):
     
    dirs = gslib.LDAdirs(modelName,indir)
    (dict1,_,lda)=gslib.loadStuff(dirs)  
    
    brands_df = pruneWordsList(brands,lda)
    words_df = pruneWordsList(words,lda)
    
    probs = ptopic_given_word(lda,topics_marginal_probs)
    sims = LDA_simmetricKLdiv(probs,brands_df["IDs"],words_df["IDs"])
    
    sims_norm_df = pandas.DataFrame(sims, index=brands_df.index, columns=words_df.index)
    return(sims_norm_df,brands_df,words_df)
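# Hedged sketch of a symmetric KL divergence between two topic distributions,
# illustrating what LDA_simmetricKLdiv in get_divs() above presumably computes
# for every (brand, word) pair; this is an assumption, not the project's
# actual implementation.
import numpy

def symmetric_kl(p, q, eps=1e-12):
    # clip to avoid log(0), then renormalise so both vectors sum to 1
    p = numpy.clip(numpy.asarray(p, dtype=float), eps, None)
    q = numpy.clip(numpy.asarray(q, dtype=float), eps, None)
    p, q = p / p.sum(), q / q.sum()
    return numpy.sum(p * numpy.log(p / q)) + numpy.sum(q * numpy.log(q / p))
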
def total_purge(words):
    cols = list()
    for year in range(2002, 2013):
        print year
        modelDir = 'Z:\\ermunds\\results\\%d 20topics' % year
        modelName = '%d 20topics' % year
        dirs = gslib.LDAdirs(modelName, modelDir)
        (dict1, _, lda) = gslib.loadStuff(dirs)
        words_df = getLikes.pruneWordsList(words, lda)
        col = words_df["Counts"]
        col.name = str(year)
        cols.append(col)
    stats = pandas.DataFrame(cols)
    return stats
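# Possible usage of total_purge() above; the word list is a hypothetical
# placeholder.  The result has one row per year and one column per word.
stats = total_purge(['bmw', 'audi', 'toyota'])
stats.to_csv('word_counts_by_year.csv')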
Example #4
def get_divs(words, brands, indir, modelName, topics_marginal_probs):

    dirs = gslib.LDAdirs(modelName, indir)
    (dict1, _, lda) = gslib.loadStuff(dirs)

    brands_df = pruneWordsList(brands, lda)
    words_df = pruneWordsList(words, lda)

    probs = ptopic_given_word(lda, topics_marginal_probs)
    sims = LDA_simmetricKLdiv(probs, brands_df["IDs"], words_df["IDs"])

    sims_norm_df = pandas.DataFrame(sims,
                                    index=brands_df.index,
                                    columns=words_df.index)
    return (sims_norm_df, brands_df, words_df)
Example #5
def get_likes(words,brands,indir, modelName):   
    dirs = gslib.LDAdirs(modelName,indir)
    (dict1,_,lda)=gslib.loadStuff(dirs)  
    
    brands_df = pruneWordsList(brands,lda)
    words_df = pruneWordsList(words,lda)
    
    probs = pword_given_topic(lda)
    sims = LDAscalarProd(probs,brands_df["IDs"],words_df["IDs"])
    wordsWeights = LDAweights(lda,words_df["IDs"])
    brandsWeights = LDAweights(lda,brands_df["IDs"])
    sims_norm = sims/numpy.sqrt(numpy.outer(brandsWeights,wordsWeights))
    
    sims_norm_df = pandas.DataFrame(sims_norm, index=brands_df.index, columns=words_df.index)
    return(sims_norm_df,brands_df,words_df)
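# The normalisation in get_likes() above looks like a cosine similarity: the
# raw scalar products are divided by the product of the two vectors' norms.
# A stand-alone illustration with plain numpy on hypothetical data (not the
# project's LDA probabilities):
import numpy
brand_vecs = numpy.random.rand(3, 20)   # 3 "brands" over 20 topics
word_vecs = numpy.random.rand(5, 20)    # 5 "words" over 20 topics
sims = brand_vecs.dot(word_vecs.T)                          # scalar products
brand_w = (brand_vecs ** 2).sum(axis=1)                     # squared norms
word_w = (word_vecs ** 2).sum(axis=1)
cosine = sims / numpy.sqrt(numpy.outer(brand_w, word_w))    # values in [0, 1]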
Example #6
def get_likes(words, brands, indir, modelName):
    dirs = gslib.LDAdirs(modelName, indir)
    (dict1, _, lda) = gslib.loadStuff(dirs)

    brands_df = pruneWordsList(brands, lda)
    words_df = pruneWordsList(words, lda)

    probs = pword_given_topic(lda)
    sims = LDAscalarProd(probs, brands_df["IDs"], words_df["IDs"])
    wordsWeights = LDAweights(lda, words_df["IDs"])
    brandsWeights = LDAweights(lda, brands_df["IDs"])
    sims_norm = sims / numpy.sqrt(numpy.outer(brandsWeights, wordsWeights))

    sims_norm_df = pandas.DataFrame(sims_norm,
                                    index=brands_df.index,
                                    columns=words_df.index)
    return (sims_norm_df, brands_df, words_df)
def total_purge(words):
    cols = list()
    for year in range(2002,2013):
        print year
        modelDir = 'Z:\\ermunds\\results\\%d 20topics'%year
        modelName = '%d 20topics' %year
        dirs = gslib.LDAdirs(modelName,modelDir)
        (dict1,_,lda)=gslib.loadStuff(dirs)  
        words_df = getLikes.pruneWordsList(words,lda)
        col = words_df["Counts"]
        col.name = str(year)
        cols.append(col)
    stats = pandas.DataFrame(cols)
    return stats
Example #8
# -*- coding: utf-8 -*-
"""
Created on Fri May 17 14:31:58 2013

@author: Vasya
"""
#import gensim
import genSimLDAlib as gslib
import mess_with_sims as sims
import numpy

dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\2012 20topics",    modelName="201220topics")
docsfilename=dirs.allDocsFileName
(dict1,mm,lda)=gslib.loadStuff(dirs)

brands = sims.BrandsClustered_1

# decompose a post into topics and print them
gslib.make_sense(1,lda,mm,docsfilename)

# guess the topic of a concept list
consepts= ['cheap','ugly','unrelaible']
consepts= ['young','trendy','fast','macho'] # fail
consepts= ['green','environment','sustainable','hybrid'] #n75
consepts= ['reliable','safe'] # n8
consepts= 'air hot heat cool exhaust system fan coolant temp blow'.split() # n5
ws,IDl,ID2index = gslib.world_list2IDs(dict1,consepts,tokenizef=gslib.wordCleanUp)
for t,p in lda[ [(t,1) for t in IDl] ]:
        print  '__with prob:{}% is N{}: {}'.format(int(p*100),t, ' '.join([w for _,w in lda.show_topic(t,10)]))
        
import getLikes

import numpy as np
import genSimLDAlib as gslib
import mess_with_sims

indir=r"Z:\ermunds\results\2005 20topics"
modelName="2005+20topics"
    
dirs = gslib.LDAdirs(modelName,indir)
(dict1,_,lda)=gslib.loadStuff(dirs)

words = mess_with_sims.BrandsClustered_1
#words = lda.id2word.values()
wordsClean = getLikes.pruneWordsList(words,lda)

weights = getLikes.LDAweights(lda,wordsClean["IDs"])

import matplotlib.pyplot as plt


yy = wordsClean.Counts
xx = np.sqrt(weights)
labels = wordsClean.index

plt.scatter(xx,yy, marker='.')
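# A possible extension (not in the original script): label each point with its
# brand name and add axis labels so the scatter plot is readable.
for label, x, y in zip(labels, xx, yy):
    plt.annotate(label, (x, y), fontsize=8)
plt.xlabel('sqrt(LDA weight)')
plt.ylabel('corpus count')
plt.show()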
Example #10
def compute_and_save(modelName="201220topics", LDAdir=r"Z:\ermunds\results\2012 20topics"): 
    dirs = gslib.LDAdirs(indir=LDAdir,modelName=modelName)
    (_,mm,lda)=gslib.loadStuff(dirs)    
    agg = marginal_topic_distribution(lda,mm)
    np.savetxt( os.path.join(LDAdir,"topics_marginal.csv"), agg, delimiter=",")
    return agg
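# compute_and_save() above writes topics_marginal.csv, the file that main() in
# Examples #1 and #11 loads with np.genfromtxt.  A plausible call using the
# function's own default arguments:
agg = compute_and_save(modelName="201220topics",
                       LDAdir=r"Z:\ermunds\results\2012 20topics")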
Example #11
def main(BAVfile, sheet_name, LDAdir, modelName):
    BAV_raw = pandas.read_excel(BAVfile,
                                sheet_name,
                                index_col=0,
                                na_values=['NA'])
    #Hack!!! Total_Prefer_pct is the last useless column
    #idx_first = BAV_raw.columns.get_loc('Brand_Asset_C')+1
    idx_first = 1
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]

    BAV_filtered = BAV_raw[good_cols]
    BAV_filtered.columns = map(lambda x: x.split("_")[0], BAV_filtered.columns)

    # filter brands - depends on how the dictionary was created,
    # i.e. if '-' is mapped to a space this will work,
    # if '-' is dropped then it will not

    BAV_filtered = try_drop(BAV_filtered, 'General Motors (GM)')
    BAV_filtered = try_drop(BAV_filtered, 'Ford Motor Company')
    BAV_filtered = try_drop(BAV_filtered, 'Smart (car)')
    BAV_filtered = try_drop(BAV_filtered, 'Mini Cooper')

    BAV_filtered = rename_row(BAV_filtered, 'Mercedes-Benz', 'Mercedes')
    BAV_filtered = rename_row(BAV_filtered, 'Mitsubishi Vehicles',
                              'Mitsubishi')
    BAV_filtered = rename_row(BAV_filtered, 'Rolls-Royce', 'Royce')
    BAV_filtered = rename_row(BAV_filtered, 'Aston Martin', 'Aston')
    BAV_filtered = rename_row(BAV_filtered, 'Alfa Romeo', 'Romeo')

    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(words=words,
                                                         brands=brands,
                                                         indir=LDAdir,
                                                         modelName=modelName)
    (divs, _, _) = getLikes.get_divs(words,
                                     brands,
                                     indir=LDAdir,
                                     modelName=modelName,
                                     topics_marginal_probs=topicsPs)

    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.ix[LDA_df.index]

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (dict1, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index

    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')

    x.to_excel(writer, sheet_name='p_topic_given_word')
    writer.save()
    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)