示例#1
0
def main(indir=r"Z:\ermunds\results\2005 20t unbranded",
         modelName="2005 20topics",
         world_list_file_name="adjectives.txt"):
    """Score brand/adjective similarity under a saved LDA model and write
    the results to '<modelName>.xlsx' inside `indir`.

    indir                : directory holding the trained model artefacts
    modelName            : stem of the saved model file names
    world_list_file_name : text file with one adjective per line
    """
    paths = gslib.LDAdirs(modelName, indir)
    vocab = gensim.corpora.dictionary.Dictionary().load(paths.dictFileName)
    model = gensim.models.ldamodel.LdaModel(id2word=vocab).load(paths.modelFname)

    brand_names = lists.BrandsClustered_1
    adjectives = words_from_file(world_list_file_name)
    adjectives.extend(brand_names)  # just a check

    brand_info = pruneWordsList(brand_names, model)
    word_info = pruneWordsList(adjectives, model)

    raw = LDAscalarProd(model, brand_info["IDs"], word_info["IDs"])
    word_weights = LDAweights(model, word_info["IDs"])
    brand_weights = LDAweights(model, brand_info["IDs"])
    # cosine-style normalization by the outer product of the two weight vectors
    cosine = raw / numpy.sqrt(numpy.outer(brand_weights, word_weights))

    cosine_df = pandas.DataFrame(cosine,
                                 index=brand_info.index,
                                 columns=word_info.index)
    raw_df = pandas.DataFrame(raw,
                              index=brand_info.index,
                              columns=word_info.index)
    run_info = pandas.Series([indir, modelName], index=['indir', 'modelName'])
    run_info_df = pandas.DataFrame(run_info)

    writer = pandas.ExcelWriter(os.path.join(indir, modelName + '.xlsx'))
    cosine_df.to_excel(writer, sheet_name='cosine distance')
    raw_df.to_excel(writer, sheet_name='raw scalar products')
    brand_info.to_excel(writer, sheet_name='brands')
    word_info.to_excel(writer, sheet_name='words')
    run_info_df.to_excel(writer, sheet_name='stats')
    writer.save()
def total_purge(words):
    """Count occurrences of `words` under each yearly LDA model (2002-2012).

    words : word list passed through to getLikes.pruneWordsList

    Returns a pandas DataFrame with one row per year (row labels are the
    year as a string) holding the "Counts" column reported for each word.
    """
    cols = list()
    for year in range(2002, 2013):
        # FIX: `print year` was a Python-2-only statement; print() with a
        # single argument behaves identically on Python 2 and 3 (the rest
        # of this file already uses print() at the doc-dump step).
        print(year)
        modelDir = 'Z:\\ermunds\\results\\%d 20topics' % year
        modelName = '%d 20topics' % year
        dirs = gslib.LDAdirs(modelName, modelDir)
        # the dictionary slot of loadStuff's result was unused — discard it
        (_, _, lda) = gslib.loadStuff(dirs)
        words_df = getLikes.pruneWordsList(words, lda)
        col = words_df["Counts"]
        col.name = str(year)
        cols.append(col)
    stats = pandas.DataFrame(cols)
    return stats
示例#3
0
def get_divs(words, brands, indir, modelName, topics_marginal_probs):
    """Symmetric KL divergence between brand and word topic profiles.

    Returns a tuple: (divergence DataFrame indexed brands x words,
    pruned brand info, pruned word info).
    """
    model_paths = gslib.LDAdirs(modelName, indir)
    (vocab, _, model) = gslib.loadStuff(model_paths)

    brand_info = pruneWordsList(brands, model)
    word_info = pruneWordsList(words, model)

    # p(topic | word) table, then pairwise symmetric KL divergences
    topic_probs = ptopic_given_word(model, topics_marginal_probs)
    div_matrix = LDA_simmetricKLdiv(topic_probs,
                                    brand_info["IDs"],
                                    word_info["IDs"])

    div_df = pandas.DataFrame(div_matrix,
                              index=brand_info.index,
                              columns=word_info.index)
    return (div_df, brand_info, word_info)
示例#4
0
def get_likes(words, brands, indir, modelName):
    """Cosine-normalized scalar products between brands and words under a
    saved LDA model.

    Returns a tuple: (normalized similarity DataFrame indexed brands x
    words, pruned brand info, pruned word info).
    """
    model_paths = gslib.LDAdirs(modelName, indir)
    (vocab, _, model) = gslib.loadStuff(model_paths)

    brand_info = pruneWordsList(brands, model)
    word_info = pruneWordsList(words, model)

    # raw scalar products in p(word | topic) space ...
    word_topic_probs = pword_given_topic(model)
    raw = LDAscalarProd(word_topic_probs, brand_info["IDs"], word_info["IDs"])
    # ... normalized by the outer product of the per-item weights
    word_weights = LDAweights(model, word_info["IDs"])
    brand_weights = LDAweights(model, brand_info["IDs"])
    normalized = raw / numpy.sqrt(numpy.outer(brand_weights, word_weights))

    norm_df = pandas.DataFrame(normalized,
                               index=brand_info.index,
                               columns=word_info.index)
    return (norm_df, brand_info, word_info)
示例#5
0
def main():
    '''
    resorts sims and saves a png copy

    Loads the brand-similarity matrix from CSV, reorders it to match the
    clustered brand list, saves the reordered CSV and redraws the heatmap.
    '''
    dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all unbranded threads 2",
                         modelName="unbranded220topics")

    CSVin = "simsN_posts"
    CSVout = "simsNtweaked"
    suffix = ''
    figName = 'heatmap_from_posts_no whitening' + suffix
    sims, brands = mp.loadCSV(dirs, CSVin)

    # FIX: copy before mutating. `nbrands = BrandsClustered_1` aliased the
    # module-level list, so the `del` below destructively removed
    # 'mercedes-benz' from it for every other user of BrandsClustered_1.
    nbrands = list(BrandsClustered_1)
    # caps bug of may 14
    del nbrands[nbrands.index('mercedes-benz')]

    # row/column permutation mapping the loaded order to the clustered order
    idx = numpy.zeros(len(nbrands), dtype=int)
    for i, b in enumerate(nbrands):
        idx[i] = brands.index(b)

    (sims, nbrands) = select(sims, brands, idx)
    mp.saveCSV(dirs, CSVout, nbrands, sims)

    draw.main(dirs, CSVout, figName)
def main(
        outdir=r'Z:\ermunds\results\2005 20t unbranded',
        num_passes=2,
        n_repeat=10,
        num_topics=20,
        threadChoseStr='',
        modelTag='2005+',
        time_low_cutoff=time.strptime("1 Jan 2005", "%d %b %Y"),
        time_hi_cutoff=time.strptime("1 Jan 2006", "%d %b %Y"),
):
    '''
    End-to-end LDA training pipeline: dump post texts to one file, build
    the dictionary and corpus, train the model with repeated update
    rounds, and save every artefact under `outdir`.  Returns the model
    name (the stem of all saved file names).

    # time_low_cutoff, time_hi_cutoff posts are chosen between these two dates
    # threadChoseStr - filter topic names by this phrase
    # num_passes - passes over the corpus in the initial LDA fit
    # n_repeat - total training rounds; an intermediate model is saved each round
    '''
    # presumably a mapping whose values are lists of thread objects (it is
    # iterated as dTr.values() below) - TODO confirm against bf.notMain
    dTr = bf.notMain(threadChoseStr)

    modelName = modelTag + str(num_topics) + 'topics'
    dirs = gslib.LDAdirs(modelName, outdir)
    # NOTE(review): pickling into a text-mode append file works on
    # Python 2 (this function also uses xrange below); Python 3's pickle
    # requires a binary-mode file ('ab') - confirm before porting.
    with open(dirs.dataFileName, 'a') as file1:
        pickle.dump(dTr, file1)

    ## setup logging to file and console (both handlers at DEBUG level)
    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(dirs.logFileName)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)-12s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)

    ## get threads, extract post texts and save to single file
    # 7min per 1GB
    logging.log(logging.INFO, "building doc list")
    lineCounter = 0
    # append mode: reruns add to any docs already collected in this file
    with open(dirs.allDocsFileName, 'a') as docDumpFile:
        for Trlist in dTr.values():
            for Tr in Trlist:
                for p in Tr.getPosts():
                    # keep only posts strictly between the two cutoffs
                    if (p.msgTime > time_low_cutoff) and (p.msgTime <
                                                          time_hi_cutoff):
                        # one cleaned document per post: title then body
                        doc = gslib.textCleanUp(
                            p.msgTitle) + gslib.textCleanUp(p.msgText)
                        lineCounter += 1
                        # print-as-function: on Python 2 this needs
                        # `from __future__ import print_function` at the
                        # top of the original module - not visible here
                        print(doc, file=docDumpFile)
    logging.log(logging.INFO, "total {} docs ".format(lineCounter))

    #build dict 1.5H/GB
    dict1 = gslib.build_dict(dirs)
    dict1.save(dirs.dictFileName)
    #dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)

    #pipe docfile to gensim corpus
    #fixme - corpusAdapter missing a len() property
    corpus = gslib.corpusAdapter(dirs.allDocsFileName, id2word=dict1)
    gensim.corpora.MmCorpus.serialize(fname=dirs.corpusFname,
                                      corpus=corpus,
                                      id2word=dict1)
    # reload the serialized corpus in Matrix Market form for training
    mm = gensim.corpora.MmCorpus(dirs.corpusFname)

    ## run the LDA (2h per update on 2M posts)
    # first runs a small step and then update 9 times saving results to disk every time

    lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                          id2word=dict1,
                                          num_topics=num_topics,
                                          update_every=0,
                                          passes=num_passes)
    lda.save(dirs.modelFname + "_0")

    # xrange: Python 2 only
    for i in xrange(n_repeat - 1):
        lda.update(mm)
        # save intermediate result, numbered from _1
        lda.save(dirs.modelFname + "_" + str(i + 1))
        for t in lda.show_topics(-1):
            logging.info(str('all topics here') + t)
    lda.save(dirs.modelFname)

    # detach our handlers so repeated calls don't log duplicate lines
    logger.removeHandler(ch)
    logger.removeHandler(fh)
    return modelName
示例#7
0
# -*- coding: utf-8 -*-
"""
Created on Tue May 14 23:23:59 2013

@author: Vasya

Interactive script: configure logging (file + console) and load the saved
dictionary and corpus for the 'unbranded220topics' model.
"""
import gensim
import logging
import genSimLDAlib as gslib

#import someBrandFiltering as bf

# locations of the saved model artefacts for this run
dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\all unbranded threads 2",
                     modelName="unbranded220topics")

# root logger: DEBUG and above appended to the model's log file ...
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M:%S',
    filename=dirs.logFileName,
    filemode='a')
# ... plus INFO and above mirrored to the console
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

# (message typo "ading" kept as-is: it is a runtime string)
logging.info("ading more from beh.py")
dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)
mm = gensim.corpora.MmCorpus(dirs.corpusFname)
"""

import getLikes

import numpy as np
import genSimLDAlib as gslib
import mess_with_sims





indir=r"Z:\ermunds\results\2005 20topics"
modelName="2005+20topics"
    
dirs = gslib.LDAdirs(modelName,indir)
(dict1,_,lda)=gslib.loadStuff(dirs)

words = mess_with_sims.BrandsClustered_1
#words = lda.id2word.values()
wordsClean = getLikes.pruneWordsList(words,lda)

weights = getLikes.LDAweights(lda,wordsClean["IDs"])

import matplotlib.pyplot as plt


yy = wordsClean.Counts
xx = np.sqrt(weights)
labels = wordsClean.index
示例#9
0
def compute_and_save(modelName="201220topics", LDAdir=r"Z:\ermunds\results\2012 20topics"):
    """Compute the marginal topic distribution for a saved model, write it
    to 'topics_marginal.csv' inside `LDAdir`, and return the array."""
    model_paths = gslib.LDAdirs(indir=LDAdir, modelName=modelName)
    (_, corpus, model) = gslib.loadStuff(model_paths)
    marginals = marginal_topic_distribution(model, corpus)
    out_path = os.path.join(LDAdir, "topics_marginal.csv")
    np.savetxt(out_path, marginals, delimiter=",")
    return marginals
示例#10
0
# -*- coding: utf-8 -*-
"""
Created on Tue May 14 23:52:27 2013

@author: Vasya

Interactive script: logging setup and dictionary load for the 'sink'
experiment directory.
"""
import gensim
import logging
import genSimLDAlib as gslib

#import someBrandFiltering as bf
# intended model size for this experiment; not used in the lines visible
# here - presumably consumed further down the original script
num_topics = 100
num_passes = 2
dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\sink",
                     modelName="unbranded220topics")

# root logger: DEBUG and above appended to the model's log file ...
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M:%S',
    filename=dirs.logFileName,
    filemode='a')
# ... plus INFO and above mirrored to the console
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

logging.info("adding more from beh2.py")
dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)
示例#11
0
def main(BAVfile, sheet_name, LDAdir, modelName):
    """Compare BAV survey percentage scores with LDA-derived brand/word
    similarities and write both to '<modelName>_BAV_comp.xlsx' in LDAdir.

    BAVfile    : path to the BAV Excel workbook
    sheet_name : sheet inside BAVfile to read
    LDAdir     : directory holding the trained model artefacts
    modelName  : stem of the saved model file names

    Returns (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo).
    """
    BAV_raw = pandas.read_excel(BAVfile,
                                sheet_name,
                                index_col=0,
                                na_values=['NA'])
    #Hack!!! Total_Prefer_pct is the last useless col
    #idx_first = BAV_raw.columns.get_loc('Brand_Asset_C')+1
    idx_first = 1
    # keep only single-attribute percentage columns of the form '<Word>_pct'
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]

    BAV_filtered = BAV_raw[good_cols]
    # strip the '_pct' suffix; use a real list (not a lazy map object)
    # so the column assignment also works on Python 3
    BAV_filtered.columns = [col.split("_")[0] for col in BAV_filtered.columns]

    # filter brands - depends on the dictionary creation way
    # ie if '-' goes to space this will work
    # if '-' is dropped then will not

    BAV_filtered = try_drop(BAV_filtered, 'General Motors (GM)')
    BAV_filtered = try_drop(BAV_filtered, 'Ford Motor Company')
    BAV_filtered = try_drop(BAV_filtered, 'Smart (car)')
    BAV_filtered = try_drop(BAV_filtered, 'Mini Cooper')

    BAV_filtered = rename_row(BAV_filtered, 'Mercedes-Benz', 'Mercedes')
    BAV_filtered = rename_row(BAV_filtered, 'Mitsubishi Vehicles',
                              'Mitsubishi')
    BAV_filtered = rename_row(BAV_filtered, 'Rolls-Royce', 'Royce')
    BAV_filtered = rename_row(BAV_filtered, 'Aston Martin', 'Aston')
    BAV_filtered = rename_row(BAV_filtered, 'Alfa Romeo', 'Romeo')

    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(words=words,
                                                         brands=brands,
                                                         indir=LDAdir,
                                                         modelName=modelName)
    (divs, _, _) = getLikes.get_divs(words,
                                     brands,
                                     indir=LDAdir,
                                     modelName=modelName,
                                     topics_marginal_probs=topicsPs)

    # align BAV rows/columns with the LDA result before writing
    BAV_filtered = BAV_filtered[LDA_df.columns]
    # .loc replaces the removed DataFrame.ix; the index holds brand labels
    BAV_filtered = BAV_filtered.loc[LDA_df.index]

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (dict1, _, lda) = gslib.loadStuff(dirs)
    # p(topic | word) table restricted to the brand and word id columns
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index

    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')

    x.to_excel(writer, sheet_name='p_topic_given_word')
    # BUG FIX: was `writer.save` — an attribute reference that was never
    # called, so the workbook was never flushed to disk.
    writer.save()
    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
示例#12
0
# -*- coding: utf-8 -*-
"""
Created on Mon May 13 09:28:54 2013

@author: Vasya

Interactive script: load a trained branded-threads model, compute
brand-brand similarities, and print a few 'likes' reports.
"""
import ldaModel2bradsSims_direct as mp
import sims_csv_plotter
import genSimLDAlib as gsLib
import gensim

#dirs = gsLib.LDAdirs(modelName = 'PricesStemmed20passes_20topics',indir = r"Z:\ermunds\results\1 prices paid\5-6-2013")
dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all branded threads",
                     modelName="All2passes_20topics")

dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)
lda = gensim.models.ldamodel.LdaModel(id2word=dict1).load(dirs.modelFname)

# raw brand-brand similarity matrix from the topic space, then normalized
simsR, brands = mp.corrBrands(lda)
sims = mp.normalize(simsR)
##mp.saveCSV(dirs,'simsN',brands,sims)

#sims,brands= mp.loadCSV(dirs,"simsN")
##sims_csv_plotter.main(dirs,CSVin="simsN",figName='fromTopics')

# rename in place - presumably to match the dictionary's dashless token;
# confirm against the dictionary-building code
brands[brands.index('mercedes-benz')] = 'mercedesbenz'
mp.likes(sims, brands, brands[0])
print 'tada'  # Python-2 print statement: this script targets Python 2

mp.likes2(lda, brands, word='luxury')  # looks like sims cvs
示例#13
0
# -*- coding: utf-8 -*-
"""
Created on Fri May 17 14:31:58 2013

@author: Vasya

Interactive exploration: decompose a post into topics and probe which
topic a list of concept words lands in.
"""
#import gensim
import genSimLDAlib as gslib
import mess_with_sims as sims
import numpy

dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\2012 20topics",
                     modelName="201220topics")
docsfilename = dirs.allDocsFileName
(dict1, mm, lda) = gslib.loadStuff(dirs)

brands = sims.BrandsClustered_1

# decompose a post into topics and print them
gslib.make_sense(1, lda, mm, docsfilename)

# guess the topic of concept list; each assignment below overwrites the
# previous one, so only the last `consepts` value is actually used — the
# earlier lists are kept as a record of tried experiments
consepts = ['cheap', 'ugly', 'unrelaible']
consepts = ['young', 'trendy', 'fast', 'macho']  # fail
consepts = ['green', 'environment', 'sustainable', 'hybrid']  #n75
consepts = ['reliable', 'safe']  # n8
consepts = 'air hot heat cool exhaust system fan coolant temp blow'.split(
)  # n5
# map the concept words onto dictionary ids via the shared tokenizer
ws, IDl, ID2index = gslib.world_list2IDs(dict1,
                                         consepts,
                                         tokenizef=gslib.wordCleanUp)
# NOTE(review): `np`, `os`, `pd`, `getLikes`, `modelDir` and `modelName`
# are not defined in the lines above — this fragment presumably continues
# a script whose header is not visible here; verify before running.
topicsPs = np.genfromtxt(os.path.join(modelDir, 'topics_marginal.csv'))

words = getLikes.words_from_file(r"Z:\ermunds\adjectives.txt")
brands = getLikes.words_from_file(r"Z:\ermunds\brands.txt")

(divs, _, _) = getLikes.get_divs(words,
                                 brands,
                                 indir=modelDir,
                                 modelName=modelName,
                                 topics_marginal_probs=topicsPs)
# NOTE(review): this rebinds `sims`; if this continues the script above,
# it shadows the `mess_with_sims` module imported under the same name
(sims, b, w) = getLikes.get_likes(words,
                                  brands,
                                  indir=modelDir,
                                  modelName=modelName)

dirs = gslib.LDAdirs(modelName, modelDir)
(dict1, _, lda) = gslib.loadStuff(dirs)

brands_df = getLikes.pruneWordsList(brands, lda)
words_df = getLikes.pruneWordsList(words, lda)

# p(topic | word) table with one column per dictionary word, then
# restricted to the brand and word id columns
probs = getLikes.ptopic_given_word(lda, topicsPs)
probs_df = pd.DataFrame(probs, columns=lda.id2word.values())
alls = pd.concat([brands_df["IDs"], words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index

# dump similarity, divergence and info tables into one workbook
writer = pd.ExcelWriter(os.path.join(modelDir, modelName + '_new.xlsx'))
sims.to_excel(writer, sheet_name='cosine distance')
divs.to_excel(writer, sheet_name='KL divs')
b.to_excel(writer, sheet_name='brands')
	fig.savefig(figfname)
	plt.show()



def plotSims(sims, brands, dirs, figName='heatmap'):
    """Draw `sims` as a heatmap with brand tick labels on both axes and
    save it as '<dirs.indir>/<figName>.png'.

    sims    : 2-D matrix of brand-by-brand similarities
    brands  : one label per row/column of `sims`
    dirs    : object exposing an `indir` output-directory attribute
    figName : base name (without extension) of the saved figure
    """
    import os  # local import keeps this fix self-contained

    fig = plt.figure(figsize=(11, 9))
    ax = fig.add_axes([.1, .1, .8, .8])  # re-positioned below the colorbar

    imgplot = ax.imshow(sims, interpolation='none')

    # FIX: range() replaces the Python-2-only xrange(); materialized as a
    # list because it is reused for both axes
    idx = list(range(len(brands)))

    # y labels carry the row index too, e.g. ('bmw', 3); list() keeps the
    # lazy Python-3 zip from being exhausted prematurely
    ax.set_yticks(idx)
    ax.set_yticklabels(list(zip(brands, idx)))

    ax.set_xticks(idx)
    ax.set_xticklabels(brands, rotation=90)
    plt.colorbar(imgplot)

    # shrink the axes so the rotated labels and the colorbar fit
    ax.set_position([.1, .2, .6, .6])

    # FIX: os.path.join replaces hand-built '\\' concatenation; the
    # original also mixed tabs and spaces (a TabError on Python 3)
    figfname = os.path.join(dirs.indir, figName + '.png')
    fig.savefig(figfname)
    plt.show()


if __name__ == '__main__':
    # FIX: re-indented with 4 spaces — the original used tabs, which is a
    # TabError on Python 3 and inconsistent with the rest of this file
    dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\sink",
                         modelName="All2passes_20topics")
    main(dirs)