Пример #1
0
def main():
    '''
    Re-sort the similarity matrix into the clustered brand order and plot it.

    Loads the ``simsN_posts`` CSV through ``mp.loadCSV``, looks up the position
    of every brand of ``BrandsClustered_1`` (minus 'mercedes-benz') in the
    loaded brand list, passes those positions to ``select``, writes the result
    to the ``simsNtweaked`` CSV and hands that CSV to ``draw.main`` together
    with the figure name.

    Raises:
        ValueError: if 'mercedes-benz' is not in ``BrandsClustered_1``, or if a
            clustered brand cannot be found by ``brands.index``.
    '''
    # Alternative input locations used in earlier runs:
    #   dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all\all unbranded threads", modelName="unbranded2passes_20topics")
    #   dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all branded threads", modelName="All2passes_20topics")
    #   dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\sink", modelName="unbranded220topics")
    dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all unbranded threads 2",
                         modelName="unbranded220topics")

    CSVin = "simsN_posts"
    CSVout = "simsNtweaked"
    suffix = ''
    figName = 'heatmap_from_posts_no whitening' + suffix
    # mp.generateCSV(indir=dirs.indir, modelName=dirs.modelName, suffix=suffix)
    sims, brands = mp.loadCSV(dirs, CSVin)

    # Work on a copy: deleting from BrandsClustered_1 itself would permanently
    # alter the shared module-level list, so a second call to main() would fail
    # and every other user of the list would silently lose the entry.
    nbrands = list(BrandsClustered_1)
    # caps bug of may 14
    nbrands.remove('mercedes-benz')

    # Position of each clustered brand inside the loaded brand list.
    idx = numpy.array([brands.index(b) for b in nbrands], dtype=int)

    # Disabled alternative: order by similarity to 'ram', then re-sort slices
    # of that order against other anchor brands.
    #   ibrand = brands.index('ram')
    #   idx = numpy.argsort(-sims[ibrand,:])
    #   ibrand = brands.index('jeep')
    #   sort_a_slice(idx,sims,a=6,b=None,compare_to=ibrand)
    #   ibrand = brands.index('nissan')
    #   sort_a_slice(idx,sims,a=10,b=None,compare_to=ibrand)
    #   ibrand = brands.index('chrysler')
    #   sort_a_slice(idx,sims,a=15,b=None,compare_to=ibrand)
    #   ibrand = brands.index('bmw')
    #   sort_a_slice(idx,sims,a=23,b=None,compare_to=ibrand,sign=1)

    (sims, nbrands) = select(sims, brands, idx)
    mp.saveCSV(dirs, CSVout, nbrands, sims)

    draw.main(dirs, CSVout, figName)
Пример #2
0
def main():
    '''
    Re-sort the similarity matrix into the clustered brand order and plot it.

    Loads the ``simsN_posts`` CSV through ``mp.loadCSV``, looks up the position
    of every brand of ``BrandsClustered_1`` (minus 'mercedes-benz') in the
    loaded brand list, passes those positions to ``select``, writes the result
    to the ``simsNtweaked`` CSV and hands that CSV to ``draw.main`` together
    with the figure name.

    Raises:
        ValueError: if 'mercedes-benz' is not in ``BrandsClustered_1``, or if a
            clustered brand cannot be found by ``brands.index``.
    '''
    # Alternative input locations used in earlier runs:
    #   dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all\all unbranded threads", modelName="unbranded2passes_20topics")
    #   dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all branded threads", modelName="All2passes_20topics")
    #   dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\sink", modelName="unbranded220topics")
    dirs = gsLib.LDAdirs(indir=r"Z:\ermunds\results\all unbranded threads 2",
                         modelName="unbranded220topics")

    CSVin = "simsN_posts"
    CSVout = "simsNtweaked"
    suffix = ''
    figName = 'heatmap_from_posts_no whitening' + suffix
    # mp.generateCSV(indir=dirs.indir, modelName=dirs.modelName, suffix=suffix)
    sims, brands = mp.loadCSV(dirs, CSVin)

    # Take a private copy before removing the entry; editing BrandsClustered_1
    # in place leaks the change to the module-level list, which breaks repeated
    # calls to main() and surprises any other code reading that list.
    nbrands = list(BrandsClustered_1)
    # caps bug of may 14
    nbrands.remove('mercedes-benz')

    # Position of each clustered brand inside the loaded brand list.
    idx = numpy.array([brands.index(b) for b in nbrands], dtype=int)

    # Disabled alternative: order by similarity to 'ram', then re-sort slices
    # of that order against other anchor brands.
    #   ibrand = brands.index('ram')
    #   idx = numpy.argsort(-sims[ibrand,:])
    #   ibrand = brands.index('jeep')
    #   sort_a_slice(idx,sims,a=6,b=None,compare_to=ibrand)
    #   ibrand = brands.index('nissan')
    #   sort_a_slice(idx,sims,a=10,b=None,compare_to=ibrand)
    #   ibrand = brands.index('chrysler')
    #   sort_a_slice(idx,sims,a=15,b=None,compare_to=ibrand)
    #   ibrand = brands.index('bmw')
    #   sort_a_slice(idx,sims,a=23,b=None,compare_to=ibrand,sign=1)

    (sims, nbrands) = select(sims, brands, idx)
    mp.saveCSV(dirs, CSVout, nbrands, sims)

    draw.main(dirs, CSVout, figName)
Пример #3
0
#for i in xrange(20000):
    # NOTE(review): the enclosing loop header is not visible in this excerpt;
    # `i` is presumably the loop index into `mm` - confirm against the full file.
    # Fetch item i of `mm`; it is iterated below as (ID, count) pairs.
    bow = mm[i]
    # Progress output every 10000 items (Python 2 print statement).
    if not i%10000: print i
    # Fresh 1 x l_1 and 1 x l_2 row vectors holding the counts of this `bow` only.
    temp_counter_1 = numpy.zeros((1,l_1))
    temp_counter_2 = numpy.zeros((1,l_2))
    for ID, count in bow:        
        # IDs belonging to the first set: add to both the per-item counter and
        # the running total `counter_1`, which is defined outside this excerpt.
        if ID in IDset_1:
            index=ID2index_1[ID]
            temp_counter_1[0,index]+=count
            counter_1[0,index]+=count
        # Same bookkeeping for the second set; an ID may be in both sets, since
        # the two tests are independent `if`s rather than `if`/`elif`.
        if ID in IDset_2:
            index=ID2index_2[ID]
            counter_2[0,index]+=count
            temp_counter_2[0,index]+=count
    # (l_1 x 1) * (1 x l_2) broadcasts to an l_1 x l_2 outer product, so entry
    # [a, b] grows by count_1[a] * count_2[b] of this item.
    coocM=coocM+temp_counter_1.T*temp_counter_2


# Keep the unprocessed matrix on disk before deriving the plotted variants.
wd.saveCSV(dirs, 'coocM_raw', brandsl, coocM)

# Two passes over the same pipeline: first with an element-wise log applied to
# the normalised matrix, then with the normalised matrix used as is. Each pass
# subtracts the diagonal from itself, overwrites the 'coocM' CSV and draws it
# under its own figure name.
for rescale, figure_name in ((numpy.log, 'from_cooc_log'), (None, 'from_cooc')):
    temp2 = wd.normalize(coocM)
    if rescale is None:
        temp25 = temp2
    else:
        temp25 = rescale(temp2)
    temp3 = temp25 - numpy.diag(temp25.diagonal())
    wd.saveCSV(dirs, 'coocM', brandsl, temp3)
    draw.main(dirs, 'coocM', figName=figure_name)
   
Пример #4
0
# Load / round-trip the pickled inputs. Every file is opened in a `with` block
# so the handle is closed (and, for the dump, flushed) before the next step
# instead of relying on garbage collection to do it.
# NOTE(review): pickle executes arbitrary code on load - only use it on files
# produced by this project.
# NOTE(review): the text modes 'r'/'w' are kept unchanged so that pickles
# written by earlier runs still load; confirm before switching to 'rb'/'wb'.
with open(dirs.indir+'\\'+'LDA_vectorsWithOutCooc.pickle','r') as pickle_file:
    LDA_vectorsWithOutCooc = pickle.load(pickle_file)

with open(dirs.indir+'\\'+'brandsl.pickle','w') as pickle_file:
    pickle.dump(brandsl, pickle_file)
with open(dirs.indir+'\\'+'brandsl.pickle','r') as pickle_file:
    brandsl = pickle.load(pickle_file)

## convert these vectors to sims
vectors = LDA_vectorsWithCooc
#vectors = LDA_vectorsWithOutCooc

n_brands = len(brandsl)
simCos = numpy.zeros((n_brands, n_brands))

# Each row norm is needed n_brands times; compute it once up front rather than
# inside the double loop. The values multiplied are the same as before.
norms = [scipy.linalg.norm(vectors[k, :]) for k in xrange(n_brands)]

# Cosine similarity between every pair of rows of `vectors`.
for b1i in xrange(n_brands):
    v1 = vectors[b1i, :]  # (vectors[b1i,:] -ave)/stds
    for b2i in xrange(n_brands):
        v2 = vectors[b2i, :]  # (vectors[b2i,:] -ave)/stds
        simCos[b1i, b2i] = (v1.dot(v2)) / (norms[b1i] * norms[b2i])


mp.saveCSV(dirs, 'simsN_posts', brandsl, simCos)
brandsOrdered = sims.BrandsClustered_1
simCos = sims.shuffle_sims(simCos, brandsl, brandsOrdered)

simplot.plotSims(simCos, brandsOrdered, dirs, figName='from topics '+'withRepeadedPosts')

##
# Rescale every row of both vector sets in place so that it sums to 1.
# NOTE(review): the same row index is applied to both arrays, so they are
# assumed to have the same number of rows - confirm.
for i in xrange(len(LDA_vectorsWithCooc)):
    LDA_vectorsWithCooc[i, :] = LDA_vectorsWithCooc[i, :] / sum(LDA_vectorsWithCooc[i, :])
    LDA_vectorsWithOutCooc[i, :] = LDA_vectorsWithOutCooc[i, :] / sum(LDA_vectorsWithOutCooc[i, :])