def lda(data, K, it, alpha, beta, dict_=True, verbose=True, randomness=1, PATH="", algo='cgs'):
    """Run LDA inference with the "cgs" or the "motion" update rule.

    Parameters
    ----------
    data : array-like
        Observation rows; converted with ``np.asarray``. Columns 0 and 1 are
        the two observed variables and column 2 is the topic column used to
        build the count tables (presumably document, word, topic -- confirm
        against ``join2``/``sampling``).
    K : int
        Number of topics; forwarded to ``addrandomcol`` / ``toDistribution``
        and used in the checkpoint file names.
    it : int
        Number of inference iterations.
    alpha, beta : float or None
        Hyperparameters forwarded to the update rule. ``None`` selects the
        historical defaults (``alpha = 0.5``, ``beta = alpha``).
    dict_ : bool
        When true the data is passed through ``dictionate`` and the lookup
        tables are returned; otherwise both tables are ``None``.
    verbose : bool
        When true a timing line is printed after every iteration.
    randomness : number
        When ``> 0`` ``addrandomcol(data, K, -1, randomness)`` is applied
        before inference (presumably adds a random topic column).
    PATH : str
        Prefix for ``np.save`` checkpoints; ``""`` disables saving.
    algo : {"cgs", "motion"}
        Update rule to run.

    Returns
    -------
    tuple
        ``(data, w_z, z_d, idx2vals, vals2idx)``.

    Raises
    ------
    ValueError
        If ``algo`` is not one of the implemented update rules.
    """
    # Reject an unknown algorithm before any work is done or files are written.
    if algo not in ("cgs", "motion"):
        raise ValueError("Only cgs and motion are implemented (got %r)" % (algo,))

    #** 1. Define Internal Parameters
    # The caller's hyperparameters used to be overwritten unconditionally with
    # 0.5; they are now honoured and 0.5 is only the fallback.
    if alpha is None:
        alpha = 0.5
    if beta is None:
        beta = alpha

    #** 2. Random topics and dictionate
    data = np.asarray(data)
    if randomness > 0:
        data = addrandomcol(data, K, -1, randomness)
    if dict_:
        data, idx2vals, vals2idx, _ = dictionate(data)
    else:
        idx2vals = None
        vals2idx = None
    # Go through float first so values such as "3.0" still convert; the
    # builtin ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
    data = data.astype(float).astype(int)
    z_d = join2(data[:, [0, 2]])
    w_z = join2(data[:, [2, 1]])
    z_ = join2(data[:, [2]])
    if algo == "motion":
        # "motion" works on a per-row topic distribution instead of an index.
        data = [[row[0], row[1], toDistribution(row[2], K)] for row in data]

    #** 3. Inference
    if PATH != "":
        _save_lda_counts(PATH, algo, K, 0, w_z, z_d)
    for i in range(it):
        start = time.time()
        if algo == "cgs":
            data, z_d, w_z = sampling(data, z_d, w_z, z_, alpha, beta)
        else:
            data, z_d, w_z = motion(data, z_d, w_z, z_, alpha, beta)
        if PATH != "":
            _save_lda_counts(PATH, algo, K, i + 1, w_z, z_d)
        if verbose:
            print("Iteration %s took %s" % (i, time.time() - start))
    return data, w_z, z_d, idx2vals, vals2idx


def _save_lda_counts(PATH, algo, K, step, w_z, z_d):
    """Save both count tables for iteration ``step`` under the ``PATH`` prefix."""
    suffix = algo + "_" + str(K) + "_" + str(step)
    np.save(PATH + "w_z_lda" + suffix, w_z)
    np.save(PATH + "z_d_lda" + suffix, z_d)
def __init__(self, data, alpha, beta):
    """Dictionate ``data`` and start from empty count statistics.

    ``alpha`` and ``beta`` are stored unchanged as hyperparameters. All
    count tables are allocated with zero topic columns because no topic
    exists yet.
    """
    #** Preprocess the data
    encoded, idx2vals, _unused_vals2idx, counts = dictionate(data)
    self.data = encoded  # dictionated data
    self.counts = counts

    n_v = len(idx2vals[0])
    n_w = len(idx2vals[1])
    self.V = n_v  # Total number of observed variables in V
    self.W = n_w  # Total number of observed variables in W

    self.alpha = alpha
    self.beta = beta

    # Global parameters
    self.currV = 0  # Current number of observed variables in V
    self.currW = 0  # Current number of observed variables in W
    self.Vs = set()  # Set of Vs
    self.Ws = set()  # Set of Ws
    self.K = 0  # Current number of existing K

    # Count tables; the second axis has length K == 0 at this point.
    self.nvk_ = np.zeros((n_v, 0))
    self.n_kw = np.zeros((n_w, 0))
    self.n_k_ = np.zeros(0)

    self.sum_N = 0
    self.P_new = alpha
def slda(data, K, it, a, alpha, beta, eta, dict_=True, verbose=True, randomness=1, compressed=False, batch=0, PATH="", form='standard', algo='cgs'):
    """Run supervised-LDA inference with the selected backend.

    Parameters
    ----------
    data : array
        Observation rows. Columns 0 and 1 are dictionated when ``dict_`` is
        true and column 3 is the topic column used for the count tables
        (column 2 is presumably the rating -- confirm with the backends).
    K : int
        Number of topics.
    it : int
        Number of inference iterations.
    a, alpha, beta, eta : float
        Hyperparameters forwarded unchanged to the backend.
    dict_ : bool
        When true ``dictionate(data, cols=[0, 1])`` is applied and its lookup
        tables are returned; otherwise both tables are ``None``.
    verbose, form
        Accepted for interface compatibility; not used by this function.
    randomness : number
        When ``> 0`` ``addrandomcol(data, K, 3, randomness)`` is applied first.
    compressed : bool
        Forwarded to ``preprocessData`` / ``preprocessData_old``.
    batch : int
        Rows per batch for "herongpu" and "cool"; ``0`` processes all rows at
        once. Ignored by the other backends.
    PATH : str
        Prefix for saved parameters; ``""`` disables saving.
    algo : {"cgs", "heron", "cgsgpu", "herongpu", "cool"}
        Inference backend.

    Returns
    -------
    tuple
        ``(data, D, W, idx2vals, vals2idx)``.

    Raises
    ------
    ValueError
        If ``algo`` is not a supported backend.
    """
    supported = ("cgs", "heron", "cgsgpu", "herongpu", "cool")
    if algo not in supported:
        raise ValueError("Inference method not supported: %r (expected one of: %s)"
                         % (algo, ", ".join(supported)))

    #** 1. Random topics and dictionate
    if randomness > 0:
        data = addrandomcol(data, K, 3, randomness)
    if dict_:
        data, idx2vals, vals2idx, _ = dictionate(data, cols=[0, 1])
    else:
        idx2vals = None
        vals2idx = None
    if algo == "cgs" or algo == "cgsgpu":
        dz = join2(data[:, [0, 3]])
        wz = join2(data[:, [1, 3]])
        z = join2(data[:, [3]])

    #** TODO: UPDATE rating range from 0-5 for all methods except cool and herongpu
    # NOTE(review): the data array returned by each backend is discarded and
    # ``data`` is returned instead (in the batched branches that is the
    # output of ``preprocessData``) -- confirm this is intended.

    #** 2. Inference
    if algo == "cgs":
        print("cgs ---------------------------------------------")
        print(data[:10])
        _, D, W = cgs(data, dz, wz, z, K, it, a, alpha, beta, eta, PATH)
    elif algo == "heron":
        print("heron ----------------------------------------------")
        herondata, D, W, Z = preprocessData_old(data, K, compressed)
        _, D, W, Z = fixedp(g, herondata, D, W, Z, K, a, alpha, beta, eta, PATH, maxiter=it)
    elif algo == "cgsgpu":
        print("cgs gpu ------------------------------------------")
        _, D, W, Z = SLDACGSGPU(data, wz, dz, z, K, it, a, alpha, beta, eta, PATH)
    elif algo == "herongpu":
        print("heron gpu ----------------------------------------")
        if batch > 0:  # and compressed
            data, pz, D, W, Z = preprocessData(data, K, compressed)
            batch = _slda_clamp_batch(batch, len(data))
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            print(data)

            def run_batch(f, t, W, D, Z):
                # One single-iteration pass over rows [f, t); nothing is saved here.
                return SLDAHERONGPU(data[f:t][:, [0, 1, 2, 3]], W, D, Z, pz[f:t],
                                    K, 1, a, alpha, beta, eta, PATH="")

            D, W, Z = _slda_batched_inference(run_batch, len(data), batch, D, W, Z,
                                              it, PATH, [algo, K, alpha, beta])
        else:
            herondata, pz, D, W, Z = preprocessData(data, K, compressed)
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            _, D, W, Z = SLDAHERONGPU(herondata[:, [0, 1, 2, 3]], W, D, Z, pz,
                                      K, it, a, alpha, beta, eta, PATH)
    else:  # algo == "cool"
        print("cool ----------------------------------------")
        if batch > 0:  # and compressed
            data, pz, D, W, Z = preprocessData(data, K, compressed)
            batch = _slda_clamp_batch(batch, len(data))
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            del pz  # not used by this backend

            def run_batch(f, t, W, D, Z):
                # The backend receives a copy of the batch rows.
                return SLDACOOLGPU(data[f:t].copy()[:, [0, 1, 2, 3]], W, D, Z,
                                   K, 1, a, alpha, beta, eta, PATH="")

            D, W, Z = _slda_batched_inference(run_batch, len(data), batch, D, W, Z,
                                              it, PATH, [algo, K, alpha, beta])
        else:
            # NOTE(review): ``preprocessData_old`` is unpacked into 4 values in
            # the "heron" branch but 5 here -- confirm which arity is right.
            herondata, pz, D, W, Z = preprocessData_old(data, K, compressed)
            del pz
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            # ``pz`` used to be passed here after being deleted, which raised
            # UnboundLocalError; the batched call above shows that
            # SLDACOOLGPU takes no ``pz`` argument.
            _, D, W, Z = SLDACOOLGPU(herondata[:, [0, 1, 2, 3]], W, D, Z,
                                     K, it, a, alpha, beta, eta, PATH)
    return data, D, W, idx2vals, vals2idx


def _slda_clamp_batch(batch, n_rows):
    """Return ``batch`` limited to ``n_rows``, reporting when it was reduced."""
    if batch > n_rows:
        print("Batch size= %s > len(data)= %s" % (batch, n_rows))
        return n_rows
    return batch


def _slda_batch_bounds(n_rows, batch):
    """Return ``(start, stop)`` pairs of ``batch`` rows each covering ``n_rows`` rows."""
    if n_rows <= 0 or batch <= 0:
        return []
    # The last stop may exceed n_rows; slicing truncates it.
    return [(start, start + batch) for start in range(0, n_rows, batch)]


def _slda_batched_inference(run_batch, n_rows, batch, D, W, Z, it, PATH, name_parts):
    """Run ``it`` iterations, summing per-batch results, and return ``(D, W, Z)``.

    ``run_batch(f, t, W, D, Z)`` must return a 4-tuple whose last three
    items are the partial D, W and Z for rows ``[f, t)``. Every fifth
    iteration W and D are saved under ``PATH`` unless ``PATH`` is empty.
    """
    bounds = _slda_batch_bounds(n_rows, batch)
    for i in range(it):
        print("Iteration %s ------------------------------------------" % (i,))
        fullD = np.zeros(np.shape(D), dtype=np.float32)
        fullW = np.zeros(np.shape(W), dtype=np.float32)
        fullZ = np.zeros(np.shape(Z), dtype=np.float32)
        for f, t in bounds:
            # Drop the returned data array at once so it can be freed.
            partD, partW, partZ = run_batch(f, t, W, D, Z)[1:]
            fullD += partD
            fullW += partW
            fullZ += partZ
        D, W, Z = fullD, fullW, fullZ
        if PATH != "" and (i + 1) % 5 == 0:
            suffix = "_".join(map(str, list(name_parts) + [i + 1]))
            np.save(PATH + "wz_slda" + suffix, W)
            np.save(PATH + "dz_slda" + suffix, D)
    return D, W, Z
# Warn when a batch size is combined with the "cgs" or "heron" model; this only
# prints a message, execution continues.
if (options.model=="cgs" and options.batch>0) or (options.model=="heron" and options.batch>0):
    print "CGS, neither heron support batches please try cool or herongpu."
#**3. Run Inference Algorithm
# Print a run summary; the two branches differ only in the initialization text.
if options.randomness:
    print "\nRunning ",options.inference,options.model,"["+str(options.K)+" topics] - Uniformly initialized\nHyperparameters: alpha:",options.alpha,"beta:",options.beta,"eta:",options.eta,"a:",options.a,"\nNumber of iterations:",options.iteration,"; number of tuples in a batch is ",options.batch,"\n"
else:
    print "\nRunning ",options.inference,options.model,"["+str(options.K)+" topics] - NO initialization\nHyperparameters: alpha:",options.alpha,"beta:",options.beta,"eta:",options.eta,"a:",options.a,"\nNumber of iterations:",options.iteration,"; number of tuples in a batch is ",options.batch,"\n"
print "The parameters are being saved at:",options.path,"\n"
if options.model == "LDA":
    import LDA.lda as lda
    # Load the data, dictionate columns 0 and 1, then split train/test with ratio 0.7.
    data=np.load(options.filename)
    data,_,_,_=dictionate(data,cols=[0,1])
    train,test=splitTrainTestRepeated(data,0.7)
    # NOTE(review): `it`, `path` and `test` are not used in the visible part of this branch.
    it=options.iteration
    path=options.path
    #** Initialize outside the method call for fair comparison between models
    train=addrandomcol(train,options.K,-1,1)
    # NOTE(review): `train` already received a random column above, yet
    # randomness=options.randomness is forwarded as well -- confirm the callee
    # does not initialize a second time.
    # NOTE(review): confirm that lda.lda accepts the `batch` and `compressed` keywords.
    data,dz,wz,idx2vals,vals2idx=lda.lda(train,options.K,options.iteration,options.alpha,options.beta,batch=options.batch,randomness=options.randomness,dict_=False,PATH=options.path,algo=options.inference,compressed=options.compression)
elif options.model == "RTM":
    import RTM.rtm as rtm
    # dd.npy is read from the directory that contains the input file.
    path,_=os.path.split(options.filename)
    dd=np.load(path+"/dd.npy")
def dt2b(dt2bdata, Ku, Kp, n, alpha, beta_row, beta_column, it, verbose=True):
    """Run dt2b inference on two-column data and return the count tables.

    Parameters
    ----------
    dt2bdata : array-like
        Two-column data; column 0 holds the row values and column 1 the
        column values.
    Ku, Kp : int
        Passed to ``addrandomtopic`` for the row-topic and the column-topic
        columns respectively (presumably the number of topics of each kind).
    n : int
        Number of top entries printed per topic when ``verbose`` is true.
    alpha, beta_row, beta_column
        Accepted for interface compatibility; currently not forwarded to
        ``dt2b_c.inference``.
    it : int
        Forwarded to ``dt2b_c.inference`` (presumably the iteration count).
    verbose : bool
        When true the topics and their interrelation table are printed.

    Returns
    -------
    tuple
        ``(columns_w_z, rows_w_z, joint, idx2vals, vals2idx)``.

    Raises
    ------
    ValueError
        If the input is not a 2-D array with exactly two columns.
    """
    pairs = np.asarray(dt2bdata)
    # Explicit check instead of ``assert`` so it survives ``python -O`` and
    # also rejects 1-D input cleanly.
    if pairs.ndim != 2 or pairs.shape[1] != 2:
        raise ValueError("dt2b expects data with exactly 2 columns, got shape %r"
                         % (pairs.shape,))

    # Working table of fixed-width strings: the two observed values followed
    # by two topic columns that ``addrandomtopic`` fills in.
    table = np.zeros((len(pairs), 4), dtype='|S20')
    for idx, row in enumerate(table):
        row[0] = pairs[idx][0]
        row[1] = pairs[idx][1]
    table = addrandomtopic(table, Ku, -2)
    table = addrandomtopic(table, Kp, -1)

    print("Processing the data ...")
    # Take the first three results by position: other call sites unpack four
    # values from ``dictionate`` while this one historically unpacked three.
    encoded = dictionate(table)
    table, idx2vals, vals2idx = encoded[0], encoded[1], encoded[2]
    # Builtin ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
    table = table.astype(int)

    print("Running the inference process ...")
    start = time.time()
    processed_data = dt2b_c.inference(table, it)
    print("Inference Took: %s seconds" % (time.time() - start,))

    # Count tables, computed once and shared by the report and the result.
    rows_w_z = join2(processed_data[:, [2, 0]])
    columns_w_z = join2(processed_data[:, [3, 1]])
    joint = join2(processed_data[:, [2, 3]])

    if verbose:
        print("Row Topics\n")
        _dt2b_print_topics(rows_w_z, n, idx2vals[0])
        print("------------------------------")
        print("Column Topics\n")
        _dt2b_print_topics(columns_w_z, n, idx2vals[1])
        print("------------------------------")
        print("Topic Interrelation\n Evidence of the relationship between the row-topics and column-topics\n")
        print(joint.astype(int))
        print("------------------------------")

    return columns_w_z, rows_w_z, joint, idx2vals, vals2idx


def _dt2b_print_topics(p_z, t, labels):
    """Print the ``t`` entries with the most evidence for every row of ``p_z``.

    ``labels`` maps an entry index back to its original value.
    """
    for idx, z in enumerate(p_z):
        print("Topic %s - evidence" % (idx,))
        # Indices of the t largest values, largest first.
        for entry in np.argsort(z)[::-1][:t]:
            print("%s %s" % (labels[entry], int(z[entry])))
        print("")