def cluster(self, db, cr, pdm, dom_obj_map):
    """Write a BMF-approximated copy of the predicate atoms in ``db``.

    For every predicate in ``db.pred_atoms``:

    * predicates with fewer than two domains in ``pdm`` are copied through
      unchanged as ``pred(obj)`` lines;
    * binary predicates are turned into a 0/1 matrix whose sides are the
      domain sizes scaled by ``cr``, factorized with ``nimfa.Bmf`` at a
      randomly drawn rank, and every cell of the reconstructed product
      that exceeds 0.5 is emitted as a ``pred(i,j)`` line.

    The lines are written to ``self.bmf__cluster_db_file`` and the identity
    map of all generated ``d<domain>_<index>`` names is stored in
    ``self.bmf_orig_meta_map``.

    If the factorization raises, the whole method is re-run (a new rank is
    drawn on each attempt) and this invocation returns without writing.

    :param db: object exposing ``pred_atoms`` (predicate -> list of atoms).
    :param cr: compression ratio applied to each domain size.
    :param pdm: predicate -> list of domain identifiers.
    :param dom_obj_map: domain identifier -> collection of its objects.
    """
    # NOTE(review): `cf` is never used below; the call is kept in case
    # constructing common_f has side effects -- confirm and remove.
    cf = common_f()
    atoms = db.pred_atoms
    atom_lines = []  # joined once at the end instead of quadratic `+=`
    orig_meta_map = {}
    r = random.randint(3, 30)
    with open('time.txt', 'a') as ifile:
        ifile.write('BMFFFFF' + str(r))

    for p in atoms:
        if len(pdm[p]) < 2:
            # Unary predicate: nothing to factorize, copy atoms through.
            for a in atoms[p]:
                d_name = 'd' + str(pdm[p][0]) + '_' + str(a[0])
                orig_meta_map[d_name] = d_name
                atom_lines.append(p + '(' + str(a[0]) + ')\n')
            continue

        dom1 = pdm[p][0]
        dom2 = pdm[p][1]
        compress_dom1 = int(len(dom_obj_map[dom1]) * cr)
        compress_dom2 = int(len(dom_obj_map[dom2]) * cr)

        bmf_matrix = np.zeros((compress_dom1, compress_dom2), dtype=int)
        for atom in atoms[p]:
            obj1 = int(atom[0])
            obj2 = int(atom[1])
            # Atoms that fall outside the compressed domains are dropped.
            if obj1 < compress_dom1 and obj2 < compress_dom2:
                bmf_matrix[obj1, obj2] = 1

        bmf = nimfa.Bmf(bmf_matrix, seed="nndsvd", rank=r, max_iter=100,
                        lambda_w=1.1, lambda_h=1.1)
        try:
            bmf_fit = bmf()
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit. Retry from scratch with a freshly drawn rank.
            print('error', r)
            self.cluster(db, cr, pdm, dom_obj_map)
            return

        W = bmf_fit.basis()
        H = bmf_fit.coef()
        T = np.dot(W, H).tolist()

        for i, row in enumerate(T):
            d1_obj = 'd' + str(dom1) + '_' + str(i)
            for j, value in enumerate(row):
                d2_obj = 'd' + str(dom2) + '_' + str(j)
                orig_meta_map[d1_obj] = d1_obj
                orig_meta_map[d2_obj] = d2_obj
                # Binarize the reconstruction: > 0.5 counts as a true atom.
                if value > .5:
                    atom_lines.append(p + '(' + str(i) + ',' + str(j) + ')\n')

    with open(self.bmf__cluster_db_file, 'w') as ofile:
        ofile.write(''.join(atom_lines))
    self.bmf_orig_meta_map = orig_meta_map
def fit(self, V, nu=None):
    """Factorize ``V`` with ``nimfa.Bmf`` and store the result on ``self``.

    ``nu`` is accepted for signature compatibility only and must be
    ``None``. The factorization object, its fit result and the basis and
    coefficient matrices are stored as ``lsnmf``, ``lsnmf_fit``, ``W`` and
    ``H``; the rank and iteration cap come from ``self.n_components`` and
    ``self.max_iter``.

    :returns: ``self``, so calls can be chained.
    """
    assert nu is None

    self.V = V
    factorizer = nimfa.Bmf(
        self.V,
        seed='random_vcol',
        max_iter=self.max_iter,
        rank=self.n_components,
    )
    self.lsnmf = factorizer

    outcome = factorizer()
    self.lsnmf_fit = outcome
    self.W = outcome.basis()
    self.H = outcome.coef()
    return self
def BMF(X, k):
    """Run ``nimfa.Bmf`` on ``X`` at rank ``k`` and return its fitted matrix.

    The factorization uses at most 12 iterations with ``lambda_w`` and
    ``lambda_h`` both set to 1.1. The fit object is printed before the
    result of its ``fitted()`` method is returned.
    """
    fit_result = nimfa.Bmf(
        X,
        rank=k,
        max_iter=12,
        lambda_w=1.1,
        lambda_h=1.1,
    )()
    print(fit_result)
    return fit_result.fitted()
def run_bmf(V):
    """
    Run binary matrix factorization at rank 10 and report the fit.

    The fit object is handed to ``print_info``; nothing is returned.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    options = dict(
        seed="random_vcol",
        rank=10,
        max_iter=12,
        initialize_only=True,
        lambda_w=1.1,
        lambda_h=1.1,
    )
    print_info(nimfa.Bmf(V, **options)())
import numpy as np
import nimfa

# Random 23 x 200 target matrix.
V = np.random.rand(23, 200)

# Three runs (n_run) with factor tracking enabled so that the cophenetic
# correlation can be computed afterwards; this costs extra time and memory.
bmf = nimfa.Bmf(
    V,
    max_iter=10,
    rank=30,
    n_run=3,
    track_factor=True,
)
bmf_fit = bmf()

print('K-L divergence: %5.3f' % bmf_fit.distance(metric='kl'))

# Remaining quality measures all come from the fit summary.
sm = bmf_fit.summary()
for template, key in (
    ('Rss: %5.3f', 'rss'),
    ('Evar: %5.3f', 'evar'),
    ('Iterations: %d', 'n_iter'),
    ('Cophenetic correlation: %5.3f', 'cophenetic'),
):
    print(template % sm[key])
import numpy as np
import nimfa

# Random 40 x 100 target matrix.
V = np.random.rand(40, 100)

# Rank-10 binary matrix factorization, NNDSVD seeding, at most 12 iterations.
bmf = nimfa.Bmf(
    V,
    seed="nndsvd",
    rank=10,
    max_iter=12,
    lambda_w=1.1,
    lambda_h=1.1,
)
bmf_fit = bmf()
def _read_triplets(path):
    """Parse a whitespace-separated ``row col value`` file into three int lists."""
    rows, cols, vals = [], [], []
    # split() already discards a trailing '\r', so plain 'r' yields the same
    # fields as the former 'rU' mode; `with` guarantees the file is closed.
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of raising IndexError
            rows.append(int(float(fields[0])))
            cols.append(int(float(fields[1])))
            vals.append(int(float(fields[2])))
    return rows, cols, vals


def main(args):
    """Load the train/test triplets and run the experiment chosen by ``args[0]``.

    Reads ``txTripletsCounts.txt`` (training) and ``testTriplets.txt`` (test),
    prints the columns that row 0 interacts with in the training data, builds
    sparse count and binary matrices, then dispatches on ``args[0]``:

    * ``-t``   fit LDA for 10..50 topics (step 5), print each log-likelihood
    * ``-l``   fit a 15-topic LDA and print a score for every test pair
    * ``-kC``  fit KMeans for each candidate k and print the averaged inertia
    * ``-k``   fit KMeans (35 clusters) and look up per-pair centre values
    * ``-nmf`` run ``nimfa.Nmf`` on the count matrix
    * ``-bmf`` run ``nimfa.Bmf`` on the binary matrix
    * ``-c``   TF-IDF cosine-similarity nearest-neighbour prediction

    Any other mode only performs the loading step.

    :param args: sequence whose first element is the mode flag.
    :raises IndexError: if ``args`` is empty.
    """
    mode = args[0]  # evaluated first so a missing flag fails before loading

    n_nodes = 444075
    shape = (n_nodes, n_nodes)

    row, col, dat = _read_triplets('txTripletsCounts.txt')
    datBin = [1] * len(row)
    testRow, testCol, testDat = _read_triplets('testTriplets.txt')

    # The original allocated two dense n_nodes x n_nodes float arrays
    # (roughly 1.5 TB each) just to list the partners of row 0; derive the
    # same ascending, de-duplicated listing from the triplets instead.
    for partner in sorted(set(c for r, c in zip(row, col) if r == 0)):
        print(partner)

    ABinLDA = csr_matrix((datBin, (row, col)), shape=shape)
    ACountRow = csr_matrix((dat, (row, col)), shape=shape)

    if mode == "-t":
        # LDA over a range of topic counts.
        for topics in range(10, 51, 5):
            print(topics)
            model = lda.LDA(n_topics=topics, n_iter=100, random_state=1)
            model.fit(ACountRow)
            print(model.loglikelihood())

    elif mode == "-l":
        # LDA: score(src, dst) = sum_k P(k | src) * P(dst | k).
        n_topics = 15
        model = lda.LDA(n_topics=n_topics, n_iter=100)
        model.fit(ACountRow)
        topic_word = model.topic_word_
        doc_topic = model.doc_topic_
        results = []
        for src, dst in zip(testRow, testCol):
            score = 0
            for k in range(n_topics):
                score += doc_topic[src][k] * topic_word[k][dst]
            results.append(score)
        print(results)
        if len(results) != 10000:
            print("lol")

    elif mode == "-kC":
        # Choosing K for KMeans.
        K = [50]
        print("Fitting the data...")
        k_means_var = [KMeans(n_clusters=k).fit(ABinLDA) for k in K]
        print("Extracting the centroids...")
        inertias = [(X.inertia_ / 444075) for X in k_means_var]  # averaged
        print(inertias)

    elif mode == "-k":
        n_clusters = 35
        k_means = KMeans(n_clusters)
        k_means.fit(ABinLDA)
        labels = k_means.labels_
        centers = k_means.cluster_centers_
        # Loop variable renamed so it no longer shadows the `row` index list.
        for center in centers:
            print(np.argmax(center))

        probInteraction = []
        for src, dst in zip(testRow, testCol):
            prob = centers[labels[src]][dst]
            if prob > 0.2:
                print(prob)
            probInteraction.append(prob)

        # Partition the probabilities by true label (input for a violin plot).
        zeroProb = []
        oneProb = []
        for truth, prob in zip(testDat, probInteraction):
            if truth == 0:
                zeroProb.append(prob)
            else:
                oneProb.append(prob)

    elif mode == "-nmf":
        # NMF - used instead of factor analysis because we run out of memory.
        nmf = nimfa.Nmf(ACountRow)
        nmf_fit = nmf()

    elif mode == "-bmf":
        bmf = nimfa.Bmf(ABinLDA)
        bmf_fit = bmf()

    elif mode == "-c":
        # Cosine similarity over TF-IDF rows: a test pair (src, dst) is scored
        # by the normalized similarity of src's nearest neighbours that
        # interact with dst.
        tfidf_transformer = TfidfTransformer()
        tfidf_matrix = tfidf_transformer.fit_transform(ACountRow)
        print(tfidf_matrix.shape)
        results = []
        for i, value in enumerate(testRow):
            x = cosine_similarity(tfidf_matrix[value:value + 1], tfidf_matrix)
            # Top 26 by similarity; the first entry is skipped (normally the
            # query row itself).
            neighbour_idx = np.argsort(x[0])[-26:][::-1][1:]
            neighbour_sim = heapq.nlargest(26, x[0])[1:]
            total = sum(neighbour_sim)
            # The original comprehension used `i` as its variable; in
            # Python 2 that leaks and overwrote the enumerate index, so the
            # column lookup below read testCol[int(<similarity>)] instead of
            # testCol[i].
            weights = [float(sim) / total for sim in neighbour_sim]
            target_col = testCol[i]
            sumPredict = 0
            for ind, cos_k in enumerate(neighbour_idx):
                if cos_k != value and ABinLDA[cos_k, target_col] != 0:
                    sumPredict += weights[ind]
            results.append(sumPredict)

        # Iterate over what was actually produced rather than a hard-coded
        # range(10000), which raised IndexError on shorter test files.
        bigCount = 0
        for score in results:
            print(score)
            if score > 0.5:
                bigCount += 1
        print(len(results))
        print(bigCount)