Example #1
	def cluster(self,db,cr,pdm,dom_obj_map):
		# Compress a predicate database: each binary predicate's 0/1 incidence
		# matrix is factorized with BMF (uses the module-level random, numpy as np
		# and nimfa imports) and regenerated from the thresholded reconstruction.
		cf = common_f()
		atoms = db.pred_atoms
		newatoms = ''
		orig_meta_map = {}
		# Pick a random factorization rank and log it for debugging.
		r = random.randint(3,30)
		ifile = open('time.txt','a')
		ifile.write('BMFFFFF' + str(r))
		ifile.close()
		for p in atoms:
			# Predicates with fewer than two argument domains are copied through unchanged.
			if len(pdm[p]) < 2:
				for a in atoms[p]:
					d_name = 'd' + str(pdm[p][0]) + '_' + str(a[0])
					orig_meta_map[d_name] = d_name 
					newatoms += p + '(' + str(a[0]) + ')\n'
				continue
			dom1 = pdm[p][0]
			dom2 = pdm[p][1]

			# Build a (compressed) binary incidence matrix for predicate p:
			# rows index objects of dom1, columns index objects of dom2.
			compress_dom1 = int(len(dom_obj_map[dom1])*cr)
			compress_dom2 = int(len(dom_obj_map[dom2])*cr)
			bmf_matrix = [[0 for j in range(compress_dom2)] for i in range(compress_dom1)]
			for i,atom in enumerate(atoms[p]):
				obj1 = int(atom[0])
				obj2 = int(atom[1])
				if (obj1 < compress_dom1) and (obj2 < compress_dom2):
					bmf_matrix[obj1][obj2] = 1
			bmf_matrix = np.array(bmf_matrix)
			# Binary matrix factorization of the incidence matrix with the random rank r.
			bmf = nimfa.Bmf(bmf_matrix, seed="nndsvd", rank=r, max_iter=100, lambda_w=1.1, lambda_h=1.1)
			bmf_fit = None
			try:
				bmf_fit = bmf()
			except Exception:
				# Factorization can fail for some ranks; retry the whole pass with a new random rank.
				print('error',r)
				self.cluster(db,cr,pdm,dom_obj_map)
				return
			W = bmf_fit.basis()
			H = bmf_fit.coef()
			# Reconstruct the matrix and threshold at 0.5 to re-binarize it.
			T = np.dot(W,H)
			T = T.tolist()
			for i,x in enumerate(T):
				for j,y in enumerate(x):
					if T[i][j] > .5:
						T[i][j] = 1
					else:
						T[i][j] = 0
			bmf_matrix = bmf_matrix.tolist()
			# Emit an atom for every 1 in the thresholded reconstruction.
			for i,row in enumerate(T):
				for j,c in enumerate(row):
					d1_obj = 'd' + str(dom1) + '_' + str(i)
					d2_obj = 'd' + str(dom2) + '_' + str(j)
					orig_meta_map[d1_obj] = d1_obj
					orig_meta_map[d2_obj] = d2_obj
					if row[j] == 1:
						newatoms += p + '(' + str(i) + ',' + str(j) + ')\n'
		# Write the regenerated atom database and keep the object name map.
		ofile_name = self.bmf__cluster_db_file
		ofile = open(ofile_name,'w')
		ofile.write(newatoms)
		ofile.close()
		self.bmf_orig_meta_map = orig_meta_map
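
The core pattern in Example #1 (build a 0/1 incidence matrix, factorize it with nimfa.Bmf, then threshold the reconstruction W*H at 0.5) can be tried in isolation. Below is a minimal, self-contained sketch; the toy matrix and the rank are illustrative values, not taken from the original code.

import numpy as np
import nimfa

# Toy 0/1 incidence matrix (rows: objects of one domain, columns: objects of the other).
V = np.array([[1, 1, 0, 0],
              [1, 1, 0, 0],
              [0, 0, 1, 1],
              [0, 0, 1, 1]])

# Same solver settings as Example #1, but with a small fixed rank.
bmf = nimfa.Bmf(V, seed="nndsvd", rank=2, max_iter=100, lambda_w=1.1, lambda_h=1.1)
fit = bmf()

# Reconstruct and re-binarize by thresholding at 0.5, as the cluster() method does.
T = np.asarray(np.dot(fit.basis(), fit.coef()))
print((T > 0.5).astype(int))
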
Example #2
    def fit(self, V, nu=None):
        assert nu is None
        self.V = V
        self.lsnmf = nimfa.Bmf(self.V,
                               seed='random_vcol',
                               max_iter=self.max_iter,
                               rank=self.n_components)
        self.lsnmf_fit = self.lsnmf()
        self.W = self.lsnmf_fit.basis()
        self.H = self.lsnmf_fit.coef()
        return self
Example #3
def BMF(X, k):
    """Rank-k binary matrix factorization of X; returns nimfa's fitted estimate of X."""
    bmf = nimfa.Bmf(X, rank=k, max_iter=12, lambda_w=1.1, lambda_h=1.1)

    bmf_fit = bmf()

    # W=bmf.W
    # H=bmf.H
    # print(W.shape())
    # print(H.shape())
    print(bmf_fit)
    # break

    matrix_bmf = bmf_fit.fitted()
    return matrix_bmf
Example #4
def run_bmf(V):
    """
    Run binary matrix factorization.
    
    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    bmf = nimfa.Bmf(V,
                    seed="random_vcol",
                    rank=rank,
                    max_iter=12,
                    initialize_only=True,
                    lambda_w=1.1,
                    lambda_h=1.1)
    fit = bmf()
    print_info(fit)
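
Example #4 hands the result to a print_info helper that is not defined in the snippet. One possible shape for it, sketched as a hypothetical helper that only uses fit-object calls already shown elsewhere on this page (distance, basis, coef):

def print_info(fit):
    # Hypothetical helper, not part of the original example: report the fit
    # quality and the factor shapes exposed by the nimfa fit object.
    W = fit.basis()
    H = fit.coef()
    print('K-L divergence: %5.3f' % fit.distance(metric='kl'))
    print('Basis matrix W shape: %s, mixture matrix H shape: %s' % (W.shape, H.shape))

Example #5 below shows the richer summary measures (rss, evar, n_iter, cophenetic correlation) that such a helper could also report when factor tracking is enabled.
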
Example #5
import numpy as np

import nimfa

V = np.random.rand(23, 200)

# Factorization will be run 3 times (n_run) and factors will be tracked for computing
# the cophenetic correlation. Note the increased time and space complexity.
bmf = nimfa.Bmf(V, max_iter=10, rank=30, n_run=3, track_factor=True)
bmf_fit = bmf()

print('K-L divergence: %5.3f' % bmf_fit.distance(metric='kl'))

sm = bmf_fit.summary()
print('Rss: %5.3f' % sm['rss'])
print('Evar: %5.3f' % sm['evar'])
print('Iterations: %d' % sm['n_iter'])
print('Cophenetic correlation: %5.3f' % sm['cophenetic'])
Example #6
import numpy as np

import nimfa

V = np.random.rand(40, 100)
bmf = nimfa.Bmf(V, seed="nndsvd", rank=10, max_iter=12, lambda_w=1.1, lambda_h=1.1)
bmf_fit = bmf()
Example #7
import heapq

import lda
import nimfa
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity


def main(args):
    trainTripletsFile = open('txTripletsCounts.txt', 'r')
    testTripletsFile = open('testTriplets.txt', 'r')
    row = []
    col = []
    dat = []
    datBin = []
    # Use sparse matrices: dense 444075 x 444075 float arrays would need on the
    # order of a terabyte of memory each.
    trainMatrix = lil_matrix((444075, 444075))
    binMatrix = lil_matrix((444075, 444075))

    for line in trainTripletsFile:
        arr = line.split()
        row.append(int(float(arr[0])))
        col.append(int(float(arr[1])))
        dat.append(int(float(arr[2])))
        datBin.append(1)
        #Manually constructing the count and binary matrices
        trainMatrix[int(float(arr[0])), int(float(arr[1]))] = int(float(arr[2]))
        binMatrix[int(float(arr[0])), int(float(arr[1]))] = 1

    # Print the columns where row 0 of the binary matrix is nonzero
    for i in range(444075):
        if binMatrix[0, i] == 1:
            print(i)

    # Test file reading
    testRow = []
    testCol = []
    testDat = []
    for line in testTripletsFile:
        arr = line.split()
        testRow.append(int(float(arr[0])))
        testCol.append(int(float(arr[1])))
        testDat.append(int(float(arr[2])))

    # bag = []
    # prev = 0
    # count = 0;

    #adjacency representation
    # for line in trainTripletsFile:
    #     arr = line.split()
    #     if int(float(arr[0])) != prev:
    #         bag.append(docBag)
    #         docBag = []
    #         docBag.append(int(float(arr[1])))
    #         prev = int(float(arr[0]))
    #     else:
    #         if count == 0:
    #             docBag = []
    #             docBag.append(int(float(arr[1])))
    #             count = count + 1
    #         else:
    #             docBag.append(int(float(arr[1])))

    # ACount = csc_matrix((dat, (row, col)), shape=(444075, 444075)).todense()
    # ABin = csc_matrix((datBin, (row, col)), shape=(444075, 444075))
    # # #ut, s, vt = sparsesvd(ABin, 11)

    # #What the R code is doing.
    # # u = np.transpose(ut)
    # # v = np.transpose(vt)
    # # for i in range(numLinesinTest):
    # #     row = u[testRow[i]]
    # #     col = v[testCol[i]]
    # #     x = np.multiply(row, s)
    # #     p = np.multiply(x, col)

    ABinLDA = csr_matrix((datBin, (row, col)), shape=(444075, 444075))
    # print ABinLDA.shape
    ACountRow = csr_matrix((dat, (row, col)), shape=(444075, 444075))
    Test = csr_matrix((testDat, (testRow, testCol)), shape=(444075, 444075))

    #Performing LDA over a range of topic counts----------------------
    if args[0] == "-t":
        for topics in range(10, 51, 5):
            print(topics)
            model = lda.LDA(n_topics=topics, n_iter=100, random_state=1)
            model.fit(ACountRow)
            print(model.loglikelihood())

    #Performing LDA--------------------
    if args[0] == "-l":
        # x = lda.utils.matrix_to_lists(ACountRow)
        # print x[0].shape
        # print x[1].shape

        # model.fit(ACountRow)

        vocab = []
        for i in range(444075):
            vocab.append(i)

        # topic_word = model.topic_word_
        # print("type(topic_word): {}".format(type(topic_word)))
        # print("shape: {}".format(topic_word.shape))

        # Check if the sum across all vocab for a topic is ~1
        # for n in range(5):
        #     sum_pr = sum(topic_word[n,:])
        #     print("topic: {} sum: {}".format(n, sum_pr))

        # n = 15
        # for i, topic_dist in enumerate(topic_word):
        #     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
        #     # print topic_words

        # doc_topic = model.doc_topic_
        # # print("type(doc_topic): {}".format(type(doc_topic)))
        # print("shape: {}".format(doc_topic.shape))

        # for n in range(10):
        #     # print doc_topic[n]

        model = lda.LDA(n_topics=15, n_iter=100)
        model.fit(ACountRow)
        topic_word = model.topic_word_
        doc_topic = model.doc_topic_

        results = []

        # Predicted link score for each test pair: sum over topics of
        # doc_topic[test row] times topic_word[:, test column]
        for i, value in enumerate(testRow):
            sumC = 0
            for k in range(15):
                sumC += doc_topic[value][k] * topic_word[k][testCol[i]]
            results.append(sumC)

        print(results)
        #print model.loglikelihood()
        if len(results) != 10000:
            print("lol")

        # print("type(topic_word): {}".format(type(topic_word)))
        # print("shape: {}".format(topic_word.shape))

        # for i, topic_dist in enumerate(topic_word):
        #     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
        #     print topic_words

        # print("type(doc_topic): {}".format(type(doc_topic)))
        # print("shape: {}".format(doc_topic.shape))

        # for n in range(10):
        #   print doc_topic[n]

    #Choosing K in KMeans
    if args[0] == "-kC":
        K = [50]
        print("Fitting the data...")
        k_means_var = [KMeans(n_clusters=k).fit(ABinLDA) for k in K]
        print("Extracting the centroids...")
        inertias = [(X.inertia_ / 444075) for X in k_means_var]  #averaged
        print(inertias)

        # plt.plot(K, inertias)
        # plt.show()

    #Performing KMeans-------------------
    if args[0] == "-k":
        n_clusters = 35

        k_means = KMeans(n_clusters)
        k_means.fit(ABinLDA)
        labels = k_means.labels_
        centers = k_means.cluster_centers_

        # For each cluster centre, print the column with the largest mean interaction
        for row in centers:
            print(np.argmax(row))

        probInteraction = []

        for j in range(len(testRow)):
            label = labels[testRow[j]]
            meanInteraction = centers[label]
            if meanInteraction[testCol[j]] > 0.2:
                print(meanInteraction[testCol[j]])
            probInteraction.append(meanInteraction[testCol[j]])

        zeroProb = []
        oneProb = []
        #Let's partition the probabilities into 0 and 1 and make the violin plot
        for i in range(len(testDat)):
            if testDat[i] == 0:
                zeroProb.append(probInteraction[i])
            else:
                oneProb.append(probInteraction[i])

        #Function to draw the violin plots
        # groups = range(2)
        # a = np.array(zeroProb)
        # b = np.array(oneProb)
        # data = []

        # data.append(a)
        # data.append(b)
        # fig = pl.figure()
        # ax = fig.add_subplot(111)
        # violin_plot(ax,data,groups,bp=0)
        # pl.show()

        #pr(testDat, probInteraction)

    #NMF - used instead of factor analysis because we run out of memory
    #from sklearn.decomposition import ProjectedGradientNMF
    if args[0] == "-nmf":
        # nmf = ProjectedGradientNMF(n_components=1000, init='random', random_state=0, sparseness='data')
        # nmf.fit(ACountRow)
        # print "nmf components: "
        # print nmf.components_
        # print "nmf shape: " + str(nmf.components_.shape)
        # print "nmf reconstruction_err: " + str(nmf.reconstruction_err_)

        nmf = nimfa.Nmf(ACountRow)  #, max_iter=10, rank=2)#, update='euclidean', objective='fro')
        nmf_fit = nmf()

        #W = nmf_fit.basis()
        #print('Basis matrix:\n%s' % W.todense())

        # H = nmf_fit.coef()
        # print('Mixture matrix:\n%s' % H.todense())

        #print('Euclidean distance: %5.3f' % nmf_fit.distance(metric='euclidean'))

        # sm = nmf_fit.summary()
        # print('Sparseness Basis: %5.3f  Mixture: %5.3f' % (sm['sparseness'][0], sm['sparseness'][1]))
        # print('Iterations: %d' % sm['n_iter'])
        #print('Target estimate:\n%s' % np.dot(W.todense(), H.todense()))

    #BMF - binary matrix factorization on the binary interaction matrix
    if args[0] == "-bmf":
        bmf = nimfa.Bmf(ABinLDA)
        bmf_fit = bmf()

    #Performing cosine similarity-----------
    if args[0] == "-c":
        tfidf_transformer = TfidfTransformer()
        tfidf_matrix = tfidf_transformer.fit_transform(ACountRow)
        print(tfidf_matrix.shape)
        results = []
        # x = cosine_similarity(tfidf_matrix[30:31], tfidf_matrix)
        # max_thousand_index = np.argsort(x[0])[-26:][::-1]
        # max_thousand_index_new = max_thousand_index[1:]
        # max_thousand = heapq.nlargest(26, x[0])
        # new_max_thousand = max_thousand[1:]
        # print max_thousand_index
        # print max_thousand

        # For each test pair: take the 25 rows most similar to the test row by
        # TF-IDF cosine similarity, normalize their similarities, and sum the
        # weights of those neighbours that have a nonzero entry in the test column.
        for i, value in enumerate(testRow):
            # print value
            x = cosine_similarity(tfidf_matrix[value:value + 1], tfidf_matrix)
            max_thousand_index = np.argsort(x[0])[-26:][::-1]
            max_thousand_index_new = max_thousand_index[1:]
            max_thousand = heapq.nlargest(26, x[0])
            new_max_thousand = max_thousand[1:]
            max_thousand_norm = [
                float(v) / sum(new_max_thousand) for v in new_max_thousand
            ]
            sumPredict = 0
            for ind, cos_k in enumerate(max_thousand_index_new):
                if cos_k != value:
                    # print cos_k, testCol[int(i)]
                    if ABinLDA[cos_k, testCol[int(i)]] != 0:
                        sumPredict += max_thousand_norm[ind]

            results.append(sumPredict)

        #             results.append(ABinLDA[max_largest_index[1], testCol[int(i)]])
        bigCount = 0
        for l in range(10000):
            print(results[l])
            if results[l] > 0.5:
                bigCount += 1

        print(len(results))
        print(bigCount)
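
In the -l branch of Example #7, each test pair is scored by summing, over topics, the test row's document-topic weight times the test column's topic-word weight. The numpy-only sketch below isolates that scoring step; the toy doc_topic and topic_word matrices are made-up placeholders, not output of the real model.

import numpy as np

# Made-up toy distributions: 3 rows (documents), 2 topics, 4 columns (words).
doc_topic = np.array([[0.9, 0.1],
                      [0.2, 0.8],
                      [0.5, 0.5]])
topic_word = np.array([[0.4, 0.3, 0.2, 0.1],
                       [0.1, 0.1, 0.3, 0.5]])

testRow = [0, 1, 2]   # row index of each test pair
testCol = [2, 3, 0]   # column index of each test pair

# score = sum over topics k of doc_topic[row, k] * topic_word[k, col]
results = [float(doc_topic[r] @ topic_word[:, c]) for r, c in zip(testRow, testCol)]
print(results)  # first entry: 0.9*0.2 + 0.1*0.3 = 0.21
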