def initial_values(p, k, m):
    """Creates a list of length m with initial values.

    K-means generates cluster assignments, and groupwise maximum
    likelihood estimates serve as the initial Dirichlet parameter values.
    """
    scaler = preprocessing.StandardScaler()  # center and scale the data
    p_scaled = scaler.fit_transform(pd.DataFrame(p))
    cat = numpy.shape(p)[1]
    index = 0
    inits = []
    while index < m:
        kmeans = KMeans(n_clusters=k, n_init=5)
        z_init = kmeans.fit_predict(p_scaled) + 1
        rho_mles = numpy.zeros((k, cat))
        try:
            for l in numpy.arange(1, k + 1):
                rho_mles[l - 1, :] = dirichlet.mle(p[z_init == l, ])
        except Exception:
            # The MLE can fail to converge on degenerate clusters;
            # fall back to random parameter values.
            for l in numpy.arange(1, k + 1):
                rho_mles[l - 1, :] = numpy.random.uniform(low=0.1, high=5, size=1)
        alpha_init = numpy.random.uniform(1, 2, 1)
        beta_init = numpy.random.uniform(0.1, 0.5, 1)
        inits_m = [z_init, rho_mles, alpha_init, beta_init]
        inits.append(inits_m)
        index += 1
    return inits
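# A minimal usage sketch for initial_values, assuming the imports the
# snippet relies on (numpy, pandas, scikit-learn, and the `dirichlet`
# package). The two-component mixture data below is synthetic and
# purely illustrative.
import numpy
import pandas as pd
import dirichlet
from sklearn import preprocessing
from sklearn.cluster import KMeans

rng = numpy.random.default_rng(0)
# Two Dirichlet components on the 3-simplex, stacked into one dataset.
p = numpy.vstack([rng.dirichlet([8, 2, 2], 100),
                  rng.dirichlet([2, 2, 8], 100)])
inits = initial_values(p, k=2, m=3)
z_init, rho_mles, alpha_init, beta_init = inits[0]
print(rho_mles)  # one row of Dirichlet MLEs per cluster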
def computeNIPS(dimensions):
    dsname, data, features = getNips()

    lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(data)
    mixt = lda.transform(data)

    def normalize(data):
        # Nudge values off the simplex boundary, then renormalize rows.
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    print(data.shape)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = [[0, 1]] * mixt.shape[1]
    print(domains)

    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=100)
        return spn

    stats = Stats(name=dsname)

    for train, test, i in kfolded(mixt, 10):
        print(i)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

    stats.save("results/nips/" + dsname + "-" + str(dimensions) + ".json")
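# The Dirichlet baseline used inside computeNIPS, isolated as a runnable
# sketch on synthetic data: fit dirichlet.mle on a train split and score a
# test split with scipy.stats.dirichlet.logpdf, which expects samples as
# columns (hence the transpose). The alpha values here are illustrative.
import numpy
import scipy.stats
import dirichlet

numpy.random.seed(0)
mixt = numpy.random.dirichlet([2.0, 1.0, 0.5], size=300)
train, test = mixt[:200], mixt[200:]
alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=alphas)
print(numpy.sum(ll), numpy.mean(ll))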
def fit(self, segments):
    """
    Fits the model to given chroma segments.

    :param segments: AnnotatedChromaSegment list
    """
    for k in self.kinds:
        chroma = self.preprocess(segments.chromas[segments.kinds == k])
        self.alphas[k] = dirichlet.mle(chroma)
def train_dirichlet(qual_obs):
    # Fit one Dirichlet per observation group, then generate all 12
    # circular shifts of the fitted parameters.
    alphas = np.array([dirichlet.mle(obs) for obs in qual_obs])
    alphas = np.concatenate([circshift(alphas, 0, r) for r in range(12)], axis=0)
    logl = [np.array([loglikelihood(obs, a) for a in alphas]).T for obs in qual_obs]
    y_pred = np.concatenate([logl[n].argmax(axis=1) for n in range(13)])
    y_true = np.concatenate([np.array([n] * len(logl[n])) for n in range(13)])
    print(score(y_true, y_pred))
def randomize_dirichlet(X):
    # Additive smoothing to avoid numerical problems at the simplex boundary.
    M = X.shape[1]
    # eps = np.spacing(1)
    eps = 0.01 / M
    X = (X + eps) / (1 + M * eps)
    alpha = dirichlet.mle(X)
    N = X.shape[0]
    X = np.random.dirichlet(alpha, size=N)
    return X
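# A minimal sketch of the fit-then-resample idea above: the randomized
# matrix keeps the column means of the original compositions while
# destroying row identity. Assumes numpy and the `dirichlet` package;
# the alpha values are illustrative.
import numpy as np
import dirichlet

X = np.random.dirichlet([5, 3, 2], size=500)
X_rand = randomize_dirichlet(X)
print(X.mean(axis=0))       # close to [0.5, 0.3, 0.2]
print(X_rand.mean(axis=0))  # similar means, but rows are fresh draws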
def _update_psi(self):
    new_psi = np.zeros((self.n_topics, self.n_eng_modes), dtype=np.float64)
    for k in range(self.n_topics):
        if np.sum(self.ndz_[:, k]) > 0.0:
            # Weighted MLE: note this relies on an mle() variant that
            # accepts a `weights` argument.
            new_psi[k, :] = dir.mle(self.Y, weights=self.ndz_[:, k])
        else:
            new_psi[k, :] = self.eta
    self._psi_coefs = gamma(np.sum(new_psi, 1)) / np.prod(gamma(new_psi), 1)
    self.psi_ = new_psi
def _etaUpdate(self, dataE):
    """
    Use standard MLE estimation of eta from the Dirichlet distribution.
    The observation is dataE for each word with a word-level topic.
    """
    dataE_smoothed = probNormalize(dataE + SMOOTH_FACTOR)
    eta_est = np.zeros([self.K, self.E])
    for k in range(self.K):
        obs = np.repeat(dataE_smoothed, self.TI[:, k].tolist(), axis=0)
        eta_est[k] = dirichlet.mle(obs)
    return eta_est
def fit(self, segments):
    """
    Fits the model to given chroma segments.

    :param segments: AnnotatedChromaSegment list
    """
    in_chroma_sums = dict()
    for k in self.kinds:
        chroma = self.preprocess(segments.chromas[segments.kinds == k])
        partition = [self.inDegreeDict[k], self.outDegreeDict[k]]
        in_chroma_sums[k] = amalgamate(partition, chroma).transpose()[0]
        in_chroma_composition = subcomposition(
            [[e] for e in self.inDegreeDict[k]], chroma).astype('float64')
        self.dirichlets[k] = dirichlet.mle(in_chroma_composition)
        out_chroma_composition = subcomposition(
            [[e] for e in self.outDegreeDict[k]], chroma).astype('float64')
        self.residualDirichletAlphas[k] = dirichlet.mle(out_chroma_composition)
    all_chords = np.concatenate(list(in_chroma_sums.values()))
    self.betaParams = beta.fit(all_chords, floc=0, fscale=1)
def _etaUpdate(self):
    """
    Use standard MLE estimation of eta from the Dirichlet distribution.
    The observation is dataE for each word with a word-level topic.
    """
    dataE_smoothed = np.zeros([self.D, self.E])
    for d in self.dataE_smoothed:
        dataE_smoothed[d] = self.dataE_smoothed[d]
    eta_est = np.zeros([self.K, self.E])
    for k in range(self.K):
        obs = np.repeat(dataE_smoothed, self.TI[:, k].tolist(), axis=0)
        eta_est[k] = dirichlet.mle(obs)
    return eta_est
def estimate_dirichlet_par(x_train):
    # Input to dirichlet.mle is an N*K numpy array:
    # N = train_samples, K = dimension_of_each_input.
    par = {}
    alpha = 0.001
    for i in x_train:
        x_train[i] = x_train[i] * 1.0
        x_train[i] += alpha  # additive smoothing
        # Note: rows are normalized by their L2 norm here; the commented
        # line below shows the alternative sum (simplex) normalization.
        x = np.linalg.norm(x_train[i], axis=1, keepdims=True)
        x_train[i] = x_train[i] / x
        # x_train[i] = np.transpose(np.transpose(x_train[i]) / (np.sum(x_train[i], axis=1)))
        par[i] = dirichlet.mle(x_train[i])
    return par
def getHydrochemLL():
    dsname, data, features = getHydrochem()
    print(data)
    print(data.shape)

    featureTypes = ["continuous"] * data.shape[1]
    domains = [[0, 1]] * data.shape[1]
    print(domains)

    families = ['piecewise'] * data.shape[1]
    #families = ['histogram'] * data.shape[1]

    #@memory.cache
    def learn(data, families, mininst, alpha, th):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=th),
                                 domains=domains, alpha=alpha,
                                 families=families,
                                 min_instances_slice=mininst)
        return spn

    stats = Stats(name=dsname)
    alll = []

    for train, test, i in kfolded(data, 5):
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=100000)
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        spn = learn(train, families, 10, 0.1, 0.1)
        ll = spn.root.eval(test)
        alll.append(numpy.mean(ll))
        stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

    print(numpy.mean(alll))
    stats.save("results/hydrochems/" + dsname + ".json")
def dirichlet_fit(sampled_probas, method='fixedpoint'):
    """
    Input:
        sampled_probas: tensor of shape (S #samples, B #examples, D #class_dim)
    Output:
        das: matrix of shape (B #examples, D #class_dim) where each row holds
        the alphas of a Dirichlet distribution fitted to the sampled_probas
        for that example
    """
    das = np.zeros((sampled_probas.shape[1], sampled_probas.shape[2]))
    for example_idx in range(sampled_probas.shape[1]):
        curr_samples = sampled_probas[:, example_idx]
        # Pass the method by keyword: mle's second positional argument is tol.
        alphas = dirichlet.mle(curr_samples, method=method)
        das[example_idx] = alphas
    return das
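# A small usage sketch for dirichlet_fit, assuming numpy and the
# `dirichlet` package. S Monte Carlo probability samples are drawn for
# each of B examples; the fit summarizes them as one Dirichlet per example.
# All shapes and alpha values here are illustrative.
import numpy as np
import dirichlet

S, B, D = 200, 4, 3
rng = np.random.default_rng(0)
sampled_probas = np.stack(
    [rng.dirichlet([6, 3, 1], size=S) for _ in range(B)], axis=1)  # (S, B, D)
das = dirichlet_fit(sampled_probas, method='meanprecision')
print(das.shape)  # (4, 3): one alpha vector per example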
def dir_ave_MLE(n):
    # Compute the empirical average of the Dirichlet and CC MLEs
    # for Dirichlet-generated data.
    dir_means = []
    CC_means = []
    for i in range(trials):
        dat = np.random.dirichlet(alpha_true, n)
        # The CC MLE is just the empirical mean.
        CC_means.append(dat.mean(axis=0))
        try:
            alpha_hat = dirichlet.mle(dat)
            mean = alpha_hat / sum(alpha_hat)
            dir_means.append(mean)
        except Exception:
            print("WARNING: failed to converge")
    CC_means = np.array(CC_means)
    dir_means = np.array(dir_means)
    CC_mean = CC_means.mean(axis=0)
    dir_mean = dir_means.mean(axis=0)
    return CC_mean, dir_mean
def CC_ave_MLE(n):
    # Compute the empirical average of the Dirichlet and CC MLEs
    # for CC-generated data.
    dir_means = []
    CC_means = []
    lam = lam_true.repeat(n).reshape(K, n).transpose()
    for i in range(trials):
        dat = sample_mcb_naive_ordered(lam=lam)
        # The CC MLE is just the empirical mean.
        CC_means.append(dat.mean(axis=0))
        try:
            alpha_hat = dirichlet.mle(dat)
            mean = alpha_hat / sum(alpha_hat)
            dir_means.append(mean)
        except Exception:
            print("WARNING: failed to converge")
    CC_means = np.array(CC_means)
    dir_means = np.array(dir_means)
    CC_mean = CC_means.mean(axis=0)
    dir_mean = dir_means.mean(axis=0)
    return CC_mean, dir_mean
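# A hedged driver for dir_ave_MLE above. `trials` and `alpha_true` are
# module-level globals in the original; the values here are illustrative
# assumptions. (CC_ave_MLE additionally needs lam_true, K, and
# sample_mcb_naive_ordered, which are not shown in these snippets.)
import numpy as np
import dirichlet

trials = 100
alpha_true = np.array([4.0, 2.0, 1.0])
CC_mean, dir_mean = dir_ave_MLE(n=50)
print("CC mean:       ", CC_mean)
print("Dirichlet mean:", dir_mean)  # both should approach alpha_true / sum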
def estimateParameter(self):
    try:
        self.alpha = diri.mle(self.particles)
    except Exception:
        # Keep the previous alpha if the MLE fails to converge.
        pass
    return self.alpha
def randomize_dirichlet(X):
    alpha = dirichlet.mle(X)
    N = X.shape[0]
    X = np.random.dirichlet(alpha, size=N)
    return X
print(y_train)

# Row-normalize the class-0 sentences and fit a Dirichlet to them.
t = []  # initialization implied by the original (mirrors v below)
for sentence in range(len(X_train)):
    if y_train[sentence] == 0:
        total = 0  # renamed from `sum` to avoid shadowing the builtin
        for word in range(len(X_train[sentence])):
            total = total + X_train[sentence][word]
        t.append(X_train[sentence] / total)
# print(len(t))

b = numpy.asarray(t)
# print(b.shape)
a0 = dirichlet.mle(b)
print(a0)

# Row-normalize the class-1 sentences the same way.
v = []
# print(X_train)
for sentence in range(len(X_train)):
    if y_train[sentence] == 1:
        total = 0
        for word in range(len(X_train[sentence])):
            total = total + X_train[sentence][word]
        v.append(X_train[sentence] / total)
# print(len(t))
def test_mle(self, method):
    a0_fit = dirichlet.mle(self.D0, method=method)
    logl0_fit = dirichlet.loglikelihood(self.D0, a0_fit)
    assert norm(self.a0 - a0_fit) / norm(self.a0) < 0.1
    assert abs((logl0_fit - self.logl0) / logl0_fit) < 0.01
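# A standalone version of the recovery check above, assuming numpy,
# numpy.linalg.norm, and the `dirichlet` package: sample from a known
# Dirichlet and verify the MLE lands near the true parameters. The a0
# values and sample size are illustrative.
import numpy as np
from numpy.linalg import norm
import dirichlet

a0 = np.array([10.0, 5.0, 3.0])
D0 = np.random.dirichlet(a0, size=2000)
a0_fit = dirichlet.mle(D0, method='meanprecision')
assert norm(a0 - a0_fit) / norm(a0) < 0.1
print(a0_fit)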
def getAirPollution(dimensions):
    dsname, data, features = getAirQualityUCITimeless()

    # Drop columns with too many missing values (-200), then rows with any.
    idxmissing = data == -200
    data = data[:, numpy.sum(idxmissing, 0) < 2000]
    idxmissing = data == -200
    data = data[numpy.sum(idxmissing, 1) == 0, :]
    print(data.shape)

    _, mixt = getArchetypes(data, dimensions)
    if mixt is None:
        print("no archetypes", dimensions)
        return

    def normalize(data):
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    print(data.shape)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = [[0, 1]] * mixt.shape[1]
    print(domains)

    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 # min_instances_slice=int(data.shape[0] * 0.01),
                                 min_instances_slice=200)
        return spn

    stats = Stats(name=dsname)

    for train, test, i in kfolded(mixt, 10):
        print(i)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

    stats.save("results/airpollution/" + dsname + "-" + str(dimensions) + ".json")
def test6(data):
    print(data.shape)
    _, mixt = getArchetypes(data, 3)

    def normalize(data):
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd

    mixt = normalize(mixt)
    dirichlet_alphas = dirichlet.mle(mixt, method='meanprecision', maxiter=100000)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])
        domains.append(domain)
    print(domains)

    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=1,
                                 min_instances_slice=50)
        return spn

    spn = learn(mixt)
    print(spn)

    # Start from a NaN-filled array; spn.root.sample fills in the values.
    spn_samples = numpy.zeros((data.shape[0], 3)) / 0
    a, spn_samples = spn.root.sample(spn_samples)
    spn_samples = normalize(spn_samples)

    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)),
                                              alpha=dirichlet_alphas)
        return result

    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res

    xy_all = cartesian(mixt)

    filename = 'plots/dirichlet_mle.pdf'
    try:
        import os
        os.remove(filename)
    except OSError:
        pass
    pp = PdfPages(filename)

    # Dirichlet density with the original points.
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)

    # Dirichlet density with points sampled from the fitted Dirichlet.
    numpy.random.seed(17)
    mixt_samples = numpy.random.dirichlet(dirichlet_alphas, data.shape[0])
    print(dirichlet_alphas)
    xy_samples = cartesian(mixt_samples)
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, sampled points")
    plt.plot(xy_samples[:, 0], xy_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)

    # SPN density with the original points.
    xy_spn_samples = cartesian(spn_samples)
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)

    # SPN density with points sampled from the SPN.
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, sampled points")
    plt.plot(xy_spn_samples[:, 0], xy_spn_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)

    pp.close()
def computeSimplexExperiment(dsname, data, dimensions, mixttype,
                             min_instances_slice=700):
    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50., random_state=0)
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1, 1, 1), 20).transpose()
        print(mixt)
        0 / 0  # intentional halt: this branch is a debugging stub

    print(mixt.shape)

    def normalize(data):
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)

    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)
    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])
        domains.append(domain)

    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    #@memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=min_instances_slice)
        return spn

    # For the good pdf it was 700.
    spn = learn(mixt_train)
    print(spn)

    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res

    print(dirichlet_alphas)

    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        try:
            result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)),
                                                  alpha=dirichlet_alphas)
        except Exception:
            # Debugging output kept from the original, followed by a halt.
            print(normalize(data))
            print(normalize(data) * 1.0)
            print(normalize(data) + 1)
            print(normalize(data) + 0)
            0 / 0
        return result

    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()

    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train),
                                              alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test),
                                             alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit

    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit

    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)

        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        try:
            import os
            os.remove(filename)
        except OSError:
            pass
        pp = PdfPages(filename)
        markersize = 1.0

        # Dirichlet, train points
        fig = plt.figure()
        plt.title("Dirichlet, train points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        fig.tight_layout()
        pp.savefig(fig)

        # Dirichlet, test points
        fig = plt.figure()
        plt.title("Dirichlet, test points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        fig.tight_layout()
        pp.savefig(fig)

        # SPN, train points
        fig = plt.figure()
        plt.title("SPN, train points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        fig.tight_layout()
        pp.savefig(fig)

        # SPN, test points
        fig = plt.figure()
        plt.title("SPN, test points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        fig.tight_layout()
        pp.savefig(fig)

        pp.close()

    return ("name", dsname, "size", data.shape, "type", mixttype,
            "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit),
            "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit),
            "dir_test_LL", numpy.mean(dtest_fit),
            "spn_#_sum_nodes", spn.n_sum_nodes(),
            "spn_#_prod_nodes", spn.n_prod_nodes(),
            "spn_#_layers", spn.n_layers())
def Dirich(vocab_length):
    #%% IMPORTING LIBRARIES AND FUNCTIONS
    import numpy as np
    import scipy.io
    from Feature_Extraction import Feature_Extractor
    from dirichlet import mle
    from dirichlet import loglikelihood

    fast_run = 0

    #%% EXTRACTING FEATURES FROM DATA
    if fast_run == 0:
        Folder_Name = './20_news_small/'
        # Folder_Name = './20_newsgroups/'
        [x_train, x_test, y_train, y_test, x, y,
         train_score, test_score] = Feature_Extractor(Folder_Name, vocab_length)

        #%% SAVING THE VARIABLES TO AVOID LONG EXTRACTION TIMES
        scipy.io.savemat('Extracted_Features/file_feat_x_train.mat',
                         mdict={'x_train': x_train})
        scipy.io.savemat('Extracted_Features/file_feat_lab_y_train.mat',
                         mdict={'y_train': y_train})
        scipy.io.savemat('Extracted_Features/file_feat_x_test.mat',
                         mdict={'x_test': x_test})
        scipy.io.savemat('Extracted_Features/file_feat_lab_y_test.mat',
                         mdict={'y_test': y_test})
        scipy.io.savemat('Extracted_Features/file_feat_lab_train_score.mat',
                         mdict={'train_score': train_score})
        scipy.io.savemat('Extracted_Features/file_feat_lab_test_score.mat',
                         mdict={'test_score': test_score})

    #%% LOADING .mat FILES
    temp1 = scipy.io.loadmat('Extracted_Features/file_feat_x_test.mat')
    temp2 = scipy.io.loadmat('Extracted_Features/file_feat_lab_y_test.mat')
    temp3 = scipy.io.loadmat('Extracted_Features/file_feat_x_train.mat')
    temp4 = scipy.io.loadmat('Extracted_Features/file_feat_lab_y_train.mat')
    temp5 = scipy.io.loadmat('Extracted_Features/file_feat_lab_train_score.mat')
    temp6 = scipy.io.loadmat('Extracted_Features/file_feat_lab_test_score.mat')
    x_test = temp1["x_test"]
    y_test = temp2["y_test"]
    x_train = temp3["x_train"]
    y_train = temp4["y_train"]
    train_score = temp5["train_score"]
    test_score = temp6["test_score"]
    del temp1, temp2, temp3, temp4, temp5, temp6

    #%% PARAMETERS FOR LOOPING LATER
    [nmbr_of_files, vocab_length] = np.shape(x_train)
    unique_classes = np.unique(y_train)
    nmbr_of_classes = len(unique_classes)

    #%% NORMALISING TRAINING DATA
    normalising_factor = np.sum(x_train, axis=1)  # count of all words per document
    eta = 1.1625  # Laplace-style smoothing constant
    x_train = x_train + eta
    x_train = x_train / (normalising_factor[:, None] + eta * vocab_length)

    #%% COMPUTING OPTIMAL ALPHAS FOR EACH CLASS (MLE)
    alpha = np.zeros((nmbr_of_classes, vocab_length))
    for i in range(0, nmbr_of_classes):
        alpha[i][:] = mle(x_train[y_train[0][:] == i][:], tol=1e-7,
                          method='meanprecision', maxiter=100000)

    #%% RUNNING CLASSIFIER
    # Normalise the input test samples with the same smoothing.
    [nmbr_of_files, vocab_length] = np.shape(x_test)
    sample_normalising_factor = np.sum(x_test, axis=1)
    x_test = x_test + eta
    x_test = x_test / (sample_normalising_factor[:, None] + eta * vocab_length)

    # Classify each test sample by the class whose Dirichlet yields the
    # highest log-likelihood.
    y_pred = np.zeros((1, nmbr_of_files))
    likelihoods = np.zeros((nmbr_of_classes, 1))
    for i in range(0, nmbr_of_files):
        for j in range(0, nmbr_of_classes):
            likelihoods[j][:] = loglikelihood(x_test[i][:], alpha[j][:])
        y_pred[0][i] = np.argmax(likelihoods)

    print("Classifying testing samples \n")

    #%% TESTING RESULTS
    diffrnce = y_pred - y_test
    diffrnce[diffrnce != 0] = 1
    incorrect = sum(diffrnce[0][:])
    accuracy = (1 - (incorrect / nmbr_of_files)) * 100

    return accuracy, test_score, y_pred, y_test
import numpy as np
import pandas as pd
import dirichlet
import matplotlib.pyplot as plt

bodyshop = pd.read_csv("data/autoshop_ratings.csv", header=0)
bodyshop['total'] = bodyshop.iloc[:, 1:5].sum(axis=1)
bodyshop = bodyshop.loc[bodyshop['total'] > 0]
bodyshop['metric'] = (bodyshop['five'] * 5 + bodyshop['four'] * 4
                      + bodyshop['three'] * 3 + bodyshop['two'] * 2
                      + bodyshop['one']) / bodyshop['total']
bodyshop = bodyshop.sort_values(by='total', ascending=False)

K = 5
ITERATION = 50

# Sanity check: fit the MLE on synthetic data drawn from a known Dirichlet.
a0 = np.array([100, 299, 100])
D0 = np.random.dirichlet(a0, 1000)
print(dirichlet.mle(D0))
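# A hedged continuation of the sanity check above: with a0 known, the
# fitted alphas should roughly recover both the mean and the precision
# (sum of alphas) of the generating Dirichlet.
a0_fit = dirichlet.mle(D0)
print(a0 / a0.sum())           # true mean
print(a0_fit / a0_fit.sum())   # fitted mean, should be close
print(a0.sum(), a0_fit.sum())  # precisions should also be comparable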