def topic_distribution(name = None, study = None, order = None, **options):
    '''Draw every sample's topic distribution P(z|d) as a stacked bar chart.

    Given a model p_z, p_w_z, p_d_z, the document's distribution is
    p(z|d) = normalized(p(d|z) * p(z)).

    name, study, **options -- forwarded unchanged to MicrobPLSA.open_model.
    order -- optional index used to select/reorder the columns (samples) of
             the P(z|d) matrix; it is also used as the x tick labels.

    Returns plt with the bars and the topic legend drawn on it.
    '''
    m = microbplsa.MicrobPLSA()
    m.open_model(name = name, study = study, **options) #get model from the results file

    #document's distribution, one row per topic and one column per sample
    p_z_d = m.model.document_topics()
    Z, N = p_z_d.shape
    if order is not None:
        p_z_d = p_z_d[:, order]

    n = np.arange(N)
    width = 25.0/float(N) #scale width of bars by number of samples
    colors = plt.cm.rainbow(np.linspace(0, 1, Z))

    #label every topic with its single best correlated metadata factor
    Lab = Labelling(m, ignore_continuous = False)
    Lab.metadata(non_labels = ['BarcodeSequence'])
    R = Lab.correlate()
    labels_r = Lab.assignlabels(R, num_labels = 1)
    labels, r = zip(*labels_r)
    labels = [l.replace('(', '\n(') for l in labels]

    #sort and organize labels and topics so they are always plotted in the same order
    labelsUnsorted = zipper(labels, range(0, Z))
    labelsUnsorted.sort()
    labels, Zrange = zip(*labelsUnsorted)
    Zrange = list(Zrange)

    #stack one bar series per topic; the running height is kept in its own
    #array so that the rows of p_z_d are never modified in place
    p = [] #list of plots
    height = np.zeros(p_z_d.shape[1])
    for color, z in zip(colors, Zrange):
        p.append(plt.bar(n, p_z_d[z, :], width, color=color, bottom=height, linewidth = 0))
        height = height + p_z_d[z, :]

    plt.ylabel('Probability P(z|d)')
    plt.xlabel('Sample')
    plt.title('Sample\'s topic distribution')

    #pair each sorted label with its own topic index, which stays correct
    #even when two topics were assigned the same label
    topiclegend = ['Topic' + str(z+1) + ': ' + l + '\n (' + str(r[z]) + ')'
                   for l, z in zip(labels, Zrange)]

    fontP = FontProperties()
    if N > 60:
        fontP.set_size('xx-small')
    else:
        fontP.set_size('small')

    ax = plt.subplot(111)
    ratio = float(N)*0.5
    ax.set_aspect(ratio)
    ax.tick_params(axis = 'x', colors='w') #remove tick labels by setting them the same color as background
    #narrow the axes to leave room for the legend on the right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, 0.5, box.height])
    if order is not None:
        plt.xticks(n, order, size = 'xx-small')

    columns = 2 if Z > 12 else 1
    plt.legend(p, topiclegend, prop = fontP, title = 'Topic Label',
               loc='center left', bbox_to_anchor=(1, 0.5), ncol = columns)
    return plt
def makePCA(datafile, num_components):
    '''Scatter-plot the samples in PCA space, colored by their primary topic.

    datafile -- location of the OTU abundance data, passed to
                MicrobPLSA.open_data.
    num_components -- number of principal components to compute; samples
                are only drawn when it is 2 or 3.

    Relies on the module-level names resultfile, study and MANUAL_LABELS.
    Shows the figure with plt.show() and returns None.
    '''
    m = microbplsa.MicrobPLSA()
    #use the datafile argument rather than a differently spelled global name
    m.open_data(datafile = datafile) #get data of OTU abundances per sample
    X = m.datamatrix.T
    plsa = m.open_model(modelFile = resultfile) #get model from the results file
    p_d_z = plsa.p_d_z #document's distribution
    N, Z = p_d_z.shape

    #get topic labels
    if MANUAL_LABELS:
        labels = MANUAL_LABELS
    else:
        Lab = Labelling(study, Z, ignore_continuous = False)
        Lab.metadata(non_labels = ['BarcodeSequence'])
        R = Lab.correlate()
        labels_r = Lab.assignlabels(R, num_labels = 1)
        labels, r = zip(*labels_r)
        labels = [l.replace('(', '\n(') for l in labels]

    #get primary topic per sample: the column with the largest value in each row
    topics = np.array([np.argmax(row) for row in p_d_z])

    #a single fit is enough; the fitted model is reused for the transform
    pca = PCA(n_components=num_components, whiten = True)
    X_r = pca.fit(X).transform(X)

    # Percentage of variance explained for each components
    print('Explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))

    #initiate plot and colors
    colors = plt.cm.rainbow(np.linspace(0, 1, Z))
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    #NOTE(review): the axes always use a 3d projection, even when only two
    #components are plotted
    ax = plt.subplot(111, projection ='3d')

    for c, i, l in zip(colors, range(0, Z), labels):
        members = topics == i #samples whose primary topic is i
        if num_components == 2:
            ax.plot(X_r[members, 0], X_r[members, 1], 'o', color=c, label=l)
        elif num_components == 3:
            ax.plot(X_r[members, 0], X_r[members, 1], X_r[members, 2], 'o', color=c, label=l)

    fontP = FontProperties()
    columns = 2 if Z > 12 else 1
    #narrow the axes to leave room for the legend on the right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, 0.5, box.height])
    plt.legend(prop = fontP, loc='center left', bbox_to_anchor=(1, 0.5), ncol = columns)
    #report the number of topics read from the model (Z)
    plt.title('PCA of Study %s with Z=%s' %(study, str(Z)))
    plt.show()
    return None
'''
Created on 22/01/2014

author: sperez8

Shows how to use Labelling class
'''
import sys

from labelling import Labelling

study = '1526'
simple = False

#integer code for each categorical value expected in metadata column 7
transf = {'DRY':1, "SAFE":2, "DIPPING":3, "UNDER_ISH":4}

for Z in range(2,39):
    Lab = Labelling(study, Z, debug = False, ignore_continuous = False, adjusted_metadata = True)
    if simple:
        labels = Lab.getlabels()
    else:
        Lab.metadata(non_labels = ['BarcodeSequence'])
        m = list(Lab.metadatamatrix[:,7])
        #raises KeyError if the column holds a value missing from transf
        M = [transf[n] for n in m]
        print(M)
        if 5 in M:
            print("DDJDJD ")
            sys.exit()
        #NOTE(review): debugging stop. The script ends here on the first
        #pass, so the correlation, label assignment and save_labels calls
        #below are never reached while this exit is in place.
        sys.exit()
        R = Lab.correlate()
        labels = Lab.assignlabels(R,num_labels = 8)
    Lab.save_labels(labels)
# Locations of the fitted model (one results file per study and topic
# count) and of the study's OTU table.
f = ''.join(['/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_',
             study, '_', str(z), '_topics_.txt'])
datafile = ''.join(['/Users/sperez/Documents/PLSAfun/EMPL data/study_', study,
                    '_split_library_seqs_and_mapping/study_', study,
                    '_closed_reference_otu_table.biom'])

# Load the model and read the document's distribution from it.
m = microbplsa.MicrobPLSA()
plsa = m.open_model(f)
p_z_d = plsa.document_topics()
Z, N = p_z_d.shape

# Get labels!
Lab = Labelling(study, Z, ignore_continuous=False, adjusted_metadata=True)
# NOTE: this rebinds z, which was used above to build the model file name.
x, y, z = Lab.metadata(non_labels=[])
print(y)
R = Lab.correlate()
labels_r = Lab.assignlabels(R, num_labels=1)
print(labels_r)
oldlabels, r = zip(*labels_r)

# Collect the labels whose correlation magnitude exceeds the threshold.
# The loop rebinds r to one correlation value at a time.
goodlabels = []
for lab, r in labels_r:
    if abs(r) > CORRELATION_THRESHOLD:
        goodlabels.append(lab)
print("Only %i/%i passed the correlation threshold of %1.1f" % (len(goodlabels), len(oldlabels), CORRELATION_THRESHOLD))

# Apply the substitutions to every label, one after the other and in this
# exact order (' (' has to be handled before the single space).
labels = oldlabels
for old, new in [(' (', '_'), (' ', '_'), (')', ''), (':', '_'), ('.', '_'), ('-', '_')]:
    labels = [replace(text, old, new) for text in labels]
def makePCA(datafile, num_components):
    '''Scatter-plot the samples in PCA space, colored by their primary topic.

    datafile -- location of the OTU abundance data, passed to
                MicrobPLSA.open_data.
    num_components -- number of principal components to compute; samples
                are only drawn when it is 2 or 3.

    Relies on the module-level names resultfile, study and MANUAL_LABELS.
    Shows the figure with plt.show() and returns None.
    '''
    m = microbplsa.MicrobPLSA()
    #use the datafile argument rather than a differently spelled global name
    m.open_data(datafile=datafile)  #get data of OTU abundances per sample
    X = m.datamatrix.T
    plsa = m.open_model(modelFile=resultfile)  #get model from the results file
    p_d_z = plsa.p_d_z  #document's distribution
    N, Z = p_d_z.shape

    #get topic labels
    if MANUAL_LABELS:
        labels = MANUAL_LABELS
    else:
        Lab = Labelling(study, Z, ignore_continuous=False)
        Lab.metadata(non_labels=['BarcodeSequence'])
        R = Lab.correlate()
        labels_r = Lab.assignlabels(R, num_labels=1)
        labels, r = zip(*labels_r)
        labels = [l.replace('(', '\n(') for l in labels]

    #get primary topic per sample: the column with the largest value in each row
    topics = np.array([np.argmax(row) for row in p_d_z])

    #a single fit is enough; the fitted model is reused for the transform
    pca = PCA(n_components=num_components, whiten=True)
    X_r = pca.fit(X).transform(X)

    # Percentage of variance explained for each components
    print('Explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))

    #initiate plot and colors
    colors = plt.cm.rainbow(np.linspace(0, 1, Z))
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    #NOTE(review): the axes always use a 3d projection, even when only two
    #components are plotted
    ax = plt.subplot(111, projection='3d')

    for c, i, l in zip(colors, range(0, Z), labels):
        members = topics == i  #samples whose primary topic is i
        if num_components == 2:
            ax.plot(X_r[members, 0], X_r[members, 1], 'o', color=c, label=l)
        elif num_components == 3:
            ax.plot(X_r[members, 0], X_r[members, 1], X_r[members, 2], 'o', color=c, label=l)

    fontP = FontProperties()
    columns = 2 if Z > 12 else 1
    #narrow the axes to leave room for the legend on the right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, 0.5, box.height])
    plt.legend(prop=fontP, loc='center left', bbox_to_anchor=(1, 0.5), ncol=columns)
    #report the number of topics read from the model (Z)
    plt.title('PCA of Study %s with Z=%s' % (study, str(Z)))
    plt.show()
    return None
# Run settings.
Z = 8
only_continuous = True
format = 'pdf'

# The results file name is assembled from a study prefix, Z and a suffix.
f = '/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_' + study + '_'
end = '_topics_.txt'
datafile = ''.join([f, str(Z), end])

# Label the topics from the study metadata.
Lab = Labelling(study, Z)
metatable, factor_types, factors = Lab.metadata()
R = Lab.correlate()
labels_r = Lab.assignlabels(R)
labels, r = zip(*labels_r)
# Start a new line before every opening parenthesis in a label.
labels = [text.replace('(', '\n(') for text in labels]

# Load the model from the results file and read the document's distribution.
m = microbplsa.MicrobPLSA()
plsa = m.open_model(datafile)
p_z_d = plsa.document_topics()

# Second column of the metadata table, as a plain list.
colorlabel = list(metatable[:, 1])

if format == 'svg':
    import matplotlib
# A label is kept when the magnitude of its correlation exceeds this value.
CORRELATION_THRESHOLD = 0.0
pcoordfile = _root_dir + '/D3/pcplots/topics.js'

# Locations of the fitted model (one results file per study and topic
# count) and of the study's OTU table.
f = ''.join(['/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_',
             study, '_', str(z), '_topics_.txt'])
datafile = ''.join(['/Users/sperez/Documents/PLSAfun/EMPL data/study_', study,
                    '_split_library_seqs_and_mapping/study_', study,
                    '_closed_reference_otu_table.biom'])

# Load the model and read the document's distribution from it.
m = microbplsa.MicrobPLSA()
plsa = m.open_model(f)
p_z_d = plsa.document_topics()
Z, N = p_z_d.shape

# Get labels!
Lab = Labelling(study, Z, ignore_continuous = False, adjusted_metadata = True)
# NOTE: this rebinds z, which was used above to build the model file name.
x, y, z = Lab.metadata(non_labels = [])
print(y)
R = Lab.correlate()
labels_r = Lab.assignlabels(R, num_labels = 1)
print(labels_r)
oldlabels, r = zip(*labels_r)

# Collect the labels whose correlation magnitude exceeds the threshold.
# The loop rebinds r to one correlation value at a time.
goodlabels = []
for lab, r in labels_r:
    if abs(r) > CORRELATION_THRESHOLD:
        goodlabels.append(lab)
print("Only %i/%i passed the correlation threshold of %1.1f"%(len(goodlabels), len(oldlabels), CORRELATION_THRESHOLD))

# Apply the substitutions to every label, one after the other and in this
# exact order (' (' has to be handled before the single space).
labels = oldlabels
for old, new in [(' (', '_'), (' ', '_'), (')', ''), (':', '_'), ('.', '_'), ('-', '_')]:
    labels = [replace(text, old, new) for text in labels]

# First column of the metadata matrix.
samplenames = Lab.metadatamatrix[:,0]
study = '1526' Z = 8 only_continuous = True f = '/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_' + study + '_' end = '_topics_.txt' datafile = f + str(Z) + end format = 'pdf' Lab = Labelling(study, Z) metatable, factor_types, factors = Lab.metadata() R = Lab.correlate() labels_r = Lab.assignlabels(R) labels, r = zip(*labels_r) labels = [l.replace('(', '\n(') for l in labels] m = microbplsa.MicrobPLSA() plsa = m.open_model(datafile) #get model from the results file #return document's distribution p_z_d = plsa.document_topics() colorlabel = list(metatable[:, 1]) if format == 'svg': import matplotlib matplotlib.use('SVG') else: