Exemplo n.º 1
0
def topic_distribution(name = None, study = None, order = None, **options):
    '''Plot each sample's topic distribution P(z|d) as a stacked bar chart.

    Given a model p_z, p_w_z, p_d_z, the document distribution is
    p(z|d) = normalized(p(d|z)*p(z)); it is read from the opened model
    through ``document_topics()``.

    Arguments:
    name, study, options -- passed unchanged to ``MicrobPLSA.open_model``.
    order -- optional column selection applied to p(z|d) (``p_z_d[:,order]``);
             when given it is also used as the x tick labels.

    Returns the ``plt`` module after drawing on the current figure.
    '''

    m = microbplsa.MicrobPLSA()
    m.open_model(name = name, study = study, **options) #get model from the results file
    #return document's distribution
    p_z_d = m.model.document_topics()

    if order is not None:
        p_z_d = p_z_d[:,order]
    #read the shape after reordering so the bar positions always match the
    #number of columns that are actually plotted
    Z,N = p_z_d.shape #number of topics, number of samples
    n = np.arange(N)
    width = 25.0/float(N) #scale width of bars by number of samples
    p = [] #list of plots
    colors = plt.cm.rainbow(np.linspace(0, 1, Z))

    Lab = Labelling(m, ignore_continuous = False)
    Lab.metadata(non_labels = ['BarcodeSequence'])
    R = Lab.correlate()
    labels_r = Lab.assignlabels(R,num_labels = 1)
    labels, r = zip(*labels_r)
    labels = [l.replace('(','\n(') for l in labels]

    #sort and organize labels and topics so they are always plotted in the same order
    labelsUnsorted = zipper(labels,range(0,Z))
    labelsUnsorted.sort()
    labels, Zrange = zip(*labelsUnsorted)
    Zrange = list(Zrange)

    #stack one bar series per topic; the running height is its own array and
    #is rebound (not updated in place) so it never writes into p_z_d
    height = np.zeros(N)
    for color, z in zip(colors, Zrange):
        p.append(plt.bar(n, p_z_d[z,:], width, color=color, bottom=height, linewidth = 0))
        height = height + p_z_d[z,:]

    plt.ylabel('Probability P(z|d)')
    plt.xlabel('Sample')
    plt.title('Sample\'s topic distribution')

    #pair every label with its own topic index; a labels.index() lookup would
    #return the first match whenever two topics share the same label
    topiclegend = ['Topic' + str(z+1) + ': '+ l + '\n ('+ str(r[z]) + ')' for l, z in zip(labels, Zrange)]
    fontP = FontProperties()
    if N > 60:
        fontP.set_size('xx-small')
    else:
        fontP.set_size('small')
    ax = plt.subplot(111)
    ratio = float(N)*0.5
    ax.set_aspect(ratio)
    ax.tick_params(axis = 'x', colors='w') #remove tick labels by setting them the same color as background
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, 0.5, box.height]) #narrow the axes to leave room for the legend

    if order is not None:
        plt.xticks(n, order, size = 'xx-small')
    if Z > 12:
        columns = 2
    else:
        columns = 1
    plt.legend(p, topiclegend, prop = fontP, title = 'Topic Label', loc='center left', bbox_to_anchor=(1, 0.5), ncol = columns)
    return plt
Exemplo n.º 2
0
def makePCA(datafile, num_components):
    '''Show a PCA projection of the samples, coloured by their primary topic.

    Arguments:
    datafile -- passed to ``MicrobPLSA.open_data``; the transposed data
                matrix is the input of the PCA.
    num_components -- number of principal components to compute; points are
                      only drawn when it is 2 or 3.

    Also reads the module-level names ``resultfile``, ``study`` and
    ``MANUAL_LABELS``. Displays the figure with ``plt.show()`` and returns None.
    '''
    m = microbplsa.MicrobPLSA()
    #use the argument: the original read an undefined name 'dataFile'
    m.open_data(datafile = datafile) #get data of OTU abundances per sample
    X = m.datamatrix.T

    plsa = m.open_model(modelFile = resultfile) #get model from the results file
    #return document's distribution
    p_d_z = plsa.p_d_z
    N,Z = p_d_z.shape

    #get topic labels
    if MANUAL_LABELS:
        labels = MANUAL_LABELS
    else:
        Lab = Labelling(study, Z, ignore_continuous = False)
        Lab.metadata(non_labels = ['BarcodeSequence'])
        R = Lab.correlate()
        labels_r = Lab.assignlabels(R,num_labels = 1)
        labels, r = zip(*labels_r)
        labels = [l.replace('(','\n(') for l in labels]

    #get primary topic per sample (index of the largest entry of each row)
    topics = np.array([np.argmax(row) for row in p_d_z])

    #fit once and project the same data
    pca = PCA(n_components=num_components, whiten = True)
    X_r = pca.fit(X).transform(X)

    # Percentage of variance explained for each components
    print('Explained variance ratio (first two components): %s'
          % str(pca.explained_variance_ratio_))

    #initiate plot and colors
    colors = plt.cm.rainbow(np.linspace(0, 1, Z))
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    #NOTE(review): the axes use a 3d projection even when num_components == 2
    #- confirm this is intended
    ax = plt.subplot(111, projection ='3d')
    if num_components in (2, 3):
        for c, i, l in zip(colors, range(0,Z), labels):
            selected = topics == i
            #one coordinate array per component, followed by the marker style
            plot_args = [X_r[selected, k] for k in range(num_components)] + ['o']
            ax.plot(*plot_args, color=c, label=l)
    fontP = FontProperties()
    if Z > 12:
        columns = 2
    else:
        columns = 1
    #narrow the axes to leave room for the legend on the right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, 0.5, box.height])
    plt.legend(prop = fontP, loc='center left', bbox_to_anchor=(1, 0.5), ncol = columns)
    #report the number of topics of the loaded model (local Z, not a global z)
    plt.title('PCA of Study %s with Z=%s' %(study, str(Z)))

    plt.show()
    return None
Exemplo n.º 3
0
'''
Created on 22/01/2014

author: sperez8

Shows how to use Labelling class
'''

from labelling import Labelling

# Example driver: build a Labelling object for every topic count from 2 to 38
# and save the labels it produces.
study = '1526'
simple = False
for Z in range(2, 39):
    Lab = Labelling(study, Z, debug=False, ignore_continuous=False,
                    adjusted_metadata=True)
    if not simple:
        Lab.metadata(non_labels=['BarcodeSequence'])
        # Map the string values of metadata column 7 to integer codes.
        m = list(Lab.metadatamatrix[:, 7])
        transf = {'DRY': 1, "SAFE": 2, "DIPPING": 3, "UNDER_ISH": 4}
        M = [transf[n] for n in m]
        print(M)
        if 5 in M:
            print("DDJDJD        ")
        # Debugging stop: the script ends here on the first iteration, so the
        # statements below it in this branch are currently never reached.
        import sys
        sys.exit()
        R = Lab.correlate()
        labels = Lab.assignlabels(R, num_labels=8)
    else:
        labels = Lab.getlabels()

    Lab.save_labels(labels)
Exemplo n.º 4
0
# Location of the saved topic model and of the study's OTU table.
f = ('/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_' + study +
     '_' + str(z) + '_topics_.txt')
datafile = ('/Users/sperez/Documents/PLSAfun/EMPL data/study_' + study +
            '_split_library_seqs_and_mapping/study_' + study +
            '_closed_reference_otu_table.biom')

# Load the model and read the shape of its document/topic distribution.
m = microbplsa.MicrobPLSA()
plsa = m.open_model(f)
p_z_d = plsa.document_topics()
Z, N = p_z_d.shape

# Correlate topics with the metadata to obtain one label per topic.
Lab = Labelling(study, Z, ignore_continuous=False, adjusted_metadata=True)
# NOTE(review): this rebinds ``z``, which was used above to build ``f``.
x, y, z = Lab.metadata(non_labels=[])
print(y)
R = Lab.correlate()
labels_r = Lab.assignlabels(R, num_labels=1)
print(labels_r)
oldlabels, r = zip(*labels_r)

# Keep the labels whose correlation lies outside +/- CORRELATION_THRESHOLD.
# NOTE(review): the loop variable rebinds ``r`` from the tuple unpacked above.
goodlabels = []
for lab, r in labels_r:
    if r < -CORRELATION_THRESHOLD or r > CORRELATION_THRESHOLD:
        goodlabels.append(lab)
print("Only %i/%i passed the correlation threshold of %1.1f" %
      (len(goodlabels), len(oldlabels), CORRELATION_THRESHOLD))

# Replace spaces, parentheses and punctuation in the labels, one substitution
# at a time and in this order. ``replace`` comes from outside this view
# (presumably string.replace - confirm).
labels = list(oldlabels)
for old, new in ((' (', '_'), (' ', '_'), (')', ''), (':', '_'), ('.', '_'),
                 ('-', '_')):
    labels = [replace(l, old, new) for l in labels]
Exemplo n.º 5
0
def makePCA(datafile, num_components):
    '''Show a PCA projection of the samples, coloured by their primary topic.

    Arguments:
    datafile -- passed to ``MicrobPLSA.open_data``; the transposed data
                matrix is the input of the PCA.
    num_components -- number of principal components to compute; points are
                      only drawn when it is 2 or 3.

    Also reads the module-level names ``resultfile``, ``study`` and
    ``MANUAL_LABELS``. Displays the figure with ``plt.show()`` and returns None.
    '''
    m = microbplsa.MicrobPLSA()
    #use the argument: the original read an undefined name 'dataFile'
    m.open_data(datafile=datafile)  #get data of OTU abundances per sample
    X = m.datamatrix.T

    plsa = m.open_model(modelFile=resultfile)  #get model from the results file
    #return document's distribution
    p_d_z = plsa.p_d_z
    N, Z = p_d_z.shape

    #get topic labels
    if MANUAL_LABELS:
        labels = MANUAL_LABELS
    else:
        Lab = Labelling(study, Z, ignore_continuous=False)
        Lab.metadata(non_labels=['BarcodeSequence'])
        R = Lab.correlate()
        labels_r = Lab.assignlabels(R, num_labels=1)
        labels, r = zip(*labels_r)
        labels = [l.replace('(', '\n(') for l in labels]

    #get primary topic per sample (index of the largest entry of each row)
    topics = np.array([np.argmax(row) for row in p_d_z])

    #fit once and project the same data
    pca = PCA(n_components=num_components, whiten=True)
    X_r = pca.fit(X).transform(X)

    # Percentage of variance explained for each components
    print('Explained variance ratio (first two components): %s' %
          str(pca.explained_variance_ratio_))

    #initiate plot and colors
    colors = plt.cm.rainbow(np.linspace(0, 1, Z))
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    #NOTE(review): the axes use a 3d projection even when num_components == 2
    #- confirm this is intended
    ax = plt.subplot(111, projection='3d')
    if num_components in (2, 3):
        for c, i, l in zip(colors, range(0, Z), labels):
            selected = topics == i
            #one coordinate array per component, followed by the marker style
            plot_args = [X_r[selected, k] for k in range(num_components)]
            plot_args.append('o')
            ax.plot(*plot_args, color=c, label=l)
    fontP = FontProperties()
    if Z > 12:
        columns = 2
    else:
        columns = 1
    #narrow the axes to leave room for the legend on the right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, 0.5, box.height])
    plt.legend(prop=fontP,
               loc='center left',
               bbox_to_anchor=(1, 0.5),
               ncol=columns)
    #report the number of topics of the loaded model (local Z, not a global z)
    plt.title('PCA of Study %s with Z=%s' % (study, str(Z)))

    plt.show()
    return None
Exemplo n.º 6
0
# Number of topics of the model to load.
Z = 8
only_continuous = True

# The model file name is assembled as <prefix><Z><suffix>.
f = '/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_' + study + '_'
end = '_topics_.txt'
datafile = ''.join([f, str(Z), end])

format = 'pdf'

# Correlate topics with the metadata and unpack the (label, r) pairs.
Lab = Labelling(study, Z)
metatable, factor_types, factors = Lab.metadata()
R = Lab.correlate()
labels_r = Lab.assignlabels(R)
labels, r = zip(*labels_r)
# Put the parenthesised part of every label on its own line.
labels = [l.replace('(', '\n(') for l in labels]

# Load the model from the results file and get the document distribution.
m = microbplsa.MicrobPLSA()
plsa = m.open_model(datafile)
p_z_d = plsa.document_topics()

# Column 1 of the metadata table, as a plain list.
colorlabel = list(metatable[:, 1])

if format == 'svg':
    import matplotlib
CORRELATION_THRESHOLD = 0.0
pcoordfile = _root_dir + '/D3/pcplots/topics.js'

# Location of the saved topic model and of the study's OTU table.
f = ('/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_' + study +
     '_' + str(z) + '_topics_.txt')
datafile = ('/Users/sperez/Documents/PLSAfun/EMPL data/study_' + study +
            '_split_library_seqs_and_mapping/study_' + study +
            '_closed_reference_otu_table.biom')

# Load the model and read the shape of its document/topic distribution.
m = microbplsa.MicrobPLSA()
plsa = m.open_model(f)
p_z_d = plsa.document_topics()
Z, N = p_z_d.shape

# Correlate topics with the metadata to obtain one label per topic.
Lab = Labelling(study, Z, ignore_continuous=False, adjusted_metadata=True)
# NOTE(review): this rebinds ``z``, which was used above to build ``f``.
x, y, z = Lab.metadata(non_labels=[])
print(y)
R = Lab.correlate()
labels_r = Lab.assignlabels(R, num_labels=1)
print(labels_r)
oldlabels, r = zip(*labels_r)

# Keep the labels whose correlation lies outside +/- CORRELATION_THRESHOLD.
# NOTE(review): the loop variable rebinds ``r`` from the tuple unpacked above.
goodlabels = []
for lab, r in labels_r:
    if r < -CORRELATION_THRESHOLD or r > CORRELATION_THRESHOLD:
        goodlabels.append(lab)
print("Only %i/%i passed the correlation threshold of %1.1f" %
      (len(goodlabels), len(oldlabels), CORRELATION_THRESHOLD))

# Replace spaces, parentheses and punctuation in the labels, one substitution
# at a time and in this order. ``replace`` comes from outside this view
# (presumably string.replace - confirm).
labels = list(oldlabels)
for old, new in ((' (', '_'), (' ', '_'), (')', ''), (':', '_'), ('.', '_'),
                 ('-', '_')):
    labels = [replace(l, old, new) for l in labels]

# First column of the metadata matrix.
samplenames = Lab.metadatamatrix[:, 0]
Exemplo n.º 8
0
# Study identifier and number of topics of the model to load.
study = '1526'
Z = 8
only_continuous = True

# The model file name is assembled as <prefix><Z><suffix>.
f = ''.join(
    ['/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_', study, '_'])
end = '_topics_.txt'
datafile = ''.join([f, str(Z), end])

format = 'pdf'

# Correlate topics with the metadata and unpack the (label, r) pairs.
Lab = Labelling(study, Z)
metatable, factor_types, factors = Lab.metadata()
R = Lab.correlate()
labels_r = Lab.assignlabels(R)
labels, r = zip(*labels_r)
# Put the parenthesised part of every label on its own line.
labels = [l.replace('(', '\n(') for l in labels]

# Load the model from the results file and get the document distribution.
m = microbplsa.MicrobPLSA()
plsa = m.open_model(datafile)
p_z_d = plsa.document_topics()

# Column 1 of the metadata table, as a plain list.
colorlabel = list(metatable[:, 1])

if format == 'svg':
    import matplotlib
    matplotlib.use('SVG')
else: