예제 #1
0
 def __init__(self, study, z):
     '''Store the study and topic number and load the matching PLSA model.

     study -- study identifier, forwarded to MicrobPLSA.open_model
     z     -- topic number, forwarded to MicrobPLSA.open_model

     The MicrobPLSA wrapper is kept on ``self.m`` and whatever
     ``open_model`` returns is kept on ``self.plsa``.
     '''
     self.study, self.z = study, z
     self.m = microbplsa.MicrobPLSA()
     self.plsa = self.m.open_model(study=study, z=z)
예제 #2
0
 def __init__(self, study=None, biomFile=None):
     '''Load a data matrix through MicrobPLSA and record its dimensions.

     study    -- study identifier, forwarded to MicrobPLSA.open_data
     biomFile -- data file, forwarded to open_data as ``dataFile``

     The loaded matrix is stored on ``self.rawData`` and its two
     dimensions on ``self.N`` and ``self.S`` (in shape order).
     '''
     loader = microbplsa.MicrobPLSA()
     loader.open_data(study=study, dataFile=biomFile)
     self.rawData = loader.datamatrix
     # Unpacking also enforces that the matrix is two-dimensional.
     n_rows, n_cols = self.rawData.shape
     self.N = n_rows
     self.S = n_cols
예제 #3
0
def _write_R_rows(path, rows):
    '''Write each row as: newline, one tab-prefixed field per value, newline.'''
    with open(path, 'w') as out:
        for row in rows:
            out.write('\n' + ''.join('\t' + str(value) for value in row) + '\n')


def convert_2_R(results_file):
    '''Convert a saved model's probability tables into tab-delimited files for R.

    results_file -- file name of the model inside the ``Results`` directory
                    that sits next to this script.

    Three files are written under ``Results/Results_for_R/``, named
    ``p_z_``, ``p_w_z_`` and ``p_z_d_`` followed by ``results_file``.  They
    hold ``plsa.p_z``, ``plsa.p_w_z`` and ``plsa.document_topics()``
    respectively.  Every row is preceded and followed by a newline and each
    value is preceded by a tab.

    Returns None.
    '''
    results_dir = os.path.dirname(os.path.realpath(__file__)) + "/Results/"
    m = microbplsa.MicrobPLSA()
    # Get the model from the results file.  (The original called the
    # non-existent os.path.oin here and failed with AttributeError.)
    plsa = m.open_model(modelFile=os.path.join(results_dir, results_file))
    p_z = plsa.p_z
    p_w_z = plsa.p_w_z
    # document's distribution
    p_z_d = plsa.document_topics()

    out_prefix = results_dir + 'Results_for_R/'

    # Each p_z entry is written whole as a single field, so wrap it in a
    # one-element row for the shared writer.
    _write_R_rows(out_prefix + 'p_z_' + results_file,
                  ([entry] for entry in p_z))
    _write_R_rows(out_prefix + 'p_w_z_' + results_file, p_w_z)
    _write_R_rows(out_prefix + 'p_z_d_' + results_file, p_z_d)

    return None
예제 #4
0
def train(k,
          kFolds,
          data,
          study,
          name,
          z,
          numRuns=1,
          seed='None',
          override=False,
          useC=True,
          folder=FOLDER):
    '''Fit one PLSA model per cross-validation fold on that fold's training columns.

    k       -- number of folds (used in the progress message and file suffix)
    kFolds  -- iterable of (trainSamples, testSamples) column selectors
    data    -- matrix indexed as data[:, columns]
    study, name -- copied onto each MicrobPLSA instance
    z       -- topic number, passed as both z_i and z_f to generate_runs
    numRuns, useC, override, folder -- forwarded to generate_runs
    seed    -- only used to build the text added to the result file name

    Returns None.
    '''
    for fold, (trainSamples, testSamples) in enumerate(kFolds, 1):
        trainData = data[:, trainSamples]
        # The held-out columns are sliced as in the original but are not
        # used during training.
        testData = data[:, testSamples]

        print('\nTraining dataset fold {0} of {1}'.format(fold, k))

        model = microbplsa.MicrobPLSA()
        model.study = study
        model.name = name
        model.datamatrix = trainData

        # Tag the result file with seed, fold count and fold index.
        suffix = '_cross_seed{0}_k{1}({2})'.format(str(seed), str(k), str(fold))

        model.generate_runs(z_i=z,
                            z_f=z,
                            numRuns=numRuns,
                            useC=useC,
                            override=override,
                            folder=folder,
                            add_to_file=suffix)
예제 #5
0
def main(*argv):
    '''Handle user input and run PLSA for a range of topic numbers.

    Options are parsed by argparse from ``sys.argv``; the ``argv``
    parameter is accepted but not used.

    -z/--topics takes one value (a single topic number) or two values (a
    range; its lower bound is raised to 2 if smaller).  At least one of
    -s/--study and -n/--name must be given.

    Exits with status 1, after printing a message and the help text, when
    neither a study nor a name is supplied or when more than two topic
    values are supplied.
    '''

    parser = argparse.ArgumentParser(description='This scripts runs plsa for a range of topic numbers.')
    parser.add_argument('-s', '--study', help='The study number', default=None)
    parser.add_argument('-n', '--name', help='The name of the dataset')
    parser.add_argument('-z', '--topics', help='The range of topics to be run [z_start, z_end]', nargs='+', type=int, required=True)
    parser.add_argument('-z_inc', '--increment', help='Increment of topic numbers', type=int, default=1)
    parser.add_argument('-useC', help='use C code to run plsa', action='store_true')
    parser.add_argument('-runs', '-numruns', help='Specify the number of runs', type=int, default=1)
    parser.add_argument('-override', help='Overrides a result file', action='store_true')
    parser.add_argument('-add', help='Add text to result file', default='')
    args = parser.parse_args()

    if args.study is None and args.name is None:
        print("***Study number or a data name must be specified.***\n")
        parser.print_help()
        # A usage error must not report success (status 0) to the caller.
        sys.exit(1)

    study = str(args.study) if args.study else None
    name = args.name

    # nargs='+' guarantees at least one value, so only the upper count
    # needs checking.
    if len(args.topics) > 2:
        print("\n***Too many arguments specified for number of topics***\n")
        parser.print_help()
        sys.exit(1)
    if len(args.topics) == 1:
        z_i = z_f = args.topics[0]
    else:
        # The lower bound of a range is never allowed below 2 topics.
        z_i = max(min(args.topics), 2)
        z_f = max(args.topics)

    z_inc = args.increment
    numRuns = args.runs
    useC = args.useC
    override = args.override
    add = args.add

    print("    Study: %s" % study)
    print("    Name: %s" % name)
    if z_i != z_f:
        print("    Topics will be run from {0} to {1} in increments of {2}".format(z_i, z_f, z_inc))
    else:
        print("    PLSA will be run with {0} topics".format(z_i))
    print("    Using C: %s" % useC)
    print("    Number of runs: %s" % numRuns)
    print("    Override result files: %s" % override)
    print("    Text to add to file: %s" % add)

    m = microbplsa.MicrobPLSA()
    m.open_data(study=study, name=name)
    m.generate_runs(z_i=z_i, z_f=z_f, z_inc=z_inc, numRuns=numRuns, useC=useC, override=override, add_to_file=add)
예제 #6
0
def makedendrogram(study=None, filename=None, showme=True):
    '''Cluster the columns of a data matrix and draw the resulting dendrogram.

    study, filename -- forwarded to MicrobPLSA.open_data (``filename`` as
                       ``dataFile``) to load the matrix
    showme          -- when true the figure is displayed with ``show()``;
                       otherwise the current figure is cleared

    The matrix is transposed before Euclidean pairwise distances are
    computed, so each column of the loaded matrix is one clustered item.
    Branches are coloured using a threshold of 0.7 times the largest value
    in the third column of the linkage result.

    Returns the ``'ivl'`` entry of the dendrogram result (the leaf labels
    in plotted order, assuming SciPy's ``dendrogram``).
    '''
    loader = microbplsa.MicrobPLSA()
    loader.open_data(study=study, dataFile=filename)
    observations = loader.datamatrix.T

    distances = pdist(observations, 'euclidean')
    tree = linkage(distances)
    cutoff = 0.7 * max(tree[:, 2])
    drawn = dendrogram(tree, color_threshold=cutoff)

    leaves_order = drawn['ivl']
    if not showme:
        plt.clf()
    else:
        show()
    return leaves_order
예제 #7
0
sys.path.insert(0, _root_dir)
import microbplsa
analysis_dir = _root_dir + '/Analysis'
sys.path.insert(0, analysis_dir)
from labelling import Labelling
from string import replace

# Folder name handed to open_model() below as `folder`.
FOLDER = 'Models'

# Dataset name, topic number and run number identifying the model to load.
name = 'bac_final0.03.otutable_GOODSAMPLES'
z = 20
run = 1
# JavaScript file written by the code below.
pcoordfile = os.path.join(_root_dir, 'D3', 'pcplots',
                          'paracoords_LTSP_topics' + '.js')

m = microbplsa.MicrobPLSA(name=name)
m.open_model(z=z, run=run, useC=True,
             folder=FOLDER)  # load the model from the results file
plsa = m.model
p_z_d = plsa.document_topics()  # return document's distribution
# Unpacked as (topics, samples), judging by how Z and N are used just below.
Z, N = p_z_d.shape

print Z, N

# One display name per sample and per topic, both numbered from 1.
samples = ['Sample' + str(i) for i in range(1, N + 1)]
labels = ['Topic ' + str(i) for i in range(1, Z + 1)]

# Start the output file with the opening of a JavaScript `topics` array.
# NOTE(review): the file is opened without `with`; confirm it is closed
# later in the script.
f = open(pcoordfile, 'w')
f.write('var topics = [\n')

for s, distribution in enumerate(p_z_d.T):
예제 #8
0
import microbplsa
analysis_dir = _root_dir + '/Analysis'
sys.path.insert(0, analysis_dir)
from labelling import Labelling
from string import replace

# Study id and topic number used to build the model file name below.
study = '1526'
z = 8
# Compared against each label's r value (in both signs) by the loop that
# follows this section.
CORRELATION_THRESHOLD = 0.0
pcoordfile = _root_dir + '/D3/pcplots/topics.js'

# NOTE(review): both paths below are hard-coded absolute paths under
# /Users/sperez; they need editing before the script can run elsewhere.
f = '/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_' + study + '_' + str(
    z) + '_topics_.txt'
datafile = '/Users/sperez/Documents/PLSAfun/EMPL data/study_' + study + '_split_library_seqs_and_mapping/study_' + study + '_closed_reference_otu_table.biom'

m = microbplsa.MicrobPLSA()
plsa = m.open_model(f)  # get model from the results file
p_z_d = plsa.document_topics()  # return document's distribution
# Z is handed to Labelling below; presumably Z counts topics and N counts
# samples -- TODO confirm against document_topics().
Z, N = p_z_d.shape

Lab = Labelling(study, Z, ignore_continuous=False,
                adjusted_metadata=True)  # get labels!
# NOTE(review): this unpacking rebinds `z`, replacing the topic number 8 set
# at the top of the script.
x, y, z = Lab.metadata(non_labels=[])
print y
R = Lab.correlate()
labels_r = Lab.assignlabels(R, num_labels=1)
print labels_r
# Split the (label, r) pairs into two parallel tuples.  `r` is rebound again
# by the loop that follows.
oldlabels, r = zip(*labels_r)
goodlabels = []
for lab, r in labels_r:
    if r > CORRELATION_THRESHOLD or r < -CORRELATION_THRESHOLD:
예제 #9
0
def makePCA(datafile, num_components):
    '''Plot a PCA of the samples, colouring each point by its dominant topic.

    datafile       -- data file holding the abundance matrix, forwarded to
                      MicrobPLSA.open_data as ``dataFile``
    num_components -- number of principal components to compute; points are
                      only drawn when this is 2 or 3

    Relies on names defined outside this function: ``resultfile`` (model
    file), ``MANUAL_LABELS`` (topic labels, used when truthy) and ``study``
    (for automatic labelling and the plot title).

    Each sample is assigned the topic with the largest value in its row of
    ``plsa.p_d_z``.  The figure is shown with ``plt.show()``.

    Returns None.
    '''
    m = microbplsa.MicrobPLSA()
    # The original passed an undefined name `dataFile` under a `datafile`
    # keyword, so the function's own parameter was never used.
    m.open_data(dataFile=datafile)  # get data of OTU abundances per sample
    X = m.datamatrix.T

    plsa = m.open_model(modelFile=resultfile)  # get model from the results file
    p_d_z = plsa.p_d_z
    # Unpacking enforces a 2-D array; only the second dimension is needed.
    _, Z = p_d_z.shape

    # get topic labels
    if MANUAL_LABELS:
        labels = MANUAL_LABELS
    else:
        Lab = Labelling(study, Z, ignore_continuous=False)
        Lab.metadata(non_labels=['BarcodeSequence'])
        R = Lab.correlate()
        labels_r = Lab.assignlabels(R, num_labels=1)
        labels, r = zip(*labels_r)
        # Break the label before any parenthesised part.
        labels = [l.replace('(', '\n(') for l in labels]

    # get primary topic per sample
    topics = np.array([np.argmax(row) for row in p_d_z])

    # Fit once and project; the original fitted the same data twice.
    pca = PCA(n_components=num_components, whiten=True)
    X_r = pca.fit(X).transform(X)

    # Percentage of variance explained for each components
    print('Explained variance ratio (first two components): %s' %
          str(pca.explained_variance_ratio_))

    # initiate plot and colors
    colors = plt.cm.rainbow(np.linspace(0, 1, Z))
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    ax = plt.subplot(111, projection='3d')
    if num_components in (2, 3):
        for c, i, l in zip(colors, range(0, Z), labels):
            members = topics == i
            coords = [X_r[members, axis] for axis in range(num_components)]
            ax.plot(*(coords + ['o']), color=c, label=l)

    fontP = FontProperties()
    columns = 2 if Z > 12 else 1
    # Narrow the axes so the legend fits to their right.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, 0.5, box.height])
    plt.legend(prop=fontP,
               loc='center left',
               bbox_to_anchor=(1, 0.5),
               ncol=columns)
    # Use the topic count read from the model rather than a global `z`.
    plt.title('PCA of Study %s with Z=%s' % (study, str(Z)))

    plt.show()
    return None
예제 #10
0
def main(*argv):
    '''Handle user input and run analysis functions on saved PLSA models.

    Options are parsed by argparse from ``sys.argv``; the ``argv``
    parameter is accepted but not used.

    -z/--topics takes one value (a single topic number) or two values (a
    range; its lower bound is raised to 2 if smaller).  At least one of
    -s/--study and -n/--name must be given, and at least one of the actions
    -topotus and -calculateX.

    Exits with status 1, after printing a message and the help text, when
    the study/name or action requirement is not met or when more than two
    topic values are supplied.
    '''

    parser = argparse.ArgumentParser(
        description='This scripts runs plsa for a range of topic numbers.')
    parser.add_argument('-s', '--study', help='The study number', default=None)
    parser.add_argument('-n', '--name', help='The name of the dataset')
    parser.add_argument('-z',
                        '--topics',
                        help='The range of topics to be run [z_start, z_end]',
                        nargs='+',
                        type=int,
                        required=True)
    parser.add_argument('-z_inc',
                        '--increment',
                        help='Increment of topic numbers',
                        type=int,
                        default=1)
    parser.add_argument('-useC',
                        help='use C code to run plsa',
                        action='store_true')
    parser.add_argument('-run',
                        help='Specify the run number',
                        type=int,
                        default=1)
    parser.add_argument('-topotus',
                        help='Specify to calculate the top otus',
                        action='store_true')
    parser.add_argument(
        '-n_otus',
        help='Specify the number of top otus to return per topic',
        type=int,
        default=5)
    parser.add_argument('-calculateX',
                        help='Specify to calculate X',
                        action='store_true')

    args = parser.parse_args()

    if args.study is None and args.name is None:
        print("***Study number or a data name must be specified.***\n")
        parser.print_help()
        # A usage error must not report success (status 0) to the caller.
        sys.exit(1)

    if not (args.calculateX or args.topotus):
        print("***Please specify an action to perform from the following: 'topotus', 'calculateX' ")
        parser.print_help()
        sys.exit(1)

    study = str(args.study) if args.study else None
    name = args.name

    # nargs='+' guarantees at least one value, so only the upper count
    # needs checking.
    if len(args.topics) > 2:
        print("\n***Too many arguments specified for number of topics***\n")
        parser.print_help()
        sys.exit(1)
    if len(args.topics) == 1:
        z_i = z_f = args.topics[0]
    else:
        # The lower bound of a range is never allowed below 2 topics.
        z_i = max(min(args.topics), 2)
        z_f = max(args.topics)

    z_inc = args.increment
    run = args.run
    topotus = args.topotus
    n_otus = args.n_otus
    calculateX = args.calculateX

    print("    Study: %s" % study)
    print("    Name: %s" % name)
    if z_i != z_f:
        print("    Topics range: [{0} - {1}] in increments of {2}".format(
            z_i, z_f, z_inc))
    else:
        print("    Topic number used: {0}".format(z_i))
    print("    Using C: %s" % args.useC)
    print("    Run number: %s" % args.run)

    m = microbplsa.MicrobPLSA(study=study, name=name)

    for z in range(z_i, z_f + 1, z_inc):
        if topotus:
            print("Finding top otus per each topic")
            # NOTE(review): useC is hard-coded to True here, so the -useC
            # flag only affects the summary printed above -- confirm that
            # this is intended before wiring the flag through.
            m.open_model(z=z, run=run, useC=True, folder=FOLDER)
            analysis.top_otus(m, z, n_otus=n_otus)
        if calculateX:
            # Placeholder: the calculateX action performs no work.
            pass
예제 #11
0
def main(*argv):
    '''Handle user input and run k-fold cross validation of PLSA models.

    Options are parsed by argparse from ``sys.argv``; the ``argv``
    parameter is accepted but not used.

    -action must be one of "all", "train", "test" or "mse".  -z/--topics
    takes one value (a single topic number) or two values (a range; its
    lower bound is raised to 2 if smaller).  At least one of -s/--study and
    -n/--name must be given.

    For every topic number in the range the selected steps are run through
    the ``kf`` module; when errors were measured they are saved once at the
    end with ``kf.save_mse``.

    Exits with status 1, after printing a message and the help text, when
    the action is not recognized, when neither a study nor a name is
    supplied, or when more than two topic values are supplied.
    '''

    parser = argparse.ArgumentParser(
        description=
        'This scripts runs cross validations to determine the optimal number of topics.'
    )
    parser.add_argument(
        '-action',
        help='Action to perform from "all", "train", "test", "mse"',
        required=True)
    parser.add_argument('-k',
                        help='Number of folds in kFold cross validation',
                        type=int,
                        default=5)
    parser.add_argument('-s', '--study', help='The study number', default=None)
    parser.add_argument('-n', '--name', help='The name of the dataset')
    parser.add_argument('-z',
                        '--topics',
                        help='The range of topics to be run [z_start, z_end]',
                        nargs='+',
                        type=int,
                        required=True)
    parser.add_argument('-z_inc',
                        '--increment',
                        help='Increment of topic numbers',
                        type=int,
                        default=1)
    parser.add_argument('-useC',
                        help='use C code to run plsa',
                        action='store_true')
    parser.add_argument('-run',
                        help='Specify the number of the run',
                        type=int,
                        default=1)
    parser.add_argument('-seed',
                        help='Random seed for kFold generator',
                        type=int,
                        default=2)
    args = parser.parse_args()

    if args.action not in ('all', 'train', 'test', 'mse'):
        print("***The specified action is not recognized.***\n")
        parser.print_help()
        # A usage error must not report success (status 0) to the caller.
        sys.exit(1)
    action = str(args.action)

    k = args.k

    if args.study is None and args.name is None:
        print("***Study number or a data name must be specified.***\n")
        parser.print_help()
        sys.exit(1)

    study = str(args.study) if args.study else None
    name = args.name

    # nargs='+' guarantees at least one value, so only the upper count
    # needs checking.
    if len(args.topics) > 2:
        print("\n***Too many arguments specified for number of topics***\n")
        parser.print_help()
        sys.exit(1)
    if len(args.topics) == 1:
        z_i = z_f = args.topics[0]
    else:
        # The lower bound of a range is never allowed below 2 topics.
        z_i = max(min(args.topics), 2)
        z_f = max(args.topics)

    z_inc = args.increment
    run = args.run
    useC = args.useC
    seed = args.seed

    print("    Study: %s" % study)
    print("    Name: %s" % name)
    if z_i != z_f:
        print("    Topics tested will be from {0} to {1} in increments of {2}".
              format(z_i, z_f, z_inc))
    else:
        print("    Number of topics: {0}".format(z_i))
    print("    Using C: %s" % args.useC)
    print("    Number of run used: %s" % args.run)
    print("    Action performed: %s" % action)
    print("    Number of folds, k= : %s" % k)

    m = microbplsa.MicrobPLSA(study=study, name=name)
    mseAll = []
    # The data matrix is loaded for every action except 'mse'.
    if action in ('train', 'test', 'all'):
        m.open_data()
        print('Data loaded.')

    for z in range(z_i, z_f + 1, z_inc):
        if action in ('train', 'all'):
            kFolds = kf.create_folds(m, k, z, shuffle=True, seed=seed)
            data = m.datamatrix
            kf.train(k,
                     kFolds,
                     data,
                     study,
                     name,
                     z,
                     numRuns=run,
                     seed=seed,
                     useC=useC,
                     override=False)
        if action in ('test', 'all'):
            kFolds = kf.open_kFold(study, name, k, z)
            kf.test(m, kFolds, k, z, useC=useC, seed=seed)
        if action in ('mse', 'all'):
            kFolds = kf.open_kFold(study, name, k, z)
            mse = kf.measure_error(m, kFolds, k, z)
            print("\n The cross validation error for study {0} with {1} topics and {2} folds is:     {3} +/-{4}\n".format(
                study, z, k, round(np.mean(mse), 5), round(np.std(mse), 5)))
            # Prefix the per-fold errors with their topic number before
            # collecting them.
            mse.insert(0, z)
            mseAll.append(mse)

    if mseAll:
        # `z` here is the last topic number processed by the loop above.
        kf.save_mse(mseAll, k, z, study, name, seed, run)