def __init__(self, study, z):
    '''Store the study and topic count, then open the matching PLSA model.

    A fresh ``MicrobPLSA`` instance is kept on ``self.m`` and whatever its
    ``open_model(study=..., z=...)`` call returns is kept on ``self.plsa``.
    '''
    self.study, self.z = study, z
    self.m = microbplsa.MicrobPLSA()
    opened = self.m.open_model(study=self.study, z=self.z)
    self.plsa = opened
    return None
def __init__(self, study=None, biomFile=None):
    '''Load a data matrix through MicrobPLSA and record its dimensions.

    ``study`` and ``biomFile`` are forwarded to ``MicrobPLSA.open_data``
    (the latter as ``dataFile``). The loaded ``datamatrix`` is stored as
    ``self.rawData`` and its two shape entries as ``self.N`` and ``self.S``.
    '''
    loader = microbplsa.MicrobPLSA()
    loader.open_data(study=study, dataFile=biomFile)

    # Keep the raw matrix and unpack its (2-D) shape.
    self.rawData = loader.datamatrix
    shape = self.rawData.shape
    self.N, self.S = shape
    return None
def _write_table_for_R(path, rows):
    '''Write ``rows`` to ``path``: a newline, then every cell prefixed by a tab.'''
    with open(path, 'w') as out:
        for cells in rows:
            out.write('\n')
            out.writelines('\t' + str(cell) for cell in cells)
        # The file always ends with a newline, even when there are no rows.
        out.write('\n')


def convert_2_R(results_file):
    '''Convert a saved model's probability tables into tab-delimited files for R.

    ``results_file`` is the name of a model file inside the ``Results``
    directory that sits next to this script. Three files are written to
    ``Results/Results_for_R/``, named ``p_z_``, ``p_w_z_`` and ``p_z_d_``
    followed by ``results_file``. They hold the model's ``p_z``, its
    ``p_w_z`` and the result of ``document_topics()`` respectively.

    Returns None.
    '''
    results_dir = os.path.dirname(os.path.realpath(__file__)) + "/Results/"
    m = microbplsa.MicrobPLSA()
    # Fixed: this used to call the non-existent ``os.path.oin`` and raised
    # AttributeError before anything was written.
    plsa = m.open_model(modelFile=os.path.join(results_dir, results_file))

    p_z = plsa.p_z
    p_w_z = plsa.p_w_z
    # Document distribution, as returned by the model.
    p_z_d = plsa.document_topics()

    out_prefix = results_dir + 'Results_for_R/'
    # Each p_z entry is written as a single cell on its own line.
    _write_table_for_R(out_prefix + 'p_z_' + results_file,
                       ([value] for value in p_z))
    _write_table_for_R(out_prefix + 'p_w_z_' + results_file, p_w_z)
    _write_table_for_R(out_prefix + 'p_z_d_' + results_file, p_z_d)
    return None
def train(k, kFolds, data, study, name, z, numRuns=1, seed='None',
          override=False, useC=True, folder=FOLDER):
    '''Fit one PLSA model per cross-validation fold.

    ``kFolds`` yields ``(trainSamples, testSamples)`` pairs used to select
    columns of ``data``. For every fold a new ``MicrobPLSA`` object is
    given ``study``, ``name`` and the training columns, and
    ``generate_runs`` is called for exactly ``z`` topics. The result-file
    suffix encodes the seed, ``k`` and the 1-based fold number.

    Returns None.
    '''
    for fold, (trainSamples, testSamples) in enumerate(kFolds, 1):
        trainData = data[:, trainSamples]
        # Not used below; still sliced, as the original did.
        testData = data[:, testSamples]
        print('\nTraining dataset fold {0} of {1}'.format(fold, k))

        model = microbplsa.MicrobPLSA()
        model.study = study
        model.name = name
        model.datamatrix = trainData

        suffix = ''.join(
            ['_cross_seed', str(seed), '_k', str(k), '(', str(fold), ')'])
        model.generate_runs(z_i=z,
                            z_f=z,
                            numRuns=numRuns,
                            useC=useC,
                            override=override,
                            folder=folder,
                            add_to_file=suffix)
    return None
def main(*argv):
    '''Handle user input and run PLSA for a range of topic numbers.

    Options are read from ``sys.argv`` by argparse; ``*argv`` is accepted
    but not used. Either a study number or a dataset name must be given.
    ``--topics`` takes one value (a single topic count) or two values (a
    range, whose lower bound is raised to 2 if it is smaller).

    On a usage error the message and the help text are printed and the
    process exits with status 1. The original exited with status 0, which
    reported success to the calling shell.
    '''
    parser = argparse.ArgumentParser(description='This scripts runs plsa for a range of topic numbers.')
    parser.add_argument('-s', '--study', help='The study number', default=None)
    parser.add_argument('-n', '--name', help='The name of the dataset')
    parser.add_argument('-z', '--topics',
                        help='The range of topics to be run [z_start, z_end]',
                        nargs='+', type=int, required=True)
    parser.add_argument('-z_inc', '--increment',
                        help='Increment of topic numbers', type=int, default=1)
    parser.add_argument('-useC', help='use C code to run plsa',
                        action='store_true')
    parser.add_argument('-runs', '-numruns',
                        help='Specify the number of runs', type=int, default=1)
    parser.add_argument('-override', help='Overrides a result file',
                        action='store_true')
    parser.add_argument('-add', help='Add text to result file', default='')
    args = parser.parse_args()

    if args.study is None and args.name is None:
        print("***Study number or a data name must be specified.***\n")
        parser.print_help()
        sys.exit(1)

    study = str(args.study) if args.study else None
    name = args.name

    if len(args.topics) == 1:
        z_i = z_f = args.topics[0]
    elif len(args.topics) == 2:
        # Only the range form is clamped to a minimum of two topics.
        z_i = max(min(args.topics), 2)
        z_f = max(args.topics)
    else:
        print("\n***Too many arguments specified for number of topics***\n")
        parser.print_help()
        sys.exit(1)

    z_inc = args.increment
    numRuns = args.runs
    useC = args.useC
    override = args.override
    add = args.add

    print(" Study: %s" % study)
    print(" Name: %s" % name)
    if z_i != z_f:
        print(" Topics will be run from {0} to {1} in increments of {2}".format(z_i, z_f, z_inc))
    else:
        print(" PLSA will be run with {0} topics".format(z_i))
    print(" Using C: %s" % args.useC)
    print(" Number of runs: %s" % args.runs)
    print(" Override result files: %s" % args.override)
    print(" Text to add to file: %s" % args.add)

    m = microbplsa.MicrobPLSA()
    m.open_data(study=study, name=name)
    m.generate_runs(z_i=z_i, z_f=z_f, z_inc=z_inc, numRuns=numRuns,
                    useC=useC, override=override, add_to_file=add)
def makedendrogram(study=None, filename=None, showme=True):
    '''Cluster the columns of a data matrix and draw the dendrogram.

    The matrix is loaded with ``MicrobPLSA.open_data`` and transposed.
    Pairwise euclidean distances from ``pdist`` are passed to ``linkage``.
    The colour threshold is 70% of the largest value in the third column
    of the linkage matrix.

    When ``showme`` is true the plot is displayed with ``show()``;
    otherwise the current figure is cleared. Returns the ``'ivl'`` entry
    of the dendrogram result (the leaf order).
    '''
    loader = microbplsa.MicrobPLSA()
    loader.open_data(study=study, dataFile=filename)

    observations = loader.datamatrix.T
    tree = linkage(pdist(observations, 'euclidean'))
    cutoff = 0.7 * max(tree[:, 2])
    drawn = dendrogram(tree, color_threshold=cutoff)
    leaves_order = drawn['ivl']

    if not showme:
        plt.clf()
    else:
        show()
    return leaves_order
sys.path.insert(0, _root_dir) import microbplsa analysis_dir = _root_dir + '/Analysis' sys.path.insert(0, analysis_dir) from labelling import Labelling from string import replace FOLDER = 'Models' name = 'bac_final0.03.otutable_GOODSAMPLES' z = 20 run = 1 pcoordfile = os.path.join(_root_dir, 'D3', 'pcplots', 'paracoords_LTSP_topics' + '.js') m = microbplsa.MicrobPLSA(name=name) m.open_model(z=z, run=run, useC=True, folder=FOLDER) #get model from the results file plsa = m.model p_z_d = plsa.document_topics() #return document's distribution Z, N = p_z_d.shape #number of samples print Z, N samples = ['Sample' + str(i) for i in range(1, N + 1)] labels = ['Topic ' + str(i) for i in range(1, Z + 1)] f = open(pcoordfile, 'w') f.write('var topics = [\n') for s, distribution in enumerate(p_z_d.T):
import microbplsa analysis_dir = _root_dir + '/Analysis' sys.path.insert(0, analysis_dir) from labelling import Labelling from string import replace study = '1526' z = 8 CORRELATION_THRESHOLD = 0.0 pcoordfile = _root_dir + '/D3/pcplots/topics.js' f = '/Users/sperez/git/microbPLSA/MicrobProcessor/Results/study_' + study + '_' + str( z) + '_topics_.txt' datafile = '/Users/sperez/Documents/PLSAfun/EMPL data/study_' + study + '_split_library_seqs_and_mapping/study_' + study + '_closed_reference_otu_table.biom' m = microbplsa.MicrobPLSA() plsa = m.open_model(f) #get model from the results file p_z_d = plsa.document_topics() #return document's distribution Z, N = p_z_d.shape #number of samples Lab = Labelling(study, Z, ignore_continuous=False, adjusted_metadata=True) #get labels! x, y, z = Lab.metadata(non_labels=[]) print y R = Lab.correlate() labels_r = Lab.assignlabels(R, num_labels=1) print labels_r oldlabels, r = zip(*labels_r) goodlabels = [] for lab, r in labels_r: if r > CORRELATION_THRESHOLD or r < -CORRELATION_THRESHOLD:
def makePCA(datafile, num_components):
    '''Plot samples in PCA space, coloured by their highest-probability topic.

    ``datafile`` is passed to ``MicrobPLSA.open_data`` and the transposed
    data matrix is reduced to ``num_components`` whitened principal
    components. ``num_components`` must be 2 or 3 for anything to be drawn.

    The model file, the study identifier and the optional manual labels
    come from the module-level names ``resultfile``, ``study`` and
    ``MANUAL_LABELS``, which must be defined by the surrounding script.
    The figure is displayed with ``plt.show()``. Returns None.
    '''
    m = microbplsa.MicrobPLSA()
    # Fixed: this was ``open_data(datafile=dataFile)``, which referenced an
    # undefined name and ignored the ``datafile`` parameter. ``dataFile`` is
    # the keyword the other ``open_data`` callers use.
    m.open_data(dataFile=datafile)
    X = m.datamatrix.T
    plsa = m.open_model(modelFile=resultfile)

    # Document distribution; the second shape entry is the topic count.
    p_d_z = plsa.p_d_z
    _, Z = p_d_z.shape

    # Topic labels: manual ones if provided, else derived from metadata.
    if MANUAL_LABELS:
        labels = MANUAL_LABELS
    else:
        Lab = Labelling(study, Z, ignore_continuous=False)
        Lab.metadata(non_labels=['BarcodeSequence'])
        R = Lab.correlate()
        labels_r = Lab.assignlabels(R, num_labels=1)
        labels, r = zip(*labels_r)
        # NOTE(review): the collapsed source does not show whether this line
        # also applied to MANUAL_LABELS; confirm against the original file.
        labels = [l.replace('(', '\n(') for l in labels]

    # Primary topic per sample.
    topics = np.array([np.argmax(row) for row in p_d_z])

    pca = PCA(n_components=num_components, whiten=True)
    # A single fit is enough; the original fitted twice and discarded the
    # first result.
    X_r = pca.fit(X).transform(X)

    # Percentage of variance explained by each component.
    print('Explained variance ratio (first two components): %s' %
          str(pca.explained_variance_ratio_))

    # One colour per topic.
    colors = plt.cm.rainbow(np.linspace(0, 1, Z))
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    # NOTE(review): the axes are 3-D even when num_components == 2 (kept as
    # in the original); confirm whether a 2-D axes was intended.
    ax = plt.subplot(111, projection='3d')

    if num_components == 2:
        for c, i, l in zip(colors, range(0, Z), labels):
            ax.plot(X_r[topics == i, 0], X_r[topics == i, 1],
                    'o', color=c, label=l)
    elif num_components == 3:
        for c, i, l in zip(colors, range(0, Z), labels):
            ax.plot(X_r[topics == i, 0], X_r[topics == i, 1],
                    X_r[topics == i, 2], 'o', color=c, label=l)

    fontP = FontProperties()
    # Use two legend columns when there are many topics.
    columns = 2 if Z > 12 else 1

    # Narrow the axes to leave room for the legend on the right.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, 0.5, box.height])
    plt.legend(prop=fontP, loc='center left', bbox_to_anchor=(1, 0.5),
               ncol=columns)
    # Uses the topic count read from the model, not a global ``z``.
    plt.title('PCA of Study %s with Z=%s' % (study, str(Z)))
    plt.show()
    return None
def main(*argv):
    '''Handle user input and run analysis functions on saved PLSA models.

    Options are read from ``sys.argv`` by argparse; ``*argv`` is accepted
    but not used. A study number or dataset name is required, as is at
    least one action flag (``-topotus`` or ``-calculateX``). ``--topics``
    takes one value or a two-value range whose lower bound is raised to 2.

    On a usage error the message and the help text are printed and the
    process exits with status 1. The original exited with status 0, which
    reported success to the calling shell.
    '''
    parser = argparse.ArgumentParser(
        description='This scripts runs plsa for a range of topic numbers.')
    parser.add_argument('-s', '--study', help='The study number', default=None)
    parser.add_argument('-n', '--name', help='The name of the dataset')
    parser.add_argument('-z', '--topics',
                        help='The range of topics to be run [z_start, z_end]',
                        nargs='+', type=int, required=True)
    parser.add_argument('-z_inc', '--increment',
                        help='Increment of topic numbers', type=int, default=1)
    parser.add_argument('-useC', help='use C code to run plsa',
                        action='store_true')
    parser.add_argument('-run', help='Specify the run number',
                        type=int, default=1)
    parser.add_argument('-topotus', help='Specify to calculate the top otus',
                        action='store_true')
    parser.add_argument(
        '-n_otus',
        help='Specify the number of top otus to return per topic',
        type=int, default=5)
    parser.add_argument('-calculateX', help='Specify to calculate X',
                        action='store_true')
    args = parser.parse_args()

    if args.study is None and args.name is None:
        print("***Study number or a data name must be specified.***\n")
        parser.print_help()
        sys.exit(1)

    if not args.calculateX and not args.topotus:
        print("***Please specify an action to perform from the following: 'topotus', 'calculateX' ")
        parser.print_help()
        sys.exit(1)

    study = str(args.study) if args.study else None
    name = args.name

    if len(args.topics) == 1:
        z_i = z_f = args.topics[0]
    elif len(args.topics) == 2:
        # Only the range form is clamped to a minimum of two topics.
        z_i = max(min(args.topics), 2)
        z_f = max(args.topics)
    else:
        print("\n***Too many arguments specified for number of topics***\n")
        parser.print_help()
        sys.exit(1)

    z_inc = args.increment
    run = args.run
    topotus = args.topotus
    n_otus = args.n_otus
    calculateX = args.calculateX

    print(" Study: %s" % study)
    print(" Name: %s" % name)
    if z_i != z_f:
        print(" Topics range: [{0} - {1}] in increments of {2}".format(
            z_i, z_f, z_inc))
    else:
        print(" Topic number used: {0}".format(z_i))
    print(" Using C: %s" % args.useC)
    print(" Run number: %s" % args.run)

    m = microbplsa.MicrobPLSA(study=study, name=name)

    for z in range(z_i, z_f + 1, z_inc):
        if topotus:
            print("Finding top otus per each topic")
            # NOTE(review): the model is always opened with useC=True; the
            # -useC flag is only echoed above. Confirm whether it should be
            # forwarded here.
            m.open_model(z=z, run=run, useC=True, folder=FOLDER)
            analysis.top_otus(m, z, n_otus=n_otus)
        if calculateX:
            # Placeholder: no calculation is implemented for this action.
            pass
def main(*argv):
    '''Handle user input and run k-fold cross validation over topic numbers.

    Options are read from ``sys.argv`` by argparse; ``*argv`` is accepted
    but not used. ``-action`` selects which stages run for every topic
    number: ``train``, ``test``, ``mse``, or ``all`` for the three in that
    order. ``--topics`` takes one value or a two-value range whose lower
    bound is raised to 2. Errors collected by the ``mse`` stage are handed
    to ``kf.save_mse`` at the end.

    On a usage error the message and the help text are printed and the
    process exits with status 1. The original exited with status 0, which
    reported success to the calling shell.
    '''
    parser = argparse.ArgumentParser(
        description=
        'This scripts runs cross validations to determine the optimal number of topics.'
    )
    parser.add_argument(
        '-action',
        help='Action to perform from "all", "train", "test", "mse"',
        required=True)
    parser.add_argument('-k',
                        help='Number of folds in kFold cross validation',
                        type=int, default=5)
    parser.add_argument('-s', '--study', help='The study number', default=None)
    parser.add_argument('-n', '--name', help='The name of the dataset')
    parser.add_argument('-z', '--topics',
                        help='The range of topics to be run [z_start, z_end]',
                        nargs='+', type=int, required=True)
    parser.add_argument('-z_inc', '--increment',
                        help='Increment of topic numbers', type=int, default=1)
    parser.add_argument('-useC', help='use C code to run plsa',
                        action='store_true')
    parser.add_argument('-run', help='Specify the number of the run',
                        type=int, default=1)
    parser.add_argument('-seed', help='Random seed for kFold generator',
                        type=int, default=2)
    args = parser.parse_args()

    if args.action not in ['all', 'train', 'test', 'mse']:
        print("***The specified action is not recognized.***\n")
        parser.print_help()
        sys.exit(1)
    action = str(args.action)

    k = args.k

    if args.study is None and args.name is None:
        print("***Study number or a data name must be specified.***\n")
        parser.print_help()
        sys.exit(1)

    study = str(args.study) if args.study else None
    name = args.name

    if len(args.topics) == 1:
        z_i = z_f = args.topics[0]
    elif len(args.topics) == 2:
        # Only the range form is clamped to a minimum of two topics.
        z_i = max(min(args.topics), 2)
        z_f = max(args.topics)
    else:
        print("\n***Too many arguments specified for number of topics***\n")
        parser.print_help()
        sys.exit(1)

    z_inc = args.increment
    run = args.run
    useC = args.useC
    seed = args.seed

    print(" Study: %s" % study)
    print(" Name: %s" % name)
    if z_i != z_f:
        print(" Topics tested will be from {0} to {1} in increments of {2}".
              format(z_i, z_f, z_inc))
    else:
        print(" Number of topics: {0}".format(z_i))
    print(" Using C: %s" % args.useC)
    print(" Number of run used: %s" % args.run)
    print(" Action performed: %s" % action)
    print(" Number of folds, k= : %s" % k)

    m = microbplsa.MicrobPLSA(study=study, name=name)
    mseAll = []

    # The data matrix is only loaded for train/test; 'mse' alone skips it.
    if action in ('train', 'test', 'all'):
        m.open_data()
        print('Data loaded.')

    for z in range(z_i, z_f + 1, z_inc):
        if action in ('train', 'all'):
            kFolds = kf.create_folds(m, k, z, shuffle=True, seed=seed)
            data = m.datamatrix
            kf.train(k, kFolds, data, study, name, z, numRuns=run,
                     seed=seed, useC=useC, override=False)

        if action in ('test', 'all'):
            kFolds = kf.open_kFold(study, name, k, z)
            kf.test(m, kFolds, k, z, useC=useC, seed=seed)

        if action in ('mse', 'all'):
            kFolds = kf.open_kFold(study, name, k, z)
            mse = kf.measure_error(m, kFolds, k, z)
            print("\n The cross validation error for study {0} with {1} topics and {2} folds is: {3} +/-{4}\n".format(
                study, z, k, round(np.mean(mse), 5), round(np.std(mse), 5)))
            # Prefix the per-fold errors with their topic number.
            mse.insert(0, z)
            mseAll.append(mse)

    if mseAll:
        # ``z`` is the last topic number processed by the loop above.
        kf.save_mse(mseAll, k, z, study, name, seed, run)