def test_ace_for_picrust_pic(self):
    """ace_for_picrust with method 'pic' reproduces the reference tables

    Checks both returned tables (estimates and confidence intervals) by
    comparing their string renderings with hand-built Table objects.
    """
    observed, observed_ci = ace_for_picrust(
        self.in_tree1_fp, self.in_trait1_fp, method="pic")
    columns = ['nodes', 'trait1', 'trait2']

    # Reference estimates, one row per node.
    estimate_rows = [
        ['14', '2.9737', '2.5436'],
        ['12', '1.2727', '3'],
        ['11', '0.6667', '3'],
        ['10', '5', '2'],
    ]
    reference = Table(list(columns), estimate_rows)
    self.assertEqual(observed.tostring(), reference.tostring())

    # Reference intervals, stored as pipe-separated pairs.
    interval_rows = [
        ['14', '0.7955|5.1519', '0.3655|4.7218'],
        ['12', '-1.1009|3.6464', '0.6264|5.3736'],
        ['11', '-0.4068|1.7402', '1.9265|4.0735'],
        ['10', '3.3602|6.6398', '0.3602|3.6398'],
    ]
    reference_ci = Table(list(columns), interval_rows)
    self.assertEqual(observed_ci.tostring(), reference_ci.tostring())
def test_ace_for_picrust_ml(self):
    """ace_for_picrust with method 'ML' reproduces the reference tables

    Checks both returned tables by comparing their string renderings with
    hand-built Table objects; the interval table additionally carries
    'sigma' and 'loglik' rows.
    """
    observed, observed_ci = ace_for_picrust(
        self.in_tree1_fp, self.in_trait1_fp, method="ML")
    columns = ['nodes', 'trait1', 'trait2']

    # Reference estimates, one row per node.
    estimate_rows = [
        ['14', '2.9737', '2.5436'],
        ['12', '2.3701', '2.7056'],
        ['11', '0.8370', '2.9706'],
        ['10', '4.4826', '2.1388'],
    ]
    reference = Table(list(columns), estimate_rows)
    self.assertEqual(observed.tostring(), reference.tostring())

    # Reference intervals (pipe-separated pairs) plus the two summary rows.
    interval_rows = [
        ['14', '1.4467|4.5007', '2.1979|2.8894'],
        ['12', '0.9729|3.7674', '2.3892|3.0219'],
        ['11', '0.147|1.527', '2.8143|3.1268'],
        ['10', '3.4227|5.5426', '1.8988|2.3788'],
        ['sigma', '1.9742|0.6981', '0.1012|0.0359'],
        ['loglik', '-6.7207', '5.1623'],
    ]
    reference_ci = Table(list(columns), interval_rows)
    self.assertEqual(observed_ci.tostring(), reference_ci.tostring())
def gene_expr_diff_to_table(data_path, sep='\t', stable_id_label='',
        probeset_label='', exp_label='', sig_label='', pval_label='',
        allow_probeset_many_gene=False, validate=True):
    """Return a Table of expression-difference data read from data_path.

    Same workflow as gene_expr_to_table(), extended with the significance
    (sig_label) and p-value (pval_label) columns.

    Arguments:
        - data_path: file to read
        - sep: column delimiter
        - stable_id_label, probeset_label, exp_label, sig_label,
          pval_label: names of the respective input columns
        - allow_probeset_many_gene: when False, records are passed through
          _remove_multimapped_probesets
        - validate: when True, records are passed through
          _validate_probes_scores

    Both filters are only applied when the reader reports that probes are
    present. The returned Table uses DIFF_HEADER as its header.
    """
    rr = RunRecord('gene_expr_diff_to_table')
    rr.addInfo('Reading expression diff file', data_path)

    parsed = _read_data_file(data_path, sep=sep,
            stable_id_label=stable_id_label, probeset_label=probeset_label,
            exp_label=exp_label, sig_label=sig_label, pval_label=pval_label,
            is_diff=True)
    genes, probes, exp, sig, pval, probes_present = parsed

    if probes_present:
        if validate:
            # genes whose probes and exp entries are mismatched get dropped
            genes, probes, exp, sig, pval = _validate_probes_scores(
                    genes, probes, exp, sig, pval)

        if not allow_probeset_many_gene:
            # a probe is only allowed to map to a single gene
            genes, probes, exp, sig, pval = _remove_multimapped_probesets(
                    genes, probes, exp, sig, pval)

    rows = [list(record) for record in zip(genes, probes, exp, sig, pval)]
    return Table(header=DIFF_HEADER, rows=rows)
def test_wagner_for_picrust(self):
    """wagner_for_picrust reproduces the reference table for valid input

    The result is compared through its string rendering, as the sibling
    tests do, so the check does not rely on how Table objects compare
    with ``==``.
    """
    actual = wagner_for_picrust(self.in_tree1_fp, self.in_trait1_fp)
    expected = Table(['nodes', 'trait1', 'trait2'],
                     [['11', '1', '3'],
                      ['12', '2', '3'],
                      ['10', '5', '2'],
                      ['14', '5', '3']])
    self.assertEqual(actual.tostring(), expected.tostring())
def test_wagner_for_picrust_with_funky_tip_labels(self):
    """wagner_for_picrust handles a tree with quoted tip labels

    Uses the second tree / third trait fixtures and compares the string
    rendering of the result with a hand-built reference Table.
    """
    reference_rows = [
        ['11', '1', '3'],
        ['12', '2', '3'],
        ['10', '5', '2'],
        ['14', '5', '3'],
    ]
    reference = Table(['nodes', 'trait1', 'trait2'], reference_rows)
    observed = wagner_for_picrust(self.in_tree2_fp, self.in_trait3_fp)
    self.assertEqual(observed.tostring(), reference.tostring())
def __str__(self):
    """Return a sorted text table of common name, species name and
    Ensembl db prefix for every entry in self._common_species."""
    name_pairs = [(common, self._common_species[common])
                  for common in self._common_species]
    rows = [[common, species, self._species_ensembl[species]]
            for common, species in name_pairs]
    header = ['Common Name', 'Species Name', 'Ensembl Db Prefix']
    table = Table(header, rows=rows, space=2)
    return str(table.sorted())
def test_invalid_args_fail(self):
    """incorrect bedgraph args causes RuntimeError"""
    low_rows = [['1', pos, pos + 1, 0] for pos in range(100, 121)]
    high_rows = [['1', pos, pos + 1, 10] for pos in range(150, 161)]
    table = Table(header=['chrom', 'start', 'end', 'value'],
                  rows=low_rows + high_rows)

    # 'abc' is the extra keyword; presumably it is what triggers the error.
    track_options = dict(format='bedgraph', name='test track',
                         description='test of bedgraph', color=(255, 0, 0),
                         abc=None)
    self.assertRaises(RuntimeError, table.tostring, **track_options)
def output_pca(PCA_matrix, eigvals, names):
    """Return principal coordinates analysis results as a cogent Table.

    Arguments:
        - PCA_matrix: eigenvector matrix, indexed as
          PCA_matrix[vector, sample]; generated with the
          get_principal_coordinates function
        - eigvals: eigenvalues, generated with the
          get_principal_coordinates function
        - names: list of names that corresponds to the columns in
          PCA_matrix, i.e. the order in which samples were represented in
          the initial distance matrix

    Returns a single Table made by appending an 'Eigenvalues' table (the
    eigenvalues and the percentage of variance each explains) to an
    'Eigenvectors' table (one row per name), with 'Type' as the name given
    to Table.appended for the extra column.
    """
    # report the eigenvectors according to their corresponding eigenvalues,
    # from greatest to least
    vector_order = list(argsort(eigvals))
    vector_order.reverse()

    vec_num_header = ['vec_num-%d' % i for i in range(len(eigvals))]

    # one row per sample: its coordinate on each ordered eigenvector
    rows = [[name] + [PCA_matrix[vec_i, name_i] for vec_i in vector_order]
            for name_i, name in enumerate(names)]
    eigenvectors = Table(header=['Label'] + vec_num_header, rows=rows,
                         digits=2, space=2, title='Eigenvectors')

    # eigenvalues and the percentage of variance explained, in the same order
    pcnts = (eigvals / sum(eigvals)) * 100
    rows = [['eigenvalues'] + [eigvals[vec_i] for vec_i in vector_order],
            ['var explained (%)'] + [pcnts[vec_i] for vec_i in vector_order]]
    eigenvalues = Table(header=['Label'] + vec_num_header, rows=rows,
                        digits=2, space=2, title='Eigenvalues')

    return eigenvectors.appended('Type', eigenvalues, title='')
def test_invalid_table_fails(self):
    """assertion error if table has > 4 columns"""
    low_rows = [['1', pos, pos + 1, 0, 1] for pos in range(100, 121)]
    high_rows = [['1', pos, pos + 1, 10, 1] for pos in range(150, 161)]
    # five columns: one more than the four named in the bedgraph tests
    table = Table(header=['chrom', 'start', 'end', 'value', 'blah'],
                  rows=low_rows + high_rows)

    track_options = dict(format='bedgraph', name='test track',
                         description='test of bedgraph', color=(255, 0, 0),
                         abc=None)
    self.assertRaises(AssertionError, table.tostring, **track_options)
def test_raises_on_incorrect_format_val(self):
    """raise AssertionError when provide incorrect format value"""
    low_rows = [['1', pos, pos + 1, 0] for pos in range(100, 121)]
    high_rows = [['1', pos, pos + 1, 10] for pos in range(150, 161)]
    table = Table(header=['chrom', 'start', 'end', 'value'],
                  rows=low_rows + high_rows)

    # 'sqrt' is the value under test for the windowingFunction option
    track_options = dict(format='bedgraph', name='test track',
                         description='test of bedgraph', color=(255, 0, 0),
                         windowingFunction='sqrt')
    self.assertRaises(AssertionError, table.tostring, **track_options)
def test_ace_for_picrust_pic_single_trait(self):
    """ace_for_picrust with method 'pic' works on a single column trait table

    Only the first returned table is checked; the second is ignored.
    """
    reference_rows = [
        ['14', '2.9737'],
        ['12', '1.2727'],
        ['11', '0.6667'],
        ['10', '5'],
    ]
    reference = Table(['nodes', 'trait1'], reference_rows)
    observed, _unused_ci = ace_for_picrust(
        self.in_tree1_fp, self.in_trait2_fp, method="pic")
    self.assertEqual(observed.tostring(), reference.tostring())
def test_ace_for_picrust_pic_with_funky_tip_labels(self):
    """ace_for_picrust copes with a tree with underscores in tip labels

    Uses the second tree / third trait fixtures; only the first returned
    table is checked.
    """
    reference_rows = [
        ['14', '2.9737', '2.5436'],
        ['12', '1.2727', '3'],
        ['11', '0.6667', '3'],
        ['10', '5', '2'],
    ]
    reference = Table(['nodes', 'trait1', 'trait2'], reference_rows)
    observed, _unused_ci = ace_for_picrust(
        self.in_tree2_fp, self.in_trait3_fp, method="pic")
    self.assertEqual(observed.tostring(), reference.tostring())
def __repr__(self):
    """Return self.Matrix rendered as a text table labelled by self.Tags.

    The same labels are used for the column headings and for the first
    cell of each row.
    """
    from cogent.util.table import Table

    def _as_text(tag):
        # non-string sequences are flattened to a comma separated string
        if hasattr(tag, '__len__') and not isinstance(tag, basestring):
            return ','.join(str(z) for z in tag)
        return tag

    # Table needs unique labels, so the position is appended to each one
    labels = ["%s (%s)" % (_as_text(tag), position)
              for position, tag in enumerate(self.Tags)]
    body = [[label] + list(values)
            for label, values in zip(labels, self.Matrix)]
    return str(Table(header=[''] + labels, rows=body))
def test_only_required_columns(self):
    """generate bedgraph from minimal data"""
    low_rows = [['1', 100, end, 0] for end in range(101, 111)]
    high_rows = [['1', 150, end, 10] for end in range(151, 161)]
    table = Table(header=['chrom', 'start', 'end', 'value'],
                  rows=low_rows + high_rows)

    bgraph = table.tostring(format='bedgraph', name='test track',
                            description='test of bedgraph',
                            color=(255, 0, 0))
    expected_lines = [
        'track type=bedGraph name="test track" '
        'description="test of bedgraph" color=255,0,0',
        '1\t100\t110\t0',
        '1\t150\t160\t10',
    ]
    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that bgraph is truthy; the expected
    # text is never compared. Verify the expected text against real
    # output before switching to assertEqual.
    self.assertTrue(bgraph, '\n'.join(expected_lines))
def test_merged_overlapping_spans(self):
    """bedgraph merged overlapping spans, one chrom"""
    low_rows = [['1', pos, pos + 1, 0] for pos in range(100, 121)]
    high_rows = [['1', pos, pos + 1, 10] for pos in range(150, 161)]
    table = Table(header=['chrom', 'start', 'end', 'value'],
                  rows=low_rows + high_rows)

    bgraph = table.tostring(format='bedgraph', name='test track',
                            description='test of bedgraph',
                            color=(255, 0, 0))
    expected_lines = [
        'track type=bedGraph name="test track" '
        'description="test of bedgraph" color=255,0,0',
        '1\t100\t120\t0',
        '1\t150\t160\t10',
    ]
    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that bgraph is truthy; the expected
    # text is never compared. Verify the expected text against real
    # output before switching to assertEqual.
    self.assertTrue(bgraph, '\n'.join(expected_lines))
def gene_expr_to_table(data_path, sep='\t', stable_id_label='',
        probeset_label='', exp_label='', allow_probeset_many_gene=False,
        validate=True):
    """Return a cogent Table of gene expression data read from data_path.

    The input is a simple delimited text representation of expression data
    that may originate from micro-array or mRNA-seq experiments.
    Micro-array data carries probeset information for each gene and one
    score per probe. RNA-seq data has no probes, just a single score per
    gene; in that case a fake probe of the form 'P' + a unique integer is
    created for each gene.

    Within a field, probeset ids and expression scores are separated by
    the pipe character (|); they are converted to tuples of ints or floats
    respectively.

    Arguments:
        - data_path: file to read
        - sep: column delimiter
        - stable_id_label: name of column containing Ensembl stable IDs
        - probeset_label: name of column containing probesets
        - exp_label: name of column containing expression scores
        - allow_probeset_many_gene: whether one probeset can map to
          multiple genes. If not, probes and scores that multi-map are
          removed.
        - validate: checks that stable IDs are unique in the file and that
          for each row the number of probesets equals the number of
          expression scores. Offending gene entries are removed.

    Both filters are only applied when the reader reports that probes are
    present. The returned Table uses EXPR_HEADER as its header.
    """
    rr = RunRecord('geneExprDataToTable')
    rr.addInfo('Reading expression data', data_path)

    parsed = _read_data_file(data_path, sep=sep,
            stable_id_label=stable_id_label, probeset_label=probeset_label,
            exp_label=exp_label)
    genes, probes, exp, probes_present = parsed

    if probes_present:
        if validate:
            # genes whose probes and scores are mismatched get dropped
            genes, probes, exp = _validate_probes_scores(genes, probes, exp)

        if not allow_probeset_many_gene:
            # a probe is only allowed to map to a single gene
            genes, probes, exp = _remove_multimapped_probesets(
                    genes, probes, exp)

    rows = [list(record) for record in zip(genes, probes, exp)]
    return Table(header=EXPR_HEADER, rows=rows)
def test_boolean_correctly_formatted(self):
    """boolean setting correctly formatted"""
    low_rows = [['1', pos, pos + 1, 0] for pos in range(100, 121)]
    high_rows = [['1', pos, pos + 1, 10] for pos in range(150, 161)]
    table = Table(header=['chrom', 'start', 'end', 'value'],
                  rows=low_rows + high_rows)

    bgraph = table.tostring(format='bedgraph', name='test track',
                            description='test of bedgraph',
                            color=(255, 0, 0), autoScale=True)
    expected_lines = [
        'track type=bedGraph name="test track" '
        'description="test of bedgraph" color=255,0,0 autoScale=on',
        '1\t100\t110\t1',
        '1\t150\t160\t10',
    ]
    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that bgraph is truthy; the expected
    # text is never compared. Verify the expected text against real
    # output before switching to assertEqual.
    self.assertTrue(bgraph, '\n'.join(expected_lines))
def test_int_correctly_formatted(self):
    """int should be correctly formatted"""
    low_rows = [['1', pos, pos + 1, 0] for pos in range(100, 121)]
    high_rows = [['1', pos, pos + 1, 10] for pos in range(150, 161)]
    table = Table(header=['chrom', 'start', 'end', 'value'],
                  rows=low_rows + high_rows)

    bgraph = table.tostring(format='bedgraph', name='test track',
                            description='test of bedgraph',
                            color=(255, 0, 0), smoothingWindow=10)
    expected_lines = [
        'track type=bedGraph name="test track" '
        'description="test of bedgraph" color=255,0,0 smoothingWindow=10',
        '1\t100\t110\t1',
        '1\t150\t160\t10',
    ]
    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that bgraph is truthy; the expected
    # text is never compared. Verify the expected text against real
    # output before switching to assertEqual.
    self.assertTrue(bgraph, '\n'.join(expected_lines))
def __repr__(self):
    """Return the feature levels of self.Species as a text table.

    An empty string is returned when self.Species has no entry in
    self._species_feature_levels.
    """
    if self.Species not in self._species_feature_levels:
        return ''

    feature_levels = self._species_feature_levels[self.Species]
    # one row per feature type, its levels joined into a single cell
    rows = [[feature, ', '.join(feature_levels[feature].levels)]
            for feature in feature_levels.keys()]
    return str(Table(['Type', 'Levels'], rows, title=self.Species))
def __repr__(self):
    """Return one text table per species, joined by newlines.

    Each table is titled with the species and lists every feature type
    together with its levels attribute.
    """
    header = ['Type', 'Levels']
    rendered = []
    for species in self._species_feature_levels.keys():
        feature_levels = self._species_feature_levels[species]
        rows = [[feature, feature_levels[feature].levels]
                for feature in feature_levels.keys()]
        rendered.append(str(Table(header, rows, title=species)))
    return '\n'.join(rendered)
def parse_wagner_parsimony_output(raw_output_with_comments,remove_num_tips=0):
    '''Parses wagner parsimony output from Count and returns a Cogent Table object

    Only lines beginning with '# FAMILY' are used. Each is split on tabs,
    and its first field and last four fields are discarded. When
    remove_num_tips is non-zero, that many columns following the first
    retained column are dropped as well. The first retained line supplies
    the table header, the remaining lines the rows.
    '''
    records = []
    for line in raw_output_with_comments:
        if line[0:8] != '# FAMILY':
            # not a line holding ASR count information
            continue
        records.append(line.split('\t')[1:-4])

    if remove_num_tips:
        # drop the columns holding trait data for tips (not internal nodes)
        first_kept = remove_num_tips + 1
        records = [[fields[0]] + fields[first_kept:] for fields in records]

    header, body = records[0], records[1:]
    return Table(header, body)
def __str__(self):
    """Returns string representation of SummaryStatistics object.

    Builds a two column ('Statistic', 'Value') table from the statistics
    that can be read from this object. A statistic is left out when
    reading it raises an exception or when its value is falsy (which
    includes 0 as well as None). Returns an empty string when no
    statistic qualifies.
    """
    fields = ("Count", "Sum", "Median", "Mean", "StandardDeviation",
              "Variance", "SumSquares")
    result = []
    for field in fields:
        try:
            val = getattr(self, field)
            if not val:
                continue
        except Exception:
            # best effort: a statistic that cannot be computed is skipped.
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            continue
        result.append([field, val])

    if not result:
        return ''

    return str(Table("Statistic Value".split(), result,
                     column_templates={'Value': "%.4g"}))
def _get_method_link_species_set(self):
    """Return the table of alignment methods/clades for self.species_set.

    The table is built on first use from the Compara 'method_link' and
    'method_link_species_set' tables and stored on
    self._method_species_link; later calls return the stored table.
    """
    cached = self._method_species_link
    if cached is not None:
        return cached

    # methods whose class mentions 'alignment', keyed by method_link_id
    method_link_table = self.ComparaDb.getTable('method_link')
    method_query = sql.select([method_link_table],
            method_link_table.c['class'].like('%alignment%'))
    method_link_ids = dict((r['method_link_id'], r)
                           for r in method_query.execute().fetchall())

    # records for our species sets that use one of those methods
    method_link_species_table = self.ComparaDb.getTable(
            'method_link_species_set')
    in_species_set = method_link_species_table.c.species_set_id.in_(
            self.species_set)
    uses_method = method_link_species_table.c.method_link_id.in_(
            method_link_ids.keys())
    set_query = sql.select([method_link_species_table],
            sql.and_(in_species_set, uses_method))
    records = set_query.execute().fetchall()

    # store method_link_id, type, species_set_id,
    # method_link_species_set.name, class
    header = ['method_link_species_set_id', 'method_link_id',
              'species_set_id', 'align_method', 'align_clade']
    rows = [[rec['method_link_species_set_id'],
             rec['method_link_id'],
             rec['species_set_id'],
             method_link_ids[rec['method_link_id']]['type'],
             rec['name']]
            for rec in records]
    if not rows:
        # no matches: hand Table a zero-row array with the right width
        rows = empty((0, len(header)))

    t = Table(header=header, rows=rows, space=2, row_ids=True,
              title='Align Methods/Clades')
    self._method_species_link = t
    return t
def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',tmp_dir='jobs/',num_jobs=100, verbose=False):
    '''Runs the ancestral state reconstructions in parallel

    The trait table is split into one temporary single-trait file per trait
    column, one ancestral_state_reconstruction.py command is written per
    trait to a temporary jobs file, the jobs file is submitted through the
    launcher script selected by parallel_method, and once the per-trait
    output files exist the results are combined.

    Arguments:
        - tree: tree file path, passed to each job via -t
        - table: path of the tab-delimited trait table (first column is
          kept in every per-trait file)
        - asr_method: reconstruction method, passed to each job via -m
        - parallel_method: 'sge', 'multithreaded' or 'torque'
        - tmp_dir: directory in which temporary files are created
        - num_jobs: passed on to submit_jobs
        - verbose: print progress messages

    Returns (combined_table, combined_ci_table), both Table objects.

    Raises RuntimeError when parallel_method is not one of the supported
    values; this is checked before any file is created.
    '''
    launcher_scripts = {
        'sge': 'start_parallel_jobs_sge.py',
        'multithreaded': 'start_parallel_jobs.py',
        'torque': 'start_parallel_jobs_torque.py',
    }
    if parallel_method not in launcher_scripts:
        raise RuntimeError(
            "Unknown parallel_method %r; expected one of: %s"
            % (parallel_method, ', '.join(sorted(launcher_scripts))))

    scripts_dir = join(get_picrust_project_dir(), 'scripts')
    asr_script_fp = join(scripts_dir, 'ancestral_state_reconstruction.py')
    cluster_jobs_fp = join(scripts_dir, launcher_scripts[parallel_method])

    if verbose:
        print("Loading trait table...")

    trait_table = LoadTable(filename=table, header=True, sep='\t')
    num_columns = trait_table.Shape[1]

    created_tmp_files = []
    output_files = []
    ci_files = []

    # the job commands go into a tmp file that is handed to the launcher
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_asr_')
    created_tmp_files.append(jobs_fp)

    if verbose:
        print("Creating temporary input files in:  " + str(tmp_dir))

    # 'with' guarantees the jobs file is closed even if a step below fails
    with open(jobs_fp, 'w') as jobs:
        # column 0 is kept in every file; each other column is one trait
        for i in range(1, num_columns):
            # write a table holding only this trait to its own tmp file
            single_col_table = trait_table.getColumns([0, i])
            single_col_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='in_asr_')
            single_col_table.writeToFile(single_col_fp, sep='\t')
            created_tmp_files.append(single_col_fp)

            # reserve tmp output names for the job's results
            tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='out_asr_')
            output_files.append(tmp_output_fp)
            tmp_ci_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_asr_ci_')
            ci_files.append(tmp_ci_fp)

            cmd = "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(
                asr_script_fp, single_col_fp, tree, asr_method,
                tmp_output_fp, tmp_ci_fp)
            jobs.write(cmd + "\n")

    created_tmp_files.extend(output_files)
    created_tmp_files.extend(ci_files)

    if verbose:
        print("Launching parallel jobs.")

    job_prefix = 'asr'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=num_jobs)

    if verbose:
        print("Jobs are now running. Will wait until finished.")

    # wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files)

    if verbose:
        print("Jobs are done running. Now combining all tmp files.")

    # first combined record is the header, the rest are the rows
    combined = combine_asr_tables(output_files)
    combined_ci = combine_asr_tables(ci_files)
    combined_table = Table(header=combined[0], rows=combined[1:])
    combined_ci_table = Table(header=combined_ci[0], rows=combined_ci[1:])

    # clean up all tmp files
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)

    return combined_table, combined_ci_table