def equalize_tables_at_rarefaction_point(otu_table_p,name_otu_table,reference_table_p,name_reference_table,seq_number,output_p): sample_de_novo = subset_samples_by_seq_number(biom_stats(reference_table_p),seq_number) sample_ref = subset_samples_by_seq_number(biom_stats(otu_table_p),seq_number) common_sample_ids = sample_de_novo.intersection(sample_ref) #filtering from otu table new_otu_table = filter_samples(otu_table_p,common_sample_ids,0,np.inf) new_reference_table = filter_samples(reference_table_p,common_sample_ids,0,np.inf) doc1 = open(output_p+"/"+name_otu_table.replace("/","_")+"_equalized_"+str(seq_number)+'.biom',"w") doc1.write(format_biom_table(new_otu_table)) doc1.close() doc2 = open(output_p+"/"+name_reference_table.replace("/","_")+"_equalized_"+str(seq_number)+'.biom',"w") doc2.write(format_biom_table(new_reference_table)) doc2.close() print " \nPercentage of Samples : {}/{} , {}\n".format( len(new_otu_table.SampleIds) , len(otu_table_p.SampleIds) , (len(new_otu_table.SampleIds)/len(otu_table_p.SampleIds)) * 100 ) return
def equalize_tables_at_rarefaction_point(otu_table_p, name_otu_table, reference_table_p, name_reference_table, seq_number, output_p): sample_de_novo = subset_samples_by_seq_number( biom_stats(reference_table_p), seq_number) sample_ref = subset_samples_by_seq_number(biom_stats(otu_table_p), seq_number) common_sample_ids = sample_de_novo.intersection(sample_ref) #filtering from otu table new_otu_table = filter_samples(otu_table_p, common_sample_ids, 0, np.inf) new_reference_table = filter_samples(reference_table_p, common_sample_ids, 0, np.inf) doc1 = open( output_p + "/" + name_otu_table.replace("/", "_") + "_equalized_" + str(seq_number) + '.biom', "w") doc1.write(format_biom_table(new_otu_table)) doc1.close() doc2 = open( output_p + "/" + name_reference_table.replace("/", "_") + "_equalized_" + str(seq_number) + '.biom', "w") doc2.write(format_biom_table(new_reference_table)) doc2.close() print " \nPercentage of Samples : {}/{} , {}\n".format( len(new_otu_table.SampleIds), len(otu_table_p.SampleIds), (len(new_otu_table.SampleIds) / len(otu_table_p.SampleIds)) * 100) return
def setUp(self):
    """Define some test data."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []

    # fall back to /tmp/ when no temp_dir is configured
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    if not exists(self.tmp_dir):
        makedirs(self.tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(self.tmp_dir)

    # 3 samples (X, Y, Z) x 4 observations (a-d); the literal rows are
    # samples, and .T puts observations on rows as table_factory expects
    self.otu_table1 = table_factory(data=array([[2, 0, 0, 1],
                                                [1, 1, 1, 1],
                                                [0, 0, 0, 0]]).T,
                                    sample_ids=list('XYZ'),
                                    observation_ids=list('abcd'),
                                    constructor=DenseOTUTable)
    # mkstemp returns an open fd; close it, only the path is needed
    fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir,
                                     prefix='alpha_diversity_tests',
                                     suffix='.biom')
    close(fd)
    open(self.otu_table1_fp, 'w').write(
        format_biom_table(self.otu_table1))

    # same counts, but the last observation id has a trailing underscore
    self.otu_table2 = table_factory(data=array([[2, 0, 0, 1],
                                                [1, 1, 1, 1],
                                                [0, 0, 0, 0]]).T,
                                    sample_ids=list('XYZ'),
                                    observation_ids=['a', 'b', 'c', 'd_'],
                                    constructor=DenseOTUTable)
    fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir,
                                     prefix='alpha_diversity_tests',
                                     suffix='.biom')
    close(fd)
    open(self.otu_table2_fp, 'w').write(
        format_biom_table(self.otu_table2))

    # single-sample table for edge-case tests
    self.single_sample_otu_table = table_factory(
        data=array([[2, 0, 0, 1]]).T,
        sample_ids=list('X'),
        observation_ids=list('abcd'),
        constructor=DenseOTUTable)
    fd, self.single_sample_otu_table_fp = mkstemp(
        dir=self.tmp_dir,
        prefix='alpha_diversity_tests',
        suffix='.biom')
    close(fd)
    open(self.single_sample_otu_table_fp, 'w').write(
        format_biom_table(self.single_sample_otu_table))

    # trees matching the observation ids above (tree2 quotes 'd_')
    self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

    self.files_to_remove = [self.otu_table1_fp,
                            self.otu_table2_fp,
                            self.single_sample_otu_table_fp]
def setUp(self):
    """Create the L19 OTU table, tree, and temporary table files."""
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

    # 19 samples x 9 taxa abundance matrix; transposed below so rows
    # become taxa when building the table
    self.l19_data = numpy.array([
        [7, 1, 0, 0, 0, 0, 0, 0, 0],
        [4, 2, 0, 0, 0, 1, 0, 0, 0],
        [2, 4, 0, 0, 0, 1, 0, 0, 0],
        [1, 7, 0, 0, 0, 0, 0, 0, 0],
        [0, 8, 0, 0, 0, 0, 0, 0, 0],
        [0, 7, 1, 0, 0, 0, 0, 0, 0],
        [0, 4, 2, 0, 0, 0, 2, 0, 0],
        [0, 2, 4, 0, 0, 0, 1, 0, 0],
        [0, 1, 7, 0, 0, 0, 0, 0, 0],
        [0, 0, 8, 0, 0, 0, 0, 0, 0],
        [0, 0, 7, 1, 0, 0, 0, 0, 0],
        [0, 0, 4, 2, 0, 0, 0, 3, 0],
        [0, 0, 2, 4, 0, 0, 0, 1, 0],
        [0, 0, 1, 7, 0, 0, 0, 0, 0],
        [0, 0, 0, 8, 0, 0, 0, 0, 0],
        [0, 0, 0, 7, 1, 0, 0, 0, 0],
        [0, 0, 0, 4, 2, 0, 0, 0, 4],
        [0, 0, 0, 2, 4, 0, 0, 0, 1],
        [0, 0, 0, 1, 7, 0, 0, 0, 0]
    ])
    self.l19_sample_names = [
        'sam1', 'sam2', 'sam3', 'sam4', 'sam5',
        'sam6', 'sam7', 'sam8', 'sam9', 'sam_middle',
        'sam11', 'sam12', 'sam13', 'sam14', 'sam15',
        'sam16', 'sam17', 'sam18', 'sam19']
    self.l19_taxon_names = ['tax1', 'tax2', 'tax3', 'tax4', 'endbigtaxon',
                            'tax6', 'tax7', 'tax8', 'tax9']
    # same taxa, but the first name contains an underscore
    self.l19_taxon_names_w_underscore = ['ta_x1', 'tax2', 'tax3', 'tax4',
                                         'endbigtaxon', 'tax6', 'tax7',
                                         'tax8', 'tax9']

    l19_str = format_biom_table(DenseOTUTable(self.l19_data.T,
                                              self.l19_sample_names,
                                              self.l19_taxon_names))
    # NOTE(review): suffix '.blom' looks like a typo for '.biom';
    # harmless for a temp file, but confirm
    fd, self.l19_fp = mkstemp(dir=self.tmp_dir,
                              prefix='test_bdiv_otu_table',
                              suffix='.blom')
    close(fd)
    open(self.l19_fp, 'w').write(l19_str)

    l19_str_w_underscore = format_biom_table(
        DenseOTUTable(self.l19_data.T,
                      self.l19_sample_names,
                      self.l19_taxon_names_w_underscore))
    fd, self.l19_str_w_underscore_fp = mkstemp(dir=self.tmp_dir,
                                               prefix='test_bdiv_otu_table',
                                               suffix='.blom')
    close(fd)
    open(self.l19_str_w_underscore_fp, 'w').write(l19_str_w_underscore)

    # newick tree covering all 9 taxa; the backslash continues the string
    # literal, so no newline appears in its value
    self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4,\
((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'
    self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

    self.files_to_remove = [self.l19_fp, self.l19_str_w_underscore_fp]
    self.folders_to_remove = []
def setUp(self):
    """Create the L19 OTU table, tree, and temporary table files."""
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

    # 19 samples x 9 taxa abundance matrix; transposed below so rows
    # become taxa when building the table
    self.l19_data = numpy.array([[7, 1, 0, 0, 0, 0, 0, 0, 0],
                                 [4, 2, 0, 0, 0, 1, 0, 0, 0],
                                 [2, 4, 0, 0, 0, 1, 0, 0, 0],
                                 [1, 7, 0, 0, 0, 0, 0, 0, 0],
                                 [0, 8, 0, 0, 0, 0, 0, 0, 0],
                                 [0, 7, 1, 0, 0, 0, 0, 0, 0],
                                 [0, 4, 2, 0, 0, 0, 2, 0, 0],
                                 [0, 2, 4, 0, 0, 0, 1, 0, 0],
                                 [0, 1, 7, 0, 0, 0, 0, 0, 0],
                                 [0, 0, 8, 0, 0, 0, 0, 0, 0],
                                 [0, 0, 7, 1, 0, 0, 0, 0, 0],
                                 [0, 0, 4, 2, 0, 0, 0, 3, 0],
                                 [0, 0, 2, 4, 0, 0, 0, 1, 0],
                                 [0, 0, 1, 7, 0, 0, 0, 0, 0],
                                 [0, 0, 0, 8, 0, 0, 0, 0, 0],
                                 [0, 0, 0, 7, 1, 0, 0, 0, 0],
                                 [0, 0, 0, 4, 2, 0, 0, 0, 4],
                                 [0, 0, 0, 2, 4, 0, 0, 0, 1],
                                 [0, 0, 0, 1, 7, 0, 0, 0, 0]])
    self.l19_sample_names = ['sam1', 'sam2', 'sam3', 'sam4', 'sam5', 'sam6',
                             'sam7', 'sam8', 'sam9', 'sam_middle', 'sam11',
                             'sam12', 'sam13', 'sam14', 'sam15', 'sam16',
                             'sam17', 'sam18', 'sam19']
    self.l19_taxon_names = ['tax1', 'tax2', 'tax3', 'tax4', 'endbigtaxon',
                            'tax6', 'tax7', 'tax8', 'tax9']
    # same taxa, but the first name contains an underscore
    self.l19_taxon_names_w_underscore = [
        'ta_x1', 'tax2', 'tax3', 'tax4', 'endbigtaxon', 'tax6', 'tax7',
        'tax8', 'tax9'
    ]

    l19_str = format_biom_table(
        DenseOTUTable(self.l19_data.T,
                      self.l19_sample_names,
                      self.l19_taxon_names))
    # NOTE(review): suffix '.blom' looks like a typo for '.biom';
    # harmless for a temp file, but confirm
    self.l19_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                   prefix='test_bdiv_otu_table',
                                   suffix='.blom')
    open(self.l19_fp, 'w').write(l19_str)

    l19_str_w_underscore = format_biom_table(
        DenseOTUTable(self.l19_data.T,
                      self.l19_sample_names,
                      self.l19_taxon_names_w_underscore))
    self.l19_str_w_underscore_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir,
        prefix='test_bdiv_otu_table',
        suffix='.blom')
    open(self.l19_str_w_underscore_fp, 'w').write(l19_str_w_underscore)

    # newick tree covering all 9 taxa; the backslash continues the string
    # literal, so no newline appears in its value
    self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4,\
((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'
    self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

    self.files_to_remove = [self.l19_fp, self.l19_str_w_underscore_fp]
    self.folders_to_remove = []
def setUp(self):
    """Define some test data."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []

    # fall back to /tmp/ when no temp_dir is configured
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    if not exists(self.tmp_dir):
        makedirs(self.tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(self.tmp_dir)

    # 3 samples (X, Y, Z) x 4 observations (a-d); the literal rows are
    # samples, and .T puts observations on rows as table_factory expects
    self.otu_table1 = table_factory(data=array([[2, 0, 0, 1],
                                                [1, 1, 1, 1],
                                                [0, 0, 0, 0]]).T,
                                    sample_ids=list('XYZ'),
                                    observation_ids=list('abcd'),
                                    constructor=DenseOTUTable)
    self.otu_table1_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                          prefix='alpha_diversity_tests',
                                          suffix='.biom',
                                          result_constructor=str)
    open(self.otu_table1_fp, 'w').write(
        format_biom_table(self.otu_table1))

    # same counts, but the last observation id has a trailing underscore
    self.otu_table2 = table_factory(data=array([[2, 0, 0, 1],
                                                [1, 1, 1, 1],
                                                [0, 0, 0, 0]]).T,
                                    sample_ids=list('XYZ'),
                                    observation_ids=['a', 'b', 'c', 'd_'],
                                    constructor=DenseOTUTable)
    self.otu_table2_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                          prefix='alpha_diversity_tests',
                                          suffix='.biom',
                                          result_constructor=str)
    open(self.otu_table2_fp, 'w').write(
        format_biom_table(self.otu_table2))

    # single-sample table for edge-case tests
    self.single_sample_otu_table = table_factory(
        data=array([[2, 0, 0, 1]]).T,
        sample_ids=list('X'),
        observation_ids=list('abcd'),
        constructor=DenseOTUTable)
    self.single_sample_otu_table_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir,
        prefix='alpha_diversity_tests',
        suffix='.biom',
        result_constructor=str)
    open(self.single_sample_otu_table_fp, 'w').write(
        format_biom_table(self.single_sample_otu_table))

    # trees matching the observation ids above (tree2 quotes 'd_')
    self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

    self.files_to_remove = [
        self.otu_table1_fp, self.otu_table2_fp,
        self.single_sample_otu_table_fp
    ]
def test_split_otu_table_on_sample_metadata_extra_mapping_entries(self):
    """ split_otu_table_on_sample_metadata functions as expected with extra mapping data
    """
    actual = list(split_otu_table_on_sample_metadata(self.otu_table_f1,
                                                     self.mapping_f2,
                                                     "Treatment"))
    # parse both actual and expected into table objects so the comparison
    # ignores serialization differences
    actual = [(id_, parse_biom_table(e)) for id_, e in actual]
    exp = [(id_, parse_biom_table(e)) for id_, e in otu_table_exp1]

    # the order of the split tables is not guaranteed; sort before comparing
    actual.sort()
    exp.sort()

    for a, e in zip(actual, exp):
        self.assertEqual(a, e, "OTU tables are not equal:\n%s\n%s" %
                         (format_biom_table(a[1]), format_biom_table(e[1])))
def test_split_otu_table_on_sample_metadata_extra_mapping_entries(self):
    """ split_otu_table_on_sample_metadata functions as expected with extra mapping data
    """
    actual = list(split_otu_table_on_sample_metadata(self.otu_table_f1,
                                                     self.mapping_f2,
                                                     "Treatment"))
    # parse both actual and expected into table objects so the comparison
    # ignores serialization differences
    actual = [(id_,parse_biom_table(e)) for id_, e in actual]
    exp = [(id_,parse_biom_table(e)) for id_, e in otu_table_exp1]

    # the order of the split tables is not guaranteed; sort before comparing
    actual.sort()
    exp.sort()

    for a,e in zip(actual,exp):
        self.assertEqual(a,e,"OTU tables are not equal:\n%s\n%s" % \
            (format_biom_table(a[1]),format_biom_table(e[1])))
def setUp(self):
    """Create temp OTU tables (with and without metadata) and a rarefaction dir."""
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

    # 4 observations x 3 samples
    self.otu_table_data = numpy.array([[2, 1, 0],
                                       [0, 5, 0],
                                       [0, 3, 0],
                                       [1, 2, 0]])
    self.sample_names = list('YXZ')
    self.taxon_names = list('bacd')
    self.otu_metadata = [{
        'domain': 'Archaea'
    }, {
        'domain': 'Bacteria'
    }, {
        'domain': 'Bacteria'
    }, {
        'domain': 'Bacteria'
    }]

    self.otu_table = table_factory(self.otu_table_data,
                                   self.sample_names,
                                   self.taxon_names)
    # same table, but with per-observation metadata attached
    self.otu_table_meta = table_factory(
        self.otu_table_data, self.sample_names, self.taxon_names,
        observation_metadata=self.otu_metadata)

    self.otu_table_str = format_biom_table(self.otu_table)
    self.otu_table_meta_str = format_biom_table(self.otu_table_meta)

    # mkstemp returns an open fd; close it, only the path is needed
    _, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                   prefix='test_rarefaction',
                                   suffix='.biom')
    close(_)
    _, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                        prefix='test_rarefaction',
                                        suffix='.biom')
    close(_)
    self.rare_dir = mkdtemp(dir=self.tmp_dir,
                            prefix='test_rarefaction_dir',
                            suffix='')

    open(self.otu_table_fp, 'w').write(self.otu_table_str)
    open(self.otu_table_meta_fp, 'w').write(self.otu_table_meta_str)

    self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def generate_full_otu_table(study, study_input_dir, zip_fname,
                            files_to_remove, biom_files, output_dir):
    """Merge the given biom files into one table and write it to output_dir.

    study: study id used to name the output file
    study_input_dir / zip_fname: kept for interface compatibility (unused;
        the zip step was removed from this function)
    files_to_remove: list of paths scheduled for cleanup; the merged table
        path is appended to it
    biom_files: paths of the tables to merge (at least one)
    output_dir: directory the merged table is written to
    Returns the updated files_to_remove list.
    """
    master = parse_biom_table(open(biom_files[0], 'U'))
    # only merge if there is more than 1 biom file
    if len(biom_files) > 1:
        for input_fp in biom_files[1:]:
            master = master.merge(parse_biom_table(open(input_fp, 'U')))

    # write full biom-table
    full_biom_table_fname = 'study_%s_closed_reference_otu_table.biom' % \
        (str(study))
    full_biom_table_fp = join(output_dir, full_biom_table_fname)
    # add to list of files to remove
    files_to_remove.append(full_biom_table_fp)

    # fix: the original wrapped the path in a redundant single-argument
    # join() call; also removed long-dead commented-out zip code
    biom_f = open(full_biom_table_fp, 'w')
    biom_f.write(format_biom_table(master))
    biom_f.close()

    return files_to_remove
def main():
    """Add per-observation taxonomy metadata to a biom table."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    labels = opts.labels.split(',')
    # with --all_strings every metadata field is kept as a plain string
    if opts.all_strings:
        process_fs = [str] * len(labels)
        observation_metadata = parse_taxonomy_to_otu_metadata(
            open(opts.taxonomy_fp, 'U'), labels=labels, process_fs=process_fs)
    else:
        observation_metadata = parse_taxonomy_to_otu_metadata(
            open(opts.taxonomy_fp, 'U'), labels=labels)

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))

    # fix: identity comparison with None ("is not None") instead of "!= None"
    if otu_table.ObservationMetadata is not None:
        # if there is already metadata associated with the
        # observations, confirm that none of the metadata names
        # are already present
        existing_keys = otu_table.ObservationMetadata[0].keys()
        for label in labels:
            if label in existing_keys:
                option_parser.error(
                    "%s is already an observation metadata field."
                    " Can't add it, so nothing is being added." % label)

    otu_table.addObservationMetadata(observation_metadata)

    output_f = open(opts.output_fp, 'w')
    output_f.write(format_biom_table(otu_table))
    output_f.close()
def split_otu_table_on_taxonomy_to_files(otu_table_fp,
                                         level,
                                         output_dir,
                                         md_identifier='taxonomy',
                                         md_processor=process_md_as_list):
    """ Split OTU table by taxonomic level, writing otu tables to output dir

    otu_table_fp: path to the input biom table
    level: taxonomic level to split at
    output_dir: directory the per-taxon tables are written to
    md_identifier: observation metadata key holding the taxonomy
    md_processor: callable extracting the bin id from the metadata
    Returns the list of output file paths.
    """
    results = []
    otu_table = parse_biom_table(open(otu_table_fp, 'U'))
    create_dir(output_dir)

    def split_f(obs_md):
        # fix: raise exception instances; the original used the
        # Python-2-only "raise ExcType, msg" statement form
        try:
            result = md_processor(obs_md, md_identifier, level)
        except KeyError:
            raise KeyError("Metadata identifier (%s) is not associated with all (or any) observerations. You can modify the key with the md_identifier parameter." % md_identifier)
        except TypeError:
            raise TypeError("Can't correctly process the metadata string. If your input file was generated from QIIME 1.4.0 or earlier you may need to pass --md_as_string.")
        except AttributeError:
            raise AttributeError("Metadata category not found. If your input file was generated from QIIME 1.4.0 or earlier you may need to pass --md_identifier \"Consensus Lineage\".")
        return result

    for bin, sub_otu_table in otu_table.binObservationsByMetadata(split_f):
        output_fp = '%s/otu_table_%s.biom' % (output_dir, bin)
        output_f = open(output_fp, 'w')
        output_f.write(format_biom_table(sub_otu_table))
        output_f.close()
        results.append(output_fp)
    return results
def main():
    """Add per-observation taxonomy metadata to a biom table."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    labels = opts.labels.split(',')
    # with --all_strings every metadata field is kept as a plain string
    if opts.all_strings:
        process_fs = [str] * len(labels)
        observation_metadata = parse_taxonomy_to_otu_metadata(
            open(opts.taxonomy_fp, 'U'), labels=labels, process_fs=process_fs)
    else:
        observation_metadata = parse_taxonomy_to_otu_metadata(
            open(opts.taxonomy_fp, 'U'), labels=labels)

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))

    # fix: identity comparison with None ("is not None") instead of "!= None"
    if otu_table.ObservationMetadata is not None:
        # if there is already metadata associated with the
        # observations, confirm that none of the metadata names
        # are already present
        existing_keys = otu_table.ObservationMetadata[0].keys()
        for label in labels:
            if label in existing_keys:
                option_parser.error(
                    "%s is already an observation metadata field."
                    " Can't add it, so nothing is being added." % label)

    otu_table.addObservationMetadata(observation_metadata)

    output_f = open(opts.output_fp, 'w')
    output_f.write(format_biom_table(otu_table))
    output_f.close()
def main():
    """Sort the samples of an OTU table and write the sorted table."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    otu_table_data = parse_biom_table(open(opts.input_otu_table, 'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        # sort by the values of a mapping-file column
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        # sort by an explicit, pre-ordered list of sample ids
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(otu_table_data,
                                sorted_sample_ids)
    else:
        # default: case-insensitive natural sort of the sample ids
        result = sort_otu_table(otu_table_data,
                                natsort_case_insensitive(otu_table_data.SampleIds))

    # format and write the otu table
    result_str = format_biom_table(result)
    of = open(opts.output_fp, 'w')
    of.write(result_str)
    of.close()
def main():
    """Sort the samples of an OTU table and write the sorted table."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    otu_table_data = parse_biom_table(open(opts.input_otu_table, 'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        # sort by the values of a mapping-file column
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        # sort by an explicit, pre-ordered list of sample ids
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(otu_table_data, sorted_sample_ids)
    else:
        # default: case-insensitive natural sort of the sample ids
        result = sort_otu_table(
            otu_table_data, natsort_case_insensitive(otu_table_data.SampleIds))

    # format and write the otu table
    result_str = format_biom_table(result)
    of = open(opts.output_fp, 'w')
    of.write(result_str)
    of.close()
def split_otu_table_on_taxonomy_to_files(otu_table_fp,
                                         level,
                                         output_dir,
                                         md_identifier='taxonomy',
                                         md_processor=process_md_as_list):
    """ Split OTU table by taxonomic level, writing otu tables to output dir

    otu_table_fp: path to the input biom table
    level: taxonomic level to split at
    output_dir: directory the per-taxon tables are written to
    md_identifier: observation metadata key holding the taxonomy
    md_processor: callable extracting the bin id from the metadata
    Returns the list of output file paths.
    """
    results = []
    otu_table = parse_biom_table(open(otu_table_fp, 'U'))
    create_dir(output_dir)

    def split_f(obs_md):
        # fix: raise exception instances; the original used the
        # Python-2-only "raise ExcType, msg" statement form
        try:
            result = md_processor(obs_md, md_identifier, level)
        except KeyError:
            raise KeyError("Metadata identifier (%s) is not associated with all (or any) observerations. You can modify the key with the md_identifier parameter." % md_identifier)
        except TypeError:
            raise TypeError("Can't correctly process the metadata string. If your input file was generated from QIIME 1.4.0 or earlier you may need to pass --md_as_string.")
        except AttributeError:
            raise AttributeError("Metadata category not found. If your input file was generated from QIIME 1.4.0 or earlier you may need to pass --md_identifier \"Consensus Lineage\".")
        return result

    for bin, sub_otu_table in otu_table.binObservationsByMetadata(split_f):
        output_fp = '%s/otu_table_%s.biom' % (output_dir, bin)
        output_f = open(output_fp, 'w')
        output_f.write(format_biom_table(sub_otu_table))
        output_f.close()
        results.append(output_fp)
    return results
def make_otu_table(otu_map_f, otu_to_taxonomy=None, delim='_', table_id=None,
                   sample_metadata=None, constructor=SparseOTUTable):
    """Build a serialized biom table from an OTU map.

    otu_map_f: file handle for the OTU map
    otu_to_taxonomy: optional dict of OTU id -> taxonomy; OTUs missing from
        the dict get the placeholder taxonomy ["None"]
    delim: delimiter between sample id and sequence id in the OTU map
    table_id: optional id stored on the resulting table
    sample_metadata: not supported; any non-None value raises
        NotImplementedError
    constructor: biom table class to build
    Returns the table serialized with format_biom_table.
    """
    data, sample_ids, otu_ids = parse_otu_map(otu_map_f, delim)

    if otu_to_taxonomy is not None:
        otu_metadata = []
        for o in otu_ids:
            try:
                otu_metadata.append({'taxonomy': otu_to_taxonomy[o]})
            except KeyError:
                # OTU not present in the taxonomy map
                otu_metadata.append({'taxonomy': ["None"]})
    else:
        otu_metadata = None

    if sample_metadata is not None:
        raise NotImplementedError(
            "Passing of sample metadata to make_otu_table is not currently supported.")
    try:
        otu_table = table_factory(data, sample_ids, otu_ids,
                                  sample_metadata=sample_metadata,
                                  observation_metadata=otu_metadata,
                                  table_id=table_id,
                                  constructor=constructor,
                                  dtype=int)
    except ValueError as e:
        # most common cause is an empty OTU map
        raise ValueError("Couldn't create OTU table. Is your OTU map empty?"
                         " Original error message: %s" % (str(e)))
    return format_biom_table(otu_table)
def main():
    """Filter a biom table's observations by taxonomy metadata."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_table = parse_biom_table(open(opts.input_otu_table_fp, 'U'))
    output_table_f = open(opts.output_otu_table_fp, 'w')
    metadata_field = opts.metadata_field

    # fix: the original's "else" branches re-assigned None to values that
    # were already None; only split when a value was actually provided
    positive_taxa = opts.positive_taxa
    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')
    negative_taxa = opts.negative_taxa
    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')

    filter_fn = get_otu_ids_from_taxonomy_f(positive_taxa,
                                            negative_taxa,
                                            metadata_field)
    output_table = input_table.filterObservations(filter_fn)
    output_table_f.write(format_biom_table(output_table))
    output_table_f.close()
def _write_rarefaction(self, fname, sub_otu_table): """ depth and rep can be numbers or strings """ if sub_otu_table.isEmpty(): return f = open(fname, 'w') f.write(format_biom_table(sub_otu_table)) f.close()
def setUp(self):
    """Define some test data."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []

    # fall back to /tmp/ when no temp_dir is configured
    self.tmp_dir = self.qiime_config["temp_dir"] or "/tmp/"
    if not exists(self.tmp_dir):
        makedirs(self.tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(self.tmp_dir)

    # 3 samples (X, Y, Z) x 4 observations (a-d); the literal rows are
    # samples, and .T puts observations on rows as table_factory expects
    self.otu_table1 = table_factory(
        data=array([[2, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]]).T,
        sample_ids=list("XYZ"),
        observation_ids=list("abcd"),
        constructor=DenseOTUTable,
    )
    self.otu_table1_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests", suffix=".biom", result_constructor=str
    )
    open(self.otu_table1_fp, "w").write(format_biom_table(self.otu_table1))

    # same counts, but the last observation id has a trailing underscore
    self.otu_table2 = table_factory(
        data=array([[2, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]]).T,
        sample_ids=list("XYZ"),
        observation_ids=["a", "b", "c", "d_"],
        constructor=DenseOTUTable,
    )
    self.otu_table2_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests", suffix=".biom", result_constructor=str
    )
    open(self.otu_table2_fp, "w").write(format_biom_table(self.otu_table2))

    # single-sample table for edge-case tests
    self.single_sample_otu_table = table_factory(
        data=array([[2, 0, 0, 1]]).T, sample_ids=list("X"), observation_ids=list("abcd"), constructor=DenseOTUTable
    )
    self.single_sample_otu_table_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests", suffix=".biom", result_constructor=str
    )
    open(self.single_sample_otu_table_fp, "w").write(format_biom_table(self.single_sample_otu_table))

    # trees matching the observation ids above (tree2 quotes 'd_')
    self.tree1 = parse_newick("((a:2,b:3):2,(c:1,d:2):7);")
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

    self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp, self.single_sample_otu_table_fp]
def main():
    """Filter samples from an OTU table by metadata, counts, or id list."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # at least one filtering criterion must be supplied
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))
    output_f = open(opts.output_fp, 'w')

    if (mapping_fp and valid_states):
        # keep samples whose mapping-file metadata matches valid_states
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    if (sample_id_fp is not None):
        # intersect with the explicit id list: first whitespace-separated
        # field of each non-comment line
        sample_id_f_ids = set([
            l.strip().split()[0] for l in open(sample_id_fp, 'U')
            if not l.startswith('#')
        ])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.SampleIds)
        open(output_mapping_fp, 'w').write(format_mapping_file(mapping_headers,
                                                               mapping_data))
def test_split_otu_table_on_sample_metadata(self):
    """ split_otu_table_on_sample_metadata functions as expected with valid input """
    actual = list(split_otu_table_on_sample_metadata(self.otu_table_f1,
                                                     self.mapping_f1,
                                                     "Treatment"))
    # debugging aid: print any split table that fails to parse
    for id_, e in actual:
        try:
            parse_biom_table(e)
        except:
            print e

    # parse both actual and expected into table objects so the comparison
    # ignores serialization differences
    actual = [(id_,parse_biom_table(e)) for id_, e in actual]
    exp = [(id_,parse_biom_table(e)) for id_, e in otu_table_exp1]

    # the order of the split tables is not guaranteed; sort before comparing
    actual.sort()
    exp.sort()

    for a,e in zip(actual,exp):
        self.assertEqual(a,e,"OTU tables are not equal:\n%s\n%s" % \
            (format_biom_table(a[1]),format_biom_table(e[1])))
def _write_rarefaction(self, depth, rep, sub_otu_table): """ depth and rep can be numbers or strings """ if sub_otu_table.isEmpty(): return fname = 'rarefaction_' + str(depth) + '_' + str(rep) + '.biom' f = open(os.path.join(self.output_dir, fname), 'w') f.write(format_biom_table(sub_otu_table)) f.close()
def main():
    """Convert a sample mapping file to a biom table on disk."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sample_mapping_fp = opts.sample_mapping_fp
    output_fp = opts.output_fp
    # fix: dropped the unused `verbose` local (opts.verbose was read but
    # never used)

    sample_mapping_file = open(sample_mapping_fp, 'U')
    result = sample_mapping_to_biom_table(sample_mapping_file)

    # explicitly close the output handle instead of relying on refcounting
    output_f = open(output_fp, 'w')
    output_f.write(format_biom_table(result))
    output_f.close()
def _write_rarefaction(self, depth, rep, sub_otu_table): """ depth and rep can be numbers or strings """ if sub_otu_table.isEmpty(): return fname = 'rarefaction_'+str(depth)+'_'+str(rep)+'.biom' f = open(os.path.join(self.output_dir,fname), 'w') f.write(format_biom_table(sub_otu_table)) f.close()
def test_split_otu_table_on_sample_metadata(self):
    """ split_otu_table_on_sample_metadata functions as expected with valid input """
    actual = list(split_otu_table_on_sample_metadata(self.otu_table_f1,
                                                     self.mapping_f1,
                                                     "Treatment"))
    # debugging aid: print any split table that fails to parse
    for id_, e in actual:
        try:
            parse_biom_table(e)
        except:
            print e

    # parse both actual and expected into table objects so the comparison
    # ignores serialization differences
    actual = [(id_, parse_biom_table(e)) for id_, e in actual]
    exp = [(id_, parse_biom_table(e)) for id_, e in otu_table_exp1]

    # the order of the split tables is not guaranteed; sort before comparing
    actual.sort()
    exp.sort()

    for a, e in zip(actual, exp):
        self.assertEqual(a, e, "OTU tables are not equal:\n%s\n%s" %
                         (format_biom_table(a[1]), format_biom_table(e[1])))
def setUp(self):
    """Create temp OTU tables (with and without metadata) and a rarefaction dir."""
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

    # 4 observations x 3 samples
    self.otu_table_data = numpy.array([[2,1,0],
                                       [0,5,0],
                                       [0,3,0],
                                       [1,2,0]])
    self.sample_names = list('YXZ')
    self.taxon_names = list('bacd')
    self.otu_metadata = [{'domain':'Archaea'},
                         {'domain':'Bacteria'},
                         {'domain':'Bacteria'},
                         {'domain':'Bacteria'}]

    self.otu_table = table_factory(self.otu_table_data,
                                   self.sample_names,
                                   self.taxon_names)
    # same table, but with per-observation metadata attached
    self.otu_table_meta = table_factory(self.otu_table_data,
                                        self.sample_names,
                                        self.taxon_names,
                                        observation_metadata=self.otu_metadata)

    self.otu_table_str = format_biom_table(self.otu_table)
    self.otu_table_meta_str = format_biom_table(self.otu_table_meta)

    self.otu_table_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                         prefix='test_rarefaction',
                                         suffix='.biom')
    self.otu_table_meta_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                              prefix='test_rarefaction',
                                              suffix='.biom')
    self.rare_dir = get_tmp_filename(tmp_dir=self.tmp_dir,
                                     prefix='test_rarefaction_dir',
                                     suffix='',
                                     result_constructor=str)
    os.mkdir(self.rare_dir)

    open(self.otu_table_fp,'w').write(self.otu_table_str)
    open(self.otu_table_meta_fp,'w').write(self.otu_table_meta_str)

    self._paths_to_clean_up = [self.otu_table_fp,self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def main():
    """Merge the biom tables named on the command line into one output table."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    # seed with the first table, then fold the rest in one at a time
    merged = parse_biom_table(open(input_fps[0], 'U'))
    for next_fp in input_fps[1:]:
        merged = merged.merge(parse_biom_table(open(next_fp, 'U')))

    with open(opts.output_fp, 'w') as out_f:
        out_f.write(format_biom_table(merged))
def main():
    """Filter samples from an OTU table by metadata, counts, or id list."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # fix: identity comparisons with None ("is not None") instead of
    # "!= None", matching the formatted variant of this script
    # at least one filtering criterion must be supplied
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))
    output_f = open(opts.output_fp, 'w')

    if (mapping_fp and valid_states):
        # keep samples whose mapping-file metadata matches valid_states
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    if (sample_id_fp is not None):
        # intersect with the explicit id list: first whitespace-separated
        # field of each non-comment line
        sample_id_f_ids = set([l.strip().split()[0]
                               for l in open(sample_id_fp, 'U')
                               if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(mapping_data,
                                mapping_headers,
                                filtered_otu_table.SampleIds)
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
def main():
    """Convert a T-RFLP text file into a biom table."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # fix: raise an exception instance; "raise IOError, msg" is
    # Python-2-only statement syntax
    if not isfile(opts.input_path):
        raise IOError(
            "Input path (%s) not valid. Does it exist?" % opts.input_path)

    samples, otus, data = parse_trflp(open(opts.input_path, 'U'))

    output_f = open(opts.output_path, 'w')
    t = table_factory(data, samples, otus)
    output_f.write(format_biom_table(t))
    output_f.close()
def setUp(self):
    """Create temp OTU tables (with and without metadata) and a rarefaction dir."""
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config["temp_dir"] or "/tmp/"

    # 4 observations x 3 samples
    self.otu_table_data = numpy.array([[2, 1, 0], [0, 5, 0], [0, 3, 0], [1, 2, 0]])
    self.sample_names = list("YXZ")
    self.taxon_names = list("bacd")
    self.otu_metadata = [
        {"domain": "Archaea"},
        {"domain": "Bacteria"},
        {"domain": "Bacteria"},
        {"domain": "Bacteria"},
    ]

    self.otu_table = table_factory(self.otu_table_data, self.sample_names, self.taxon_names)
    # same table, but with per-observation metadata attached
    self.otu_table_meta = table_factory(
        self.otu_table_data, self.sample_names, self.taxon_names, observation_metadata=self.otu_metadata
    )

    self.otu_table_str = format_biom_table(self.otu_table)
    self.otu_table_meta_str = format_biom_table(self.otu_table_meta)

    self.otu_table_fp = get_tmp_filename(tmp_dir=self.tmp_dir, prefix="test_rarefaction", suffix=".biom")
    self.otu_table_meta_fp = get_tmp_filename(tmp_dir=self.tmp_dir, prefix="test_rarefaction", suffix=".biom")
    self.rare_dir = get_tmp_filename(
        tmp_dir=self.tmp_dir, prefix="test_rarefaction_dir", suffix="", result_constructor=str
    )
    os.mkdir(self.rare_dir)

    open(self.otu_table_fp, "w").write(self.otu_table_str)
    open(self.otu_table_meta_fp, "w").write(self.otu_table_meta_str)

    self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def simsam_range_to_files(table,
                          tree,
                          simulated_sample_sizes,
                          dissimilarities,
                          output_dir,
                          mapping_f=None,
                          output_table_basename="table",
                          output_map_basename="map"):
    """Applies sim_otu_table over a range of parameters, writing output to file

     table: the input table to simulate samples from
     tree: tree related OTUs in input table
     simulated_sample_sizes: a list of ints defining how many
      output samples should be create per input sample
     dissimilarities: a list of floats containing the dissimilarities
      to use in simulating tables
     output_dir: the directory where all output tables and
      mapping files should be written
     mapping_f: file handle for metadata mapping file, if a mapping
      file should be created with the samples from each simulated
      table
     output_table_basename: basename for output table files
      (default: table)
     output_map_basename: basename for output mapping files
      (default: map)
    """
    create_dir(output_dir)
    for e in simsam_range(table,
                          tree,
                          simulated_sample_sizes,
                          dissimilarities,
                          mapping_f):
        output_table = e[0]
        output_mapping_lines = e[1]
        simulated_sample_size = e[2]
        dissimilarity = e[3]

        output_table_fp = join(
            output_dir,
            '%s_n%d_d%r.biom' % (output_table_basename,
                                 simulated_sample_size,
                                 dissimilarity))
        output_table_f = open(output_table_fp, 'w')
        output_table_f.write(format_biom_table(output_table))
        output_table_f.close()

        # fix: identity comparison with None ("is not None") instead
        # of "!= None"
        if output_mapping_lines is not None:
            output_map_fp = join(
                output_dir,
                '%s_n%d_d%r.txt' % (output_map_basename,
                                    simulated_sample_size,
                                    dissimilarity))
            output_map_f = open(output_map_fp, 'w')
            output_map_f.write(''.join(output_mapping_lines))
            output_map_f.close()
def simsam_range_to_files(table, tree, simulated_sample_sizes, dissimilarities,
                          output_dir, mapping_f=None,
                          output_table_basename="table",
                          output_map_basename="map"):
    """Run sim_otu_table across a parameter grid, writing each result to disk.

    table: input table whose samples are simulated from
    tree: tree relating the OTUs in the input table
    simulated_sample_sizes: ints; output samples to create per input sample
    dissimilarities: floats; dissimilarities to use when simulating
    output_dir: directory receiving all tables and mapping files
    mapping_f: metadata mapping file handle, or None for no mapping output
    output_table_basename: basename for table files (default: table)
    output_map_basename: basename for mapping files (default: map)
    """
    create_dir(output_dir)
    results = simsam_range(table, tree, simulated_sample_sizes,
                           dissimilarities, mapping_f)
    for sim_table, map_lines, n_samples, dissim in results:
        # both output files for one grid point share the same suffix
        suffix = '_n%d_d%r' % (n_samples, dissim)

        table_fp = join(output_dir,
                        '%s%s.biom' % (output_table_basename, suffix))
        table_out = open(table_fp, 'w')
        table_out.write(format_biom_table(sim_table))
        table_out.close()

        if map_lines is not None:
            map_fp = join(output_dir,
                          '%s%s.txt' % (output_map_basename, suffix))
            map_out = open(map_fp, 'w')
            map_out.write(''.join(map_lines))
            map_out.close()
def main():
    """Simulate samples from an OTU table/tree and write the result as BIOM.

    Reads opts.otu_table and opts.tree_file, runs sim_otu_table with
    opts.num simulated samples per input sample at dissimilarity
    opts.dissim, and writes the simulated table to opts.output_file.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # open the output first so an unwritable path fails before the
    # (potentially expensive) simulation work
    out_fh = open(opts.output_file, 'w')

    # close the input handles as soon as they are parsed (the original
    # left them to the garbage collector)
    with open(opts.otu_table, 'U') as otu_table_fh:
        otu_table = parse_biom_table(otu_table_fh)
    with open(opts.tree_file, 'U') as tree_fh:
        tree = DndParser(tree_fh)

    res_sam_names, res_otus, res_otu_mtx, res_otu_metadata = \
        sim_otu_table(otu_table.SampleIds,
                      otu_table.ObservationIds,
                      otu_table.iterSamples(),
                      otu_table.ObservationMetadata,
                      tree,
                      opts.num,
                      opts.dissim)

    rich_table = table_factory(res_otu_mtx, res_sam_names, res_otus,
                               observation_metadata=res_otu_metadata)
    out_fh.write(format_biom_table(rich_table))
    out_fh.close()
def split_otu_table_on_sample_metadata(otu_table_f, mapping_f, mapping_field):
    """Yield one sub-OTU-table per value of mapping_field.

    Generator of (filesystem-safe value string, biom table string) pairs;
    values whose sample set would be empty are skipped.
    """
    # materialize the mapping file so it can be iterated more than once
    mapping_lines = list(mapping_f)
    otu_table = parse_biom_table(otu_table_f)

    for field_value in get_mapping_values(mapping_lines, mapping_field):
        ids_to_keep = sample_ids_from_metadata_description(
            mapping_lines,
            valid_states_str="%s:%s" % (mapping_field, field_value))
        try:
            # bind ids_to_keep as a default arg so the predicate is
            # self-contained
            sub_table = otu_table.filterSamples(
                lambda values, id_, metadata, ids=ids_to_keep: id_ in ids)
        except TableException:
            # every sample was filtered out -- nothing to yield here
            continue
        # spaces are not filesystem-friendly in downstream file names
        yield field_value.replace(' ', '_'), format_biom_table(sub_table)
def main():
    """Simulate samples from an OTU table and write the result as BIOM."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    out_fh = open(opts.output_file, 'w')
    input_table = parse_biom_table(open(opts.otu_table, 'U'))
    phylo_tree = DndParser(open(opts.tree_file, 'U'))

    sim_result = sim_otu_table(input_table.SampleIds,
                               input_table.ObservationIds,
                               input_table.iterSamples(),
                               input_table.ObservationMetadata,
                               phylo_tree,
                               opts.num,
                               opts.dissim)
    sample_names, otu_ids, otu_mtx, otu_metadata = sim_result

    simulated = table_factory(otu_mtx, sample_names, otu_ids,
                              observation_metadata=otu_metadata)
    out_fh.write(format_biom_table(simulated))
def main():
    """Collapse the samples of an OTU table by a mapping-file category.

    Bins samples by the value of opts.mapping_category (supporting
    combined "col1&&col2" categories), optionally normalizes the
    collapsed table, and writes the result to opts.output_fp as BIOM.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # returns the bin a sample should be placed into (a def rather than
    # an assigned lambda, per PEP 8)
    def bin_function(sample_metadata):
        return sample_metadata[mapping_category]

    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    with open(mapping_fp, 'U') as mapping_f:
        mapping, headers, comments = parse_mapping_file(mapping_f)

    # ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        # header row followed by the data rows, as one list
        new_mapping = [headers] + mapping
        # create an array using multiple columns from the mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    with open(otu_table_fp, 'U') as otu_table_f:
        table = parse_biom_table(otu_table_f)
    table.addSampleMetadata(sample_metadata)

    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapseSamplesByMetadata(bin_function,
                                             norm=False,
                                             min_group_size=1)

    # normalize the result if requested by the user
    if normalize:
        result = result.normObservationBySample()

    # write a new BIOM file
    with open(output_fp, 'w') as out_f:
        out_f.write(format_biom_table(result))
def split_otu_table_on_sample_metadata(otu_table_f, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represent samples
        corresponding to only a certain value in mapping_field

        Generator: yields (value_as_filename_string, biom_table_string)
        for each value of mapping_field. Values for which every sample
        would be filtered out are skipped silently.
    """
    # materialize mapping_f so it can be iterated once per field value below
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    otu_table = parse_biom_table(otu_table_f)

    for v in mapping_values:
        # spaces are not filesystem-friendly in downstream file names
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))
        try:
            filtered_otu_table = otu_table.filterSamples(
                lambda values, id_, metadata: id_ in sample_ids_to_keep)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        yield v_fp_str, format_biom_table(filtered_otu_table)
def main():
    """Collapse the samples of an OTU table by a mapping-file category.

    Bins samples by the value of opts.mapping_category (supporting
    combined "col1&&col2" categories), optionally normalizes the
    collapsed table, and writes the result to opts.output_fp as BIOM.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample should be placed into
    bin_function = lambda sample_metadata: sample_metadata[mapping_category]

    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        # copy: header row first, then each data row
        new_mapping = []
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        # create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    table = parse_biom_table(open(otu_table_fp, 'U'))
    table.addSampleMetadata(sample_metadata)

    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapseSamplesByMetadata(bin_function,
                                             norm=False,
                                             min_group_size=1)

    # normalize the result if requested by the user
    if normalize:
        result = result.normObservationBySample()

    # write a new BIOM file
    f = open(output_fp, 'w')
    f.write(format_biom_table(result))
    f.close()
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
            representative set from step 4 as the reference set.

        Most work is delegated to shell commands dispatched through
        command_handler; this function mainly builds file paths and
        command strings in strict order, so statement order matters.
    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    # only close the logger at the end if we created it here
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,
                       [input_fp,
                        refseqs_fp,
                        step1_otu_map_fp,
                        step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        # optional prefilter pass: discard reads that don't hit the
        # reference at a loose percent identity before real OTU picking
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
                input_fp,prefilter_dir,reference_otu_picking_method,
                prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # all downstream steps now operate on the prefiltered input
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        ## Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
            input_fp,step1_dir,reference_otu_picking_method,
            refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir,input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    # record the equivalent API call in the log for reproducibility
    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp),
                                                    abspath(step2_input_fasta_fp),
                                                    percent_subsample))

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    # Step 3: pick reference OTUs on ALL failures against the de novo
    # rep set from step 2
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                    step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp,
                                    parallel,
                                    params,
                                    logger)
    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        # Step 4: de novo cluster whatever still failed after step 3
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir, min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp, min_otu_size)

    # record the equivalent API call in the log for reproducibility
    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                                    abspath(otu_no_singletons_fp),
                                                    min_otu_size))

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write('# Write non-singleton otus representative sequences from ' +
                 'step 2 and step 4 to the final representative set and the new reference' +
                 ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)

    if run_assign_tax:
        # skip rebuilding if the final taxonomy-annotated table exists
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        # skip rebuilding if the PyNAST-filtered table exists
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
            representative set from step 4 as the reference set.

        NOTE(review): an older variant of this workflow also appears in
        this file; this one merges OTU maps with '>>' (append) rather
        than '>' -- confirm which variant is actually imported/used.
    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust','usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref','usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    # only close the logger at the end if we created it here
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,[input_fp,
                               refseqs_fp,
                               step1_otu_map_fp,
                               step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        # optional prefilter pass at a loose percent identity before
        # the real OTU picking
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
                input_fp,prefilter_dir,reference_otu_picking_method,
                prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # all downstream steps now operate on the prefiltered input
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
            input_fp,step1_dir,reference_otu_picking_method,
            refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir,input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',step2_rep_set_cmd)])

    # Step 3: pick reference OTUs on ALL failures against the de novo
    # rep set from step 2
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)
    commands.append([
        ('Pick reference OTUs using de novo rep set',step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        # Step 4: de novo cluster whatever still failed after step 3
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
            (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',step4_rep_set_cmd)])
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
            (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp,output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,otu_no_singletons_fp,min_otu_size)

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)

    if run_assign_tax:
        # skip rebuilding if the final taxonomy-annotated table exists
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        # skip rebuilding if the PyNAST-filtered table exists
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.

        Each entry in input_fps is processed in its own numbered subdirectory
        of output_dir; the new_refseqs.fna produced by one iteration becomes
        the reference collection for the next. After the last iteration the
        per-iteration OTU tables are merged, a master rep set is built, and
        taxonomy assignment and/or alignment+tree building are optionally run
        on the merged results.

        BUGFIX (this revision): the logger is now only closed when it was
        created here. Previously logger.close() was called unconditionally at
        the end, which also closed loggers owned by the caller even though
        close_logger_on_success had been computed (and was otherwise unused).
    """
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback)

        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # only close the logger we created; a caller-supplied logger is the
    # caller's responsibility (previously closed unconditionally - see docstring)
    if close_logger_on_success:
        logger.close()
def iterative_pick_subsampled_open_referenence_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_tax_align_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_referenence_otus workflow on multiple inputs
         and handle processing of the results.

        Older variant of iterative_pick_subsampled_open_reference_otus (note
        the 'referenence' typo preserved for backward compatibility): each
        input is processed in its own numbered subdirectory of output_dir and
        each iteration's new_refseqs.fna becomes the next iteration's
        reference collection. Afterwards the per-iteration OTU tables are
        merged and (optionally) taxonomy + alignment/tree are built via
        tax_align_tree.

        BUGFIXES (this revision):
        - the "Final output file exists" message now reports
          final_otu_table_fp (the file whose existence was actually checked)
          instead of otu_table_fp;
        - the logger is only closed when it was created here; previously
          logger.close() ran unconditionally, closing caller-owned loggers
          (close_logger_on_success was computed but never used).
    """
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_referenence_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_tax_align_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                status_update_callback=status_update_callback)

        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        final_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        if exists(final_otu_table_fp) and getsize(final_otu_table_fp) > 0:
            # report the file that was actually checked (was otu_table_fp)
            logger.write("Final output file exists (%s). Will not rebuild."
                         % final_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp, final_otu_table_fp],
                         error_on_missing=False)

            taxonomy_fp, pynast_failures_fp = tax_align_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
                (otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_taxa_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(otu_table_w_tax_fp, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(final_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # only close the logger we created here (see docstring)
    if close_logger_on_success:
        logger.close()
def pick_subsampled_open_referenence_otus(input_fp,
                                          refseqs_fp,
                                          output_dir,
                                          percent_subsample,
                                          new_ref_set_id,
                                          command_handler,
                                          params,
                                          qiime_config,
                                          prefilter_refseqs_fp=None,
                                          run_tax_align_tree=True,
                                          prefilter_percent_id=0.60,
                                          min_otu_size=2,
                                          step1_otu_map_fp=None,
                                          step1_failures_fasta_fp=None,
                                          parallel=False,
                                          suppress_step4=False,
                                          logger=None,
                                          status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
            representative set from step 4 as the reference set.

        Outputs are written under output_dir (step1_otus/ .. step4_otus/,
        final_otu_map.txt, rep_set.fna, new_refseqs.fna, otu_table_mc*.biom).
        Most steps are queued as shell commands on `commands` and executed by
        command_handler, so statement order here is load-bearing.
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    # NOTE(review): python_exe_fp and script_dir are assigned but never read
    # below - presumably leftovers from an older command-building scheme.
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(logger,
                   [input_fp,
                    refseqs_fp,
                    step1_otu_map_fp,
                    step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        # caller supplied step1 results (e.g. from a previous run) - reuse them
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        # optional pre-filter: discard reads that don't loosely match the
        # (possibly smaller) prefilter reference collection before step 1
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_otu_map_fp = \
                '%s/%s_otus.txt' % (prefilter_dir, input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # from here on the prefiltered fasta is the working input
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        # (step1 outputs are needed on disk before the later steps run)
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    # Step 3: closed-reference pick of ALL step1 failures against the
    # de novo rep set from step 2
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                    step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp,
                                    parallel,
                                    params,
                                    logger)
    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        # Step 4: de novo clustering of whatever still failed in step 3
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp, step3_failures_list_fp, step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir, min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp, min_otu_size)

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
            repset_fasta_fp=final_repset_fp,
            output_dir=output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            parallel=parallel,
            logger=logger,
            status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
            (otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table", add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        # (note: otu_table_fp is rebound here to the final output path)
        otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        filtered_otu_table = filter_otus_from_otu_table(
            parse_biom_table(open(otu_table_w_tax_fp, 'U')),
            get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
            0, inf, 0, inf, negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp, 'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    # final (empty) command batch lets the handler close the logger when we own it
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def main():
    """Compute 'core' OTUs over a range of sample-fraction thresholds.

    For each fraction in [min_fraction_for_core, max_fraction_for_core]
    (num_fraction_for_core_steps values), writes a text listing of the core
    OTUs and a biom table of the core OTUs, then plots core-set size against
    fraction into core_otu_size.pdf.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_dir = opts.output_dir

    if opts.num_fraction_for_core_steps < 2:
        option_parser.error("Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    # evenly spaced fraction thresholds, inclusive of both endpoints
    fractions_for_core = linspace(opts.min_fraction_for_core,
                                  opts.max_fraction_for_core,
                                  opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        # restrict the computation to samples matching the metadata pattern
        sample_ids = sample_ids_from_metadata_description(open(mapping_fp, 'U'),
                                                          valid_states)
        if len(sample_ids) < 1:
            option_parser.error(\
                "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %\
                valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    input_table = parse_biom_table(open(input_fp, 'U'))

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(output_dir, 'core_otus_%s.txt' % fraction_for_core_str)
        output_table_fp = join(output_dir, 'core_table_%s.biom' % fraction_for_core_str)
        output_f = open(output_fp, 'w')

        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            # no OTU reaches this threshold: record an empty result and move on
            # (note no core_table_*.biom is written for this fraction)
            output_f.write("# No OTUs present in %s %% of samples." % fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue

        # write some header information to file
        if sample_ids == None:
            output_f.write("# Core OTUs across %s %% of samples.\n" % fraction_for_core_str)
        else:
            output_f.write(\
                "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %\
                (fraction_for_core_str, valid_states, ' '.join(sample_ids)))

        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iterObservations():
            output_f.write('%s\t%s\n' % (id_, md[otu_md]))
            otu_count += 1
        output_f.close()

        # write the core biom table
        output_table_f = open(output_table_fp, 'w')
        output_table_f.write(format_biom_table(core_table))
        output_table_f.close()

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    # summary plot: number of core OTUs vs. required sample fraction
    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel("Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
def setUp(self):
    """Build the mapping/OTU-table/network fixtures shared by the
    make_otu_network tests: a 5-sample mapping file, a 10-OTU x 5-sample
    table written to a temp .biom file, and the expected edge/node tables
    (full and category-collapsed variants).

    NOTE(review): whitespace inside the fixture string literals was
    flattened during extraction; fields are shown here with single spaces,
    but the real test data is presumably tab-delimited - confirm against the
    original test file before relying on exact bytes.
    """
    self.qiime_config = load_qiime_config()
    # fall back to /tmp/ when temp_dir is unset in the qiime config
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    # minimal mapping file: 5 samples with Day and time metadata
    self.map_file = """#SampleID Day time Description
#This is some comment about the study
1 090809 1200 some description of sample1
2 090809 1800 some description of sample2
3 090909 1200 some description of sample3
4 090909 1800 some description of sample4
5 091009 1200 some description of sample5"""
    # expected (category, value) pairs keyed by sample id
    self.cat_by_sample = {"1": [("Day", "090809"), ("time", "1200")],
                          "2": [("Day", "090809"), ("time", "1800")],
                          "3": [("Day", "090909"), ("time", "1200")],
                          "4": [("Day", "090909"), ("time", "1800")],
                          "5": [("Day", "091009"), ("time", "1200")]}
    # inverse mapping: sample ids keyed by (category, value)
    self.sample_by_cat = {("Day", "090809"): ["1", "2"],
                          ("Day", "090909"): ["3", "4"],
                          ("Day", "091009"): ["5"],
                          ("time", "1200"): ["1", "3", "5"],
                          ("time", "1800"): ["2", "4"]}
    self.num_cats = 2
    # per-sample metadata string plus a mutable counter slot
    self.meta_dict = {"1": ["090809 1200", 0],
                      "2": ["090809 1800", 0],
                      "3": ["090909 1200", 0],
                      "4": ["090909 1800", 0],
                      "5": ["091009 1200", 0]}
    # column headers for the edge and node output tables
    self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
    self.node_labels = ["node_name", "node_disp_name", "ntype", "degree",
                        "weighted_degree", "consensus_lin", "Day", "time"]
    self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]
    # 10 OTUs x 5 samples abundance matrix
    self.otu_table_vals = array([[0, 1, 0, 0, 6],
                                 [2, 0, 0, 0, 0],
                                 [0, 0, 3, 1, 0],
                                 [0, 0, 0, 0, 5],
                                 [0, 4, 2, 0, 0],
                                 [3, 6, 0, 0, 0],
                                 [0, 0, 4, 2, 0],
                                 [0, 0, 0, 0, 3],
                                 [2, 0, 0, 5, 0],
                                 [0, 2, 0, 4, 0]])

    # serialize the table (with per-OTU taxonomy metadata) to biom format
    otu_table_str = format_biom_table(table_factory(self.otu_table_vals,
        ['1', '2', '3', '4', '5'],
        ['otu_1', 'otu_2', 'otu_3', 'otu_4', 'otu_5',
         'otu_6', 'otu_7', 'otu_8', 'otu_9', 'otu_10'],
        [None, None, None, None, None],
        [{"taxonomy": ["Bacteria", "Actinobacteria", "Coriobacteridae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Bacteroidaceae"]},
         {"taxonomy": ["Bacteria", "Firmicutes", "Clostridia",
                       "Clostridiales"]},
         {"taxonomy": ["Bacteria", "Spirochaetes", "Spirochaetales",
                       "Spirochaetaceae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Rikenellaceae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Dysgonomonaceae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Odoribacteriaceae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Dysgonomonaceae", "otu_425"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Dysgonomonaceae", "otu_425"]},
         {"taxonomy": ["Bacteria", "Firmicutes", "Mollicutes",
                       "Clostridium_aff_innocuum_CM970"]}]))
    # write the table to a temp .biom file (registered for cleanup below)
    _, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                   prefix='test_make_otu_network_otu_table',
                                   suffix='.biom')
    close(_)
    open(self.otu_table_fp, 'w').write(otu_table_str)

    # legacy (pre-biom) text representation of the same table
    self.otu_sample_file = """#Full OTU Counts
#OTU ID 1 2 3 4 5 Consensus Lineage
otu_1 0 1 0 0 6 Bacteria; Actinobacteria; Coriobacteridae
otu_2 2 0 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae
otu_3 0 0 3 1 0 Bacteria; Firmicutes; Clostridia; Clostridiales
otu_4 0 0 0 0 5 Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae
otu_5 0 4 2 0 0 Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae
otu_6 3 6 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae
otu_7 0 0 4 2 0 Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae
otu_8 0 0 0 0 3 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_9 2 0 0 5 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_10 0 2 0 4 0 Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""

    # samples connected to each sample through shared OTUs
    self.con_by_sample = {'1': set(['2', '4']),
                          '2': set(['5', '3', '1', '4']),
                          '3': set(['4', '2']),
                          '4': set(['3', '1', '2']),
                          '5': set(['2'])}

    # expected sample->OTU edge rows (one per nonzero table entry)
    self.edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 otu_2 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 otu_4 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "5 otu_8 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 091009 1200",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"]

    # expected node rows: user (sample) nodes then otu nodes
    self.node_file_str = [
        "1 1 user_node 3 7.0 other 090809 1200",
        "2 2 user_node 4 13.0 other 090809 1800",
        "3 3 user_node 3 9.0 other 090909 1200",
        "4 4 user_node 4 12.0 other 090909 1800",
        "5 5 user_node 3 14.0 other 091009 1200",
        "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
        "otu_2 otu_node 1 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae otu otu",
        "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
        "otu_4 otu_node 1 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae otu otu",
        "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
        "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
        "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
        "otu_8 otu_node 1 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"]

    # 'reduced' variants: single-sample OTUs collapsed into @<sample> nodes
    self.red_edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 @1 1.0 missed 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 @5 1.0 missed 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"]
    self.red_node_file_str = [
        "1 1 user_node 3 7.0 other 090809 1200",
        "2 2 user_node 4 13.0 other 090809 1800",
        "3 3 user_node 3 9.0 other 090909 1200",
        "4 4 user_node 4 12.0 other 090909 1800",
        "5 5 user_node 3 14.0 other 091009 1200",
        "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
        "@1 otu_collapsed 1 1.0 other otu otu",
        "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
        "@5 otu_collapsed 2 2.0 other otu otu",
        "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
        "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
        "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
        "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"]

    # expected degree-count histograms and category/connection summaries
    self.otu_dc = {1: 3, 2: 7}
    self.sample_dc = {3: 3, 4: 2}
    self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}
    self.num_con_cat = {"Day": 2, "time": 1}
    self.num_con = 6
    self.num_cat = {"Day": 2, "time": 4}
    self.num_cat_less = {"Day": 1, "time": 3}
    # files/dirs removed in tearDown
    self._paths_to_clean_up = [self.otu_table_fp]
    self._dir_to_clean_up = ''
                   sample_metadata=None, constructor=SparseOTUTable):
    # NOTE(review): the opening `def ...(` of this signature lies outside this
    # chunk; from the error message below this is presumably
    # make_otu_table(otu_map_f, otu_to_taxonomy=..., delim=..., table_id=...,
    # sample_metadata=None, constructor=SparseOTUTable) - confirm upstream.
    # Parses an OTU map into (data, sample_ids, otu_ids) and returns the
    # resulting table serialized in biom format.
    data, sample_ids, otu_ids = parse_otu_map(otu_map_f, delim)

    # Attach per-OTU taxonomy metadata when a lookup dict is provided; OTUs
    # missing from the lookup get the placeholder taxonomy ["None"].
    if otu_to_taxonomy != None:
        otu_metadata = []
        for o in otu_ids:
            try:
                otu_metadata.append({'taxonomy': otu_to_taxonomy[o].split(';')})
            except KeyError:
                otu_metadata.append({'taxonomy': ["None"]})
    else:
        otu_metadata = None

    # sample metadata is accepted in the signature but not yet implemented
    if sample_metadata != None:
        raise NotImplementedError,\
            "Passing of sample metadata to make_otu_table is not currently supported."
    try:
        otu_table = table_factory(data, sample_ids, otu_ids,
                                  sample_metadata=sample_metadata,
                                  observation_metadata=otu_metadata,
                                  table_id=table_id,
                                  constructor=constructor,
                                  dtype=int)
    except ValueError, e:
        # an empty OTU map is the usual cause of a table_factory failure
        raise ValueError,\
            ("Couldn't create OTU table. Is your OTU map empty?"
             " Original error message: %s" % (str(e)))
    return format_biom_table(otu_table)
def main():
    """Filter OTUs from a BIOM table by abundance, prevalence, or an ID list.

    Reads the input BIOM table, removes OTUs matching the requested
    count/sample thresholds and/or an explicit exclusion list, and writes
    the filtered table to the output path.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp
    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    # min_count_fraction is a proportion of the table's total count,
    # so it must be a valid fraction and is mutually exclusive with min_count
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")
    min_samples = opts.min_samples
    max_samples = opts.max_samples
    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    # At least one filtering criterion must differ from its no-op default.
    if not (min_count != 0 or
            min_count_fraction != 0 or
            not isinf(max_count) or
            otu_ids_to_exclude_fp is not None or
            min_samples != 0 or
            not isinf(max_samples)):
        option_parser.error(
            "No filtering requested. Must provide either "
            "min counts, max counts, min samples, max samples, min_count_fraction, "
            "or exclude_fp (or some combination of those).")

    otu_table = parse_biom_table(open(input_fp, 'U'))

    if min_count_fraction > 0:
        # Convert the fractional threshold into an absolute count.
        min_count = otu_table.sum() * min_count_fraction

    otu_ids_to_keep = set(otu_table.ObservationIds)

    if otu_ids_to_exclude_fp:
        # IDs to exclude can come from a fasta file (use the record ids)
        # or from a tab-delimited text file (use the first field).
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([
                id_.strip().split()[0]
                for id_, seq in MinimalFastaParser(
                    open(otu_ids_to_exclude_fp, 'U'))])
        else:
            otu_ids_to_exclude = set([
                l.strip().split('\t')[0]
                for l in open(otu_ids_to_exclude_fp, 'U')])
        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table,
                                                    otu_ids_to_keep,
                                                    min_count,
                                                    max_count,
                                                    min_samples,
                                                    max_samples,
                                                    negate_ids_to_exclude)
    # Open the output only once filtering succeeded, so an earlier failure
    # (e.g. a bad exclude file) cannot truncate an existing output file.
    output_f = open(output_fp, 'w')
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()
def main():
    """Run a category-significance test on an OTU table or directory of tables.

    A single-file input runs the requested test (including the longitudinal
    tests, which first convert the table per individual) and writes one
    results file. A directory input aggregates results across all contained
    .biom tables; the longitudinal tests are not supported for directories.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    verbose = opts.verbose
    output_fp = opts.output_fp
    category_mapping_fp = opts.category_mapping_fp
    category_mapping = open(category_mapping_fp, 'U')
    category_mapping = parse_mapping_file(category_mapping)
    individual_column = opts.individual_column
    reference_sample_column = opts.reference_sample_column
    conv_output_fp = opts.converted_otu_table_output_fp
    relative_abundance = opts.relative_abundance
    filter = opts.filter
    test = opts.test
    category = opts.category
    # Every test except paired_T requires a mapping-file category.
    # NOTE(review): this raises ValueError where sibling scripts use
    # option_parser.error for the same condition -- inconsistent, but
    # left unchanged here.
    if not category:
        if test != 'paired_T':
            raise ValueError('a category in the category mapping file must be' +\
                ' specified with the -c option for this test')
    threshold = opts.threshold
    # threshold arrives as a string; the literal 'None' disables it
    if threshold and threshold != 'None':
        threshold = float(threshold)
    otu_include_fp = opts.otu_include_fp
    # optional file restricting which OTUs are tested
    if otu_include_fp and otu_include_fp != 'None':
        otu_include = open(otu_include_fp)
    else:
        otu_include = None
    otu_table_fp = opts.otu_table_fp
    if not isdir(opts.otu_table_fp):
        # if single file, process normally
        otu_table = open(otu_table_fp, 'U')
        try:
            otu_table = parse_biom_table(otu_table)
        except AttributeError:
            # fall back for a parser variant that expects the table contents
            otu_table = parse_biom_table_str(otu_table)
        #synchronize the mapping file with the otu table
        category_mapping, removed_samples = sync_mapping_to_otu_table(otu_table, \
            category_mapping)
        if removed_samples:
            print "Warning, the following samples were in the category mapping file " +\
                "but not the OTU table and will be ignored: "
            for i in removed_samples:
                print i + '\n'
        if test == 'longitudinal_correlation' or test == 'paired_T':
            # longitudinal tests operate on a per-individual converted table
            converted_otu_table = longitudinal_otu_table_conversion_wrapper(
                otu_table, category_mapping, individual_column,
                reference_sample_column)
            if conv_output_fp:
                # optionally persist the converted table for inspection
                of = open(conv_output_fp, 'w')
                of.write(format_biom_table(converted_otu_table))
                of.close()
            if test == 'longitudinal_correlation':
                #set the otu_include list to all of the OTUs, this effectively
                #deactivates the filter for correlation, because the filtered
                #OTU_list is #rewritten with the otu_include list in the
                #test_wrapper
                if not otu_include:
                    otu_include = set(otu_table.ObservationIds)
                output = test_wrapper('correlation', converted_otu_table, \
                    category_mapping, category, threshold, filter, otu_include, \
                    999999999.0, True)
            elif test == 'paired_T':
                output = test_wrapper('paired_T', converted_otu_table, \
                    category_mapping, category, threshold, \
                    filter, otu_include, 999999999.0, True, \
                    individual_column, reference_sample_column)
        else:
            # all non-longitudinal tests run directly on the parsed table
            output = test_wrapper(test, otu_table, category_mapping, \
                category, threshold, filter, otu_include, \
                otu_table_relative_abundance=relative_abundance)
    else:
        if test != 'longitudinal_correlation' and test != 'paired_T':
            otu_table_paths = glob('%s/*biom' % otu_table_fp)
            # if directory, get aggregated results
            parsed_otu_tables = []
            for path in otu_table_paths:
                ot = open(path, 'U')
                ot = parse_biom_table(ot)
                parsed_otu_tables.append(ot)
            #synchronize the mapping file with the otu table
            #checks with just the first OTU table and assumes that all otu tables
            #have the same collection of samples
            category_mapping, removed_samples = sync_mapping_to_otu_table(parsed_otu_tables[0], \
                category_mapping)
            if removed_samples:
                print "Warning, the following samples were in the category mapping file " +\
                    "but not the OTU table and will be ignored: "
                for i in removed_samples:
                    print i + '\n'
            output = test_wrapper_multiple(test, parsed_otu_tables, \
                category_mapping, category, threshold, filter, otu_include,\
                otu_table_relative_abundance=relative_abundance)
        else:
            raise ValueError(
                "the longitudinal_correlation and paired_T options cannot be run on a directory")
    # write the newline-joined results produced by the chosen test wrapper
    of = open(output_fp, 'w')
    of.write('\n'.join(output))
    of.close()
def main():
    """Filter OTUs from a BIOM table by abundance, prevalence, or an ID list.

    Reads the input BIOM table, drops OTUs failing the requested
    count/sample thresholds and/or listed in an exclusion file, and writes
    the filtered table to the output path.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp
    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    # min_count_fraction is a proportion of the table total and is
    # mutually exclusive with an absolute min_count
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")
    min_samples = opts.min_samples
    max_samples = opts.max_samples
    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    # reject a no-op invocation: some criterion must differ from its default
    if not (min_count != 0 or min_count_fraction != 0 or
            not isinf(max_count) or otu_ids_to_exclude_fp is not None or
            min_samples != 0 or not isinf(max_samples)):
        option_parser.error("No filtering requested. Must provide either "
                            "min counts, max counts, min samples, max samples, min_count_fraction, "
                            "or exclude_fp (or some combination of those).")

    otu_table = parse_biom_table(open(input_fp, 'U'))

    if min_count_fraction > 0:
        # convert the fractional threshold into an absolute count
        min_count = otu_table.sum() * min_count_fraction

    otu_ids_to_keep = set(otu_table.ObservationIds)

    if otu_ids_to_exclude_fp:
        # exclusion list: fasta/fna files yield record ids; any other file
        # is treated as tab-delimited with the id in the first field
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([id_.strip().split()[0]
                                      for id_, seq in MinimalFastaParser(
                                          open(otu_ids_to_exclude_fp, 'U'))])
        else:
            otu_ids_to_exclude = set([l.strip().split('\t')[0]
                                      for l in open(otu_ids_to_exclude_fp,
                                                    'U')])
        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table,
                                                    otu_ids_to_keep,
                                                    min_count,
                                                    max_count,
                                                    min_samples,
                                                    max_samples,
                                                    negate_ids_to_exclude)
    # open the output only after filtering succeeds so an earlier failure
    # cannot leave behind a truncated/empty output file
    output_f = open(output_fp, 'w')
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()
def setUp(self):
    """Build the shared fixtures for the OTU-network tests.

    Writes a temporary BIOM-format OTU table to disk and stores the
    mapping data, expected node/edge file contents, and expected summary
    statistics that the individual tests compare against.
    """
    self.qiime_config = load_qiime_config()
    # fall back to /tmp/ when the qiime config defines no temp_dir
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    # NOTE(review): whitespace inside the string literals below (tabs vs.
    # spaces, embedded line breaks) was flattened during extraction; the
    # literals are reproduced as shown -- verify against the original file
    # before relying on exact string comparisons.
    self.map_file = """#SampleID Day time Description #This is some comment about the study 1 090809 1200 some description of sample1 2 090809 1800 some description of sample2 3 090909 1200 some description of sample3 4 090909 1800 some description of sample4 5 091009 1200 some description of sample5"""
    # category (name, value) pairs keyed by sample id
    self.cat_by_sample = {"1": [("Day", "090809"), ("time", "1200")],
                          "2": [("Day", "090809"), ("time", "1800")],
                          "3": [("Day", "090909"), ("time", "1200")],
                          "4": [("Day", "090909"), ("time", "1800")],
                          "5": [("Day", "091009"), ("time", "1200")]}
    # inverse mapping: sample ids keyed by (category, value)
    self.sample_by_cat = {("Day", "090809"): ["1", "2"],
                          ("Day", "090909"): ["3", "4"],
                          ("Day", "091009"): ["5"],
                          ("time", "1200"): ["1", "3", "5"],
                          ("time", "1800"): ["2", "4"]}
    self.num_cats = 2
    # per-sample metadata string plus a counter slot
    self.meta_dict = {"1": ["090809 1200", 0],
                      "2": ["090809 1800", 0],
                      "3": ["090909 1200", 0],
                      "4": ["090909 1800", 0],
                      "5": ["091009 1200", 0]}
    # column headers for the edge and node files
    self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
    self.node_labels = ["node_name", "node_disp_name", "ntype", "degree",
                        "weighted_degree", "consensus_lin", "Day", "time"]
    self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]
    # 10 OTUs (rows) x 5 samples (columns)
    self.otu_table_vals = array([[0, 1, 0, 0, 6],
                                 [2, 0, 0, 0, 0],
                                 [0, 0, 3, 1, 0],
                                 [0, 0, 0, 0, 5],
                                 [0, 4, 2, 0, 0],
                                 [3, 6, 0, 0, 0],
                                 [0, 0, 4, 2, 0],
                                 [0, 0, 0, 0, 3],
                                 [2, 0, 0, 5, 0],
                                 [0, 2, 0, 4, 0]])
    otu_table_str = format_biom_table(table_factory(
        self.otu_table_vals,
        ['1', '2', '3', '4', '5'],
        ['otu_1', 'otu_2', 'otu_3', 'otu_4', 'otu_5',
         'otu_6', 'otu_7', 'otu_8', 'otu_9', 'otu_10'],
        [None, None, None, None, None],  # no per-sample metadata
        [{"taxonomy": ["Bacteria", "Actinobacteria", "Coriobacteridae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Bacteroidaceae"]},
         {"taxonomy": ["Bacteria", "Firmicutes", "Clostridia",
                       "Clostridiales"]},
         {"taxonomy": ["Bacteria", "Spirochaetes", "Spirochaetales",
                       "Spirochaetaceae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Rikenellaceae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Dysgonomonaceae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Odoribacteriaceae"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Dysgonomonaceae", "otu_425"]},
         {"taxonomy": ["Bacteria", "Bacteroidetes", "Bacteroidales",
                       "Dysgonomonaceae", "otu_425"]},
         {"taxonomy": ["Bacteria", "Firmicutes", "Mollicutes",
                       "Clostridium_aff_innocuum_CM970"]}]))
    # write the table to a temp .biom file for tests that read from disk
    self.otu_table_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir,
        prefix='test_make_otu_network_otu_table',
        suffix='.biom')
    open(self.otu_table_fp, 'w').write(otu_table_str)
    # classic (pre-BIOM) text representation of the same table
    self.otu_sample_file = """#Full OTU Counts #OTU ID 1 2 3 4 5 Consensus Lineage otu_1 0 1 0 0 6 Bacteria; Actinobacteria; Coriobacteridae otu_2 2 0 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae otu_3 0 0 3 1 0 Bacteria; Firmicutes; Clostridia; Clostridiales otu_4 0 0 0 0 5 Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae otu_5 0 4 2 0 0 Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae otu_6 3 6 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae otu_7 0 0 4 2 0 Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae otu_8 0 0 0 0 3 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425 otu_9 2 0 0 5 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425 otu_10 0 2 0 4 0 Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""
    # samples that share at least one OTU with each sample
    self.con_by_sample = {'1': set(['2', '4']),
                          '2': set(['5', '3', '1', '4']),
                          '3': set(['4', '2']),
                          '4': set(['3', '1', '2']),
                          '5': set(['2'])}
    # expected edge lines: sample, otu, weight, lineage, Day, time
    self.edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 otu_2 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 otu_4 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "5 otu_8 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 091009 1200",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"]
    # expected node lines: name, display name, type, degree,
    # weighted degree, lineage, Day, time
    self.node_file_str = ["1 1 user_node 3 7.0 other 090809 1200",
                          "2 2 user_node 4 13.0 other 090809 1800",
                          "3 3 user_node 3 9.0 other 090909 1200",
                          "4 4 user_node 4 12.0 other 090909 1800",
                          "5 5 user_node 3 14.0 other 091009 1200",
                          "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
                          "otu_2 otu_node 1 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae otu otu",
                          "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
                          "otu_4 otu_node 1 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae otu otu",
                          "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
                          "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
                          "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
                          "otu_8 otu_node 1 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
                          "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
                          "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"]
    # "reduced" variants: OTUs seen in a single sample are collapsed
    # into @<sample> placeholder nodes
    self.red_edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 @1 1.0 missed 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 @5 1.0 missed 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"]
    self.red_node_file_str = ["1 1 user_node 3 7.0 other 090809 1200",
                              "2 2 user_node 4 13.0 other 090809 1800",
                              "3 3 user_node 3 9.0 other 090909 1200",
                              "4 4 user_node 4 12.0 other 090909 1800",
                              "5 5 user_node 3 14.0 other 091009 1200",
                              "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
                              "@1 otu_collapsed 1 1.0 other otu otu",
                              "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
                              "@5 otu_collapsed 2 2.0 other otu otu",
                              "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
                              "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
                              "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
                              "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
                              "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"]
    # expected degree-count histograms and connection summaries
    self.otu_dc = {1: 3, 2: 7}
    self.sample_dc = {3: 3, 4: 2}
    self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}
    self.num_con_cat = {"Day": 2, "time": 1}
    self.num_con = 6
    self.num_cat = {"Day": 2, "time": 4}
    self.num_cat_less = {"Day": 1, "time": 3}
    # files/directories removed by tearDown
    self._paths_to_clean_up = [self.otu_table_fp]
    self._dir_to_clean_up = ''
def test_format_biom_table(self):
    """format_biom_table embeds the QIIME version as "generated-by" info."""
    expected_tag = "QIIME " + get_qiime_library_version()
    formatted = format_biom_table(self.biom1)
    self.assertTrue(expected_tag in formatted)
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) otu_table_fp = opts.otu_table_fp otu_include_fp = opts.otu_include_fp output_fp = opts.output_fp verbose = opts.verbose category_mapping_fp = opts.category_mapping_fp individual_column = opts.individual_column reference_sample_column = opts.reference_sample_column conv_output_fp = opts.converted_otu_table_output_fp relative_abundance = opts.relative_abundance filter = opts.filter test = opts.test category = opts.category threshold = opts.threshold collate_results = opts.collate_results # check mapping file category_mapping = open(category_mapping_fp, 'U') category_mapping = parse_mapping_file(category_mapping) if not category: if test != 'paired_T': option_parser.error( 'a category in the category mapping file must be' ' specified with the -c option for this test') # set up threshold value for filtering, if any if threshold and threshold != 'None': threshold = float(threshold) # if specifying a list of OTUs to look at specifically if otu_include_fp and otu_include_fp != 'None': otu_include = open(otu_include_fp) else: otu_include = None # if only passing in a single OTU table if isdir(otu_table_fp) is False: # raise error if collate option is being passed to single table if collate_results is True: option_parser.error( 'Cannot collate the results of only one table.' 
' Please rerun the command without passing the -w option') else: #open and parse the biom table fp otu_table = parse_biom_table(open(otu_table_fp, 'U')) # run the statistical test output = test_wrapper( test, otu_table, category_mapping, category, threshold, filter, otu_include, otu_table_relative_abundance=relative_abundance) # write output output_file = open(output_fp, 'w') output_file.write('\n'.join(output)) output_file.close() # if the user has passed in a directory if isdir(otu_table_fp) is True: # negate_collate to return an results file on a per table basis if collate_results is False: # build list of otu tables otu_table_paths = glob('%s/*biom' % otu_table_fp) # if output dir doesn't exist, then make it if exists(output_fp): pass else: makedirs(output_fp) for otu_table_fp in otu_table_paths: #open and parse the biom table fp otu_table = parse_biom_table(open(otu_table_fp, 'U')) #synchronize the mapping file with the otu table category_mapping, removed_samples = \ sync_mapping_to_otu_table(otu_table, category_mapping) if removed_samples: print "Warning, the following samples were in the category mapping file " +\ "but not the OTU table and will be ignored: " for i in removed_samples: print i + '\n' # create naming convention for output file # will look like: otu_table_ANOVA_Treatment.txt output_basename = basename(otu_table_fp) output_basename = output_basename.replace(".biom", "") output_fp_sweep = "%s_%s_%s.txt" % \ (output_basename,test,category) # if the convert_otu_table_fp is passed, save the converted table if test == 'longitudinal_correlation' or test == 'paired_T': converted_otu_table = longitudinal_otu_table_conversion_wrapper( table, category_mapping, individual_column, reference_sample_column) if conv_output_fp: of = open(conv_output_fp, 'w') of.write(format_biom_table(converted_otu_table)) of.close() if test == 'longitudinal_correlation': #set the otu_include list to all of the OTUs, this effectively #deactivates the filter for correlation, 
because the filtered OTU_list is #rewritten with the otu_include list in the test_wrapper if not otu_include: otu_include = set(otu_table.ObservationIds) output = test_wrapper('correlation', converted_otu_table, category_mapping, category, threshold, filter, otu_include, 999999999.0, True) elif test == 'paired_T': output = test_wrapper('paired_T', converted_otu_table, category_mapping, category, threshold, filter, otu_include, 999999999.0, True, individual_column, reference_sample_column) # run test single input table from the directory else: output = test_wrapper( test, otu_table, category_mapping, category, threshold, filter, otu_include, otu_table_relative_abundance=relative_abundance) # write output file with new naming convention output_file = open(join(output_fp, output_fp_sweep), 'w') output_file.write('\n'.join(output)) output_file.close() # Use when the input dir contains rarefied OTU tables, and you want # to collate the p-values & results into one results file if collate_results is True: if test != 'longitudinal_correlation' and test != 'paired_T': # get biom tables otu_table_paths = glob('%s/*biom' % otu_table_fp) #get aggregated tables parsed_otu_tables = [] for otu_table_fp in otu_table_paths: otu_table = open(otu_table_fp, 'U') otu_table = parse_biom_table(otu_table) parsed_otu_tables.append(otu_table) #synchronize the mapping file with the otu table #checks with just the first OTU table and assumes that all otu tables #have the same collection of samples category_mapping, removed_samples = \ sync_mapping_to_otu_table(parsed_otu_tables[0],category_mapping) if removed_samples: print "Warning, the following samples were in the category mapping file " +\ "but not the OTU table and will be ignored: " for i in removed_samples: print i + '\n' # get output from statistical test output = test_wrapper_multiple( test, parsed_otu_tables, category_mapping, category, threshold, filter, otu_include, otu_table_relative_abundance=relative_abundance) #write out 
aggregated results output_file = open(output_fp, 'w') output_file.write('\n'.join(output)) output_file.close() else: option_parser.error( "You cannot collate the results obtained from " "using the longitudinal_correlation and paired_T options.")