def load_data_table(
    data_table_fp,
    load_data_table_in_biom=False,
    suppress_subset_loading=False,
    ids_to_load=None,
    transpose=False,
    verbose=False
):
    """Load a data table, detecting gzipped files and subset loading.

    data_table_fp -- path to the input data table

    load_data_table_in_biom -- if True, load the data table as a BIOM table
    rather than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than
    just ids_of_interest (only consulted when load_data_table_in_biom is True)

    ids_to_load -- a list of OTU ids for which data should be loaded

    transpose -- forwarded to convert_precalc_to_biom when the table is
    loaded as tab-delimited

    verbose -- if True, print progress messages

    gzipped files are detected based on the '.gz' suffix.

    Returns the loaded table.  Raises IOError if data_table_fp does not exist.
    """
    if not path.exists(data_table_fp):
        raise IOError("File " + data_table_fp + " doesn't exist! Did you forget to download it?")

    # NOTE: single-argument print(...) is used throughout so the output is the
    # same whether or not print is a statement.
    if load_data_table_in_biom and suppress_subset_loading:
        # load_table reads from the path itself, so no file handle is opened
        # for this branch.
        if verbose:
            print("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
        genome_table = load_table(data_table_fp)
    else:
        ext = path.splitext(data_table_fp)[1]
        if ext == ".gz":
            genome_table_fh = gzip.open(data_table_fp, "rb")
        else:
            genome_table_fh = open(data_table_fp, "U")

        # Always release the handle, even if parsing fails.
        # NOTE(review): assumes convert_precalc_to_biom consumes the handle
        # before returning -- confirm.
        try:
            if load_data_table_in_biom:
                # Now we want to use the OTU table information
                # to load only rows in the count table corresponding
                # to relevant OTUs
                if verbose:
                    print("Loading traits for %i organisms from the trait table" % len(ids_to_load))
                genome_table = load_subset_from_biom_str(
                    genome_table_fh.read(), ids_to_load, axis="samples"
                )
            else:
                genome_table = convert_precalc_to_biom(
                    genome_table_fh, ids_to_load, transpose=transpose
                )
        finally:
            genome_table_fh.close()

    if verbose:
        print("Done loading trait table containing %i functions for %i organisms." % (
            len(genome_table.ids(axis="observation")),
            len(genome_table.ids()),
        ))

    return genome_table
def load_data_table(data_table_fp,
                    load_data_table_in_biom=False, suppress_subset_loading=False, ids_to_load=None,
                    transpose=False, verbose=False):
    """Load a data table, detecting gzipped files and subset loading.

    data_table_fp -- path to the input data table

    load_data_table_in_biom -- if True, load the data table as a BIOM table
    rather than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than
    just ids_of_interest (only consulted when load_data_table_in_biom is True)

    ids_to_load -- a list of OTU ids for which data should be loaded

    transpose -- forwarded to convert_precalc_to_biom when the table is
    loaded as tab-delimited

    verbose -- if True, print progress messages

    gzipped files are detected based on the '.gz' suffix.

    Returns the loaded table.  Raises IOError if data_table_fp does not exist.
    """
    if not path.exists(data_table_fp):
        raise IOError("File " + data_table_fp + " doesn't exist! Did you forget to download it?")

    # The full-BIOM branch hands the path to load_table, so a file handle is
    # only needed (and only opened) for the other two loading strategies.
    load_full_biom = load_data_table_in_biom and suppress_subset_loading

    genome_table_fh = None
    if not load_full_biom:
        ext = path.splitext(data_table_fp)[1]
        if ext == '.gz':
            genome_table_fh = gzip.open(data_table_fp, 'rb')
        else:
            genome_table_fh = open(data_table_fp, 'U')

    # Single-argument print(...) keeps the output identical whether or not
    # print is a statement.
    try:
        if load_full_biom:
            if verbose:
                print("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
            genome_table = load_table(data_table_fp)
        elif load_data_table_in_biom:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs
            if verbose:
                print("Loading traits for %i organisms from the trait table" % len(ids_to_load))
            genome_table = load_subset_from_biom_str(genome_table_fh.read(),
                                                     ids_to_load, axis='samples')
        else:
            # NOTE(review): assumes convert_precalc_to_biom finishes reading
            # the handle before it returns, so closing below is safe -- confirm.
            genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load,
                                                   transpose=transpose)
    finally:
        # Release the handle even when parsing raises.
        if genome_table_fh is not None:
            genome_table_fh.close()

    if verbose:
        print("Done loading trait table containing %i functions for %i organisms." % (
            len(genome_table.ids(axis='observation')), len(genome_table.ids())))

    return genome_table
def load_data_table(data_table_fp,
                    load_data_table_in_biom=False, suppress_subset_loading=False, ids_to_load=None,
                    transpose=False, verbose=False):
    """Load a data table, detecting gzipped files and subset loading.

    data_table_fp -- path to the input data table

    load_data_table_in_biom -- if True, load the data table as a BIOM table
    rather than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than
    just ids_of_interest (only consulted when load_data_table_in_biom is True)

    ids_to_load -- a list of OTU ids for which data should be loaded

    transpose -- forwarded to convert_precalc_to_biom when the table is
    loaded as tab-delimited

    verbose -- if True, print progress messages

    gzipped files are detected based on the '.gz' suffix.

    Returns the loaded table.  Raises IOError if data_table_fp does not exist.
    """
    # Fail early with an explanatory message instead of the bare errno text
    # that open() would produce for a missing file.
    if not path.exists(data_table_fp):
        raise IOError("File " + data_table_fp + " doesn't exist! Did you forget to download it?")

    ext = path.splitext(data_table_fp)[1]
    if ext == '.gz':
        genome_table_fh = gzip.open(data_table_fp, 'rb')
    else:
        genome_table_fh = open(data_table_fp, 'U')

    # Every branch reads from the handle; close it whether or not parsing
    # succeeds.  Single-argument print(...) keeps the output identical whether
    # or not print is a statement.
    try:
        if load_data_table_in_biom:
            if not suppress_subset_loading:
                #Now we want to use the OTU table information
                #to load only rows in the count table corresponding
                #to relevant OTUs
                if verbose:
                    print("Loading traits for %i organisms from the trait table" % len(ids_to_load))
                genome_table = load_subset_from_biom_str(genome_table_fh.read(),
                                                         ids_to_load, axis='samples')
            else:
                if verbose:
                    print("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
                genome_table = parse_biom_table(genome_table_fh.read())
        else:
            # NOTE(review): assumes convert_precalc_to_biom finishes reading
            # the handle before it returns -- confirm.
            genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load,
                                                   transpose=transpose)
    finally:
        genome_table_fh.close()

    if verbose:
        print("Done loading trait table containing %i functions for %i organisms." % (
            len(genome_table.ObservationIds), len(genome_table.SampleIds)))

    return genome_table
def main():
    """Predict a metagenome from an OTU table and a trait (count) table.

    Steps, driven by the parsed command line options:
      1. load the OTU table from opts.input_otu_table
      2. pick the trait table: opts.input_count_table if given, otherwise the
         precalculated KO or COG table under the PICRUSt project directory
      3. load the trait table (gzipped when the path ends in '.gz'), limited
         to the OTU ids unless opts.suppress_subset_loading is set
      4. if opts.accuracy_metrics is set, write weighted NSTI values there
      5. write the predicted metagenome to opts.output_metagenome_table,
         tab-delimited if opts.format_tab_delimited, otherwise as BIOM

    Raises ValueError when no count table is given and
    opts.type_of_prediction has no precalculated table.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # NOTE: prints use a single pre-formatted argument so the emitted text is
    # the same whether or not print is a statement (the doubled spaces below
    # reproduce what the original comma-separated print statements emitted).
    if opts.verbose:
        print("Loading OTU table:  %s" % (opts.input_otu_table,))

    with open(opts.input_otu_table, 'U') as otu_table_fh:
        otu_table = parse_biom_table(otu_table_fh)

    if opts.verbose:
        print("Done loading OTU table containing %i samples and %i OTUs." % (
            len(otu_table.SampleIds), len(otu_table.ObservationIds)))

    if opts.input_count_table is None:
        precalculated_files = {
            'KO': 'ko_precalculated.biom.gz',
            'COG': 'cog_precalculated.biom.gz',
        }
        # Previously an unrecognised prediction type left input_count_table
        # unbound and crashed later with a NameError; fail explicitly instead.
        if opts.type_of_prediction not in precalculated_files:
            raise ValueError(
                "No precalculated trait table for type_of_prediction %r; "
                "supply input_count_table explicitly." % (opts.type_of_prediction,))
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalculated_files[opts.type_of_prediction])
    else:
        input_count_table = opts.input_count_table

    if opts.verbose:
        print("Loading trait table:  %s" % (input_count_table,))

    ext = path.splitext(input_count_table)[1]
    if ext == '.gz':
        genome_table_fh = gzip.open(input_count_table, 'rb')
    else:
        genome_table_fh = open(input_count_table, 'U')
    try:
        genome_table_str = genome_table_fh.read()
    finally:
        genome_table_fh.close()

    #In the genome/trait table genomes are the samples and
    #genes are the observations
    if not opts.suppress_subset_loading:
        #Now we want to use the OTU table information
        #to load only rows in the count table corresponding
        #to relevant OTUs
        ids_to_load = otu_table.ObservationIds

        if opts.verbose:
            print("Loading traits for %i organisms from the trait table" % len(ids_to_load))

        genome_table = load_subset_from_biom_str(genome_table_str, ids_to_load, axis='samples')
    else:
        if opts.verbose:
            print("Loading *full* trait table because --suppress_subset_loading was passed. This may result in high memory usage.")
        genome_table = parse_biom_table(genome_table_str)

    if opts.verbose:
        print("Done loading trait table containing %i functions for %i organisms." % (
            len(genome_table.ObservationIds), len(genome_table.SampleIds)))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        weighted_nsti = calc_nsti(otu_table, genome_table, weighted=True)
        samples = weighted_nsti[0]
        nstis = list(weighted_nsti[1])

        # Sort only the per-sample rows so the header is always the first
        # line, regardless of how sample ids compare to '#'.
        data_lines = ["%s\tWeighted NSTI\t%s\n" % (sample, str(nsti))
                      for sample, nsti in zip(samples, nstis)]
        lines = ["#Sample\tMetric\tValue\n"] + sorted(data_lines)

        if opts.verbose:
            for l in lines:
                print(l)
            print("Writing accuracy information to file: %s" % (opts.accuracy_metrics,))

        with open(opts.accuracy_metrics, 'w') as accuracy_fh:
            accuracy_fh.writelines(lines)

    if opts.verbose:
        print("Predicting the metagenome...")

    predicted_metagenomes = predict_metagenomes(otu_table, genome_table)

    if opts.verbose:
        print("Writing results to output file:  %s" % (opts.output_metagenome_table,))

    # The output directory was already created by make_output_dir_for_file
    # above, so the call is not repeated here.
    if opts.format_tab_delimited:
        output_text = predicted_metagenomes.delimitedSelf(
            header_key="KEGG Pathways",
            header_value="KEGG Pathways",
            metadata_formatter=lambda s: '|'.join(['; '.join(l) for l in s]))
    else:
        output_text = format_biom_table(predicted_metagenomes)

    with open(opts.output_metagenome_table, 'w') as output_fh:
        output_fh.write(output_text)