Example #1
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)
    
    if(opts.parallel):
        tmp_dir='jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table =run_asr_in_parallel(tree=opts.input_tree_fp,table=opts.input_trait_table_fp,asr_method=opts.asr_method,parallel_method=opts.parallel_method, num_jobs=opts.num_jobs,tmp_dir=tmp_dir,verbose=opts.verbose)
    else:
        #call the appropriate ASR app controller
        if(opts.asr_method == 'wagner'):
            asr_table = wagner_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'bayestraits'):
            pass
        elif(opts.asr_method == 'ace_ml'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'ML',HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'ace_pic'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'pic',HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'ace_reml'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'REML',HALT_EXEC=opts.debug)


    #output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp,sep='\t')

    #output the CI file (unless the method is wagner)
    if not (opts.asr_method == 'wagner'):
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp,sep='\t')
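
The if/elif chain above dispatches on --asr_method; the same serial branch can be read as a table-driven lookup. A minimal sketch, assuming the wagner_for_picrust and ace_for_picrust wrappers already imported by the script above (the 'bayestraits' option is left unimplemented, as in the original):

def run_serial_asr(opts):
    # Illustrative sketch only; relies on wagner_for_picrust / ace_for_picrust
    # from the script's existing imports.
    ace = lambda method: ace_for_picrust(opts.input_tree_fp,
                                         opts.input_trait_table_fp,
                                         method, HALT_EXEC=opts.debug)
    dispatch = {
        'wagner': lambda: (wagner_for_picrust(opts.input_tree_fp,
                                              opts.input_trait_table_fp,
                                              HALT_EXEC=opts.debug), None),
        'ace_ml': lambda: ace('ML'),
        'ace_pic': lambda: ace('pic'),
        'ace_reml': lambda: ace('REML'),
    }
    asr_table, ci_table = dispatch[opts.asr_method]()
    return asr_table, ci_table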
Example #2
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading sequencing depth table: ",opts.input_seq_depth_file
    scaling_factors = {}
    for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')):
        scaling_factors[sample_id]=depth    
    
    ext=path.splitext(opts.input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    if (ext == '.gz'):
        genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb'))
    else:
        genome_table = parse_biom_table(open(opts.input_count_table,'U'))
    
    if opts.verbose:
        print "Scaling the metagenome..."
        
    scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    open(opts.output_metagenome_table,'w').write(format_biom_table(scaled_metagenomes))
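
The extension check above (gzip.open for '.gz', plain open otherwise) recurs throughout these scripts. A small illustrative helper that factors it out; it is not part of PICRUSt itself:

import gzip
from os import path

def open_count_table(fp):
    """Return an open handle for a count table, transparently handling .gz files."""
    if path.splitext(fp)[1] == '.gz':
        return gzip.open(fp, 'rb')
    # 'U' matches the Python 2 universal-newline mode used in the examples
    return open(fp, 'U')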
Example #3
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
  
    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ext=path.splitext(opts.input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    if (ext == '.gz'):
        genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb'))
    else:
        genome_table = parse_biom_table(open(opts.input_count_table,'U'))
    if opts.verbose:
        print "Predicting the metagenome..."
    
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions)
    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    open(opts.output_metagenome_table,'w').write(output_text)
Example #4
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)
    
    if(opts.parallel):
        tmp_dir='jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table =run_asr_in_parallel(tree=opts.input_tree_fp,table=opts.input_trait_table_fp,asr_method=opts.asr_method,parallel_method=opts.parallel_method, num_jobs=opts.num_jobs,tmp_dir=tmp_dir,verbose=opts.verbose)
    else:
        #call the appropriate ASR app controller
        if(opts.asr_method == 'wagner'):
            asr_table = wagner_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'bayestraits'):
            pass
        elif(opts.asr_method == 'ace_ml'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'ML',HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'ace_pic'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'pic',HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'ace_reml'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'REML',HALT_EXEC=opts.debug)


    #output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp,sep='\t')

    #output the CI file (unless the method is wagner)
    if not (opts.asr_method == 'wagner'):
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp,sep='\t')
Example #5
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext=path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table=parse_classic_table_to_rich_table(open(opts.input_otu_fp,'U'),None,None,None,DenseOTUTable)
    else:
        if input_ext != '.biom':
            sys.stderr.write("\nOTU table does not have '.biom' extension! If loading causes error consider using '-f' option to load tab-delimited OTU table!\n\n")
        otu_table = parse_biom_table(open(opts.input_otu_fp,'U'))

    ext=path.splitext(opts.input_count_fp)[1]
    if (ext == '.gz'):
        count_table = parse_biom_table(gzip.open(opts.input_count_fp,'rb'))
    else:
        count_table = parse_biom_table(open(opts.input_count_fp,'U'))
        
    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    ob_id=count_table.ObservationIds[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table=table_factory(filtered_values,otu_table.SampleIds,filtered_otus, constructor=DenseOTUTable)

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))
            
        except ValueError:
            raise ValueError,\
                  "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}
        
    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)
            

    normalized_table = filtered_otu_table.normObservationByMetadata(opts.metadata_identifer)

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp,'w').write(\
     normalized_table.getBiomFormatJsonString('PICRUST'))
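
The copy-number coercion in the loop above (cast to float, round, require a value of at least 1) can be read as a small standalone helper; this is purely illustrative and not a PICRUSt function:

def coerce_copy_number(value, otu_id):
    """Round a 16S copy number to an int and reject values below 1."""
    try:
        value = int(round(float(value)))
    except ValueError:
        raise ValueError("Invalid copy number %r for OTU ID %s. Must be int-able."
                         % (value, otu_id))
    if value < 1:
        raise ValueError("Copy numbers must be greater than or equal to 1.")
    return value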
Example #6
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ext=path.splitext(opts.input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    if (ext == '.gz'):
        genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb'))
    else:
        genome_table = parse_biom_table(open(opts.input_count_table,'U'))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False)
        #print "Unweighted NSTI:", unweighted_nsti
        
        weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True)
        samples= weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        #print "Samples:",samples
        #print "NSTIs:",nstis
        samples_and_nstis = zip(samples,nstis)
        #print "Samples and NSTIs:",samples_and_nstis
        lines = ["#Sample\tMetric\tValue\n"]
        #print weighted_nsti
        for sample,nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti))
            lines.append(line)

        if opts.verbose:
            for l in sorted(lines):
                print l
        if opts.verbose:
            print "Writing accuracy information to file:", opts.accuracy_metrics
        open(opts.accuracy_metrics,'w').writelines(sorted(lines))

    if opts.verbose:
        print "Predicting the metagenome..."
        
    predicted_metagenomes = predict_metagenomes(otu_table,genome_table)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    if(opts.format_tab_delimited):
        open(opts.output_metagenome_table,'w').write(predicted_metagenomes.delimitedSelf())
    else:
        open(opts.output_metagenome_table,'w').write(format_biom_table(predicted_metagenomes))
Example #7
def write_metagenome_to_file(
    predicted_metagenome,
    output_fp,
    tab_delimited=False,
    verbose_filetype_message="metagenome prediction",
    verbose=False,
):
    """Write a BIOM Table object to a file, creating the directory if needed
    predicted_metagenome -- a BIOM table object
    output_fp -- the filepath to write the output
    tab_delimited -- if False, write in BIOM format, otherwise write as a tab-delimited file
    verbose -- if True, output verbose info to stdout
    """

    if verbose:
        print "Writing %s results to output file: %s" % (verbose_filetype_message, output_fp)

    make_output_dir_for_file(output_fp)
    if tab_delimited:
        # peek at first observation to decide on what observation metadata
        # to output in tab-delimited format
        (obs_val, obs_id, obs_metadata) = predicted_metagenome.iter(axis="observation").next()

        # see if there is a metadata field that contains the "Description"
        # (e.g. KEGG_Description or COG_Description)
        h = re.compile(".*Description")
        metadata_names = filter(h.search, obs_metadata.keys())
        if metadata_names:
            # use the "Description" field we found
            metadata_name = metadata_names[0]
        elif obs_metadata.keys():
            # if no "Description" metadata then just output the first
            # observation metadata
            metadata_name = (obs_metadata.keys())[0]
        else:
            # if no observation metadata then don't output any
            metadata_name = None

        open(output_fp, "w").write(
            predicted_metagenome.to_tsv(
                header_key=metadata_name, header_value=metadata_name, metadata_formatter=biom_meta_to_string
            )
        )
    else:
        # output in BIOM format
        format_fs = {
            "KEGG_Description": picrust_formatter,
            "COG_Description": picrust_formatter,
            "KEGG_Pathways": picrust_formatter,
            "COG_Category": picrust_formatter,
        }
        write_biom_table(predicted_metagenome, output_fp, format_fs=format_fs)
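
A hypothetical call to write_metagenome_to_file on a tiny BIOM table, to show the tab-delimited branch in action. The table contents and KEGG_Description strings are made up, and the function is assumed to be importable from the module where it is defined:

import numpy as np
from biom import Table

# Two functions (observations) across two samples; metadata is illustrative only.
data = np.array([[10, 0],
                 [3, 7]])
table = Table(data,
              observation_ids=['K00001', 'K00002'],
              sample_ids=['SampleA', 'SampleB'],
              observation_metadata=[{'KEGG_Description': 'alcohol dehydrogenase'},
                                    {'KEGG_Description': 'hypothetical protein'}])

write_metagenome_to_file(table, 'out/predicted_metagenome.tsv',
                         tab_delimited=True, verbose=True)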
Example #8
def write_metagenome_to_file(predicted_metagenome,output_fp,\
    tab_delimited=False,verbose_filetype_message="metagenome prediction",\
    verbose=False):
    """Write a BIOM Table object to a file, creating the directory if needed
    predicted_metagenome -- a BIOM table object
    output_fp -- the filepath to write the output
    tab_delimited -- if False, write in BIOM format, otherwise write as a tab-delimited file
    verbose -- if True, output verbose info to stdout
    """

    if verbose:
        print "Writing %s results to output file: %s"\
          %(verbose_filetype_message,output_fp)

    make_output_dir_for_file(output_fp)
    if tab_delimited:
        #peek at first observation to decide on what observation metadata
        #to output in tab-delimited format
        (obs_val,obs_id,obs_metadata)=\
          predicted_metagenome.iter(axis='observation').next()

        #see if there is a metadata field that contains the "Description"
        #(e.g. KEGG_Description or COG_Description)
        h = re.compile('.*Description')
        metadata_names = filter(h.search, obs_metadata.keys())
        if metadata_names:
            #use the "Description" field we found
            metadata_name = metadata_names[0]
        elif (obs_metadata.keys()):
            #if no "Description" metadata then just output the first
            #observation metadata
            metadata_name = (obs_metadata.keys())[0]
        else:
            #if no observation metadata then don't output any
            metadata_name = None

        open(output_fp,'w').write(predicted_metagenome.to_tsv(\
          header_key=metadata_name,header_value=metadata_name,metadata_formatter=biom_meta_to_string))
    else:
        #output in BIOM format
        format_fs = {
            'KEGG_Description': picrust_formatter,
            'COG_Description': picrust_formatter,
            'KEGG_Pathways': picrust_formatter,
            'COG_Category': picrust_formatter
        }
        write_biom_table(predicted_metagenome, output_fp, format_fs=format_fs)
Example #9
def write_metagenome_to_file(predicted_metagenome,output_fp,\
    tab_delimited=False,verbose_filetype_message="metagenome prediction",\
    verbose=False):
    """Write a BIOM Table object to a file, creating the directory if needed
    predicted_metagenome -- a BIOM table object
    output_fp -- the filepath to write the output
    tab_delimited -- if False, write in BIOM format, otherwise write as a tab-delimited file
    verbose -- if True, output verbose info to stdout
    """

    if verbose:
        print "Writing %s results to output file: %s"\
          %(verbose_filetype_message,output_fp)

    make_output_dir_for_file(output_fp)
    if tab_delimited:
        #peek at first observation to decide on what observation metadata
        #to output in tab-delimited format
        (obs_val,obs_id,obs_metadata)=\
          predicted_metagenome.iterObservations().next()

        #see if there is a metadata field that contains the "Description" 
        #(e.g. KEGG_Description or COG_Description)
        h = re.compile('.*Description')
        metadata_names=filter(h.search,obs_metadata.keys())
        if metadata_names:
            #use the "Description" field we found
            metadata_name=metadata_names[0]
        elif(obs_metadata.keys()):
            #if no "Description" metadata then just output the first 
            #observation metadata
            metadata_name=(obs_metadata.keys())[0]
        else:
            #if no observation metadata then don't output any
            metadata_name=None
            
        open(output_fp,'w').write(predicted_metagenome.delimitedSelf(\
          header_key=metadata_name,header_value=metadata_name))
    else:
        #output in BIOM format
        open(output_fp,'w').write(format_biom_table(predicted_metagenome))
Example #10
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading sequencing depth table: ", opts.input_seq_depth_file
    scaling_factors = {}
    for sample_id, depth in parse_seq_count_file(open(opts.input_seq_depth_file, "U")):
        scaling_factors[sample_id] = depth

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    genome_table = load_table(opts.input_count_table)

    if opts.verbose:
        print "Scaling the metagenome..."

    scaled_metagenomes = scale_metagenomes(genome_table, scaling_factors)

    if opts.verbose:
        print "Writing results to output file: ", opts.output_metagenome_table

    make_output_dir_for_file(opts.output_metagenome_table)
    write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
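
parse_seq_count_file is a PICRUSt helper whose exact format handling is not shown in these examples. A minimal stand-in sketch, assuming a tab-delimited file with one sample_id/depth pair per line (the real parser may accept a richer format):

def parse_seq_count_file(lines):
    """Yield (sample_id, depth) pairs from a tab-delimited sequencing depth file.

    Minimal sketch only; format assumptions noted above.
    """
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        sample_id, depth = line.split('\t')[:2]
        yield sample_id, float(depth)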
Example #11
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading sequencing depth table: ",opts.input_seq_depth_file
    scaling_factors = {}
    for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')):
        scaling_factors[sample_id]=depth

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    genome_table = load_table(opts.input_count_table)

    if opts.verbose:
        print "Scaling the metagenome..."

    scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table

    make_output_dir_for_file(opts.output_metagenome_table)
    write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
Example #12
        try:
            assert set(variance_table.ids()) == set(genome_table.ids())
        except AssertionError, e:
            for var_id in variance_table.ids():
                if var_id not in genome_table.ids():
                    print "Variance table SampleId %s not in genome_table SampleIds" % var_id
            raise AssertionError(
                "Variance table and genome table contain different OTU ids")

        #sort the ObservationIds and SampleIds to be in the same order
        variance_table = variance_table.sort_order(
            genome_table.ids(axis='observation'), axis='observation')
        variance_table = variance_table.sort_order(genome_table.ids(),
                                                   axis='sample')

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        weighted_nsti = calc_nsti(otu_table, genome_table, weighted=True)
        samples = weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        samples_and_nstis = zip(samples, nstis)
        if opts.verbose:
            print "Writing NSTI information to file:", opts.accuracy_metrics
        accuracy_output_fh = open(opts.accuracy_metrics, 'w')
        accuracy_output_fh.write("#Sample\tMetric\tValue\n")
        for sample, nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" % (sample, str(nsti))
            accuracy_output_fh.write(line)
Example #13
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext = path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table = parse_classic_table_to_rich_table(
            open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable)
    else:
        try:
            otu_table = parse_biom_table(open(opts.input_otu_fp, 'U'))
        except ValueError:
            raise ValueError(
                "Error loading OTU table! If not in BIOM format use '-f' option.\n"
            )

    ids_to_load = otu_table.ObservationIds

    if (opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            ['16S', opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')

    if opts.load_precalc_file_in_biom:
        count_table = parse_biom_table(count_table_fh.read())
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    ob_id = count_table.ObservationIds[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table = table_factory(filtered_values,
                                       otu_table.SampleIds,
                                       filtered_otus,
                                       constructor=DenseOTUTable)

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id, x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError,\
                  "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)

    normalized_table = filtered_otu_table.normObservationByMetadata(
        opts.metadata_identifer)

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table,
                                                     normalized_table,
                                                     'ObservationMetadata')
    normalized_otu_table = transfer_sample_metadata(otu_table,
                                                    normalized_table,
                                                    'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp, 'w').write(format_biom_table(normalized_table))
Example #14
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table,'rb')
    else:
        count_table_fh = open(input_count_table,'U')

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iter(axis='observation'):
        ids.append(str(x[1]))

    ob_id=count_table.ids(axis='observation')[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.exists(x, axis='sample'):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis='observation'))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError,\
                  "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation')

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])
    normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation')

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation')

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
Example #15
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #if we specify we want NSTI only then we have to calculate it first
    if opts.output_accuracy_metrics_only:
        opts.calculate_accuracy_metrics=True

    if opts.verbose:
        print "Loading tree from file:", opts.tree

    if opts.no_round:
        round_opt = False 
    else:
        round_opt = True

    # Load Tree
    tree = load_picrust_tree(opts.tree, opts.verbose)

    table_headers=[]
    traits={}
    #load the asr trait table using the previous list of functions to order the arrays
    if opts.reconstructed_trait_table:
        table_headers,traits =\
                update_trait_dict_from_file(opts.reconstructed_trait_table)

        #Only load confidence intervals on the reconstruction
        #If we actually have ASR values in the analysis
        if opts.reconstruction_confidence:
            if opts.verbose:
                print "Loading ASR confidence data from file:",\
                opts.reconstruction_confidence
                print "Assuming confidence data is of type:",opts.confidence_format

            asr_confidence_output = open(opts.reconstruction_confidence)
            asr_min_vals,asr_max_vals, params,column_mapping =\
              parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format)
            if 'sigma' in params:
                brownian_motion_parameter = params['sigma'][0]
            else:
                brownian_motion_parameter = None

            if opts.verbose:
                print "Done. Loaded %i confidence interval values." %(len(asr_max_vals))
                print "Brownian motion parameter:",brownian_motion_parameter
        else:
            brownian_motion_parameter = None

    #load the trait table into a dict with organism names as keys and arrays as functions
    table_headers,genome_traits =\
            update_trait_dict_from_file(opts.observed_trait_table,table_headers)


    #Combine the trait tables overwriting the asr ones if they exist in the genome trait table.
    traits.update(genome_traits)

    # Specify the attribute where we'll store the reconstructions
    trait_label = "Reconstruction"

    if opts.verbose:
        print "Assigning traits to tree..."

    # Decorate tree using the traits
    tree = assign_traits_to_tree(traits,tree, trait_label=trait_label)


    if opts.reconstruction_confidence:
        if opts.verbose:
            print "Assigning trait confidence intervals to tree..."
        tree = assign_traits_to_tree(asr_min_vals,tree,\
            trait_label="lower_bound")

        tree = assign_traits_to_tree(asr_max_vals,tree,\
            trait_label="upper_bound")

        if brownian_motion_parameter is None:

             if opts.verbose:
                 print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..."
             brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\
                      upper_bound_trait_label="upper_bound",\
                      lower_bound_trait_label="lower_bound",\
                      trait_label=trait_label,\
                      confidence=0.95)
             if opts.verbose:
                 print "Inferred the following rate parameters:",brownian_motion_parameter
    if opts.verbose:
        print "Collecting list of nodes to predict..."

    #Start by predicting all tip nodes.
    nodes_to_predict = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print "Found %i nodes to predict." % len(nodes_to_predict)

    if opts.limit_predictions_to_organisms:
        organism_id_str = opts.limit_predictions_to_organisms
        ok_organism_ids = organism_id_str.split(',')
        ok_organism_ids = [n.strip() for n in ok_organism_ids]
        for f in set_label_conversion_fns(True,True):
            ok_organism_ids = [f(i) for i in ok_organism_ids]

        if opts.verbose:
            print "Limiting predictions to user-specified ids:",\
              ",".join(ok_organism_ids)


        if not ok_organism_ids:
            raise RuntimeError(\
              "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\
              % opts.limit_predictions_to_organisms)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in ok_organism_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by user-specified ids resulted in an empty set of nodes to predict.   Are the ids on the commmand-line and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0]))

        if opts.verbose:
            print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict))

    if opts.limit_predictions_by_otu_table:
        if opts.verbose:
            print "Limiting predictions to ids in user-specified OTU table:",\
              opts.limit_predictions_by_otu_table
        otu_table = open(opts.limit_predictions_by_otu_table,"U")
        #Parse OTU table for ids

        otu_ids =\
          extract_ids_from_table(otu_table.readlines(),delimiter="\t")

        if not otu_ids:
            raise RuntimeError(\
              "Found no valid ids in input OTU table: %s.  Is the path correct?"\
              % opts.limit_predictions_by_otu_table)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in otu_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by OTU table resulted in an empty set of nodes to predict.   Are the OTU ids and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0]))

        if opts.verbose:
            print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict))

    # Calculate accuracy of PICRUST for the given tree, sequenced genomes
    # and set of nodes to predict
    accuracy_metrics = ['NSTI']
    accuracy_metric_results = None
    if opts.calculate_accuracy_metrics:
        if opts.verbose:
            print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)])
        accuracy_metric_results = {}
        if 'NSTI' in accuracy_metrics:

            nsti_result,min_distances =\
                calc_nearest_sequenced_taxon_index(tree,\
                limit_to_tips = nodes_to_predict,\
                trait_label = trait_label, verbose=opts.verbose)

            #accuracy_metric_results['NSTI'] = nsti_result
            for organism in min_distances.keys():
                accuracy_metric_results[organism] = {'NSTI': min_distances[organism]}

            if opts.verbose:
                print "NSTI:", nsti_result

        if opts.output_accuracy_metrics_only:
            #Write accuracy metrics to file
            if opts.verbose:
                print "Writing accuracy metrics to file:",opts.output_accuracy_metrics

            f = open(opts.output_accuracy_metrics_only,'w+')
            f.write("metric\torganism\tvalue\n")
            lines =[]
            for organism in accuracy_metric_results.keys():
                for metric in accuracy_metric_results[organism].keys():
                    lines.append('\t'.join([metric,organism,\
                      str(accuracy_metric_results[organism][metric])])+'\n')
            f.writelines(sorted(lines))
            f.close()
            exit()


    if opts.verbose:
        print "Generating predictions using method:",opts.prediction_method

    if opts.weighting_method == 'exponential':
        #For now, use exponential weighting
        weight_fn = make_neg_exponential_weight_fn(e)

    variances=None #Overwritten by methods that calc variance
    confidence_intervals=None #Overwritten by methods that calc variance

    if opts.prediction_method == 'asr_and_weighting':
        # Perform predictions using reconstructed ancestral states

        if opts.reconstruction_confidence:
            predictions,variances,confidence_intervals =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              lower_bound_trait_label="lower_bound",\
              upper_bound_trait_label="upper_bound",\
              calc_confidence_intervals = True,\
              brownian_motion_parameter=brownian_motion_parameter,\
              weight_fn=weight_fn,verbose=opts.verbose,
              round_predictions=round_opt)

        else:
             predictions =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              weight_fn =weight_fn,verbose=opts.verbose,
              round_predictions=round_opt)

    elif opts.prediction_method == 'weighting_only':
        #Ignore ancestral information
        predictions =\
          weighted_average_tip_prediction(tree,nodes_to_predict,\
          trait_label=trait_label,\
          weight_fn =weight_fn,verbose=opts.verbose)



    elif opts.prediction_method == 'nearest_neighbor':

        predictions = predict_nearest_neighbor(tree,nodes_to_predict,\
          trait_label=trait_label,tips_only = True)

    elif opts.prediction_method == 'random_neighbor':

        predictions = predict_random_neighbor(tree,\
          nodes_to_predict,trait_label=trait_label)

    if opts.verbose:
        print "Done making predictions."

    make_output_dir_for_file(opts.output_trait_table)

    out_fh=open(opts.output_trait_table,'w')
    #Generate the table of biom predictions
    if opts.verbose:
        print "Converting results to .biom format for output..."

    biom_predictions=biom_table_from_predictions(predictions,table_headers,\
                                                         observation_metadata=None,\
                                                         sample_metadata=accuracy_metric_results,convert_to_int=False)
    if opts.verbose:
        print "Writing prediction results to file: ",opts.output_trait_table

    if opts.output_precalc_file_in_biom:

        #write biom table to file
        write_biom_table(biom_predictions, opts.output_trait_table)

    else:
        #convert to precalc (tab-delimited) format

        out_fh = open(opts.output_trait_table, 'w')
        out_fh.write(convert_biom_to_precalc(biom_predictions))
        out_fh.close()

    #Write out variance information to file
    if variances:

        if opts.verbose:
            print "Converting variances to BIOM format"

        if opts.output_precalc_file_in_biom:
            suffix='.biom'
        else:
            suffix='.tab'

        biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\
        observation_metadata=None,\
        sample_metadata=None,convert_to_int=False)
        outfile_base,extension = splitext(opts.output_trait_table)
        variance_outfile = outfile_base+"_variances"+suffix
        make_output_dir_for_file(variance_outfile)

        if opts.verbose:
            print "Writing variance information to file:",variance_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_variances, variance_outfile)
        else:
            open(variance_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_variances))


    if confidence_intervals:

        if opts.verbose:
            print "Converting upper confidence interval values to BIOM format"

        biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base,extension = splitext(opts.output_trait_table)
        upper_CI_outfile = outfile_base+"_upper_CI"+suffix
        make_output_dir_for_file(upper_CI_outfile)

        if opts.verbose:
            print "Writing upper confidence limit information to file:",upper_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_upper_CI, upper_CI_outfile)
        else:
            open(upper_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_upper_CI))

        biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base,extension = splitext(opts.output_trait_table)
        lower_CI_outfile = outfile_base+"_lower_CI"+suffix
        make_output_dir_for_file(lower_CI_outfile)

        if opts.verbose:
            print "Writing lower confidence limit information to file",lower_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_lower_CI, lower_CI_outfile)
        else:
            open(lower_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_lower_CI))
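
make_neg_exponential_weight_fn(e) above builds the weighting applied to neighboring tips during prediction. One plausible minimal sketch, assuming the weight simply decays exponentially with branch-length distance (the actual PICRUSt implementation may differ):

from math import e

def make_neg_exponential_weight_fn(exp_base=e):
    """Return a weight function of the form exp_base ** (-distance). Sketch only."""
    def neg_exponential_weight(d):
        # closer relatives (smaller d) receive exponentially larger weights
        return exp_base ** (-d)
    return neg_exponential_weight

weight_fn = make_neg_exponential_weight_fn(e)  # with base e this is exp(-d)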
Example #16
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    if opts.verbose:
        print "Loading tree from file:", opts.tree
    
    # Load Tree
    #tree = LoadTree(opts.tree)
    tree = load_picrust_tree(opts.tree, opts.verbose)

    table_headers =[]
    traits={}
    #load the asr trait table using the previous list of functions to order the arrays
    if opts.reconstructed_trait_table:
        table_headers,traits =\
                update_trait_dict_from_file(opts.reconstructed_trait_table)

        #Only load confidence intervals on the reconstruction
        #If we actually have ASR values in the analysis
        if opts.reconstruction_confidence:
            if opts.verbose:
                print "Loading ASR confidence data from file:",\
                opts.reconstruction_confidence
            
            asr_confidence_output = open(opts.reconstruction_confidence)
            asr_min_vals,asr_max_vals, params,column_mapping =\
              parse_asr_confidence_output(asr_confidence_output)
            brownian_motion_parameter = params['sigma'][0]
            brownian_motion_error = params['sigma'][1]
            if opts.verbose:
                print "Done. Loaded %i confidence interval values." %(len(asr_max_vals))
                print "Brownian motion parameter:",brownian_motion_parameter
        else:
            brownian_motion_parameter = None

    #load the trait table into a dict with organism names as keys and arrays as functions
    table_headers,genome_traits =\
            update_trait_dict_from_file(opts.observed_trait_table,table_headers)


    #Combine the trait tables overwriting the asr ones if they exist in the genome trait table.
    traits.update(genome_traits)
        
    # Specify the attribute where we'll store the reconstructions
    trait_label = "Reconstruction"
   
    if opts.verbose:
        print "Assigning traits to tree..."

    # Decorate tree using the traits
    tree = assign_traits_to_tree(traits,tree, trait_label=trait_label)

    
    if opts.reconstruction_confidence: 
        if opts.verbose:
            print "Assigning trait confidence intervals to tree..."
        tree = assign_traits_to_tree(asr_min_vals,tree,\
            trait_label="lower_bound")

        tree = assign_traits_to_tree(asr_max_vals,tree,\
            trait_label="upper_bound")


    if opts.verbose:
        print "Collecting list of nodes to predict..."

    #Start by predicting all tip nodes.
    nodes_to_predict = [tip.Name for tip in tree.tips()]
    
    if opts.verbose:
        print "Found %i nodes to predict." % len(nodes_to_predict)

    if opts.limit_predictions_to_organisms:
        organism_id_str = opts.limit_predictions_to_organisms
        ok_organism_ids = organism_id_str.split(',')
        ok_organism_ids = [n.strip() for n in ok_organism_ids]
        for f in set_label_conversion_fns(True,True):
            ok_organism_ids = [f(i) for i in ok_organism_ids]
        
        if opts.verbose:
            print "Limiting predictions to user-specified ids:",\
              ",".join(ok_organism_ids)
        
        
        if not ok_organism_ids:
            raise RuntimeError(\
              "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\
              % opts.limit_predictions_to_organisms)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in ok_organism_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by user-specified ids resulted in an empty set of nodes to predict.   Are the ids on the commmand-line and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0]))
        
        if opts.verbose:
            print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict))
    
    if opts.limit_predictions_by_otu_table:
        if opts.verbose:
            print "Limiting predictions to ids in user-specified OTU table:",\
              opts.limit_predictions_by_otu_table
        otu_table = open(opts.limit_predictions_by_otu_table,"U")
        #Parse OTU table for ids
        
        otu_ids =\
          extract_ids_from_table(otu_table.readlines(),delimiter="\t")
        
        if not otu_ids:
            raise RuntimeError(\
              "Found no valid ids in input OTU table: %s.  Is the path correct?"\
              % opts.limit_predictions_by_otu_table)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in otu_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by OTU table resulted in an empty set of nodes to predict.   Are the OTU ids and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0]))
        
        if opts.verbose:
            print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict))

    # Calculate accuracy of PICRUST for the given tree, sequenced genomes
    # and set of nodes to predict
    accuracy_metrics = ['NSTI']
    accuracy_metric_results = None
    if opts.output_accuracy_metrics:
        if opts.verbose:
            print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)])
        accuracy_metric_results = {}
        if 'NSTI' in accuracy_metrics:

            nsti_result,min_distances =\
                calc_nearest_sequenced_taxon_index(tree,\
                limit_to_tips = nodes_to_predict,\
                trait_label = trait_label, verbose=opts.verbose)
            
            #accuracy_metric_results['NSTI'] = nsti_result
            for organism in min_distances.keys():
                accuracy_metric_results[organism] = {'NSTI': min_distances[organism]}
        
            if opts.verbose:
                print "NSTI:", nsti_result
   
        #Write accuracy metrics to file
        if opts.verbose:
            print "Writing accuracy metrics to file:",opts.output_accuracy_metrics
   
        f = open(opts.output_accuracy_metrics,'w+')
        lines = ["metric\torganism\tvalue\n"]
        for organism in accuracy_metric_results.keys():
            for metric in accuracy_metric_results[organism].keys():
                lines.append('\t'.join([metric,organism,\
                  str(accuracy_metric_results[organism][metric])])+'\n')
        f.writelines(sorted(lines))
        f.close()


    if opts.verbose:
        print "Generating predictions using method:",opts.prediction_method

    if opts.weighting_method == 'exponential':
        #For now, use exponential weighting
        weight_fn = make_neg_exponential_weight_fn(e)
    elif opts.weighting_method == 'linear':
        #Linear weight function
        weight_fn = linear_weight
    elif opts.weighting_method == 'equal_weight':
        weight_fn = equal_weight

    variances=None #Overwritten by methods that calc variance

    if opts.prediction_method == 'asr_and_weighting': 
  
        if opts.reconstruction_confidence:
        # Perform predictions using reconstructed ancestral states
            predictions,variances =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              lower_bound_trait_label="lower_bound",\
              upper_bound_trait_label="upper_bound",\
              calc_confidence_intervals = True,\
              brownian_motion_parameter=brownian_motion_parameter,\
              use_self_in_prediction = True,\
              weight_fn =weight_fn,verbose=opts.verbose)
    
        else:
             predictions =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              use_self_in_prediction = True,\
              weight_fn =weight_fn,verbose=opts.verbose)
    
    elif opts.prediction_method == 'weighting_only':
        #Ignore ancestral information
        predictions =\
          weighted_average_tip_prediction(tree,nodes_to_predict,\
          trait_label=trait_label,\
          use_self_in_prediction = True,\
          weight_fn =weight_fn,verbose=opts.verbose)
        


    elif opts.prediction_method == 'nearest_neighbor':
        
        predictions = predict_nearest_neighbor(tree,nodes_to_predict,\
          trait_label=trait_label,\
          use_self_in_prediction = True, tips_only = True)

    elif opts.prediction_method == 'random_neighbor':
        
        predictions = predict_random_neighbor(tree,\
          nodes_to_predict,trait_label=trait_label,\
          use_self_in_prediction = True)
    else:
        error_template =\
          "Prediction method '%s' is not supported. Valid methods are: %s"

        error_text = error_template %(opts.prediction_method,\
          ", ".join(METHOD_CHOICES))
        raise ValueError(error_text)

    if opts.verbose:
        print "Converting results to .biom format for output..."
    #convert to biom format (and transpose)
    biom_predictions=biom_table_from_predictions(predictions,table_headers)
    #In the .biom table, organisms are 'samples' and traits are 'observations'
    #(by analogy with a metagenomic sample)
    
    #Therefore, we associate the trait variances with the per-observation metadata
    
    #print "variances:",variances
    #print "BIOM observations:", [o for o in biom_predictions.iterObservations()] 
    #print "BIOM samples:", [s for s in biom_predictions.iterSamples()] 
    
    if variances is not None:
        if opts.verbose:
            print "Adding variance information to output .biom table, as per-observation metadata with key 'variance'..."
        biom_predictions.addSampleMetadata(variances)
    
    if accuracy_metric_results is not None:
        if opts.verbose:
            print "Adding accuracy metrics (%s) to biom table as per-observation metadata..." %(",".join(accuracy_metrics))
        biom_predictions.addSampleMetadata(accuracy_metric_results)
        
    #Add variance information as per observation metadata
    
    if opts.verbose:
        print "Writing biom format prediction results to file: ",opts.output_trait_table
    #write biom table to file
    make_output_dir_for_file(opts.output_trait_table)
    open(opts.output_trait_table,'w').write(\
     format_biom_table(biom_predictions))
Example #17
                if var_id not in genome_table.ObservationIds:
                    print "Variance table ObsId %s not in genome_table ObsIds" %var_id
            raise AssertionError("Variance table and genome table contain different gene ids")
        try:
            assert set(variance_table.SampleIds) == set(genome_table.SampleIds)
        except AssertionError,e:
            for var_id in variance_table.SampleIds:
                if var_id not in genome_table.SampleIds:
                    print "Variance table SampleId %s not in genome_table SampleIds" %var_id
            raise AssertionError("Variance table and genome table contain different OTU ids")

        #sort the ObservationIds and SampleIds to be in the same order
        variance_table=variance_table.sortObservationOrder(genome_table.ObservationIds)
        variance_table=variance_table.sortSampleOrder(genome_table.SampleIds)

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True)
        samples= weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        samples_and_nstis = zip(samples,nstis)
        if opts.verbose:
            print "Writing NSTI information to file:", opts.accuracy_metrics
        accuracy_output_fh = open(opts.accuracy_metrics,'w')
        accuracy_output_fh.write("#Sample\tMetric\tValue\n")
        for sample,nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti))
            accuracy_output_fh.write(line)
Example #18
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)


    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table

    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')

    #In the genome/trait table genomes are the samples and
    #genes are the observations


    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs

            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(genome_table_fh)
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    ok_functional_categories = None

    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        if opts.verbose:
            print "Limiting to functional categories: %s" %(str(ok_functional_categories))

        # Either KEGG_Pathways or COG_Category needs
        # to be assigned to metadata_key to limit to
        # functional categories (not needed for 
        # individual functions) 

        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")
              
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions,\
      limit_to_functional_categories = ok_functional_categories ,  metadata_key = metadata_type )

    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp

    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
Example #19
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    verbose = opts.verbose

    min_args = 1
    if len(args) < min_args:
        option_parser.error(
            'One or more predicted biom files must be provided.')
    observed_files = args

    make_output_dir_for_file(opts.output_fp)
    out_fh = open(opts.output_fp, 'w')

    if verbose:
        print "Loading expected trait table file:", opts.exp_trait_table_fp

    exp_table = load_table(opts.exp_trait_table_fp)

    header_printed = False
    header_keys = []
    delimiter = "\t"

    for observed_file in observed_files:
        observed_file_name = basename(observed_file)

        if verbose:
            print "Loading predicted trait table file:", observed_file_name

        obs_table = load_table(observed_file)

        if opts.compare_observations:
            if verbose:
                print "Transposing tables to allow evaluation of observations (instead of samples)..."
            obs_table = obs_table.transpose()
            exp_table = exp_table.transpose()

        if verbose:
            print "Matching predicted and expected tables..."

        obs, exp = match_biom_tables(
            obs_table,
            exp_table,
            verbose=verbose,
            limit_to_expected_observations=opts.limit_to_expected_observations,
            limit_to_observed_observations=opts.limit_to_observed_observations,
            normalize=opts.normalize,
            shuffle_samples=opts.shuffle_samples)

        if verbose:
            print "Calculating accuracy stats for all observations..."

        #import pdb; pdb.set_trace()
        for i in obs:
            if verbose:
                print "Calculating stats for: ", i
            if opts.not_relative_abundance_scores:
                results = calculate_accuracy_stats_from_observations(
                    obs[i], exp[i], success_criterion='binary')
            else:
                results = calculate_accuracy_stats_from_observations(
                    obs[i], exp[i], success_criterion='ra_exact')

            #If first pass then print out header
            if not header_printed:
                header_printed = True
                header_keys = sorted(results.keys())
                out_fh.write(
                    delimiter.join(['file', 'label'] + header_keys) + "\n")

            #print results using same order as header
            values = [observed_file_name, i
                      ] + ['{0:.3g}'.format(results[x]) for x in header_keys]
            out_str = delimiter.join(map(str, values)) + "\n"
            out_fh.write(out_str)
Example #20
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))

    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." %(len(otu_table.SampleIds),len(otu_table.ObservationIds))
    if(opts.input_count_table is None):
        if(opts.type_of_prediction == 'KO'):
            input_count_table=join(get_picrust_project_dir(),'picrust','data','ko_precalculated.biom.gz')
        elif(opts.type_of_prediction == 'COG'):
            input_count_table=join(get_picrust_project_dir(),'picrust','data','cog_precalculated.biom.gz')
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    
    ext=path.splitext(input_count_table)[1]
    
    if (ext == '.gz'):
        genome_table_str = gzip.open(input_count_table,'rb').read()
    else:
        genome_table_str = open(input_count_table,'U').read()
    
    #In the genome/trait table genomes are the samples and 
    #genes are the observations
    
    if not opts.suppress_subset_loading:
        #Now we want to use the OTU table information
        #to load only rows in the count table corresponding
        #to relevant OTUs
        ids_to_load = otu_table.ObservationIds

        if opts.verbose:
            print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

        genome_table = load_subset_from_biom_str(genome_table_str,ids_to_load,axis='samples')
    else:
        if opts.verbose:
            print "Loading *full* trait table because --suppress_subset_loading was passed. This may result in high memory usage."
        genome_table = parse_biom_table(genome_table_str)
    
    if opts.verbose:
        print "Done loading trait table containing %i functions for %i organisms." %(len(genome_table.ObservationIds),len(genome_table.SampleIds))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False)
        #print "Unweighted NSTI:", unweighted_nsti
        
        weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True)
        samples= weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        #print "Samples:",samples
        #print "NSTIs:",nstis
        samples_and_nstis = zip(samples,nstis)
        #print "Samples and NSTIs:",samples_and_nstis
        lines = ["#Sample\tMetric\tValue\n"]
        #print weighted_nsti
        for sample,nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti))
            lines.append(line)

        if opts.verbose:
            for l in sorted(lines):
                print l
        if opts.verbose:
            print "Writing accuracy information to file:", opts.accuracy_metrics
        open(opts.accuracy_metrics,'w').writelines(sorted(lines))

    if opts.verbose:
        print "Predicting the metagenome..."
        
    predicted_metagenomes = predict_metagenomes(otu_table,genome_table)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    if(opts.format_tab_delimited):
        open(opts.output_metagenome_table,'w').write(predicted_metagenomes.delimitedSelf(header_key="KEGG Pathways",header_value="KEGG Pathways",metadata_formatter=lambda s: '|'.join(['; '.join(l) for l in s])))
    else:
        open(opts.output_metagenome_table,'w').write(format_biom_table(predicted_metagenomes))
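The accuracy-metrics block above pairs each sample id with its weighted NSTI and writes one tab-delimited line per sample under a '#Sample\tMetric\tValue' header. A sketch of just that formatting step, with hypothetical sample ids and NSTI values:

samples = ["Sample1", "Sample2", "Sample3"]   # hypothetical sample ids
nstis = [0.03, 0.12, 0.07]                    # hypothetical weighted NSTI values
lines = ["#Sample\tMetric\tValue\n"]
for sample, nsti in zip(samples, nstis):
    lines.append("%s\tWeighted NSTI\t%s\n" % (sample, str(nsti)))
# the header sorts first because '#' precedes alphanumerics, matching the code above
print("".join(sorted(lines)))
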
Example #21
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext=path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table=parse_classic_table_to_rich_table(open(opts.input_otu_fp,'U'),None,None,None,DenseOTUTable)
    else:
        try:
            otu_table = parse_biom_table(open(opts.input_otu_fp,'U'))
        except ValueError:
            raise ValueError("Error loading OTU table! If not in BIOM format use '-f' option.\n")

    ids_to_load = otu_table.ObservationIds
    
    if(opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]
    
    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table,'rb')
    else:
        count_table_fh = open(input_count_table,'U')
       
    if opts.load_precalc_file_in_biom:
        count_table = parse_biom_table(count_table_fh.read())
    else:
        count_table = convert_precalc_to_biom(count_table_fh,ids_to_load)

    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    ob_id=count_table.ObservationIds[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table=table_factory(filtered_values,otu_table.SampleIds,filtered_otus, constructor=DenseOTUTable)

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))
            
        except ValueError:
            raise ValueError,\
                  "Invalid copy number (%s) for OTU ID %s. Must be int-able." % (value,x)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}
        
    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)
            

    normalized_table = filtered_otu_table.normObservationByMetadata(opts.metadata_identifer)
    
    #move observation and sample metadata from the original to the normalized OTU table
    normalized_table = transfer_observation_metadata(otu_table,normalized_table,'ObservationMetadata')
    normalized_table = transfer_sample_metadata(otu_table,normalized_table,'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp,'w').write(format_biom_table(normalized_table))
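normObservationByMetadata divides each OTU's counts by the copy-number value stored in its observation metadata. A plain-Python sketch of the same arithmetic, detached from the BIOM API and using hypothetical counts and copy numbers:

otu_counts = {"OTU_1": [10, 4, 0],   # hypothetical per-sample counts
              "OTU_2": [3, 3, 9]}
copy_numbers = {"OTU_1": 2,          # hypothetical 16S copy numbers (always >= 1)
                "OTU_2": 3}
normalized = {}
for otu_id, counts in otu_counts.items():
    copies = float(copy_numbers[otu_id])
    normalized[otu_id] = [c / copies for c in counts]
# normalized == {"OTU_1": [5.0, 2.0, 0.0], "OTU_2": [1.0, 1.0, 3.0]}
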
Example #22
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis="observation")

    if opts.input_count_fp is None:
        # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = "_".join(["16S", opts.gg_version, "precalculated.tab.gz"])
        input_count_table = join(get_picrust_project_dir(), "picrust", "data", precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if ext == ".gz":
        count_table_fh = gzip.open(input_count_table, "rb")
    else:
        count_table_fh = open(input_count_table, "U")

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    # Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iter(axis="observation"):
        ids.append(str(x[1]))

    ob_id = count_table.ids(axis="observation")[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.exists(x, axis="sample"):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis="observation"))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError, "Invalid copy number (%s) for OTU ID %s. Must be int-able." % (value, x)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis="observation")

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])

    normalized_table = filtered_otu_table.transform(metadata_norm, axis="observation")

    # move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, "observation")

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
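In this BIOM 2.x version the normalization is expressed as a callback handed to Table.transform, which receives the row values, the observation id, and that row's metadata. A small numpy sketch of what the callback computes for one row, assuming the copy number lives under a hypothetical metadata key:

import numpy as np

metadata_identifier = "CopyNumber"   # hypothetical metadata key

def metadata_norm(values, obs_id, metadata):
    # divide one row of counts by the copy number stored in that row's metadata
    return values / float(metadata[metadata_identifier])

row = np.array([10.0, 4.0, 0.0])
print(metadata_norm(row, "OTU_1", {"CopyNumber": 2}))   # -> [5. 2. 0.]
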
Example #23
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py')

    if (opts.parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif (opts.parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif (opts.parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel method: %s" % opts.parallel_method)

    if (opts.verbose):
        print "Loading tree..."

    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]

    if (opts.verbose):
        print "Total number of possible tips to predict: {0}".format(
            len(all_tips))

    created_tmp_files = []
    output_files = {}
    output_files['counts'] = []
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(
                len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    if (opts.verbose):
        print "Creating temporary input files in: ", tmp_dir

    num_tips_per_job = 1000
    for tips_to_predict in [
            all_tips[i:i + num_tips_per_job]
            for i in range(0, len(all_tips), num_tips_per_job)
    ]:

        #create tmp output files
        tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str = ','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base, extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base + "_variances.tab")
            output_files['upper_CI'].append(outfile_base + "_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base + "_lower_CI.tab")

            #create the job command
            cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, opts.reconstruction_confidence,
                tip_to_predict_str, tmp_output_fp)

        else:
            cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, tip_to_predict_str,
                tmp_output_fp)

        #NOTE: Calculating NSTI this way is convenient,
        #but it would probably be faster to run the NSTI calculation separately (using --output_accuracy_metrics_only) and add it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd = cmd + " -a"

        #add the job command to the jobs file
        jobs.write(cmd + "\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if (opts.verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp,
                jobs_fp,
                job_prefix,
                num_jobs=opts.num_jobs,
                delay=opts.delay)

    if (opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if (opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base, extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for " + predict_type

        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print "Writing combined file for " + predict_type

        if predict_type == 'counts':
            #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table, 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table, 'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base + "_" + predict_type + ".biom", 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base + "_" + predict_type + ".tab",
                     'w').write(combined_predictions)

    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)
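The job-creation loop above leans on a slicing idiom to split the full tip list into fixed-size batches and emits one predict_traits.py command per batch. A self-contained sketch of that batching step with a smaller, hypothetical tip list and batch size (the command string is only illustrative):

all_tips = ["tip_%d" % i for i in range(1, 11)]   # hypothetical tip names
num_tips_per_job = 4
batches = [all_tips[i:i + num_tips_per_job]
           for i in range(0, len(all_tips), num_tips_per_job)]
for tips_to_predict in batches:
    tip_str = ",".join(tips_to_predict)
    # in the script above this comma-separated list becomes the -g argument of one job
    print("predict_traits.py ... -g %s -o <tmp_output>" % tip_str)
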
Example #24
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir='jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    if(opts.parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(opts.parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(opts.parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel method: %s" % opts.parallel_method)

    if(opts.verbose):
        print "Loading tree..."
        
    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]
    
    if(opts.verbose):
        print "Total number of possible tips to predict: {0}".format(len(all_tips))

    created_tmp_files=[]
    output_files={}
    output_files['counts']=[]
    if opts.reconstruction_confidence:
        output_files['variances']=[]
        output_files['upper_CI']=[]
        output_files['lower_CI']=[]

    if opts.already_calculated:
        all_tips=get_tips_not_in_precalc(all_tips,opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(opts.verbose):
        print "Creating temporary input files in: ",tmp_dir
    
    num_tips_per_job=1000
    for tips_to_predict in [all_tips[i:i+num_tips_per_job] for i in range(0, len(all_tips), num_tips_per_job)]:
        
        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str=','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base,extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base+"_variances.tab")
            output_files['upper_CI'].append(outfile_base+"_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base+"_lower_CI.tab")
            
            #create the job command
            cmd= "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, opts.reconstruction_confidence, tip_to_predict_str, tmp_output_fp)

        else:
            cmd= "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, tip_to_predict_str, tmp_output_fp)
            

        #NOTE: Calculating NSTI this way is convenient, 
        #but it would probably be faster to run the NSTI calculation separately (using --output_accuracy_metrics_only) and add it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd=cmd+" -a"

        #add the job command to the jobs file
        jobs.write(cmd+"\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if(opts.verbose):
        print "Launching parallel jobs."
        
    #run the job command
    job_prefix='picrust'
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs,delay=opts.delay)

    if(opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if(opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base,extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for "+ predict_type

        combined_predictions=combine_predict_trait_output(output_files[predict_type])
        
        if opts.verbose:
            print "Writing combined file for "+predict_type

        if predict_type == 'counts':
        #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table,'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table,'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base+"_"+predict_type+".biom",'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base+"_"+predict_type+".tab",'w').write(combined_predictions)    
        
    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)
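wait_for_output_files above acts as a simple poller: block until every expected output file exists. The sketch below is not the PICRUSt implementation, just a generic version of that idea built on os.path.exists and a sleep interval:

import os
import time

def wait_for_files(paths, poll_seconds=30):
    # block until every path exists on disk (no timeout handling in this sketch)
    remaining = list(paths)
    while remaining:
        remaining = [p for p in remaining if not os.path.exists(p)]
        if remaining:
            time.sleep(poll_seconds)

# wait_for_files(output_files['counts'])   # usage mirroring the call above
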
Example #25
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #if we specify we want NSTI only then we have to calculate it first
    if opts.output_accuracy_metrics_only:
        opts.calculate_accuracy_metrics = True

    if opts.verbose:
        print "Loading tree from file:", opts.tree

    # Load Tree
    #tree = LoadTree(opts.tree)
    tree = load_picrust_tree(opts.tree, opts.verbose)

    table_headers = []
    traits = {}
    #load the asr trait table using the previous list of functions to order the arrays
    if opts.reconstructed_trait_table:
        table_headers,traits =\
                update_trait_dict_from_file(opts.reconstructed_trait_table)

        #Only load confidence intervals on the reconstruction
        #If we actually have ASR values in the analysis
        if opts.reconstruction_confidence:
            if opts.verbose:
                print "Loading ASR confidence data from file:",\
                opts.reconstruction_confidence
                print "Assuming confidence data is of type:", opts.confidence_format

            asr_confidence_output = open(opts.reconstruction_confidence)
            asr_min_vals,asr_max_vals, params,column_mapping =\
              parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format)
            if 'sigma' in params:
                brownian_motion_parameter = params['sigma'][0]
            else:
                brownian_motion_parameter = None

            if opts.verbose:
                print "Done. Loaded %i confidence interval values." % (
                    len(asr_max_vals))
                print "Brownian motion parameter:", brownian_motion_parameter
        else:
            brownian_motion_parameter = None

    #load the trait table into a dict with organism names as keys and arrays as functions
    table_headers,genome_traits =\
            update_trait_dict_from_file(opts.observed_trait_table,table_headers)

    #Combine the trait tables overwriting the asr ones if they exist in the genome trait table.
    traits.update(genome_traits)

    # Specify the attribute where we'll store the reconstructions
    trait_label = "Reconstruction"

    if opts.verbose:
        print "Assigning traits to tree..."

    # Decorate tree using the traits
    tree = assign_traits_to_tree(traits, tree, trait_label=trait_label)

    if opts.reconstruction_confidence:
        if opts.verbose:
            print "Assigning trait confidence intervals to tree..."
        tree = assign_traits_to_tree(asr_min_vals,tree,\
            trait_label="lower_bound")

        tree = assign_traits_to_tree(asr_max_vals,tree,\
            trait_label="upper_bound")

        if brownian_motion_parameter is None:

            if opts.verbose:
                print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..."
            brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\
                     upper_bound_trait_label="upper_bound",\
                     lower_bound_trait_label="lower_bound",\
                     trait_label=trait_label,\
                     confidence=0.95)
            if opts.verbose:
                print "Inferred the following rate parameters:", brownian_motion_parameter
    if opts.verbose:
        print "Collecting list of nodes to predict..."

    #Start by predicting all tip nodes.
    nodes_to_predict = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print "Found %i nodes to predict." % len(nodes_to_predict)

    if opts.limit_predictions_to_organisms:
        organism_id_str = opts.limit_predictions_to_organisms
        ok_organism_ids = organism_id_str.split(',')
        ok_organism_ids = [n.strip() for n in ok_organism_ids]
        for f in set_label_conversion_fns(True, True):
            ok_organism_ids = [f(i) for i in ok_organism_ids]

        if opts.verbose:
            print "Limiting predictions to user-specified ids:",\
              ",".join(ok_organism_ids)

        if not ok_organism_ids:
            raise RuntimeError(\
              "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\
              % opts.limit_predictions_to_organisms)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in ok_organism_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the command line and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0]))

        if opts.verbose:
            print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" % (
                len(nodes_to_predict))

    if opts.limit_predictions_by_otu_table:
        if opts.verbose:
            print "Limiting predictions to ids in user-specified OTU table:",\
              opts.limit_predictions_by_otu_table
        otu_table = open(opts.limit_predictions_by_otu_table, "U")
        #Parse OTU table for ids

        otu_ids =\
          extract_ids_from_table(otu_table.readlines(),delimiter="\t")

        if not otu_ids:
            raise RuntimeError(\
              "Found no valid ids in input OTU table: %s.  Is the path correct?"\
              % opts.limit_predictions_by_otu_table)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in otu_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by OTU table resulted in an empty set of nodes to predict.   Are the OTU ids and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0]))

        if opts.verbose:
            print "After filtering by OTU table, %i nodes remain to be predicted" % (
                len(nodes_to_predict))

    # Calculate accuracy of PICRUST for the given tree, sequenced genomes
    # and set of nodes to predict
    accuracy_metrics = ['NSTI']
    accuracy_metric_results = None
    if opts.calculate_accuracy_metrics:
        if opts.verbose:
            print "Calculating accuracy metrics: %s" % (
                ",".join(accuracy_metrics))
        accuracy_metric_results = {}
        if 'NSTI' in accuracy_metrics:

            nsti_result,min_distances =\
                calc_nearest_sequenced_taxon_index(tree,\
                limit_to_tips = nodes_to_predict,\
                trait_label = trait_label, verbose=opts.verbose)

            #accuracy_metric_results['NSTI'] = nsti_result
            for organism in min_distances.keys():
                accuracy_metric_results[organism] = {
                    'NSTI': min_distances[organism]
                }

            if opts.verbose:
                print "NSTI:", nsti_result

        if opts.output_accuracy_metrics_only:
            #Write accuracy metrics to file
            if opts.verbose:
                print "Writing accuracy metrics to file:", opts.output_accuracy_metrics_only

            f = open(opts.output_accuracy_metrics_only, 'w+')
            f.write("metric\torganism\tvalue\n")
            lines = []
            for organism in accuracy_metric_results.keys():
                for metric in accuracy_metric_results[organism].keys():
                    lines.append('\t'.join([metric,organism,\
                      str(accuracy_metric_results[organism][metric])])+'\n')
            f.writelines(sorted(lines))
            f.close()
            exit()

    if opts.verbose:
        print "Generating predictions using method:", opts.prediction_method

    if opts.weighting_method == 'exponential':
        #For now, use exponential weighting
        weight_fn = make_neg_exponential_weight_fn(e)

    variances = None  #Overwritten by methods that calc variance
    confidence_intervals = None  #Overwritten by methods that calc variance

    if opts.prediction_method == 'asr_and_weighting':
        # Perform predictions using reconstructed ancestral states

        if opts.reconstruction_confidence:
            predictions,variances,confidence_intervals =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              lower_bound_trait_label="lower_bound",\
              upper_bound_trait_label="upper_bound",\
              calc_confidence_intervals = True,\
              brownian_motion_parameter=brownian_motion_parameter,\
              weight_fn =weight_fn,verbose=opts.verbose)

        else:
            predictions =\
             predict_traits_from_ancestors(tree,nodes_to_predict,\
             trait_label=trait_label,\
             weight_fn =weight_fn,verbose=opts.verbose)

    elif opts.prediction_method == 'weighting_only':
        #Ignore ancestral information
        predictions =\
          weighted_average_tip_prediction(tree,nodes_to_predict,\
          trait_label=trait_label,\
          weight_fn =weight_fn,verbose=opts.verbose)

    elif opts.prediction_method == 'nearest_neighbor':

        predictions = predict_nearest_neighbor(tree,nodes_to_predict,\
          trait_label=trait_label,tips_only = True)

    elif opts.prediction_method == 'random_neighbor':

        predictions = predict_random_neighbor(tree,\
          nodes_to_predict,trait_label=trait_label)

    if opts.verbose:
        print "Done making predictions."

    make_output_dir_for_file(opts.output_trait_table)

    #Generate the table of biom predictions
    if opts.verbose:
        print "Converting results to .biom format for output..."

    biom_predictions=biom_table_from_predictions(predictions,table_headers,\
                                                         observation_metadata=None,\
                                                         sample_metadata=accuracy_metric_results,convert_to_int=False)
    if opts.verbose:
        print "Writing prediction results to file: ", opts.output_trait_table

    if opts.output_precalc_file_in_biom:

        #write biom table to file
        write_biom_table(biom_predictions, opts.output_trait_table)

    else:
        #convert to precalc (tab-delimited) format

        out_fh = open(opts.output_trait_table, 'w')
        out_fh.write(convert_biom_to_precalc(biom_predictions))
        out_fh.close()

    #Determine the output suffix up front; it is used by both the variance
    #and the confidence interval blocks below
    if opts.output_precalc_file_in_biom:
        suffix = '.biom'
    else:
        suffix = '.tab'

    #Write out variance information to file
    if variances:

        if opts.verbose:
            print "Converting variances to BIOM format"

        biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\
        observation_metadata=None,\
        sample_metadata=None,convert_to_int=False)
        outfile_base, extension = splitext(opts.output_trait_table)
        variance_outfile = outfile_base + "_variances" + suffix
        make_output_dir_for_file(variance_outfile)

        if opts.verbose:
            print "Writing variance information to file:", variance_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_variances, variance_outfile)
        else:
            open(variance_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_variances))

    if confidence_intervals:

        if opts.verbose:
            print "Converting upper confidence interval values to BIOM format"

        biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base, extension = splitext(opts.output_trait_table)
        upper_CI_outfile = outfile_base + "_upper_CI" + suffix
        make_output_dir_for_file(upper_CI_outfile)

        if opts.verbose:
            print "Writing upper confidence limit information to file:", upper_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_upper_CI, upper_CI_outfile)
        else:
            open(upper_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_upper_CI))

        biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base, extension = splitext(opts.output_trait_table)
        lower_CI_outfile = outfile_base + "_lower_CI" + suffix
        make_output_dir_for_file(lower_CI_outfile)

        if opts.verbose:
            print "Writing lower confidence limit information to file", lower_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_lower_CI, lower_CI_outfile)
        else:
            open(lower_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_lower_CI))
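make_neg_exponential_weight_fn(e) suggests weights that decay exponentially with phylogenetic distance, but the exact functional form is internal to PICRUSt. The sketch below only illustrates the general idea of a distance-weighted average over nearby tips, assuming weight = base**(-distance) with base e; it is not the predict_traits_from_ancestors algorithm:

from math import exp

def neg_exponential_weight(distance, base=exp(1)):
    # assumed form: weight decays as base**(-distance); not necessarily PICRUSt's exact formula
    return base ** (-distance)

# hypothetical nearby tips: (trait value, branch-length distance to the node being predicted)
neighbors = [(4.0, 0.05), (2.0, 0.20), (0.0, 0.90)]
weights = [neg_exponential_weight(d) for _, d in neighbors]
prediction = sum(w * v for w, (v, _) in zip(weights, neighbors)) / sum(weights)
print(prediction)   # close tips dominate the weighted average
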
Example #26
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
  
    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ids_to_load = otu_table.ObservationIds

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table
    
    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')
    
    #In the genome/trait table genomes are the samples and 
    #genes are the observations

    
    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs
           
            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = parse_biom_table(genome_table_fh.read())
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions)
    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp
        
    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
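Several of these scripts repeat the same pattern for opening a precalculated table that may or may not be gzipped: branch on the file extension. A small helper sketching that pattern (the helper name is ours; the 'rb' and 'U' modes match the calls above):

import gzip
from os import path

def open_maybe_gzipped(fp):
    # mirrors the repeated extension check above: gzip.open for .gz files, plain open otherwise
    if path.splitext(fp)[1] == '.gz':
        return gzip.open(fp, 'rb')
    return open(fp, 'U')

# genome_table_fh = open_maybe_gzipped(input_count_table)
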
Example #27
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    verbose=opts.verbose

    min_args = 1
    if len(args) < min_args:
        option_parser.error('One or more predicted biom files must be provided.')
    observed_files=args
   

    make_output_dir_for_file(opts.output_fp)
    out_fh=open(opts.output_fp,'w')

    if verbose:
        print "Loading expected trait table file:",opts.exp_trait_table_fp

    exp_table =parse_biom_table(open(opts.exp_trait_table_fp,'U'))

    header_printed=False
    header_keys=[]
    delimiter="\t"


    for observed_file in observed_files:
        observed_file_name=basename(observed_file)

        if verbose:
            print "Loading predicted trait table file:",observed_file_name

        obs_table =parse_biom_table(open(observed_file,'U'))

        if opts.compare_observations:
            if verbose:
                print "Transposing tables to allow evaluation of observations (instead of samples)..."
            obs_table=transpose_biom(obs_table)
            exp_table=transpose_biom(exp_table)

        if verbose:
            print "Matching predicted and expected tables..."

        obs,exp=match_biom_tables(obs_table,exp_table,verbose=verbose,limit_to_expected_observations=opts.limit_to_expected_observations,limit_to_observed_observations=opts.limit_to_observed_observations,normalize=opts.normalize,shuffle_samples=opts.shuffle_samples)
           
        if verbose:
            print "Calculating accuracy stats for all observations..."

        #import pdb; pdb.set_trace()
        for i in obs:
            if verbose:
                print "Calculating stats for: ",i
            if opts.not_relative_abundance_scores:
                results=calculate_accuracy_stats_from_observations(obs[i],exp[i],success_criterion='binary')
            else:
                results=calculate_accuracy_stats_from_observations(obs[i],exp[i],success_criterion='ra_exact')

            #If first pass then print out header
            if not header_printed:
                header_printed=True
                header_keys=sorted(results.keys())
                out_fh.write(delimiter.join(['file','label']+header_keys)+"\n")

            #print results using same order as header
            values=[observed_file_name,i]+['{0:.3g}'.format(results[x]) for x in header_keys]
            out_str=delimiter.join(map(str,values))+"\n"
            out_fh.write(out_str)
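transpose_biom above swaps samples and observations so the accuracy stats are computed per observation rather than per sample; the effect is an ordinary matrix transpose. A toy sketch with a plain list-of-lists table:

# rows = observations, columns = samples (hypothetical 2 x 3 table)
table = [[1, 0, 3],
         [2, 5, 0]]
transposed = [list(col) for col in zip(*table)]
# transposed == [[1, 2], [0, 5], [3, 0]]  -> rows are now the former samples
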
Example #28
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)


    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table

    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')

    #In the genome/trait table genomes are the samples and
    #genes are the observations


    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs

            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(genome_table_fh)
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    ok_functional_categories = None

    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        if opts.verbose:
            print "Limiting to functional categories: %s" %(str(ok_functional_categories))

        # Either KEGG_Pathways or COG_Category needs
        # to be assigned to metadata_key to limit to
        # functional categories (not needed for 
        # individual functions) 

        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")
              
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions,\
      limit_to_functional_categories = ok_functional_categories ,  metadata_key = metadata_type )

    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp

    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
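The branch near the end picks the metadata key used for functional-category filtering from the prediction type (ko -> KEGG_Pathways, cog -> COG_Category, with rfam rejected). The same decision can be written as a lookup table; the dict and helper below only restate the mapping already encoded in the if/elif chain, and the helper name is ours:

CATEGORY_METADATA_KEYS = {"ko": "KEGG_Pathways",
                          "cog": "COG_Category"}

def metadata_key_for(type_of_prediction):
    if type_of_prediction == "rfam":
        # rfam predictions can only be limited to individual functions (-l), not categories (-f)
        raise SystemExit("rfam predictions cannot be limited to functional categories")
    return CATEGORY_METADATA_KEYS.get(type_of_prediction)

# metadata_type = metadata_key_for(opts.type_of_prediction)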