예제 #1
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_table_fp = opts.output_otu_table_fp
    metadata_field = opts.metadata_field
    positive_taxa = opts.positive_taxa
    negative_taxa = opts.negative_taxa

    input_table = load_table(opts.input_otu_table_fp)

    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')
    else:
        positive_taxa = None

    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')
    else:
        negative_taxa = None

    filter_fn = get_otu_ids_from_taxonomy_f(positive_taxa, negative_taxa,
                                            metadata_field)
    input_table.filter(filter_fn, axis='observation')

    write_biom_table(input_table, output_table_fp)
예제 #2
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)

        sample_metadata = mapping_file_to_dict(mapping_data, mapping_header)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
예제 #3
0
 def _call_cleanup(self,
                   input_fp,
                   output_dir,
                   params,
                   job_prefix,
                   poll_directly,
                   suppress_submit_jobs):
     """ Called as the last step in __call__.
     """
     if poll_directly:
         if params['observation_metadata_fp'] is not None:
             observation_metadata = \
                 parse_observation_metadata(
                     open(params['observation_metadata_fp'], 'U'))
         else:
             observation_metadata = None
         biom_fp = join(output_dir, 'observation_table.biom')
         biom_table = make_otu_table(
             open(join(output_dir, 'observation_map.txt'), 'U'),
             observation_metadata)
         write_biom_table(biom_table, biom_fp)
     else:
         # can't construct the final biom file if not polling
         # directly as the final observation map won't have been created yet
         pass
예제 #4
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_table_fp = opts.output_otu_table_fp
    metadata_field = opts.metadata_field
    positive_taxa = opts.positive_taxa
    negative_taxa = opts.negative_taxa

    input_table = load_table(opts.input_otu_table_fp)

    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')
    else:
        positive_taxa = None

    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')
    else:
        negative_taxa = None

    filter_fn = get_otu_ids_from_taxonomy_f(positive_taxa, negative_taxa,
                                            metadata_field)
    input_table.filter(filter_fn, axis='observation')

    try:
        write_biom_table(input_table, output_table_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no OTUs remained after filtering.")
예제 #5
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        mapping_data, mapping_header, mapping_comments = parse_mapping_file(open(opts.mapping_fp, 'U'))
        sample_metadata = assemble_sample_metadata(mapping_data, mapping_header, mapping_comments)
                
    biom_otu_table = make_otu_table(open(opts.otu_map_fp, 'U'),
                                    otu_to_taxonomy=otu_to_taxonomy,
                                    otu_ids_to_exclude=ids_to_exclude,
                                    sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
예제 #6
0
def split_otu_table_on_taxonomy_to_files(otu_table_fp, level, output_dir,
                                         md_identifier='taxonomy',
                                         md_processor=process_md_as_list):
    """ Split OTU table by taxonomic level, writing otu tables to output dir
    """
    results = []
    otu_table = load_table(otu_table_fp)
    create_dir(output_dir)

    def split_f(id_, obs_md):
        try:
            result = md_processor(obs_md, md_identifier, level)
        except KeyError:
            raise KeyError("Metadata identifier (%s) is not associated with "
                           "all (or any) observerations. You can modify the "
                           "key with the md_identifier parameter." %
                           md_identifier)
        except TypeError:
            raise TypeError("Can't correctly process the metadata string. If "
                            "your input file was generated from QIIME 1.4.0 or"
                            " earlier you may need to pass --md_as_string.")
        except AttributeError:
            raise AttributeError("Metadata category not found. If your input "
                                 "file was generated from QIIME 1.4.0 or "
                                 "earlier you may need to pass --md_identifier"
                                 " \"Consensus Lineage\".")

        return result

    for bin, sub_otu_table in otu_table.partition(split_f, axis='observation'):
        output_fp = '%s/otu_table_%s.biom' % (output_dir, bin)
        write_biom_table(sub_otu_table, output_fp)

        results.append(output_fp)
    return results
예제 #7
0
 def _call_cleanup(self,
                   input_fp,
                   output_dir,
                   params,
                   job_prefix,
                   poll_directly,
                   suppress_submit_jobs):
     """ Called as the last step in __call__.
     """
     if poll_directly:
         if params['observation_metadata_fp'] is not None:
             observation_metadata = \
                 parse_observation_metadata(
                     open(params['observation_metadata_fp'], 'U'))
         else:
             observation_metadata = None
         biom_fp = join(output_dir, 'observation_table.biom')
         biom_table = make_otu_table(
             open(join(output_dir, 'observation_map.txt'), 'U'),
             observation_metadata)
         write_biom_table(biom_table, biom_fp)
     else:
         # can't construct the final biom file if not polling
         # directly as the final observation map won't have been created yet
         pass
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_table_fp = opts.output_otu_table_fp
    metadata_field = opts.metadata_field
    positive_taxa = opts.positive_taxa
    negative_taxa = opts.negative_taxa

    input_table = load_table(opts.input_otu_table_fp)

    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')
    else:
        positive_taxa = None

    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')
    else:
        negative_taxa = None

    filter_fn = get_otu_ids_from_taxonomy_f(positive_taxa, negative_taxa,
                                            metadata_field)
    input_table.filter(filter_fn, axis='observation')

    try:
        write_biom_table(input_table, output_table_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no OTUs remained after filtering.")
예제 #9
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_fp = opts.otu_table_fp
    mapping_fp = opts.mapping_fp
    mapping_field = opts.mapping_field
    output_dir = opts.output_dir
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    create_dir(output_dir)

    # split mapping file
    mapping_f = open(mapping_fp, 'U')
    for fp_str, sub_mapping_s in split_mapping_file_on_field(mapping_f, mapping_field):
        mapping_output_fp = join(output_dir, 'mapping_%s.txt' % fp_str)
        open(mapping_output_fp, 'w').write(sub_mapping_s)

    # split otu table
    otu_table_base_name = splitext(split(otu_table_fp)[1])[0]
    mapping_f = open(mapping_fp, 'U')

    otu_table = load_table(otu_table_fp)

    try:
        for fp_str, sub_otu_table_s in split_otu_table_on_sample_metadata(
                otu_table,
                mapping_f,
                mapping_field):
            otu_table_output_fp = join(output_dir, '%s_%s.biom' % (
                otu_table_base_name, fp_str))

            write_biom_table(sub_otu_table_s, otu_table_output_fp)
    except OTUTableSplitError as e:
        option_parser.error(e)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_table_fp = opts.output_otu_table_fp
    metadata_field = opts.metadata_field
    positive_taxa = opts.positive_taxa
    negative_taxa = opts.negative_taxa

    input_table = load_table(opts.input_otu_table_fp)

    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')
    else:
        positive_taxa = None

    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')
    else:
        negative_taxa = None

    filter_fn = get_otu_ids_from_taxonomy_f(positive_taxa, negative_taxa,
                                            metadata_field)
    input_table.filter(filter_fn, axis='observation')

    write_biom_table(input_table, output_table_fp)
예제 #11
0
    def setUp(self):
        self.tmp_dir = get_qiime_temp_dir()

        self.l19_data = np.array([[7, 1, 0, 0, 0, 0, 0, 0, 0],
                                  [4, 2, 0, 0, 0, 1, 0, 0, 0],
                                  [2, 4, 0, 0, 0, 1, 0, 0, 0],
                                  [1, 7, 0, 0, 0, 0, 0, 0, 0],
                                  [0, 8, 0, 0, 0, 0, 0, 0, 0],
                                  [0, 7, 1, 0, 0, 0, 0, 0, 0],
                                  [0, 4, 2, 0, 0, 0, 2, 0, 0],
                                  [0, 2, 4, 0, 0, 0, 1, 0, 0],
                                  [0, 1, 7, 0, 0, 0, 0, 0, 0],
                                  [0, 0, 8, 0, 0, 0, 0, 0, 0],
                                  [0, 0, 7, 1, 0, 0, 0, 0, 0],
                                  [0, 0, 4, 2, 0, 0, 0, 3, 0],
                                  [0, 0, 2, 4, 0, 0, 0, 1, 0],
                                  [0, 0, 1, 7, 0, 0, 0, 0, 0],
                                  [0, 0, 0, 8, 0, 0, 0, 0, 0],
                                  [0, 0, 0, 7, 1, 0, 0, 0, 0],
                                  [0, 0, 0, 4, 2, 0, 0, 0, 4],
                                  [0, 0, 0, 2, 4, 0, 0, 0, 1],
                                  [0, 0, 0, 1, 7, 0, 0, 0, 0]])
        self.l19_sample_names = [
            'sam1', 'sam2', 'sam3', 'sam4', 'sam5', 'sam6', 'sam7', 'sam8',
            'sam9', 'sam_middle', 'sam11', 'sam12', 'sam13', 'sam14', 'sam15',
            'sam16', 'sam17', 'sam18', 'sam19'
        ]
        self.l19_taxon_names = [
            'tax1', 'tax2', 'tax3', 'tax4', 'endbigtaxon', 'tax6', 'tax7',
            'tax8', 'tax9'
        ]
        self.l19_taxon_names_w_underscore = [
            'ta_x1', 'tax2', 'tax3', 'tax4', 'endbigtaxon', 'tax6', 'tax7',
            'tax8', 'tax9'
        ]

        l19 = Table(self.l19_data.T, self.l19_taxon_names,
                    self.l19_sample_names)
        fd, self.l19_fp = mkstemp(dir=self.tmp_dir,
                                  prefix='test_bdiv_otu_table',
                                  suffix='.blom')
        os.close(fd)
        write_biom_table(l19, self.l19_fp)

        l19_w_underscore = Table(self.l19_data.T,
                                 self.l19_taxon_names_w_underscore,
                                 self.l19_sample_names)
        fd, self.l19_w_underscore_fp = mkstemp(dir=self.tmp_dir,
                                               prefix='test_bdiv_otu_table',
                                               suffix='.blom')
        os.close(fd)
        write_biom_table(l19_w_underscore, self.l19_w_underscore_fp)

        self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4,\
 ((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'

        self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

        self.files_to_remove = [self.l19_fp, self.l19_w_underscore_fp]
        self.folders_to_remove = []
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table =  load_table(opts.input_fp)

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        sample_id_f_ids = set([l.strip().split()[0]
                              for l in open(sample_id_fp, 'U') if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.ids())
        open(
            output_mapping_fp,
            'w').write(
            format_mapping_file(
                mapping_headers,
                mapping_data))
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states " "is passed.")

    if not ((mapping_fp and valid_states) or min_count != 0 or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination "
            "of those)."
        )
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and " "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate" " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(open(mapping_fp, "U"), valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            o = open(sample_id_fp, "U")
            sample_id_f_ids = set([l.strip().split()[0] for l in o if not l.startswith("#")])
            o.close()
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count, negate_ids_to_keep=negate_sample_id_fp
    )

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. " "This indicates that no samples remained after filtering."
        )

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_fp, "U"))
        mapping_headers, mapping_data = filter_mapping_file(mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, "w").write(format_mapping_file(mapping_headers, mapping_data))
예제 #14
0
    def _generate_biom_output(self, observation_map_fp, output_biom_fp, observation_metadata_fp):
        if observation_metadata_fp is not None:
            observation_metadata = parse_taxonomy(open(observation_metadata_fp, "U"))
        else:
            observation_metadata = None

        biom_table = make_otu_table(open(observation_map_fp, "U"), observation_metadata)
        write_biom_table(biom_table, output_biom_fp)
예제 #15
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that biom file and mapping file have matching sample names. discard
    # those samples that do not appear in both. 
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample') if s in
                       shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)
    # check that headers in mapping data
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found ' +\
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields, mheaders,
                                                              mdata)

    for sg, vg in zip(sample_groups, value_groups):
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
예제 #16
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that biom file and mapping file have matching sample names. discard
    # those samples that do not appear in both. 
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample') if s in
                       shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)
    # check that headers in mapping data
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found ' +\
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields, mheaders,
                                                              mdata)

    for sg, vg in zip(sample_groups, value_groups):
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
예제 #17
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples

    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    if not (min_count != 0 or
            min_count_fraction != 0 or
            not isinf(max_count) or
            otu_ids_to_exclude_fp is not None or
            min_samples != 0 or not isinf(max_samples)):
        option_parser.error("No filtering requested. Must provide either "
                            "min counts, max counts, min samples, max samples, min_count_fraction, "
                            "or exclude_fp (or some combination of those).")

    otu_table = load_table(opts.input_fp)

    if min_count_fraction > 0:
        min_count = otu_table.sum() * min_count_fraction
        print otu_table.sum(), min_count

    otu_ids_to_keep = set(otu_table.observation_ids)

    if otu_ids_to_exclude_fp:
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([id_.strip().split()[0]
                                      for id_, seq in parse_fasta(open(otu_ids_to_exclude_fp, 'U'))])
        else:
            otu_ids_to_exclude = set([l.strip().split('\t')[0]
                                      for l in open(otu_ids_to_exclude_fp, 'U')])

        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table,
                                                    otu_ids_to_keep,
                                                    min_count,
                                                    max_count,
                                                    min_samples,
                                                    max_samples,
                                                    negate_ids_to_exclude)
    write_biom_table(filtered_otu_table, opts.output_fp)
예제 #18
0
    def _write_rarefaction(self, depth, rep, sub_otu_table):
        """ depth and rep can be numbers or strings
        """
        if sub_otu_table.is_empty():
            return

        fname = 'rarefaction_' + str(depth) + '_' + str(rep) + '.biom'
        fname = os.path.join(self.output_dir, fname)
        write_biom_table(sub_otu_table, fname)
예제 #19
0
    def setUp(self):
        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.l19_data = np.array([
            [7, 1, 0, 0, 0, 0, 0, 0, 0],
            [4, 2, 0, 0, 0, 1, 0, 0, 0],
            [2, 4, 0, 0, 0, 1, 0, 0, 0],
            [1, 7, 0, 0, 0, 0, 0, 0, 0],
            [0, 8, 0, 0, 0, 0, 0, 0, 0],
            [0, 7, 1, 0, 0, 0, 0, 0, 0],
            [0, 4, 2, 0, 0, 0, 2, 0, 0],
            [0, 2, 4, 0, 0, 0, 1, 0, 0],
            [0, 1, 7, 0, 0, 0, 0, 0, 0],
            [0, 0, 8, 0, 0, 0, 0, 0, 0],
            [0, 0, 7, 1, 0, 0, 0, 0, 0],
            [0, 0, 4, 2, 0, 0, 0, 3, 0],
            [0, 0, 2, 4, 0, 0, 0, 1, 0],
            [0, 0, 1, 7, 0, 0, 0, 0, 0],
            [0, 0, 0, 8, 0, 0, 0, 0, 0],
            [0, 0, 0, 7, 1, 0, 0, 0, 0],
            [0, 0, 0, 4, 2, 0, 0, 0, 4],
            [0, 0, 0, 2, 4, 0, 0, 0, 1],
            [0, 0, 0, 1, 7, 0, 0, 0, 0]
        ])
        self.l19_sample_names = [
            'sam1', 'sam2', 'sam3', 'sam4', 'sam5', 'sam6',
            'sam7', 'sam8', 'sam9', 'sam_middle', 'sam11', 'sam12', 'sam13',
            'sam14', 'sam15', 'sam16', 'sam17', 'sam18', 'sam19']
        self.l19_taxon_names = ['tax1', 'tax2', 'tax3', 'tax4', 'endbigtaxon',
                                'tax6', 'tax7', 'tax8', 'tax9']
        self.l19_taxon_names_w_underscore = ['ta_x1', 'tax2', 'tax3', 'tax4',
                                             'endbigtaxon', 'tax6', 'tax7',
                                             'tax8', 'tax9']

        l19 = Table(self.l19_data.T, self.l19_taxon_names,
                    self.l19_sample_names)
        fd, self.l19_fp = mkstemp(dir=self.tmp_dir,
                                prefix='test_bdiv_otu_table', suffix='.blom')
        os.close(fd)
        write_biom_table(l19, self.l19_fp)

        l19_w_underscore = Table(self.l19_data.T,
                                 self.l19_taxon_names_w_underscore,
                                 self.l19_sample_names)
        fd, self.l19_w_underscore_fp = mkstemp(dir=self.tmp_dir,
                                               prefix='test_bdiv_otu_table',
                                               suffix='.blom')
        os.close(fd)
        write_biom_table(l19_w_underscore, self.l19_w_underscore_fp)

        self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4,\
 ((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'
        self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

        self.files_to_remove = [self.l19_fp, self.l19_w_underscore_fp]
        self.folders_to_remove = []
예제 #20
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples

    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    if not (min_count != 0 or min_count_fraction != 0 or not isinf(max_count)
            or otu_ids_to_exclude_fp is not None or min_samples != 0
            or not isinf(max_samples)):
        option_parser.error(
            "No filtering requested. Must provide either "
            "min counts, max counts, min samples, max samples, min_count_fraction, "
            "or exclude_fp (or some combination of those).")

    otu_table = load_table(opts.input_fp)

    if min_count_fraction > 0:
        min_count = otu_table.sum() * min_count_fraction
        print otu_table.sum(), min_count

    otu_ids_to_keep = set(otu_table.ids(axis='observation'))

    if otu_ids_to_exclude_fp:
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([
                id_.strip().split()[0]
                for id_, seq in parse_fasta(open(otu_ids_to_exclude_fp, 'U'))
            ])
        else:
            otu_ids_to_exclude = set([
                l.strip().split('\t')[0]
                for l in open(otu_ids_to_exclude_fp, 'U')
            ])

        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table, otu_ids_to_keep,
                                                    min_count, max_count,
                                                    min_samples, max_samples,
                                                    negate_ids_to_exclude)
    write_biom_table(filtered_otu_table, opts.output_fp)
예제 #21
0
    def _write_rarefaction(self, depth, rep, sub_otu_table):
        """ depth and rep can be numbers or strings
        """
        if sub_otu_table.is_empty():
            return

        fname = 'rarefaction_' + str(depth) + '_' + str(rep) + '.biom'
        fname = os.path.join(self.output_dir, fname)
        write_biom_table(sub_otu_table, fname)
예제 #22
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fps = opts.input_fps

    master = load_table(input_fps[0])

    for input_fp in input_fps[1:]:
        master = master.merge(load_table(input_fp))

    write_biom_table(master, opts.output_fp)
예제 #23
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fps = opts.input_fps

    master = load_table(input_fps[0])

    for input_fp in input_fps[1:]:
        master = master.merge(load_table(input_fp))

    write_biom_table(master, opts.output_fp)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        o = open(sample_id_fp, 'U')
        sample_id_f_ids = set(
            [l.strip().split()[0] for l in o if not l.startswith('#')])
        o.close()
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count, max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.ids())
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
예제 #25
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if not isfile(opts.input_path):
        raise IOError("Input path (%s) not valid.  Does it exist?" %
                      opts.input_path)

    samples, otus, data = parse_trflp(open(opts.input_path, 'U'))

    t = Table(data, otus, samples)
    write_biom_table(t, opts.output_path)
예제 #26
0
    def _generate_biom_output(self, observation_map_fp, output_biom_fp,
                              observation_metadata_fp):
        if observation_metadata_fp is not None:
            observation_metadata = \
                parse_taxonomy(open(observation_metadata_fp, 'U'))
        else:
            observation_metadata = None

        biom_table = make_otu_table(open(observation_map_fp, 'U'),
                                    observation_metadata)
        write_biom_table(biom_table, output_biom_fp)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if not isfile(opts.input_path):
        raise IOError(
            "Input path (%s) not valid.  Does it exist?" %
            opts.input_path)

    samples, otus, data = parse_trflp(open(opts.input_path, 'U'))

    t = Table(data, otus, samples)
    write_biom_table(t, opts.output_path)
예제 #28
0
    def setUp(self):

        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.otu_table_data = np.array([[2, 1, 0], [0, 5, 0], [0, 3, 0],
                                        [1, 2, 0]])
        self.sample_names = list('YXZ')
        self.taxon_names = list('bacd')
        self.otu_metadata = [{
            'domain': 'Archaea'
        }, {
            'domain': 'Bacteria'
        }, {
            'domain': 'Bacteria'
        }, {
            'domain': 'Bacteria'
        }]

        self.otu_table = Table(self.otu_table_data,
                               self.taxon_names,
                               self.sample_names,
                               observation_metadata=[{}, {}, {}, {}],
                               sample_metadata=[{}, {}, {}])

        self.otu_table_meta = Table(self.otu_table_data,
                                    self.taxon_names,
                                    self.sample_names,
                                    observation_metadata=self.otu_metadata)

        fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                        prefix='test_rarefaction',
                                        suffix='.biom')
        close(fd)
        fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                             prefix='test_rarefaction',
                                             suffix='.biom')
        close(fd)

        self.rare_dir = mkdtemp(dir=self.tmp_dir,
                                prefix='test_rarefaction_dir',
                                suffix='')

        write_biom_table(self.otu_table, self.otu_table_fp)
        write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

        self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
        self._dirs_to_clean_up = [self.rare_dir]
예제 #29
0
파일: simsam.py 프로젝트: Honglongwu/qiime
def simsam_range_to_files(
    table,
    tree,
    simulated_sample_sizes,
    dissimilarities,
    output_dir,
    mapping_f=None,
    output_table_basename="table",
    output_map_basename="map",
):
    """Applies sim_otu_table over a range of parameters, writing output to file

     table: the input table to simulate samples from
     tree: tree related OTUs in input table
     simulated_sample_sizes: a list of ints defining how many
      output samples should be create per input sample
     dissimilarities: a list of floats containing the
      dissimilarities to use in simulating tables
     output_dir: the directory where all output tables and
      mapping files should be written
     mapping_f: file handle for metadata mapping file, if
      a mapping file should be created with the samples from
      each simulated table
     output_table_basename: basename for output table files
      (default: table)
     output_map_basename: basename for output mapping files
      (default: map)
    """
    create_dir(output_dir)
    for e in simsam_range(table, tree, simulated_sample_sizes, dissimilarities, mapping_f):
        output_table = e[0]
        output_mapping_lines = e[1]
        simulated_sample_size = e[2]
        dissimilarity = e[3]

        output_table_fp = join(
            output_dir, "%s_n%d_d%r.biom" % (output_table_basename, simulated_sample_size, dissimilarity)
        )
        write_biom_table(output_table, output_table_fp)

        if output_mapping_lines is not None:
            output_map_fp = join(
                output_dir, "%s_n%d_d%r.txt" % (output_map_basename, simulated_sample_size, dissimilarity)
            )
            output_map_f = open(output_map_fp, "w")
            output_map_f.write("".join(output_mapping_lines))
            output_map_f.close()
예제 #30
0
def simsam_range_to_files(table,
                          tree,
                          simulated_sample_sizes,
                          dissimilarities,
                          output_dir,
                          mapping_f=None,
                          output_table_basename="table",
                          output_map_basename="map"):
    """Applies sim_otu_table over a range of parameters, writing output to file

     table: the input table to simulate samples from
     tree: tree related OTUs in input table
     simulated_sample_sizes: a list of ints defining how many
      output samples should be create per input sample
     dissimilarities: a list of floats containing the
      dissimilarities to use in simulating tables
     output_dir: the directory where all output tables and
      mapping files should be written
     mapping_f: file handle for metadata mapping file, if
      a mapping file should be created with the samples from
      each simulated table
     output_table_basename: basename for output table files
      (default: table)
     output_map_basename: basename for output mapping files
      (default: map)
    """
    create_dir(output_dir)
    for e in simsam_range(table, tree, simulated_sample_sizes, dissimilarities,
                          mapping_f):
        output_table = e[0]
        output_mapping_lines = e[1]
        simulated_sample_size = e[2]
        dissimilarity = e[3]

        output_table_fp = join(
            output_dir, '%s_n%d_d%r.biom' %
            (output_table_basename, simulated_sample_size, dissimilarity))
        write_biom_table(output_table, output_table_fp)

        if output_mapping_lines is not None:
            output_map_fp = join(
                output_dir, '%s_n%d_d%r.txt' %
                (output_map_basename, simulated_sample_size, dissimilarity))
            output_map_f = open(output_map_fp, 'w')
            output_map_f.write(''.join(output_mapping_lines))
            output_map_f.close()
예제 #31
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    if not opts.counts_fname:
        seq_counts = None
    else:
        seq_counts = {}
        with open(opts.counts_fname, 'U') as infile:
            for line in infile:
                (key, val) = line.split()
                seq_counts[key] = val

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)

        sample_metadata = mapping_file_to_dict(mapping_data,
                                               mapping_header)
    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata,seq_counts=seq_counts)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
예제 #32
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample shouldbe placed into
    bin_function = lambda id_, sample_metadata:\
        sample_metadata[mapping_category]
    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        new_mapping = []
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        # Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    with biom_open(otu_table_fp, 'U') as biom_file:
        table = parse_biom_table(biom_file)
    table.add_metadata(sample_metadata)
    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapse(bin_function,
                            norm=False,
                            min_group_size=1,
                            axis='sample')

    # normalize the result if requested by the user
    if normalize:
        result.norm(axis='sample', inplace=True)

    # write a new BIOM file
    write_biom_table(result, output_fp)
예제 #33
0
    def setUp(self):

        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.otu_table_data = np.array([[2, 1, 0],
                                        [0, 5, 0],
                                        [0, 3, 0],
                                        [1, 2, 0]])
        self.sample_names = list('YXZ')
        self.taxon_names = list('bacd')
        self.otu_metadata = [{'domain': 'Archaea'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'}]

        self.otu_table = Table(self.otu_table_data,
                               self.taxon_names,
                               self.sample_names,
                               observation_metadata=[{}, {}, {}, {}],
                               sample_metadata=[{}, {}, {}])

        self.otu_table_meta = Table(self.otu_table_data,
                                    self.taxon_names, self.sample_names,
                                    observation_metadata=self.otu_metadata)

        fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                        prefix='test_rarefaction',
                                        suffix='.biom')
        close(fd)
        fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                             prefix='test_rarefaction',
                                             suffix='.biom')
        close(fd)

        self.rare_dir = mkdtemp(dir=self.tmp_dir,
                                prefix='test_rarefaction_dir', suffix='')

        write_biom_table(self.otu_table, self.otu_table_fp)
        write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

        self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
        self._dirs_to_clean_up = [self.rare_dir]
예제 #34
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample shouldbe placed into
    bin_function = lambda id_, sample_metadata:\
        sample_metadata[mapping_category]
    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        new_mapping = []
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        # Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    with biom_open(otu_table_fp, 'U') as biom_file:
        table = parse_biom_table(biom_file)
    table.add_metadata(sample_metadata)
    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapse(bin_function, norm=False, min_group_size=1,
                            axis='sample')

    # normalize the result if requested by the user
    if normalize:
        result.norm(axis='sample', inplace=True)

    # write a new BIOM file
    write_biom_table(result, output_fp)
예제 #35
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_data = load_table(opts.input_otu_table)
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data, mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(otu_table_data, sorted_sample_ids)
    else:
        result = sort_otu_table(otu_table_data,
                                natsort_case_insensitive(otu_table_data.ids()))

    write_biom_table(result, opts.output_fp)
예제 #36
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_data = load_table(opts.input_otu_table)
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data, mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(otu_table_data,
                                sorted_sample_ids)
    else:
        result = sort_otu_table(otu_table_data,
            natsort_case_insensitive(otu_table_data.ids()))

    write_biom_table(result, opts.output_fp)
예제 #37
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    mapping_fp = opts.mapping_fp
    collapse_fields = opts.collapse_fields.split(',')
    input_biom_fp = opts.input_biom_fp
    collapse_mode = opts.collapse_mode
    output_biom_fp = opts.output_biom_fp
    output_mapping_fp = opts.output_mapping_fp
    normalize = opts.normalize

    collapsed_metadata, collapsed_table = \
        collapse_samples(load_table(input_biom_fp),
                         open(mapping_fp, 'U'),
                         collapse_fields,
                         collapse_mode)

    if normalize:
        collapsed_table.norm(axis='sample', inplace=True)

    write_biom_table(collapsed_table, output_biom_fp)
    output_map_lines = mapping_lines_from_collapsed_df(collapsed_metadata)
    with open(output_mapping_fp, 'w') as output_mapping_f:
        output_mapping_f.write('\n'.join(output_map_lines))
예제 #38
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    mapping_fp = opts.mapping_fp
    collapse_fields = opts.collapse_fields.split(',')
    input_biom_fp = opts.input_biom_fp
    collapse_mode = opts.collapse_mode
    output_biom_fp = opts.output_biom_fp
    output_mapping_fp = opts.output_mapping_fp
    normalize = opts.normalize

    collapsed_metadata, collapsed_table = \
        collapse_samples(load_table(input_biom_fp),
                         open(mapping_fp, 'U'),
                         collapse_fields,
                         collapse_mode)

    if normalize:
        collapsed_table.norm(axis='sample', inplace=True)

    write_biom_table(collapsed_table, output_biom_fp)
    output_map_lines = mapping_lines_from_collapsed_df(collapsed_metadata)
    with open(output_mapping_fp, 'w') as output_mapping_f:
        output_mapping_f.write('\n'.join(output_map_lines))
예제 #39
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))
    biom_otu_table = make_otu_table(open(opts.otu_map_fp, 'U'),
                                    otu_to_taxonomy=otu_to_taxonomy,
                                    otu_ids_to_exclude=ids_to_exclude)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
예제 #40
0
    def setUp(self):
        """Define some test data."""
        self.qiime_config = load_qiime_config()
        self.dirs_to_remove = []

        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(self.tmp_dir)

        self.otu_table1 = Table(data=array([[2, 0, 0, 1],
                                            [1, 1, 1, 1],
                                            [0, 0, 0, 0]]).T,
                                           sample_ids=list('XYZ'),
                                           observation_ids=list('abcd'))
        fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table1, self.otu_table1_fp)

        self.otu_table2 = Table(data=array([[2, 0, 0, 1],
                                                   [1, 1, 1, 1],
                                                   [0, 0, 0, 0]]).T,
                                        sample_ids=list('XYZ'),
                                        observation_ids=['a', 'b', 'c', 'd_'])
        fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table2, self.otu_table2_fp)

        self.single_sample_otu_table = Table(
            data=array([[2, 0, 0, 1]]).T,
            sample_ids=list('X'),
            observation_ids=list(
                'abcd'))
        fd, self.single_sample_otu_table_fp = mkstemp(
            dir=self.tmp_dir,
            prefix='alpha_diversity_tests',
            suffix='.biom')
        close(fd)
        write_biom_table(self.single_sample_otu_table,
                         self.single_sample_otu_table_fp)

        self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
        self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

        self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp,
                                self.single_sample_otu_table_fp]
예제 #41
0
    def setUp(self):
        """Define some test data."""
        self.qiime_config = load_qiime_config()
        self.dirs_to_remove = []

        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(self.tmp_dir)

        self.otu_table1 = Table(data=array([[2, 0, 0, 1], [1, 1, 1, 1],
                                            [0, 0, 0, 0]]).T,
                                sample_ids=list('XYZ'),
                                observation_ids=list('abcd'))
        fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir,
                                         prefix='alpha_diversity_tests',
                                         suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table1, self.otu_table1_fp)

        self.otu_table2 = Table(data=array([[2, 0, 0, 1], [1, 1, 1, 1],
                                            [0, 0, 0, 0]]).T,
                                sample_ids=list('XYZ'),
                                observation_ids=['a', 'b', 'c', 'd_'])
        fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir,
                                         prefix='alpha_diversity_tests',
                                         suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table2, self.otu_table2_fp)

        self.single_sample_otu_table = Table(data=array([[2, 0, 0, 1]]).T,
                                             sample_ids=list('X'),
                                             observation_ids=list('abcd'))
        fd, self.single_sample_otu_table_fp = mkstemp(
            dir=self.tmp_dir, prefix='alpha_diversity_tests', suffix='.biom')
        close(fd)
        write_biom_table(self.single_sample_otu_table,
                         self.single_sample_otu_table_fp)

        self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
        self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

        self.files_to_remove = [
            self.otu_table1_fp, self.otu_table2_fp,
            self.single_sample_otu_table_fp
        ]
예제 #42
0
    def setUp(self):
        """Define some test data."""
        self.tmp_dir = get_qiime_temp_dir()

        self.otu_table1 = Table(data=array([[2, 0, 0, 1],
                                            [1, 1, 1, 1],
                                            [0, 0, 0, 0]]).T,
                                           sample_ids=list('XYZ'),
                                           observation_ids=list('abcd'))
        fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table1, self.otu_table1_fp)

        self.otu_table2 = Table(data=array([[2, 0, 0, 1],
                                                   [1, 1, 1, 1],
                                                   [0, 0, 0, 0]]).T,
                                        sample_ids=list('XYZ'),
                                        observation_ids=['a', 'b', 'c', 'd_'])
        fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table2, self.otu_table2_fp)

        self.single_sample_otu_table = Table(
            data=array([[2, 0, 0, 1]]).T,
            sample_ids=list('X'),
            observation_ids=list(
                'abcd'))
        fd, self.single_sample_otu_table_fp = mkstemp(
            dir=self.tmp_dir,
            prefix='alpha_diversity_tests',
            suffix='.biom')
        close(fd)
        write_biom_table(self.single_sample_otu_table,
                         self.single_sample_otu_table_fp)

        self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
        self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

        self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp,
                                self.single_sample_otu_table_fp]
예제 #43
0
    def setUp(self):
        """Define some test data."""
        self.tmp_dir = get_qiime_temp_dir()

        self.otu_table1 = Table(data=array([[2, 0, 0, 1],
                                            [1, 1, 1, 1],
                                            [0, 0, 0, 0]]).T,
                                           sample_ids=list('XYZ'),
                                           observation_ids=list('abcd'))
        fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table1, self.otu_table1_fp)

        self.otu_table2 = Table(data=array([[2, 0, 0, 1],
                                                   [1, 1, 1, 1],
                                                   [0, 0, 0, 0]]).T,
                                        sample_ids=list('XYZ'),
                                        observation_ids=['a', 'b', 'c', 'd_'])
        fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table2, self.otu_table2_fp)

        self.single_sample_otu_table = Table(
            data=array([[2, 0, 0, 1]]).T,
            sample_ids=list('X'),
            observation_ids=list(
                'abcd'))
        fd, self.single_sample_otu_table_fp = mkstemp(
            dir=self.tmp_dir,
            prefix='alpha_diversity_tests',
            suffix='.biom')
        close(fd)
        write_biom_table(self.single_sample_otu_table,
                         self.single_sample_otu_table_fp)

        self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
        self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

        self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp,
                                self.single_sample_otu_table_fp]
예제 #44
0
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=None,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        suppress_index_page=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = [
        'uclust_ref', 'usearch61_ref', 'sortmerna'
    ]
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp, _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(input_fp, step1_dir,
                                                 reference_otu_picking_method,
                                                 refseqs_fp, parallel, params,
                                                 logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # number of failures sequences is greater than the threshold,
    # continue to step 2,3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
                               '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')\n\n"' %
                     (abspath(step1_failures_fasta_fp),
                      abspath(step2_input_fasta_fp), percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                     new_ref_set_id, denovo_otu_picking_method,
                                     params, logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                        reference_otu_picking_method,
                                        step2_repset_fasta_fp, parallel,
                                        params, logger)

        commands.append([('Pick reference OTUs using de novo rep set',
                          step3_cmd)])

        index_links.append((
            'Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
            merged_otu_map_fp, _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([
            ('Move final failures file to top-level directory',
             'mv %s %s/final_failures.txt' % (failures_fp, output_dir))
        ])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp, _index_headers['otu_maps']))

    logger.write(
        '# Filter singletons from the otu map using API \n' +
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
        '(\'%s\', \'%s\', \'%d\')"\n\n' %
        (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(('OTU representative sequences', final_repset_fp,
                        _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append((
        'New reference sequences (i.e., OTU representative sequences plus input '
        'reference sequences)', new_refseqs_fp, _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' %
                 final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write(
        '# Copy the full input refseqs file to the new refseq file\n' +
        'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 2 and step 4 to the final representative set and the new reference'
            + ' set (%s and %s respectively)\n\n' %
            (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 4 to the final representative set and the new reference' +
            ' set (%s and %s respectively)\n\n' %
            (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp, _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments'
            % min_otu_size, pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST' % min_otu_size,
            pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(('OTU taxonomic assignments', taxonomy_fp,
                                _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(('OTU phylogenetic tree', rep_set_tree_fp,
                            _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
예제 #45
0
 def _write_rarefaction(self, fname, sub_otu_table):
     """ depth and rep can be numbers or strings
     """
     if sub_otu_table.is_empty():
         return
     write_biom_table(sub_otu_table, fname)
예제 #46
0
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id,
                                         str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback,
                minimum_failure_threshold=minimum_failure_threshold)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    lower_percentage = opts.lower_percentage
    upper_percentage = opts.upper_percentage
    otu_table_fp = opts.otu_table_fp
    otu_table = load_table(otu_table_fp)
    delimiter = opts.delimiter
    mapping_fp = opts.mapping
    md_as_string = opts.md_as_string
    md_identifier = opts.md_identifier
    levels = opts.level.split(',')
    suppress_classic_table_output = opts.suppress_classic_table_output
    suppress_biom_table_output = opts.suppress_biom_table_output

    if upper_percentage is not None and lower_percentage is not None:
        raise ValueError(
            "upper_percentage and lower_percentage are mutually exclusive")

    if upper_percentage is not None and lower_percentage is not None and \
            mapping:
        raise ValueError("upper_percentage and lower_percentage can not be "
                         "using with mapping file")

    if upper_percentage is not None and \
            (upper_percentage < 0 or upper_percentage > 1.0):
        raise ValueError('max_otu_percentage should be between 0.0 and 1.0')

    if lower_percentage is not None and \
            (lower_percentage < 0 or lower_percentage > 1.0):
        raise ValueError('lower_percentage should be between 0.0 and 1.0')

    if mapping_fp:
        mapping_file = open(mapping_fp, 'U')
        mapping, header, comments = parse_mapping_file(mapping_file)

        # use the input Mapping file for producing the output filenames
        map_dir_path, map_fname = split(mapping_fp)
        map_basename, map_fname_ext = splitext(map_fname)
    else:
        if suppress_classic_table_output and suppress_biom_table_output:
            option_parser.error("Both classic and BIOM output formats were "
                                "suppressed.")

    if not opts.absolute_abundance:
        otu_table = otu_table.norm(axis='sample', inplace=False)

    # introduced output directory to will allow for multiple outputs
    if opts.output_dir:
        create_dir(opts.output_dir, False)
        output_dir_path = opts.output_dir
    else:
        output_dir_path = './'

    # use the input OTU table to produce the output filenames
    dir_path, fname = split(otu_table_fp)
    basename, fname_ext = splitext(fname)

    # Iterate over the levels and generate a summarized taxonomy for each
    for level in levels:
        if mapping_fp:
            # define output filename
            output_fname = join(output_dir_path,
                                map_basename + '_L%s.txt' % (level))

            summary, tax_order = add_summary_mapping(otu_table, mapping,
                                                     int(level), md_as_string,
                                                     md_identifier)

            write_add_taxa_summary_mapping(summary, tax_order, mapping, header,
                                           output_fname, delimiter)
        else:
            # define the output filename. The extension will be added to the
            # end depending on the output format
            output_fname = join(output_dir_path, basename + '_L%s' % level)

            summary, header = make_summary(otu_table, int(level),
                                           upper_percentage, lower_percentage,
                                           md_as_string, md_identifier)

            sample_ids = header[1:]

            observation_ids = []
            data = []
            for row in summary:
                # Join taxonomic levels to create an observation ID.
                observation_ids.append(delimiter.join(row[0]))
                data.append(row[1:])

            table = Table(np.asarray(data), observation_ids, sample_ids)
            if opts.transposed_output:
                table = table.transpose()

            if not suppress_classic_table_output:
                with open(output_fname + '.txt', 'w') as outfile:
                    outfile.write(table.to_tsv())

            if not suppress_biom_table_output:
                write_biom_table(table, output_fname + '.biom')
예제 #48
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    lower_percentage = opts.lower_percentage
    upper_percentage = opts.upper_percentage
    otu_table_fp = opts.otu_table_fp
    otu_table = load_table(otu_table_fp)
    delimiter = opts.delimiter
    mapping_fp = opts.mapping
    md_as_string = opts.md_as_string
    md_identifier = opts.md_identifier
    levels = opts.level.split(',')
    suppress_classic_table_output = opts.suppress_classic_table_output
    suppress_biom_table_output = opts.suppress_biom_table_output

    if upper_percentage is not None and lower_percentage is not None:
        raise ValueError(
            "upper_percentage and lower_percentage are mutually exclusive")

    if upper_percentage is not None and lower_percentage is not None and \
            mapping:
        raise ValueError("upper_percentage and lower_percentage can not be "
                         "using with mapping file")

    if upper_percentage is not None and \
            (upper_percentage < 0 or upper_percentage > 1.0):
        raise ValueError('max_otu_percentage should be between 0.0 and 1.0')

    if lower_percentage is not None and \
            (lower_percentage < 0 or lower_percentage > 1.0):
        raise ValueError('lower_percentage should be between 0.0 and 1.0')

    if mapping_fp:
        mapping_file = open(mapping_fp, 'U')
        mapping, header, comments = parse_mapping_file(mapping_file)

        # use the input Mapping file for producing the output filenames
        map_dir_path, map_fname = split(mapping_fp)
        map_basename, map_fname_ext = splitext(map_fname)
    else:
        if suppress_classic_table_output and suppress_biom_table_output:
            option_parser.error("Both classic and BIOM output formats were "
                                "suppressed.")

    if not opts.absolute_abundance:
        otu_table = otu_table.norm(axis='sample', inplace=False)

    # introduced output directory to will allow for multiple outputs
    if opts.output_dir:
        create_dir(opts.output_dir, False)
        output_dir_path = opts.output_dir
    else:
        output_dir_path = './'

    # use the input OTU table to produce the output filenames
    dir_path, fname = split(otu_table_fp)
    basename, fname_ext = splitext(fname)

    # Iterate over the levels and generate a summarized taxonomy for each
    for level in levels:
        if mapping_fp:
            # define output filename
            output_fname = join(output_dir_path,
                                map_basename + '_L%s.txt' % (level))

            summary, tax_order = add_summary_mapping(otu_table,
                                                     mapping,
                                                     int(level),
                                                     md_as_string,
                                                     md_identifier)

            write_add_taxa_summary_mapping(summary, tax_order, mapping,
                                           header, output_fname, delimiter)
        else:
            # define the output filename. The extension will be added to the
            # end depending on the output format
            output_fname = join(output_dir_path, basename + '_L%s' % level)

            summary, header = make_summary(otu_table,
                                           int(level),
                                           upper_percentage,
                                           lower_percentage,
                                           md_as_string,
                                           md_identifier)

            sample_ids = header[1:]

            observation_ids = []
            data = []
            for row in summary:
                # Join taxonomic levels to create an observation ID.
                observation_ids.append(delimiter.join(row[0]))
                data.append(row[1:])

            table = Table(np.asarray(data), observation_ids, sample_ids)
            if opts.transposed_output:
                table = table.transpose()

            if not suppress_classic_table_output:
                with open(output_fname + '.txt', 'w') as outfile:
                    outfile.write(table.to_tsv())

            if not suppress_biom_table_output:
                write_biom_table(table, output_fname + '.biom')
예제 #49
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if (mapping_fp is None and valid_states is not None):
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            o = open(sample_id_fp, 'U')
            sample_id_f_ids = set(
                [l.strip().split()[0] for l in o if not l.startswith('#')])
            o.close()
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table,
        sample_ids_to_keep,
        min_count,
        max_count,
        negate_ids_to_keep=negate_sample_id_fp)

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.ids())
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
예제 #50
0
    def setUp(self):
        self.tmp_dir = get_qiime_temp_dir()

        self.map_file = """#SampleID	Day	time	Description
#This is some comment about the study
1	090809	1200	some description of sample1
2	090809	1800	some description of sample2
3	090909	1200	some description of sample3
4	090909	1800	some description of sample4
5	091009	1200	some description of sample5"""
        self.cat_by_sample = {"1": [("Day", "090809"), ("time", "1200")],
                              "2": [("Day", "090809"), ("time", "1800")],
                              "3": [("Day", "090909"), ("time", "1200")],
                              "4": [("Day", "090909"), ("time", "1800")],
                              "5": [("Day", "091009"), ("time", "1200")]}
        self.sample_by_cat = {("Day", "090809"): ["1", "2"],
                              ("Day", "090909"): ["3", "4"],
                              ("Day", "091009"): ["5"],
                              ("time", "1200"): ["1", "3", "5"],
                              ("time", "1800"): ["2", "4"]}

        self.num_cats = 2
        self.meta_dict = {"1": ["090809	1200", 0],
                          "2": ["090809	1800", 0],
                          "3": ["090909	1200", 0],
                          "4": ["090909	1800", 0],
                          "5": ["091009	1200", 0]}
        self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
        self.node_labels = ["node_name", "node_disp_name", "ntype", "degree",
                            "weighted_degree", "consensus_lin", "Day", "time"]
        self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]

        self.otu_table_vals = array([[0, 1, 0, 0, 6],
                                     [2, 0, 0, 0, 0],
                                     [0, 0, 3, 1, 0],
                                     [0, 0, 0, 0, 5],
                                     [0, 4, 2, 0, 0],
                                     [3, 6, 0, 0, 0],
                                     [0, 0, 4, 2, 0],
                                     [0, 0, 0, 0, 3],
                                     [2, 0, 0, 5, 0],
                                     [0, 2, 0, 4, 0]])

        otu_table = Table(self.otu_table_vals,
                          ['otu_1', 'otu_2', 'otu_3', 'otu_4', 'otu_5',
                           'otu_6', 'otu_7', 'otu_8', 'otu_9', 'otu_10'],
                          ['1', '2', '3', '4', '5'],
                          [{"taxonomy": ["Bacteria", "Actinobacteria",
                                         "Coriobacteridae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Bacteroidaceae"]},
                           {"taxonomy": ["Bacteria", "Firmicutes",
                                         "Clostridia", "Clostridiales"]},
                           {"taxonomy": ["Bacteria", "Spirochaetes",
                                         "Spirochaetales", "Spirochaetaceae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Rikenellaceae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Dysgonomonaceae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales",
                                         "Odoribacteriaceae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Dysgonomonaceae",
                                         "otu_425"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Dysgonomonaceae",
                                         "otu_425"]},
                           {"taxonomy": ["Bacteria", "Firmicutes",
                                         "Mollicutes",
                                         "Clostridium_aff_innocuum_CM970"]}],
                          [None, None, None, None, None])

        fd, self.otu_table_fp = mkstemp(
            dir=self.tmp_dir, prefix='test_make_otu_network_otu_table',
            suffix='.biom')
        close(fd)
        write_biom_table(otu_table, self.otu_table_fp)

        self.otu_sample_file = """#Full OTU Counts
#OTU ID	1	2	3	4	5	Consensus Lineage
otu_1	0	1	0	0	6	Bacteria; Actinobacteria; Coriobacteridae
otu_2	2	0	0	0	0	Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae
otu_3	0	0	3	1	0	Bacteria; Firmicutes; Clostridia; Clostridiales
otu_4	0	0	0	0	5	Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae
otu_5	0	4	2	0	0	Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae
otu_6	3	6	0	0	0	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae
otu_7	0	0	4	2	0	Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae
otu_8	0	0	0	0	3	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_9	2	0	0	5	0	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_10	0	2	0	4	0	Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""

        self.con_by_sample = {
            '1': set(['2', '4']), '2': set(['5', '3', '1', '4']),
            '3': set(['4', '2']), '4': set(['3', '1', '2']),
            '5': set(['2'])}

        self.edge_file_str = [
            "2	otu_1	1.0	Bacteria:Actinobacteria:Coriobacteridae	090809	1800",
            "5	otu_1	6.0	Bacteria:Actinobacteria:Coriobacteridae	091009	1200",
            "1	otu_2	2.0	Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae	090809	1200",
            "3	otu_3	3.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1200",
            "4	otu_3	1.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1800",
            "5	otu_4	5.0	Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae	091009	1200",
            "2	otu_5	4.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090809	1800",
            "3	otu_5	2.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090909	1200",
            "1	otu_6	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1200",
            "2	otu_6	6.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1800",
            "3	otu_7	4.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1200",
            "4	otu_7	2.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1800",
            "5	otu_8	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	091009	1200",
            "1	otu_9	2.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090809	1200",
            "4	otu_9	5.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090909	1800",
            "2	otu_10	2.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090809	1800",
            "4	otu_10	4.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090909	1800"]

        self.node_file_str = ["1	1	user_node	3	7.0	other	090809	1200",
                              "2	2	user_node	4	13.0	other	090809	1800",
                              "3	3	user_node	3	9.0	other	090909	1200",
                              "4	4	user_node	4	12.0	other	090909	1800",
                              "5	5	user_node	3	14.0	other	091009	1200",
                              "otu_1		otu_node	2	7.0	Bacteria:Actinobacteria:Coriobacteridae	otu	otu",
                              "otu_2		otu_node	1	2.0	Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae	otu	otu",
                              "otu_3		otu_node	2	4.0	Bacteria:Firmicutes:Clostridia:Clostridiales	otu	otu",
                              "otu_4		otu_node	1	5.0	Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae	otu	otu",
                              "otu_5		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	otu	otu",
                              "otu_6		otu_node	2	9.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	otu	otu",
                              "otu_7		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	otu	otu",
                              "otu_8		otu_node	1	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                              "otu_9		otu_node	2	7.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                              "otu_10		otu_node	2	6.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	otu	otu"]

        self.red_edge_file_str = [
            "2	otu_1	1.0	Bacteria:Actinobacteria:Coriobacteridae	090809	1800",
            "5	otu_1	6.0	Bacteria:Actinobacteria:Coriobacteridae	091009	1200",
            "1	@1	1.0	missed	090809	1200",
            "3	otu_3	3.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1200",
            "4	otu_3	1.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1800",
            "5	@5	1.0	missed	091009	1200",
            "2	otu_5	4.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090809	1800",
            "3	otu_5	2.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090909	1200",
            "1	otu_6	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1200",
            "2	otu_6	6.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1800",
            "3	otu_7	4.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1200",
            "4	otu_7	2.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1800",
            "1	otu_9	2.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090809	1200",
            "4	otu_9	5.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090909	1800",
            "2	otu_10	2.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090809	1800",
            "4	otu_10	4.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090909	1800"]

        self.red_node_file_str = ["1	1	user_node	3	7.0	other	090809	1200",
                                  "2	2	user_node	4	13.0	other	090809	1800",
                                  "3	3	user_node	3	9.0	other	090909	1200",
                                  "4	4	user_node	4	12.0	other	090909	1800",
                                  "5	5	user_node	3	14.0	other	091009	1200",
                                  "otu_1		otu_node	2	7.0	Bacteria:Actinobacteria:Coriobacteridae	otu	otu",
                                  "@1		otu_collapsed	1	1.0	other	otu	otu",
                                  "otu_3		otu_node	2	4.0	Bacteria:Firmicutes:Clostridia:Clostridiales	otu	otu",
                                  "@5		otu_collapsed	2	2.0	other	otu	otu",
                                  "otu_5		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	otu	otu",
                                  "otu_6		otu_node	2	9.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	otu	otu",
                                  "otu_7		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	otu	otu",
                                  "otu_9		otu_node	2	7.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                                  "otu_10		otu_node	2	6.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	otu	otu"]

        self.otu_dc = {1: 3, 2: 7}
        self.sample_dc = {3: 3, 4: 2}
        self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}

        self.num_con_cat = {"Day": 2,
                            "time": 1}
        self.num_con = 6
        self.num_cat = {"Day": 2,
                        "time": 4}
        self.num_cat_less = {"Day": 1,
                             "time": 3}
        self._paths_to_clean_up = [self.otu_table_fp]
        self._dir_to_clean_up = ''
예제 #51
0
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir

    if opts.num_fraction_for_core_steps < 2:
        option_parser.error(
            "Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    fractions_for_core = np.linspace(opts.min_fraction_for_core,
                                     opts.max_fraction_for_core,
                                     opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'),
            valid_states)
        if len(sample_ids) < 1:
            option_parser.error(
                "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %
                valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    input_table = parse_biom_table(open(input_fp, 'U'))

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(
            output_dir,
            'core_otus_%s.txt' %
            fraction_for_core_str)
        output_table_fp = join(
            output_dir,
            'core_table_%s.biom' %
            fraction_for_core_str)
        output_f = open(output_fp, 'w')

        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            output_f.write(
                "# No OTUs present in %s %% of samples." %
                fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue

        # write some header information to file
        if sample_ids is None:
            output_f.write(
                "# Core OTUs across %s %% of samples.\n" %
                fraction_for_core_str)
        else:
            output_f.write(
                "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %
                (fraction_for_core_str, valid_states, ' '.join(sample_ids)))

        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iter(axis='observation'):
            output_f.write('%s\t%s\n' % (id_, md[otu_md]))
            otu_count += 1
        output_f.close()

        # write the core biom table
        write_biom_table(core_table, output_table_fp)

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel(
        "Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
예제 #52
0
 def _write_rarefaction(self, fname, sub_otu_table):
     """ depth and rep can be numbers or strings
     """
     if sub_otu_table.is_empty():
         return
     write_biom_table(sub_otu_table, fname)
예제 #53
0
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                                refseqs_fp=refseqs_fp,
                                                output_dir=iteration_output_dir,
                                                percent_subsample=percent_subsample,
                                                new_ref_set_id='.'.join(
                                                    [new_ref_set_id, str(i)]),
                                                command_handler=command_handler,
                                                params=params,
                                                qiime_config=qiime_config,
                                                run_assign_tax=False,
                                                run_align_and_tree=False,
                                                prefilter_refseqs_fp=prefilter_refseqs_fp,
                                                prefilter_percent_id=prefilter_percent_id,
                                                min_otu_size=min_otu_size,
                                                step1_otu_map_fp=step1_otu_map_fp,
                                                step1_failures_fasta_fp=step1_failures_fasta_fp,
                                                parallel=parallel,
                                                suppress_step4=suppress_step4,
                                                logger=logger,
                                                suppress_md5=suppress_md5,
                                                suppress_index_page=True,
                                                denovo_otu_picking_method=denovo_otu_picking_method,
                                                reference_otu_picking_method=reference_otu_picking_method,
                                                status_update_callback=status_update_callback,
                                                minimum_failure_threshold=minimum_failure_threshold)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' %
            (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
예제 #54
0
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout,
                                        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref',
                                             'sortmerna']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
                ('Run summary data',
                log_fp,
                _index_headers['run_summary']))
    else:
        close_logger_on_success = False


    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
            ('Pre-filtered sequence identifiers '
             '(failed to hit reference at %1.1f%% identity)' % (float(prefilter_percent_id)*100),
                        prefilter_failures_list_fp,
                        _index_headers['sequences']))


            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(step1_failures_fasta_f)

    # number of failures sequences is greater than the threshold,
    # continue to step 2,3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
                               '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp),
                                                    abspath(
                                                        step2_input_fasta_fp),
                                                    percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(
            step1_failures_fasta_fp,
            step3_dir,
            reference_otu_picking_method,
            step2_repset_fasta_fp,
            parallel,
            params,
            logger)

        commands.append([
            ('Pick reference OTUs using de novo rep set', step3_cmd)])

        index_links.append(
            ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
             merged_otu_map_fp,
             _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                            step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step, step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (failures_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(('Final map of OTU identifier to sequence identifers excluding '
                        'OTUs with fewer than %d sequences' % min_otu_size,
                        otu_no_singletons_fp,
                        _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                                    abspath(
                                                        otu_no_singletons_fp),
                                                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 2 and step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                    ('OTU taxonomic assignments',
                    taxonomy_fp,
                    _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []


    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)