def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.input_fp
    categories = opts.categories
    header_names = opts.categories_header_names
    output_fp = opts.output_fp

    if len(categories) != len(header_names):
        option_parser.error('The number of categories and the number of '
                            'header names must match.')

    data, headers, _ = parse_mapping_file(open(mapping_fp, 'U'))

    headers, data = apply_operation_on_mapping_file_columns(headers, data,
        categories, header_names)
    # apply_operation_on_mapping_file_columns appends one new column per
    # entry in categories: each row gets the float sum of the comma-separated
    # source columns, stored under the matching header name

    lines = format_mapping_file(headers, data)

    fd = open(output_fp, 'w')
    fd.writelines(lines)
    fd.close()
Example #2
def create_replicated_mapping_file(map_f, num_replicates, sample_ids):
    """Returns a formatted mapping file with replicated sample IDs.

    Each sample ID will have an ascending integer appended to it from the range
    [0, num_replicates - 1]. For example, if there are two input sample IDs, S1
    and S2, with 3 replicates each, the output will be:
        S1.0
        S1.1
        S1.2
        S2.0
        S2.1
        S2.2

    All other metadata columns will simply be copied to the output mapping
    file. The order of input sample IDs is preserved.

    Arguments:
        map_f - input mapping file to replicate (file-like object)
        num_replicates - number of replicates at each sample
        sample_ids - only sample IDs in the mapping file that are in this list
            will be replicated. Sample IDs in the mapping file that are not
            found in this list will not be added to the resulting mapping file
    """
    if num_replicates < 1:
        raise ValueError("Must specify at least one sample replicate (was "
                         "provided %d)." % num_replicates)
    map_data, header, comments = parse_mapping_file(map_f)

    rep_map_data = []
    for row in map_data:
        if row[0] in sample_ids:
            for rep_num in range(num_replicates):
                rep_map_data.append(['%s.%i' % (row[0], rep_num)] + row[1:])

    return format_mapping_file(header, rep_map_data, comments)
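
# A minimal usage sketch for create_replicated_mapping_file (assumes the
# standard QIIME tab-separated mapping-file format; the sample IDs and
# metadata below are made up for illustration).
from StringIO import StringIO

example_map_f = StringIO('#SampleID\tTreatment\tDescription\n'
                         'S1\tControl\tfirst sample\n'
                         'S2\tFast\tsecond sample\n')

# Replicate only S1, three times: the output contains rows for S1.0, S1.1
# and S1.2, while S2 is dropped because it is not in sample_ids.
replicated_map_str = create_replicated_mapping_file(example_map_f, 3, ['S1'])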
Example #3
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.input_fp
    out_mapping_fp = opts.output_fp
    valid_states = opts.valid_states

    if opts.sample_id_fp:
        valid_sample_ids = \
         get_seqs_to_keep_lookup_from_seq_id_file(open(opts.sample_id_fp,'U'))
    elif mapping_fp and valid_states:
        valid_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        option_parser.error('Must provide either --sample_id_fp or both '
                            '--mapping_fp and --valid_states to select the '
                            'samples to keep.')

    data, headers, _ = parse_mapping_file(open(mapping_fp, 'U'))

    good_mapping_file = []
    for line in data:
        if line[0] in valid_sample_ids:
            good_mapping_file.append(line)

    lines = format_mapping_file(headers, good_mapping_file)

    fd = open(out_mapping_fp, 'w')
    fd.write(lines)
    fd.close()
Example #4
def create_replicated_mapping_file(map_f, num_replicates, sample_ids):
    """Returns a formatted mapping file with replicated sample IDs.

    Each sample ID will have an ascending integer appended to it from the range
    [0, num_replicates - 1]. For example, if there are two input sample IDs, S1
    and S2, with 3 replicates each, the output will be:
        S1.0
        S1.1
        S1.2
        S2.0
        S2.1
        S2.2

    All other metadata columns will simply be copied to the output mapping
    file. The order of input sample IDs is preserved.

    Arguments:
        map_f - input mapping file to replicate (file-like object)
        num_replicates - number of replicates at each sample
        sample_ids - only sample IDs in the mapping file that are in this list
            will be replicated. Sample IDs in the mapping file that are not
            found in this list will not be added to the resulting mapping file
    """
    if num_replicates < 1:
        raise ValueError("Must specify at least one sample replicate (was "
                         "provided %d)." % num_replicates)
    map_data, header, comments = parse_mapping_file(map_f)

    rep_map_data = []
    for row in map_data:
        if row[0] in sample_ids:
            for rep_num in range(num_replicates):
                rep_map_data.append(['%s.%i' % (row[0], rep_num)] + row[1:])

    return format_mapping_file(header, rep_map_data, comments)
Example #5
def create_personal_mapping_file(map_as_list,
                                 header, 
                                 comments, 
                                 personal_id_of_interest, 
                                 output_fp, 
                                 personal_id_index, 
                                 individual_titles):
    """ creates mapping file on a per-individual basis """
    if individual_titles is None:
        individual_titles = ['Self', 'Other']
    else:
        individual_titles = individual_titles.split(',')
    personal_map = [line[:] for line in map_as_list]
    for row in personal_map:
        if row[personal_id_index] == personal_id_of_interest:
            row.append(individual_titles[0])
        else:
            row.append(individual_titles[1])
    personal_mapping_file = format_mapping_file(header, personal_map, comments) 
    output_f = open(output_fp,'w')
    output_f.write(personal_mapping_file)
    output_f.close()
    return personal_map
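
# A hedged usage sketch for create_personal_mapping_file. The rows, header
# and output path are illustrative; note that the passed header must already
# contain a name for the per-individual column the function appends.
map_as_list = [['S1', 'P1', 'gut'], ['S2', 'P2', 'gut']]
header = ['SampleID', 'PersonalID', 'BodySite', 'Self']

# rows belonging to P1 are tagged 'Self', all others 'Other'
personal_map = create_personal_mapping_file(map_as_list, header, [], 'P1',
                                            'P1_mapping_file.txt', 1, None)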
Example #6
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """
    
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    
    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
    
    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case "
                           "and white-space sensitive). \n\tProvided field: "
                           "%s. \n\tValid fields: %s"
                           % (column_rename_ids, ' '.join(mapping_headers)))
    
    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))
        
        # parse the mapping file on each pass through the loop, as
        # filtering operates on the parsed values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
                                         mapping_data, 
                                         mapping_headers,
                                         sample_ids_to_keep,
                                         include_repeat_cols=include_repeat_cols, 
                                         column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
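
# Usage sketch for split_mapping_file_on_field: write one mapping file per
# value found in the 'Treatment' column (field name and paths are
# illustrative).
for field_value, mf_str in split_mapping_file_on_field(
        open('mapping.txt', 'U'), 'Treatment'):
    out_f = open('mapping_Treatment_%s.txt' % field_value, 'w')
    out_f.write(mf_str)
    out_f.close()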
Example #7
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    columns_to_merge = opts.columns_to_merge
    mapping_fp = opts.mapping_fp
    output_fp = opts.output_fp

    try:
        data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    except Exception:
        option_parser.error('The input does not appear to be a valid '
                            'mapping file.')

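    # each entry in columns_to_merge is of the form 'ColA&&ColB'; the merged
    # column is named by concatenating the source headers and each row gets
    # the concatenated values of the source columns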
    for merging in columns_to_merge:
        indices = [headers.index(column) for column in merging.split('&&')]

        headers.append(''.join([headers[element] for element in indices]))

        for line in data:
            line.append(''.join([line[element] for element in indices]))

    # sanity check: each row must have exactly one value per header
    assert len(headers) == len(data[0]), ("The number of columns in the "
        "header and in the data rows do not match after merging.")

    lines = format_mapping_file(headers, data, comments)

    fd = open(output_fp, 'w')
    fd.writelines(lines)
    fd.close()
Example #8
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """

    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case " +
                           "and white-space sensitive). \n\tProvided field: " +
                           "%s. \n\tValid fields: %s" % (mapping_field, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse the mapping file on each pass through the loop, as filtering
        # operates on the parsed values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
Example #9
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states " "is passed.")

    if not ((mapping_fp and valid_states) or min_count != 0 or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination "
            "of those)."
        )
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and " "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate" " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(open(mapping_fp, "U"), valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            o = open(sample_id_fp, "U")
            sample_id_f_ids = set([l.strip().split()[0] for l in o if not l.startswith("#")])
            o.close()
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count, negate_ids_to_keep=negate_sample_id_fp
    )

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. " "This indicates that no samples remained after filtering."
        )

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_fp, "U"))
        mapping_headers, mapping_data = filter_mapping_file(mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, "w").write(format_mapping_file(mapping_headers, mapping_data))
Example #10
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        sample_id_f_ids = set([l.strip().split()[0]
                              for l in open(sample_id_fp, 'U') if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
Example #11
 def test_format_mapping_file(self):
     """ format_mapping file should match expected result"""
     headers = ['SampleID','col1','col0','Description']
     samples =\
      [['bsample','v1_3','v0_3','d1'],['asample','aval','another','d2']]
     comments = ['this goes after headers','this too']
     self.assertEqual(format_mapping_file(headers,samples,comments),
      example_mapping_file)
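
# For reference, a plausible definition of the example_mapping_file fixture
# used above, assuming format_mapping_file writes the '#'-prefixed header
# line first, then the '#'-prefixed comments, then the tab-separated rows:
example_mapping_file = ('#SampleID\tcol1\tcol0\tDescription\n'
                        '#this goes after headers\n'
                        '#this too\n'
                        'bsample\tv1_3\tv0_3\td1\n'
                        'asample\taval\tanother\td2')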
Example #12
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that the biom file and mapping file have matching sample names;
    # discard samples that do not appear in both
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample') if s in
                       shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)
    # check that all requested fields are present in the mapping file headers
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found ' +\
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields, mheaders,
                                                              mdata)

    for sg, vg in zip(sample_groups, value_groups):
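        # name_base expands to suffixes like '__Field1_value1_Field2_value2__'
        # that are appended to the biom table and mapping file base names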
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
Example #13
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that the biom file and mapping file have matching sample names;
    # discard samples that do not appear in both
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample') if s in
                       shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)
    # check that all requested fields are present in the mapping file headers
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found ' +\
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields, mheaders,
                                                              mdata)

    for sg, vg in zip(sample_groups, value_groups):
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
Example #14
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))
    output_f = open(opts.output_fp, 'w')

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    if sample_id_fp is not None:
        sample_id_f_ids = set([
            l.strip().split()[0] for l in open(sample_id_fp, 'U')
            if not l.startswith('#')
        ])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count, max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.SampleIds)
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
Example #15
 def test_format_mapping_file(self):
     """ format_mapping file should match expected result"""
     headers = ['SampleID', 'col1', 'col0', 'Description']
     samples =\
         [['bsample', 'v1_3', 'v0_3', 'd1'],
          ['asample', 'aval', 'another', 'd2']]
     comments = ['this goes after headers', 'this too']
     self.assertEqual(format_mapping_file(headers, samples, comments),
                      example_mapping_file)
Example #16
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    alpha_fps = opts.alpha_fps
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    binning_method = opts.binning_method
    missing_value_name = opts.missing_value_name
    depth = opts.depth
    number_of_bins = opts.number_of_bins
    collated_input = opts.collated_input

    # if using collated data, make sure they specify a depth
    if collated_input:
        alpha_dict = {}

        # build up a dictionary with the filenames as keys and lines as values
        for single_alpha_fp in alpha_fps:
            alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
                single_alpha_fp, 'U').readlines()

        # format the collated data
        try:
            metrics, alpha_sample_ids, alpha_data = mean_alpha(
                alpha_dict, depth)
        except ValueError as e:  # see mean_alpha for the possible exceptions
            option_parser.error(e.message)

    # when not using collated data, the user can only specify one input file
    else:
        if len(alpha_fps) > 1:
            option_parser.error(
                'A comma-separated list of files should only be'
                ' passed with the --alpha_fps option when using collated alpha '
                'diversity data and also selecting a rarefaction depth with the'
                ' --depth option.')
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(
                open(alpha_fps[0], 'U'))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
        open(mapping_fp, 'U'))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = \
        add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
                                                   alpha_data, mapping_file_headers, mapping_file_data, number_of_bins,
                                                   binning_method, missing_value_name)

    # format the new data and write it down
    lines = format_mapping_file(out_mapping_file_headers,
                                out_mapping_file_data)
    fd_out = open(output_mapping_fp, 'w')
    fd_out.writelines(lines)
    fd_out.close()
Example #17
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    alpha_fps = opts.alpha_fps
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    binning_method = opts.binning_method
    missing_value_name = opts.missing_value_name
    depth = opts.depth

    # make sure the number of bins is an integer
    try:
        number_of_bins = int(opts.number_of_bins)
    except ValueError:
        raise ValueError('The number of bins must be an integer, not %s'
                         % opts.number_of_bins)

    # if using collated data, make sure they specify a depth
    if depth is not None:
        alpha_dict = {}

        # build up a dictionary with the filenames as keys and lines as values
        for single_alpha_fp in alpha_fps:
            alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
                single_alpha_fp, 'U').readlines()

        # format the collated data
        metrics, alpha_sample_ids, alpha_data = mean_alpha(alpha_dict,
            depth)

    # when not using collated data, the user can only specify one input file
    else:
        if len(alpha_fps) > 1:
            option_parser.error('A comma-separated list of files should only be'
                ' passed with the --alpha_fps option when using collated alpha '
                'diversity data and also selecting a rarefaction depth with the'
                ' --depth option.')
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(open(
                alpha_fps[0], 'U'))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
        open(mapping_fp, 'U'))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = \
        add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
        alpha_data, mapping_file_headers, mapping_file_data, number_of_bins,
        binning_method, missing_value_name)

    # format the new data and write it down
    lines = format_mapping_file(out_mapping_file_headers, out_mapping_file_data)
    fd_out = open(output_mapping_fp, 'w')
    fd_out.writelines(lines)
    fd_out.close()
Example #18
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    try:
        data, headers, comments = parse_mapping_file(open(opts.input_fp, 'U'))
    except Exception:
        option_parser.error('The input does not appear to be a valid '
                            'mapping file.')

    lines = format_mapping_file(headers, data, comments)

    fd = open(opts.input_fp, 'w')
    fd.writelines(lines)
    fd.close()
Example #19
def filter_mapping_file_from_mapping_f(mapping_f, sample_ids_to_keep, negate=False):
    """ Filter rows from a metadata mapping file """
    mapping_data, header, comments = parse_mapping_file(mapping_f)
    filtered_mapping_data = []
    sample_ids_to_keep = set(sample_ids_to_keep)
    for mapping_datum in mapping_data:
        hit = mapping_datum[0] in sample_ids_to_keep
        # keep matching rows or, when negate is True, non-matching rows
        if hit != negate:
            filtered_mapping_data.append(mapping_datum)
    
    return format_mapping_file(header, filtered_mapping_data)
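
# Usage sketch: keep only samples S1 and S2 from a mapping file; with
# negate=True the same call would drop them instead (input path is
# illustrative).
filtered_str = filter_mapping_file_from_mapping_f(open('mapping.txt', 'U'),
                                                  ['S1', 'S2'])
open('filtered_mapping.txt', 'w').write(filtered_str)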
Example #20
def filter_mapping_file_from_mapping_f(mapping_f, sample_ids_to_keep, negate=False):
    """ Filter rows from a metadata mapping file """
    mapping_data, header, comments = parse_mapping_file(mapping_f)
    filtered_mapping_data = []
    sample_ids_to_keep = set(sample_ids_to_keep)
    
    for mapping_datum in mapping_data:
        hit = mapping_datum[0] in sample_ids_to_keep
        if hit and not negate:
            filtered_mapping_data.append(mapping_datum)
        elif not hit and negate:
            filtered_mapping_data.append(mapping_datum)
    return format_mapping_file(header, filtered_mapping_data)
Example #21
def format_vectors_to_js(mapping_file_data,
                         mapping_file_headers,
                         coords_data,
                         coords_headers,
                         connected_by_header,
                         sorted_by_header=None):
    """Write a string representing the vectors in a PCoA plot as javascript

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    coords_data: coordinates of the PCoA plot in a numpy 2-D array or a list of
    numpy 2-D arrays for jackknifed input
    coords_headers: headers of the coords in the PCoA plot or a list of lists
    with the headers for jackknifed input
    connected_by_header: header of the mapping file that represents how the
    lines will be connected
    sorted_by_header: numeric-only header name to sort the samples in the
    vectors

    Output:
    js_vectors_string: string that represents the vectors in the shape of a
    javascript object

    Notes:
    If using jackknifed input, the coordinates and headers that will be used
    are the ones belonging to the master coords, i.e. the first element.
    """

    js_vectors_string = []
    js_vectors_string.append('\nvar g_vectorPositions = new Array();\n')

    if connected_by_header is not None:
        # if processing jackknifed input, use only the master coordinates
        if isinstance(coords_data, list):
            coords_data = coords_data[0]
            coords_headers = coords_headers[0]

        columns_to_keep = ['SampleID', connected_by_header]

        # do not add sorted_by_header if it is None or empty
        if sorted_by_header:
            columns_to_keep.append(sorted_by_header)

        # reduce the amount of data by keeping the required fields only
        mapping_file_data, mapping_file_headers =\
            keep_columns_from_mapping_file(mapping_file_data,
            mapping_file_headers, columns_to_keep)

        # format the mapping file to use this with the filtering function
        mf_string = format_mapping_file(mapping_file_headers,
                                        mapping_file_data)

        index = mapping_file_headers.index(connected_by_header)
        connected_by = list(set([line[index] for line in mapping_file_data]))

        for category in connected_by:
            # build a fresh StringIO for each iteration; otherwise the object
            # would be exhausted after the first pass and raise an error
            sample_ids = sample_ids_from_metadata_description(
                StringIO(mf_string), '%s:%s' % (connected_by_header, category))

            # if there is a sorting header, sort the coords using these values
            if sorted_by_header:
                sorting_index = mapping_file_headers.index(sorted_by_header)
                to_sort = [line for line in mapping_file_data if line[0] in\
                    sample_ids]

                # get the sorted sample ids from the sorted-reduced mapping file
                sample_ids = zip(
                    *sorted(to_sort, key=lambda x: float(x[sorting_index])))[0]

            # each category value is a new vector
            js_vectors_string.append(
                "g_vectorPositions['%s'] = new Array();\n" % (category))

            for s in sample_ids:
                index = coords_headers.index(s)

                # print the first three elements of each coord for each sample
                js_vectors_string.append(
                    "g_vectorPositions['%s']['%s'] = %s;\n" %
                    (category, s, coords_data[index, :3].tolist()))

    return ''.join(js_vectors_string)
Example #22
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    otu_table_fp = opts.otu_table_fp
    output_dir = opts.output_dir
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    verbose = opts.verbose
    print_only = opts.print_only
    seqs_per_sample = int(opts.seqs_per_sample)
    parallel = opts.parallel
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name

    try:
        makedirs(output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose"
                " a different directory, or force overwrite with -f.")

    ## ******************** make_evident_selectors ********************
    ## The code for make_evident_selectors.py is here and has to go before the params
    ## validation as we need to know the main cats before creating the params file
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    biom_table = parse_biom_table(open(otu_table_fp, 'U'))

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(map_data, headers,\
        biom_table.SampleIds, include_repeat_cols=False)

    if subject_category not in real_map_headers:
        option_parser.error('Column %s is not in the mapping file. Valid '
            'columns are: %s' % (subject_category, real_map_headers))
 
    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    results, main_map_cat = make_selectors(sorted_counts_per_sample, min_seqs_sample,\
        mapping_file_tuple, subject_category, verbose=verbose)

    fout = open(join(output_dir,'selectors.txt'),'w')
    fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
    fout.write('\n'.join(results))
    fout.close()
    
    fout = open(join(output_dir,'mapping_file.txt'),'w')
    fout.write(format_mapping_file(real_map_headers, real_map_data))
    fout.close()
    ## ******************** make_evident_selectors ********************

    fout = open(join(output_dir,'study_preferences.txt'),'w')
    fout.write('%d\n' % seqs_per_sample)
    fout.write('%s\n' % subject_category)
    fout.close()

    ## ******************** filter_samples_from_otu_table ********************
    ## Filtering original biom file to only have samples above the max length to avoid
    ## ugly plots
    alpha_biom_file = join(output_dir,'filtered_otu_table_for_alpha.biom')
    fout = open(alpha_biom_file,'w')
    sample_ids_to_keep = biom_table.SampleIds
    filtered_otu_table = filter_samples_from_otu_table(biom_table,
                                                       sample_ids_to_keep,
                                                       min_count=seqs_per_sample,
                                                       max_count=inf)
    fout.write(format_biom_table(filtered_otu_table))
    fout.close()
    ## ******************** filter_samples_from_otu_table ********************

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            option_parser.error("Can't open parameters file (%s). Does it exist? " \
            "Do you have read access?" % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters(
            ['beta_diversity:metrics unweighted_unifrac',\
             'make_rarefaction_plots:prefs_path %s' % join(output_dir,'prefs.txt'),
             'make_rarefaction_plots:colorby %s' % ','.join(main_map_cat), 
             'make_rarefaction_plots:output_type memory', 
             'multiple_rarefactions:min %d' % int(seqs_per_sample/4),
             'multiple_rarefactions:max %d' % (seqs_per_sample+1),
             'multiple_rarefactions:step %d' % int(seqs_per_sample/4),
             'multiple_rarefactions:num-reps 4',
            ])
        # empty list returns empty defaultdict for now
    
    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    
    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates
    
    copyfile(otu_table_fp, join(output_dir,'raw.biom'))
    
    run_beta_diversity_through_plots(otu_table_fp=otu_table_fp,
     mapping_fp=mapping_fp,
     output_dir=output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     color_by_interesting_fields_only=False,
     sampling_depth=seqs_per_sample,
     histogram_categories=None,
     tree_fp=tree_fp,
     parallel=parallel,
     suppress_3d_plots=True,
     suppress_2d_plots=True,
     status_update_callback=status_update_callback)
    
    output_dir = join(output_dir,'alpha')
    run_alpha_rarefaction(otu_table_fp=alpha_biom_file,\
     mapping_fp=mapping_fp,\
     output_dir=output_dir,\
     command_handler=command_handler,\
     params=params,
     qiime_config=qiime_config,\
     tree_fp=tree_fp,\
     num_steps=4,\
     parallel=parallel,\
     min_rare_depth=10,
     max_rare_depth=20,
     status_update_callback=status_update_callback,
     plot_stderr_and_stddev=True)
Example #23
 def test_format_mapping_file(self):
     """ format_mapping file should match expected result"""
     headers = ["SampleID", "col1", "col0", "Description"]
     samples = [["bsample", "v1_3", "v0_3", "d1"], ["asample", "aval", "another", "d2"]]
     comments = ["this goes after headers", "this too"]
     self.assertEqual(format_mapping_file(headers, samples, comments), example_mapping_file)
Example #24
def make_distance_boxplots(dm_f,
                           map_f,
                           fields,
                           width=None,
                           height=6.0,
                           suppress_all_within=False,
                           suppress_all_between=False,
                           suppress_individual_within=False,
                           suppress_individual_between=False,
                           y_min=0.0,
                           y_max=1.0,
                           whisker_length=1.5,
                           box_width=0.5,
                           box_color=None,
                           color_individual_within_by_field=None,
                           sort=None):
    """Generates various types of boxplots for distance comparisons.

    Returns a list of tuples, one for each field. Each tuple contains the
    following:
        1) the name of the field (string)
        2) a matplotlib.figure.Figure object containing the boxplots
        3) a list of lists containing the raw plot data that was passed to mpl
        4) a list of labels for each of the boxplots (string)
        5) a list of mpl-compatible colors (one for each boxplot)

    The Figure can be saved, and the raw data and labels can be useful (for
    example) performing statistical tests or writing the raw data to disk.

    The input arguments are exactly derived from the make_distance_boxplots.py
    script (see the script options for details). To avoid duplicated effort,
    their descriptions are not reproduced here.
    """
    # Parse data files and do some preliminary error checking.
    dm_header, dm_data = parse_distmat(dm_f)
    map_data, map_header, map_comments = parse_mapping_file(map_f)

    if fields is None or len(fields) < 1:
        raise ValueError("You must provide at least one field to analyze.")

    for field in fields:
        if field not in map_header:
            raise ValueError("The field '%s' is not in the provided mapping "
                             "file. Please supply correct fields "
                             "corresponding to fields in the mapping file." %
                             field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = _cast_y_axis_extrema(y_min)
    y_max = _cast_y_axis_extrema(y_max)

    # Collate the distributions of distances that will comprise each boxplot.
    # Suppress the generation of the indicated types of boxplots.
    results = []
    for field in fields:
        plot_data = []
        plot_labels = []
        plot_colors = []
        legend = None

        # Little bit of duplicate code here... not sure it's worth the effort
        # to clean up though.
        if not suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dm_header,
                                          dm_data,
                                          map_header,
                                          map_data,
                                          field,
                                          within=True))
            plot_labels.append("All within %s" % field)

            if color_individual_within_by_field is not None:
                plot_colors.append(None)
            else:
                plot_colors.append(box_color)

        if not suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dm_header,
                                          dm_data,
                                          map_header,
                                          map_data,
                                          field,
                                          within=False))
            plot_labels.append("All between %s" % field)

            if color_individual_within_by_field is not None:
                plot_colors.append(None)
            else:
                plot_colors.append(box_color)

        if not suppress_individual_within:
            within_dists = get_grouped_distances(dm_header,
                                                 dm_data,
                                                 map_header,
                                                 map_data,
                                                 field,
                                                 within=True)
            field_states = []
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                field_states.append(grouping[0])

            # If we need to color these boxplots by a field, build up a
            # list of colors and a legend.
            if color_individual_within_by_field is not None:
                colors, color_mapping = _color_field_states(
                    format_mapping_file(map_header,
                                        map_data).split('\n'), dm_header,
                    field, field_states, color_individual_within_by_field)
                plot_colors.extend(colors)
                legend = (color_mapping.values(), color_mapping.keys())
            else:
                plot_colors.extend([box_color] * len(field_states))

        if not suppress_individual_between:
            between_dists = get_grouped_distances(dm_header,
                                                  dm_data,
                                                  map_header,
                                                  map_data,
                                                  field,
                                                  within=False)

            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

                if color_individual_within_by_field is not None:
                    plot_colors.append(None)
                else:
                    plot_colors.append(box_color)

        assert (len(plot_data) == len(plot_labels) and
                len(plot_labels) == len(plot_colors)), (
            "The numbers of boxplot labels and colors do not match the "
            "number of boxplots.")

        # We now have our data and labels ready, so plot them!
        if plot_data:
            if sort is not None:
                plot_data, plot_labels, plot_colors = _sort_distributions(
                    plot_data, plot_labels, plot_colors, sort)

            if width is None:
                width = len(plot_data) * box_width + 2
            if width <= 0 or height <= 0:
                raise ValueError("The specified width and height of the plot "
                                 "must be greater than zero.")

            plot_figure = boxplots(plot_data,
                                   x_tick_labels=plot_labels,
                                   title="%s Distances" % field,
                                   x_label="Grouping",
                                   y_label="Distance",
                                   x_tick_labels_orientation='vertical',
                                   y_min=y_min,
                                   y_max=y_max,
                                   whisker_length=whisker_length,
                                   box_width=box_width,
                                   box_colors=plot_colors,
                                   figure_width=width,
                                   figure_height=height,
                                   legend=legend)

            results.append(
                (field, plot_figure, plot_data, plot_labels, plot_colors))
        else:
            raise ValueError("The generation of all plots was suppressed. At "
                             "least one type of plot must be unsuppressed.")

    return results
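
# Usage sketch: build distance boxplots for one mapping file field and save
# each figure to disk (file names are illustrative).
results = make_distance_boxplots(open('unweighted_unifrac_dm.txt', 'U'),
                                 open('mapping.txt', 'U'), ['Treatment'])
for field, figure, data, labels, colors in results:
    figure.savefig('%s_boxplots.png' % field)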
Example #25
                "diversity data and also selecting a rarefaction depth with the"
                " --depth option."
            )
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(open(alpha_fps[0], "U"))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(open(mapping_fp, "U"))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = add_alpha_diversity_values_to_mapping_file(
        metrics,
        alpha_sample_ids,
        alpha_data,
        mapping_file_headers,
        mapping_file_data,
        number_of_bins,
        binning_method,
        missing_value_name,
    )

    # format the new data and write it down
    lines = format_mapping_file(out_mapping_file_headers, out_mapping_file_data)
    fd_out = open(output_mapping_fp, "w")
    fd_out.writelines(lines)
    fd_out.close()


if __name__ == "__main__":
    main()
Example #26
def format_vectors_to_js(mapping_file_data, mapping_file_headers, coords_data,
                        coords_headers, connected_by_header,
                        sorted_by_header=None):
    """Write a string representing the vectors in a PCoA plot as javascript

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    coords_data: coordinates of the PCoA plot in a numpy 2-D array or a list of
    numpy 2-D arrays for jackknifed input
    coords_headers: headers of the coords in the PCoA plot or a list of lists
    with the headers for jackknifed input
    connected_by_header: header of the mapping file that represents how the
    lines will be connected
    sorted_by_header: numeric-only header name to sort the samples in the
    vectors

    Output:
    js_vectors_string: string that represents the vectors in the shape of a
    javascript object

    Notes:
    If using jackknifed input, the coordinates and headers that will be used
    are the ones belonging to the master coords, i.e. the first element.
    """

    js_vectors_string = []
    js_vectors_string.append('\nvar g_vectorPositions = new Array();\n')

    if connected_by_header is not None:
        # if processing jackknifed input, use only the master coordinates
        if isinstance(coords_data, list):
            coords_data = coords_data[0]
            coords_headers = coords_headers[0]

        columns_to_keep = ['SampleID', connected_by_header]

        # do not add sorted_by_header if it is None or empty
        if sorted_by_header:
            columns_to_keep.append(sorted_by_header)

        # reduce the amount of data by keeping the required fields only
        mapping_file_data, mapping_file_headers =\
            keep_columns_from_mapping_file(mapping_file_data,
            mapping_file_headers, columns_to_keep)

        # format the mapping file to use this with the filtering function
        mf_string = format_mapping_file(mapping_file_headers, mapping_file_data)

        index = mapping_file_headers.index(connected_by_header)
        connected_by = list(set([line[index] for line in mapping_file_data]))

        for category in connected_by:
            # build a fresh StringIO for each iteration; otherwise the object
            # would be exhausted after the first pass and raise an error
            sample_ids = sample_ids_from_metadata_description(
                StringIO(mf_string), '%s:%s' % (connected_by_header, category))

            # if there is a sorting header, sort the coords using these values
            if sorted_by_header:
                sorting_index = mapping_file_headers.index(sorted_by_header)
                to_sort = [line for line in mapping_file_data if line[0] in\
                    sample_ids]

                # get the sorted sample ids from the sorted-reduced mapping file
                sample_ids = zip(*sorted(to_sort,
                    key=lambda x: float(x[sorting_index])))[0]

            # each category value is a new vector
            js_vectors_string.append("g_vectorPositions['%s'] = new Array();\n"
                % (category))

            for s in sample_ids:
                index = coords_headers.index(s)

                # print the first three elements of each coord for each sample
                js_vectors_string.append("g_vectorPositions['%s']['%s'] = %s;\n"
                    % (category, s, coords_data[index, :3].tolist()))

    return ''.join(js_vectors_string)
Example #27
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    alpha_fps = opts.alpha_fps
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    binning_method = opts.binning_method
    missing_value_name = opts.missing_value_name
    number_of_bins = opts.number_of_bins
    depth = opts.depth

    # if using collated data, make sure a depth was specified
    if depth is not None:
        alpha_dict = {}

        # build up a dictionary with the filenames as keys and lines as values
        for single_alpha_fp in alpha_fps:
            alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
                single_alpha_fp, 'U').readlines()

        # format the collated data
        metrics, alpha_sample_ids, alpha_data = mean_alpha(alpha_dict, depth)

    # when not using collated data, the user can only specify one input file
    else:
        if len(alpha_fps) > 1:
            option_parser.error(
                'A comma-separated list of files should only be'
                ' passed with the --alpha_fps option when using collated alpha '
                'diversity data and also selecting a rarefaction depth with the'
                ' --depth option.')
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(
                open(alpha_fps[0], 'U'))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
        open(mapping_fp, 'U'))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = \
        add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
        alpha_data, mapping_file_headers, mapping_file_data, number_of_bins,
        binning_method, missing_value_name)

    # format the new data and write it down
    lines = format_mapping_file(out_mapping_file_headers,
                                out_mapping_file_data)
    fd_out = open(output_mapping_fp, 'w')
    fd_out.writelines(lines)
    fd_out.close()


if __name__ == "__main__":
    main()
Example #28
def create_personal_results(output_dir,
                            mapping_fp,
                            coord_fp,
                            collated_dir,
                            otu_table_fp,
                            prefs_fp,
                            personal_id_column,
                            personal_ids=None,
                            column_title='Self',
                            individual_titles=None,
                            category_to_split='BodySite',
                            time_series_category='WeeksSinceStart',
                            rarefaction_depth=10000,
                            alpha=0.05,
                            rep_set_fp=None,
                            parameter_fp=None,
                            body_site_rarefied_otu_table_dir=None,
                            retain_raw_data=False,
                            suppress_alpha_rarefaction=False,
                            suppress_beta_diversity=False,
                            suppress_taxa_summary_plots=False,
                            suppress_alpha_diversity_boxplots=False,
                            suppress_otu_category_significance=False,
                            command_handler=call_commands_serially,
                            status_update_callback=no_status_updates):
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, 'support_files')
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), 'my_microbes', 'support_files'),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, 'U'))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
            "column header." % category_to_split)

    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = '%s&&%s' % (personal_id_column, category_to_split)
    header.insert(len(header)-1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError("'%s' is not a personal ID in the mapping "
                                 "file column '%s'." %
                                 (pid, personal_id_column))

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    otu_table_title = splitext(basename(otu_table_fp))

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                add_filename_suffix(otu_table_fp,
                                    '_even%d' % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = 'Rarefying OTU table'
            cmd = 'single_rarefaction.py -i %s -o %s -d %s' % (otu_table_fp,
                    rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, 'per_body_site_otu_tables')

            cmd_title = 'Splitting rarefied OTU table by body site'
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (
                    rarefied_otu_table_fp, mapping_fp, category_to_split,
                    per_body_site_dir)
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        'mapping_file.txt')
        html_fp = join(output_dir, person_of_interest, 'index.html')

        personal_mapping_data = create_personal_mapping_file(mapping_data,
                person_of_interest, personal_id_index, bodysite_index,
                individual_titles)

        personal_mapping_f = open(personal_mapping_file_fp, 'w')
        personal_mapping_f.write(
                format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index]
                                   for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ''
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     'adiv_boxplots')
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                    collated_dir, personal_mapping_file_fp,
                    category_to_split, column_title, rarefaction_depth,
                    adiv_boxplots_dir)

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                    create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   'alpha_rarefaction')
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = 'Creating rarefaction plots (%s)' % person_of_interest
            cmd = 'make_rarefaction_plots.py -i %s -m %s -p %s -o %s' % (
                    collated_dir, personal_mapping_file_fp, prefs_fp,
                    rarefaction_dir)
            commands.append([(cmd_title, cmd)])

            raw_data_dirs.append(join(rarefaction_dir, 'average_plots'))
            raw_data_dirs.append(join(rarefaction_dir, 'average_tables'))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, 'beta_diversity')
            pcoa_time_series_dir = join(output_dir, person_of_interest, 
                                         'beta_diversity_time_series')
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = 'Creating beta diversity time series plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=' % (
                personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_time_series_dir) +\
                '\'%s\' --add_vectors=\'%s,%s\'' % (time_series_category,
                site_id_category, time_series_category)
            commands.append([(cmd_title, cmd)])
            
            cmd_title = 'Creating beta diversity plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py  -m %s -p %s -i %s -o %s' % (personal_mapping_file_fp,
                                                                 prefs_fp, coord_fp, 
                                                                 pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest, 'time_series')
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            ## Split OTU table into self/other per-body-site tables
            commands = []
            cmd_title = 'Splitting OTU table into self/other (%s)' % \
                        person_of_interest
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (otu_table_fp,
                    personal_mapping_file_fp, column_title, area_plots_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            for column_title_value in column_title_values:
                biom_fp = join(area_plots_dir,
                               add_filename_suffix(otu_table_fp,
                                                   '_%s' % column_title_value))
                column_title_map_fp = join(area_plots_dir, 'mapping_%s.txt' %
                                                           column_title_value)
                raw_data_files.append(biom_fp)
                raw_data_files.append(column_title_map_fp)

                body_site_dir = join(area_plots_dir, column_title_value)

                commands = []
                cmd_title = 'Splitting "%s" OTU table by body site (%s)' % \
                            (column_title_value, person_of_interest)
                cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (biom_fp,
                        personal_mapping_file_fp, category_to_split,
                        body_site_dir)
                commands.append([(cmd_title, cmd)])
                raw_data_dirs.append(body_site_dir)

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

                commands = []
                for cat_value in cat_values:
                    body_site_otu_table_fp = join(body_site_dir,
                            add_filename_suffix(biom_fp, '_%s' % cat_value))

                    # We won't always get an OTU table if the mapping file
                    # category contains samples that aren't in the OTU table
                    # (e.g. the 'na' state for body site).
                    if exists(body_site_otu_table_fp):
                        plots = join(area_plots_dir, 'taxa_plots_%s_%s' % (
                            column_title_value, cat_value))

                        cmd_title = 'Creating taxa summary plots (%s)' % \
                                    person_of_interest
                        cmd = ('summarize_taxa_through_plots.py -i %s '
                               '-o %s -c %s -m %s -s' %
                               (body_site_otu_table_fp, plots,
                                time_series_category,
                                personal_mapping_file_fp))
                        if parameter_fp is not None:
                            cmd += ' -p %s' % parameter_fp
                            
                        commands.append([(cmd_title, cmd)])

                        raw_data_files.append(join(plots, '*.biom'))
                        raw_data_files.append(join(plots, '*.txt'))

                        create_comparative_taxa_plots_html(cat_value, 
                                join(area_plots_dir, '%s_comparative.html' %
                                                     cat_value))

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ''
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   'otu_category_significance')
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(per_body_site_dir,
                        add_filename_suffix(rarefied_otu_table_fp,
                                            '_%s' % cat_value))

                if exists(body_site_otu_table_fp):
                    otu_cat_output_fp = join(otu_cat_sig_dir,
                                             'otu_cat_sig_%s.txt' % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' % (
                                 cat_value, person_of_interest))
                    cmd = ('otu_category_significance.py -i %s -m %s -c %s '
                           '-o %s' % (body_site_otu_table_fp,
                                      personal_mapping_file_fp,
                                      column_title,
                                      otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])
                    raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                    format_otu_category_significance_tables_as_html(
                            otu_cat_sig_output_fps, alpha, otu_cat_sig_dir, 
                            individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [join(rel_otu_cat_sig_dir, html_filename)
                    for html_filename in otu_cat_sig_html_filenames]

            otu_category_significance_html = \
                    create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(person_of_interest, html_fp,
                alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
                otu_category_significance_html=otu_category_significance_html)

    logger.close()

    # Clean up the unnecessary raw data files and directories. glob will only
    # grab paths that exist.
    if not retain_raw_data:
        for raw_data_fp_glob in raw_data_files:
            remove_files(glob(raw_data_fp_glob))

        for raw_data_dir_glob in raw_data_dirs:
            for dir_to_remove in glob(raw_data_dir_glob):
                rmtree(dir_to_remove)

    return output_directories

def main():
    smp_map_f = open('smp_map.txt', 'U')
    smp_map_data, smp_map_header, smp_map_comments = \
            parse_mapping_file(smp_map_f)
    smp_map_f.close()

    dist_f = open('new_disturbance_list.txt', 'U')
    anti_dist_map, sick_dist_map, menst_dist_map = \
            parse_disturbance_file(dist_f)
    dist_f.close()

    pid_idx = smp_map_header.index('PersonalID')
    wss_idx = smp_map_header.index('WeeksSinceStart')

    # Add the three new columns at the end.
    new_smp_map_header = smp_map_header[:]
    new_smp_map_header.extend([anti_dist_name, sick_dist_name,
                               menst_dist_name])

    new_smp_map_data = []
    for row in smp_map_data:
        pid = row[pid_idx]
        school = pid[:-3]
        week = row[wss_idx]

        anti_dist = False
        sick_dist = False
        menst_dist = False

        # Figure out if we should try to map this sample.
        valid_sample = True

        try:
            int(pid[-3:])
        except ValueError:
            valid_sample = False

        if school not in schools:
            valid_sample = False

        try:
            week = float(week)
        except ValueError:
            valid_sample = False

        if valid_sample:
            if pid in anti_dist_map and week in anti_dist_map[pid]:
                anti_dist = True
            if pid in sick_dist_map and week in sick_dist_map[pid]:
                sick_dist = True
            if pid in menst_dist_map and week in menst_dist_map[pid]:
                menst_dist = True

        # Write out our results in three new columns.
        anti_dist_str = 'Yes' if anti_dist else 'No'
        sick_dist_str = 'Yes' if sick_dist else 'No'
        menst_dist_str = 'Yes' if menst_dist else 'No'

        new_row = row[:]
        new_row.extend([anti_dist_str, sick_dist_str, menst_dist_str])
        new_smp_map_data.append(new_row)

    new_smp_map_f = open('new_smp_map.txt', 'w')
    new_smp_map_f.write(format_mapping_file(new_smp_map_header,
                                            new_smp_map_data,
                                            smp_map_comments))
    new_smp_map_f.close()
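
# The main() above relies on a parse_disturbance_file helper whose body is
# not shown here. The sketch below is a guess at its shape: the tab-separated
# layout and the 'antibiotic'/'sickness'/'menstruation' labels are
# assumptions; only the return structure (three dicts of personal ID -> set
# of week numbers) is implied by how main() uses the results.
def parse_disturbance_file(dist_f):
    anti_dist_map = {}
    sick_dist_map = {}
    menst_dist_map = {}
    type_to_map = {'antibiotic': anti_dist_map,
                   'sickness': sick_dist_map,
                   'menstruation': menst_dist_map}

    for line in dist_f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        pid, dist_type, week = line.split('\t')
        # Store weeks as floats to match the float(week) comparison in main().
        type_to_map[dist_type].setdefault(pid, set()).add(float(week))

    return anti_dist_map, sick_dist_map, menst_dist_map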
Example #30
def create_personal_results(
    output_dir,
    mapping_fp,
    coord_fp,
    collated_dir,
    otu_table_fp,
    prefs_fp,
    personal_id_column,
    personal_ids=None,
    column_title="Self",
    individual_titles=None,
    category_to_split="BodySite",
    time_series_category="WeeksSinceStart",
    rarefaction_depth=10000,
    alpha=0.05,
    rep_set_fp=None,
    body_site_rarefied_otu_table_dir=None,
    retain_raw_data=False,
    suppress_alpha_rarefaction=False,
    suppress_beta_diversity=False,
    suppress_taxa_summary_plots=False,
    suppress_alpha_diversity_boxplots=False,
    suppress_otu_category_significance=False,
    command_handler=call_commands_serially,
    status_update_callback=no_status_updates,
):
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, "support_files")
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), "my_microbes", "support_files"), support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, "U"))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column " "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file " "column header." % category_to_split)

    header = header[:-1] + [column_title] + [header[-1]]

    # Add a column that differentiates between body sites within a single
    # individual. It is used to create the vectors in make_3d_plots.py; its
    # values are created by concatenating the two columns when the personal
    # mapping file is written.
    site_id_category = "%s&&%s" % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError(
                    "'%s' is not a personal ID in the mapping " "file column '%s'." % (pid, personal_id_column)
                )

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column " "header." % time_series_category)

    # splitext returns a (root, ext) tuple; keep only the root as the title.
    otu_table_title = splitext(basename(otu_table_fp))[0]

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir, add_filename_suffix(otu_table_fp, "_even%d" % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = "Rarefying OTU table"
            cmd = "single_rarefaction.py -i %s -o %s -d %s" % (otu_table_fp, rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, "per_body_site_otu_tables")

            cmd_title = "Splitting rarefied OTU table by body site"
            cmd = "split_otu_table.py -i %s -m %s -f %s -o %s" % (
                rarefied_otu_table_fp,
                mapping_fp,
                category_to_split,
                per_body_site_dir,
            )
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        # Files to clean up on a per-individual basis.
        personal_raw_data_files = []
        personal_raw_data_dirs = []

        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest, "mapping_file.txt")
        html_fp = join(output_dir, person_of_interest, "index.html")

        personal_mapping_data = create_personal_mapping_file(
            mapping_data, person_of_interest, personal_id_index, bodysite_index, individual_titles
        )

        personal_mapping_f = open(personal_mapping_file_fp, "w")
        personal_mapping_f.write(format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        personal_raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index] for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ""
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest, "adiv_boxplots")
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" % person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                collated_dir,
                personal_mapping_file_fp,
                category_to_split,
                column_title,
                rarefaction_depth,
                adiv_boxplots_dir,
            )

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename) for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest, "alpha_rarefaction")
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = "Creating rarefaction plots (%s)" % person_of_interest
            cmd = "make_rarefaction_plots.py -i %s -m %s -p %s -o %s" % (
                collated_dir,
                personal_mapping_file_fp,
                prefs_fp,
                rarefaction_dir,
            )
            commands.append([(cmd_title, cmd)])

            personal_raw_data_dirs.append(join(rarefaction_dir, "average_plots"))
            personal_raw_data_dirs.append(join(rarefaction_dir, "average_tables"))

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, "beta_diversity")
            pcoa_time_series_dir = join(output_dir, person_of_interest, "beta_diversity_time_series")
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = "Creating beta diversity time series plots (%s)" % person_of_interest
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=" % (
                personal_mapping_file_fp,
                prefs_fp,
                coord_fp,
                pcoa_time_series_dir,
            ) + "'%s' --add_vectors='%s,%s'" % (time_series_category, site_id_category, time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = "Creating beta diversity plots (%s)" % person_of_interest
            cmd = "make_3d_plots.py  -m %s -p %s -i %s -o %s" % (personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

        ## Time series taxa summary plots steps
        taxa_summary_plots_html = ""
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest, "time_series")
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            files_to_remove, dirs_to_remove = _generate_taxa_summary_plots(
                otu_table_fp,
                personal_mapping_file_fp,
                person_of_interest,
                column_title,
                column_title_values,
                category_to_split,
                cat_values,
                time_series_category,
                area_plots_dir,
                command_handler,
                status_update_callback,
                logger,
            )

            personal_raw_data_files.extend(files_to_remove)
            personal_raw_data_dirs.extend(dirs_to_remove)

            taxa_summary_plots_html = create_taxa_summary_plots_html(output_dir, person_of_interest, cat_values)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ""
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest, "otu_category_significance")
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            valid_body_sites = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(
                    per_body_site_dir, add_filename_suffix(rarefied_otu_table_fp, "_%s" % cat_value)
                )

                if exists(body_site_otu_table_fp):
                    # Make sure we have at least one sample for Self, otherwise
                    # otu_category_significance.py crashes with a division by
                    # zero error.
                    with open(body_site_otu_table_fp, "U") as body_site_otu_table_f, open(
                        personal_mapping_file_fp, "U"
                    ) as personal_mapping_file_f:
                        personal_sample_count = _count_per_individual_samples(
                            body_site_otu_table_f, personal_mapping_file_f, personal_id_column, person_of_interest
                        )

                        if personal_sample_count < 1:
                            continue
                        else:
                            valid_body_sites.append(cat_value)

                    otu_cat_output_fp = join(otu_cat_sig_dir, "otu_cat_sig_%s.txt" % cat_value)

                    cmd_title = "Testing for significant differences in " 'OTU abundances in "%s" body site (%s)' % (
                        cat_value,
                        person_of_interest,
                    )
                    cmd = "otu_category_significance.py -i %s -m %s -c %s " "-o %s" % (
                        body_site_otu_table_fp,
                        personal_mapping_file_fp,
                        column_title,
                        otu_cat_output_fp,
                    )
                    commands.append([(cmd_title, cmd)])

                    personal_raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            # Hack to allow print-only mode.
            if command_handler is not print_commands and not valid_body_sites:
                raise ValueError(
                    "None of the body sites for personal ID '%s' "
                    "could be processed because there were no "
                    "matching samples in the rarefied OTU table." % person_of_interest
                )

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = create_otu_category_significance_html_tables(
                otu_cat_sig_output_fps, alpha, otu_cat_sig_dir, individual_titles, rep_set_fp=rep_set_fp
            )

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [
                join(rel_otu_cat_sig_dir, html_filename) for html_filename in otu_cat_sig_html_filenames
            ]

            otu_category_significance_html = create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(
            person_of_interest,
            html_fp,
            taxa_summary_plots_html=taxa_summary_plots_html,
            alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
            otu_category_significance_html=otu_category_significance_html,
        )

        # Clean up the unnecessary raw data files and directories for the
        # current individual. glob will only grab paths that exist.
        if not retain_raw_data:
            clean_up_raw_data_files(personal_raw_data_files, personal_raw_data_dirs)

    # Clean up any remaining raw data files that weren't created on a
    # per-individual basis.
    if not retain_raw_data:
        clean_up_raw_data_files(raw_data_files, raw_data_dirs)

    logger.close()

    return output_directories
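
# create_personal_results calls a _count_per_individual_samples helper (used
# above to avoid the otu_category_significance.py division-by-zero crash)
# whose body is not shown here. A minimal sketch under stated assumptions: it
# uses the parse_biom_table/parse_mapping_file parsers already imported in
# these examples, plus the older biom API's SampleIds attribute (matching the
# ObservationIds attribute used elsewhere in this document).
def _count_per_individual_samples(otu_table_f, mapping_f, personal_id_column,
                                  personal_id):
    table = parse_biom_table(otu_table_f)
    mapping_data, header, _ = parse_mapping_file(mapping_f)
    pid_idx = header.index(personal_id_column)

    # Sample IDs (first mapping file column) belonging to this individual.
    personal_sample_ids = set(row[0] for row in mapping_data
                              if row[pid_idx] == personal_id)

    # Count only those samples that actually appear in the OTU table.
    return len(personal_sample_ids & set(table.SampleIds))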
Example #31
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if (mapping_fp is None and valid_states is not None):
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
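        # Negation only applies to sample ID file input, so it is forced off
        # when filtering by a metadata description.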
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            o = open(sample_id_fp, 'U')
            sample_id_f_ids = set(
                [l.strip().split()[0] for l in o if not l.startswith('#')])
            o.close()
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table,
        sample_ids_to_keep,
        min_count,
        max_count,
        negate_ids_to_keep=negate_sample_id_fp)

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.ids())
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
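
# A hedged invocation example for the script above. The script name and the
# 'Treatment' column are assumptions; the long flag names are inferred from
# the opts attributes used in main() (e.g. opts.valid_states).
#
#   filter_samples_from_otu_table.py -i otu_table.biom -o filtered.biom \
#       -m map.txt --valid_states 'Treatment:Control' \
#       --output_mapping_fp filtered_map.txt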
Example #32
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    otu_table = parse_biom_table(qiime_open(otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample = compute_seqs_per_library_stats(
        otu_table, opts.num_otus
    )
    num_otus = len(otu_table.ObservationIds)

    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)

    num_samples = len(counts_per_sample)
    print "Num samples: %s" % str(num_samples)
    print "Num otus: %s" % str(num_otus)
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print "Num observations (sequences): %s" % str(num_observations)
        # TODO: port density functionality to a tested function. The
        # following is broken (the numerator should be the count of non-zero
        # cells, not the number of observations):
        # print 'Table density (fraction of non-zero values): %1.4f' % (num_observations/(num_samples * num_otus))
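        # A corrected sketch: Example #35 below sidesteps the issue by using
        # the table's own getTableDensity() method, which counts non-zero
        # cells:
        # print "Table density (fraction of non-zero values): %1.4f" % \
        #       otu_table.getTableDensity()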
    print

    if opts.num_otus:
        print "OTUs/sample summary:"
    else:
        print "Seqs/sample summary:"
    print " Min: %s" % str(min_counts)
    print " Max: %s" % str(max_counts)
    print " Median: %s" % str(median_counts)
    print " Mean: %s" % str(mean_counts)
    print " Std. dev.: %s" % (str(std(counts_per_sample_values)))
    print " Median Absolute Deviation: %s" % str(med_abs_dev)
    print " Default even sampling depth in\n  core_qiime_analyses.py (just a suggestion): %s" % str(even_sampling_depth)
    print ""
    if opts.num_otus:
        print "OTUs/sample detail:"
    else:
        print "Seqs/sample detail:"
    sorted_counts_per_sample = [(v, k) for k, v in counts_per_sample.items()]
    sorted_counts_per_sample.sort()
    total_count = 0
    for v, k in sorted_counts_per_sample:
        total_count += v
        print " %s: %s" % (k, str(v))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError("input mapping file supplied, but no path to" + " output file")
        f = open(opts.mapping_fp, "U")
        mapping_lines, headers, comments = parse_mapping_file(f)
        f.close()
        if len(headers) == 1:
            endoffset = 0  # if we only have the sample id, this data -> last col
        else:
            endoffset = 1  # usually make this data the penultimate column.
        headers.insert(len(headers) - endoffset, "NumIndividuals")
        for map_line in mapping_lines:
            sample_id = map_line[0]
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = "na"
            map_line.insert(len(map_line) - endoffset, depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        f = open(opts.output_mapping_fp, "w")
        f.write(new_map_str)
        f.close()
Example #33
def make_distance_boxplots(
    dm_f,
    map_f,
    fields,
    width=None,
    height=6.0,
    suppress_all_within=False,
    suppress_all_between=False,
    suppress_individual_within=False,
    suppress_individual_between=False,
    y_min=0.0,
    y_max=1.0,
    whisker_length=1.5,
    box_width=0.5,
    box_color=None,
    color_individual_within_by_field=None,
    sort=None,
):
    """Generates various types of boxplots for distance comparisons.

    Returns a list of tuples, one for each field. Each tuple contains the
    following:
        1) the name of the field (string)
        2) a matplotlib.figure.Figure object containing the boxplots
        3) a list of lists containing the raw plot data that was passed to mpl
        4) a list of labels for each of the boxplots (string)
        5) a list of mpl-compatible colors (one for each boxplot)

    The Figure can be saved, and the raw data and labels can be useful (for
    example) performing statistical tests or writing the raw data to disk.

    The input arguments are exactly derived from the make_distance_boxplots.py
    script (see the script options for details). To avoid duplicated effort,
    their descriptions are not reproduced here.
    """
    # Parse data files and do some preliminary error checking.
    dm_header, dm_data = parse_distmat(dm_f)
    map_data, map_header, map_comments = parse_mapping_file(map_f)

    if fields is None or len(fields) < 1:
        raise ValueError("You must provide at least one field to analyze.")

    for field in fields:
        if field not in map_header:
            raise ValueError(
                "The field '%s' is not in the provided mapping "
                "file. Please supply correct fields "
                "corresponding to fields in the mapping file." % field
            )

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = _cast_y_axis_extrema(y_min)
    y_max = _cast_y_axis_extrema(y_max)
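    # (_cast_y_axis_extrema is not shown here; based on the comment above it
    # presumably maps 'auto' to an automatic-scaling sentinel and casts
    # numeric values to float.)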

    # Collate the distributions of distances that will comprise each boxplot.
    # Suppress the generation of the indicated types of boxplots.
    results = []
    for field in fields:
        plot_data = []
        plot_labels = []
        plot_colors = []
        legend = None

        # Little bit of duplicate code here... not sure it's worth the effort
        # to clean up though.
        if not suppress_all_within:
            plot_data.append(get_all_grouped_distances(dm_header, dm_data, map_header, map_data, field, within=True))
            plot_labels.append("All within %s" % field)

            if color_individual_within_by_field is not None:
                plot_colors.append(None)
            else:
                plot_colors.append(box_color)

        if not suppress_all_between:
            plot_data.append(get_all_grouped_distances(dm_header, dm_data, map_header, map_data, field, within=False))
            plot_labels.append("All between %s" % field)

            if color_individual_within_by_field is not None:
                plot_colors.append(None)
            else:
                plot_colors.append(box_color)

        if not suppress_individual_within:
            within_dists = get_grouped_distances(dm_header, dm_data, map_header, map_data, field, within=True)
            field_states = []
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                field_states.append(grouping[0])

            # If we need to color these boxplots by a field, build up a
            # list of colors and a legend.
            if color_individual_within_by_field is not None:
                colors, color_mapping = _color_field_states(
                    format_mapping_file(map_header, map_data).split("\n"),
                    dm_header,
                    field,
                    field_states,
                    color_individual_within_by_field,
                )
                plot_colors.extend(colors)
                legend = (color_mapping.values(), color_mapping.keys())
            else:
                plot_colors.extend([box_color] * len(field_states))

        if not suppress_individual_between:
            between_dists = get_grouped_distances(dm_header, dm_data, map_header, map_data, field, within=False)

            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

                if color_individual_within_by_field is not None:
                    plot_colors.append(None)
                else:
                    plot_colors.append(box_color)

        assert len(plot_data) == len(plot_labels) and len(plot_labels) == len(plot_colors), (
            "The number of boxplot labels and colors does not match the number of boxplots."
        )

        # We now have our data and labels ready, so plot them!
        if plot_data:
            if sort is not None:
                plot_data, plot_labels, plot_colors = _sort_distributions(plot_data, plot_labels, plot_colors, sort)

            if width is None:
                width = len(plot_data) * box_width + 2
            if width <= 0 or height <= 0:
                raise ValueError("The specified width and height of the plot " "must be greater than zero.")

            plot_figure = boxplots(
                plot_data,
                x_tick_labels=plot_labels,
                title="%s Distances" % field,
                x_label="Grouping",
                y_label="Distance",
                x_tick_labels_orientation="vertical",
                y_min=y_min,
                y_max=y_max,
                whisker_length=whisker_length,
                box_width=box_width,
                box_colors=plot_colors,
                figure_width=width,
                figure_height=height,
                legend=legend,
            )

            results.append((field, plot_figure, plot_data, plot_labels, plot_colors))
        else:
            raise ValueError(
                "The generation of all plots was suppressed. At " "least one type of plot must be unsuppressed."
            )

    return results
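
# A minimal usage sketch for make_distance_boxplots. The file names and the
# 'Treatment' field are hypothetical; saving through the returned Figure
# objects follows the docstring's note that each Figure can be saved.
if __name__ == "__main__":
    with open("dm.txt", "U") as dm_f, open("map.txt", "U") as map_f:
        results = make_distance_boxplots(dm_f, map_f, ["Treatment"])

    for field, figure, plot_data, plot_labels, plot_colors in results:
        # Each element is a regular matplotlib Figure, so any supported
        # output format works here.
        figure.savefig("%s_distance_boxplots.png" % field)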
Example #34
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    otu_table = parse_biom_table(qiime_open(otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
     compute_seqs_per_library_stats(otu_table, opts.num_otus)
    num_otus = len(otu_table.ObservationIds)

    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)

    num_samples = len(counts_per_sample)
    print 'Num samples: %s' % str(num_samples)
    print 'Num otus: %s' % str(num_otus)
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print 'Num observations (sequences): %s' % str(num_observations)
        # TODO: port density functionality to a tested function. The
        # following is broken (the numerator should be the count of non-zero
        # cells, not the number of observations); see the corrected sketch in
        # Example #32 above.
        #print 'Table density (fraction of non-zero values): %1.4f' % (num_observations/(num_samples * num_otus))
    print

    if opts.num_otus:
        print 'OTUs/sample summary:'
    else:
        print 'Seqs/sample summary:'
    print ' Min: %s' % str(min_counts)
    print ' Max: %s' % str(max_counts)
    print ' Median: %s' % str(median_counts)
    print ' Mean: %s' % str(mean_counts)
    print ' Std. dev.: %s' % (str(std(counts_per_sample_values)))
    print ' Median Absolute Deviation: %s' % str(med_abs_dev)
    print ' Default even sampling depth in\n  core_qiime_analyses.py (just a suggestion): %s' %\
     str(even_sampling_depth)
    print ''
    if opts.num_otus:
        print 'OTUs/sample detail:'
    else:
        print 'Seqs/sample detail:'
    sorted_counts_per_sample = [(v, k) for k, v in counts_per_sample.items()]
    sorted_counts_per_sample.sort()
    total_count = 0
    for v, k in sorted_counts_per_sample:
        total_count += v
        print ' %s: %s' % (k, str(v))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError('input mapping file supplied, but no path to'+\
             ' output file')
        f = open(opts.mapping_fp, 'U')
        mapping_lines, headers, comments = parse_mapping_file(f)
        f.close()
        if len(headers) == 1:
            endoffset = 0  # if we only have the sample id, this data -> last col
        else:
            endoffset = 1  # usually make this data the penultimate column.
        headers.insert(len(headers) - endoffset, 'NumIndividuals')
        for map_line in mapping_lines:
            sample_id = map_line[0]
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = 'na'
            map_line.insert(len(map_line) - endoffset, depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        f = open(opts.output_mapping_fp, 'w')
        f.write(new_map_str)
        f.close()
Example #35
def main():
    option_parser, opts,args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    otu_table = parse_biom_table(qiime_open(otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
     compute_seqs_per_library_stats(otu_table, opts.num_otus)
    num_otus = len(otu_table.ObservationIds)
    
    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)
    
    try:
        sample_md_keys = otu_table.SampleMetadata[0].keys()
    except TypeError:
        sample_md_keys = ["None provided"]
    try:
        observation_md_keys = otu_table.ObservationMetadata[0].keys()
    except TypeError:
        observation_md_keys = ["None provided"]
    
    num_samples = len(counts_per_sample)
    print 'Num samples: %s' % str(num_samples)
    print 'Num otus: %s' % str(num_otus)
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print 'Num observations (sequences): %s' % str(num_observations)
        print 'Table density (fraction of non-zero values): %1.4f' % \
              otu_table.getTableDensity()
    print

    if opts.num_otus:
        print 'OTUs/sample summary:'
    else:
        print 'Seqs/sample summary:' 
    print ' Min: %s' % str(min_counts)
    print ' Max: %s' % str(max_counts)
    print ' Median: %s' % str(median_counts)
    print ' Mean: %s' % str(mean_counts)
    print ' Std. dev.: %s' % (str(std(counts_per_sample_values)))
    print ' Median Absolute Deviation: %s' % str(med_abs_dev)
    print ' Default even sampling depth in\n  core_qiime_analyses.py (just a suggestion): %s' %\
     str(even_sampling_depth)
    print ' Sample Metadata Categories: %s' % '; '.join(sample_md_keys)
    print ' Observation Metadata Categories: %s' % '; '.join(observation_md_keys)
     
    print ''
    if opts.num_otus:
        print 'OTUs/sample detail:'
    else:
        print 'Seqs/sample detail:'
    sorted_counts_per_sample = [(v,k) for k,v in counts_per_sample.items()]
    sorted_counts_per_sample.sort()
    total_count = 0
    for v,k in sorted_counts_per_sample:
        total_count += v
        print ' %s: %s' % (k,str(v))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError('input mapping file supplied, but no path to'+\
             ' output file')
        f = open(opts.mapping_fp,'U')
        mapping_lines, headers, comments = parse_mapping_file(f)
        f.close()
        if len(headers)==1:
            endoffset = 0 # if we only have the sample id, this data -> last col
        else:
            endoffset = 1 # usually make this data the penultimate column.
        headers.insert(len(headers)-endoffset,'SequenceCount')
        for map_line in mapping_lines:
            sample_id = map_line[0]
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = 'na'
            map_line.insert(len(map_line)-endoffset,depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        f = open(opts.output_mapping_fp, 'w')
        f.write(new_map_str)
        f.close()