Example #1
    def test_parse_mapping_file(self):
        """parse_mapping_file functions as expected"""
        s1 = ['#sample\ta\tb', '#comment line to skip',\
              'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        exp = ([['x','y','z'],['i','j','k']],\
               ['sample','a','b'],\
               ['comment line to skip','more skip'])
        obs = parse_mapping_file(s1)
        self.assertEqual(obs, exp)

        # We don't currently support this, but we should soon...
        # # check that first non-comment, non-blank line is used as
        # # header
        # s1 = ['sample\ta\tb', '#comment line to skip',\
        #       'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        # exp = ([['x','y','z'],['i','j','k']],\
        #        ['sample','a','b'],\
        #        ['comment line to skip','more skip'])
        # obs = parse_mapping_file(s1)
        # self.assertEqual(obs, exp)

        # check that we strip double quotes by default
        s2 = ['#sample\ta\tb', '#comment line to skip',\
              '"x "\t" y "\t z ', ' ', '"#more skip"', 'i\t"j"\tk']
        obs = parse_mapping_file(s2)
        self.assertEqual(obs, exp)
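These tests pin down parse_mapping_file's contract. As a quick reference, here is a minimal reimplementation sketch of that documented behavior (this is not QIIME's actual implementation; the function name is made up):

# Minimal sketch of the behavior the tests above document (not QIIME's
# implementation): quotes are stripped first (by default), the first
# '#' line is the header, later '#' lines are comments, blank lines
# are skipped, and every field is whitespace-stripped.
def parse_mapping_file_sketch(lines, strip_quotes=True):
    data, header, comments = [], [], []
    for line in lines:
        if strip_quotes:
            line = line.replace('"', '')
        line = line.strip()
        if not line:
            continue
        if line.startswith('#'):
            if header:
                comments.append(line[1:].strip())
            else:
                header = [f.strip() for f in line[1:].split('\t')]
        else:
            data.append([f.strip() for f in line.split('\t')])
    return data, header, comments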
Example #2
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """

    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case " +
                           "and white-space sensitive). \n\tProvided field: " +
                           "%s. \n\tValid fields: %s" % (mapping_field, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse mapping file each time through the loop as filtering operates
        # on values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
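A sketch of how this generator might be consumed, assuming the QIIME 1.x helpers it calls are importable; the mapping lines and output paths here are hypothetical:

mapping_lines = ['#SampleID\tTreatment\tDescription',
                 'S1\tControl\tfirst',
                 'S2\tFast\tsecond']
for field_value, mapping_file_str in split_mapping_file_on_field(
        mapping_lines, 'Treatment'):
    # one mapping file per Treatment value, e.g. mapping_Control.txt
    with open('mapping_%s.txt' % field_value, 'w') as f:
        f.write(mapping_file_str)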
Example #3
    def test_make_otu_table_with_sample_metadata(self):
        # Want to make sure that the order of the sample IDs in the OTU
        # map and the order of the IDs in the mapping file do not matter
        otu_map_lines = """0	ABC_0	DEF_1
1	ABC_1
x	GHI_2	GHI_3	GHI_77
z	DEF_3	XYZ_1""".split('\n')
        mapping_f = StringIO(MAPPING_FILE)
        sample_ids = ['ABC', 'DEF', 'GHI', 'XYZ']
        data = [[1, 1, 0, 0], [1, 0, 0, 0], [0, 0, 3, 0], [0, 1, 0, 1]]

        map_data, map_header, map_comments = parse_mapping_file(mapping_f)
        sample_metadata = mapping_file_to_dict(map_data, map_header)

        sample_md = [sample_metadata[sample_id] for sample_id in sample_ids]

        obs = make_otu_table(otu_map_lines, sample_metadata=sample_metadata)
        exp = Table(data, ['0', '1', 'x', 'z'],
                    sample_ids,
                    sample_metadata=sample_md,
                    input_is_dense=True)

        self.assertEqual(obs, exp)

        # Test with a mapping file that is missing a sample's metadata,
        # make sure it raises the KeyError
        mapping_f = StringIO(MAPPING_FILE_MISSING_SAMPLE)
        map_data, map_header, map_comments = parse_mapping_file(mapping_f)
        sample_metadata = mapping_file_to_dict(map_data, map_header)

        with self.assertRaises(KeyError):
            obs = make_otu_table(otu_map_lines,
                                 sample_metadata=sample_metadata)
Example #5
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """
    
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f,mapping_field)
    
    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
    
    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case "+\
                "and white-space sensitive). \n\tProvided field: "+\
                "%s. \n\tValid fields: %s" % (mapping_field,' '.join(mapping_headers)))
    
    for v in mapping_values:
        v_fp_str = v.replace(' ','_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f,valid_states_str="%s:%s" % (mapping_field,v))
        
        # parse mapping file each time through the loop as filtering operates on values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
                                         mapping_data, 
                                         mapping_headers,
                                         sample_ids_to_keep,
                                         include_repeat_cols=include_repeat_cols, 
                                         column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
Example #6
def test_wrapper(test, otu_table, category_mapping, category, threshold, \
                 _filter, otu_include=None):
    """runs statistical test to look for category/OTU associations"""

    if test == 'ANOVA' or test == 'correlation':
        otu_table = convert_OTU_table_relative_abundance(otu_table)
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        mapping_data, header, comments = parse_mapping_file(category_mapping)
        category_info, category_values = \
            get_category_info(mapping_data, header, category, threshold)
        OTU_list = filter_OTUs(otu_sample_info, _filter, all_samples=False, \
            category_mapping_info=category_info)
    elif test == 'g_test':
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        mapping_data, header, comments = parse_mapping_file(category_mapping)
        category_info, category_values = \
            get_category_info(mapping_data, header, category, threshold)
        OTU_list = filter_OTUs(otu_sample_info, _filter, all_samples=True, \
            category_mapping_info=category_info)
    else:
        raise ValueError(
            "An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, and g_test."
        )

    #filter OTU_list with the otu_include list
    if otu_include:
        otu_include = [line.strip() for line in otu_include]
        OTU_list = [OTU for OTU in OTU_list if OTU in otu_include]
    if len(OTU_list) == 0:
        raise ValueError(
            "No OTUs remain after applying the filter. Try lowering the filter value (-f option)"
        )
    if test == 'ANOVA':
        results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info, \
                        category_values)
        output = output_results_ANOVA(results, category_values, taxonomy_info)
    elif test == 'correlation':
        results = run_correlation_OTUs(OTU_list, category_info,
                                       otu_sample_info)
        output = output_results_correlation(results, taxonomy_info)
    elif test == 'g_test':
        results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info, \
                         category_values)
        output = output_results_G_test(results, taxonomy_info)
    return output
Example #7
    def test_parse_mapping_file(self):
        """parse_mapping_file functions as expected"""
        s1 = ['#sample\ta\tb', '#comment line to skip',\
              'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        exp = ([['x','y','z'],['i','j','k']],\
               ['sample','a','b'],\
               ['comment line to skip','more skip'])
        obs = parse_mapping_file(s1)
        self.assertEqual(obs, exp)

        # check that we strip double quotes by default
        s2 = ['#sample\ta\tb', '#comment line to skip',\
              '"x "\t" y "\t z ', ' ', '"#more skip"', 'i\t"j"\tk']
        obs = parse_mapping_file(s2)
        self.assertEqual(obs, exp)

    def test_get_category_info(self):
        """get_category_info works"""
        category_mapping = """#SampleID\tcat1\tcat2
sample1\tA\t0
sample2\tB\t8.0
sample3\tC\t1.0""".split('\n')
        mapping_data, header, comments = parse_mapping_file(category_mapping)
        result, cat_vals = get_category_info(mapping_data, header, 'cat1')
        self.assertEqual(result, {'sample1': 'A', 'sample3': 'C', 'sample2': 'B'})
        self.assertEqual(cat_vals, (['A', 'B', 'C']))
        mapping_data, header, comments = parse_mapping_file(category_mapping)
        result, cat_vals = get_category_info(mapping_data, header, \
                        'cat2', threshold=5.0)
        self.assertEqual(result, {'sample1': '0', 'sample3': '0', 'sample2': '1'})
        self.assertEqual(cat_vals, (['0', '1']))
Example #9
    def test_parse_mapping_file(self):
        """parse_mapping_file functions as expected"""
        s1 = ['#sample\ta\tb', '#comment line to skip',\
              'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        exp = ([['x','y','z'],['i','j','k']],\
               ['sample','a','b'],\
               ['comment line to skip','more skip'])
        obs = parse_mapping_file(s1)
        self.assertEqual(obs, exp)

        #check that we strip double quotes by default
        s2 = ['#sample\ta\tb', '#comment line to skip',\
              '"x "\t" y "\t z ', ' ', '"#more skip"', 'i\t"j"\tk']
        obs = parse_mapping_file(s2)
        self.assertEqual(obs, exp)
Example #10
def merge_mapping_files(mapping_files, no_data_value="no_data"):
    """ Merge list of mapping files into a single mapping file 
    
        mapping_files: open file objects containing mapping data
        no_data_value: value to be used in cases where there is no
         mapping field associated with a sample ID (default: 'no_data')
    """
    mapping_data = {}
    all_headers = []
    result = []

    # iterate over mapping files, parsing each
    for mapping_file in mapping_files:
        current_data, current_headers, current_comments = parse_mapping_file(mapping_file, strip_quotes=False)
        all_headers += current_headers
        for entry in current_data:
            sample_id = entry[0]
            current_values = {}
            for header, value in zip(current_headers[1:], entry[1:]):
                current_values[header] = value
            if sample_id in mapping_data:
                # if the sample id has already been seen, confirm that
                # there is no conflicting values across the different
                # mapping files (e.g., pH=5.0 and pH=6.0)- if there is,
                # raise a ValueError
                previous_data = mapping_data[sample_id]
                for header, value in current_values.items():
                    if header in previous_data and value != previous_data[header]:
                        raise ValueError, "Different values provided for %s for sample %s in different mapping files." % (
                            header,
                            sample_id,
                        )
                mapping_data[sample_id].update(current_values)
            else:
                mapping_data[sample_id] = current_values
    all_headers = {}.fromkeys(all_headers)

    # remove and place the fields whose order is important
    ordered_beginning = []
    for e in ["SampleID", "BarcodeSequence", "LinkerPrimerSequence"]:
        try:
            del all_headers[e]
            ordered_beginning.append(e)
        except KeyError:
            pass

    ordered_end = []
    for e in ["Description"]:
        try:
            del all_headers[e]
            ordered_end.append(e)
        except KeyError:
            pass
    ordered_headers = ordered_beginning + list(all_headers) + ordered_end

    # generate the mapping file lines containing all fields
    result.append("#" + "\t".join(ordered_headers))
    for sample_id, data in mapping_data.items():
        result.append("\t".join([sample_id] + [data.get(h, no_data_value) for h in ordered_headers[1:]]))
    return result
Example #11
    def test_longitudinal_otu_table_conversion_wrapper(self):
        """londitudinal_otu_table_conversion_wrapper works
        """
        mapping_lines = """#SampleID\tindividual\ttimepoint_zero\ttimepoint
AT0\tA\t1\t0
AT1\tA\t0\t1
AT2\tA\t0\t2
BT0\tB\t1\t0
BT1\tB\t0\t1
BT2\tB\t0\t2
""".split('\n')
        category_mapping = parse_mapping_file(mapping_lines)
        otu_table = """{"rows": [{"id": "0", "metadata": null}, {"id": "1", "metadata": null}, {"id": "2", "metadata": null}, {"id": "3", "metadata": null}, {"id": "4", "metadata": null}], "format": "Biological Observation Matrix 1.0.0", "data": [[0, 0, 1.0], [0, 1, 2.0], [0, 2, 3.0], [1, 3, 1.0], [1, 4, 2.0], [1, 5, 3.0], [2, 0, 1.0], [2, 1, 2.0], [2, 2, 3.0], [2, 4, 1.0], [2, 5, 2.0], [3, 0, 2.0], [3, 1, 4.0], [3, 2, 6.0], [3, 4, 1.0], [3, 5, 2.0], [4, 0, 3.0], [4, 1, 2.0], [4, 2, 1.0], [4, 3, 6.0], [4, 4, 4.0], [4, 5, 2.0]], "columns": [{"id": "AT0", "metadata": null}, {"id": "AT1", "metadata": null}, {"id": "AT2", "metadata": null}, {"id": "BT0", "metadata": null}, {"id": "BT1", "metadata": null}, {"id": "BT2", "metadata": null}], "generated_by": "BIOM-Format 1.0.0-dev", "matrix_type": "sparse", "shape": [5, 6], "format_url": "http://biom-format.org", "date": "2012-08-01T09:14:03.574451", "type": "OTU table", "id": null, "matrix_element_type": "float"}"""

        otu_table = parse_biom_table_str(otu_table)
        new_otu_table = longitudinal_otu_table_conversion_wrapper(otu_table,
                                                                  category_mapping, 'individual', 'timepoint_zero')
        new_otu_table = str(new_otu_table).split('\n')
        self.assertEqual(new_otu_table[0], "# Constructed from biom file")
        data_line1 = new_otu_table[2].split('\t')
        self.assertFloatEqual(float(data_line1[0]), 0.0)
        # sets the reference to 0
        self.assertFloatEqual(float(data_line1[1]), 0.0)
        # subtracts values from same individual from the reference
        self.assertFloatEqual(float(data_line1[2]), 0.05714286)
        # sets to ignore number when not observed across a person
        self.assertFloatEqual(float(data_line1[4]), 999999999.0)
Example #12
def _collapse_metadata(mapping_f, collapse_fields):
    """ Load a mapping file into a DataFrame and then collapse rows

    Parameters
    ----------
    mapping_f : file handle or filepath
        The sample metadata mapping file.
    collapse_fields : iterable
        The fields to combine when collapsing samples. For each sample in the
        mapping_f, the ordered values from these columns will be tuplized and
        used as the group identifier. Samples whose tuplized values in these
        fields are identical will be grouped.

    Returns
    -------
    pd.DataFrame
        Sample metadata resulting from the collapse operation.

    Raises
    ------
    KeyError
        If sample_id_field or any of the collapse fields are not column headers
        in mapping_f.

    """
    mapping_data, header, _ = parse_mapping_file(mapping_f)
    sample_md = pd.DataFrame(mapping_data, columns=header)
    grouped = sample_md.groupby(collapse_fields)
    collapsed_md = grouped.agg(lambda x: tuple(x))
    return collapsed_md
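A small sketch of what the collapse produces, using inline mapping lines (hypothetical data; parse_mapping_file accepts a list of lines, as the tests elsewhere on this page show):

import pandas as pd  # the function above assumes this alias

mapping_lines = ['#SampleID\tSubject\tReplicate',
                 'S1\tA\t1',
                 'S2\tA\t2',
                 'S3\tB\t1']
collapsed = _collapse_metadata(mapping_lines, ['Subject'])
# Rows S1/S2 collapse into the 'A' group; each remaining column holds
# the tuple of collapsed values, e.g. SampleID -> ('S1', 'S2').
print(collapsed)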
Example #13
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create the output directories
    try:
        makedirs(opts.output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # This check helps users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose"
                " a different directory, or force overwrite with -f.")

    # verify that category is in mapping file
    map_list = parse_mapping_file(open(opts.mapping_file,'U').readlines())
    if not opts.category in map_list[1][1:]:
        print "Category '%s' not found in mapping file columns:" %(opts.category)
        print map_list[1][1:]
        exit(1)

    # run the supervised learning algorithm
    result = run_supervised_learning(opts.input_data, opts.mapping_file, opts.category,
            ntree=opts.ntree, errortype=opts.errortype,
            output_dir=opts.output_dir, verbose=opts.verbose)
    def test_run_single_paired_T_test(self):
        """run_single_paired_T_test works
        """
        cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
        otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table, float)
        mapping_data, header, comments = parse_mapping_file(cat_mapping)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        OTU_list = ['0', '1', '2']
        #should return the results since there should be 4 values to evaluate
        result = run_single_paired_T_test('0', mapping_data, header, \
            'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
            999999999.0, 4)
        self.assertEqual(len(result), 4)
        self.assertFloatEqual(result[1], 0.12566591637800242)
        self.assertFloatEqual(result[2], [0.29999999999999999, 0.20000000000000001])
        self.assertEqual(result[3], 2)
        #check the the filter works
        result = run_single_paired_T_test('0', mapping_data, header, \
            'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
            999999999.0, 5)
        self.assertEqual(result, None)
Example #15
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    output_fp = opts.output_fp

    map_data, header, comments = parse_mapping_file(opts.mapping_file)

    if opts.category not in header:
        option_parser.error("%s doesn't appear to exist in the mapping file!" %
                            opts.category)

    # use stdout or the user supplied file path
    if output_fp:
        fd = open(output_fp, 'w')
    else:
        fd = stdout

    result = defaultdict(int)
    cat_idx = header.index(opts.category)
    for samp in map_data:
        result[samp[cat_idx]] += 1

    for cat_val in natsort(result):
        if not cat_val:
            fd.write("***UNSPECIFIED***\t%d\n" % result[cat_val])
        else:
            fd.write("%s\t%d\n" % (cat_val, result[cat_val]))

    fd.close()
Example #16
def _collate_cluster_pcoa_plot_data(coords_f, map_f, category):
    pc_data = parse_coords(coords_f)
    coords_d = dict(zip(pc_data[0], pc_data[1]))

    map_data = parse_mapping_file(map_f)
    full_map_data = [map_data[1]]
    full_map_data.extend(map_data[0])

    sid_map = group_by_field(full_map_data, category)
    sorted_states = sorted(sid_map.keys())

    color_pool = get_color_pool()
    if len(sorted_states) > len(color_pool):
        raise ValueError("Not enough colors to uniquely color sample "
                         "groups.")

    results = []
    for state, color in zip(sorted_states,
                            color_pool[:len(sorted_states)]):
        sids = sid_map[state]
        xs = [coords_d[sid][0] for sid in sids]
        ys = [coords_d[sid][1] for sid in sids]
        results.append((xs, ys, color, state))

    return results
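A hypothetical consumer of the (xs, ys, color, state) tuples returned above; the matplotlib plotting and the file paths are assumptions, not part of the original:

import matplotlib.pyplot as plt

coords_f = open('pcoa_coords.txt')  # hypothetical principal-coordinates file
map_f = open('mapping.txt')         # hypothetical mapping file
for xs, ys, color, state in _collate_cluster_pcoa_plot_data(
        coords_f, map_f, 'Treatment'):
    plt.scatter(xs, ys, c=color, label=state)
plt.legend()
plt.savefig('pcoa_clusters.png')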
Example #17
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    map_fp = opts.mapping
    biom_fp = opts.biom_file
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name

    cleaned_fp = opts.clean_fp
    verbose = opts.verbose

    map_data, headers, comments = parse_mapping_file(open(map_fp, 'U'))
    biom_table = parse_biom_table(open(biom_fp, 'U'))

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(map_data, headers,\
        biom_table.SampleIds, include_repeat_cols=False)

    if subject_category not in real_map_headers:
        raise ValueError, 'This column: %s is not in the mapping file, try %s'%\
            (subject_category, real_map_headers)

    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    results = make_selectors(sorted_counts_per_sample, min_seqs_sample,\
        mapping_file_tuple, subject_category, verbose=verbose)

    # save the output
    fout = open(cleaned_fp,'w')
    fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
    fout.write('\n'.join(results))
    fout.close()
    def test_get_sample_individual_info(self):
        """get_sample_individual_info works
        """
        mapping_lines = """#SampleID\tindividual\ttimepoint_zero\ttimepoint
AT0\tA\t1\t0
AT1\tA\t0\t1
AT2\tA\t0\t2
BT0\tB\t1\t0
BT1\tB\t0\t1
BT2\tB\t0\t2
""".split('\n')
        mapping_data, header, comments = parse_mapping_file(mapping_lines)
        samples_from_subject, samples_to_subtract = \
            get_sample_individual_info(mapping_data, header, 'individual', \
                'timepoint_zero')
        self.assertEqual(
            samples_from_subject, {
                'BT1': ['BT0', 'BT1', 'BT2'],
                'BT0': ['BT0', 'BT1', 'BT2'],
                'BT2': ['BT0', 'BT1', 'BT2'],
                'AT2': ['AT0', 'AT1', 'AT2'],
                'AT0': ['AT0', 'AT1', 'AT2'],
                'AT1': ['AT0', 'AT1', 'AT2']
            })
        self.assertEqual(
            samples_to_subtract, {
                'BT1': 'BT0',
                'BT0': 'BT0',
                'BT2': 'BT0',
                'AT2': 'AT0',
                'AT0': 'AT0',
                'AT1': 'AT0'
            })
Example #19
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    map_file_name, otu_file_name, valid_states_str = \
        opts.map_fname, opts.otu_table_fp, opts.valid_states
    map_infile = open(map_file_name, 'U')
    otu_infile = open(otu_file_name, 'U')

    map_out_fname = opts.map_out_fname
    otu_out_fname = opts.otu_out_fname
    
    if map_out_fname is None:
        map_out_fname = map_file_name + '.pooled.txt'

    if otu_out_fname is None:
        otu_out_fname = otu_file_name + '.pooled.txt'

    # write out the filtered mapping file
    map_outfile = open(map_out_fname, 'w')
    otu_outfile = open(otu_out_fname, 'w')

    map_data, map_header, map_comments = parse_mapping_file(map_infile)
    map_infile.close()
    map_infile = open(map_file_name, 'U') # reopen for later
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids_to_pool = get_sample_ids(map_data, map_header, valid_states)
    
    pool_map(map_infile, map_outfile,
        opts.pooled_sample_name, sample_ids_to_pool)
    pool_otu_table(otu_infile, otu_outfile,
        opts.pooled_sample_name, sample_ids_to_pool)
Example #20
def filter_otus_and_map(map_infile, otu_infile, map_outfile, otu_outfile, 
    valid_states_str, num_seqs_per_otu):
    """Filters OTU and map files according to specified criteria."""
    map_data, map_header, map_comments = parse_mapping_file(map_infile)
    map_infile.close()
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)

    # write out the filtered mapping file
    out_headers, out_data = filter_map(map_data, map_header, sample_ids)
    header_line = '#' + '\t'.join(out_headers)
    map_outfile.write('\n'.join([header_line] + map('\t'.join, out_data)))
    if not isinstance(map_outfile, StringIO):
        map_outfile.close()

    # write out the filtered OTU file
    for line in otu_infile:
        if line.startswith('#OTU ID'):
            fields = map(strip, line.split('\t'))
            cols = find_good_cols(line, sample_ids)
            filter_line(line, cols, min_count=None, outfile=otu_outfile)
        elif line.startswith('#'):
            otu_outfile.write(line)
        else:
            filter_line(line, cols, min_count=num_seqs_per_otu, 
                outfile=otu_outfile)
    if not isinstance(otu_outfile, StringIO):
        otu_outfile.close()
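A sketch of driving the filter with in-memory files (Python 2, matching the StringIO checks above); the "Field:value" state string follows the syntax used elsewhere on this page, and the data are hypothetical:

from StringIO import StringIO  # Python 2, as the code above implies

map_in = StringIO('#SampleID\tTreatment\tDescription\n'
                  'S1\tControl\tfirst\n'
                  'S2\tFast\tsecond\n')
otu_in = StringIO('#OTU ID\tS1\tS2\n'
                  '0\t5\t2\n'
                  '1\t0\t9\n')
map_out, otu_out = StringIO(), StringIO()
filter_otus_and_map(map_in, otu_in, map_out, otu_out,
                    'Treatment:Control', num_seqs_per_otu=1)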
Example #21
def pool_map(map_infile, map_outfile,
    pooled_sample_name, sample_ids_to_pool):
    """pools map file according to specified criteria."""
    map_data, map_header, map_comments = parse_mapping_file(map_infile)
    map_infile.close()
    # valid_states = parse_metadata_state_descriptions(valid_states_str)
    # sample_ids = get_sample_ids(map_data, map_header, valid_states)

    # write out the filtered mapping file
    sample_id_idx = map_header.index('SampleID')

    # separate the samples to be pooled from the rest (new_map_data)
    new_map_data = []
    pooled_map_data = []
    for sam in map_data:
        if sam[sample_id_idx] in sample_ids_to_pool:
            pooled_map_data.append(sam)
        else:
            new_map_data.append(sam)
    
    # make the new pooled sample
    newsam = ['multipleValues'] * len(map_header)

    for i in range(len(map_header)):
        pooled_vals = [sam[i] for sam in pooled_map_data]
        if len(set(pooled_vals)) == 1:
            newsam[i] = pooled_vals[0]

    newsam[sample_id_idx] = pooled_sample_name
    
    new_map_data.append(newsam)

    header_line = '#' + '\t'.join(map_header)
    map_outfile.write('\n'.join([header_line] + map('\t'.join, new_map_data)))
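A usage sketch with in-memory files (hypothetical data):

from StringIO import StringIO  # Python 2, matching the map() call above

map_in = StringIO('#SampleID\tTreatment\tDescription\n'
                  'S1\tControl\tfirst\n'
                  'S2\tControl\tsecond\n'
                  'S3\tFast\tthird\n')
map_out = StringIO()
pool_map(map_in, map_out, 'pooled_controls', ['S1', 'S2'])
# S1/S2 collapse into one row: Treatment stays 'Control' (shared by
# both samples), Description becomes 'multipleValues'.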
Example #22
def main():
    option_parser, opts, args =\
      parse_command_line_parameters(**script_info)

    otu_table_data = parse_biom_table(open(opts.input_otu_table,'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp
    
    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp,'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp,'U'))
        result = sort_otu_table(otu_table_data,
                                sorted_sample_ids)
    else:
        result = sort_otu_table(otu_table_data,
                        natsort_case_insensitive(otu_table_data.SampleIds))
    
    # format and write the otu table
    result_str = format_biom_table(result)
    of = open(opts.output_fp,'w')
    of.write(result_str)
    of.close()
Example #23
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """ Given a description of metadata, return the corresponding sample ids
    """
    map_data, map_header, map_comments = parse_mapping_file(mapping_f)
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)
    return sample_ids
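A usage sketch; the "Field:value" query syntax matches the valid_states_str built in Example #2, and the mapping lines are hypothetical:

mapping_lines = ['#SampleID\tTreatment\tDOB',
                 'PC.354\tControl\t20061218',
                 'PC.607\tFast\t20071112']
# expected to return the ids whose Treatment field is 'Control'
ids = sample_ids_from_metadata_description(mapping_lines,
                                           'Treatment:Control')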
Example #25
def get_sample_cat_info(lines, category):
    cat_by_sample = {}
    sample_by_cat = defaultdict(list)
    meta_dict = {}
    num_samples_by_cat = defaultdict(int)
    label_lists_dict = defaultdict(list)
    mapping_data, header, comments = parse_mapping_file(lines)

    category_labels = header[1:]
    index = category_labels.index(category) + 1

    for line in mapping_data:
        categories = line[0:len(category_labels) + 1]
        sample = categories[0].strip()
        meta_dict[sample] = [(categories[index], 0)]

        cat_by_sample[sample] = [(l.strip(),c.strip()) \
                             for l,c in zip(category_labels,categories[1:])]

        cat_list = []
        for i, (l, c) in enumerate(zip(category_labels, categories[1:])):
            if c not in label_lists_dict[l]:
                label_lists_dict[l].append(c)
            l = l.strip()
            c = c.strip()
            cat_list.append((l, c))
            sample_by_cat[(l, c)].append(sample)
            num_samples_by_cat[(l, c)] += 1

        cat_by_sample[sample] = cat_list

    return cat_by_sample, sample_by_cat, len(
        category_labels), meta_dict, label_lists_dict, num_samples_by_cat
    def test_output_results_paired_T_test(self):
        """output_results_paired_T_test works
        """
        cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
        otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table, float)
        mapping_data, header, comments = parse_mapping_file(cat_mapping)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        OTU_list = ['0', '1', '2']
        all_results = run_paired_T_test_OTUs(OTU_list, mapping_data, header, \
            'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
            999999999.0, 4)
        output = output_results_paired_T_test(all_results)
        #of = open('/Users/lozupone/temp_output.xls', 'w')
        #of.write('\n'.join(output))
        #of.close()
        self.assertEqual(output, ['OTU\tprob\tT stat\taverage_diff\tnum_pairs\tBonferroni_corrected\tFDR_corrected', '0\t0.125665916378\t-5.0\t0.25\t2\t0.251331832756\t0.251331832756', '2\t0.685730319473\t0.468164588785\t-0.133333333333\t3\t1.37146063895\t0.685730319473'])
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    coords_fp = opts.input_coords
    mapping_fp = opts.mapping_fp
    output_fp = opts.output_fp
    valid_states = opts.valid_states
    negate = opts.negate
    mapping_header_name = opts.mapping_header_name

    coords_ids, coords, eigen_values, pct_exp = parse_coords(open(coords_fp, "U"))

    data, headers, _ = parse_mapping_file(open(mapping_fp, "U"))

    if mapping_fp and valid_states:
        valid_sample_ids = sample_ids_from_metadata_description(open(mapping_fp, "U"), valid_states)

    valid_coords_ids, valid_coords = filter_sample_ids_from_coords(coords_ids, coords, valid_sample_ids, negate)

    if mapping_header_name:
        sorted_sample_ids = sort_sample_ids(data, headers, mapping_header_name)
        sorted_coord_ids, sorted_coords = sort_coords(valid_coords_ids, valid_coords, sorted_sample_ids)
        valid_coords_ids, valid_coords = sorted_coord_ids, sorted_coords

    lines = format_coords(valid_coords_ids, valid_coords, eigen_values, pct_exp)
    fd = open(output_fp, "w")
    fd.writelines(lines)
    fd.close()
Example #28
    def setUp(self):
        """Load data created on the fly with the biom.table.Table."""
        self.bt1 = TEST_TABLE1
        self.bt2 = TEST_TABLE2
        mdata, mheaders, _ = parse_mapping_file(TEST_MF.split('\n'))
        self.mdata = array(mdata)
        self.mheaders = mheaders
Example #30
def merge_mapping_files(mapping_files,no_data_value='no_data'):
    """ Merge list of mapping files into a single mapping file 
    
        mapping_files: open file objects containing mapping data
        no_data_value: value to be used in cases where there is no
         mapping field associated with a sample ID (default: 'no_data')
    """
    mapping_data = defaultdict(dict)
    all_headers = set([])
    
    # iterate over mapping files, parsing each
    for mapping_file in mapping_files:
        current_data, current_headers, current_comments = \
           parse_mapping_file(mapping_file,strip_quotes=False)
        all_headers.update(set(current_headers))

        for entry in current_data:
            current_values = {k:v for k,v in zip(current_headers, entry)}
            sample_id = current_values['SampleID']

            if sample_id in mapping_data:
                # if the sample id has already been seen, confirm that
                # there is no conflicting values across the different 
                # mapping files (e.g., pH=5.0 and pH=6.0)- if there is, 
                # raise a ValueError
                previous_data = mapping_data[sample_id]
                
                for key in current_values:
                    if key not in previous_data:
                        continue

                    if current_values[key] != previous_data[key]:
                        raise ValueError("Different values provided for %s for"
                                      "sample %s in different mapping files."\
                                      % (key, sample_id))

            mapping_data[sample_id].update(current_values)
    
    # remove and place the fields whose order is important
    ordered_beginning = []
    for e in ['SampleID','BarcodeSequence','LinkerPrimerSequence']:
        if e in all_headers:
            all_headers.remove(e)
            ordered_beginning.append(e)
            
    ordered_end = []
    for e in ['Description']:
        if e in all_headers:
            all_headers.remove(e)
            ordered_end.append(e)
    
    ordered_headers = ordered_beginning + list(all_headers) + ordered_end
    
    # generate the mapping file lines containing all fields
    result = ['#' + '\t'.join(ordered_headers)]
    for sample_id, data in mapping_data.items():
        values = [data.get(k, no_data_value) for k in ordered_headers]
        result.append('\t'.join(values))
    
    return result
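A merge sketch over two in-memory mapping files (hypothetical data):

map_f1 = ['#SampleID\tpH\tDescription',
          'S1\t5.0\tfirst']
map_f2 = ['#SampleID\tDepth\tDescription',
          'S1\t10\tfirst',
          'S2\t20\tsecond']
merged = merge_mapping_files([map_f1, map_f2])
# S2 has no pH column in its source file, so its pH field is filled
# with the 'no_data' placeholder in the merged output.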
Example #31
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create the output directories
    try:
        makedirs(opts.output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # This check helps users avoid overwriting previous output.
            print "Output directory already exists. Please choose "+\
             "a different directory, or force overwrite with -f."
            exit(1)

    # verify that category is in mapping file
    map_list = parse_mapping_file(open(opts.mapping_file, 'U').readlines())
    if not opts.category in map_list[1][1:]:
        print "Category '%s' not found in mapping file columns:" % (
            opts.category)
        print map_list[1][1:]
        exit(1)

    # run the supervised learning algorithm
    result = run_supervised_learning(opts.input_data,
                                     opts.mapping_file,
                                     opts.category,
                                     ntree=opts.ntree,
                                     errortype=opts.errortype,
                                     output_dir=opts.output_dir,
                                     verbose=opts.verbose)
Example #32
def main():
    option_parser, opts, args =\
      parse_command_line_parameters(**script_info)

    otu_table_data = parse_otu_table(open(opts.input_otu_table,'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp
    
    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp,'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp,'U'))
        result = sort_otu_table(otu_table_data,
                                sorted_sample_ids)
    else:
        option_parser.error("must provide either --sort_field and --mapping_fp OR --sorted_sample_ids_fp")

    # format and write the otu table
    result_str = format_otu_table(result[0],result[1],result[2],result[3])
    of = open(opts.output_fp,'w')
    of.write(result_str)
    of.close()
Example #33
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    map_data, map_header, map_comments = parse_mapping_file(open(
        opts.map, 'U'))
    map_dict = mapping_file_to_dict(map_data, map_header)

    distdict = parse_distmat_to_dict(open(opts.distance_matrix, 'U'))

    if opts.colorby == None:
        colorby_cats = [None]
    else:
        colorby_idx = map_header.index(opts.colorby)
        colorby_cats = list(set([map_data[i][colorby_idx] for\
            i in range(len(map_data))]))
    textfilename = os.path.splitext(opts.output_path)[0] + '.txt'
    text_fh = open(textfilename, 'w')
    text_fh.write(opts.axis_category + '\tdistance\tSampleID' + '\n')
    colorby_cats.sort()
    plt.figure()
    for cat_num, cat in enumerate(colorby_cats):
        # collect the primary and secondary samples within this category
        state1_samids, state2_samids = get_sam_ids(map_data, map_header,
                                                   opts.colorby, cat,
                                                   opts.primary_state,
                                                   opts.secondary_state)
        state1_samids =\
            list(set(state1_samids).intersection(set(distdict.keys())))
        state2_samids =\
            list(set(state2_samids).intersection(set(distdict.keys())))
        if state1_samids == [] or state2_samids == [] or \
            (len(state1_samids) == 1 and state1_samids == state2_samids):
            raise RuntimeError("one category of samples didn't have any valid"+\
            " distances. try eliminating samples from -p or -s, or changing"+\
            " your mapping file with filter_samples_from_otu_table.py")
        # go through dmtx
        state1_avg_dists = get_avg_dists(state1_samids, state2_samids,
                                         distdict)

        # plot
        xvals = [float(map_dict[sam][opts.axis_category]) for\
            sam in state1_samids]
        try:
            color = plt.cm.jet(cat_num / (len(colorby_cats) - 1))
        except ZeroDivisionError:  # only one cat
            color = 'b'
        plt.scatter(xvals,
                    state1_avg_dists,
                    edgecolors=color,
                    alpha=.5,
                    facecolors='none')
        plt.xlabel(opts.axis_category)
        plt.ylabel('average distance')

        lines = [str(xvals[i])+'\t'+str(state1_avg_dists[i])+\
            '\t'+state1_samids[i]+'\n' for i in range(len(xvals))]
        text_fh.writelines(lines)

    if opts.colorby != None: plt.legend(colorby_cats)
    plt.savefig(opts.output_path)
Example #35
def add_counts_to_mapping(biom_lines, mapping_lines, otu_counts, output_fp):
    """Counts the number of seqs/OTUs per sample and add its to the mapping file

    Inputs:
        biom_lines:
        mapping_lines:
        otu_counts:
        output_fp:
    """
    # Parse biom file
    biom = parse_biom_table(biom_lines)
    # Parse mapping file
    map_data, headers, comments = parse_mapping_file(mapping_lines)
    # Compute the counts per sample
    min_count, max_count, median_count, mean_count, counts_per_sample =\
        compute_counts_per_sample_stats(biom, binary_counts=otu_counts)
    # Add the counts to the mapping data
    index = len(headers) - 1
    headers.insert(index, "NumIndividuals")
    for row in map_data:
        row.insert(index, str(counts_per_sample[row[0]]))
    # # Add the '#' character to the first header
    # headers[0] = '#' + headers[0]
    # # Add headers to the data
    # map_data.insert(0, headers)
    # Write the corrected mapping file
    write_corrected_mapping(output_fp, headers, comments, map_data)
Example #36
def create_replicated_mapping_file(map_f, num_replicates, sample_ids):
    """Returns a formatted mapping file with replicated sample IDs.

    Each sample ID will have an ascending integer appended to it from the range
    [0, num_replicates - 1]. For example, if there are two input sample IDs, S1
    and S2, with 3 replicates each, the output will be:
        S1.0
        S1.1
        S1.2
        S2.0
        S2.1
        S2.2

    All other metadata columns will simply be copied to the output mapping
    file. The order of input sample IDs is preserved.

    Arguments:
        map_f - input mapping file to replicate (file-like object)
        num_replicates - number of replicates at each sample
        sample_ids - only sample IDs in the mapping file that are in this list
            will be replicated. Sample IDs in the mapping file that are not
            found in this list will not be added to the resulting mapping file
    """
    if num_replicates < 1:
        raise ValueError("Must specify at least one sample replicate (was "
                         "provided %d)." % num_replicates)
    map_data, header, comments = parse_mapping_file(map_f)

    rep_map_data = []
    for row in map_data:
        if row[0] in sample_ids:
            for rep_num in range(num_replicates):
                rep_map_data.append(['%s.%i' % (row[0], rep_num)] + row[1:])

    return format_mapping_file(header, rep_map_data, comments)
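A usage sketch (hypothetical data; format_mapping_file is QIIME's formatter, assumed importable):

mapping_lines = ['#SampleID\tTreatment\tDescription',
                 'S1\tControl\tfirst',
                 'S2\tFast\tsecond']
# Replicate only S1, three times -> S1.0, S1.1, S1.2; S2 is dropped
# because it is not in the sample_ids list.
rep_map_str = create_replicated_mapping_file(mapping_lines, 3, ['S1'])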
Example #37
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    otu_table_fp = opts.otu_table_fp
    mapping_fp = opts.mapping_fp
    mapping_field = opts.mapping_field
    output_dir = opts.output_dir
    
    otu_table_base_name = splitext(split(otu_table_fp)[1])[0]
    
    mapping_data, headers, comments = parse_mapping_file(open(mapping_fp,'U'))
    try:
        field_index = headers.index(mapping_field)
    except ValueError:
        option_parser.error("Field is not in mapping file (search is case "+\
        "and white-space sensitive). \n\tProvided field: "+\
        "%s. \n\tValid fields: %s" % (mapping_field,' '.join(headers)))
    
    mapping_values = set([e[field_index] for e in mapping_data])
    
    create_dir(output_dir)
    
    for v in mapping_values:
        v_fp_str = v.replace(' ','_')
        otu_table_output_fp = join(output_dir,'%s_%s.txt' % (otu_table_base_name, v_fp_str))
        mapping_output_fp = join(output_dir,'mapping_%s.txt' % v_fp_str)
        filter_otus_and_map(open(mapping_fp,'U'), 
                            open(otu_table_fp,'U'), 
                            open(mapping_output_fp,'w'), 
                            open(otu_table_output_fp,'w'),
                            valid_states_str="%s:%s" % (mapping_field,v),
                            num_seqs_per_otu=1)
Example #38
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)

        sample_metadata = mapping_file_to_dict(mapping_data, mapping_header)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
    def test_make_new_otu_counts(self):
        """make_new_otu_counts works
        """
        mapping_lines = """#SampleID\tindividual\ttimepoint_zero\ttimepoint
AT0\tA\t1\t0
AT1\tA\t0\t1
AT2\tA\t0\t2
BT0\tB\t1\t0
BT1\tB\t0\t1
BT2\tB\t0\t2
""".split('\n')
        mapping_data, header, comments = parse_mapping_file(mapping_lines)
        samples_from_subject, sample_to_subtract = \
            get_sample_individual_info(mapping_data, header, 'individual', \
            'timepoint_zero')
        otu_lines = """# QIIME v1.2.0-dev OTU table
#OTU ID\tAT0\tAT1\tS1\tAT2\tBT0\tBT1\tBT2
0\t0.5\t0.3\t99\t0.2\t0.0\t0.0\t0.0
1\t0.0\t0.0\t99\t0.0\t0.4\t0.5\t0.6
2\t0.1\t0.4\t99\t0.7\t0.5\t0.6\t0.8
3\t0.0\t0.1\t99\t0.0\t0.4\t0.0\t0.0
""".split('\n')
        otu_table = parse_otu_table(otu_lines, float)
        sample_ids, otu_ids, otu_counts, consensus = otu_table
        converted_otu_table = make_new_otu_counts(otu_ids, sample_ids, otu_counts, consensus, sample_to_subtract, samples_from_subject)
        converted_otu_table = converted_otu_table.split('\n')
        self.assertEqual(converted_otu_table[1], "#OTU ID\tAT0\tAT1\tAT2\tBT0\tBT1\tBT2")
        self.assertEqual(converted_otu_table[2], "0\t0.0\t-0.2\t-0.3\t999999999.0\t999999999.0\t999999999.0")
        self.assertEqual(converted_otu_table[3], "1\t999999999.0\t999999999.0\t999999999.0\t0.0\t0.1\t0.2")
        self.assertEqual(converted_otu_table[4], "2\t0.0\t0.3\t0.6\t0.0\t0.1\t0.3")
        self.assertEqual(converted_otu_table[5], "3\t0.0\t0.1\t0.0\t0.0\t-0.4\t-0.4")
Example #40
def get_technical_lengths(input_map, debug=False):
    """Returns per-sample info on technical lengths.

    Note: KEY_SEQ, BARCODE and PRIMER fields are required. The LINKER
    field is optional.
    """
    if debug:
        print "Making debug output"
    body, header, comments = parse_mapping_file(input_map)
    if debug:
        print "HEADER:", header
    key_index = header.index('KEY_SEQ')
    bc_index = header.index('BARCODE')
    if 'LINKER' in header:
        linker_index = header.index('LINKER')
    else:
        linker_index = None
    primer_index = header.index('PRIMER')
    technical_lengths = {}
    for fields in body:
        curr_tech_len = len(fields[key_index]) + len(fields[bc_index]) + \
            len(fields[primer_index])
        if linker_index is not None:
            curr_tech_len += len(fields[linker_index])
        technical_lengths[fields[0]] = curr_tech_len
    if debug:
        print "Technical lengths:"
        print technical_lengths
    return technical_lengths
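A usage sketch with an inline mapping file (hypothetical data):

input_map = ['#SampleID\tKEY_SEQ\tBARCODE\tPRIMER\tDescription',
             'S1\tTCAG\tACGT\tGGACTAC\tfirst']
# technical length for S1 = len(KEY_SEQ) + len(BARCODE) + len(PRIMER)
#                         = 4 + 4 + 7 = 15 (no LINKER column here)
lengths = get_technical_lengths(input_map)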
Example #41
def add_counts_to_mapping(biom_lines, mapping_lines, otu_counts, output_fp):
    """Counts the number of seqs/OTUs per sample and add its to the mapping file

    Inputs:
        biom_lines:
        mapping_lines:
        otu_counts:
        output_fp:
    """
    # Parse biom file
    biom = parse_biom_table(biom_lines)
    # Parse mapping file
    map_data, headers, comments = parse_mapping_file(mapping_lines)
    # Compute the counts per sample
    min_count, max_count, median_count, mean_count, counts_per_sample =\
        compute_counts_per_sample_stats(biom, binary_counts=otu_counts)
    # Add the counts to the mapping data
    index = len(headers) - 1
    headers.insert(index, "NumIndividuals")
    for row in map_data:
        row.insert(index, str(counts_per_sample[row[0]]))
    # Add the '#' character to the first header
    headers[0] = '#' + headers[0]
    # Add headers to the data
    map_data.insert(0, headers)
    # Write the corrected mapping file
    write_corrected_file(map_data, comments, output_fp)
Example #42
    def test_sort_otu_table_by_mapping_field_error(self):
        """ sort_otu_table_by_mapping_field fails on samples in otu table but not mapping"""

        self.assertRaises(KeyError, sort_otu_table_by_mapping_field,
                          parse_biom_table_str(self.otu_table1_bad_sampleID),
                          parse_mapping_file(self.mapping_f2),
                          sort_field="Age")
Example #43
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        mapping_data, mapping_header, mapping_comments = parse_mapping_file(open(opts.mapping_fp, 'U'))
        sample_metadata = assemble_sample_metadata(mapping_data, mapping_header, mapping_comments)
                
    biom_otu_table = make_otu_table(open(opts.otu_map_fp, 'U'),
                                    otu_to_taxonomy=otu_to_taxonomy,
                                    otu_ids_to_exclude=ids_to_exclude,
                                    sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
Пример #44
0
def create_replicated_mapping_file(map_f, num_replicates, sample_ids):
    """Returns a formatted mapping file with replicated sample IDs.

    Each sample ID will have an ascending integer appended to it from the range
    [0, num_replicates - 1]. For example, if there are two input sample IDs, S1
    and S2, with 3 replicates each, the output will be:
        S1.0
        S1.1
        S1.2
        S2.0
        S2.1
        S2.2

    All other metadata columns will simply be copied to the output mapping
    file. The order of input sample IDs is preserved.

    Arguments:
        map_f - input mapping file to replicate (file-like object)
        num_replicates - number of replicates at each sample
        sample_ids - only sample IDs in the mapping file that are in this list
            will be replicated. Sample IDs in the mapping file that are not
            found in this list will not be added to the resulting mapping file
    """
    if num_replicates < 1:
        raise ValueError("Must specify at least one sample replicate (was "
                         "provided %d)." % num_replicates)
    map_data, header, comments = parse_mapping_file(map_f)

    rep_map_data = []
    for row in map_data:
        if row[0] in sample_ids:
            for rep_num in range(num_replicates):
                rep_map_data.append(['%s.%i' % (row[0], rep_num)] + row[1:])

    return format_mapping_file(header, rep_map_data, comments)
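A minimal usage sketch for create_replicated_mapping_file, assuming the function is defined as above and QIIME's parse_mapping_file/format_mapping_file helpers are importable; the mapping content is invented.

from StringIO import StringIO  # Python 2, matching these snippets

map_f = StringIO('#SampleID\tTreatment\tDescription\n'
                 'S1\tControl\tfirst\n'
                 'S2\tFast\tsecond\n')
print create_replicated_mapping_file(map_f, 3, ['S1', 'S2'])
# Should print the same header plus six data rows, with sample IDs
# S1.0, S1.1, S1.2, S2.0, S2.1, S2.2 and the Treatment/Description
# values copied unchanged.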
Example #45
0
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """ Given a description of metadata, return the corresponding sample ids
    """
    map_data, map_header, map_comments = parse_mapping_file(mapping_f)
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)
    return sample_ids
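A hedged usage sketch for sample_ids_from_metadata_description; the 'Field:value' query syntax follows the QIIME metadata state descriptions used by the other snippets here, and the mapping content is invented.

from StringIO import StringIO

mapping_f = StringIO('#SampleID\tTreatment\tDescription\n'
                     'S1\tControl\tfirst\n'
                     'S2\tFast\tsecond\n'
                     'S3\tControl\tthird\n')
print sample_ids_from_metadata_description(mapping_f, 'Treatment:Control')
# Should print the samples whose Treatment state is Control: ['S1', 'S3']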
Example #46
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    columns_to_merge = opts.columns_to_merge
    mapping_fp = opts.mapping_fp
    output_fp = opts.output_fp

    try:
        data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    except Exception:
        option_parser.error('Bro, that doesn\'t look like a mapping file')

    for merging in columns_to_merge:
        retrieve = lambda x: headers.index(x)
        indices = map(retrieve, merging.split('&&'))

        headers.append(''.join([headers[element] for element in indices]))

        for line in data:
            line.append(''.join([line[element] for element in indices]))

    # this should never happen
    assert len(headers) == len(data[0]), "Something went horribly wrong, "+\
        "that's what you get for using non-unit-tested software"

    lines = format_mapping_file(headers, data, comments)

    fd = open(output_fp, 'w')
    fd.writelines(lines)
    fd.close()
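A worked sketch (with invented column names) of the '&&' merge semantics implemented in the loop above: merging 'Treatment&&DOB' appends a TreatmentDOB header and, for each row, the concatenated values.

headers = ['SampleID', 'Treatment', 'DOB', 'Description']
data = [['S1', 'Control', '20061218', 'first']]

merging = 'Treatment&&DOB'
indices = [headers.index(h) for h in merging.split('&&')]
headers.append(''.join([headers[i] for i in indices]))  # adds 'TreatmentDOB'
for line in data:
    line.append(''.join([line[i] for i in indices]))    # adds 'Control20061218'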
Example #47
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    output_fp = opts.output_fp

    map_data, header, comments = parse_mapping_file(open(opts.input_fp, 'U'))

    if opts.category not in header:
        option_parser.error("%s doesn't appear to exist in the mapping file!" % opts.category)

    # use stdout or the user supplied file path
    if output_fp:
        fd = open(output_fp, 'w')
    else:
        fd = stdout

    result = defaultdict(int)
    cat_idx = header.index(opts.category)
    for samp in map_data:
        result[samp[cat_idx]] += 1

    for cat_val in natsort(result):
        if not cat_val:
            fd.write("***UNSPECIFIED***\t%d\n" % result[cat_val])
        else:
            fd.write("%s\t%d\n" % (cat_val, result[cat_val]))

    # only close the handle if we opened a file; leave stdout alone
    if output_fp:
        fd.close()
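A toy run of the counting loop above, with invented data, showing the tab-separated category/count lines the script emits.

from collections import defaultdict

header = ['SampleID', 'Treatment', 'Description']
map_data = [['S1', 'Control', 'a'], ['S2', 'Fast', 'b'], ['S3', 'Control', 'c']]

result = defaultdict(int)
cat_idx = header.index('Treatment')
for samp in map_data:
    result[samp[cat_idx]] += 1
# result == {'Control': 2, 'Fast': 1}; the script would write
# "Control\t2" and "Fast\t1", one category per line.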
Example #48
0
def filter_otus_and_map(map_infile, otu_infile, map_outfile, otu_outfile, 
    valid_states_str, num_seqs_per_otu):
    """Filters OTU and map files according to specified criteria."""
    map_data, map_header, map_comments = parse_mapping_file(map_infile)
    map_infile.close()
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)

    # write out the filtered mapping file
    out_headers, out_data = filter_map(map_data, map_header, sample_ids)
    header_line = '#' + '\t'.join(out_headers)
    map_outfile.write('\n'.join([header_line] + map('\t'.join, out_data)))
    if not isinstance(map_outfile, StringIO):
        map_outfile.close()

    # write out the filtered OTU file
    for line in otu_infile:
        if line.startswith('#OTU ID'):
            # remember which columns correspond to the retained sample ids;
            # these indices are reused for every data line below, so the
            # header line must come before the data lines
            cols = find_good_cols(line, sample_ids)
            filter_line(line, cols, min_count=None, outfile=otu_outfile)
        elif line.startswith('#'):
            otu_outfile.write(line)
        else:
            filter_line(line, cols, min_count=num_seqs_per_otu, 
                outfile=otu_outfile)
    if not isinstance(otu_outfile, StringIO):
        otu_outfile.close()
Example #49
0
def get_technical_lengths(input_map, debug=False):
    """Returns per-sample info on technical lengths.
    
    Note: KEY_SEQ, BARCODE and PRIMER fields are required. The LINKER
    field is optional.
    """
    if debug:
        print "Making debug output"
    body, header, comments = parse_mapping_file(input_map)
    if debug:
        print "HEADER:", header
    key_index = header.index('KEY_SEQ')
    bc_index = header.index('BARCODE')
    if 'LINKER' in header:
        linker_index = header.index('LINKER')
    else:
        linker_index = None
    primer_index = header.index('PRIMER')
    technical_lengths = {}
    for fields in body:
        curr_tech_len = len(fields[key_index]) + len(fields[bc_index]) + \
            len(fields[primer_index])
        if linker_index is not None:
            curr_tech_len += len(fields[linker_index])
        technical_lengths[fields[0]] = curr_tech_len
    if debug:
        print "Technical lengths:"
        print technical_lengths
    return technical_lengths
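A small usage sketch for get_technical_lengths, assuming it is defined as above; the mapping content is invented, and a LINKER column is included to exercise the optional branch.

from StringIO import StringIO

input_map = StringIO(
    '#SampleID\tKEY_SEQ\tBARCODE\tLINKER\tPRIMER\tDescription\n'
    'S1\tTCAG\tAACCTT\tCA\tCATGCTGCCTCCCGTAGGAGT\tfirst\n')
print get_technical_lengths(input_map)
# Should print {'S1': 33}: 4 (key) + 6 (barcode) + 2 (linker) + 21 (primer).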
Example #50
0
def main():
    option_parser, opts, args =\
      parse_command_line_parameters(**script_info)

    otu_table_data = parse_biom_table(open(opts.input_otu_table, 'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data, mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(otu_table_data, sorted_sample_ids)
    else:
        result = sort_otu_table(
            otu_table_data, natsort_case_insensitive(otu_table_data.SampleIds))

    # format and write the otu table
    result_str = format_biom_table(result)
    of = open(opts.output_fp, 'w')
    of.write(result_str)
    of.close()
Example #51
0
    def test_get_sam_ids(self):
        """set of sample ids in get_sam_ids should be correct"""
        map_file = StringIO.StringIO(
            """#SampleID	Country	AgeYears	Family	AgeCat
    h208A.1	Malawi	0.032854209	h208	Child
    h301A.1	Malawi	0.05	h301	Child
    h301B.1	Malawi	0.05	h301	Child
    USinfTw20.1	USA	0.083333333	USinfTw20	Child
    USinfTw20.2	USA	0.083333333	USinfTw20	Child
    USinfTw1.1	USA	0.083333333	USinfTw1	Child
    h10M	Malawi	26	h10	Adult
    h68M	Malawi	26	h68	Adult
    TS25	USA	26	USts9	Adult
    TS26	USA	26	USts9	Adult""")

        map_data, map_header, comments = parse_mapping_file(map_file)
        colorby = 'Country'
        cat = 'USA'
        primary_state = 'AgeCat:Child'
        ids1, ids2 = get_sam_ids(map_data,
                                 map_header,
                                 colorby,
                                 cat,
                                 primary_state,
                                 secondary_state=None)
        self.assertEqual(set(ids1),
                         set(['USinfTw20.1', 'USinfTw20.2', 'USinfTw1.1']))
        self.assertEqual(set(ids2), set(['TS25', 'TS26']))
Example #52
0
    def test_sort_otu_table_by_mapping_field_some_values_same(self):
        """ sort_otu_table_by_mapping_field fns when some values are the same"""

        actual = sort_otu_table_by_mapping_field(
            parse_biom_table_str(self.otu_table1),
            parse_mapping_file(self.mapping_f2),
            sort_field="Name")
        expected = parse_biom_table_str(self.name_sorted_otu_table1)
        self.assertEqual(actual, expected)
Example #53
0
    def test_sort_otu_table_by_mapping_field_some_values_differ(self):
        """ sort_otu_table_by_mapping_field fns when some values differ"""

        actual = sort_otu_table_by_mapping_field(
            parse_biom_table_str(self.otu_table1),
            parse_mapping_file(self.mapping_f2),
            sort_field="Nothing")
        expected = parse_biom_table_str(self.nothing_sorted_otu_table1)
        self.assertEqual(actual, expected)
Example #54
0
    def test_sort_otu_table_by_mapping_field_all_values_differ(self):
        """ sort_otu_table_by_mapping_field fns when all values differ"""

        actual = sort_otu_table_by_mapping_field(
            parse_biom_table_str(self.otu_table1),
            parse_mapping_file(self.mapping_f2),
            sort_field="Age")
        expected = parse_biom_table_str(self.age_sorted_otu_table1)
        self.assertEqual(actual, expected)
Example #55
0
def get_taxa(taxa_fname, sample_ids):
    """Opens a taxa summary file and returns the parsed data"""
    try:
        lines = open(taxa_fname, 'U').readlines()
    except (TypeError, IOError):
        raise MissingFileError, 'Taxa summary file required for this analysis'
    # the taxa summary is tab-delimited, so parse_mapping_file handles it;
    # avoid naming the result 'map', which shadows the built-in
    taxa_data = parse_mapping_file(lines)
    return taxa_data
Example #56
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that biom file and mapping file have matching sample names. discard
    # those samples that do not appear in both. 
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample') if s in
                       shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)
    # check that all requested fields exist in the mapping file headers
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found '
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields, mheaders,
                                                              mdata)

    for sg, vg in zip(sample_groups, value_groups):
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
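The name_base format string in the loop above is easy to misread, so here is a worked example with invented field values showing the file-name suffix it produces.

fields = ['Treatment', 'AgeCat']
vg = ('Control', 'Child')  # one value per field for this sample group

name_base = '__' + '%s_%s_' * len(vg) + '_'
name_tmp = []
for f, v in zip(fields, vg):
    name_tmp.extend([f, v])
nb = name_base % tuple(name_tmp)
# nb == '__Treatment_Control_AgeCat_Child__'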
Example #57
0
def main():
    """run denoiser on input flowgrams"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps

    for f in sff_files:
        if (not exists(f)):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)
    outdir = opts.output_dir

    create_dir(outdir, fail_on_exist=not opts.force)

    log_fh = None

    if (not (opts.primer or opts.map_fname)):
        raise ApplicationError("Either mapping file or primer required")
    # Read primer from Meta data file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        if len(all_primers) != 1:
            raise ValueError(
                "Currently only data sets with one primer are allowed.\n"
                "Make separate mapping files with only one primer, re-run "
                "split_libraries and denoise with each split_library output "
                "separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support primers with "
                             "degenerate bases at their 3' end.")

    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(opts.sff_fps,
                                               opts.fasta_fp,
                                               outdir,
                                               opts.num_cpus,
                                               primer,
                                               titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    for i, cluster in cluster_mapping.iteritems():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
Example #58
0
    def test_mapping_data_to_barcode_map(self):
        """parse_barcode_map: functions as expected
        """
        mapping_data, mapping_headers, mapping_comments =\
            parse_mapping_file(self.mapping_f)
        expected = {'GGTGGT': 'Samp2',
                    'GGAGGT': 'SAMP_1',
                    'GGTTAA': 'dflsdflsdfsdfsdfsd'}
        self.assertEqual(mapping_data_to_barcode_map(mapping_data), expected)