def test_parse_mapping_file(self):
    """parse_mapping_file functions as expected"""
    s1 = ['#sample\ta\tb', '#comment line to skip',
          'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
    exp = ([['x', 'y', 'z'], ['i', 'j', 'k']],
           ['sample', 'a', 'b'],
           ['comment line to skip', 'more skip'])
    obs = parse_mapping_file(s1)
    self.assertEqual(obs, exp)

    # We don't currently support this, but we should soon...
    # # check that first non-comment, non-blank line is used as
    # # header
    # s1 = ['sample\ta\tb', '#comment line to skip',
    #       'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
    # exp = ([['x', 'y', 'z'], ['i', 'j', 'k']],
    #        ['sample', 'a', 'b'],
    #        ['comment line to skip', 'more skip'])
    # obs = parse_mapping_file(s1)
    # self.assertEqual(obs, exp)

    # check that we strip double quotes by default
    s2 = ['#sample\ta\tb', '#comment line to skip',
          '"x "\t" y "\t z ', ' ', '"#more skip"', 'i\t"j"\tk']
    obs = parse_mapping_file(s2)
    self.assertEqual(obs, exp)
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case "
                           "and white-space sensitive). \n\tProvided field: "
                           "%s. \n\tValid fields: %s" %
                           (column_rename_ids, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse the mapping file each time through the loop as filtering
        # operates on values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)

        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
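# --- Illustrative usage sketch (not from the original source) ---
# A minimal example of consuming the split_mapping_file_on_field generator,
# assuming this module's helpers (parse_mapping_file, format_mapping_file,
# etc.) are importable. The mapping lines, the 'Treatment' field, and the
# output file names are all hypothetical.
example_map = ['#SampleID\tTreatment\tDescription',
               'PC.354\tControl\tcontrol mouse',
               'PC.607\tFast\tfasting mouse']
for value, mapping_str in split_mapping_file_on_field(example_map,
                                                      'Treatment'):
    # one mapping file per observed field value, e.g. mapping_Fast.txt
    with open('mapping_%s.txt' % value, 'w') as out_f:
        out_f.write(mapping_str)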
def test_make_otu_table_with_sample_metadata(self):
    # Want to make sure that the order of the sample IDs in the OTU
    # map and the order of the IDs in the mapping file do not matter
    otu_map_lines = ['0\tABC_0\tDEF_1',
                     '1\tABC_1',
                     'x\tGHI_2\tGHI_3\tGHI_77',
                     'z\tDEF_3\tXYZ_1']
    mapping_f = StringIO(MAPPING_FILE)
    sample_ids = ['ABC', 'DEF', 'GHI', 'XYZ']
    data = [[1, 1, 0, 0],
            [1, 0, 0, 0],
            [0, 0, 3, 0],
            [0, 1, 0, 1]]

    map_data, map_header, map_comments = parse_mapping_file(mapping_f)
    sample_metadata = mapping_file_to_dict(map_data, map_header)
    sample_md = [sample_metadata[sample_id] for sample_id in sample_ids]

    obs = make_otu_table(otu_map_lines, sample_metadata=sample_metadata)
    exp = Table(data, ['0', '1', 'x', 'z'], sample_ids,
                sample_metadata=sample_md, input_is_dense=True)
    self.assertEqual(obs, exp)

    # Test with a mapping file that is missing a sample's metadata,
    # make sure it raises the KeyError
    mapping_f = StringIO(MAPPING_FILE_MISSING_SAMPLE)
    map_data, map_header, map_comments = parse_mapping_file(mapping_f)
    sample_metadata = mapping_file_to_dict(map_data, map_header)
    with self.assertRaises(KeyError):
        obs = make_otu_table(otu_map_lines,
                             sample_metadata=sample_metadata)
def test_wrapper(test, otu_table, category_mapping, category, threshold,
                 _filter, otu_include=None):
    """runs statistical test to look for category/OTU associations"""
    if test == 'ANOVA' or test == 'correlation':
        otu_table = convert_OTU_table_relative_abundance(otu_table)
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        mapping_data, header, comments = parse_mapping_file(category_mapping)
        category_info, category_values = \
            get_category_info(mapping_data, header, category, threshold)
        OTU_list = filter_OTUs(otu_sample_info, _filter, all_samples=False,
                               category_mapping_info=category_info)
    elif test == 'g_test':
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        mapping_data, header, comments = parse_mapping_file(category_mapping)
        category_info, category_values = \
            get_category_info(mapping_data, header, category, threshold)
        OTU_list = filter_OTUs(otu_sample_info, _filter, all_samples=True,
                               category_mapping_info=category_info)
    else:
        raise ValueError("An invalid test statistic was given. (-s option). "
                         "Valid values are ANOVA, correlation, and g_test.")

    # filter OTU_list with the otu_include list
    if otu_include:
        otu_include = [line.strip() for line in otu_include]
        OTU_list = [OTU for OTU in OTU_list if OTU in otu_include]
    if len(OTU_list) == 0:
        raise ValueError("No OTUs remain after applying the filter. Try "
                         "lowering the filter value (-f option)")
    if test == 'ANOVA':
        results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info,
                                 category_values)
        output = output_results_ANOVA(results, category_values,
                                      taxonomy_info)
    elif test == 'correlation':
        results = run_correlation_OTUs(OTU_list, category_info,
                                       otu_sample_info)
        output = output_results_correlation(results, taxonomy_info)
    elif test == 'g_test':
        results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info,
                                  category_values)
        output = output_results_G_test(results, taxonomy_info)
    return output
def test_get_category_info(self):
    """get_category_info works"""
    category_mapping = """#SampleID\tcat1\tcat2
sample1\tA\t0
sample2\tB\t8.0
sample3\tC\t1.0""".split('\n')
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    result, cat_vals = get_category_info(mapping_data, header, 'cat1')
    self.assertEqual(result, {'sample1': 'A', 'sample3': 'C',
                              'sample2': 'B'})
    self.assertEqual(cat_vals, ['A', 'B', 'C'])
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    result, cat_vals = get_category_info(mapping_data, header,
                                         'cat2', threshold=5.0)
    self.assertEqual(result, {'sample1': '0', 'sample3': '0',
                              'sample2': '1'})
    self.assertEqual(cat_vals, ['0', '1'])
def merge_mapping_files(mapping_files, no_data_value="no_data"):
    """ Merge list of mapping files into a single mapping file

        mapping_files: open file objects containing mapping data
        no_data_value: value to be used in cases where there is no mapping
         field associated with a sample ID (default: 'no_data')
    """
    mapping_data = {}
    all_headers = []
    result = []

    # iterate over mapping files, parsing each
    for mapping_file in mapping_files:
        current_data, current_headers, current_comments = \
            parse_mapping_file(mapping_file, strip_quotes=False)
        all_headers += current_headers
        for entry in current_data:
            sample_id = entry[0]
            current_values = {}
            for header, value in zip(current_headers[1:], entry[1:]):
                current_values[header] = value
            if sample_id in mapping_data:
                # if the sample id has already been seen, confirm that
                # there are no conflicting values across the different
                # mapping files (e.g., pH=5.0 and pH=6.0) - if there are,
                # raise a ValueError
                previous_data = mapping_data[sample_id]
                for header, value in current_values.items():
                    if header in previous_data and \
                       value != previous_data[header]:
                        raise ValueError("Different values provided for %s "
                                         "for sample %s in different "
                                         "mapping files."
                                         % (header, sample_id))
                mapping_data[sample_id].update(current_values)
            else:
                mapping_data[sample_id] = current_values
    all_headers = {}.fromkeys(all_headers)

    # remove and place the fields whose order is important
    ordered_beginning = []
    for e in ["SampleID", "BarcodeSequence", "LinkerPrimerSequence"]:
        try:
            del all_headers[e]
            ordered_beginning.append(e)
        except KeyError:
            pass

    ordered_end = []
    for e in ["Description"]:
        try:
            del all_headers[e]
            ordered_end.append(e)
        except KeyError:
            pass
    ordered_headers = ordered_beginning + list(all_headers) + ordered_end

    # generate the mapping file lines containing all fields
    result.append("#" + "\t".join(ordered_headers))
    for sample_id, data in mapping_data.items():
        result.append("\t".join([sample_id] +
                                [data.get(h, no_data_value)
                                 for h in ordered_headers[1:]]))

    return result
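# --- Illustrative usage sketch (not from the original source) ---
# merge_mapping_files iterates each input line by line, so in-memory lists
# of lines can stand in for open files here. The sample data is invented;
# fields missing from one of the files are filled with no_data_value.
map_a = ['#SampleID\tBarcodeSequence\tpH\tDescription',
         's1\tAAAA\t5.0\tsample one']
map_b = ['#SampleID\tBarcodeSequence\tDepth\tDescription',
         's2\tTTTT\t10\tsample two']
merged = merge_mapping_files([map_a, map_b])
# merged is a list of tab-delimited lines; s1 gets 'no_data' for Depth and
# s2 gets 'no_data' for pH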
def test_longitudinal_otu_table_conversion_wrapper(self):
    """longitudinal_otu_table_conversion_wrapper works
    """
    mapping_lines = """#SampleID\tindividual\ttimepoint_zero\ttimepoint
AT0\tA\t1\t0
AT1\tA\t0\t1
AT2\tA\t0\t2
BT0\tB\t1\t0
BT1\tB\t0\t1
BT2\tB\t0\t2
""".split('\n')
    category_mapping = parse_mapping_file(mapping_lines)
    otu_table = """{"rows": [{"id": "0", "metadata": null}, {"id": "1", "metadata": null}, {"id": "2", "metadata": null}, {"id": "3", "metadata": null}, {"id": "4", "metadata": null}], "format": "Biological Observation Matrix 1.0.0", "data": [[0, 0, 1.0], [0, 1, 2.0], [0, 2, 3.0], [1, 3, 1.0], [1, 4, 2.0], [1, 5, 3.0], [2, 0, 1.0], [2, 1, 2.0], [2, 2, 3.0], [2, 4, 1.0], [2, 5, 2.0], [3, 0, 2.0], [3, 1, 4.0], [3, 2, 6.0], [3, 4, 1.0], [3, 5, 2.0], [4, 0, 3.0], [4, 1, 2.0], [4, 2, 1.0], [4, 3, 6.0], [4, 4, 4.0], [4, 5, 2.0]], "columns": [{"id": "AT0", "metadata": null}, {"id": "AT1", "metadata": null}, {"id": "AT2", "metadata": null}, {"id": "BT0", "metadata": null}, {"id": "BT1", "metadata": null}, {"id": "BT2", "metadata": null}], "generated_by": "BIOM-Format 1.0.0-dev", "matrix_type": "sparse", "shape": [5, 6], "format_url": "http://biom-format.org", "date": "2012-08-01T09:14:03.574451", "type": "OTU table", "id": null, "matrix_element_type": "float"}"""
    otu_table = parse_biom_table_str(otu_table)
    new_otu_table = longitudinal_otu_table_conversion_wrapper(
        otu_table, category_mapping, 'individual', 'timepoint_zero')
    new_otu_table = str(new_otu_table).split('\n')
    self.assertEqual(new_otu_table[0], "# Constructed from biom file")
    data_line1 = new_otu_table[2].split('\t')
    self.assertFloatEqual(float(data_line1[0]), 0.0)
    # sets the reference to 0
    self.assertFloatEqual(float(data_line1[1]), 0.0)
    # subtracts values from same individual from the reference
    self.assertFloatEqual(float(data_line1[2]), 0.05714286)
    # sets to ignore number when not observed across a person
    self.assertFloatEqual(float(data_line1[4]), 999999999.0)
def _collapse_metadata(mapping_f, collapse_fields):
    """ Load a mapping file into a DataFrame and then collapse rows

    Parameters
    ----------
    mapping_f : file handle or filepath
        The sample metadata mapping file.
    collapse_fields : iterable
        The fields to combine when collapsing samples. For each sample in
        the mapping_f, the ordered values from these columns will be
        tuplized and used as the group identifier. Samples whose tuplized
        values in these fields are identical will be grouped.

    Returns
    -------
    pd.DataFrame
        Sample metadata resulting from the collapse operation.

    Raises
    ------
    KeyError
        If any of the collapse fields are not column headers in mapping_f.

    """
    mapping_data, header, _ = parse_mapping_file(mapping_f)
    sample_md = pd.DataFrame(mapping_data, columns=header)
    grouped = sample_md.groupby(collapse_fields)
    collapsed_md = grouped.agg(lambda x: tuple(x))
    return collapsed_md
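# --- Illustrative usage sketch (not from the original source) ---
# Collapsing samples that share a value in a single column; requires pandas
# (imported as pd, as in the function above). The mapping lines and the
# 'Treatment' column are hypothetical.
example_map = ['#SampleID\tTreatment\tDescription',
               'PC.354\tControl\tmouse a',
               'PC.355\tControl\tmouse b',
               'PC.607\tFast\tmouse c']
collapsed = _collapse_metadata(example_map, ['Treatment'])
# collapsed is indexed by Treatment; each remaining cell holds the tuple
# of grouped values, e.g.
# collapsed.loc['Control', 'SampleID'] == ('PC.354', 'PC.355')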
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create the output directories
    try:
        makedirs(opts.output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # This check helps users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please "
                                "choose a different directory, or force "
                                "overwrite with -f.")

    # verify that category is in mapping file
    map_list = parse_mapping_file(open(opts.mapping_file, 'U').readlines())
    if opts.category not in map_list[1][1:]:
        print "Category '%s' not found in mapping file columns:" % \
            opts.category
        print map_list[1][1:]
        exit(1)

    # run the supervised learning algorithm
    result = run_supervised_learning(opts.input_data, opts.mapping_file,
                                     opts.category, ntree=opts.ntree,
                                     errortype=opts.errortype,
                                     output_dir=opts.output_dir,
                                     verbose=opts.verbose)
def test_run_single_paired_T_test(self):
    """run_single_paired_T_test works
    """
    cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
    otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table,
                                                              float)
    mapping_data, header, comments = parse_mapping_file(cat_mapping)
    otu_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    OTU_list = ['0', '1', '2']
    # should return the results since there should be 4 values to evaluate
    result = run_single_paired_T_test('0', mapping_data, header,
                                      'individual', 'timepoint_zero',
                                      otu_ids, sample_ids, otu_data,
                                      999999999.0, 4)
    self.assertEqual(len(result), 4)
    self.assertFloatEqual(result[1], 0.12566591637800242)
    self.assertFloatEqual(result[2], [0.29999999999999999,
                                      0.20000000000000001])
    self.assertEqual(result[3], 2)
    # check that the filter works
    result = run_single_paired_T_test('0', mapping_data, header,
                                      'individual', 'timepoint_zero',
                                      otu_ids, sample_ids, otu_data,
                                      999999999.0, 5)
    self.assertEqual(result, None)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    output_fp = opts.output_fp

    map_data, header, comments = parse_mapping_file(opts.mapping_file)

    if opts.category not in header:
        option_parser.error("%s doesn't appear to exist in the mapping "
                            "file!" % opts.category)

    # use stdout or the user supplied file path
    if output_fp:
        fd = open(output_fp, 'w')
    else:
        fd = stdout

    result = defaultdict(int)
    cat_idx = header.index(opts.category)
    for samp in map_data:
        result[samp[cat_idx]] += 1

    for cat_val in natsort(result):
        if not cat_val:
            fd.write("***UNSPECIFIED***\t%d\n" % result[cat_val])
        else:
            fd.write("%s\t%d\n" % (cat_val, result[cat_val]))

    fd.close()
def _collate_cluster_pcoa_plot_data(coords_f, map_f, category):
    pc_data = parse_coords(coords_f)
    coords_d = dict(zip(pc_data[0], pc_data[1]))

    map_data = parse_mapping_file(map_f)
    full_map_data = [map_data[1]]
    full_map_data.extend(map_data[0])

    sid_map = group_by_field(full_map_data, category)
    sorted_states = sorted(sid_map.keys())

    color_pool = get_color_pool()
    if len(sorted_states) > len(color_pool):
        raise ValueError("Not enough colors to uniquely color sample "
                         "groups.")

    results = []
    for state, color in zip(sorted_states,
                            color_pool[:len(sorted_states)]):
        sids = sid_map[state]
        xs = [coords_d[sid][0] for sid in sids]
        ys = [coords_d[sid][1] for sid in sids]
        results.append((xs, ys, color, state))

    return results
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    map_fp = opts.mapping
    biom_fp = opts.biom_file
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name
    cleaned_fp = opts.clean_fp
    verbose = opts.verbose

    map_data, headers, comments = parse_mapping_file(open(map_fp, 'U'))
    biom_table = parse_biom_table(open(biom_fp, 'U'))

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(
        map_data, headers, biom_table.SampleIds, include_repeat_cols=False)

    if subject_category not in real_map_headers:
        raise ValueError('This column: %s is not in the mapping file; '
                         'valid columns are: %s'
                         % (subject_category, real_map_headers))

    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    results = make_selectors(sorted_counts_per_sample, min_seqs_sample,
                             mapping_file_tuple, subject_category,
                             verbose=verbose)

    # save the output
    fout = open(cleaned_fp, 'w')
    fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
    fout.write('\n'.join(results))
    fout.close()
def test_get_sample_individual_info(self):
    """get_sample_individual_info works
    """
    mapping_lines = """#SampleID\tindividual\ttimepoint_zero\ttimepoint
AT0\tA\t1\t0
AT1\tA\t0\t1
AT2\tA\t0\t2
BT0\tB\t1\t0
BT1\tB\t0\t1
BT2\tB\t0\t2
""".split('\n')
    mapping_data, header, comments = parse_mapping_file(mapping_lines)
    samples_from_subject, samples_to_subtract = \
        get_sample_individual_info(mapping_data, header, 'individual',
                                   'timepoint_zero')
    self.assertEqual(samples_from_subject,
                     {'BT1': ['BT0', 'BT1', 'BT2'],
                      'BT0': ['BT0', 'BT1', 'BT2'],
                      'BT2': ['BT0', 'BT1', 'BT2'],
                      'AT2': ['AT0', 'AT1', 'AT2'],
                      'AT0': ['AT0', 'AT1', 'AT2'],
                      'AT1': ['AT0', 'AT1', 'AT2']})
    self.assertEqual(samples_to_subtract,
                     {'BT1': 'BT0', 'BT0': 'BT0', 'BT2': 'BT0',
                      'AT2': 'AT0', 'AT0': 'AT0', 'AT1': 'AT0'})
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    map_file_name, otu_file_name, valid_states_str = \
        opts.map_fname, opts.otu_table_fp, opts.valid_states

    map_infile = open(map_file_name, 'U')
    otu_infile = open(otu_file_name, 'U')

    map_out_fname = opts.map_out_fname
    otu_out_fname = opts.otu_out_fname
    if map_out_fname is None:
        map_out_fname = map_file_name + '.pooled.txt'
    if otu_out_fname is None:
        otu_out_fname = otu_file_name + '.pooled.txt'

    # write out the filtered mapping file
    map_outfile = open(map_out_fname, 'w')
    otu_outfile = open(otu_out_fname, 'w')

    map_data, map_header, map_comments = parse_mapping_file(map_infile)
    map_infile.close()
    map_infile = open(map_file_name, 'U')  # reopen for later

    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids_to_pool = get_sample_ids(map_data, map_header, valid_states)

    pool_map(map_infile, map_outfile, opts.pooled_sample_name,
             sample_ids_to_pool)
    pool_otu_table(otu_infile, otu_outfile, opts.pooled_sample_name,
                   sample_ids_to_pool)
def filter_otus_and_map(map_infile, otu_infile, map_outfile, otu_outfile,
                        valid_states_str, num_seqs_per_otu):
    """Filters OTU and map files according to specified criteria."""
    map_data, map_header, map_comments = parse_mapping_file(map_infile)
    map_infile.close()
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)

    # write out the filtered mapping file
    out_headers, out_data = filter_map(map_data, map_header, sample_ids)
    header_line = '#' + '\t'.join(out_headers)
    map_outfile.write('\n'.join([header_line] + map('\t'.join, out_data)))
    if not isinstance(map_outfile, StringIO):
        map_outfile.close()

    # write out the filtered OTU file
    for line in otu_infile:
        if line.startswith('#OTU ID'):
            fields = map(strip, line.split('\t'))
            cols = find_good_cols(line, sample_ids)
            filter_line(line, cols, min_count=None, outfile=otu_outfile)
        elif line.startswith('#'):
            otu_outfile.write(line)
        else:
            filter_line(line, cols, min_count=num_seqs_per_otu,
                        outfile=otu_outfile)
    if not isinstance(otu_outfile, StringIO):
        otu_outfile.close()
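# --- Illustrative usage sketch (not from the original source) ---
# An in-memory round trip for filter_otus_and_map; StringIO stands in for
# real files, and QIIME's classic tab-separated mapping and OTU table
# formats are assumed. The sample data and the 'Treatment:Fast' state
# string are invented.
from StringIO import StringIO  # Python 2, matching this code base

map_in = StringIO('#SampleID\tTreatment\tDescription\n'
                  's1\tFast\tfirst\n'
                  's2\tControl\tsecond\n')
otu_in = StringIO('#Full OTU Counts\n'
                  '#OTU ID\ts1\ts2\n'
                  '0\t5\t2\n'
                  '1\t0\t9\n')
map_out, otu_out = StringIO(), StringIO()
filter_otus_and_map(map_in, otu_in, map_out, otu_out,
                    valid_states_str='Treatment:Fast',
                    num_seqs_per_otu=1)
# map_out now holds only s1; otu_out holds OTUs with at least one sequence
# among the retained samples (OTU 0 here)
print map_out.getvalue()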
def pool_map(map_infile, map_outfile, pooled_sample_name,
             sample_ids_to_pool):
    """pools map file according to specified criteria."""
    map_data, map_header, map_comments = parse_mapping_file(map_infile)
    map_infile.close()
    # valid_states = parse_metadata_state_descriptions(valid_states_str)
    # sample_ids = get_sample_ids(map_data, map_header, valid_states)

    # write out the filtered mapping file
    sample_id_idx = map_header.index('SampleID')

    # separate the samples to be pooled from the rest (new_map_data)
    new_map_data = []
    pooled_map_data = []
    for sam in map_data:
        if sam[sample_id_idx] in sample_ids_to_pool:
            pooled_map_data.append(sam)
        else:
            new_map_data.append(sam)

    # make the new pooled sample
    newsam = ['multipleValues'] * len(map_header)
    for i in range(len(map_header)):
        pooled_vals = [sam[i] for sam in pooled_map_data]
        if len(set(pooled_vals)) == 1:
            newsam[i] = pooled_vals[0]
    newsam[sample_id_idx] = pooled_sample_name
    new_map_data.append(newsam)

    header_line = '#' + '\t'.join(map_header)
    map_outfile.write('\n'.join([header_line] +
                                map('\t'.join, new_map_data)))
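# --- Illustrative usage sketch (not from the original source) ---
# pool_map collapses the listed sample ids into one row; fields that differ
# across the pooled samples become 'multipleValues'. All values here are
# invented; StringIO stands in for real files.
from StringIO import StringIO  # Python 2, matching this code base

map_in = StringIO('#SampleID\tTreatment\tDOB\n'
                  's1\tFast\t2008\n'
                  's2\tFast\t2007\n'
                  's3\tControl\t2008\n')
map_out = StringIO()
pool_map(map_in, map_out, 'fast_pool', ['s1', 's2'])
# map_out keeps s3 and appends a 'fast_pool' row whose Treatment is 'Fast'
# (shared by s1 and s2) and whose DOB is 'multipleValues' (conflicting)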
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_data = parse_biom_table(open(opts.input_otu_table, 'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp,
                                                   'U'))
        result = sort_otu_table(otu_table_data, sorted_sample_ids)
    else:
        result = sort_otu_table(
            otu_table_data,
            natsort_case_insensitive(otu_table_data.SampleIds))

    # format and write the otu table
    result_str = format_biom_table(result)
    of = open(opts.output_fp, 'w')
    of.write(result_str)
    of.close()
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """ Given a description of metadata, return the corresponding sample
        ids
    """
    map_data, map_header, map_comments = parse_mapping_file(mapping_f)
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)
    return sample_ids
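# --- Illustrative usage sketch (not from the original source) ---
# Selecting sample ids whose metadata matches a state description. The
# mapping lines and the 'Treatment:Fast' description are hypothetical;
# QIIME's state-description syntax (Field:value[,value...]) is assumed.
example_map = ['#SampleID\tTreatment\tDescription',
               'PC.354\tControl\tcontrol mouse',
               'PC.607\tFast\tfasting mouse']
fasting_ids = sample_ids_from_metadata_description(example_map,
                                                   'Treatment:Fast')
# fasting_ids == ['PC.607']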
def get_sample_cat_info(lines, category):
    cat_by_sample = {}
    sample_by_cat = defaultdict(list)
    meta_dict = {}
    num_samples_by_cat = defaultdict(int)
    label_lists_dict = defaultdict(list)

    mapping_data, header, comments = parse_mapping_file(lines)
    category_labels = header[1:]
    index = category_labels.index(category) + 1
    for line in mapping_data:
        categories = line[0:len(category_labels) + 1]
        sample = categories[0].strip()
        meta_dict[sample] = [(categories[index], 0)]

        cat_by_sample[sample] = [(l.strip(), c.strip())
                                 for l, c in zip(category_labels,
                                                 categories[1:])]

        cat_list = []
        for i, (l, c) in enumerate(zip(category_labels, categories[1:])):
            if c not in label_lists_dict[l]:
                label_lists_dict[l].append(c)
            l = l.strip()
            c = c.strip()
            cat_list.append((l, c))
            sample_by_cat[(l, c)].append(sample)
            num_samples_by_cat[(l, c)] += 1

        cat_by_sample[sample] = cat_list
    return (cat_by_sample, sample_by_cat, len(category_labels), meta_dict,
            label_lists_dict, num_samples_by_cat)
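# --- Illustrative usage sketch (not from the original source) ---
# get_sample_cat_info returns several parallel lookups keyed by sample id
# or by (category, value) pairs. The mapping lines are invented.
lines = ['#SampleID\tTreatment\tDOB',
         's1\tControl\t2008',
         's2\tFast\t2008']
(cat_by_sample, sample_by_cat, num_cats, meta_dict, label_lists_dict,
 num_samples_by_cat) = get_sample_cat_info(lines, 'Treatment')
# sample_by_cat[('Treatment', 'Fast')] == ['s2']
# num_samples_by_cat[('DOB', '2008')] == 2
# num_cats == 2 (Treatment and DOB)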
def test_output_results_paired_T_test(self):
    """output_results_paired_T_test works
    """
    cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
    otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table,
                                                              float)
    mapping_data, header, comments = parse_mapping_file(cat_mapping)
    otu_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    OTU_list = ['0', '1', '2']
    all_results = run_paired_T_test_OTUs(OTU_list, mapping_data, header,
                                         'individual', 'timepoint_zero',
                                         otu_ids, sample_ids, otu_data,
                                         999999999.0, 4)
    output = output_results_paired_T_test(all_results)
    # of = open('/Users/lozupone/temp_output.xls', 'w')
    # of.write('\n'.join(output))
    # of.close()
    self.assertEqual(output, [
        'OTU\tprob\tT stat\taverage_diff\tnum_pairs\tBonferroni_corrected'
        '\tFDR_corrected',
        '0\t0.125665916378\t-5.0\t0.25\t2\t0.251331832756\t0.251331832756',
        '2\t0.685730319473\t0.468164588785\t-0.133333333333\t3'
        '\t1.37146063895\t0.685730319473'])
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    coords_fp = opts.input_coords
    mapping_fp = opts.mapping_fp
    output_fp = opts.output_fp
    valid_states = opts.valid_states
    negate = opts.negate
    mapping_header_name = opts.mapping_header_name

    coords_ids, coords, eigen_values, pct_exp = parse_coords(
        open(coords_fp, 'U'))
    data, headers, _ = parse_mapping_file(open(mapping_fp, 'U'))

    if mapping_fp and valid_states:
        valid_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        valid_coords_ids, valid_coords = filter_sample_ids_from_coords(
            coords_ids, coords, valid_sample_ids, negate)

    if mapping_header_name:
        sorted_sample_ids = sort_sample_ids(data, headers,
                                            mapping_header_name)
        sorted_coord_ids, sorted_coords = sort_coords(valid_coords_ids,
                                                      valid_coords,
                                                      sorted_sample_ids)
        valid_coords_ids, valid_coords = sorted_coord_ids, sorted_coords

    lines = format_coords(valid_coords_ids, valid_coords, eigen_values,
                          pct_exp)
    fd = open(output_fp, 'w')
    fd.writelines(lines)
    fd.close()
def setUp(self):
    """Load data created on the fly with the biom.table.Table."""
    self.bt1 = TEST_TABLE1
    self.bt2 = TEST_TABLE2
    mdata, mheaders, _ = parse_mapping_file(TEST_MF.split('\n'))
    self.mdata = array(mdata)
    self.mheaders = mheaders
def merge_mapping_files(mapping_files, no_data_value='no_data'):
    """ Merge list of mapping files into a single mapping file

        mapping_files: open file objects containing mapping data
        no_data_value: value to be used in cases where there is no mapping
         field associated with a sample ID (default: 'no_data')
    """
    mapping_data = defaultdict(dict)
    all_headers = set([])

    # iterate over mapping files, parsing each
    for mapping_file in mapping_files:
        current_data, current_headers, current_comments = \
            parse_mapping_file(mapping_file, strip_quotes=False)
        all_headers.update(set(current_headers))
        for entry in current_data:
            current_values = {k: v for k, v in zip(current_headers, entry)}
            sample_id = current_values['SampleID']
            if sample_id in mapping_data:
                # if the sample id has already been seen, confirm that
                # there are no conflicting values across the different
                # mapping files (e.g., pH=5.0 and pH=6.0) - if there are,
                # raise a ValueError
                previous_data = mapping_data[sample_id]
                for key in current_values:
                    if key not in previous_data:
                        continue
                    if current_values[key] != previous_data[key]:
                        raise ValueError("Different values provided for %s "
                                         "for sample %s in different "
                                         "mapping files."
                                         % (key, sample_id))
            mapping_data[sample_id].update(current_values)

    # remove and place the fields whose order is important
    ordered_beginning = []
    for e in ['SampleID', 'BarcodeSequence', 'LinkerPrimerSequence']:
        if e in all_headers:
            all_headers.remove(e)
            ordered_beginning.append(e)

    ordered_end = []
    for e in ['Description']:
        if e in all_headers:
            all_headers.remove(e)
            ordered_end.append(e)

    ordered_headers = ordered_beginning + list(all_headers) + ordered_end

    # generate the mapping file lines containing all fields
    result = ['#' + '\t'.join(ordered_headers)]
    for sample_id, data in mapping_data.items():
        values = [data.get(k, no_data_value) for k in ordered_headers]
        result.append('\t'.join(values))

    return result
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create the output directories
    try:
        makedirs(opts.output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # This check helps users avoid overwriting previous output.
            print "Output directory already exists. Please choose " +\
                "a different directory, or force overwrite with -f."
            exit(1)

    # verify that category is in mapping file
    map_list = parse_mapping_file(open(opts.mapping_file, 'U').readlines())
    if opts.category not in map_list[1][1:]:
        print "Category '%s' not found in mapping file columns:" % \
            opts.category
        print map_list[1][1:]
        exit(1)

    # run the supervised learning algorithm
    result = run_supervised_learning(opts.input_data, opts.mapping_file,
                                     opts.category, ntree=opts.ntree,
                                     errortype=opts.errortype,
                                     output_dir=opts.output_dir,
                                     verbose=opts.verbose)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_data = parse_otu_table(open(opts.input_otu_table, 'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp,
                                                   'U'))
        result = sort_otu_table(otu_table_data, sorted_sample_ids)
    else:
        option_parser.error("must provide either --sort_field and "
                            "--mapping_fp OR --sorted_sample_ids_fp")

    # format and write the otu table
    result_str = format_otu_table(result[0], result[1], result[2],
                                  result[3])
    of = open(opts.output_fp, 'w')
    of.write(result_str)
    of.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    map_data, map_header, map_comments = parse_mapping_file(
        open(opts.map, 'U'))
    map_dict = mapping_file_to_dict(map_data, map_header)

    distdict = parse_distmat_to_dict(open(opts.distance_matrix, 'U'))

    if opts.colorby is None:
        colorby_cats = [None]
    else:
        colorby_idx = map_header.index(opts.colorby)
        colorby_cats = list(set([map_data[i][colorby_idx]
                                 for i in range(len(map_data))]))
    textfilename = os.path.splitext(opts.output_path)[0] + '.txt'
    text_fh = open(textfilename, 'w')
    text_fh.write(opts.axis_category + '\tdistance\tSampleID' + '\n')
    colorby_cats.sort()
    plt.figure()
    for cat_num, cat in enumerate(colorby_cats):
        # collect the primary and secondary samples within this category
        state1_samids, state2_samids = get_sam_ids(map_data, map_header,
                                                   opts.colorby, cat,
                                                   opts.primary_state,
                                                   opts.secondary_state)

        state1_samids = \
            list(set(state1_samids).intersection(set(distdict.keys())))
        state2_samids = \
            list(set(state2_samids).intersection(set(distdict.keys())))

        if state1_samids == [] or state2_samids == [] or \
                (len(state1_samids) == 1 and
                 state1_samids == state2_samids):
            raise RuntimeError("one category of samples didn't have any "
                               "valid distances. try eliminating samples "
                               "from -p or -s, or changing your mapping "
                               "file with filter_samples_from_otu_table.py")

        # go through dmtx
        state1_avg_dists = get_avg_dists(state1_samids, state2_samids,
                                         distdict)

        # plot
        xvals = [float(map_dict[sam][opts.axis_category])
                 for sam in state1_samids]
        try:
            color = plt.cm.jet(cat_num / (len(colorby_cats) - 1))
        except ZeroDivisionError:  # only one cat
            color = 'b'
        plt.scatter(xvals, state1_avg_dists, edgecolors=color, alpha=.5,
                    facecolors='none')
        plt.xlabel(opts.axis_category)
        plt.ylabel('average distance')

        lines = [str(xvals[i]) + '\t' + str(state1_avg_dists[i]) +
                 '\t' + state1_samids[i] + '\n'
                 for i in range(len(xvals))]
        text_fh.writelines(lines)

    if opts.colorby is not None:
        plt.legend(colorby_cats)
    plt.savefig(opts.output_path)
def add_counts_to_mapping(biom_lines, mapping_lines, otu_counts, output_fp):
    """Counts the number of seqs/OTUs per sample and adds them to the
    mapping file

    Inputs:
        biom_lines: lines of the BIOM-formatted OTU table
        mapping_lines: lines of the mapping file to update
        otu_counts: if True, count the distinct OTUs observed per sample
            rather than the sequences
        output_fp: path where the updated mapping file is written
    """
    # Parse biom file
    biom = parse_biom_table(biom_lines)
    # Parse mapping file
    map_data, headers, comments = parse_mapping_file(mapping_lines)
    # Compute the counts per sample
    min_count, max_count, median_count, mean_count, counts_per_sample = \
        compute_counts_per_sample_stats(biom, binary_counts=otu_counts)
    # Add the counts to the mapping data
    index = len(headers) - 1
    headers.insert(index, "NumIndividuals")
    for row in map_data:
        row.insert(index, str(counts_per_sample[row[0]]))
    # # Add the '#' character to the first header
    # headers[0] = '#' + headers[0]
    # # Add headers to the data
    # map_data.insert(0, headers)
    # Write the corrected mapping file
    write_corrected_mapping(output_fp, headers, comments, map_data)
def create_replicated_mapping_file(map_f, num_replicates, sample_ids):
    """Returns a formatted mapping file with replicated sample IDs.

    Each sample ID will have an ascending integer appended to it from the
    range [0, num_replicates - 1]. For example, if there are two input
    sample IDs, S1 and S2, with 3 replicates each, the output will be:
        S1.0
        S1.1
        S1.2
        S2.0
        S2.1
        S2.2

    All other metadata columns will simply be copied to the output mapping
    file. The order of input sample IDs is preserved.

    Arguments:
        map_f - input mapping file to replicate (file-like object)
        num_replicates - number of replicates at each sample
        sample_ids - only sample IDs in the mapping file that are in this
            list will be replicated. Sample IDs in the mapping file that
            are not found in this list will not be added to the resulting
            mapping file
    """
    if num_replicates < 1:
        raise ValueError("Must specify at least one sample replicate (was "
                         "provided %d)." % num_replicates)
    map_data, header, comments = parse_mapping_file(map_f)

    rep_map_data = []
    for row in map_data:
        if row[0] in sample_ids:
            for rep_num in range(num_replicates):
                rep_map_data.append(['%s.%i' % (row[0], rep_num)] +
                                    row[1:])

    return format_mapping_file(header, rep_map_data, comments)
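# --- Illustrative usage sketch (not from the original source) ---
# Replicating two hypothetical samples three times each; the function
# returns a formatted mapping file string.
example_map = ['#SampleID\tTreatment\tDescription',
               'S1\tControl\tfirst',
               'S2\tFast\tsecond']
replicated = create_replicated_mapping_file(example_map, 3, ['S1', 'S2'])
# the result contains rows S1.0, S1.1, S1.2, S2.0, S2.1, S2.2 with all
# other columns copied from the source rows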
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_fp = opts.otu_table_fp
    mapping_fp = opts.mapping_fp
    mapping_field = opts.mapping_field
    output_dir = opts.output_dir
    otu_table_base_name = splitext(split(otu_table_fp)[1])[0]

    mapping_data, headers, comments = parse_mapping_file(
        open(mapping_fp, 'U'))
    try:
        field_index = headers.index(mapping_field)
    except ValueError:
        option_parser.error("Field is not in mapping file (search is case "
                            "and white-space sensitive). \n\tProvided "
                            "field: %s. \n\tValid fields: %s"
                            % (mapping_field, ' '.join(headers)))

    mapping_values = set([e[field_index] for e in mapping_data])

    create_dir(output_dir)

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        otu_table_output_fp = join(output_dir, '%s_%s.txt'
                                   % (otu_table_base_name, v_fp_str))
        mapping_output_fp = join(output_dir, 'mapping_%s.txt' % v_fp_str)
        filter_otus_and_map(open(mapping_fp, 'U'),
                            open(otu_table_fp, 'U'),
                            open(mapping_output_fp, 'w'),
                            open(otu_table_output_fp, 'w'),
                            valid_states_str="%s:%s" % (mapping_field, v),
                            num_seqs_per_otu=1)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
            sample_metadata = mapping_file_to_dict(mapping_data,
                                                   mapping_header)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
def test_make_new_otu_counts(self):
    """make_new_otu_counts works
    """
    mapping_lines = """#SampleID\tindividual\ttimepoint_zero\ttimepoint
AT0\tA\t1\t0
AT1\tA\t0\t1
AT2\tA\t0\t2
BT0\tB\t1\t0
BT1\tB\t0\t1
BT2\tB\t0\t2
""".split('\n')
    mapping_data, header, comments = parse_mapping_file(mapping_lines)
    samples_from_subject, sample_to_subtract = \
        get_sample_individual_info(mapping_data, header, 'individual',
                                   'timepoint_zero')
    otu_lines = """# QIIME v1.2.0-dev OTU table
#OTU ID\tAT0\tAT1\tS1\tAT2\tBT0\tBT1\tBT2
0\t0.5\t0.3\t99\t0.2\t0.0\t0.0\t0.0
1\t0.0\t0.0\t99\t0.0\t0.4\t0.5\t0.6
2\t0.1\t0.4\t99\t0.7\t0.5\t0.6\t0.8
3\t0.0\t0.1\t99\t0.0\t0.4\t0.0\t0.0
""".split('\n')
    otu_table = parse_otu_table(otu_lines, float)
    sample_ids, otu_ids, otu_counts, consensus = otu_table
    converted_otu_table = make_new_otu_counts(otu_ids, sample_ids,
                                              otu_counts, consensus,
                                              sample_to_subtract,
                                              samples_from_subject)
    converted_otu_table = converted_otu_table.split('\n')
    self.assertEqual(converted_otu_table[1],
                     "#OTU ID\tAT0\tAT1\tAT2\tBT0\tBT1\tBT2")
    self.assertEqual(converted_otu_table[2],
                     "0\t0.0\t-0.2\t-0.3\t999999999.0\t999999999.0"
                     "\t999999999.0")
    self.assertEqual(converted_otu_table[3],
                     "1\t999999999.0\t999999999.0\t999999999.0\t0.0\t0.1"
                     "\t0.2")
    self.assertEqual(converted_otu_table[4],
                     "2\t0.0\t0.3\t0.6\t0.0\t0.1\t0.3")
    self.assertEqual(converted_otu_table[5],
                     "3\t0.0\t0.1\t0.0\t0.0\t-0.4\t-0.4")
def get_technical_lengths(input_map, debug=False):
    """Returns per-sample info on technical lengths.

    Note: KEY_SEQ, BARCODE and PRIMER fields are required. The LINKER
    field is optional.
    """
    if debug:
        print "Making debug output"
    body, header, comments = parse_mapping_file(input_map)
    if debug:
        print "HEADER:", header
    key_index = header.index('KEY_SEQ')
    bc_index = header.index('BARCODE')
    if 'LINKER' in header:
        linker_index = header.index('LINKER')
    else:
        linker_index = None
    primer_index = header.index('PRIMER')
    technical_lengths = {}
    for fields in body:
        curr_tech_len = len(fields[key_index]) + len(fields[bc_index]) + \
            len(fields[primer_index])
        if linker_index is not None:
            curr_tech_len += len(fields[linker_index])
        technical_lengths[fields[0]] = curr_tech_len
    if debug:
        print "Technical lengths:"
        print technical_lengths
    return technical_lengths
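# --- Illustrative usage sketch (not from the original source) ---
# A toy mapping file (sequences invented) showing the per-sample
# arithmetic: technical length = len(KEY_SEQ) + len(BARCODE) +
# len(PRIMER), plus len(LINKER) when that optional column is present.
toy_map = ['#SampleID\tKEY_SEQ\tBARCODE\tPRIMER',
           'S1\tTCAG\tACGTACGT\tYATGCTGCCTCCCGTAGGAGT']
lengths = get_technical_lengths(toy_map)
# lengths == {'S1': 33}, i.e. 4 + 8 + 21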
def add_counts_to_mapping(biom_lines, mapping_lines, otu_counts, output_fp):
    """Counts the number of seqs/OTUs per sample and adds them to the
    mapping file

    Inputs:
        biom_lines: lines of the BIOM-formatted OTU table
        mapping_lines: lines of the mapping file to update
        otu_counts: if True, count the distinct OTUs observed per sample
            rather than the sequences
        output_fp: path where the updated mapping file is written
    """
    # Parse biom file
    biom = parse_biom_table(biom_lines)
    # Parse mapping file
    map_data, headers, comments = parse_mapping_file(mapping_lines)
    # Compute the counts per sample
    min_count, max_count, median_count, mean_count, counts_per_sample = \
        compute_counts_per_sample_stats(biom, binary_counts=otu_counts)
    # Add the counts to the mapping data
    index = len(headers) - 1
    headers.insert(index, "NumIndividuals")
    for row in map_data:
        row.insert(index, str(counts_per_sample[row[0]]))
    # Add the '#' character to the first header
    headers[0] = '#' + headers[0]
    # Add headers to the data
    map_data.insert(0, headers)
    # Write the corrected mapping file
    write_corrected_file(map_data, comments, output_fp)
def test_sort_otu_table_by_mapping_field_error(self):
    """ sort_otu_table_by_mapping_field fails on samples in otu table but
        not mapping
    """
    self.assertRaises(KeyError,
                      sort_otu_table_by_mapping_field,
                      parse_biom_table_str(self.otu_table1_bad_sampleID),
                      parse_mapping_file(self.mapping_f2),
                      sort_field="Age")
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        mapping_data, mapping_header, mapping_comments = \
            parse_mapping_file(open(opts.mapping_fp, 'U'))
        sample_metadata = assemble_sample_metadata(mapping_data,
                                                   mapping_header,
                                                   mapping_comments)

    biom_otu_table = make_otu_table(open(opts.otu_map_fp, 'U'),
                                    otu_to_taxonomy=otu_to_taxonomy,
                                    otu_ids_to_exclude=ids_to_exclude,
                                    sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    columns_to_merge = opts.columns_to_merge
    mapping_fp = opts.mapping_fp
    output_fp = opts.output_fp

    try:
        data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    except Exception:
        option_parser.error("The input does not look like a valid mapping "
                            "file.")

    for merging in columns_to_merge:
        indices = [headers.index(column) for column in merging.split('&&')]

        headers.append(''.join([headers[element] for element in indices]))
        for line in data:
            line.append(''.join([line[element] for element in indices]))

    # this should never happen
    assert len(headers) == len(data[0]), \
        "The number of headers no longer matches the number of columns " \
        "per row."

    lines = format_mapping_file(headers, data, comments)
    fd = open(output_fp, 'w')
    fd.writelines(lines)
    fd.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    output_fp = opts.output_fp

    map_data, header, comments = parse_mapping_file(opts.input_fp)

    if opts.category not in header:
        option_parser.error("%s doesn't appear to exist in the mapping "
                            "file!" % opts.category)

    # use stdout or the user supplied file path
    if output_fp:
        fd = open(output_fp, 'w')
    else:
        fd = stdout

    result = defaultdict(int)
    cat_idx = header.index(opts.category)
    for samp in map_data:
        result[samp[cat_idx]] += 1

    for cat_val in natsort(result):
        if not cat_val:
            fd.write("***UNSPECIFIED***\t%d\n" % result[cat_val])
        else:
            fd.write("%s\t%d\n" % (cat_val, result[cat_val]))

    fd.close()
def test_get_sam_ids(self):
    """set of sample ids in get_sam_ids should be correct"""
    map_file = StringIO.StringIO(
        '#SampleID\tCountry\tAgeYears\tFamily\tAgeCat\n'
        'h208A.1\tMalawi\t0.032854209\th208\tChild\n'
        'h301A.1\tMalawi\t0.05\th301\tChild\n'
        'h301B.1\tMalawi\t0.05\th301\tChild\n'
        'USinfTw20.1\tUSA\t0.083333333\tUSinfTw20\tChild\n'
        'USinfTw20.2\tUSA\t0.083333333\tUSinfTw20\tChild\n'
        'USinfTw1.1\tUSA\t0.083333333\tUSinfTw1\tChild\n'
        'h10M\tMalawi\t26\th10\tAdult\n'
        'h68M\tMalawi\t26\th68\tAdult\n'
        'TS25\tUSA\t26\tUSts9\tAdult\n'
        'TS26\tUSA\t26\tUSts9\tAdult')
    map_data, map_header, comments = parse_mapping_file(map_file)
    colorby = 'Country'
    cat = 'USA'
    primary_state = 'AgeCat:Child'
    ids1, ids2 = get_sam_ids(map_data, map_header, colorby, cat,
                             primary_state, secondary_state=None)
    self.assertEqual(set(ids1),
                     set(['USinfTw20.1', 'USinfTw20.2', 'USinfTw1.1']))
    self.assertEqual(set(ids2), set(['TS25', 'TS26']))
def test_sort_otu_table_by_mapping_field_some_values_same(self):
    """ sort_otu_table_by_mapping_field fns when some values are the same"""
    actual = sort_otu_table_by_mapping_field(
        parse_biom_table_str(self.otu_table1),
        parse_mapping_file(self.mapping_f2),
        sort_field="Name")
    expected = parse_biom_table_str(self.name_sorted_otu_table1)
    self.assertEqual(actual, expected)
def test_sort_otu_table_by_mapping_field_some_values_differ(self):
    """ sort_otu_table_by_mapping_field fns when some values differ"""
    actual = sort_otu_table_by_mapping_field(
        parse_biom_table_str(self.otu_table1),
        parse_mapping_file(self.mapping_f2),
        sort_field="Nothing")
    expected = parse_biom_table_str(self.nothing_sorted_otu_table1)
    self.assertEqual(actual, expected)
def test_sort_otu_table_by_mapping_field_all_values_differ(self):
    """ sort_otu_table_by_mapping_field fns when all values differ"""
    actual = sort_otu_table_by_mapping_field(
        parse_biom_table_str(self.otu_table1),
        parse_mapping_file(self.mapping_f2),
        sort_field="Age")
    expected = parse_biom_table_str(self.age_sorted_otu_table1)
    self.assertEqual(actual, expected)
def get_taxa(taxa_fname, sample_ids):
    """Opens and returns the parsed taxa summary data"""
    try:
        lines = open(taxa_fname, 'U').readlines()
    except (TypeError, IOError):
        raise MissingFileError('Taxa summary file required for this '
                               'analysis')
    taxa_summary = parse_mapping_file(lines)
    return taxa_summary
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that biom file and mapping file have matching sample names.
    # discard those samples that do not appear in both.
    shared_samples = list(set(mdata[:, 0]).intersection(
        bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample')
                       if s in shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)
    # check that the specified fields are in the mapping data
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not '
                         'found in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields,
                                                              mheaders,
                                                              mdata)
    for sg, vg in zip(sample_groups, value_groups):
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
def main():
    """run denoiser on input flowgrams"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps

    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s '
                                 '\nPass a valid one via -i.') % f)
    outdir = opts.output_dir
    create_dir(outdir, fail_on_exist=not opts.force)

    log_fh = None

    if not (opts.primer or opts.map_fname):
        raise ApplicationError("Either mapping file or primer required")
    # Read primer from metadata file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))
        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        if len(all_primers) != 1:
            raise ValueError("Currently only data sets with one primer are "
                             "allowed.\nMake separate mapping files with "
                             "only one primer, re-run split_libraries and\n"
                             "denoise with each split_library output "
                             "separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support primers with "
                             "degenerate bases at their 3' end.")
    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(opts.sff_fps, opts.fasta_fp,
                                               outdir, opts.num_cpus,
                                               primer,
                                               titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    for i, cluster in cluster_mapping.iteritems():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
def test_mapping_data_to_barcode_map(self):
    """mapping_data_to_barcode_map: functions as expected
    """
    mapping_data, mapping_headers, mapping_comments = \
        parse_mapping_file(self.mapping_f)
    expected = {'GGTGGT': 'Samp2',
                'GGAGGT': 'SAMP_1',
                'GGTTAA': 'dflsdflsdfsdfsdfsd'}
    self.assertEqual(mapping_data_to_barcode_map(mapping_data), expected)