예제 #1
0
 def test_format_otu_table(self):
     """format_otu_table should return tab-delimited table

     Covers the non-legacy layout with and without a lineage column, and
     checks that three OTU ids against a two-row matrix raise ValueError.
     """
     counts = array([[1,2,3],[4,5,2718281828459045]])
     sample_ids = ['a','b','c']
     otu_ids = [1,2]
     lineages = ['Bacteria','Archaea']

     # counts only
     observed = format_otu_table(sample_ids, otu_ids, counts, legacy=False)
     expected = '# QIIME v%s OTU table\nOTU ID\ta\tb\tc\n1\t1\t2\t3\n2\t4\t5\t2718281828459045' % __version__
     self.assertEqual(observed, expected)

     # counts plus a trailing lineage column
     observed = format_otu_table(sample_ids, otu_ids, counts, lineages,
                                 legacy=False)
     expected = '# QIIME v%s OTU table\nOTU ID\ta\tb\tc\tConsensus Lineage\n1\t1\t2\t3\tBacteria\n2\t4\t5\t2718281828459045\tArchaea' % __version__
     self.assertEqual(observed, expected)

     # number of OTU ids does not match the number of matrix rows
     self.assertRaises(ValueError, format_otu_table,
                       sample_ids, [1,2,3], counts)
예제 #2
0
 def test_format_otu_table(self):
     """format_otu_table should return tab-delimited table

     Exercises the default call (no extra keyword arguments) with and
     without a lineage column, plus the mismatched-OTU-id error case.
     """
     matrix = array([[1,2,3],[4,5,2718281828459045]])
     sample_names = ['a','b','c']
     otu_names = [1,2]
     lineage_column = ['Bacteria','Archaea']

     # table without taxonomy
     without_taxa = format_otu_table(sample_names, otu_names, matrix)
     self.assertEqual(
         without_taxa,
         '#Full OTU Counts\n#OTU ID\ta\tb\tc\n1\t1\t2\t3\n2\t4\t5\t2718281828459045')

     # table with a trailing lineage column
     with_taxa = format_otu_table(sample_names, otu_names, matrix,
                                  lineage_column)
     self.assertEqual(
         with_taxa,
         '#Full OTU Counts\n#OTU ID\ta\tb\tc\tConsensus Lineage\n1\t1\t2\t3\tBacteria\n2\t4\t5\t2718281828459045\tArchaea')

     # three OTU ids for a two-row matrix must be rejected
     self.assertRaises(ValueError, format_otu_table,
                       sample_names, [1,2,3], matrix)
예제 #3
0
def filter_samples_from_otu_table(otu_table_lines,
                                  samples_to_discard,
                                  negate=False):
    """Return OTU table lines with the listed samples filtered out.

    otu_table_lines: OTU table lines, handed to parse_otu_table
    samples_to_discard: iterable of strings; the first whitespace-delimited
     token of each entry is used as a sample id
    negate: when true, keep ONLY the listed samples instead of dropping them

    The surviving columns are re-formatted with format_otu_table
    (skip_empty=True) and returned as a list of lines.
    """
    sample_ids, otu_ids, counts, taxa = parse_otu_table(otu_table_lines)

    listed = dict.fromkeys([entry.split()[0] for entry in samples_to_discard])

    # Pick the membership test once rather than branching per sample.
    if negate:
        wanted = lambda sample_id: sample_id in listed
    else:
        wanted = lambda sample_id: sample_id not in listed

    # Transposing turns each row into one sample's counts.
    survivors = [(sample_id, column)
                 for column, sample_id in zip(counts.transpose(), sample_ids)
                 if wanted(sample_id)]
    kept_ids = [sample_id for sample_id, _ in survivors]
    kept_columns = [column for _, column in survivors]

    formatted = format_otu_table(kept_ids,
                                 otu_ids,
                                 array(kept_columns).transpose(),
                                 taxa,
                                 skip_empty=True)
    return formatted.split('\n')
def make_new_otu_counts(otu_ids, sample_ids, otu_counts, consensus, \
    sample_to_subtract, samples_from_subject):
    """Build and format the converted (baseline-subtracted) OTU table.

    otu_ids: OTU identifiers, one per row of otu_counts
    sample_ids: list of sample identifiers, one per column of otu_counts
    otu_counts: 2D array indexed as [otu_index, sample_index]
    consensus: taxonomy passed through to format_otu_table
    sample_to_subtract: dict mapping each output sample id to the sample
     whose counts are subtracted from it (the "timepoint zero" sample)
    samples_from_subject: dict mapping each output sample id to the sample
     ids inspected when checking whether an OTU was ever observed

    Each output cell is the count of the sample minus the count of its
    timepoint-zero sample. If the OTU has no positive count in any of the
    samples listed in samples_from_subject for that sample, the cell is set
    to the sentinel 999999999 instead.

    Raises ValueError if a timepoint-zero sample, an output sample, or a
    subject sample is missing from sample_ids. These checks run once per
    output sample, before any OTU is processed.

    Returns the string produced by format_otu_table.
    """
    # sorted() works on any mapping and on both Python 2 and 3, unlike
    # calling .sort() on the result of .keys().
    new_sample_ids = sorted(sample_to_subtract)
    new_otu_counts = zeros([len(otu_ids), len(new_sample_ids)])
    for index2, sample in enumerate(new_sample_ids):
        # All column lookups depend only on the sample, so resolve them once
        # here rather than repeating them for every OTU.
        tpz_sample = sample_to_subtract[sample]
        if tpz_sample not in sample_ids:
            raise ValueError("There are samples in the category mapping file that are not in the otu table, such as sample: " + tpz_sample + ". Removing these samples from the category mapping file will allow you to proceed.")
        tpz_sample_index = sample_ids.index(tpz_sample)
        old_sample_index = sample_ids.index(sample)
        subject_indices = [sample_ids.index(i)
                           for i in samples_from_subject[sample]]

        for index1 in range(len(otu_ids)):
            # make sure that the count is not zero across all of the
            # subject's samples
            has_nonzeros = False
            for sample_index in subject_indices:
                if otu_counts[index1, sample_index] > 0:
                    has_nonzeros = True
                    break
            if has_nonzeros:
                # abundance at the later timepoint minus the abundance at
                # timepoint zero
                new_otu_counts[index1, index2] = \
                    otu_counts[index1, old_sample_index] - \
                    otu_counts[index1, tpz_sample_index]
            else:
                new_otu_counts[index1, index2] = 999999999
    return format_otu_table(new_sample_ids, otu_ids, new_otu_counts, consensus)
예제 #5
0
def filter_otus_from_otu_table(otu_table_lines,otus_to_discard,negate=False):
    """Remove specified OTUs from otu_table

    otu_table_lines: OTU table lines, handed to parse_otu_table
    otus_to_discard: iterable of strings; the first whitespace-delimited
     token of each entry is used as an OTU id
    negate: when true, keep ONLY the listed OTUs instead of dropping them

    Returns the filtered table, re-formatted with format_otu_table, as a
    list of lines.
    """
    sample_ids, otu_ids, otu_table_data, taxa = \
        parse_otu_table(otu_table_lines)

    otu_lookup = set(e.split()[0] for e in otus_to_discard)
    keep_listed = bool(negate)

    # A table without a taxonomy column yields an empty taxa sequence.
    # Zipping against it would silently drop every OTU, so only pair rows
    # with taxonomy when there is taxonomy to pair with.
    if len(taxa) > 0:
        records = zip(otu_table_data, otu_ids, taxa)
        has_taxa = True
    else:
        records = ((row, otu_id, None)
                   for row, otu_id in zip(otu_table_data, otu_ids))
        has_taxa = False

    new_otu_table_data = []
    new_otu_ids = []
    new_taxa = []
    for row, otu_id, taxonomy in records:
        # keep listed OTUs when negating, unlisted OTUs otherwise
        if (otu_id in otu_lookup) != keep_listed:
            continue
        new_otu_table_data.append(row)
        new_otu_ids.append(otu_id)
        if has_taxa:
            new_taxa.append(taxonomy)

    result = format_otu_table(sample_ids,
                              new_otu_ids,
                              array(new_otu_table_data),
                              new_taxa).split('\n')
    return result
def summarize_by_cat(map_lines,otu_sample_lines,category,norm):
    """Build the OTU table summarized by category and return it formatted.

    map_lines and category are passed to get_sample_cat_info;
    otu_sample_lines and norm are passed on to get_counts_by_cat.
    Category columns whose counts sum to zero or less are dropped before
    the table is handed to format_otu_table.
    """
    sample_info = get_sample_cat_info(map_lines,category)
    cat_by_sample, sample_by_cat, num_meta, meta_dict, label_lists_dict, \
                   num_samples_by_cat = sample_info

    lines, otus, taxonomy = get_counts_by_cat(
        otu_sample_lines, num_meta, meta_dict, label_lists_dict[category],
        category, num_samples_by_cat, norm)

    # A mapping file may list more samples than the OTU table contains,
    # which leaves category columns with no counts; keep only columns with
    # a positive total.
    kept = [(label_lists_dict[category][position], column)
            for position, column in enumerate(zip(*lines))
            if sum([float(value) for value in column]) > 0]
    kept_labels = [label for label, _ in kept]
    kept_columns = [column for _, column in kept]

    # Transpose back to one row per OTU.
    rows = list(zip(*kept_columns))

    return format_otu_table(kept_labels, otus, array(rows),
                            taxonomy=taxonomy,
                            comment='Category OTU Counts-%s'% category)
예제 #7
0
def pool_otu_table(otu_infile, otu_outfile, 
    pooled_sample_name, sample_ids_to_pool):
    """Collapse the listed samples of an OTU table into one pooled sample.

    otu_infile: OTU table lines, handed to parse_otu_table
    otu_outfile: object with a write() method that receives the new table
    pooled_sample_name: id given to the pooled column, which is placed last
    sample_ids_to_pool: container of sample ids whose counts are summed

    Samples not being pooled keep their original relative order.
    """
    sample_ids, otu_ids, counts, taxonomy = parse_otu_table(otu_infile)

    # Split the column indices into pooled and untouched groups.
    pooled_cols = []
    kept_cols = []
    for col, sample_id in enumerate(sample_ids):
        if sample_id in sample_ids_to_pool:
            pooled_cols.append(col)
        else:
            kept_cols.append(col)

    new_sample_ids = [sample_ids[col] for col in kept_cols]
    new_sample_ids.append(pooled_sample_name)

    # Untouched columns first, then the row-wise sum of the pooled ones.
    pooled_abund = counts[:,pooled_cols].sum(1)
    merged = numpy.zeros((len(counts), len(new_sample_ids)),
                         dtype=counts.dtype)
    merged[:,:-1] = counts[:,kept_cols]
    merged[:,-1] = pooled_abund

    otu_outfile.write(format_otu_table(new_sample_ids, otu_ids,
        merged, taxonomy=taxonomy))
예제 #8
0
def main():
    """Sort the samples of an OTU table and write the result.

    The order comes either from a mapping-file field (--sort_field together
    with --mapping_fp) or from a file of pre-sorted sample ids
    (--sorted_sample_ids_fp). If neither is supplied, the option parser
    reports a usage error.
    """
    option_parser, opts, args =\
      parse_command_line_parameters(**script_info)

    otu_table_data = parse_otu_table(open(opts.input_otu_table,'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp
    
    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp,'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp,'U'))
        result = sort_otu_table(otu_table_data,
                                sorted_sample_ids)
    else:
        # The parser object returned above is bound to `option_parser`;
        # this function defines no `parser` name.
        option_parser.error("must provide either --sort_field and --mapping_fp OR --sorted_sample_ids_fp")

    # format and write the otu table
    result_str = format_otu_table(result[0],result[1],result[2],result[3])
    # `with` closes the output file even if the write fails
    with open(opts.output_fp,'w') as of:
        of.write(result_str)
예제 #9
0
def pool_otu_table(otu_infile, otu_outfile, 
    pooled_sample_name, sample_ids_to_pool):
    """Write an OTU table in which the given samples are summed into one.

    The table parsed from otu_infile is rewritten so that every sample in
    sample_ids_to_pool is replaced by a single final column named
    pooled_sample_name holding the per-OTU sum of the pooled columns. The
    remaining samples keep their order. The formatted result is passed to
    otu_outfile.write().
    """
    sample_ids, otu_ids, otu_mtx, taxonomy = parse_otu_table(otu_infile)

    # One membership flag per sample, then the two index groups.
    in_pool = [sample_id in sample_ids_to_pool for sample_id in sample_ids]
    pool_idxs = [i for i, pooled in enumerate(in_pool) if pooled]
    rest_idxs = [i for i, pooled in enumerate(in_pool) if not pooled]

    out_sample_ids = [sample_ids[i] for i in rest_idxs] + [pooled_sample_name]

    # Allocate the output matrix with the input dtype, copy the untouched
    # columns, and put the pooled sums in the last column.
    pooled_column = otu_mtx[:,pool_idxs].sum(1)
    out_shape = (len(otu_mtx), len(out_sample_ids))
    out_mtx = numpy.zeros(out_shape, dtype=otu_mtx.dtype)
    out_mtx[:,:-1] = otu_mtx[:,rest_idxs]
    out_mtx[:,-1] = pooled_column

    formatted = format_otu_table(out_sample_ids, otu_ids,
        out_mtx, taxonomy=taxonomy)
    otu_outfile.write(formatted)
예제 #10
0
def reconcile_hosts_symbionts(otu_file, host_dist):
    """Trim a cOTU table and a host distance matrix to matching samples.

    otu_file: cOTU table lines
    host_dist: host distance matrix; its first element supplies the sample
     ids used to filter the table

    Returns (StringIO of the filtered OTU table, filtered host distances).
    """
    # Keep only the table samples that appear in the host distance matrix.
    host_sample_ids = host_dist[0]
    table_lines = filter_samples_from_otu_table(otu_file,
                                                host_sample_ids,
                                                negate=True)

    sample_names, taxon_names, data, lineages = parse_otu_table(table_lines)

    # Second filtering pass: skip_empty in format_otu_table (as called from
    # filter_samples_from_otu_table) doesn't seem to be working.
    sample_names, taxon_names, data, lineages = filter_otu_table_by_min(
        sample_names, taxon_names, data, lineages, min=1)

    # Restrict the host distances to the samples that survived. This needs
    # the modified filter method that returns a native dm tuple rather than
    # a string.
    host_dist_filtered = filter_samples_from_distance_matrix(
        host_dist, sample_names, negate=True)

    table_str = format_otu_table(sample_names, taxon_names, data, lineages)
    return StringIO(table_str), host_dist_filtered
예제 #11
0
def _filter_table_neg_control(otu_table_lines, samples):
    """Drop OTUs observed in any of the given (negative control) samples.

    otu_table_lines: OTU table lines, handed to parse_otu_table
    samples: sample ids to treat as negative controls; ids absent from the
     table are ignored

    An OTU is kept only when its counts over the control samples total
    zero. The reduced table is then passed through _filter_table_samples
    with a minimum of 1, and that result is returned.
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_lines)

    # column positions of the control samples present in the table
    control_columns = [sample_ids.index(name)
                       for name in samples if name in sample_ids]

    kept_rows = []
    kept_ids = []
    kept_lineages = []
    for row_index, row in enumerate(otu_table):
        control_total = 0
        for column in control_columns:
            control_total += row[column]
        if control_total != 0:
            # seen in a negative control: leave this OTU out
            continue
        if lineages:
            kept_lineages.append(lineages[row_index])
        kept_rows.append(list(row))
        kept_ids.append(otu_ids[row_index])

    table_lines = format_otu_table(
        sample_ids, kept_ids, array(kept_rows), kept_lineages).split('\n')
    # remove the samples
    return _filter_table_samples(table_lines, 1)
    def setUp(self):
        """Create the 19-sample by 9-taxon fixture shared by the tests."""
        # One row per sample and one column per taxon; the matrix is
        # transposed before being handed to format_otu_table.
        self.l19_data = numpy.array([
            [7, 1, 0, 0, 0, 0, 0, 0, 0],
            [4, 2, 0, 0, 0, 1, 0, 0, 0],
            [2, 4, 0, 0, 0, 1, 0, 0, 0],
            [1, 7, 0, 0, 0, 0, 0, 0, 0],
            [0, 8, 0, 0, 0, 0, 0, 0, 0],
            [0, 7, 1, 0, 0, 0, 0, 0, 0],
            [0, 4, 2, 0, 0, 0, 2, 0, 0],
            [0, 2, 4, 0, 0, 0, 1, 0, 0],
            [0, 1, 7, 0, 0, 0, 0, 0, 0],
            [0, 0, 8, 0, 0, 0, 0, 0, 0],
            [0, 0, 7, 1, 0, 0, 0, 0, 0],
            [0, 0, 4, 2, 0, 0, 0, 3, 0],
            [0, 0, 2, 4, 0, 0, 0, 1, 0],
            [0, 0, 1, 7, 0, 0, 0, 0, 0],
            [0, 0, 0, 8, 0, 0, 0, 0, 0],
            [0, 0, 0, 7, 1, 0, 0, 0, 0],
            [0, 0, 0, 4, 2, 0, 0, 0, 4],
            [0, 0, 0, 2, 4, 0, 0, 0, 1],
            [0, 0, 0, 1, 7, 0, 0, 0, 0],
        ])

        # sam1 .. sam19, with the tenth entry named 'sam_middle'
        sample_names = ['sam%d' % number for number in range(1, 20)]
        sample_names[9] = 'sam_middle'
        self.l19_sample_names = sample_names

        # tax1 .. tax9, with the fifth entry named 'endbigtaxon'
        taxon_names = ['tax%d' % number for number in range(1, 10)]
        taxon_names[4] = 'endbigtaxon'
        self.l19_taxon_names = taxon_names
        # same taxa, but the first name contains an underscore
        self.l19_taxon_names_w_underscore = ['ta_x1'] + taxon_names[1:]

        transposed = self.l19_data.T
        self.l19_str = format_otu_table(
            self.l19_sample_names, self.l19_taxon_names, transposed)
        self.l19_str_w_underscore = format_otu_table(
            self.l19_sample_names, self.l19_taxon_names_w_underscore,
            transposed)

        self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4, ((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'
        self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

        # bookkeeping lists, initially empty
        self.files_to_remove = []
        self.folders_to_remove = []
예제 #13
0
def _filter_table_samples(otu_table_lines, min_seqs_per_sample):
    """Drop samples whose total count is below min_seqs_per_sample.

    otu_table_lines: OTU table lines, handed to parse_otu_table
    min_seqs_per_sample: threshold, converted with int()

    Returns the surviving columns re-formatted by format_otu_table.
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_lines)

    # NOTE(review): this relies on `sum` adding the table's rows together
    # to give one total per sample column -- confirm which `sum` the module
    # has in scope.
    totals = sum(otu_table)
    kept_columns = (totals>=int(min_seqs_per_sample)).nonzero()[0]

    kept_table = otu_table.copy()[:,kept_columns]
    kept_sample_ids = [sample_ids[column] for column in kept_columns]
    return format_otu_table(kept_sample_ids, otu_ids, kept_table, lineages)
예제 #14
0
 def _write_rarefaction(self, fname, sub_sample_ids, sub_otu_ids,\
     sub_otu_table, otu_lineages):
     """Write a subsampled OTU table to the file named fname.

     fname: path of the output file; also used as the table comment
     sub_sample_ids, sub_otu_ids, sub_otu_table, otu_lineages: passed to
      format_otu_table unchanged

     Nothing is written (and None is returned) when the table has a
     zero-length dimension.
     """
     if min(numpy.shape(sub_otu_table)) == 0:  # no data to write
         return
     # Format before opening, so a formatting failure cannot leave an open
     # handle or an empty file behind.
     table_str = format_otu_table(sub_sample_ids, sub_otu_ids,\
         sub_otu_table, otu_lineages, comment=fname)
     with open(fname, 'w') as f:
         f.write(table_str)
예제 #15
0
def _filter_table_samples(otu_table_lines, min_seqs_per_sample):
    """Keep only samples with at least min_seqs_per_sample total counts.

    The table is parsed with parse_otu_table, columns whose totals fall
    below int(min_seqs_per_sample) are discarded, and the remainder is
    returned as formatted by format_otu_table.
    """
    parsed = parse_otu_table(otu_table_lines)
    sample_ids, otu_ids, otu_table, lineages = parsed

    # NOTE(review): assumes `sum` collapses the rows into per-column
    # (per-sample) totals -- confirm against the module's imports.
    column_totals = sum(otu_table)
    passing = (column_totals >= int(min_seqs_per_sample)).nonzero()
    passing_columns = passing[0]

    trimmed_table = otu_table.copy()
    trimmed_table = trimmed_table[:, passing_columns]
    trimmed_ids = [sample_ids[position] for position in passing_columns]
    return format_otu_table(trimmed_ids, otu_ids, trimmed_table, lineages)
예제 #16
0
 def _write_rarefaction(self, depth, rep, sub_sample_ids, sub_otu_ids,\
     sub_otu_table, otu_lineages):
     """Write one rarefied OTU table into self.output_dir.

     depth and rep can be numbers or strings; they are combined into the
     file name 'rarefaction_<depth>_<rep>.txt', which is also used as the
     table comment. The remaining arguments are passed to
     format_otu_table unchanged.

     Nothing is written (and None is returned) when the table has a
     zero-length dimension.
     """
     if min(numpy.shape(sub_otu_table)) == 0:  # no data to write
         return
     fname = 'rarefaction_' + str(depth) + '_' + str(rep) + '.txt'
     # Format before opening, so a formatting failure cannot leave an open
     # handle or an empty file behind.
     table_str = format_otu_table(sub_sample_ids, sub_otu_ids,\
         sub_otu_table, otu_lineages, comment=fname)
     with open(os.path.join(self.output_dir, fname), 'w') as f:
         f.write(table_str)
예제 #17
0
    def setUp(self):
        """Prepare the 19-sample, 9-taxon table, its names and its tree."""
        # Rows correspond to samples and columns to taxa; format_otu_table
        # receives the transpose.
        rows = [
            [7, 1, 0, 0, 0, 0, 0, 0, 0],
            [4, 2, 0, 0, 0, 1, 0, 0, 0],
            [2, 4, 0, 0, 0, 1, 0, 0, 0],
            [1, 7, 0, 0, 0, 0, 0, 0, 0],
            [0, 8, 0, 0, 0, 0, 0, 0, 0],
            [0, 7, 1, 0, 0, 0, 0, 0, 0],
            [0, 4, 2, 0, 0, 0, 2, 0, 0],
            [0, 2, 4, 0, 0, 0, 1, 0, 0],
            [0, 1, 7, 0, 0, 0, 0, 0, 0],
            [0, 0, 8, 0, 0, 0, 0, 0, 0],
            [0, 0, 7, 1, 0, 0, 0, 0, 0],
            [0, 0, 4, 2, 0, 0, 0, 3, 0],
            [0, 0, 2, 4, 0, 0, 0, 1, 0],
            [0, 0, 1, 7, 0, 0, 0, 0, 0],
            [0, 0, 0, 8, 0, 0, 0, 0, 0],
            [0, 0, 0, 7, 1, 0, 0, 0, 0],
            [0, 0, 0, 4, 2, 0, 0, 0, 4],
            [0, 0, 0, 2, 4, 0, 0, 0, 1],
            [0, 0, 0, 1, 7, 0, 0, 0, 0],
        ]
        self.l19_data = numpy.array(rows)

        # Nine numbered samples on either side of 'sam_middle'.
        first_half = ['sam%d' % k for k in range(1, 10)]
        second_half = ['sam%d' % k for k in range(11, 20)]
        self.l19_sample_names = first_half + ['sam_middle'] + second_half

        # Numbered taxa with 'endbigtaxon' in the fifth position.
        self.l19_taxon_names = (['tax%d' % k for k in range(1, 5)] +
                                ['endbigtaxon'] +
                                ['tax%d' % k for k in range(6, 10)])
        # Identical except that the first name carries an underscore.
        self.l19_taxon_names_w_underscore = \
            ['ta_x1'] + self.l19_taxon_names[1:]

        otu_by_sample = self.l19_data.T
        self.l19_str = format_otu_table(
            self.l19_sample_names, self.l19_taxon_names, otu_by_sample)
        self.l19_str_w_underscore = format_otu_table(
            self.l19_sample_names,
            self.l19_taxon_names_w_underscore,
            otu_by_sample)

        self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4, ((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'
        self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

        self.files_to_remove = []
예제 #18
0
 def test_format_otu_table(self):
     """format_otu_table should return biom-formatted string"""
     counts = array([[1, 2, 3], [4, 5, 2718281828459045]])
     sample_ids = ["a", "b", "c"]
     otu_ids = [1, 2]
     # Defined for parity with the fixture; not passed to format_otu_table.
     taxa = ["Bacteria", "Archaea"]

     biom_lines = format_otu_table(sample_ids, otu_ids, counts).split("\n")

     # Round-trip through the biom parser and check both id axes.
     table = parse_biom_table(biom_lines)
     self.assertEqual(table.ObservationIds, ("1", "2"))
     self.assertEqual(table.SampleIds, ("a", "b", "c"))
예제 #19
0
 def test_format_otu_table(self):
     """format_otu_table should return biom-formatted string"""
     samples = ['a', 'b', 'c']
     otus = [1, 2]
     # Not used by the call below.
     taxa = ['Bacteria', 'Archaea']
     data = array([[1, 2, 3], [4, 5, 2718281828459045]])

     formatted = format_otu_table(samples, otus, data)

     # The output must parse as biom and expose the ids as strings.
     parsed = parse_biom_table(formatted.split('\n'))
     expected_observation_ids = ('1', '2')
     expected_sample_ids = ('a', 'b', 'c')
     self.assertEqual(parsed.ObservationIds, expected_observation_ids)
     self.assertEqual(parsed.SampleIds, expected_sample_ids)
예제 #20
0
 def test_format_otu_table(self):
     """format_otu_table should return biom-formatted string"""
     row_one = [1,2,3]
     row_two = [4,5,2718281828459045]
     matrix = array([row_one, row_two])
     sample_names = ['a','b','c']
     otu_names = [1,2]
     # Present in the fixture but not given to format_otu_table here.
     taxa = ['Bacteria','Archaea']

     output = format_otu_table(sample_names, otu_names, matrix)

     # Parsing the output must give a biom table carrying the expected
     # observation and sample ids.
     biom_table = parse_biom_table(output.split('\n'))
     self.assertEqual(biom_table.ObservationIds, ('1','2'))
     self.assertEqual(biom_table.SampleIds, ('a','b','c'))
예제 #21
0
def merge_n_otu_tables(otu_table_fs):
    """ Merge n otu tables

    otu_table_fs: sequence of at least two OTU tables (as accepted by
     merge_otu_tables)

    The tables are folded left to right: the running result is re-formatted
    into lines after each merge so it can be fed back in as the left-hand
    table of the next merge.

    Returns (sample_names, otu_names, data, taxonomy) of the final merge.
    Raises ValueError when fewer than two tables are supplied.
    """
    if len(otu_table_fs) < 2:
        # call form of raise is valid on both Python 2 and Python 3
        raise ValueError("Two or more OTU tables must be provided.")
    otu_table_f0 = otu_table_fs[0]
    for otu_table_f in otu_table_fs[1:]:
        sample_names, otu_names, data, taxonomy = \
         merge_otu_tables(otu_table_f0,otu_table_f)
        otu_table_f0 = format_otu_table(sample_names=sample_names,
                                        otu_names=otu_names,
                                        data=data,
                                        taxonomy=taxonomy).split('\n')

    return sample_names, otu_names, data, taxonomy
예제 #22
0
def main():
    """Simulate an OTU table from an input table and tree, and write it.

    Reads the OTU table (opts.otu_table) and tree (opts.tree_file), runs
    sim_otu_table with opts.num and opts.dissim, and writes the formatted
    result to opts.output_file.
    """
    option_parser, opts, args =\
     parse_command_line_parameters(**script_info)

    # The output file is still opened first, as before, so an unwritable
    # path fails early. The `with` blocks guarantee every handle is closed,
    # including when parsing or simulation raises.
    with open(opts.output_file,'w') as out_fh:
        with open(opts.otu_table,'U') as otu_table_fh:
            sample_ids, otu_ids, otu_mtx, otu_metadata = \
             parse_otu_table(otu_table_fh)
        # NOTE(review): assumes DndParser consumes the handle before
        # returning -- confirm.
        with open(opts.tree_file,'U') as tree_fh:
            tree = DndParser(tree_fh)

        res_sam_names, res_otus, res_otu_mtx, res_otu_metadata = \
         sim_otu_table(sample_ids, otu_ids, otu_mtx, otu_metadata,
         tree, opts.num, opts.dissim)

        out_fh.write(format_otu_table(res_sam_names, res_otus,
         res_otu_mtx, res_otu_metadata))
def main():
    """Convert a T-RFLP file into an OTU table file.

    Raises IOError when opts.input_path is not an existing file or when
    opts.output_path already exists, so existing output is never
    overwritten.
    """
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    # call form of raise is valid on both Python 2 and Python 3
    if not isfile(opts.input_path):
        raise IOError(
            "Input path (%s) not valid.  Does it exist?" % opts.input_path)

    if isfile(opts.output_path):
        raise IOError(
            "Output path (%s) already exists. Please " % opts.output_path +
            "remove or specify a different path")

    # close the input handle as soon as parsing is done
    with open(opts.input_path,'U') as input_f:
        samples, otus, data = parse_trflp(input_f)

    table_str = format_otu_table(samples,otus,data,
                                 comment='Created with %s' % __file__)
    with open(opts.output_path, 'w') as output_f:
        output_f.write(table_str)
예제 #24
0
def summarize_by_cat(map_lines, otu_sample_lines, category, dir_path, norm):
    """Create the category OTU table and write it into dir_path.

    map_lines and category are passed to get_sample_cat_info;
    otu_sample_lines and norm are passed on to get_counts_by_cat.
    The output file is '<category>_otu_table_norm.txt' when norm is true
    and '<category>_otu_table.txt' otherwise.
    """
    cat_by_sample, sample_by_cat, num_meta, meta_dict, label_lists_dict, \
                   num_samples_by_cat = get_sample_cat_info(map_lines,category)

    lines, otus, taxonomy = get_counts_by_cat(otu_sample_lines, num_meta, \
                  meta_dict,label_lists_dict[category],category,num_samples_by_cat,\
                  norm)

    lines = format_otu_table(label_lists_dict[category], otus, array(lines), \
                  taxonomy=taxonomy,
                  comment='Category OTU Counts-%s'% category)

    if norm:
        file_name = os.path.join(dir_path, '%s_otu_table_norm.txt' % category)
    else:
        file_name = os.path.join(dir_path, '%s_otu_table.txt' % category)
    # `with` closes the file even if the write fails
    with open(file_name, 'w') as f:
        f.write(lines)
예제 #25
0
def filter_otu_table_to_n_samples(otu_table_lines,n):
    """Randomly select n samples from the otu table.

    otu_table_lines: OTU table lines, handed to parse_otu_table
    n: number of samples to keep; must be at least 1. If n exceeds the
     number of samples in the table, all samples are kept.

    Returns the filtered table as produced by
    filter_samples_from_otu_table. Raises ValueError when n < 1.
    """
    if n < 1:
        # The check accepts n == 1, so the message says so (it used to say
        # "greater than 1").
        raise ValueError(
         "number of randomly selected sample ids must be at least 1")
    sample_ids, otu_ids, otu_table_data, taxa = parse_otu_table(otu_table_lines)
    
    # shuffle a copy so the parsed sample order is left intact
    samples_to_keep = list(sample_ids)
    shuffle(samples_to_keep)
    samples_to_keep = samples_to_keep[:n]
    
    # Re-serialise the parsed table: filter_samples_from_otu_table takes
    # table lines rather than parsed data.
    otu_table_lines = format_otu_table(\
     sample_ids, otu_ids, otu_table_data, taxa).split('\n')
    
    result = filter_samples_from_otu_table(otu_table_lines,
                                           samples_to_keep,
                                           negate=True)
    return result
예제 #26
0
def split_otu_table_on_taxonomy(otu_table_lines,level):
    """ Split OTU table by taxonomic level, yielding formatted OTU tables 

    otu_table_lines: OTU table lines, handed to parse_otu_table
    level: number of leading taxonomy entries (>= 1) that define a group

    Yields (taxon_at_level, formatted_table) pairs, where taxon_at_level is
    the first `level` taxonomy entries joined with ';'.

    This is a generator, so the ValueError for level < 1 is raised when
    iteration starts, not when the function is called.
    """
    if level < 1:
        # call form of raise is valid on both Python 2 and Python 3
        raise ValueError("Taxonomic level must be greater than zero")
    sample_ids, otu_ids, otu_table_data, taxa = parse_otu_table(otu_table_lines)
    taxon_data = {}
    for otu_id, counts, taxon in zip(otu_ids, otu_table_data, taxa):
        taxon_at_level = ';'.join(taxon[:level])
        # per-taxon accumulator: [otu ids, count rows, full taxonomy]
        current_taxon_table = taxon_data.setdefault(taxon_at_level,
                                                    [[],[],[]])
        current_taxon_table[0].append(otu_id)
        current_taxon_table[1].append(counts)
        current_taxon_table[2].append(taxon)

    for taxon_at_level, taxon_datum in taxon_data.items():
        yield taxon_at_level, format_otu_table(sample_ids, 
                                               taxon_datum[0],
                                               array(taxon_datum[1]),
                                               taxon_datum[2])
예제 #27
0
def filter_table(params,filtered_table_path,otu_file):
    """ Filters table according to OTU counts, occurance, and taxonomy
    
    params: Dictionary containing minimum sequence count (min_otu_count) 
     per OTU, minimum number of samples that OTU needs to occur in
     (min_otu_samples), targetted taxonomy to retain (included_taxa), and
     taxonomy to exclude (excluded_taxa). 
    filtered_table_path:  Open file object to write filtered table to.
    otu_file: Open file object of input OTU file.

    An OTU is removed when its total count is below min_otu_count, when it
    has a positive count in fewer than min_otu_samples samples, or -- only
    if the table carries taxonomy -- when it matches none of a non-empty
    included_taxa or matches any of excluded_taxa.
    """
    min_otu_count=params['min_otu_count']
    min_otu_samples=params['min_otu_samples']
    included_taxa=params['included_taxa']
    excluded_taxa=params['excluded_taxa']
    
    otu_data = parse_otu_table(otu_file)
    sample_ids = otu_data[0]
    otus = otu_data[1]
    otu_counts = otu_data[2]
    
    # Always bind taxa_lines: it is passed to format_otu_table below even
    # when the parsed table has no taxonomy element.
    try:
        taxa_lines = otu_data[3]
    except IndexError:
        taxa_lines = None
    taxa_present = taxa_lines is not None and len(taxa_lines) > 0
    
    # OTU ids that fail the filters; a set keeps the per-line lookup cheap
    flagged_otus = set()
    for index, otu_count in enumerate(otu_counts):
        if otu_count.sum() < min_otu_count or \
         (otu_count > 0).sum() < min_otu_samples:
            flagged_otus.add(otus[index])
            continue
        if not taxa_present:
            continue
        taxa = set(taxa_lines[index])
        # Equivalent to the earlier if/elif chain: flag the OTU when an
        # inclusion list is given and nothing matches it, or when anything
        # matches the exclusion list.
        lacks_included = included_taxa and \
         not taxa.intersection(included_taxa)
        hits_excluded = excluded_taxa and taxa.intersection(excluded_taxa)
        if lacks_included or hits_excluded:
            flagged_otus.add(otus[index])

    raw_otu_table = (format_otu_table(sample_ids, 
     otus, otu_counts, taxonomy=taxa_lines, skip_empty=True)).split('\n')
     
    # Keep comment/header lines and every data line whose OTU id (first
    # tab-delimited field) is not flagged.
    kept_lines = []
    for line in raw_otu_table:
        if not line.startswith("#") and \
         line.split('\t')[0].strip() in flagged_otus:
            continue
        kept_lines.append(line + '\n')
        
    filtered_table_path.write(''.join(kept_lines))
예제 #28
0
    try:
        table = zeros((len(all_otus), len(all_libs)), int)
    except MemoryError, e:
        stderr.write('memory error, check format of input otu file\n')
        stderr.write('are there really %s otus and %s samples?\n' %
            (len(all_otus), len(all_libs)))
        stderr.write('traceback follows:\n')
        raise(e)
    for o in all_otus:
        row_idx = all_otus.index(o)
        row = table[row_idx]
        seqids = otu_to_seqid[o]
        for s in seqids:
            lib = s.rsplit(delim, 1)[0]
            row[all_libs.index(lib)] += 1

    if otu_to_taxonomy:
        taxonomy = [otu_to_taxonomy.get(o, 'None') for o in all_otus]
    else:
        taxonomy=None

    return format_otu_table(all_libs, all_otus, table, taxonomy, legacy=legacy)

def remove_otus(otu_to_seqid,otus_to_exclude):
    """Remove otus_to_exclude from otu map 

    otu_to_seqid: dict keyed by OTU id; modified in place
    otus_to_exclude: iterable of strings; the first whitespace-delimited
     token of each entry is used as an OTU id

    Returns the same dict object with the listed OTU ids deleted. Ids that
    are not in the map are ignored.
    """
    otus_to_exclude_lookup = set(e.split()[0] for e in otus_to_exclude)
    # Iterate over the ids to drop, not over the dict being mutated:
    # deleting while iterating dict.keys() fails on Python 3.
    for otu_id in otus_to_exclude_lookup:
        otu_to_seqid.pop(otu_id, None)
    return otu_to_seqid
예제 #29
0
    try:
        table = zeros((len(all_otus), len(all_libs)), int)
    except MemoryError, e:
        stderr.write('memory error, check format of input otu file\n')
        stderr.write('are there really %s otus and %s samples?\n' %
            (len(all_otus), len(all_libs)))
        stderr.write('traceback follows:\n')
        raise(e)
    for o in all_otus:
        row_idx = all_otus.index(o)
        row = table[row_idx]
        seqids = otu_to_seqid[o]
        for s in seqids:
            lib = s.rsplit(delim, 1)[0]
            row[all_libs.index(lib)] += 1

    if otu_to_taxonomy:
        taxonomy = [otu_to_taxonomy.get(o, 'None') for o in all_otus]
    else:
        taxonomy=None

    return format_otu_table(all_libs, all_otus, table, taxonomy, legacy=legacy)

def remove_otus(otu_to_seqid,otus_to_exclude):
    """Remove otus_to_exclude from otu map 

    otu_to_seqid: dict keyed by OTU id; modified in place
    otus_to_exclude: iterable of strings; only the first
     whitespace-delimited token of each entry is used as the OTU id

    Returns otu_to_seqid itself after deleting every listed id that was
    present.
    """
    otus_to_exclude_lookup = set(e.split()[0] for e in otus_to_exclude)
    # Snapshot the keys first: deleting from a dict while iterating over
    # its live keys raises RuntimeError on Python 3.
    for otu_id in list(otu_to_seqid):
        if otu_id in otus_to_exclude_lookup:
            del otu_to_seqid[otu_id]
    return otu_to_seqid