Example #1
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    otu_table_fp = opts.biom_fp
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    scolors = opts.scolors.split(',')
    ocolors = opts.ocolors.split(',')
    sshapes = opts.sshapes.split(',')
    oshapes = opts.oshapes.split(',')
    ssizes = opts.ssizes.split(',')
    osizes = opts.osizes.split(',')
    md_fields = opts.md_fields.split(',')

    # check that the otu fields asked for are available
    shared_options = ['NodeType', 'Abundance']
    if not all(
        [i in md_fields + shared_options for i in ocolors + oshapes + osizes]):
        option_parser.error('The fields specified for observation colors, '
                            'sizes, or shapes are not in either the shared '
                            'options (NodeType,Abundance) or the supplied '
                            'md_fields. These fields must be a subset of the '
                            'union of these sets. Have you passed ocolors, '
                            'osizes or oshapes that are not in the md_fields?')
    # check that the sample fields asked for are available. mapping file
    # elements should all have same metadata keys
    sopts = parse_mapping_file_to_dict(map_fp)[0].items()[0][1].keys()
    if not all(
        [i in sopts + shared_options for i in scolors + sshapes + ssizes]):
        option_parser.error('The fields specified for sample colors, sizes, '
                            'or shapes are not in either the shared options '
                            '(NodeType,Abundance) or the supplied mapping '
                            'file. These fields must be a subset of the union '
                            'of these sets. Have you passed scolors, ssizes '
                            'or sshapes that are not in the mapping file '
                            'headers?')

    # actual computation begins
    try:
        create_dir(output_dir, fail_on_exist=True)
    except OSError:
        option_parser.error('Directory already exists. Will not overwrite.')

    bt = load_table(otu_table_fp)
    pmf = parse_mapping_file_to_dict(map_fp)[0]  # [1] is comments, don't need
    sample_node_table = make_sample_node_table(bt, pmf)
    otu_node_table = make_otu_node_table(bt, opts.observation_md_header_key,
                                         md_fields)
    node_attr_table = make_node_attr_table(otu_node_table, sample_node_table,
                                           scolors, ocolors, ssizes, osizes,
                                           sshapes, oshapes)
    edge_table = make_edge_table(bt)

    _write_table(sample_node_table,
                 os.path.join(output_dir, 'SampleNodeTable.txt'))
    _write_table(otu_node_table, os.path.join(output_dir, 'OTUNodeTable.txt'))
    _write_table(node_attr_table, os.path.join(output_dir,
                                               'NodeAttrTable.txt'))
    _write_table(edge_table, os.path.join(output_dir, 'EdgeTable.txt'))
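The two option checks above boil down to a membership test: every requested color, size, or shape field must appear either in the shared options or in the available metadata fields. A minimal standalone sketch of that check, with made-up field names:

shared_options = ['NodeType', 'Abundance']
md_fields = ['kingdom', 'phylum']        # hypothetical observation metadata fields
requested = ['kingdom', 'Abundance']     # e.g. ocolors + oshapes + osizes
if not all(f in md_fields + shared_options for f in requested):
    raise ValueError('requested fields must come from the shared options '
                     'or the supplied md_fields')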
Example #2
def make_node_attr_table(otu_node_lines, sample_node_lines, sample_color,
                         otu_color, sample_size, otu_size, sample_shape,
                         otu_shape):
    '''Make a preference table to load as node attributes for cytoscape.
    This file makes it easy to color, shape, and size the nodes according
    to the desire of the user. The color, size, and shape inputs are lists
    of strings that specify fields in the headers of otu_node_lines
    and sample_node_lines. The output will be as follows:

    #NodeID NodeType    Abundance   Color   Shape   Size
    otu1    otu 45  Bacteria_bacteriodales    spc56   xyz
    sample1 sample  56  post_treatment  tp_5    abc

    In the above example the user has passed sample_color as ['Treatment']
    and sample1 happens to be post treatment. For otu_color they passed
    ['kingdom', 'phylum'] and otu1 had kingdom Bacteria and phylum
    bacteriodales. This allows arbitrary numbers of color, size, shape
    combos to be created so that everything is fully customizable. If more
    than one field is passed the values for those fields will be joined
    with a '_'.
    Inputs:
     otu_node_lines - list of strs, output of make_otu_node_table
     sample_node_lines - list of strs, output of make_sample_node_table
     _colors, _size, _shape - each of these 6 fields must be a list of
     strings which identify which header fields are desired.
    '''
    # no comments
    sample_nodes = parse_mapping_file_to_dict(sample_node_lines)[0]
    otu_nodes = parse_mapping_file_to_dict(otu_node_lines)[0]
    header = '#NodeID\tNodeType\tAbundance\tColor\tSize\tShape'
    lines = [header]
    # make list of nodes that includes samples and otus
    nodes = sample_nodes.keys() + otu_nodes.keys()
    # make 6 lists which will be the columns of the output file
    nids, nodetypes, abundances, colors, sizes, shapes = [], [], [], [], [], []
    for node in nodes:
        if node in otu_nodes:
            nodetype_val = 'otu'
            abundance_val = otu_nodes[node]['Abundance']
            color_val = '_'.join([otu_nodes[node][i] for i in otu_color])
            size_val = '_'.join([otu_nodes[node][i] for i in otu_size])
            shape_val = '_'.join([otu_nodes[node][i] for i in otu_shape])
        elif node in sample_nodes:
            nodetype_val = 'sample'
            abundance_val = sample_nodes[node]['Abundance']
            color_val = '_'.join([sample_nodes[node][i] for i in sample_color])
            size_val = '_'.join([sample_nodes[node][i] for i in sample_size])
            shape_val = '_'.join([sample_nodes[node][i] for i in sample_shape])
        nids.append(node)
        nodetypes.append(nodetype_val)
        abundances.append(abundance_val)
        colors.append(color_val)
        sizes.append(size_val)
        shapes.append(shape_val)
    nls = [
        '\t'.join(vals)
        for vals in zip(nids, nodetypes, abundances, colors, sizes, shapes)
    ]
    return lines + nls
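A minimal usage sketch for make_node_attr_table, assuming node tables in the tab-delimited '#NodeID ...' format that make_sample_node_table and make_otu_node_table produce (compare the expected output in Example #11); the table contents below are made up for illustration:

sample_node_lines = ['#NodeID\tNodeType\tAbundance\tTreatment',
                     's1\tsample\t148.0\tpre']
otu_node_lines = ['#NodeID\tNodeType\tAbundance\tkingdom',
                  'otu1\totu\t45.0\tBacteria']
attr_table = make_node_attr_table(otu_node_lines, sample_node_lines,
                                  ['Treatment'], ['kingdom'],    # sample/otu colors
                                  ['Abundance'], ['Abundance'],  # sample/otu sizes
                                  ['NodeType'], ['NodeType'])    # sample/otu shapes
# attr_table[0] is the header line; the remaining lines hold one row per node.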
Example #3
def make_node_attr_table(otu_node_lines, sample_node_lines,
                         sample_color, otu_color, sample_size, otu_size, sample_shape,
                         otu_shape):
    '''Make a preference table to load as node attributes for cytoscape.
    This file makes it easy to color, shape, and size the nodes according
    to the desire of the user. The color, size, and shape inputs are lists
    of strings that specify fields in the headers of otu_node_lines
    and sample_node_lines. The output will be as follows:

    #NodeID NodeType    Abundance   Color   Shape   Size
    otu1    otu 45  Bacteria_bacteriodales    spc56   xyz
    sample1 sample  56  post_treatment  tp_5    abc

    In the above example the user has passed sample_color as ['Treatment']
    and sample1 happens to be post treatment. For otu_color they passed
    ['kingdom', 'phylum'] and otu1 had kingdom Bacteria and phylum
    bacteriodales. This allows arbitrary numbers of color, size, shape
    combos to be created so that everything is fully customizable. If more
    than one field is passed the values for those fields will be joined
    with a '_'.
    Inputs:
     otu_node_lines - list of strs, output of make_otu_node_table
     sample_node_lines - list of strs, output of make_sample_node_table
     _colors, _size, _shape - each of these 6 fields must be a list of
     strings which identify which header fields are desired.
    '''
    # no comments
    sample_nodes = parse_mapping_file_to_dict(sample_node_lines)[0]
    otu_nodes = parse_mapping_file_to_dict(otu_node_lines)[0]
    header = '#NodeID\tNodeType\tAbundance\tColor\tSize\tShape'
    lines = [header]
    # make list of nodes that includes samples and otus
    nodes = sample_nodes.keys() + otu_nodes.keys()
    # make 6 lists which will be the columns of the output file
    nids, nodetypes, abundances, colors, sizes, shapes = [], [], [], [], [], []
    for node in nodes:
        if node in otu_nodes:
            nodetype_val = 'otu'
            abundance_val = otu_nodes[node]['Abundance']
            color_val = '_'.join([otu_nodes[node][i] for i in otu_color])
            size_val = '_'.join([otu_nodes[node][i] for i in otu_size])
            shape_val = '_'.join([otu_nodes[node][i] for i in otu_shape])
        elif node in sample_nodes:
            nodetype_val = 'sample'
            abundance_val = sample_nodes[node]['Abundance']
            color_val = '_'.join([sample_nodes[node][i] for i in sample_color])
            size_val = '_'.join([sample_nodes[node][i] for i in sample_size])
            shape_val = '_'.join([sample_nodes[node][i] for i in sample_shape])
        nids.append(node)
        nodetypes.append(nodetype_val)
        abundances.append(abundance_val)
        colors.append(color_val)
        sizes.append(size_val)
        shapes.append(shape_val)
    nls = ['\t'.join(vals) for vals in zip(nids, nodetypes, abundances, colors,
                                           sizes, shapes)]
    return lines + nls
Example #4
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    otu_table_fp = opts.biom_fp
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    taxonomy_key = opts.observation_md_header_key
    scolors = opts.scolors.split(',')
    ocolors = opts.ocolors.split(',')
    sshapes = opts.sshapes.split(',')
    oshapes = opts.oshapes.split(',')
    ssizes = opts.ssizes.split(',')
    osizes = opts.osizes.split(',')
    md_fields = opts.md_fields.split(',')

    # check that the otu fields asked for are available
    shared_options = ['NodeType','Abundance']
    if not all([i in md_fields+shared_options for i in ocolors+oshapes+osizes]):
        option_parser.error('The fields specified for observation colors, sizes, or '+\
            'shapes are not in either the shared options (NodeType,Abundance)'+\
            ' or the supplied md_fields. These fields must be a subset of '+\
            'the union of these sets. Have you passed ocolors, osizes or '+\
            'oshapes that are not in the md_fields?')
    # check that the sample fields asked for are available. mapping file 
    # elements should all have same metadata keys
    sopts = parse_mapping_file_to_dict(map_fp)[0].items()[0][1].keys()
    if not all([i in sopts+shared_options for i in scolors+sshapes+ssizes]):
        option_parser.error('The fields specified for sample colors, sizes, or '+\
            'shapes are not in either the shared options (NodeType,Abundance)'+\
            ' or the supplied mapping file. These fields must be a subset of '+\
            'the union of these sets. Have you passed scolors, ssizes or '+\
            'sshapes that are not in the mapping file headers?')

    # actual computation begins
    try:
        create_dir(output_dir, fail_on_exist=True)
    except OSError:
        option_parser.error('Directory already exists. Will not overwrite.')

    bt = parse_biom_table(open(otu_table_fp))
    pmf = parse_mapping_file_to_dict(map_fp)[0] # [1] is comments, don't need
    sample_node_table = make_sample_node_table(bt, pmf)
    otu_node_table = make_otu_node_table(bt, opts.observation_md_header_key, 
        md_fields)
    node_attr_table = make_node_attr_table(otu_node_table, sample_node_table,
        scolors, ocolors, ssizes, osizes, sshapes, oshapes)
    edge_table = make_edge_table(bt)

    _write_table(sample_node_table, os.path.join(output_dir,'SampleNodeTable.txt'))
    _write_table(otu_node_table, os.path.join(output_dir,'OTUNodeTable.txt'))
    _write_table(node_attr_table, os.path.join(output_dir,'NodeAttrTable.txt'))
    _write_table(edge_table, os.path.join(output_dir,'EdgeTable.txt'))
Example #5
def choose_gradient_subset(otu_table_f, map_f, category, num_total_samples):
    otu_table = parse_biom_table(otu_table_f)
    mdm, _ = parse_mapping_file_to_dict(map_f)

    try:
        map_f.seek(0)
    except AttributeError:
        pass

    if num_total_samples > len(otu_table.SampleIds):
        raise InvalidSubsetSize("Too many total samples (%d) were specified "
                                "as a gradient subset size. There are only %d "
                                "total samples to choose a subset from." %
                                (num_total_samples, len(otu_table.SampleIds)))

    # Only keep the sample IDs that are in both the mapping file and OTU table.
    # Sort the samples according to the gradient category.
    samp_ids = [(samp_id, float(metadata[category]))
                for samp_id, metadata in mdm.items()
                if samp_id in otu_table.SampleIds]
    samp_ids.sort(key=lambda samp_id: samp_id[1])

    samp_ids_to_keep = [samp_id[0] for samp_id in
                        _choose_items_from_bins(samp_ids, num_total_samples)]

    assert len(samp_ids_to_keep) == num_total_samples, \
           "%d != %d" % (len(samp_ids_to_keep), num_total_samples)
    assert len(samp_ids_to_keep) == len(set(samp_ids_to_keep)), \
           "Duplicate sample IDs in subset"

    return (filter_samples_from_otu_table(otu_table, samp_ids_to_keep, 0, inf),
            filter_mapping_file_from_mapping_f(map_f, samp_ids_to_keep))
Example #6
 def setUp(self):
     """define data for tests"""
     # small amount of redundancy here since setUp called at each test, but
     # limited tests means little concern
     self.rarefaction_file = \
         ['\tsequences per sample\titeration\tSam1\tSam2\tSam3\tSam4\tSam5\tSam6',
         'rare480.txt\t480\t0\t2.52800404052\t2.3614611247\t2.59867416108\t3.56970811181\t3.44800265895\t1.9433560517',
         'rare480.txt\t480\t1\t2.06375457238\t3.32293450758\t3.4189896645\t3.35312890712\t3.10763472113\t2.78155253726',
         'rare480.txt\t480\t2\t2.44788730109\t3.42464996459\t2.24541787295\t2.491419231\t2.60106690099\t5.40828403581',
         'rare480.txt\t480\t3\t5.1846120153\t3.67022675065\t1.54879964908\t2.8055801405\t4.3086171269\t3.87761898868',
         'rare910.txt\t910\t0\t2.67580703282\t1.72405794627\t2.15312863498\t2.4300954476\t3.7753658185\t3.36198860355',
         'rare910.txt\t910\t1\t4.10226466956\t2.24587945345\t3.02932964779\t2.98218513619\t3.73316846484\t1.85879566537',
         'rare910.txt\t910\t2\t1.65800670063\t2.42281993323\t3.02400997565\t3.271608097\t2.99265263795\t3.68802382515',
         'rare910.txt\t910\t3\t2.50976021964\t2.43976761056\t3.32119905587\t2.47487750248\t1.901408525\t3.42883742207',
         'rare500.txt\t500\t0\t3.42225118215\tn/a\t4.03758268426\t2.35344629448\t2.26690085385\t1.80164570104',
         'rare850.txt\t850\t0\t4.2389858006\t4.97464230229\t1.53451087057\t3.35785261181\t1.91658777533\t2.32583475424',
         'rare850.txt\t850\t1\t2.81445883827\tn/a\t2.54767461948\t1.38835207925\t3.70018890199\t1.57359105209',
         'rare850.txt\t850\t2\t2.9340493412\t3.95897035158\tn/a\t2.07761860166\t3.42393336685\t2.6927305603']
     self.rarefaction_data = parse_rarefaction(self.rarefaction_file)
     self.mapping_file = \
         ['#SampleID\tDose\tLinkerPrimerSequence\tWeight\tTTD\tDescription',
         '#Comment Line',
         'Sam1\t1xDose\tATCG\tHigh\t31\ts1_desc',
         'Sam2\t1xDose\tACCG\tLow\t67\ts2_desc',
         'Sam3\t2xDose\tACGT\tMed\t21\ts3_desc',
         'Sam4\t2xDose\tAACG\tLow\t55\ts4_desc',
         'Sam5\tControl\tCGTC\tLow\t67\ts5_desc',
         'Sam6\t1xDose\tACCT\tLow\t55\ts6_desc']
     self.mapping_data = parse_mapping_file_to_dict(self.mapping_file)[0]
Example #7
def distance_matrix(input_path, column):
    """ calculates distance matrix on a single column of a mapping file
    
    inputs:
     input_path (file handle)
     column (str)
    """
    data, comments = parse_mapping_file_to_dict(input_path)
    column_data = []
    column_headers = []
    for i in data:
        if column not in data[i]:
            stderr.write(
                "\n\nNo column: '%s' in the mapping file. Existing columns are: %s\n\n"
                % (column, data[i].keys()))
            exit(1)
        try:
            column_data.append(float(data[i][column]))
        except ValueError:
            stderr.write("\n\nall the values in the column '%s' must be numeric but '%s' has '%s'\n\n"\
                % (column,i,data[i][column]))
            exit(1)

        column_headers.append(i)

    data_row = array(column_data)
    data_col = reshape(data_row, (1, len(data_row)))
    dist_mtx = abs(data_row - data_col.T)

    return format_distance_matrix(column_headers, dist_mtx)
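The matrix itself comes from a single numpy broadcast: reshape turns the (n,) vector into a (1, n) row, subtracting its transpose (an (n, 1) column) from the original vector broadcasts to an n x n grid of pairwise differences, and abs() gives |x_i - x_j|. A standalone sketch of just that step, with made-up values:

from numpy import array, reshape

column_data = [7.2, 6.5, 8.1]   # hypothetical numeric metadata column
data_row = array(column_data)
data_col = reshape(data_row, (1, len(data_row)))
dist_mtx = abs(data_row - data_col.T)
# dist_mtx[i, j] equals abs(column_data[i] - column_data[j]);
# e.g. dist_mtx[0, 1] is abs(7.2 - 6.5)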
Example #8
 def setUp(self):
     """define data for tests"""
     # small amount of redundancy here since setUp called at each test, but
     # limited tests means little concern
     self.rarefaction_file = \
         ['\tsequences per sample\titeration\tSam1\tSam2\tSam3\tSam4\tSam5\tSam6',
          'rare480.txt\t480\t0\t2.52800404052\t2.3614611247\t2.59867416108\t3.56970811181\t3.44800265895\t1.9433560517',
          'rare480.txt\t480\t1\t2.06375457238\t3.32293450758\t3.4189896645\t3.35312890712\t3.10763472113\t2.78155253726',
          'rare480.txt\t480\t2\t2.44788730109\t3.42464996459\t2.24541787295\t2.491419231\t2.60106690099\t5.40828403581',
          'rare480.txt\t480\t3\t5.1846120153\t3.67022675065\t1.54879964908\t2.8055801405\t4.3086171269\t3.87761898868',
          'rare910.txt\t910\t0\t2.67580703282\t1.72405794627\t2.15312863498\t2.4300954476\t3.7753658185\t3.36198860355',
          'rare910.txt\t910\t1\t4.10226466956\t2.24587945345\t3.02932964779\t2.98218513619\t3.73316846484\t1.85879566537',
          'rare910.txt\t910\t2\t1.65800670063\t2.42281993323\t3.02400997565\t3.271608097\t2.99265263795\t3.68802382515',
          'rare910.txt\t910\t3\t2.50976021964\t2.43976761056\t3.32119905587\t2.47487750248\t1.901408525\t3.42883742207',
          'rare500.txt\t500\t0\t3.42225118215\tn/a\t4.03758268426\t2.35344629448\t2.26690085385\t1.80164570104',
          'rare850.txt\t850\t0\t4.2389858006\t4.97464230229\t1.53451087057\t3.35785261181\t1.91658777533\t2.32583475424',
          'rare850.txt\t850\t1\t2.81445883827\tn/a\t2.54767461948\t1.38835207925\t3.70018890199\t1.57359105209',
          'rare850.txt\t850\t2\t2.9340493412\t3.95897035158\tn/a\t2.07761860166\t3.42393336685\t2.6927305603']
     self.rarefaction_data = parse_rarefaction(self.rarefaction_file)
     self.mapping_file = \
         ['#SampleID\tDose\tLinkerPrimerSequence\tWeight\tTTD\tDescription',
          '#Comment Line',
          'Sam1\t1xDose\tATCG\tHigh\t31\ts1_desc',
          'Sam2\t1xDose\tACCG\tLow\t67\ts2_desc',
          'Sam3\t2xDose\tACGT\tMed\t21\ts3_desc',
          'Sam4\t2xDose\tAACG\tLow\t55\ts4_desc',
          'Sam5\tControl\tCGTC\tLow\t67\ts5_desc',
          'Sam6\t1xDose\tACCT\tLow\t55\ts6_desc']
     self.mapping_data = parse_mapping_file_to_dict(self.mapping_file)[0]
Example #9
def make_profiles_by_category(mapping_fp, taxa_level, category):
    """ Creates a list of profiles for each unique value in the category
    Inputs:
        mapping_fp: filepath to the mapping file
        category: mapping file category to split data over
                  defaults to HOST_SUBJECT_ID
    Returns a dictionary keyed by the values of that category, with a list of
        profiles as values
    """
    # Parse the mapping file
    map_f = open(mapping_fp, 'U')
    mapping_data, comments = parse_mapping_file_to_dict(map_f)
    map_f.close()
    # Get a list of unique keys for the specified category
    if category == 'SampleID':
        result = {}
        for sid in mapping_data:
            result[sid] = [make_profile_by_sid(mapping_data, sid, taxa_level)]
    else:
        values = set([mapping_data[sid][category] for sid in mapping_data])
        result = {}
        # Loop over each value in that category
        for value in values:
            # Re-open the mapping file
            map_f = open(mapping_fp, 'U')
            # Get sample ids that match the value
            sids = sample_ids_from_metadata_description(map_f,
                                                        category+":"+value)
            map_f.close()
            # Create the list with all the profiles of the sample IDs in this
            # category value
            result[value] = [make_profile_by_sid(mapping_data,
                                                sid, taxa_level) for sid in sids]
    return result
Example #10
    def test_sampleId_pairs(self):
        """Test that sampleId_pairs returns the correct combos/sampleId's."""
        # expected values
        dose_vps = \
            [('1xDose', '2xDose'), ('1xDose', 'Control'),
             ('2xDose', 'Control')]
        ttd_vps = \
            [('31', '21'), ('31', '55'), ('31', '67'), ('21', '55'),
             ('21', '67'), ('55', '67')]
        dose_sids = \
            [(['Sam1', 'Sam2', 'Sam6'], ['Sam3', 'Sam4']),
             (['Sam1', 'Sam2', 'Sam6'], ['Sam5']),
             (['Sam3', 'Sam4'], ['Sam5'])]
        ttd_sids = \
            [(['Sam1'], ['Sam3']),
             (['Sam1'], ['Sam4', 'Sam6']),
             (['Sam1'], ['Sam2', 'Sam5']),
             (['Sam3'], ['Sam4', 'Sam6']),
             (['Sam3'], ['Sam2', 'Sam5']),
             (['Sam4', 'Sam6'], ['Sam2', 'Sam5'])]

        # observed values
        obs_dose_sids, obs_dose_vps = sampleId_pairs(self.mapping_data,
                                                     self.rarefaction_data,
                                                     'Dose')
        obs_ttd_sids, obs_ttd_vps = sampleId_pairs(self.mapping_data,
                                                   self.rarefaction_data,
                                                   'TTD')

        # sort -- order is unimportant and depends on way presented in mf
        self.assertEqual(sorted(dose_vps), sorted(obs_dose_vps))
        self.assertEqual(sorted(dose_sids), sorted(obs_dose_sids))
        self.assertEqual(sorted(ttd_vps), sorted(obs_ttd_vps))
        self.assertEqual(sorted(ttd_sids), sorted(obs_ttd_sids))

        # check errors when no samples had this category
        self.assertRaises(ValueError, sampleId_pairs, self.mapping_data,
                          self.rarefaction_data, 'DNE')

        # check no error if map file has more sampleids than rarefaction data
        superset_mf = \
            ['#SampleID\tDose\tLinkerPrimerSequence\tWeight\tTTD\tDescription',
             '#Comment Line',
             'Sam1\t1xDose\tATCG\tHigh\t31\ts1_desc',
             'Sam2\t1xDose\tACCG\tLow\t67\ts2_desc',
             'Sam3\t2xDose\tACGT\tMed\t21\ts3_desc',
             'Sam4\t2xDose\tAACG\tLow\t55\ts4_desc',
             'Sam5\tControl\tCGTC\tLow\t67\ts5_desc',
             'Sam6\t1xDose\tACCT\tLow\t55\ts6_desc',
             'Sam7\t4xDose\tACCT\tLow\t55\ts7_desc',
             'Sam8\t3xDose\tACCT\tLow\t55\ts8_desc',
             'Sam9\t1xDose\tACCT\tLow\t55\ts9_desc']
        # (mf, comments)
        superset_mf = parse_mapping_file_to_dict(superset_mf)[0]
        obs_dose_sids, obs_dose_vps = sampleId_pairs(superset_mf,
                                                     self.rarefaction_data,
                                                     'Dose')

        self.assertEqual(sorted(dose_vps), sorted(obs_dose_vps))
        self.assertEqual(sorted(dose_sids), sorted(obs_dose_sids))
Example #11
 def test_make_sample_node_table(self):
     '''Test that the sample node table is created correctly.'''
     # test when sampleids in biom == sampleids in mapping file
     bt = parse_biom_table(BIOM_STRING_1)
     mf_dict = parse_mapping_file_to_dict(MF_LINES.split('\n'))[0]
     obs = make_sample_node_table(bt, mf_dict)
     exp = [
         '#NodeID\tNodeType\tAbundance\tTimePt\tStudy\tTreatment\tDiet',
         's1\tsample\t148.0\t1\ta\tpre\thf',
         's2\tsample\t156.0\t2\ta\tpre\tlf',
         's3\tsample\t164.0\t3\ta\tpre\thf',
         's4\tsample\t172.0\t4\ta\tpost\tlf',
         's5\tsample\t180.0\t5\ta\tpost\tmf'
     ]
     self.assertEqual(obs, exp)
     # test when sampleids in biom are a subset of sampleids in mapping file
     bt = parse_biom_table(BIOM_STRING_2)
     obs = make_sample_node_table(bt, mf_dict)
     exp = [
         '#NodeID\tNodeType\tAbundance\tTimePt\tStudy\tTreatment\tDiet',
         's3\tsample\t164.0\t3\ta\tpre\thf',
         's4\tsample\t172.0\t4\ta\tpost\tlf',
         's5\tsample\t180.0\t5\ta\tpost\tmf'
     ]
     self.assertEqual(obs, exp)
Example #12
def print_info(opts):
    """
  @opt: options from the command line. see --help
  """
    # import the data using QIIME's utilities. the o variable
    # contains the dictionary with the map file contents.
    try:
        o, c = parse_mapping_file_to_dict(open(opts.map))
    except IOError:
        raise IOError("File may not exist.")

    # ---- print the metadata
    if opts.print_meta:
        s = set()
        print "Meta-Data: "
        for k in o.keys():
            s = s.union(set(o[k].keys()))
        for r in s:
            print "  " + r

    # ---- print the sample IDs
    if opts.print_ids:
        print "SampleIDs: "
        for k in o.keys():
            print "  " + k

    # ---- print a column from the map file
    if opts.print_col != None:
        s = set()
        print "Column: " + opts.print_col + ": "
        for k in o.keys():
            s = s.union(set([o[k][opts.print_col]]))
        for r in s:
            print "  " + r
    return None
Example #13
def distance_matrix(input_path, column):
    """ calculates distance matrix on a single column of a mapping file
    
    inputs:
     input_path (file handle)
     column (str)
    """
    data, comments = parse_mapping_file_to_dict(input_path)
    column_data = []
    column_headers = []
    for i in data:
        if column not in data[i]:
            stderr.write("\n\nNo column: '%s' in the mapping file. Existing columns are: %s\n\n" % (column,data[i].keys()))
            exit(1)
        try:
            column_data.append(float(data[i][column]))
        except ValueError:
            stderr.write("\n\nall the values in the column '%s' must be numeric but '%s' has '%s'\n\n"\
                % (column,i,data[i][column]))
            exit(1)
            
        column_headers.append(i)
    
    data_row = array(column_data)
    data_col = reshape(data_row, (1, len(data_row)))
    dist_mtx = abs(data_row-data_col.T)
    
    return format_distance_matrix(column_headers, dist_mtx)
Example #14
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.output_path != None:
        outf = open(opts.output_path, 'w')
    else:
        outf = sys.stdout

    dists = parse_distmat(open(opts.input_path, 'U'))
    map_data = parse_mapping_file_to_dict(open(opts.map, 'U'))
    diff_dists, same_dists = clust_qual_ratio(dists, map_data, opts.category)

    if opts.short:
        print >> outf, numpy.mean(diff_dists) / numpy.mean(same_dists)
    else:
        print >> outf, "dissimilarity ratio between/within (large for clustered data):"
        print >> outf, numpy.mean(diff_dists) / numpy.mean(same_dists)
        print >> outf, "dissimilarities between clusters: mean, std, num:"
        print >> outf, '\t'.join(
            map(str, [
                numpy.mean(diff_dists),
                numpy.std(diff_dists),
                len(diff_dists)
            ]))
        print >> outf, "dissimilarities within clusters: mean, std, num:"
        print >> outf, '\t'.join(
            map(str, [
                numpy.mean(same_dists),
                numpy.std(same_dists),
                len(same_dists)
            ]))
Example #15
def print_info(opts):
  """
  @opts: options from the command line. see --help
  """
  # import the data using QIIME's utilities. the o variable
  # contains the dictionary with the map file contents.
  try:
    o, c = parse_mapping_file_to_dict(open(opts.map))
  except IOError:
    raise IOError("File may not exist.")

  # ---- print the metadata
  if opts.print_meta: 
    s = set()
    print "Meta-Data: "
    for k in o.keys():
      s = s.union(set(o[k].keys()))
    for r in s: 
      print "  " + r 

  # ---- print the sample IDs
  if opts.print_ids: 
    print "SampleIDs: "
    for k in o.keys():
      print "  " + k

  # ---- print a column from the map file
  if opts.print_col != None: 
    s = set()
    print "Column: " + opts.print_col + ": "
    for k in o.keys():
      s = s.union(set([o[k][opts.print_col]]))
    for r in s:
      print "  " + r
  return None 
Example #16
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.output_path != None:
        outf = open(opts.output_path,'w')
    else:
        outf = sys.stdout

    dists = parse_distmat(open(opts.input_path,'U'))
    map_data = parse_mapping_file_to_dict(open(opts.map,'U'))
    diff_dists, same_dists = clust_qual_ratio(dists, map_data, opts.category)



    if opts.short:
        print >> outf, numpy.mean(diff_dists)/numpy.mean(same_dists)
    else:
        print >> outf, "dissimilarity ratio between/within (large for clustered data):"
        print >> outf, numpy.mean(diff_dists)/numpy.mean(same_dists)
        print >> outf, "dissimilarities between clusters: mean, std, num:"
        print >> outf, '\t'.join(map(str,[numpy.mean(diff_dists), numpy.std(diff_dists),
         len(diff_dists)]))
        print >> outf, "dissimilarities within clusters: mean, std, num:"
        print >> outf, '\t'.join(map(str,[numpy.mean(same_dists), numpy.std(same_dists),
         len(same_dists)]))
Example #17
def view_metadata_categories_from_mapping_file(mapping_file):
    """ Print list of metadata categories """ 
    mapping_fp = open(mapping_file, 'rU')
    mapping_dict, comments = parse_mapping_file_to_dict(mapping_fp)
    key = mapping_dict.keys()[0]
    for category, value in mapping_dict[key].iteritems():
        print "%s (e.g. '%s')" % (category, value)
Example #18
def multiple_file_DA_DESeq2(input_dir, output_dir, mapping_fp, mapping_category, subcategory_1, subcategory_2, DESeq2_diagnostic_plots):
    """perform DESeq2 negative binomial Wald differential abundance test on a directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    file_names = [fname for fname in listdir(input_dir) if not (fname.startswith('.')\
        or isdir(fname))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname+'.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile)
        tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
        check_mapping_file_category(tmp_bt, mapping_fp, mapping_category, subcategory_1, subcategory_2)
        tmp_bt.add_metadata(tmp_pmf, 'sample')
        outfile = join(output_dir, 'DESeq2_DA_'+base_fname+'.txt') 
        outfile_diagnostic = join(output_dir, 'DESeq2_diagnostic_plots_'+base_fname+'.pdf') 

        with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                         prefix='QIIME-differential-abundance-temp-table-',
                                         suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_DESeq2(temp_fh.name, outfile, mapping_category, subcategory_1, subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic) 
Example #19
def alpha_diversity_by_sample_type(adiv_fs, mapping_f,
                                   mapping_category='Sample_Type'):
    mapping_dict, mapping_comments = parse_mapping_file_to_dict(mapping_f)
    sample_type_map = {}
    #sample_type_counts = defaultdict(int)
    for samp_id in mapping_dict:
        sample_type_map[samp_id] = mapping_dict[samp_id][mapping_category]
        #sample_type_counts[sample_type_map[samp_id]] += 1

    sample_type_to_adiv = defaultdict(list)
    for adiv_f in adiv_fs:
        adiv_data = [line.strip().split('\t')
                     for line in adiv_f if line.strip()][1:]

        for samp_id, adiv in adiv_data:
            sample_type = sample_type_map[samp_id]
            # TODO do we need to normalize this? how?
            #adiv = float(adiv) / sample_type_counts[sample_type]
            adiv = float(adiv)
            sample_type_to_adiv[sample_type].append(adiv)

    plotting_data = [(median(v), '%s (n=%d)' % (k, len(v)), v) for k, v in
                     sample_type_to_adiv.items()]
    plotting_data.sort()

    plot_fig = generate_box_plots([dist[2] for dist in
            plotting_data], x_tick_labels=[dist[1] for dist in plotting_data],
            x_label=mapping_category, y_label='Alpha Diversity',
            title='Alpha Diversity by %s' % mapping_category)
    tight_layout()
    return plotting_data, plot_fig
Example #20
 def test_make_sample_node_table(self):
     """Test that the sample node table is created correctly."""
     # test when sampleids in biom == sampleids in mapping file
     bt = parse_biom_table(BIOM_STRING_1)
     mf_dict = parse_mapping_file_to_dict(MF_LINES.split("\n"))[0]
     obs = make_sample_node_table(bt, mf_dict)
     exp = [
         "#NodeID\tNodeType\tAbundance\tTimePt\tStudy\tTreatment\tDiet",
         "s1\tsample\t148.0\t1\ta\tpre\thf",
         "s2\tsample\t156.0\t2\ta\tpre\tlf",
         "s3\tsample\t164.0\t3\ta\tpre\thf",
         "s4\tsample\t172.0\t4\ta\tpost\tlf",
         "s5\tsample\t180.0\t5\ta\tpost\tmf",
     ]
     self.assertEqual(obs, exp)
     # test when sampleids in biom are a subset of sampleids in mapping file
     bt = parse_biom_table(BIOM_STRING_2)
     obs = make_sample_node_table(bt, mf_dict)
     exp = [
         "#NodeID\tNodeType\tAbundance\tTimePt\tStudy\tTreatment\tDiet",
         "s3\tsample\t164.0\t3\ta\tpre\thf",
         "s4\tsample\t172.0\t4\ta\tpost\tlf",
         "s5\tsample\t180.0\t5\ta\tpost\tmf",
     ]
     self.assertEqual(obs, exp)
Example #21
def multiple_file_DA_fitZIG(input_dir, output_dir, mapping_fp, mapping_category, subcategory_1, subcategory_2):
    """perform metagenomeSeq's Zero Inflated Gaussian (ZIG) OTU differential abundance test on a directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    file_names = [fname for fname in listdir(input_dir) if not (fname.startswith('.')\
        or isdir(fname))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname+'.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile) 
        tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
        check_mapping_file_category(tmp_bt, mapping_fp, mapping_category, subcategory_1, subcategory_2)
        tmp_bt.add_metadata(tmp_pmf, 'sample')
        #make temporary json biom version - R currently does not have hdf5
        outfile = join(output_dir, 'fitZIG_DA_'+base_fname+'.txt')

        with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                         prefix='QIIME-differential-abundance-temp-table-',
                                         suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_fitZIG(temp_fh.name, outfile, mapping_category, subcategory_1, subcategory_2) 
Example #22
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data, comments = parse_mapping_file_to_dict(opts.input_path)
    column_headers = []
    if ',' not in opts.column:
        column_data = []
        column_name = opts.column
        for i in data:
            if column_name not in data[i]:
                raise ValueError(
                    "No column: '%s' in the mapping file. Existing columns are: %s" %
                    (column_name, data[i].keys()))

            try:
                column_data.append(float(data[i][opts.column]))
            except ValueError:
                raise ValueError(
                    "All the values in the column '%s' must be numeric but '%s' has '%s'" %
                    (column_name, i, data[i][column_name]))

            column_headers.append(i)
        dtx_mtx = compute_distance_matrix_from_metadata(column_data)
    else:
        latitudes = []
        longitudes = []
        try:
            latitude, longitude = opts.column.split(',')
        except ValueError:
            raise ValueError(
                "This script accepts a maximum of 2 colums separated by comma and you passed: %s" %
                (opts.column))

        for i in data:
            if latitude not in data[i] or longitude not in data[i]:
                raise ValueError(
                    "One of these columns or both do not exist: '%s' or '%s' in the mapping file. Existing columns are: %s" %
                    (latitude, longitude, data[i].keys()))

            try:
                latitudes.append(float(data[i][latitude]))
                longitudes.append(float(data[i][longitude]))
            except ValueError:
                raise ValueError(
                    "All the values in the columnd '%s' & '%s' must be numeric but '%s' has '%s'" %
                    (latitude, longitude, i, data[i][column_name]))

            column_headers.append(i)

        dtx_mtx = calculate_dist_vincenty(latitudes, longitudes)

    dtx_txt = format_distance_matrix(column_headers, dtx_mtx)

    outfilepath = os.path.join(opts.output_fp)
    f = open(outfilepath, 'w')
    f.write(dtx_txt)
    f.close()
Example #23
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data, comments = parse_mapping_file_to_dict(opts.input_path)
    column_headers = []
    if ',' not in opts.column:
        column_data = []
        column_name = opts.column
        for i in data:
            if column_name not in data[i]:
                raise ValueError(
                    "No column: '%s' in the mapping file. Existing columns are: %s"
                    % (column_name, data[i].keys()))

            try:
                column_data.append(float(data[i][opts.column]))
            except ValueError:
                raise ValueError(
                    "All the values in the column '%s' must be numeric but '%s' has '%s'"
                    % (column_name, i, data[i][column_name]))

            column_headers.append(i)
        dtx_mtx = compute_distance_matrix_from_metadata(column_data)
    else:
        latitudes = []
        longitudes = []
        try:
            latitude, longitude = opts.column.split(',')
        except ValueError:
            raise ValueError(
                "This script accepts a maximum of 2 colums separated by comma and you passed: %s"
                % (opts.column))

        for i in data:
            if latitude not in data[i] or longitude not in data[i]:
                raise ValueError(
                    "One of these columns or both do not exist: '%s' or '%s' in the mapping file. Existing columns are: %s"
                    % (latitude, longitude, data[i].keys()))

            try:
                latitudes.append(float(data[i][latitude]))
                longitudes.append(float(data[i][longitude]))
            except ValueError:
                raise ValueError(
                    "All the values in the columnd '%s' & '%s' must be numeric but '%s' has '%s'"
                    % (latitude, longitude, i, data[i][column_name]))

            column_headers.append(i)

        dtx_mtx = calculate_dist_vincenty(latitudes, longitudes)

    dtx_txt = format_distance_matrix(column_headers, dtx_mtx)

    outfilepath = os.path.join(opts.output_fp)
    f = open(outfilepath, 'w')
    f.write(dtx_txt)
    f.close()
Example #24
    def test_sampleId_pairs(self):
        """Test that sampleId_pairs returns the correct combos/sampleId's."""
        # expected values
        dose_vps = \
            [('1xDose', '2xDose'), ('1xDose', 'Control'),
             ('2xDose', 'Control')]
        ttd_vps = \
            [('31', '21'), ('31', '55'), ('31', '67'), ('21', '55'),
             ('21', '67'), ('55', '67')]
        dose_sids = \
            [(['Sam1', 'Sam2', 'Sam6'], ['Sam3', 'Sam4']),
             (['Sam1', 'Sam2', 'Sam6'], ['Sam5']),
             (['Sam3', 'Sam4'], ['Sam5'])]
        ttd_sids = \
            [(['Sam1'], ['Sam3']),
             (['Sam1'], ['Sam4', 'Sam6']),
             (['Sam1'], ['Sam2', 'Sam5']),
             (['Sam3'], ['Sam4', 'Sam6']),
             (['Sam3'], ['Sam2', 'Sam5']),
             (['Sam4', 'Sam6'], ['Sam2', 'Sam5'])]

        # observed values
        obs_dose_sids, obs_dose_vps = sampleId_pairs(self.mapping_data,
                                                     self.rarefaction_data, 'Dose')
        obs_ttd_sids, obs_ttd_vps = sampleId_pairs(self.mapping_data,
                                                   self.rarefaction_data, 'TTD')

        # sort -- order is unimportant and depends on way presented in mf
        self.assertEqual(sorted(dose_vps), sorted(obs_dose_vps))
        self.assertEqual(sorted(dose_sids), sorted(obs_dose_sids))
        self.assertEqual(sorted(ttd_vps), sorted(obs_ttd_vps))
        self.assertEqual(sorted(ttd_sids), sorted(obs_ttd_sids))

        # check errors when no samples had this category
        self.assertRaises(ValueError, sampleId_pairs, self.mapping_data,
                          self.rarefaction_data, 'DNE')

        # check no error if map file has more sampleids than rarefaction data
        superset_mf = \
            ['#SampleID\tDose\tLinkerPrimerSequence\tWeight\tTTD\tDescription',
             '#Comment Line',
             'Sam1\t1xDose\tATCG\tHigh\t31\ts1_desc',
             'Sam2\t1xDose\tACCG\tLow\t67\ts2_desc',
             'Sam3\t2xDose\tACGT\tMed\t21\ts3_desc',
             'Sam4\t2xDose\tAACG\tLow\t55\ts4_desc',
             'Sam5\tControl\tCGTC\tLow\t67\ts5_desc',
             'Sam6\t1xDose\tACCT\tLow\t55\ts6_desc',
             'Sam7\t4xDose\tACCT\tLow\t55\ts7_desc',
             'Sam8\t3xDose\tACCT\tLow\t55\ts8_desc',
             'Sam9\t1xDose\tACCT\tLow\t55\ts9_desc']
        # (mf, comments)
        superset_mf = parse_mapping_file_to_dict(superset_mf)[0]
        obs_dose_sids, obs_dose_vps = sampleId_pairs(superset_mf,
                                                     self.rarefaction_data, 'Dose')

        self.assertEqual(sorted(dose_vps), sorted(obs_dose_vps))
        self.assertEqual(sorted(dose_sids), sorted(obs_dose_sids))
Example #25
 def test_parse_mapping_file_to_dict(self):
     """parse_mapping_file functions as expected"""
     s1 = ['#sample\ta\tb', '#comment line to skip',\
           'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
     exp = ([['x','y','z'],['i','j','k']],\
            ['sample','a','b'],\
            ['comment line to skip','more skip'])
     mapdict, comments = parse_mapping_file_to_dict(s1)
     expdict = {'x':{'a':'y','b':'z'}, 'i':{'a':'j','b':'k'}}
     self.assertEqual(mapdict, expdict)
     self.assertEqual(comments, ['comment line to skip','more skip'])
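For reference, restating the test above as a plain call shows the returned structure: the first header column becomes the outer key and the remaining header fields become per-sample keys, with surrounding whitespace stripped.

lines = ['#sample\ta\tb', '#comment line to skip',
         'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
mapdict, comments = parse_mapping_file_to_dict(lines)
# mapdict  == {'x': {'a': 'y', 'b': 'z'}, 'i': {'a': 'j', 'b': 'k'}}
# comments == ['comment line to skip', 'more skip']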
Example #26
def compare_alpha_diversities(rarefaction_lines, mapping_lines,
                              category, depth):
    """compares alpha diversities
    
    inputs:
        rarefaction_file - rarefaction file which gives scores for 
        various rarefactions and depths
        
        mapping_file - file that has ID's and categories that the ID's
        fall in
        
        category - the category to be compared, is a string
        
        depth - the depth of the rarefaction_file to use, is an integer
    
    outputs:
        results - a nested dictionary which specifies the category as
        the top level key, and as its value, dictionaries which give the
        results of the t_two_sample test for all unique pairs of values
        in the specified category
    
    """
     
    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)
    
    category_values_Ids = make_category_values_Id_dict(mapping_data, 
                                                       category)
    
    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                    category_values_Ids)
    
    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
                                                       rarefaction_data)
    
    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(depth,
                                                       rarefaction_data)
    
    results = {category:{}}
    
    for pair in range(len(SampleId_pairs)):
        i=(convert_SampleIds_to_rarefaction_mtx(SampleId_pairs[pair][0],
                           reduced_rarefaction_mtx, map_from_Id_to_col))
        
        j=(convert_SampleIds_to_rarefaction_mtx(SampleId_pairs[pair][1],
                           reduced_rarefaction_mtx, map_from_Id_to_col))
        
        results[category][(str(value_pairs[pair][0]),
                           str(value_pairs[pair][1]))] =\
                          t_two_sample(i,j)
    
    return results
Example #27
def alpha_diversity_by_sample_type(adiv_fs,
                                   mapping_f,
                                   mapping_category='Sample_Type',
                                   min_num_samples=11,
                                   category_values_to_exclude=None):
    """Will exclude 'NA' category value by default if this parameter is not
    provided"""
    if category_values_to_exclude is None:
        category_values_to_exclude = ['NA']

    mapping_dict, mapping_comments = parse_mapping_file_to_dict(mapping_f)
    sample_type_map = {}
    #sample_type_counts = defaultdict(int)
    for samp_id in mapping_dict:
        sample_type_map[samp_id] = mapping_dict[samp_id][mapping_category]
        #sample_type_counts[sample_type_map[samp_id]] += 1

    sample_type_to_adiv = defaultdict(list)
    for adiv_f in adiv_fs:
        adiv_data = [
            line.strip().split('\t') for line in adiv_f if line.strip()
        ][1:]

        for samp_id, adiv in adiv_data:
            try:
                sample_type = sample_type_map[samp_id]
            except KeyError:
                sample_type = 'Unknown'
            # TODO do we need to normalize this? how?
            #adiv = float(adiv) / sample_type_counts[sample_type]
            adiv = float(adiv)
            sample_type_to_adiv[sample_type].append(adiv)

    plotting_data = [(median(v), '%s (n=%d)' % (k, len(v)), v)
                     for k, v in sample_type_to_adiv.items()
                     if k != 'Unknown' and k not in category_values_to_exclude
                     and len(v) >= min_num_samples]
    plotting_data.sort()

    plot_fig = generate_box_plots(
        [dist[2] for dist in plotting_data],
        x_tick_labels=[dist[1] for dist in plotting_data],
        x_label=mapping_category,
        y_label='Alpha Diversity',
        title='Alpha Diversity by %s' % mapping_category)
    plot_fig.set_size_inches(12, 12)
    try:
        plot_fig.tight_layout()
    except ValueError:
        print "tight_layout() failed. Try making the plot figure larger " + \
              "with Figure.set_size_inches(). The labels will be cut off " + \
              "otherwise."
    return plotting_data, plot_fig
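plotting_data above is a list of (median, label, values) tuples, so the final sort orders the box plots by median alpha diversity and the labels double as the x tick text. A stripped-down sketch of that step, with the Unknown/exclusion/minimum-sample filters omitted and made-up values (assuming numpy's median, which the original presumably imports):

from numpy import median

sample_type_to_adiv = {'gut': [2.1, 3.4, 2.8], 'skin': [1.0, 1.5]}
plotting_data = [(median(v), '%s (n=%d)' % (k, len(v)), v)
                 for k, v in sample_type_to_adiv.items()]
plotting_data.sort()
# -> [(1.25, 'skin (n=2)', [1.0, 1.5]), (2.8, 'gut (n=3)', [2.1, 3.4, 2.8])]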
Example #28
def load_data(biom_path, map_path):
  """
  @biom_path
  @map_path
  @data
  @labn
  @labs
  @classes
  """
  obj,comm = parse_mapping_file_to_dict(open(map_path, "U"))
  labn,labs,classes = extract_labels(obj)
  data = extract_data(biom_path)
  return data, labn, labs, classes
Example #29
def DA_fitZIG(input_path, out_path, mapping_fp, mapping_category, subcategory_1, subcategory_2):
   """perform metagenomeSeq's Zero Inflated Gaussian (ZIG) OTU differential abundance testing"""
   tmp_bt = load_table(input_path)
   tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
   check_mapping_file_category(tmp_bt, mapping_fp, mapping_category, subcategory_1, subcategory_2)
   tmp_bt.add_metadata(tmp_pmf, 'sample')

   with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                    prefix='QIIME-differential-abundance-temp-table-',
                                    suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_fitZIG(temp_fh.name, out_path, mapping_category, subcategory_1, subcategory_2)
Example #30
def alpha_diversity_by_sample_type(adiv_fs, mapping_f,
                                   mapping_category='Sample_Type',
                                   min_num_samples=11,
                                   category_values_to_exclude=None):
    """Will exclude 'NA' category value by default if this parameter is not
    provided"""
    if category_values_to_exclude is None:
        category_values_to_exclude = ['NA']

    mapping_dict, mapping_comments = parse_mapping_file_to_dict(mapping_f)
    sample_type_map = {}
    #sample_type_counts = defaultdict(int)
    for samp_id in mapping_dict:
        sample_type_map[samp_id] = mapping_dict[samp_id][mapping_category]
        #sample_type_counts[sample_type_map[samp_id]] += 1

    sample_type_to_adiv = defaultdict(list)
    for adiv_f in adiv_fs:
        adiv_data = [line.strip().split('\t')
                     for line in adiv_f if line.strip()][1:]

        for samp_id, adiv in adiv_data:
            try:
                sample_type = sample_type_map[samp_id]
            except KeyError:
                sample_type = 'Unknown'
            # TODO do we need to normalize this? how?
            #adiv = float(adiv) / sample_type_counts[sample_type]
            adiv = float(adiv)
            sample_type_to_adiv[sample_type].append(adiv)

    plotting_data = [(median(v), '%s (n=%d)' % (k, len(v)), v) for k, v in
                     sample_type_to_adiv.items()
                     if k != 'Unknown' and k not in
                     category_values_to_exclude and
                     len(v) >= min_num_samples]
    plotting_data.sort()

    plot_fig = generate_box_plots([dist[2] for dist in
            plotting_data], x_tick_labels=[dist[1] for dist in plotting_data],
            x_label=mapping_category, y_label='Alpha Diversity',
            title='Alpha Diversity by %s' % mapping_category)
    plot_fig.set_size_inches(12, 12)
    try:
        plot_fig.tight_layout()
    except ValueError:
        print "tight_layout() failed. Try making the plot figure larger " + \
              "with Figure.set_size_inches(). The labels will be cut off " + \
              "otherwise."
    return plotting_data, plot_fig
Example #31
def _collate_gradient_pcoa_plot_data(coords_f, map_f, category):
    pc_data = parse_coords(coords_f)
    coords_d = dict(zip(pc_data[0], pc_data[1]))

    # Build list of (gradient value, sid) tuples.
    map_dict = parse_mapping_file_to_dict(map_f)[0]
    sorted_sids = sorted([(float(md[category]), sid)
                          for sid, md in map_dict.items()])

    xs = [coords_d[sid][0] for _, sid in sorted_sids]
    ys = [coords_d[sid][1] for _, sid in sorted_sids]
    gradient = [cat_val for cat_val, _ in sorted_sids]

    return xs, ys, gradient
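The gradient ordering above is just a sort over (value, sample_id) tuples built from the parsed mapping dict. A small sketch with a hypothetical 'pH' gradient category:

map_dict = {'s1': {'pH': '7.2'}, 's2': {'pH': '6.5'}, 's3': {'pH': '8.1'}}
sorted_sids = sorted([(float(md['pH']), sid) for sid, md in map_dict.items()])
# sorted_sids == [(6.5, 's2'), (7.2, 's1'), (8.1, 's3')]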
Example #32
def parse_map_file(map_f, column_name, sample_ids):
    """
        parse_map_file(map_f, column_name, sample_ids)
        @map_f - file handle
        @column_name - name of the column that contains the class 
            labels
        @sample_ids - names of the samples in the order
            in which they appear in the data set.
        @labels - numpy array of class labels
    """
    obj, comm = parse_mapping_file_to_dict(map_f)
    label_full = []
    labels = []

    # grab the class labels which are likely to be in string
    # format.

    for id_set in sample_ids:
        if id_set not in obj:
            raise ValueError(
                "Unknown sample ID supplied (" + str(id_set) +
                "). Make sure that the ID is in the map file you specified")
        if column_name not in obj[id_set]:
            raise ValueError(
                "Unknown class name supplied (" + str(column_name) +
                "). Make sure that the column name is in the map file you "
                "specified")
        label_full.append(obj[id_set][column_name])

    # now that we have the full class labels we need to determine
    # the number of unique classes in the data. if the number of
    # classes is equal to the number of observations, throw an error.
    # its likely the user does not know what they are doing.
    unique_classes = numpy.unique(label_full)
    if len(unique_classes) == len(sample_ids):
        raise ValueError(
            "Number of classes is equal to the number of IDs.  The number of classes must be less than the number of IDs in map file that was specified."
        )
    if len(unique_classes) == 1:
        raise ValueError(
            "There must be multiple classes specified. Only one unique class was detected."
        )

    for str_lab in label_full:
        for n, uclass in enumerate(unique_classes):
            if str_lab == uclass:
                labels.append(float(n))
                break
    return numpy.array(labels)
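The final loop is a label-encoding step: each class string is replaced by the (float) index of its position in numpy.unique's sorted output. A compact sketch of just that transformation, with made-up labels:

import numpy

label_full = ['sick', 'healthy', 'sick', 'healthy', 'sick']
unique_classes = numpy.unique(label_full)    # array(['healthy', 'sick'], ...)
labels = numpy.array([float(list(unique_classes).index(lab))
                      for lab in label_full])
# labels == array([ 1.,  0.,  1.,  0.,  1.])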
Example #33
def process_mapping_file(map_f,
                         barcode_len,
                         barcode_type,
                         BARCODE_COLUMN,
                         REVERSE_PRIMER_COLUMN):
    """Ensures that sample IDs and barcodes are unique, that barcodes are
    all the same length, and that primers are present. Ensures barcodes
    and primers only contain valid characters.
    Parameters
    ----------
    map_f: file
        metadata mapping file
    barcode_type: string
        barcode type, can be either integer or golay_12
    barcode_len: int
        barcode length
    barcode_column: string
        header of barcode column
    reverse_primer_column: string
        header of the reverse primer column
    Returns
    ----------
    bc_to_sid: dict
    bc_to_fwd_primers: dict
    bc_to_rev_primers: dict
    """

    _, _, bc_to_sid, _, _, bc_to_fwd_primers, _ = check_map(map_f, False)
    map_f.seek(0)

    metadata_map = parse_mapping_file_to_dict(map_f)[0]
    bc_to_rev_primers = {}
    for sid, md in metadata_map.items():
        if REVERSE_PRIMER_COLUMN in md:
            bc_to_rev_primers[
                md[BARCODE_COLUMN]] = expand_degeneracies(
                md[REVERSE_PRIMER_COLUMN].upper().split(','))
        else:
            raise Exception(
                "The %s column does not exist in the "
                "mapping file. %s is required." %
                (REVERSE_PRIMER_COLUMN,
                 REVERSE_PRIMER_COLUMN))

    check_barcodes(bc_to_sid, barcode_len, barcode_type)

    return (bc_to_sid,
            bc_to_fwd_primers,
            bc_to_rev_primers)
Example #34
def get_profiles_list(base_dir, mapping_table, taxa_level):
    """"""
    profiles = []
    # Loop through all the mapping files
    for map_file in mapping_table:
        # Get the path to the mapping file
        map_fp = join(base_dir, map_file)
        # Parse the mapping file in a dictionary
        map_f = open(map_fp, 'U')
        mapping_data, comments = parse_mapping_file_to_dict(map_f)
        map_f.close()
        # Create a profile for each sample in this mapping file
        for sid in mapping_data:
            profiles.append(make_profile_by_sid(mapping_data, sid, taxa_level))
    return profiles
Example #35
def make_distance_matrix_heatmap(dm_lines, mapping_lines, html_fp, output_dir):
    """Create an html with a heatmap of the distance matrix

    Inputs:
        dm_lines: distance matrix open file object
        mapping_lines: mapping open file object
        html_fp: filepath of the output html file
        output_dir: path of the output directory which will contain the aux
            html files
    """
    # Parse input files
    data = generate_data_make_html(dm_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)
    # Create the html file
    make_html_file([data], mapping_data, html_fp, output_dir)
Example #36
def get_profiles_list(base_dir, mapping_table, taxa_level):
    """"""
    profiles = []
    # Loop through all the mapping files
    for map_file in mapping_table:
        # Get the path to the mapping file
        map_fp = join(base_dir, map_file)
        # Parse the mapping file in a dictionary
        map_f = open(map_fp, 'U')
        mapping_data, comments = parse_mapping_file_to_dict(map_f)
        map_f.close()
        # Create a profile for each sample in this mapping file
        for sid in mapping_data:
            profiles.append(make_profile_by_sid(mapping_data, sid, taxa_level))
    return profiles
Example #37
def interpretBiom(bf, mf, c, OTUIds):
    """Return a list that alternates between a rule counter and the list of samples matching that rule."""
    biom_file = parse_biom_table(bf)
    mapping_file = parse_mapping_file_to_dict(mf)[0]  # [1] is comments

    category_dict = dict([(key, val[c])
                          for (key, val) in mapping_file.iteritems()])
    sorted_category_dict = sorted(category_dict.iteritems(),
                                  key=operator.itemgetter(1))

    print sorted_category_dict

    samp_ids = []
    for vals, ids, md in biom_file.iterSamples():
        samp_ids.append(ids)

    samples_present = []
    final_list = []
    count = 0
    counter = 0

    # For each rule (list of OTU IDs), check every sample with getValueByIds.
    # If every OTU in the rule has a non-zero count in a sample, the rule is
    # present in that sample. Matching every rule against every sample is
    # O(n^2) no matter what.
    for j in OTUIds:
        for id in samp_ids:
            for k in j:
                if int(biom_file.getValueByIds(k, id)) != 0:
                    count = count + 1
            if count == len(j):
                samples_present.append(id)
            count = 0
        counter = counter + 1
        final_list.append(counter)
        # temporary hack: use set() to keep the sample list unique.
        # samples_present should be cleared after each rule, but clearing it
        # also clears the copy already appended to final_list.
        final_list.append(set(samples_present))

    # undo the temporary hack: convert each set of samples back to a list
    for i in xrange(len(final_list)):
        if i % 2 != 0:
            final_list[i] = list(final_list[i])

    return final_list
Example #38
0
def DA_DESeq2(input_path, out_path, mapping_fp, mapping_category, subcategory_1, subcategory_2, DESeq2_diagnostic_plots):
    """perform DESeq2 negative binomial Wald differential abundance test on a raw abundance OTU matrix
    """
    tmp_bt = load_table(input_path)
    tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
    check_mapping_file_category(tmp_bt, mapping_fp, mapping_category, subcategory_1, subcategory_2)
    tmp_bt.add_metadata(tmp_pmf, 'sample')
    base_fname, ext = splitext(out_path)
    outfile_diagnostic = join(base_fname+'_diagnostic_plots.pdf') 

    with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                     prefix='QIIME-differential-abundance-temp-table-',
                                     suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_DESeq2(temp_fh.name, out_path, mapping_category, subcategory_1,
                   subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic)
Example #39
0
def test_grouped_correlation_row_generator(self):
    """Test that group row generator behaves as expected."""
    category = 'val'
    gc_to_samples = \
        {'1': ['s1', 's3', 's7', 's9', 's2'],
         '2': ['s6', 's5', 's4', 's8', 's10']}
    bt = parse_biom_table(self.bt_str)
    pmf = parse_mapping_file_to_dict(self.mf_ordered)[0]
    obs_cvs, obs_mds, obs_otus = grouped_correlation_row_generator(
        bt, pmf, category, gc_to_samples)
    self.assertEqual(obs_cvs, self.cvs1)
    self.assertFloatEqual(obs_mds, self.mds1)
    self.assertFloatEqual(obs_otus, self.otus1)
    # make sure it throws an error on non-float metadata
    self.assertRaises(ValueError, grouped_correlation_row_generator, bt,
                      pmf, 'field', gc_to_samples)
Example #40
0
def choose_gradient_subsets(dm_f, map_f, gradient, subset_sizes, num_subsets):
    subsets = []

    mdm, _ = parse_mapping_file_to_dict(map_f)
    dm_labels, dm_data = parse_distmat(dm_f)

    # Only keep the sample IDs that are in both the mapping file and distance
    # matrix.
    samp_ids = [(samp_id, float(metadata[gradient]))
                for samp_id, metadata in mdm.items() if samp_id in dm_labels]
    samp_ids.sort(key=lambda samp_id: samp_id[1])

    for subset_size in subset_sizes:
        # Adapted from http://stackoverflow.com/a/9873935
        # We add 1 to the number of samples we want because we want subset_size
        # intervals to choose from.
        bin_idxs = [int(ceil(i * len(samp_ids) / (subset_size + 1)))
                    for i in range(subset_size + 1)]

        for subset_num in range(num_subsets):
            samp_ids_to_keep = []

            for i in range(len(bin_idxs) - 1):
                if i == len(bin_idxs) - 2:
                    # We're at the last bin, so choose from the entire bin
                    # range.
                    if bin_idxs[i + 1] < len(samp_ids):
                        end_idx = bin_idxs[i + 1]
                    else:
                        end_idx = bin_idxs[i + 1] - 1

                    samp_ids_to_keep.append(
                            samp_ids[randint(bin_idxs[i], end_idx)][0])
                else:
                    # We subtract 1 since randint is inclusive on both sides,
                    # and we don't want to choose the same sample ID multiple
                    # times from different bins.
                    samp_ids_to_keep.append(
                            samp_ids[randint(bin_idxs[i],
                                             bin_idxs[i + 1] - 1)][0])

            assert len(samp_ids_to_keep) == subset_size, \
                   "%d != %d" % (len(samp_ids_to_keep), subset_size)

            subsets.append(samp_ids_to_keep)

    return subsets
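To make the binning comment above concrete, here is the boundary arithmetic for a toy case (10 samples already sorted along the gradient and a subset_size of 3); float division is used explicitly so the ceil has an effect:

from math import ceil

samp_ids = ['s%d' % i for i in range(10)]  # 10 samples, sorted by gradient value
subset_size = 3
# subset_size + 1 boundaries define subset_size intervals to draw from
bin_idxs = [int(ceil(i * len(samp_ids) / float(subset_size + 1)))
            for i in range(subset_size + 1)]
print(bin_idxs)  # [0, 3, 5, 8]; one sample is then drawn at random per interval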
Example #41
0
def make_html_file(lines, html_fp):
    """Creates the HTML file with a table with the sample counts

    Inputs:
        lines: mapping file open file object
        html_fp: file path to store the output html page

    Generates the html file.
    """
    # Parse the mapping file
    (map_dict, list_c) = parse_mapping_file_to_dict(lines)
    # Generate the string containing the html code
    page_html_string = get_html_page_string(map_dict)
    #Save the html file
    out = open(html_fp, 'w')
    out.write(page_html_string)
    out.close()
Example #42
0
def compute_ordination_correlation(map_f, coord_f, category, axis=1,
                                   correlation_type='pearson',
                                   num_permutations=999):
    if correlation_type not in CORRELATION_TYPES:
        raise ValueError("Invalid correlation type '%s'. Must be one of %r." %
                         (correlation_type, CORRELATION_TYPES))
    if num_permutations < 0:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than or equal to zero." % num_permutations)

    coords_samp_ids, coords, _, _ = parse_coords(coord_f)
    num_axes = len(coords[0])
    if axis < 1 or axis > num_axes:
        raise ValueError("Invalid axis number %d. Must be greater than zero "
                         "and less than or equal to the number of axes in the "
                         "input coordinates file (found %d axes)." %
                         (axis, num_axes))
    axis_data = coords[:, axis - 1]

    mdm, _ = parse_mapping_file_to_dict(map_f)
    gradient_data = []
    for samp_id in coords_samp_ids:
        if category not in mdm[samp_id]:
            raise ValueError("Category '%s' does not exist in the input "
                             "mapping file." % category)

        md_value = mdm[samp_id][category]
        try:
            md_value = float(md_value)
        except ValueError:
            raise ValueError("The category state '%s' could not be converted "
                             "to a number. All states in the '%s' category "
                             "must be numeric." % (md_value, category))
        gradient_data.append(md_value)

    corr_coeff, param_p_val, _, nonparam_p_val, _ = \
            correlation_test(axis_data, gradient_data, method=correlation_type,
                             permutations=num_permutations)

    if num_permutations > 0:
        nonparam_p_val = format_p_value_for_num_iters(nonparam_p_val,
                                                      num_permutations)
    else:
        nonparam_p_val = 'N/A'

    return corr_coeff, param_p_val, nonparam_p_val
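A hedged usage sketch of compute_ordination_correlation as defined above; the file names and the 'DaysSinceExperimentStart' category are hypothetical:

coord_f = open('weighted_unifrac_pc.txt', 'U')
map_f = open('map.txt', 'U')
corr, param_p, nonparam_p = compute_ordination_correlation(
    map_f, coord_f, 'DaysSinceExperimentStart', axis=1,
    correlation_type='pearson', num_permutations=999)
coord_f.close()
map_f.close()
print("r=%s parametric p=%s nonparametric p=%s" % (corr, param_p, nonparam_p))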
Example #43
0
def make_html_file(lines, html_fp):
    """Creates the HTML file with a table with the sample counts

    Inputs:
        lines: mapping file open file object
        html_fp: file path to store the output html page

    Generates the html file.
    """
    # Parse the mapping file
    (map_dict, list_c) = parse_mapping_file_to_dict(lines)
    # Generate the string containing the html code
    page_html_string = get_html_page_string(map_dict)
    #Save the html file
    out = open(html_fp, 'w')
    out.write(page_html_string)
    out.close()
Example #44
0
def make_beta_significance_heatmap(beta_significance_fp, mapping_fp, html_fp,
    output_dir):
    """Creates an html file with the heatmaps of beta significance analysis
    
    Inputs:
        beta_significance_fp: beta significance results filepath
        mapping_fp: mapping filepath
        html_fp: output html filepath
        output_dir: output directory where the aux html files will be stored
    """
    bs_lines = open(beta_significance_fp, 'U')

    l_data = generate_data_make_html(bs_lines)

    mapping_data = parse_mapping_file_to_dict(open(mapping_fp, 'U'))

    make_html_file(l_data, mapping_data, html_fp, output_dir)
Example #45
0
def make_beta_significance_heatmap(beta_significance_fp, mapping_fp, html_fp,
                                   output_dir):
    """Creates an html file with the heatmaps of beta significance analysis
    
    Inputs:
        beta_significance_fp: beta significance results filepath
        mapping_fp: mapping filepath
        html_fp: output html filepath
        output_dir: output directory where the aux html files will be stored
    """
    bs_lines = open(beta_significance_fp, 'U')

    l_data = generate_data_make_html(bs_lines)

    mapping_data = parse_mapping_file_to_dict(open(mapping_fp, 'U'))

    make_html_file(l_data, mapping_data, html_fp, output_dir)
Example #46
0
def main():
    args = parser.parse_args()

    categories = args.categories
    map_fp = args.map_fp
    tree_fp = args.tree_fp
    output_fp = args.output_fp
    length = args.length

    map_dict = parse_mapping_file_to_dict(map_fp)[0]

    fields = categories.split(',')

    tree = LoadTree(tree_fp)

    furcated_tree = furcate_tree(tree, map_dict, fields, length=length)

    furcated_tree.writeToFile(output_fp)
Example #47
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    mothur_counts_fp = opts.mothur_counts_fp
    mapping_fp = opts.mapping_fp
    valid_states = opts.valid_states
    blank_id_fp = opts.blank_id_fp
    contaminant_db_fp = opts.contaminant_db_fp
    contaminant_similarity = opts.contaminant_similarity
    max_correlation = opts.max_correlation
    correlate_header = opts.correlate_header
    input_fasta_fp = opts.input_fasta_fp
    otu_map_fp = opts.otu_map_fp
    output_dir = opts.output_dir
    min_relabund_threshold = opts.min_relabund_threshold
    prescreen_threshold = opts.prescreen_threshold
    removal_stat_blank = opts.removal_stat_blank
    removal_stat_sample = opts.removal_stat_sample
    removal_differential = opts.removal_differential
    reinstatement_stat_sample = opts.reinstatement_stat_sample
    reinstatement_stat_blank = opts.reinstatement_stat_blank
    reinstatement_differential = opts.reinstatement_differential
    reinstatement_sample_number = opts.reinstatement_sample_number
    reinstatement_method = opts.reinstatement_method
    write_output_seq_lists = opts.write_output_seq_lists
    write_filtered_output = opts.write_filtered_output
    drop_lib_threshold = opts.drop_lib_threshold
    write_per_seq_stats = opts.write_per_seq_stats
    write_per_library_stats = opts.write_per_library_stats
    write_per_seq_disposition = opts.write_per_seq_disposition

    # Make unique seq OTU table (biom file)

    # Compute unique seq stats
    #   output biom file with unique seq stats

    # Optionally: make candidate contaminant DB
    #   remove sequences present at higher abundance in samples
    #   cluster blanks
    #   remove low-abundance contaminant OTUs

    # Filter by similarity against candidate contaminant DB
    #   annotate unique seq OTU table with top hit (OTU#, rep seq, ID%)
    #   make list of seqs @ threshold

    # Calculate reinstatement rule for filtered sequences

    # Generate lists of seqs failing:
    #   - unique seq rule
    #   - hit to contaminant
    #   - reinstatement after hit

    # Make sure exactly one of an OTU BIOM table or a mothur counts table was passed
    input_file_counter = 0

    if mothur_counts_fp:
        input_file_counter += 1
        unique_seq_biom = mothur_counts_to_biom(mothur_counts_fp)
        mothur_output = True
        print "mothur input"

    if otu_table_fp:
        input_file_counter += 1
        unique_seq_biom = load_table(otu_table_fp)
        mothur_output = False
        print "BIOM input"

    if input_file_counter != 1:
        option_parser.error("must provide ONLY ONE of an OTU table biom file or"
                            "mothur counts table")

    # Check to make sure that if blank-based contamination filtering requested,
    # all necessary options are specified:

    removal_options_counter = 0
    if removal_stat_blank:
        removal_options_counter += 1
    if removal_stat_sample:
        removal_options_counter += 1
    if removal_differential:
        removal_options_counter += 1

    if ((removal_options_counter > 0) and (removal_options_counter < 3)):
        option_parser.error("Must provide all of "
                            "removal_stats_blank, "
                            "removal_stat_sample, and "
                            "removal_differential, or none.")
    elif removal_options_counter == 0:
        blank_stats_removal = False
    elif removal_options_counter == 3:
        blank_stats_removal = True


    # If reference-based filtering requested, make sure all necessary options
    # have been specified:

    if contaminant_db_fp and not input_fasta_fp:
        option_parser.error("If specifying ref-based contaminant ID, must "
                            "also specify path to input sequence fasta")


    # If correlation-based filtering requested, make sure correlate data 
    # are specified

    if max_correlation and not correlate_header:
        option_parser.error("If specifying maximum Spearman correlation, must "
                           "also provide map column header for correlate data")


    # If sequence reinstatement is requested, make sure all necessary options
    # are specified

    reinstatement_options_counter = 0
    if reinstatement_stat_blank:
        reinstatement_options_counter += 1
    if reinstatement_stat_sample:
        reinstatement_options_counter += 1
    if reinstatement_differential:
        reinstatement_options_counter += 1

    if ((reinstatement_options_counter > 0) and 
        (reinstatement_options_counter < 3)):
        option_parser.error("Must provide all of "
                            "reinstatement_stats_blank, "
                            "reinstatement_stat_sample, and "
                            "reinstatement_differential, or none.")

    if ((reinstatement_options_counter == 3 and reinstatement_sample_number)
        and not reinstatement_method):
        option_parser.error("If providing sample number AND abundance criteria "
                            "for sequence reinstatement, must also provide "
                            "a method for combining results.")

    if reinstatement_options_counter == 3 or reinstatement_sample_number:
        reinstatement = True
    else:
        reinstatement = False

    # get blank sample IDs from mapping file or sample ID list

    if mapping_fp and valid_states:
        blank_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        blanks = True
    elif blank_id_fp is not None:
        blank_id_f = open(blank_id_fp, 'Ur')
        blank_sample_ids = set([line.strip().split()[0]
                                for line in blank_id_f
                                if not line.startswith('#')])
        blank_id_f.close()
        blanks = True
    else:
        blanks = False


    # Initialize output objects

    output_dict = {}
    contaminant_types = []

    contamination_stats_dict = None
    contamination_stats_header = None
    corr_data_dict = None

    # Do blank-based stats calculations; if there are no blanks, make sure no
    # blank-dependent methods were requested:

    if blanks:
        if prescreen_threshold:
            low_contam_libraries = prescreen_libraries(unique_seq_biom,
                                                       blank_sample_ids,
                                                       removal_stat_sample, 
                                                       removal_stat_blank, 
                                                       removal_differential, 
                                                       prescreen_threshold)

            contamination_stats_header, contamination_stats_dict = \
                get_contamination_stats(unique_seq_biom,
                                        blank_sample_ids,
                                        exp_sample_ids=low_contam_libraries)
        else:
            contamination_stats_header, contamination_stats_dict = \
                get_contamination_stats(unique_seq_biom, blank_sample_ids)

    elif (blank_stats_removal or reinstatement or prescreen_threshold):
        option_parser.error("Blank-based filtering requested but no blank"
                            "samples indicated in mapping file or ID file.")
    else:
        contamination_stats_header, contamination_stats_dict = \
            get_contamination_stats(unique_seq_biom)


    seq_ids = unique_seq_biom.ids(axis='observation')


    # Do blank-based contaminant identification

    if min_relabund_threshold:
        output_dict['below_relabund_threshold'] = pick_min_relabund_threshold(
                                                  contamination_stats_dict,
                                                  contamination_stats_header,
                                                  min_relabund_threshold)


    if blank_stats_removal:
        output_dict['abund_contaminants'] = compare_blank_abundances(contamination_stats_dict, 
                                contamination_stats_header,
                                removal_stat_sample,
                                removal_stat_blank,
                                removal_differential,
                                negate=True)

        contaminant_types.append('abund_contaminants')


    # Do reference-based contaminant identification

    if contaminant_db_fp:
        output_dict['ref_contaminants'] = pick_ref_contaminants(seq_ids, contaminant_db_fp, input_fasta_fp, contaminant_similarity, output_dir)

        contaminant_types.append('ref_contaminants')


    # Do spearman correlation based contaminant identification

    if max_correlation:
        metadata_dict = parse_mapping_file_to_dict(open(mapping_fp, 'U'))[0]

        corr_data_dict = {x: float(metadata_dict[x][correlate_header]) for x in metadata_dict}

        output_dict['corr_contaminants'], corr_contaminant_dict = pick_corr_contaminants(unique_seq_biom,
                                                   corr_data_dict,
                                                   max_correlation)

        contaminant_types.append('corr_contaminants')
    else:
        corr_contaminant_dict = None


    # Putative contaminants are those that have been identified by any method

    output_dict['putative_contaminants'] = set.union(*map(set, [output_dict[x] for x in contaminant_types]))


    # If considering low abundance sequences, remove those from consideration as potential contaminants 

    if 'below_relabund_threshold' in output_dict:
        output_dict['putative_contaminants'] = output_dict['putative_contaminants'] - set(output_dict['below_relabund_threshold'])


    # Pick abundance-criterion seqs to reinstate

    if (reinstatement_stat_blank and reinstatement_stat_sample and reinstatement_differential):
        output_dict['abund_reinstated_seqs'] = reinstate_abund_seqs(output_dict['putative_contaminants'], 
                     contamination_stats_dict, 
                     contamination_stats_header,
                     reinstatement_stat_sample,
                     reinstatement_stat_blank,
                     reinstatement_differential)

        output_dict['reinstated_seqs'] = output_dict['abund_reinstated_seqs']


    # Pick incidence-criterion seqs to reinstate
    if reinstatement_sample_number:
        output_dict['incidence_reinstated_seqs'] = reinstate_incidence_seqs(
                     output_dict['putative_contaminants'],
                     unique_seq_biom,
                     blank_sample_ids,
                     reinstatement_sample_number)

        output_dict['reinstated_seqs'] = output_dict['incidence_reinstated_seqs']


    # combine incidence and abundance reinstatements
    if reinstatement_sample_number and reinstatement_stat_blank:
        if reinstatement_method == "union":
            output_dict['reinstated_seqs'] = output_dict['abund_reinstated_seqs'] | output_dict['incidence_reinstated_seqs']
        elif reinstatement_method == "intersection":
            output_dict['reinstated_seqs'] = output_dict['abund_reinstated_seqs'] & output_dict['incidence_reinstated_seqs']


    # make sets for sequences _never_ identified as contaminants:

    output_dict['ever_good_seqs'] = set(seq_ids) - output_dict['putative_contaminants']

    # If considering low abundance sequences, also exclude them from the set of good sequences

    if 'below_relabund_threshold' in output_dict:
        output_dict['ever_good_seqs'] = output_dict['ever_good_seqs'] - set(output_dict['below_relabund_threshold'])

    # Make set of good seqs for final filtering

    final_good_seqs = output_dict['ever_good_seqs']

    # ...and those either never ID'd as contaminants or reinstated:
    if reinstatement:
        output_dict['all_good_seqs'] = set(output_dict['ever_good_seqs'] | output_dict['reinstated_seqs'])
        final_good_seqs = output_dict['all_good_seqs']
        # ...and those who remain contaminants after reinstatement:
        output_dict['never_good_seqs'] = set(output_dict['putative_contaminants'] - output_dict['reinstated_seqs'])


    # print filtered OTU maps if given a QIIME OTU map input

    if otu_map_fp:
        print_filtered_output('otu_map', otu_map_fp, output_dir, output_dict)


    # print filtered Mothur counts tables if given a Mothur counts table input

    if mothur_output:
        print_filtered_output('mothur_counts', mothur_counts_fp, output_dir, output_dict)


    # print filtered seq header files if requested

    if write_output_seq_lists:
        print_filtered_output('seq_headers', seq_ids, output_dir, output_dict)


    # filter final biom file to just good seqs

    filtered_biom = unique_seq_biom.filter(lambda val, id_, metadata: id_ in final_good_seqs,
                     axis='observation', invert=False, inplace=False)

    # drop heavily contaminated libraries if requested

    if drop_lib_threshold:
        good_seq_relabund = unique_seq_biom.norm(inplace=False).filter(
            lambda val, id_, metadata: id_ in final_good_seqs,
            axis='observation', invert=False, inplace=False)
        dropped_libs = good_seq_relabund.filter(
            lambda val, id_, metadata: sum(val) >= drop_lib_threshold,
            axis='sample', invert=True, inplace=False).ids(axis='sample')
        filtered_biom.filter(lambda val, id_, metadata: id_ in dropped_libs,
                 axis='sample', invert=True, inplace=True)
    else:
        dropped_libs = []


    # print filtered biom/mothur_output if library filtering is requested

    if write_filtered_output:

        if mothur_output:
            output_counts_string = biom_to_mothur_counts(filtered_biom)
            with open(os.path.join(output_dir,'decontaminated_table.counts'), "w") as output_counts_file:
                output_counts_file.write(output_counts_string)
        else:
            output_biom_string = filtered_biom.to_json('Filtered by decontaminate.py')
            with open(os.path.join(output_dir,'decontaminated_otu_table.biom'), "w") as output_biom_file:
                output_biom_file.write(output_biom_string)



    # print per-library stats if requested

    if write_per_library_stats:
        per_library_stats, per_library_stats_header = calc_per_library_decontam_stats(unique_seq_biom, output_dict)
        library_stats_string = print_per_library_stats(per_library_stats, per_library_stats_header, unique_seq_biom.ids(axis='sample'), dropped_libs=dropped_libs)
        
        with open(os.path.join(output_dir,'decontamination_per_library_stats.txt'), "w") as output_stats_file:
            output_stats_file.write(library_stats_string)


    # print otu by disposition file if requested

    if write_per_seq_disposition:
        per_seq_disposition = print_otu_disposition(seq_ids, output_dict)

        with open(os.path.join(output_dir,'decontamination_per_otu_disposition.txt'), "w") as output_stats_file:
            output_stats_file.write(per_seq_disposition)


    # print log file / per-seq info
    if write_per_seq_stats:
        print_results_file(seq_ids,
                       output_dict,
                       os.path.join(output_dir,'contamination_summary.txt'),
                       contamination_stats_header,
                       contamination_stats_dict,
                       corr_contaminant_dict)
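The contaminant bookkeeping above reduces to a few set operations; a toy illustration (the sequence IDs and their memberships are made up, not output of this script):

seq_ids = set(['seq1', 'seq2', 'seq3', 'seq4', 'seq5'])
abund_contaminants = set(['seq1', 'seq2'])  # more abundant in blanks
ref_contaminants = set(['seq2', 'seq3'])    # hit the contaminant reference DB
reinstated_seqs = set(['seq2'])             # rescued by reinstatement criteria

putative_contaminants = abund_contaminants | ref_contaminants  # seq1, seq2, seq3
ever_good_seqs = seq_ids - putative_contaminants               # seq4, seq5
all_good_seqs = ever_good_seqs | reinstated_seqs               # seq2, seq4, seq5
never_good_seqs = putative_contaminants - reinstated_seqs      # seq1, seq3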
Example #48
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - statistical method to run; one of 'adonis', 'anosim',
            'bioenv', 'dbrda', 'morans_i', 'mrpp', 'permanova', or 'permdisp'
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'bioenv', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method in ('anosim', 'permanova'):
            if method == 'anosim':
                method_fn = anosim
            elif method == 'permanova':
                method_fn = permanova

            results = method_fn(dm,
                                df,
                                column=categories[0],
                                permutations=num_perms)
        elif method == 'bioenv':
            results = bioenv(dm, df, columns=categories)

        results.to_csv(out_fp, sep='\t')
    else:
        # Remove any samples from the mapping file that aren't in the distance
        # matrix (important for validation checks). Use strict=True so that an
        # error is raised if the distance matrix contains any samples that
        # aren't in the mapping file.
        with open(map_fp, 'U') as map_f:
            md_map = MetadataMap.parseMetadataMap(map_f)
        md_map.filterSamples(dm.ids, strict=True)

        # These methods are run in R. Input validation must be done here before
        # running the R commands.
        if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
            # Check to make sure all categories passed in are in mapping file
            # and are not all the same value.
            for category in categories:
                if not category in md_map.CategoryNames:
                    raise ValueError("Category '%s' not found in mapping file "
                                     "columns." % category)

                if md_map.hasSingleCategoryValue(category):
                    raise ValueError("All values in category '%s' are the "
                                     "same. The statistical method '%s' "
                                     "cannot operate on a category that "
                                     "creates only a single group of samples "
                                     "(e.g. there are no 'between' distances "
                                     "because there is only a single group)." %
                                     (category, method))

            # Build the command arguments string.
            command_args = [
                '-d %s -m %s -c %s -o %s' %
                (dm_fp, map_fp, categories[0], out_dir)
            ]

            if method == 'morans_i':
                # Moran's I requires only numeric categories.
                for category in categories:
                    if not md_map.isNumericCategory(category):
                        raise TypeError("The category '%s' is not numeric. "
                                        "Not all values could be converted to "
                                        "numbers." % category)
            else:
                # The rest require groups of samples, so the category values
                # cannot all be unique.
                for category in categories:
                    if (md_map.hasUniqueCategoryValues(category)
                            and not (method == 'adonis'
                                     and md_map.isNumericCategory(category))):
                        raise ValueError("All values in category '%s' are "
                                         "unique. This statistical method "
                                         "cannot operate on a category with "
                                         "unique values (e.g. there are no "
                                         "'within' distances because each "
                                         "group of samples contains only a "
                                         "single sample)." % category)

                # Only Moran's I doesn't accept a number of permutations.
                if num_perms < 0:
                    raise ValueError("The number of permutations must be "
                                     "greater than or equal to zero.")

                command_args[0] += ' -n %d' % num_perms

            rex = RExecutor(TmpDir=get_qiime_temp_dir())
            rex(command_args, '%s.r' % method)
        else:
            raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                             (method, methods))
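A hedged usage sketch of compare_categories; the file paths and the 'Treatment' category are hypothetical, and out_dir must already exist, as the docstring notes:

compare_categories(dm_fp='unweighted_unifrac_dm.txt',
                   map_fp='map.txt',
                   method='permanova',
                   categories=['Treatment'],
                   num_perms=999,
                   out_dir='permanova_out')
# results are written to permanova_out/permanova_results.txt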
Example #49
0
def compare_alpha_diversities(rarefaction_lines,
                              mapping_lines,
                              category,
                              depth=None,
                              test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Notes:
     Returns a dict whose keys are the pairs of treatments being compared and
     whose values are (obs_t, p_val) tuples for each comparison, along with a
     dict of per-treatment (mean, std) alpha diversity values.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
     the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    ps_avg_div = get_per_sample_average_diversities(rarefaction_data, depth)

    ttest_results, ad_avgs = {}, {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, the
        # Monte Carlo method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            # add alpha diversity averages and standard deviations. since there
            # is only a single sample in this part of the loop, we can just
            # record that sample's value as the avg and 0 as the std.
            ad_avgs[treatment_pair[0]] = (ps_avg_div[sid_pair[0][0]], 0.)
            ad_avgs[treatment_pair[1]] = (ps_avg_div[sid_pair[1][0]], 0.)
        else:
            i = array([ps_avg_div[x] for x in sid_pair[0]])
            j = array([ps_avg_div[x] for x in sid_pair[1]])
            # add alpha diversity averages and standard deviations.
            ad_avgs[treatment_pair[0]] = (i.mean(), i.std())
            ad_avgs[treatment_pair[1]] = (j.mean(), j.std())
            # conduct tests
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(
                    i, j, permutations=num_permutations)
                if p_val is not None:
                    p_val = float(
                        format_p_value_for_num_iters(
                            p_val, num_iters=num_permutations))
                elif p_val is None:  # None will error in format_p_val
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)

    return ttest_results, ad_avgs
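A hedged usage sketch of compare_alpha_diversities; the collated rarefaction file, mapping file, depth, and 'Treatment' category are hypothetical:

rarefaction_lines = open('alpha_div_collated/PD_whole_tree.txt', 'U')
mapping_lines = open('map.txt', 'U')
ttest_results, ad_avgs = compare_alpha_diversities(
    rarefaction_lines, mapping_lines, 'Treatment',
    depth=100, test_type='nonparametric', num_permutations=999)
rarefaction_lines.close()
mapping_lines.close()
for (t1, t2), (obs_t, p_val) in ttest_results.items():
    print("%s vs %s: t=%s, p=%s" % (t1, t2, obs_t, p_val))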
Example #50
0
def run_core_diversity_analyses(biom_fp,
                                mapping_fp,
                                sampling_depth,
                                output_dir,
                                qiime_config,
                                command_handler=call_commands_serially,
                                tree_fp=None,
                                params=None,
                                categories=None,
                                arare_min_rare_depth=10,
                                arare_num_steps=10,
                                parallel=False,
                                suppress_taxa_summary=False,
                                suppress_beta_diversity=False,
                                suppress_alpha_diversity=False,
                                suppress_group_significance=False,
                                status_update_callback=print_to_stdout):
    """
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
            parse_mapping_file_to_dict(open(mapping_fp, 'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" %
                    (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." %
                    c)

    else:
        categories = []
    comma_separated_categories = ','.join(categories)
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging
    old_log_fps = glob(join(output_dir, 'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(
        ('Master run log', log_fp, _index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(
            ('Previous run log', old_log_fp, _index_headers['run_summary']))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
            "biom summarize-table -i %s -o %s %s" % \
            (biom_fp, biom_table_stats_output_fp, params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" %
                     biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics', biom_table_stats_output_fp,
                        _index_headers['run_summary']))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
            (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append([(
            'Filter low sequence count samples from table (minimum sequence count: %d)'
            % sampling_depth, filter_samples_cmd)])
    else:
        logger.write(
            "Skipping filter_samples_from_otu_table.py as %s exists.\n\n" %
            filtered_biom_fp)
    biom_fp = filtered_biom_fp

    # rarefy the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" %\
            (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([
            ('Rarefy the OTU table to %d sequences/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n" %
                     rarefied_biom_fp)

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling depth=None here as
                # we rarefy the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping beta_diversity_through_plots.py as %s exist(s).\n\n"
                % ', '.join(existing_dm_fps))
            even_dm_fps = [(split(fp)[1].strip('_dm.txt'), fp)
                           for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (
                    bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir,
                                                          category)
                stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir,
                                                       category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                        (dm_fp, category, boxplots_output_dir,
                         mapping_fp, params_str)
                    commands.append([('Boxplots (%s)' % category, boxplots_cmd)
                                     ])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n"
                        % (category, plot_output_fp))
                index_links.append(
                    ('Distance boxplots (%s)' % bdiv_metric, plot_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(
                    ('Distance boxplots statistics (%s)' % bdiv_metric,
                     stats_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))

            index_links.append(
                ('PCoA plot (%s)' % bdiv_metric,
                 '%s/%s_emperor_pcoa_plot/index.html' %
                 (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Distance matrix (%s)' % bdiv_metric,
                 '%s/%s_dm.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Principal coordinate matrix (%s)' % bdiv_metric,
                 '%s/%s_pc.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,
                                                     sampling_depth)
        rarefaction_plots_output_fp = \
            '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" %
                         rarefaction_plots_output_fp)

        index_links.append(
            ('Alpha rarefaction plots', rarefaction_plots_output_fp,
             _index_headers['alpha_diversity']))

        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(
                    split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = '%s/compare_%s' % \
                    (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = \
                        'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                        (collated_alpha_diversity_fp,
                         mapping_fp,
                         comma_separated_categories,
                         compare_alpha_output_dir,
                         params_str)
                    commands.append([
                        ('Compare alpha diversity (%s)' % alpha_metric,
                         compare_alpha_cmd)
                    ])
                    for category in categories:
                        alpha_comparison_stat_fp = '%s/%s_stats.txt' % \
                            (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = '%s/%s_boxplots.pdf' % \
                            (compare_alpha_output_dir, category)
                        index_links.append(
                            ('Alpha diversity statistics (%s, %s)' %
                             (category, alpha_metric),
                             alpha_comparison_stat_fp,
                             _index_headers['alpha_diversity']))
                        index_links.append(
                            ('Alpha diversity boxplots (%s, %s)' %
                             (category, alpha_metric),
                             alpha_comparison_boxplot_fp,
                             _index_headers['alpha_diversity']))
                else:
                    logger.write("Skipping compare_alpha_diversity.py"
                                 " for %s as %s exists.\n\n" %
                                 (alpha_metric, compare_alpha_output_dir))
        else:
            logger.write("Skipping compare_alpha_diversity.py as"
                         " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(
            join(taxa_plots_output_dir, 'taxa_summary_plots', '*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(
            ('Taxa summary bar plots',
             '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        index_links.append(
            ('Taxa summary area plots',
             '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,
                                                           category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob('%s/taxa_summary_plots/*.html' %
                                               taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(
                ('Taxa summary bar plots',
                 '%s/taxa_summary_plots/bar_charts.html' %
                 taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))
            index_links.append(
                ('Taxa summary area plots',
                 '%s/taxa_summary_plots/area_charts.html' %
                 taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))

    if not suppress_group_significance:
        params_str = get_params_str(params['group_significance'])
        # group significance tests, aka category significance
        for category in categories:
            group_significance_fp = \
                '%s/group_significance_%s.txt' % (output_dir, category)
            if not exists(group_significance_fp):
                # Build the OTU category significance command
                group_significance_cmd = \
                    'group_significance.py -i %s -m %s -c %s -o %s %s' %\
                    (rarefied_biom_fp, mapping_fp, category,
                     group_significance_fp, params_str)
                commands.append([('Group significance (%s)' % category,
                                  group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n" %
                    (category, group_significance_fp))

            index_links.append(
                ('Category significance (%s)' % category,
                 group_significance_fp,
                 _index_headers['group_significance']))

    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table',
                          'gzip %s' % filtered_biom_fp)])
    else:
        logger.write(
            "Skipping compressing of filtered BIOM table as %s exists.\n\n" %
            filtered_biom_gzip_fp)
    index_links.append(
        ('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
         filtered_biom_gzip_fp, _index_headers['run_summary']))

    rarefied_biom_gzip_fp = '%s.gz' % rarefied_biom_fp
    if not exists(rarefied_biom_gzip_fp):
        commands.append([('Compress the rarefied BIOM table',
                          'gzip %s' % rarefied_biom_fp)])
    else:
        logger.write(
            "Skipping compression of the rarefied BIOM table as %s exists.\n\n" %
            rarefied_biom_gzip_fp)
    index_links.append(
        ('Rarefied BIOM table (sampling depth: %d)' % sampling_depth,
         rarefied_biom_gzip_fp, _index_headers['run_summary']))

    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
Example #51
0
def compare_alpha_diversities(rarefaction_lines,
                              mapping_lines,
                              category,
                              depth=None,
                              test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.
    Notes:
     Returns a dict whose keys are the pairs of treatments being compared and
     whose values are (obs_t, p_val) tuples for each comparison, along with a
     dict of per-treatment (mean, std) alpha diversity values.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines. 
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use 
     the deepest available in the file. 
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.    
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth == None:
        depth = array(rarefaction_data[3])[:, 0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])

    # Average each column of the rarefaction matrix. The t-test is computed on
    # averages over all iterations, which avoids extra comparisons that would
    # hurt significance.
    rare_mat = (rare_mat.sum(0) /
                rare_mat.shape[0])[2:]  #remove depth,iter cols
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    ttest_results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, the
        # Monte Carlo method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
        else:
            pair0_indices = [sids.index(i) for i in sid_pair[0]]
            pair1_indices = [sids.index(i) for i in sid_pair[1]]
            i = rare_mat.take(pair0_indices)
            j = rare_mat.take(pair1_indices)
            # found discussion of how to quickly check an array for nan here:
            # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(
                    i, j, permutations=num_permutations)
                if p_val is not None:
                    p_val = float(
                        format_p_value_for_num_iters(
                            p_val, num_iters=num_permutations))
                else:  # a p-value of None would error in format_p_value_for_num_iters
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)
    # create dict of average alpha diversity values
    alphadiv_avgs = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # calculate the alpha diversity mean and std values. Looking only at the
        # first member of each treatment pair doesn't guarantee full coverage,
        # so both members must be examined.
        for sid_list, treatment_str in zip(sid_pair, treatment_pair):
            # check if already computed and added
            if treatment_str not in alphadiv_avgs:
                alphadiv_vals = \
                    rare_mat.take([sids.index(i) for i in sid_list])
                ad_mean = alphadiv_vals.mean()
                ad_std = alphadiv_vals.std()
                alphadiv_avgs[treatment_str] = (ad_mean, ad_std)
    return ttest_results, alphadiv_avgs
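A hypothetical usage sketch for the function above; the file names, category, and depth are placeholders. It reads a collated multiple-rarefaction alpha diversity file and a mapping file, then prints the per-pair test results.

with open('alpha_rarefaction.txt', 'U') as rarefaction_f, \
        open('mapping.txt', 'U') as mapping_f:
    ttest_results, alphadiv_avgs = compare_alpha_diversities(
        rarefaction_f, mapping_f, category='Treatment', depth=100,
        test_type='nonparametric', num_permutations=999)

for (treatment_a, treatment_b), (t_stat, p_val) in ttest_results.items():
    print '%s vs %s: t = %s, p = %s' % (treatment_a, treatment_b, t_stat, p_val)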
Пример #52
0
    mapping_fp = opts.map_fp
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.read(f)

    # Parse the mapping file
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')
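    # metamap rows are indexed by SampleID; columns are the mapping file fields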

    for category in categories:
        if category not in metamap.keys():
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % categories)

    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            sort_category = None
        elif sort_by not in metamap.keys():
            option_parser.error("Sort category %s does not exist in the "
                                "mapping file. Available categories are: %s" %
                                (sort_by, metamap.keys()))
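A minimal sketch of the mapping-file-to-DataFrame pattern used in the snippet above, assuming QIIME 1's qiime.parse module provides parse_mapping_file_to_dict as shown there; 'mapping.txt' and 'Treatment' are placeholder values.

import pandas as pd
from qiime.parse import parse_mapping_file_to_dict

with open('mapping.txt', 'U') as f:
    map_dict = parse_mapping_file_to_dict(f)[0]  # [1] holds the file comments
metamap = pd.DataFrame.from_dict(map_dict, orient='index')

requested_category = 'Treatment'
if requested_category not in metamap.columns:
    raise ValueError("Category %s does not exist in the mapping file" %
                     requested_category)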
Пример #53
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any([i in nonshared_samples for i in tmp_bt.ids()]):
            raise ValueError(
                'The samples in the biom table are a superset of' +
                ' the samples in the mapping file. The script will abort in' +
                ' this case even though the calculations wouldn\'t be' +
                ' affected, to ensure consistency within QIIME. Pass the' +
                ' --biom_samples_are_superset option to disable this behavior.'
            )
    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        print 'The following samples were not shared between the mapping file' +\
            ' and the biom file and will not be included in the analysis:\n' +\
            ' '.join(nonshared_samples)

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)
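    # cat_sam_indices maps each value of the category to the column indices of
    # its samples in the biom table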

    # sanity check to prevent inscrutable errors later
    if not all([len(v) > 0 for k, v in cat_sam_indices.items()]):
        raise ValueError(
            'At least one metadata group has no samples. Check ' +
            'that the mapping file has at least one sample for each value in '
            + 'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may ' +
            'only be used when there are two sample groups. Choose another ' +
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        # total number of samples across both groups
        sams = sum(len(v) for v in cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the ' +
                'Mann-Whitney-U normal approximation. Review the script ' +
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (observation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(bt,
                                                test_stats,
                                                pvals,
                                                fdr_pvals,
                                                bon_pvals,
                                                means,
                                                cat_sam_indices,
                                                md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    with open(opts.output_fp, 'w') as o:
        o.write('\n'.join(lines))
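An illustrative sketch of the correct-and-clip step above, using plain numpy instead of QIIME's correction helpers; the p-values are made up, and a simple multiply-by-number-of-tests Bonferroni stands in for bonferroni_correction.

from numpy import array, where

pvals = array([0.001, 0.04, 0.3])
bon_pvals = pvals * len(pvals)                       # naive Bonferroni correction
bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)   # cap corrected p-values at 1.0
print bon_pvals                                      # -> [ 0.003  0.12  0.9 ]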