Example #1
def _function(inputs, database, output, levels, save_median_taxatable=False):
    # Create the output directory if it does not already exist
    os.makedirs(output, exist_ok=True)

    # Load the datafiles to locate function db
    data_files = _load_metadata(database)

    # Load the functional db
    logger.info("Loading the functional database and converting.")
    func_db = parse_function_db(data_files, database)

    for input, level in zip(inputs, levels):
        # Functional prediction only makes sense at genus level or finer
        if level in ['genus', 'species', 'strain']:
            logger.info(
                "Starting functional prediction with input file %s at level %s"
                % (os.path.abspath(input), level))
            function_run_and_save(input,
                                  func_db,
                                  output,
                                  TAXAMAP[level],
                                  save_median_taxatable=save_median_taxatable)
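A minimal invocation sketch for the dispatcher above. Every path below is a hypothetical placeholder, and the rank names are assumed to be keys of the project's TAXAMAP:

# Hypothetical usage: all paths below are placeholders, not shipped files.
inputs = ["sampleA.species.taxatable.txt", "sampleB.kingdom.taxatable.txt"]
levels = ["species", "kingdom"]

# Only sampleA is processed; "kingdom" is coarser than genus and is skipped.
_function(inputs,
          database="/path/to/functional_db",
          output="functional_out",
          levels=levels,
          save_median_taxatable=True)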
Example #2
def summarize_functional(ctx, input, database, output):
    prefix = ".".join(os.path.basename(input).split('.')[:-1]).replace(".kegg", "")


    # Create the output directory if it does not already exist
    os.makedirs(output, exist_ok=True)

    # Load the datafiles to locate function db
    data_files = _load_metadata(database)

    # Load the functional db
    logger.info("Loading the functional database and converting.")
    func_db = parse_function_db(data_files, database)
    # Load the per-sample KEGG table (rows: KEGG IDs, columns: samples)
    kegg_df = pd.read_csv(input, sep="\t", index_col=0)

    out_kegg_modules_df, out_kegg_modules_coverage = summarize_kegg_table(kegg_df, func_db['modules'])
    out_kegg_pathways_df, out_kegg_pathways_coverage = summarize_kegg_table(kegg_df, func_db['pathways'])

    out_kegg_modules_df.to_csv(os.path.join(output, "%s.kegg.modules.txt" % prefix), sep='\t', float_format="%d",
                               na_rep=0, index_label="#MODULE ID")
    out_kegg_modules_coverage.to_csv(os.path.join(output, "%s.kegg.modules.coverage.txt" % prefix), sep='\t',
                                     float_format="%f", na_rep=0, index_label="#MODULE ID")
    out_kegg_pathways_df.to_csv(os.path.join(output, "%s.kegg.pathways.txt" % prefix), sep='\t', float_format="%d",
                                na_rep=0, index_label="#PATHWAY ID")
    out_kegg_pathways_coverage.to_csv(os.path.join(output, "%s.kegg.pathways.coverage.txt" % prefix), sep='\t',
                                      float_format="%f", na_rep=0, index_label="#PATHWAY ID")
Example #3
def _post_align(self, outf, **kwargs):
    """Return the query names whose percent identity passes the filter."""
    alignments = set()
    i = 0
    with open(outf) as alignment_file:
        alignment_gen = csv.reader(alignment_file, delimiter="\t")
        # b6-style rows: query name, target name, percent identity, ...
        for row in alignment_gen:
            alignment_score = float(row[2])
            if alignment_score >= self.percent_id:
                alignments.add(row[0])
                i += 1
    logger.info("Human hits filter: %d" % i)
    return alignments
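Since _post_align only reads self.percent_id, it can be exercised without the full aligner class. A sketch under that assumption, with an invented two-row alignment file (the 90.0 threshold and file name are placeholders):

import csv
import logging
from types import SimpleNamespace

logger = logging.getLogger(__name__)  # _post_align logs through this

# Fabricate a tiny tab-delimited alignment file: query, target, percent id.
with open("demo.b6", "w") as f:
    f.write("read1\thuman_chr1\t97.5\n")   # passes a 90.0 threshold
    f.write("read2\thuman_chr2\t80.0\n")   # filtered out

# Only self.percent_id is read, so a lightweight stand-in object suffices.
aligner = SimpleNamespace(percent_id=90.0)
print(_post_align(aligner, "demo.b6"))     # -> {'read1'}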
Example #4
def function_run_and_save(input,
                          func_db,
                          output,
                          level,
                          save_median_taxatable=True):
    prefix = ".".join(os.path.basename(input).split('.')[:-1])

    kegg_pathways_df = func_db['pathways']
    kegg_modules_df = func_db['modules']
    row_names = func_db['names']
    kegg_ids = func_db['kegg_ids']
    kegg_table_csr = func_db['csr']

    logger.debug("Summarizing the KEGG table at level %d with the median." %
                 level)
    # Level 8 (strain) is the finest rank; coarser levels need collapsing
    if level < 8:
        kegg_table_csr, row_names = summarize_at_level(kegg_table_csr,
                                                       row_names, kegg_ids,
                                                       level)
    logger.debug("Number of rows %d" % len(list(row_names.keys())))

    # Tag the output prefix with the rank name if it is not already there
    if TAXA[level - 1] not in prefix:
        prefix += "." + TAXA[level - 1]

    logger.info("Reading in taxatable for functional prediction at %s." %
                os.path.abspath(input))
    taxatable_df = pd.read_csv(input, sep="\t", index_col=0)
    logger.debug("Taxatable for functional prediction shape %s" %
                 str(taxatable_df.shape))
    # Keep only rows whose index is a string (drops NaN index entries)
    taxatable_df = taxatable_df[[isinstance(_, str) for _ in taxatable_df.index]]

    taxatable_df['summary'] = [
        ';'.join(_.split(';')[:level]).replace(' ', '_')
        for _ in taxatable_df.index
    ]
    # Drop lineages that are coarser than the requested level
    taxatable_df = taxatable_df[[
        _.count(';') + 1 >= level for _ in taxatable_df['summary']
    ]]
    taxatable_df = taxatable_df.groupby('summary').sum().fillna(0.)

    # Normalize each sample to the median sequencing depth
    taxatable_df = normalize_by_median_depth(taxatable_df)
    if save_median_taxatable:
        taxatable_df.to_csv(os.path.join(output, "%s.normalized.txt" % prefix),
                            sep='\t',
                            float_format="%d",
                            na_rep=0,
                            index_label="#OTU ID")

    logger.debug("Taxatable summarized shape %s" % str(taxatable_df.shape))

    logger.info("Starting functional prediction.")
    out_kegg_table_df, out_kegg_modules_df, out_kegg_modules_coverage, out_kegg_pathways_df, out_kegg_pathways_coverage = _do_function(
        taxatable_df, row_names, kegg_ids, kegg_table_csr, kegg_modules_df,
        kegg_pathways_df)
    out_kegg_table_df.to_csv(os.path.join(output, "%s.kegg.txt" % prefix),
                             sep='\t',
                             float_format="%d",
                             na_rep=0,
                             index_label="#KEGG ID")
    out_kegg_modules_df.to_csv(os.path.join(output,
                                            "%s.kegg.modules.txt" % prefix),
                               sep='\t',
                               float_format="%d",
                               na_rep=0,
                               index_label="#MODULE ID")
    out_kegg_modules_coverage.to_csv(os.path.join(
        output, "%s.kegg.modules.coverage.txt" % prefix),
                                     sep='\t',
                                     float_format="%f",
                                     na_rep=0,
                                     index_label="#MODULE ID")
    out_kegg_pathways_df.to_csv(os.path.join(output,
                                             "%s.kegg.pathways.txt" % prefix),
                                sep='\t',
                                float_format="%d",
                                na_rep=0,
                                index_label="#PATHWAY ID")
    out_kegg_pathways_coverage.to_csv(os.path.join(
        output, "%s.kegg.pathways.coverage.txt" % prefix),
                                      sep='\t',
                                      float_format="%f",
                                      na_rep=0,
                                      index_label="#PATHWAY ID")
Example #5
def get_coverage_of_microbes(infile,
                             shear,
                             level,
                             parse_taxonomy_from_row=lambda row: row[-1]):
    # Load the shear table summarized at the requested level
    shear_df = summarize_bayes_at_level(shear, level=level)

    samples_begin_map = dict()
    taxa_hits = defaultdict(int)

    logger.info("Started the coverage parsing.")
    with open(infile) as utree_f:
        csv_embalm = csv.reader(utree_f, delimiter='\t')
        # qname, lca, confidence, support
        for num, line in enumerate(csv_embalm):
            if num % 10000 == 0:
                logger.info("Parsed %d lines of b6." % num)
            # TODO confidence/support filter
            # Alignment start coordinate (column 9 of the tab-delimited row)
            begin = int(line[8])
            taxaname = parse_taxonomy_from_row(line)
            taxa_level = taxaname.count(';') + 1
            if taxa_level >= level:
                # Collapse lineages finer than the requested level
                if taxa_level != level:
                    taxaname = ';'.join(taxaname.split(";")[:level])
                if taxaname in shear_df.index:
                    taxa_hits[taxaname] += 1
                    # Bin alignment start positions into 100 bp windows
                    indx = int(np.floor(begin / 100.))
                    if taxaname not in samples_begin_map:
                        genome_length = int(
                            shear_df['genome_length_median'][taxaname])
                        samples_begin_map[taxaname] = np.zeros(genome_length)
                    bins = samples_begin_map[taxaname]
                    if indx == 0:
                        bins[0] += 1
                    elif indx + 1 >= bins.shape[0]:
                        # Clamp hits at or past the final window to the last bin
                        bins[-1] += 1
                    else:
                        bins[indx] += 1
                        bins[indx + 1] += 1
                else:
                    logger.warning("The taxa %s was not found." % taxaname)

    # Eight coverage statistics per taxon, one row each
    xx = np.zeros((len(samples_begin_map), 8))
    for i, taxaname in enumerate(sorted(samples_begin_map.keys())):
        if i % 1000 == 0:
            logger.info("Calculated %d coverages." % i)
        unique_hits = taxa_hits[taxaname]
        hits = samples_begin_map[taxaname]
        coverages = zero_runs(hits)
        # Leading and trailing uncovered runs are two halves of a single
        # wrap-around gap; merge them into one run
        if coverages[0][0] == 0 and coverages[-1][-1] == hits.shape[0]:
            temp = coverages[:, 1] - coverages[:, 0]
            coverages = np.concatenate(
                (coverages, np.atleast_2d(np.array([0, temp[0] + temp[-1]]))))
        max_uncovered_region = np.max(coverages[:, 1] - coverages[:, 0])
        percent_max_uncovered = max_uncovered_region / shear_df[
            'genome_length_median'][taxaname]
        percent_covered = np.sum(
            hits > 0) / shear_df['genome_length_median'][taxaname]
        unique_counts = shear_df.iloc[:, level - 1][taxaname]
        expected_c = expected_coverage(unique_counts, unique_hits)
        row = np.array([
            max_uncovered_region, percent_max_uncovered, percent_covered,
            shear_df['genome_length_median'][taxaname], unique_hits,
            unique_counts, expected_c, percent_covered / expected_c
        ])
        row[np.isnan(row)] = 0
        xx[i] = row
    df = pd.DataFrame(xx,
                      columns=[
                          'max_uncovered_region',
                          'percent_max_uncovered_region',
                          'percent_of_genome_covered', 'median_genome_size',
                          'hits_in_clade', 'unique_counts_of_clade',
                          'expected_coverage', 'ratio_covered_over_expected'
                      ],
                      index=sorted(samples_begin_map.keys()))
    logger.info("Completed the coverage analysis.")
    return df
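zero_runs is a project helper that is not shown on this page. A common NumPy idiom, given here only as a sketch of the contract the coverage loop above relies on, returns an (n, 2) array of [start, stop) bounds for each run of zeros:

import numpy as np

def zero_runs_sketch(a):
    """Sketch of the assumed zero_runs contract: an (n, 2) array of
    [start, stop) bounds for each run of zeros in `a`."""
    # Pad a zero-mask so runs touching either end still produce edges
    iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0]))
    absdiff = np.abs(np.diff(iszero))
    # Mask edges come in (start, stop) pairs
    return np.where(absdiff == 1)[0].reshape(-1, 2)

hits = np.array([0, 0, 3, 0, 1, 0, 0])  # toy per-bin hit counts
print(zero_runs_sketch(hits))           # [[0 2] [3 4] [5 7]]

Because the bounds are half-open, a leading run starting at 0 and a trailing run ending at hits.shape[0] are the two ends of one wrap-around gap, which is exactly what the merge step above reconstructs.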