Example #1
def summarize_analysis(
        *args,
        **kwargs) -> Tuple[Optional[Sample], Counter[Id], Counter[Id], Scores]:
    """
    Summarize a cross-analysis (usually called in parallel!).
    """
    # Recover input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    # TODO: Delete the following comment lines in a future release
    # including = ontology.including   # See comment below for the reason
    # excluding = ontology.excluding   # in/excluding are not used anymore
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    output: io.StringIO = io.StringIO(newline='')

    # Declare/define variables
    summary_counts: Counter[Id] = col.Counter()
    summary_acc: Counter[Id] = col.Counter()
    summary_score: Scores = Scores({})
    summary: Optional[Sample] = None

    output.write(gray('Summary for ') + analysis + gray('... '))

    target_samples: List[Sample] = [
        smpl for smpl in samples if smpl.startswith(analysis)
    ]
    assert len(target_samples) >= 1, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for smpl in target_samples:
        summary_counts += counts[smpl]
        summary_score.update(scores[smpl])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=summary_counts, scores=summary_score)
    tree.subtract()
    tree.shape()
    summary_counts.clear()
    summary_score.clear()
    # Avoid including/excluding here as get_taxa is not as 'clever' as allin1
    #  and taxa are already included/excluded in the derived samples
    tree.get_taxa(counts=summary_counts,
                  accs=summary_acc,
                  scores=summary_score)
    summary_counts = +summary_counts  # remove counts <= 0
    if summary_counts:  # Avoid returning empty sample (summary would be None)
        summary = Sample(f'{analysis}_{STR_SUMMARY}')
        output.write(
            gray('(') + cyan(f'{len(target_samples)}') + gray(' samples)') +
            green(' OK!\n'))
    else:
        output.write(yellow(' VOID\n'))
    # Print output and return
    print(output.getvalue(), end='')
    sys.stdout.flush()
    return summary, summary_counts, summary_acc, summary_score
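
The docstring above notes that summarize_analysis is usually called in parallel. A minimal sketch of one way such a dispatch could look, assuming a multiprocessing.Pool over a list of analyses and the shared ontology/counts/scores/samples mappings (the summarize_all helper and its arguments are hypothetical, not part of the original module):

from functools import partial
from multiprocessing import Pool


def summarize_all(analyses, ontology, counts, scores, samples):
    """Hypothetical driver: run summarize_analysis per analysis in a pool."""
    worker = partial(summarize_analysis, ontology=ontology, counts=counts,
                     scores=scores, samples=samples)
    with Pool() as pool:  # one worker process per CPU core by default
        results = pool.map(worker, analyses)
    # Drop void summaries (summarize_analysis returns summary=None for them)
    return [res for res in results if res[0] is not None]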
Example #2
def summarize_analysis(
        *args,
        **kwargs) -> Tuple[Optional[Sample], Counter[Id], Counter[Id], Scores]:
    """
    Summarize a cross-analysis (usually called in parallel!).
    """
    # Recover input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    including = ontology.including
    excluding = ontology.excluding
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    output: io.StringIO = io.StringIO(newline='')

    # Declare/define variables
    summary_counts: Counter[Id] = Counter()
    summary_acc: Counter[Id] = Counter()
    summary_score: Scores = Scores({})
    summary: Optional[Sample] = None

    output.write(gray('Summary for ') + analysis + gray('... '))

    target_samples: List[Sample] = [
        smpl for smpl in samples if smpl.startswith(analysis)
    ]
    assert len(target_samples) >= 1, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for smpl in target_samples:
        summary_counts += counts[smpl]
        summary_score.update(scores[smpl])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=summary_counts, scores=summary_score)
    tree.subtract()
    tree.shape()
    summary_counts.clear()
    summary_score.clear()
    tree.get_taxa(counts=summary_counts,
                  accs=summary_acc,
                  scores=summary_score,
                  include=including,
                  exclude=excluding)
    summary_counts = +summary_counts  # remove counts <= 0
    if summary_counts:  # Avoid returning empty sample (summary would be None)
        summary = Sample(f'{analysis}_{STR_SUMMARY}')
        output.write(
            gray('(') + cyan(f'{len(target_samples)}') + gray(' samples)') +
            green(' OK!\n'))
    else:
        output.write(yellow(' VOID\n'))
    # Print output and return
    print(output.getvalue(), end='')
    sys.stdout.flush()
    return summary, summary_counts, summary_acc, summary_score
Example #3
def main():
    """Main entry point to script."""
    # Argument Parser Configuration
    parser = argparse.ArgumentParser(
        description='Extract reads following Centrifuge/Kraken output',
        epilog=f'%(prog)s  - {__author__} - {__date__}')
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version=f'%(prog)s release {__version__} ({__date__})')
    parser.add_argument('-f',
                        '--file',
                        action='store',
                        metavar='FILE',
                        required=True,
                        help='Centrifuge output file.')
    parser.add_argument('-l',
                        '--limit',
                        action='store',
                        metavar='NUMBER',
                        type=int,
                        default=None,
                        help=('Limit of FASTQ reads to extract. '
                              'Default: no limit'))
    parser.add_argument(
        '-m',
        '--maxreads',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help=('Maximum number of FASTQ reads to search for the taxa. '
              'Default: no maximum'))
    parser.add_argument(
        '-n',
        '--nodespath',
        action='store',
        metavar='PATH',
        default=TAXDUMP_PATH,
        help=('path for the nodes information files (nodes.dmp and names.dmp' +
              ' from NCBI)'))
    parser.add_argument(
        '-i',
        '--include',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath ' +
              '(repeat -i to include several taxids). ' +
              'By default, all taxa are considered for inclusion.'))
    parser.add_argument(
        '-x',
        '--exclude',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath ' +
              '(repeat -x to exclude several taxids)'))
    parser.add_argument(
        '-y',
        '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    filein = parser.add_mutually_exclusive_group(required=True)
    filein.add_argument('-q',
                        '--fastq',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Single FASTQ file (no paired-ends)')
    filein.add_argument('-1',
                        '--mate1',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-end FASTQ file for mate 1s '
                        '(filename usually includes _1)')
    parser.add_argument('-2',
                        '--mate2',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-end FASTQ file for mate 2s '
                        '(filename usually includes _2)')

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    args = parser.parse_args()
    output_file = args.file
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    excluding: Set[TaxId] = set(args.exclude)
    including: Set[TaxId] = set(args.include)
    fastq_1: Filename
    fastq_2: Filename = args.mate2
    if not fastq_2:
        fastq_1 = args.fastq
    else:
        fastq_1 = args.mate1

    # Load NCBI nodes, names and build children
    plasmidfile: Filename = None
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, False,
                              excluding, including)

    # Build taxonomy tree
    print(gray('Building taxonomy tree...'), end='')
    sys.stdout.flush()
    tree = TaxTree()
    tree.grow(taxonomy=ncbi, look_ancestors=False)
    print(green(' OK!'))

    # Get the taxa
    print(gray('Filtering taxa...'), end='')
    sys.stdout.flush()
    ranks: Ranks = Ranks({})
    tree.get_taxa(ranks=ranks, include=including, exclude=excluding)
    print(green(' OK!'))
    taxids: Set[TaxId] = set(ranks)
    taxlevels: TaxLevels = Rank.ranks_to_taxlevels(ranks)
    num_taxlevels = Counter({rank: len(taxlevels[rank]) for rank in taxlevels})
    num_taxlevels = +num_taxlevels

    # Statistics about including taxa
    print(f'  {len(taxids)}\033[90m taxid selected in \033[0m', end='')
    print(f'{len(num_taxlevels)}\033[90m different taxonomical levels:\033[0m')
    for rank in num_taxlevels:
        print(f'  Number of different {rank}: {num_taxlevels[rank]}')
    assert taxids, red('ERROR! No taxids to search for!')

    # Get the records
    records: List[SeqRecord] = []
    num_seqs: int = 0
    # timing initialization
    start_time_load: float = time.perf_counter()
    print(gray(f'Loading output file {output_file}...'), end='')
    sys.stdout.flush()
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for num_seqs, record in enumerate(SeqIO.parse(file, 'centrifuge')):
                tid: TaxId = record.annotations['taxID']
                if tid not in taxids:
                    continue  # Skip reads not assigned to the selected taxa
                score: Score = Score(record.annotations['score'])
                if args.minscore is not None and score < args.minscore:
                    continue
                records.append(record)
    except FileNotFoundError:
        raise Exception(red('ERROR! ') + 'Cannot read "' + output_file + '"')
    print(green(' OK!'))

    # Basic records statistics
    print(
        gray('  Load elapsed time: ') +
        f'{time.perf_counter() - start_time_load:.3g}' + gray(' sec'))
    print(f'  \033[90mMatching reads: \033[0m{len(records):_d} \033[90m\t'
          f'(\033[0m{len(records)/num_seqs:.4%}\033[90m of sample)')
    sys.stdout.flush()

    # FASTQ sequence dealing
    # records_ids: List[str] = [record.id for record in records]
    records_ids: Set[str] = {record.id for record in records}
    seqs1: List[SeqRecord] = []
    seqs2: List[SeqRecord] = []
    extracted: int = 0
    i: int = 0
    if fastq_2:
        print(
            f'\033[90mLoading FASTQ files {fastq_1} and {fastq_2}...\n'
            f'Mseqs: \033[0m',
            end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1, open(fastq_2, 'r') as file2:
                for i, (rec1, rec2) in enumerate(
                        zip(SeqIO.parse(file1, 'quickfastq'),
                            SeqIO.parse(file2, 'quickfastq'))):
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        seqs2.append(rec2)
                        extracted += 1

        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ files')
    else:
        print(f'\033[90mLoading FASTQ files {fastq_1}...\n'
              f'Mseqs: \033[0m',
              end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1:
                for i, rec1 in enumerate(SeqIO.parse(file1, 'quickfastq')):
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        extracted += 1
        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ file')
    print(cyan(f' {i/1e+6:.3g} Mseqs'), green('OK! '))

    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        Args:
            fastq: Complete filename of the fastq input file

        Returns: Filename of the rextracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        if including:
            output_list.append('_incl')
            output_list.extend('_'.join(including))
        if excluding:
            output_list.append('_excl')
            output_list.extend('_'.join(excluding))
        output_list.append('.fastq')
        return Filename(''.join(output_list))

    filename1: Filename = format_filename(fastq_1)
    SeqIO.write(seqs1, filename1, 'quickfastq')
    print(gray('Wrote'), magenta(f'{len(seqs1)}'), gray('reads in'), filename1)
    if fastq_2:
        filename2: Filename = format_filename(fastq_2)
        SeqIO.write(seqs2, filename2, 'quickfastq')
        print(gray('Wrote'), magenta(f'{len(seqs2)}'), gray('reads in'),
              filename2)

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
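
The FASTQ handling in main above hinges on one pattern: keep the wanted read ids in a set and call set.remove inside try/except, so each read is extracted at most once and the loop can stop as soon as the set runs empty or a limit is hit. A self-contained sketch of that pattern, assuming Biopython's standard 'fastq' parser instead of the project-registered 'quickfastq' format (the function name, file path, and id set are placeholders):

from typing import List, Optional, Set

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord


def extract_reads(fastq_path: str, wanted_ids: Set[str],
                  limit: Optional[int] = None) -> List[SeqRecord]:
    """Hypothetical helper: pull reads whose id is in wanted_ids."""
    selected: List[SeqRecord] = []
    remaining = set(wanted_ids)  # work on a copy: each id is consumed once
    for rec in SeqIO.parse(fastq_path, 'fastq'):
        if not remaining or (limit is not None and len(selected) >= limit):
            break  # nothing left to find, or the extraction limit was reached
        try:
            remaining.remove(rec.id)  # KeyError if this read is not wanted
        except KeyError:
            continue
        selected.append(rec)
    return selected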
Example #4
def process_report(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataByTaxId, SampleStats, Err]:
    """
    Process a Centrifuge/Kraken report file (usually called in parallel!).
    """
    # TODO: Full review to report support
    # Recover input and parameters
    filerep: Filename = args[0]
    taxonomy: Taxonomy = kwargs['taxonomy']
    mintaxa: int = kwargs['mintaxa']
    collapse: bool = taxonomy.collapse
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    debug: bool = kwargs['debug']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(filerep)

    # Read Centrifuge/Kraken report file to get abundances
    log: str
    abundances: Counter[TaxId]
    log, abundances, _ = read_report(filerep)
    output.write(log)
    # Remove root counts, in case
    if kwargs['root']:
        vwrite(gray('Removing'), abundances[ROOT], gray('"ROOT" reads... '))
        abundances[ROOT] = 0
        vwrite(green('OK!'), '\n')

    # Build taxonomy tree
    output.write('  \033[90mBuilding taxonomy tree...\033[0m')
    tree = TaxTree()
    tree.grow(taxonomy=taxonomy,
              counts=abundances)  # Grow tax tree from root node
    output.write('\033[92m OK! \033[0m\n')

    # Prune the tree
    output.write('  \033[90mPruning taxonomy tree...\033[0m')
    tree.prune(mintaxa, None, collapse, debug)
    tree.shape()
    output.write('\033[92m OK! \033[0m\n')

    # Get the taxa with their abundances and taxonomical levels
    output.write('  \033[90mFiltering taxa...\033[0m')
    new_abund: Counter[TaxId] = col.Counter()
    new_accs: Counter[TaxId] = col.Counter()
    ranks: Ranks = Ranks({})
    tree.get_taxa(abundance=new_abund,
                  accs=new_accs,
                  ranks=ranks,
                  mindepth=0,
                  maxdepth=0,
                  include=including,
                  exclude=excluding)
    new_abund = +new_abund  # remove zero and negative counts
    if including or excluding:  # Recalculate accumulated counts
        new_tree = TaxTree()
        new_tree.grow(taxonomy, new_abund)  # Grow tree with new abund
        new_tree.shape()
        new_abund = col.Counter()  # Reset abundances
        new_accs = col.Counter()  # Reset accumulated
        new_tree.get_taxa(new_abund, new_accs)  # Get new accumulated counts
    out: SampleDataByTaxId = SampleDataByTaxId()
    out.set(counts=new_abund, ranks=ranks, accs=new_accs)
    output.write('\033[92m OK! \033[0m\n')
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, SampleStats(), Err.NO_ERROR
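
Like summarize_analysis, process_report collects all of its progress messages in an io.StringIO buffer and emits them with a single print at the end; when many workers run in a process pool, one print per task keeps each worker's log block together instead of interleaving line by line. A stripped-down sketch of that buffering pattern (worker and task are illustrative names, not part of the original module):

import io
import sys


def worker(task: str) -> str:
    """Hypothetical worker: buffer messages, emit them in one shot."""
    output = io.StringIO(newline='')
    output.write(f'Processing {task}...')
    # ... the actual per-task work would happen here ...
    output.write(' OK!\n')
    print(output.getvalue(), end='')  # one print per task, not per message
    sys.stdout.flush()
    return task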