예제 #1
0
파일: core.py 프로젝트: oatesa/recentrifuge
 def shared_ctrl_analysis():
     """Perform last steps of shared taxa analysis"""
     shared_ctrl_tree: TaxTree = TaxTree()
     shared_ctrl_out: SampleDataById = SampleDataById(
         ['shared', 'accs'])
     shared_ctrl_tree.allin1(ontology=ontology,
                             counts=shared_ctrl_counts,
                             scores=shared_ctrl_score,
                             min_taxa=get_shared_mintaxa(),
                             include=including,
                             exclude=(exclude_candidates -
                                      shared_crossover),
                             out=shared_ctrl_out)
     shared_ctrl_out.purge_counters()
     out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
     output.write(
         gray(f'  Ctrl-shared: Including {len(out_counts)}'
              ' shared taxa. Generating sample... '))
     if out_counts:
         sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
         samples.append(sample)
         counts[Sample(sample)] = out_counts
         accs[Sample(sample)] = shared_ctrl_out.get_accs()
         scores[sample] = shared_ctrl_out.get_shared_scores()
         output.write(green('OK!\n'))
     else:
         output.write(yellow('VOID\n'))
예제 #2
0
def _debug_dummy_plot(
    taxonomy: Taxonomy,
    htmlfile: Filename,
    scoring: Scoring = Scoring.SHEL,
):
    """

    Generate dummy Krona plot via Krona 2.0 XML spec and exit

    """
    print(gray(f'Generating dummy Krona plot {htmlfile}...'), end='')
    sys.stdout.flush()
    samples: List[Sample] = [
        Sample('SINGLE'),
    ]
    krona: KronaTree = KronaTree(
        samples,
        min_score=Score(35),
        max_score=Score(100),
        scoring=scoring,
    )
    polytree: MultiTree = MultiTree(samples=samples)
    polytree.grow(ontology=taxonomy)
    polytree.toxml(ontology=taxonomy, krona=krona)
    krona.tohtml(htmlfile, pretty=True)
    print(green('OK!'))
예제 #3
0
파일: core.py 프로젝트: oatesa/recentrifuge
def summarize_analysis(
        *args,
        **kwargs) -> Tuple[Optional[Sample], Counter[Id], Counter[Id], Scores]:
    """
    Summarize for a cross-analysis (to be usually called in parallel!).
    """
    # Recover input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    # TODO: Delete the following comment lines in a future release
    # including = ontology.including   # See comment below for the reason
    # excluding = ontology.excluding   # in/excluding are not used anymore
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    output: io.StringIO = io.StringIO(newline='')

    # Declare/define variables
    summary_counts: Counter[Id] = col.Counter()
    summary_acc: Counter[Id] = col.Counter()
    summary_score: Scores = Scores({})
    summary: Optional[Sample] = None

    output.write(gray('Summary for ') + analysis + gray('... '))

    target_samples: List[Sample] = [
        smpl for smpl in samples if smpl.startswith(analysis)
    ]
    assert len(target_samples) >= 1, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for smpl in target_samples:
        summary_counts += counts[smpl]
        summary_score.update(scores[smpl])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=summary_counts, scores=summary_score)
    tree.subtract()
    tree.shape()
    summary_counts.clear()
    summary_score.clear()
    # Avoid including/excluding here as get_taxa is not as 'clever' as allin1
    #  and taxa are already included/excluded in the derived samples
    tree.get_taxa(counts=summary_counts,
                  accs=summary_acc,
                  scores=summary_score)
    summary_counts = +summary_counts  # remove counts <= 0
    if summary_counts:  # Avoid returning empty sample (summary would be None)
        summary = Sample(f'{analysis}_{STR_SUMMARY}')
        output.write(
            gray('(') + cyan(f'{len(target_samples)}') + gray(' samples)') +
            green(' OK!\n'))
    else:
        output.write(yellow(' VOID\n'))
    # Print output and return
    print(output.getvalue(), end='')
    sys.stdout.flush()
    return summary, summary_counts, summary_acc, summary_score
예제 #4
0
def summarize_analysis(
        *args, **kwargs) -> Tuple[Sample, Counter[Id], Counter[Id], Scores]:
    """
    Summarize for a cross-analysis (to be usually called in parallel!).
    """
    # Recover input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    including = ontology.including
    excluding = ontology.excluding
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    output: io.StringIO = io.StringIO(newline='')

    # Declare/define variables
    summary_counts: Counter[Id] = Counter()
    summary_acc: Counter[Id] = Counter()
    summary_score: Scores = Scores({})
    summary: Sample = None

    output.write(gray('Summary for ') + analysis + gray('... '))

    target_samples: List[Sample] = [
        smpl for smpl in samples if smpl.startswith(analysis)
    ]
    assert len(target_samples) >= 1, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for smpl in target_samples:
        summary_counts += counts[smpl]
        summary_score.update(scores[smpl])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=summary_counts, scores=summary_score)
    tree.subtract()
    tree.shape()
    summary_counts.clear()
    summary_score.clear()
    tree.get_taxa(counts=summary_counts,
                  accs=summary_acc,
                  scores=summary_score,
                  include=including,
                  exclude=excluding)
    summary_counts = +summary_counts  # remove counts <= 0
    if summary_counts:  # Avoid returning empty sample (summary would be None)
        summary = Sample(f'{analysis}_{STR_SUMMARY}')
        output.write(
            gray('(') + cyan(f'{len(target_samples)}') + gray(' samples)') +
            green(' OK!\n'))
    else:
        output.write(yellow(' VOID\n'))
    # Print output and return
    print(output.getvalue(), end='')
    sys.stdout.flush()
    return summary, summary_counts, summary_acc, summary_score
예제 #5
0
파일: core.py 프로젝트: oatesa/recentrifuge
    def cross_analysis(iteration, raw):
        """Cross analysis: exclusive and part of shared&ctrl"""
        nonlocal shared_counts, shared_score
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def partial_shared_update(i):
            """Perform shared and shared-control taxa partial evaluations"""
            nonlocal shared_counts, shared_score
            nonlocal shared_ctrl_counts, shared_ctrl_score
            if i == 0:  # 1st iteration: Initialize shared abundance and score
                shared_counts.update(sub_shared_counts)
                shared_score.update(sub_shared_score)
            elif i < controls:  # Just update shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
            elif i == controls:  # Initialize shared-control counters
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts.update(sub_shared_counts)
                shared_ctrl_score.update(sub_shared_score)
            elif controls:  # Both: Accumulate shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts &= sub_shared_counts
                shared_ctrl_score &= sub_shared_score
            else:  # Both: Accumulate shared abundance and score (no controls)
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score

        exclude: Set[Id] = set()
        # Get taxids at this rank that are present in the other samples
        for sample in (smpl for smpl in raws if smpl != raw):
            exclude.update(taxids[sample][rank])
        exclude.update(excluding)  # Add explicit excluding taxa if any
        output.write(f'  \033[90mExclusive: From \033[0m{raw}\033[90m '
                     f'excluding {len(exclude)} taxa. '
                     f'Generating sample...\033[0m')

        exclude_tree = TaxTree()
        exclude_out = SampleDataById(['counts', 'scores', 'accs'])
        exclude_tree.allin1(ontology=ontology,
                            counts=counts[raw],
                            scores=scores[raw],
                            min_taxa=mintaxas[raw],
                            min_rank=rank,
                            just_min_rank=True,
                            include=including,
                            exclude=exclude,
                            out=exclude_out)
        exclude_out.purge_counters()
        if exclude_out.counts:  # Avoid adding empty samples
            sample = Sample(f'{raw}_{STR_EXCLUSIVE}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = exclude_out.get_counts()
            accs[sample] = exclude_out.get_accs()
            scores[sample] = exclude_out.get_scores()
            output.write('\033[92m OK! \033[0m\n')
        else:
            output.write('\033[93m VOID \033[0m\n')

        # Get partial abundance and score for the shared analysis
        sub_shared_tree = TaxTree()
        sub_shared_out = SampleDataById(['shared', 'accs'])
        sub_shared_tree.allin1(ontology=ontology,
                               counts=counts[raw],
                               scores=scores[raw],
                               min_taxa=mintaxas[raw],
                               min_rank=rank,
                               just_min_rank=True,
                               include=including,
                               exclude=excluding,
                               out=sub_shared_out)
        sub_shared_out.purge_counters()
        # Scale scores by abundance
        sub_shared_counts: SharedCounter = sub_shared_out.get_shared_counts()
        sub_shared_score: SharedCounter = sub_shared_out.get_shared_scores()
        sub_shared_score *= sub_shared_counts
        partial_shared_update(iteration)
예제 #6
0
파일: core.py 프로젝트: oatesa/recentrifuge
    def control_analysis():
        """Perform last steps of control and shared controls analysis"""
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def robust_contamination_removal():
            """Implement robust contamination removal algorithm."""
            nonlocal exclude_sets, shared_crossover

            def compute_qn(data: List[float], dist: str = "Gauss") -> float:
                """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
                c_d: float  # Select d parameter depending on the distribution
                if dist == "Gauss":
                    c_d = 2.2219
                elif dist == "Cauchy":  # Heavy-tailed distribution
                    c_d = 1.2071
                elif dist == "NegExp":  # Negative exponential (asymetric)
                    c_d = 3.4760
                else:
                    raise Exception(red('\nERROR! ') + 'Unknown distribution')
                num: int = len(data)
                sort_data = sorted(data)
                pairwisedifs: List[float] = []
                for (i, x_val) in enumerate(sort_data):
                    for y_val in sort_data[i + 1:]:
                        pairwisedifs.append(abs(x_val - y_val))
                k: int = int(num * (num / 2 + 1) / 4)
                return c_d * sorted(pairwisedifs)[k - 1]

            exclude_sets = {smpl: set() for smpl in raws[controls:]}
            vwrite(
                gray('Robust contamination removal: '
                     'Searching for contaminants...\n'))
            for tid in exclude_candidates:
                relfreq_ctrl: List[float] = [
                    accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
                    for ctrl in raws[:controls]
                ]
                relfreq_smpl: List[float] = [
                    accs[smpl][tid] / accs[smpl][ontology.ROOT]
                    for smpl in raws[controls:]
                ]
                relfreq: List[float] = relfreq_ctrl + relfreq_smpl
                crossover: List[bool]  # Crossover source (yes/no)
                # Just-controls contamination check
                if all([rf < EPS for rf in relfreq_smpl]):
                    vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    continue  # Go for next candidate
                # Critical contamination check
                if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Severe contamination check
                if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Mild contamination check
                if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Calculate median and MAD median but including controls
                mdn: float = statistics.median(relfreq)
                # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
                q_n: float = compute_qn(relfreq, dist="NegExp")
                # Calculate crossover in samples
                outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
                ordomag_lim: float = max(
                    relfreq_ctrl) * 10**ROBUST_XOVER_ORD_MAG
                crossover = [
                    rf > outlier_lim and rf > ordomag_lim
                    for rf in relfreq[controls:]
                ]
                # Crossover contamination check
                if any(crossover):
                    vwrite(
                        magenta('crossover:\t'), tid, ontology.get_name(tid),
                        green(f'lims: [{outlier_lim:.1g}]' +
                              ('<' if outlier_lim < ordomag_lim else '>') +
                              f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                        fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                        gray('crossover:'), blst2str(crossover), '\n')
                    # Exclude just for contaminated samples (not the source)
                    vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                    for i in range(len(raws[controls:])):
                        if not crossover[i]:
                            exclude_sets[raws[i + controls]].add(tid)
                        else:
                            vwrite(f' {raws[i + controls]}')
                    if all(crossover):  # Shared taxon contaminating control(s)
                        vwrite(' (', yellow('Shared crossover taxon!'), ')')
                        shared_crossover.add(tid)
                    vwrite('\n')
                    continue
                # Other contamination: remove from all samples
                vwrite(
                    gray('other cont:\t'), tid, ontology.get_name(tid),
                    green(f'lims: [{outlier_lim:.1g}]' +
                          ('<' if outlier_lim < ordomag_lim else '>') +
                          f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                    fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)

        # Get taxids at this rank that are present in the control samples
        exclude_candidates: Set[Id] = set()
        for i in range(controls):
            exclude_candidates.update(taxids[raws[i]][rank])
        exclude_sets: Dict[Sample, Set[Id]]
        shared_crossover: Set[Id] = set()  # Shared taxa contaminating controls
        if controls and (len(raws) - controls >= ROBUST_MIN_SAMPLES):
            robust_contamination_removal()
        else:  # If this case, just apply strict control
            exclude_sets = {
                file: exclude_candidates
                for file in raws[controls::]
            }
        # Add explicit excluding taxa (if any) to exclude sets
        for exclude_set in exclude_sets.values():
            exclude_set.update(excluding)
        exclude_candidates.update(excluding)
        # Process each sample excluding control taxa
        for raw in raws[controls:]:
            output.write(
                gray('  Ctrl: From') + f' {raw} ' +
                gray(f'excluding {len(exclude_sets[raw])} ctrl taxa. '
                     f'Generating sample... '))
            ctrl_tree = TaxTree()
            ctrl_out = SampleDataById(['counts', 'scores', 'accs'])
            ctrl_tree.allin1(ontology=ontology,
                             counts=counts[raw],
                             scores=scores[raw],
                             min_taxa=mintaxas[raw],
                             min_rank=rank,
                             just_min_rank=True,
                             include=including,
                             exclude=exclude_sets[raw],
                             out=ctrl_out)
            ctrl_out.purge_counters()
            if ctrl_out.counts:  # Avoid adding empty samples
                sample = Sample(f'{raw}_{STR_CONTROL}_{rank.name.lower()}')
                samples.append(sample)
                counts[sample] = ctrl_out.get_counts()
                accs[sample] = ctrl_out.get_accs()
                scores[sample] = ctrl_out.get_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        def shared_ctrl_analysis():
            """Perform last steps of shared taxa analysis"""
            shared_ctrl_tree: TaxTree = TaxTree()
            shared_ctrl_out: SampleDataById = SampleDataById(
                ['shared', 'accs'])
            shared_ctrl_tree.allin1(ontology=ontology,
                                    counts=shared_ctrl_counts,
                                    scores=shared_ctrl_score,
                                    min_taxa=get_shared_mintaxa(),
                                    include=including,
                                    exclude=(exclude_candidates -
                                             shared_crossover),
                                    out=shared_ctrl_out)
            shared_ctrl_out.purge_counters()
            out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
            output.write(
                gray(f'  Ctrl-shared: Including {len(out_counts)}'
                     ' shared taxa. Generating sample... '))
            if out_counts:
                sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
                samples.append(sample)
                counts[Sample(sample)] = out_counts
                accs[Sample(sample)] = shared_ctrl_out.get_accs()
                scores[sample] = shared_ctrl_out.get_shared_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        # Shared-control taxa final analysis
        if shared_ctrl_counts:
            # Normalize scaled scores by total abundance
            shared_ctrl_score /= (+shared_ctrl_counts)
            # Get averaged abundance by number of samples minus ctrl samples
            shared_ctrl_counts //= (len(raws) - controls)
            shared_ctrl_analysis()
        else:
            output.write(
                gray('  Ctrl-shared: No taxa! ') + yellow('VOID') +
                gray(' sample.\n'))
예제 #7
0
def process_report(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataByTaxId, SampleStats, Err]:
    """
    Process Centrifuge/Kraken report files (to be usually called in parallel!).
    """
    # TODO: Full review to report support
    # Recover input and parameters
    filerep: Filename = args[0]
    taxonomy: Taxonomy = kwargs['taxonomy']
    mintaxa: int = kwargs['mintaxa']
    collapse: bool = taxonomy.collapse
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    debug: bool = kwargs['debug']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(filerep)

    # Read Centrifuge/Kraken report file to get abundances
    log: str
    abundances: Counter[TaxId]
    log, abundances, _ = read_report(filerep)
    output.write(log)
    # Remove root counts, in case
    if kwargs['root']:
        vwrite(gray('Removing'), abundances[ROOT], gray('"ROOT" reads... '))
        abundances[ROOT] = 0
        vwrite(green('OK!'), '\n')

    # Build taxonomy tree
    output.write('  \033[90mBuilding taxonomy tree...\033[0m')
    tree = TaxTree()
    tree.grow(taxonomy=taxonomy,
              counts=abundances)  # Grow tax tree from root node
    output.write('\033[92m OK! \033[0m\n')

    # Prune the tree
    output.write('  \033[90mPruning taxonomy tree...\033[0m')
    tree.prune(mintaxa, None, collapse, debug)
    tree.shape()
    output.write('\033[92m OK! \033[0m\n')

    # Get the taxa with their abundances and taxonomical levels
    output.write('  \033[90mFiltering taxa...\033[0m')
    new_abund: Counter[TaxId] = col.Counter()
    new_accs: Counter[TaxId] = col.Counter()
    ranks: Ranks = Ranks({})
    tree.get_taxa(abundance=new_abund,
                  accs=new_accs,
                  ranks=ranks,
                  mindepth=0,
                  maxdepth=0,
                  include=including,
                  exclude=excluding)
    new_abund = +new_abund  # remove zero and negative counts
    if including or excluding:  # Recalculate accumulated counts
        new_tree = TaxTree()
        new_tree.grow(taxonomy, new_abund)  # Grow tree with new abund
        new_tree.shape()
        new_abund = col.Counter()  # Reset abundances
        new_accs = col.Counter()  # Reset accumulated
        new_tree.get_taxa(new_abund, new_accs)  # Get new accumulated counts
    out: SampleDataByTaxId = SampleDataByTaxId()
    out.set(counts=new_abund, ranks=ranks, accs=new_accs)
    output.write('\033[92m OK! \033[0m\n')
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, SampleStats(), Err.NO_ERROR
예제 #8
0
def process_output(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataByTaxId, SampleStats, Err]:
    """
    Process Centrifuge/LMAT output files (to be usually called in parallel!).
    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    taxonomy: Taxonomy = kwargs['taxonomy']
    mintaxa: int = kwargs['ctrlmintaxa'] if is_ctrl else kwargs['mintaxa']
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    scoring: Scoring = kwargs['scoring']
    lmat: bool = kwargs['lmat']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read Centrifuge/LMAT output files to get abundances
    read_method: Callable[[Filename, Scoring, Optional[Score]],  # Input
                          Tuple[str, SampleStats, Counter[TaxId],
                                Dict[TaxId, Score]]  # Output
                          ]
    if lmat:
        read_method = read_lmat_output
    else:
        read_method = read_output
    log: str
    counts: Counter[TaxId]
    scores: Dict[TaxId, Score]
    log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Update field in stat about control nature of the sample
    stat.is_ctrl = is_ctrl
    # Move cellular_organisms counts to root, in case
    if taxonomy.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... '))
        if counts[ROOT]:
            stat.num_taxa -= 1
            scores[ROOT] = (
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS] +
                 scores[ROOT] * counts[ROOT]) /
                (counts[CELLULAR_ORGANISMS] + counts[ROOT]))
        else:
            scores[ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ROOT]:
        vwrite(gray('Removing'), counts[ROOT], gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt - counts[ROOT])
        stat.num_taxa -= 1
        counts[ROOT] = 0
        scores[ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')

    # Building taxonomy tree
    output.write(gray('Building from raw data... '))
    vwrite(gray('\n  Building taxonomy tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[TaxId]
    orphans: Set[TaxId]
    ancestors, orphans = taxonomy.get_ancestors(counts.keys())
    out = SampleDataByTaxId(['all'])
    tree.allin1(taxonomy=taxonomy,
                counts=counts,
                scores=scores,
                ancestors=ancestors,
                min_taxa=mintaxa,
                include=including,
                exclude=excluding,
                out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')

    # Give stats about orphan taxid
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('Warning!'), f'Orphan taxid={orphan}\n')
                lost += counts[orphan]
            vwrite(
                yellow('WARNING!'), f'{len(orphans)} orphan taxids ('
                f'{len(orphans)/len(counts):.2%} of total)\n'
                f'{lost} orphan sequences ('
                f'{lost/sum(counts.values()):.3%} of total)\n')
        else:
            vwrite(green('OK!\n'))
    # Check the lost of taxids (plasmids typically) under some conditions
    if debug and not excluding and not including:
        vwrite(gray('  Additional checking of taxid loss... '))
        lost = 0
        for taxid in counts:
            if not out.counts[taxid]:
                lost += 1
                vwrite(yellow('Warning!'), f'Lost taxid={taxid}: '
                       f'{taxonomy.get_name(taxid)}\n')
        if lost:
            vwrite(
                yellow('WARNING!'), f'Lost {lost} taxids ('
                f'{lost/len(counts):.2%} of total)'
                '\n')
        else:
            vwrite(green('OK!\n'))

    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ') +
                     green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE

    # Timing results
    output.write(
        gray('Load elapsed time: ') +
        f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
예제 #9
0
def process_output(
        *args,
        **kwargs) -> Tuple[Sample, TaxTree, SampleDataById, SampleStats, Err]:
    """
    Process classifiers output files (to be usually called in parallel!).
    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    ontology: Ontology = kwargs['ontology']
    mintaxa: Optional[int] = (kwargs['ctrlmintaxa']
                              if is_ctrl else kwargs['mintaxa'])
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Union[Tuple, Set[Id]] = ontology.including
    excluding: Union[Tuple, Set[Id]] = ontology.excluding
    scoring: Scoring = kwargs['scoring']
    classifier: Classifier = kwargs['classifier']
    genfmt: GenericFormat = kwargs['genfmt']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read taxonomic classifier output files to get abundances
    read_method: Callable[  # Format: [[Input], Output]
        [Filename, Scoring, Optional[Score]],
        Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]]
    log: str
    stat: SampleStats
    counts: Counter[Id]
    scores: Dict[Id, Score]
    if classifier is Classifier.GENERIC:  # Direct call to generic method
        log, stat, counts, scores = read_generic_output(
            target_file, scoring, minscore, genfmt)
    else:  # Use read_method
        if classifier is Classifier.KRAKEN:
            read_method = read_kraken_output
        elif classifier is Classifier.CLARK:
            read_method = read_clark_output
        elif classifier is Classifier.LMAT:
            read_method = read_lmat_output
        elif classifier is Classifier.CENTRIFUGE:
            read_method = read_output
        else:
            raise Exception(red('\nERROR!'),
                            f'taxclass: Unknown classifier "{classifier}".')
        log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Complete/Update fields in stats
    stat.is_ctrl = is_ctrl  # set control nature of the sample
    if mintaxa is not None:  # manual mintaxa has precedence over automatic
        stat.mintaxa = mintaxa
    else:  # update local value with the automatically guessed value
        mintaxa = stat.mintaxa
    # Move cellular_organisms counts to root, in case
    if ontology.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... \n'))
        if counts[ontology.ROOT]:
            stat.decrease_filtered_taxids()
            scores[ontology.ROOT] = Score(
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS] +
                 scores[ontology.ROOT] * counts[ontology.ROOT]) /
                (counts[CELLULAR_ORGANISMS] + counts[ontology.ROOT]))
        else:
            scores[ontology.ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ontology.ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ontology.ROOT]:
        vwrite(gray('Removing'), counts[ontology.ROOT],
               gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt -
                                     counts[ontology.ROOT])
        stat.decrease_filtered_taxids()
        counts[ontology.ROOT] = 0
        scores[ontology.ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')

    # Building ontology tree
    output.write(
        gray('Building from raw data with mintaxa = ') + f'{mintaxa:_d}' +
        gray(' ... \n'))
    vwrite(gray('  Building ontology tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[Id]
    orphans: Set[Id]
    ancestors, orphans = ontology.get_ancestors(counts.keys())
    out = SampleDataById(['all'])
    tree.allin1(ontology=ontology,
                counts=counts,
                scores=scores,
                ancestors=ancestors,
                min_taxa=mintaxa,
                include=including,
                exclude=excluding,
                out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')

    # Stats: Complete final value for TaxIDs after tree building and folding
    final_taxids: int = len(out.counts) if out.counts is not None else 0
    stat.set_final_taxids(final_taxids)

    # Check for additional loss of reads (due to include/exclude an orphans)
    output.write(gray('  Check for more seqs lost ([in/ex]clude affects)... '))
    if out.counts is not None:
        discard: int = sum(counts.values()) - sum(out.counts.values())
        if discard:
            output.write(
                blue('\n  Info:') + f' {discard} ' +
                gray('additional seqs discarded (') +
                f'{discard/sum(counts.values()):.3%} ' +
                gray('of accepted)\n'))
        else:
            output.write(green('OK!\n'))
    else:
        output.write(red('No counts in sample tree!\n'))
    # Warn or give detailed stats about orphan taxid and orphan seqs
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('  Warning!'), gray('Orphan taxid'),
                       f'{orphan}\n')
                lost += counts[orphan]
            vwrite(
                yellow('  WARNING!'), f'{len(orphans)} orphan taxids ('
                f'{len(orphans)/len(counts):.2%} of accepted)\n'
                f'    and {lost} orphan sequences ('
                f'{lost/sum(counts.values()):.3%} of accepted)\n')
        else:
            vwrite(green('OK!\n'))
    elif orphans:
        output.write(
            yellow('\n  Warning!') + f' {len(orphans)} orphan taxids' +
            gray(' (rerun with --debug for details)\n'))
    # Check the removal of TaxIDs (accumulation of leaves in parents)
    if debug and not excluding and including == {ontology.ROOT}:
        vwrite(gray('  Assess accumulation due to "folding the tree"...\n'))
        migrated: int = 0
        if out.counts is not None:
            for taxid in counts:
                if out.counts[taxid] == 0:
                    migrated += 1
                    vwrite(
                        blue('  Info:'),
                        gray(f'Folded TaxID {taxid} (') +
                        f'{ontology.get_name(taxid)}' + gray(') with ') +
                        f'{counts[taxid]}' + gray(' original seqs\n'))
        if migrated:
            vwrite(
                blue('  INFO:'), f'{migrated} TaxIDs folded ('
                f'{migrated/len(+counts):.2%} of TAF —TaxIDs after filtering—)'
                '\n')
            vwrite(
                blue('  INFO:'), f'Final assigned TaxIDs: {final_taxids} '
                f'(reduced to {final_taxids/len(+counts):.2%} of '
                'number of TAF)\n')
        else:
            vwrite(blue('  INFO:'), gray('No migration!'), green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ') +
                     green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE

    # Timing results
    output.write(
        gray('Load elapsed time: ') +
        f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error