Exemplo n.º 1
0
def read_report(report_file: str) -> Tuple[str, Counter[Id], Dict[Id, Rank]]:
    """
    Parse a Centrifuge/Kraken report file into per-taxid counts and ranks

    Every line must split into exactly six tab-separated fields. The third
    field is stored as the integer abundance, the fourth is translated with
    Rank.centrifuge and the fifth is used as the taxid; the remaining
    fields are ignored.

    Args:
        report_file: report file name

    Returns:
        log string, abundances counter, taxlevel dict

    Raises:
        Whatever is raised while opening or parsing the file; it is
        re-raised after printing a short message.

    """
    # TODO: Discontinued method, to be erased in a future release
    log: io.StringIO = io.StringIO(newline='')
    counts_by_tid: Counter[Id] = col.Counter()
    rank_by_tid = {}
    log.write(f'\033[90mLoading report file {report_file}...\033[0m')
    try:
        with open(report_file, 'r') as handle:
            for record in handle:
                (_f1, _f2, raw_count, rank_code, raw_tid,
                 _f6) = record.split('\t')
                taxid = Id(raw_tid)
                counts_by_tid[taxid] = int(raw_count)
                rank_by_tid[taxid] = Rank.centrifuge(rank_code)
    except KeyboardInterrupt:
        print(gray(' User'), yellow('interrupted!'))
        raise
    except Exception:
        print(red('ERROR!'), 'Cannot read "' + report_file + '"')
        raise
    # Only reached on success, as both handlers above re-raise
    log.write('\033[92m OK! \033[0m\n')
    return log.getvalue(), counts_by_tid, rank_by_tid
Exemplo n.º 2
0
 def shared_ctrl_analysis():
     """Build the control-shared derived sample, unless it would be void"""
     ctrl_tree: TaxTree = TaxTree()
     ctrl_data: SampleDataById = SampleDataById(['shared', 'accs'])
     shared_mintaxa = get_shared_mintaxa()
     # Shared crossover taxa are taken out of the set of taxa to exclude
     to_exclude = exclude_candidates - shared_crossover
     ctrl_tree.allin1(
         ontology=ontology,
         counts=shared_ctrl_counts,
         scores=shared_ctrl_score,
         min_taxa=shared_mintaxa,
         include=including,
         exclude=to_exclude,
         out=ctrl_data,
     )
     ctrl_data.purge_counters()
     shared_taxa: SharedCounter = ctrl_data.get_shared_counts()
     output.write(
         gray(f'  Ctrl-shared: Including {len(shared_taxa)}'
              ' shared taxa. Generating sample... '))
     if not shared_taxa:  # Nothing left: do not register an empty sample
         output.write(yellow('VOID\n'))
         return
     new_sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
     samples.append(new_sample)
     counts[Sample(new_sample)] = shared_taxa
     accs[Sample(new_sample)] = ctrl_data.get_accs()
     scores[new_sample] = ctrl_data.get_shared_scores()
     output.write(green('OK!\n'))
Exemplo n.º 3
0
def summarize_analysis(
        *args,
        **kwargs) -> Tuple[Optional[Sample], Counter[Id], Counter[Id], Scores]:
    """
    Build the summary of a cross-analysis (meant to be called in parallel).

    The counts and scores of every sample whose name starts with the
    analysis name are pooled, a TaxTree is grown from them, and the taxa
    are read back from the tree. The log is printed to stdout before
    returning.

    Args:
        *args: only args[0], the name of the analysis, is used
        **kwargs: must provide 'ontology', 'counts', 'scores' and 'samples'

    Returns:
        summary sample name (None if the summary is void), summary counts,
        summary accs counter, summary scores

    """
    # Unpack input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    # TODO: Delete the following comment lines in a future release
    # including = ontology.including   # See comment below for the reason
    # excluding = ontology.excluding   # in/excluding are not used anymore
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    log: io.StringIO = io.StringIO(newline='')

    # Accumulators for the pooled data
    pooled_counts: Counter[Id] = col.Counter()
    pooled_accs: Counter[Id] = col.Counter()
    pooled_scores: Scores = Scores({})

    log.write(gray('Summary for ') + analysis + gray('... '))

    selected: List[Sample] = [
        name for name in samples if name.startswith(analysis)
    ]
    assert selected, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for name in selected:
        pooled_counts += counts[name]
        pooled_scores.update(scores[name])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=pooled_counts, scores=pooled_scores)
    tree.subtract()
    tree.shape()
    # Empty the accumulators in place so that the tree can refill them
    pooled_counts.clear()
    pooled_scores.clear()
    # Avoid including/excluding here as get_taxa is not as 'clever' as allin1
    #  and taxa are already included/excluded in the derived samples
    tree.get_taxa(counts=pooled_counts,
                  accs=pooled_accs,
                  scores=pooled_scores)
    pooled_counts = +pooled_counts  # remove counts <= 0

    summary: Optional[Sample]
    if pooled_counts:
        summary = Sample(f'{analysis}_{STR_SUMMARY}')
        status = (gray('(') + cyan(f'{len(selected)}') + gray(' samples)') +
                  green(' OK!\n'))
    else:  # Void summary: no sample name is returned
        summary = None
        status = yellow(' VOID\n')
    log.write(status)
    # Print the log and return
    print(log.getvalue(), end='')
    sys.stdout.flush()
    return summary, pooled_counts, pooled_accs, pooled_scores
Exemplo n.º 4
0
def summarize_analysis(
        *args, **kwargs) -> Tuple[Sample, Counter[Id], Counter[Id], Scores]:
    """
    Build the summary of a cross-analysis (meant to be called in parallel).

    Counts and scores of all the samples whose name starts with the
    analysis name are pooled into a TaxTree, and the taxa are then read
    back from it using the including/excluding taxa of the ontology. The
    log is printed to stdout before returning.

    Args:
        *args: only args[0], the name of the analysis, is used
        **kwargs: must provide 'ontology', 'counts', 'scores' and 'samples'

    Returns:
        summary sample name (None when the summary is void), summary
        counts, summary accs counter, summary scores

    """
    # Unpack input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    taxa_in = ontology.including
    taxa_out = ontology.excluding
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    log: io.StringIO = io.StringIO(newline='')

    # Accumulators for the pooled data
    total_counts: Counter[Id] = Counter()
    total_accs: Counter[Id] = Counter()
    total_scores: Scores = Scores({})

    log.write(gray('Summary for ') + analysis + gray('... '))

    matching: List[Sample] = list(
        filter(lambda name: name.startswith(analysis), samples))
    assert matching, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for name in matching:
        total_counts += counts[name]
        total_scores.update(scores[name])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=total_counts, scores=total_scores)
    tree.subtract()
    tree.shape()
    # Empty the accumulators in place so that the tree can refill them
    total_counts.clear()
    total_scores.clear()
    tree.get_taxa(
        counts=total_counts,
        accs=total_accs,
        scores=total_scores,
        include=taxa_in,
        exclude=taxa_out,
    )
    total_counts = +total_counts  # remove counts <= 0

    is_void: bool = not total_counts
    # A void summary gets no sample name
    summary = None if is_void else Sample(f'{analysis}_{STR_SUMMARY}')
    if is_void:
        log.write(yellow(' VOID\n'))
    else:
        log.write(
            gray('(') + cyan(f'{len(matching)}') + gray(' samples)') +
            green(' OK!\n'))
    # Print the log and return
    print(log.getvalue(), end='')
    sys.stdout.flush()
    return summary, total_counts, total_accs, total_scores
Exemplo n.º 5
0
 def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
     """Generate a mock Centrifuge output file from source file

     The header line of the source file (args.file) is copied first. Then
     every read whose taxid (third tab-separated field) still has a
     non-zero count in mock_layout is copied, until the layout is
     exhausted or the source ends. If reads are still pending at the end,
     the missing amount per taxid is printed.

     Args:
         out: name of the file to write
         mock_layout: reads to copy per taxid; it is consumed in place
             (one unit is subtracted for each copied read)
     """
     with open(out, 'w') as fout, open(args.file) as fcfg:
         vprint(gray('Generating'), blue(out), gray('file... '))
         fout.write(fcfg.readline())  # copy cfg output file header
         reads_written: int = 0
         # Running total of reads still to copy. Each copied read lowers
         # exactly one layout entry by one, so this always equals
         # sum(mock_layout.values()) without re-summing after every read.
         pending: int = sum(mock_layout.values())
         for line in fcfg:
             tid = Id(line.split('\t')[2])
             if not mock_layout[tid]:  # No reads (left) for this taxid
                 continue
             fout.write(line)
             mock_layout[tid] -= 1
             pending -= 1
             reads_written += 1
             if not pending:
                 vprint(reads_written, 'reads', green('OK!\n'))
                 break
     if pending:
         print(red('ERROR!\n'))
         print(gray('Incomplete read copy by taxid:'))
         mock_layout = +mock_layout  # Keep only the positive counts
         for tid in mock_layout:
             print(yellow(mock_layout[tid]), gray('reads missing for tid'),
                   tid, '(', cyan(ncbi.get_name(tid)), ')\n')
Exemplo n.º 6
0
def read_clark_output(
    output_file: Filename,
    scoring: Scoring = Scoring.CLARK_C,
    minscore: Score = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    The first line must be a comma-separated header with 8 columns. Each
    following line is parsed as one read; lines that cannot be parsed are
    reported, counted as errors and skipped. Reads with both assignments
    unclassified are only counted. If just the first assignment is
    unclassified, the second one is used and the confidence becomes
    1 - conf.

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    Raises:
        Exception: if the header does not have 8 columns, the file is not
            found, no read could be parsed, no read passed the filter, or
            the scoring is not supported.

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_confs: Dict[Id, List[Score]] = {}
    all_gammas: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number or reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split(',')
            if len(header) != 8:
                print(
                    red('\nERROR! ') + 'CLARK output format of ',
                    yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'ID,Length,Gamma,1st,score1,2nd,score2,conf')
                print(magenta('Found:'), ','.join(header), end='')
                print(blue('HINT:'), 'Use CLARK, CLARK-l, or CLARK-S '
                      'with full mode (', blue('-m 0'), ')')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                output_line = raw_line.strip()
                # Both a wrong number of fields and a bad numeric field
                #  raise ValueError, so they share a single handler
                try:
                    (_label, _length, _gamma, _tid1, _score1, _tid2, _score2,
                     _conf) = output_line.split(',')
                    length: int = int(_length)
                    gamma: Score = Score(float(_gamma))
                    tid1: Id = Id(_tid1)
                    score1: Score = Score(float(_score1))
                    tid2: Id = Id(_tid2)
                    score2: Score = Score(float(_score2))
                    conf: Score = Score(float(_conf))
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                # Select tid and score between CLARK assignments 1 and 2
                tid: Id = tid1
                score: Score = score1
                if tid1 == UNCLASSIFIED:
                    if tid2 == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    # Majority of read unclassified
                    tid = tid2
                    score = score2
                    conf = Score(1 - conf)  # Get CLARK's h2/(h1+h2)
                # From CLARK_C(S) score get "single hit equivalent length"
                shel: Score = Score(score + K_MER_SIZE)
                taxids.add(tid)  # Save all the selected tids (tid1 or tid2)
                if minscore is not None:  # Decide if ignore read if low score
                    filter_value: Score
                    if scoring is Scoring.CLARK_C:
                        filter_value = conf
                    elif scoring is Scoring.CLARK_G:
                        filter_value = gamma
                    else:
                        filter_value = shel
                    if filter_value < minscore:
                        continue
                all_scores.setdefault(tid, []).append(shel)
                all_confs.setdefault(tid, []).append(conf)
                all_gammas.setdefault(tid, []).append(gamma)
                all_length.setdefault(tid, []).append(length)

    except FileNotFoundError as err:
        raise Exception(
            red('\nERROR! ') + f'Cannot read "{output_file}"') from err
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({
        tid: len(tid_scores)
        for tid, tid_scores in all_scores.items()
    })
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum(len(tid_scores) for tid_scores in all_scores.values())
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    scores2=all_confs,
                                    scores3=all_gammas,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Hit (score): min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Conf. score: min = ') + f'{stat.sco2.mini:.1f},' +
        gray(' max = ') + f'{stat.sco2.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco2.mean:.1f}\n')
    output.write(
        gray('  Gamma score: min = ') + f'{stat.sco3.mini:.1f},' +
        gray(' max = ') + f'{stat.sco3.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco3.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.CLARK_C:
        out_scores = {
            tid: Score(mean(all_confs[tid]) * 100)
            for tid in all_confs
        }
    elif scoring is Scoring.CLARK_G:
        out_scores = {tid: Score(mean(all_gammas[tid])) for tid in all_gammas}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        # Mean score as a percentage of the mean read length, per taxid
        mean_scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        mean_lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(mean_scores[tid] / mean_lengths[tid] * 100)
            for tid in mean_scores
        }
    else:
        print(red('ERROR!'), f'clark: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
Exemplo n.º 7
0
def process_rank(
    *args, **kwargs
) -> Tuple[List[Sample], Dict[Sample, UnionCounter], Dict[Sample, Counter[Id]],
           Dict[Sample, UnionScores]]:
    """
    Process results for a taxlevel (to be usually called in parallel!).
    """

    # Recover input and parameters
    rank: Rank = args[0]
    controls: int = kwargs['controls']
    mintaxas: Dict[Sample, int] = kwargs['mintaxas']
    ontology: Ontology = kwargs['ontology']
    including = ontology.including
    excluding = ontology.excluding
    taxids: Dict[Sample, TaxLevels] = kwargs['taxids']
    counts: Dict[Sample, UnionCounter] = kwargs['counts']
    accs: Dict[Sample, Counter[Id]] = kwargs['accs']
    scores: Dict[Sample, UnionScores] = kwargs['scores']
    raws: List[Sample] = kwargs['raw_samples']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args) -> None:
        """Write the space-joined items to the log, just in debug mode"""
        if not kwargs['debug']:
            return
        output.write(' '.join(map(str, args)))

    def fltlst2str(lst: List[float]) -> str:
        """Format floats with 1 significant digit as a bracketed list"""
        body: str = ', '.join(f'{elm:.1g}' for elm in lst)
        return '[' + gray(body) + ']'

    def blst2str(lst: List[bool]) -> str:
        """Format booleans as a bracketed list of T (highlighted) and F"""
        flags: List[str] = [magenta('T') if elm else 'F' for elm in lst]
        return '[' + ', '.join(flags) + ']'

    def get_shared_mintaxa() -> int:
        """Return the mintaxa value to use for shared derived samples

        It is the lowest mintaxa among the raw samples placed after the
         control ones (raws[controls:]).
        """
        non_control = raws[controls:]
        return min(mintaxas[smpl] for smpl in non_control)

    # Declare/define variables
    samples: List[Sample] = []
    # pylint: disable = unused-variable
    shared_counts: SharedCounter = SharedCounter()
    shared_score: SharedCounter = SharedCounter()
    shared_ctrl_counts: SharedCounter = SharedCounter()
    shared_ctrl_score: SharedCounter = SharedCounter()
    # pylint: enable = unused-variable

    output.write(f'\033[90mAnalysis for taxonomic rank "'
                 f'\033[95m{rank.name.lower()}\033[90m":\033[0m\n')

    def cross_analysis(iteration, raw):
        """Cross analysis: exclusive and part of shared&ctrl"""
        nonlocal shared_counts, shared_score
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def partial_shared_update(i):
            """Perform shared and shared-control taxa partial evaluations"""
            nonlocal shared_counts, shared_score
            nonlocal shared_ctrl_counts, shared_ctrl_score
            if i == 0:  # 1st iteration: Initialize shared abundance and score
                shared_counts.update(sub_shared_counts)
                shared_score.update(sub_shared_score)
            elif i < controls:  # Just update shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
            elif i == controls:  # Initialize shared-control counters
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts.update(sub_shared_counts)
                shared_ctrl_score.update(sub_shared_score)
            elif controls:  # Both: Accumulate shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts &= sub_shared_counts
                shared_ctrl_score &= sub_shared_score
            else:  # Both: Accumulate shared abundance and score (no controls)
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score

        exclude: Set[Id] = set()
        # Get taxids at this rank that are present in the other samples
        for sample in (smpl for smpl in raws if smpl != raw):
            exclude.update(taxids[sample][rank])
        exclude.update(excluding)  # Add explicit excluding taxa if any
        output.write(f'  \033[90mExclusive: From \033[0m{raw}\033[90m '
                     f'excluding {len(exclude)} taxa. '
                     f'Generating sample...\033[0m')

        exclude_tree = TaxTree()
        exclude_out = SampleDataById(['counts', 'scores', 'accs'])
        exclude_tree.allin1(ontology=ontology,
                            counts=counts[raw],
                            scores=scores[raw],
                            min_taxa=mintaxas[raw],
                            min_rank=rank,
                            just_min_rank=True,
                            include=including,
                            exclude=exclude,
                            out=exclude_out)
        exclude_out.purge_counters()
        if exclude_out.counts:  # Avoid adding empty samples
            sample = Sample(f'{raw}_{STR_EXCLUSIVE}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = exclude_out.get_counts()
            accs[sample] = exclude_out.get_accs()
            scores[sample] = exclude_out.get_scores()
            output.write('\033[92m OK! \033[0m\n')
        else:
            output.write('\033[93m VOID \033[0m\n')

        # Get partial abundance and score for the shared analysis
        sub_shared_tree = TaxTree()
        sub_shared_out = SampleDataById(['shared', 'accs'])
        sub_shared_tree.allin1(ontology=ontology,
                               counts=counts[raw],
                               scores=scores[raw],
                               min_taxa=mintaxas[raw],
                               min_rank=rank,
                               just_min_rank=True,
                               include=including,
                               exclude=excluding,
                               out=sub_shared_out)
        sub_shared_out.purge_counters()
        # Scale scores by abundance
        sub_shared_counts: SharedCounter = sub_shared_out.get_shared_counts()
        sub_shared_score: SharedCounter = sub_shared_out.get_shared_scores()
        sub_shared_score *= sub_shared_counts
        partial_shared_update(iteration)

    def shared_analysis():
        """Build the shared-taxa derived sample, unless it would be void"""
        tree: TaxTree = TaxTree()
        data: SampleDataById = SampleDataById(['shared', 'accs'])
        tree.allin1(
            ontology=ontology,
            counts=shared_counts,
            scores=shared_score,
            min_taxa=get_shared_mintaxa(),
            include=including,
            exclude=excluding,
            out=data,
        )
        data.purge_counters()
        shared_taxa: SharedCounter = data.get_shared_counts()
        output.write(
            gray(f'  Shared: Including {len(shared_taxa)}'
                 ' shared taxa. Generating sample... '))
        if shared_taxa:  # Register the new sample only if it is not empty
            new_sample = Sample(f'{STR_SHARED}_{rank.name.lower()}')
            samples.append(new_sample)
            counts[Sample(new_sample)] = shared_taxa
            accs[Sample(new_sample)] = data.get_accs()
            scores[new_sample] = data.get_shared_scores()
            status = green('OK!\n')
        else:
            status = yellow('VOID\n')
        output.write(status)

    def control_analysis():
        """Perform last steps of control and shared controls analysis"""
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def robust_contamination_removal():
            """Implement robust contamination removal algorithm."""
            nonlocal exclude_sets, shared_crossover

            def compute_qn(data: List[float], dist: str = "Gauss") -> float:
                """Compute Qn robust estimator of scale (Rousseeuw, 1993)

                The result is the k-th smallest absolute pairwise
                difference of the data, times a constant selected by the
                distribution name ("Gauss", "Cauchy" or "NegExp").
                """
                # d parameter for each supported distribution
                d_params = {
                    "Gauss": 2.2219,
                    "Cauchy": 1.2071,  # Heavy-tailed distribution
                    "NegExp": 3.4760,  # Negative exponential (asymetric)
                }
                c_d = d_params.get(dist)
                if c_d is None:
                    raise Exception(red('\nERROR! ') + 'Unknown distribution')
                size: int = len(data)
                ordered = sorted(data)
                # Absolute differences for every unordered pair of values
                gaps: List[float] = sorted(
                    abs(first - second)
                    for pos, first in enumerate(ordered)
                    for second in ordered[pos + 1:])
                kth: int = int(size * (size / 2 + 1) / 4)
                return c_d * gaps[kth - 1]

            exclude_sets = {smpl: set() for smpl in raws[controls:]}
            vwrite(
                gray('Robust contamination removal: '
                     'Searching for contaminants...\n'))
            for tid in exclude_candidates:
                relfreq_ctrl: List[float] = [
                    accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
                    for ctrl in raws[:controls]
                ]
                relfreq_smpl: List[float] = [
                    accs[smpl][tid] / accs[smpl][ontology.ROOT]
                    for smpl in raws[controls:]
                ]
                relfreq: List[float] = relfreq_ctrl + relfreq_smpl
                crossover: List[bool]  # Crossover source (yes/no)
                # Just-controls contamination check
                if all([rf < EPS for rf in relfreq_smpl]):
                    vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    continue  # Go for next candidate
                # Critical contamination check
                if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Severe contamination check
                if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Mild contamination check
                if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Calculate median and MAD median but including controls
                mdn: float = statistics.median(relfreq)
                # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
                q_n: float = compute_qn(relfreq, dist="NegExp")
                # Calculate crossover in samples
                outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
                ordomag_lim: float = max(
                    relfreq_ctrl) * 10**ROBUST_XOVER_ORD_MAG
                crossover = [
                    rf > outlier_lim and rf > ordomag_lim
                    for rf in relfreq[controls:]
                ]
                # Crossover contamination check
                if any(crossover):
                    vwrite(
                        magenta('crossover:\t'), tid, ontology.get_name(tid),
                        green(f'lims: [{outlier_lim:.1g}]' +
                              ('<' if outlier_lim < ordomag_lim else '>') +
                              f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                        fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                        gray('crossover:'), blst2str(crossover), '\n')
                    # Exclude just for contaminated samples (not the source)
                    vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                    for i in range(len(raws[controls:])):
                        if not crossover[i]:
                            exclude_sets[raws[i + controls]].add(tid)
                        else:
                            vwrite(f' {raws[i + controls]}')
                    if all(crossover):  # Shared taxon contaminating control(s)
                        vwrite(' (', yellow('Shared crossover taxon!'), ')')
                        shared_crossover.add(tid)
                    vwrite('\n')
                    continue
                # Other contamination: remove from all samples
                vwrite(
                    gray('other cont:\t'), tid, ontology.get_name(tid),
                    green(f'lims: [{outlier_lim:.1g}]' +
                          ('<' if outlier_lim < ordomag_lim else '>') +
                          f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                    fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)

        # Get taxids at this rank that are present in the control samples
        exclude_candidates: Set[Id] = set()
        for i in range(controls):
            exclude_candidates.update(taxids[raws[i]][rank])
        exclude_sets: Dict[Sample, Set[Id]]
        shared_crossover: Set[Id] = set()  # Shared taxa contaminating controls
        if controls and (len(raws) - controls >= ROBUST_MIN_SAMPLES):
            robust_contamination_removal()
        else:  # If this case, just apply strict control
            exclude_sets = {
                file: exclude_candidates
                for file in raws[controls::]
            }
        # Add explicit excluding taxa (if any) to exclude sets
        for exclude_set in exclude_sets.values():
            exclude_set.update(excluding)
        exclude_candidates.update(excluding)
        # Process each sample excluding control taxa
        for raw in raws[controls:]:
            output.write(
                gray('  Ctrl: From') + f' {raw} ' +
                gray(f'excluding {len(exclude_sets[raw])} ctrl taxa. '
                     f'Generating sample... '))
            ctrl_tree = TaxTree()
            ctrl_out = SampleDataById(['counts', 'scores', 'accs'])
            ctrl_tree.allin1(ontology=ontology,
                             counts=counts[raw],
                             scores=scores[raw],
                             min_taxa=mintaxas[raw],
                             min_rank=rank,
                             just_min_rank=True,
                             include=including,
                             exclude=exclude_sets[raw],
                             out=ctrl_out)
            ctrl_out.purge_counters()
            if ctrl_out.counts:  # Avoid adding empty samples
                sample = Sample(f'{raw}_{STR_CONTROL}_{rank.name.lower()}')
                samples.append(sample)
                counts[sample] = ctrl_out.get_counts()
                accs[sample] = ctrl_out.get_accs()
                scores[sample] = ctrl_out.get_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        def shared_ctrl_analysis():
            """Generate the control-shared sample from the shared counters.

            Runs a TaxTree over the shared-control counts and scores while
            excluding the control candidates that are not shared crossover
            taxa. A new sample is registered (in samples, counts, accs and
            scores) only when some shared taxa remain after purging.
            """
            tree: TaxTree = TaxTree()
            collected: SampleDataById = SampleDataById(['shared', 'accs'])
            min_taxa = get_shared_mintaxa()
            # Shared crossover taxa are rescued from the exclusion list
            to_exclude = exclude_candidates - shared_crossover
            tree.allin1(ontology=ontology,
                        counts=shared_ctrl_counts,
                        scores=shared_ctrl_score,
                        min_taxa=min_taxa,
                        include=including,
                        exclude=to_exclude,
                        out=collected)
            collected.purge_counters()
            kept_counts: SharedCounter = collected.get_shared_counts()
            output.write(
                gray(f'  Ctrl-shared: Including {len(kept_counts)}'
                     ' shared taxa. Generating sample... '))
            if not kept_counts:  # Avoid adding an empty sample
                output.write(yellow('VOID\n'))
                return
            sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
            samples.append(sample)
            counts[Sample(sample)] = kept_counts
            accs[Sample(sample)] = collected.get_accs()
            scores[sample] = collected.get_shared_scores()
            output.write(green('OK!\n'))

        # Shared-control taxa final analysis
        if shared_ctrl_counts:
            # Normalize scaled scores by total abundance
            shared_ctrl_score /= (+shared_ctrl_counts)
            # Get averaged abundance by number of samples minus ctrl samples
            shared_ctrl_counts //= (len(raws) - controls)
            shared_ctrl_analysis()
        else:
            output.write(
                gray('  Ctrl-shared: No taxa! ') + yellow('VOID') +
                gray(' sample.\n'))

    # Cross analysis iterating by output: exclusive and part of shared&ctrl
    for num_file, raw_sample_name in enumerate(raws):
        cross_analysis(num_file, raw_sample_name)

    # Shared taxa final analysis
    shared_counts = +shared_counts  # remove counts <= 0
    if shared_counts:
        # Normalize scaled scores by total abundance (after eliminating zeros)
        shared_score /= (+shared_counts)
        # Get averaged abundance by number of samples
        shared_counts //= len(raws)
        shared_analysis()
    else:
        output.write(
            gray('  Shared: No shared taxa! ') + yellow('VOID') +
            gray(' sample.\n'))

    # Control sample subtraction
    if controls:
        control_analysis()

    # Print output and return
    print(output.getvalue())
    sys.stdout.flush()
    return samples, counts, accs, scores
Exemplo n.º 8
0
        def robust_contamination_removal():
            """Implement robust contamination removal algorithm."""
            nonlocal exclude_sets, shared_crossover

            def compute_qn(data: List[float], dist: str = "Gauss") -> float:
                """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
                c_d: float  # Select d parameter depending on the distribution
                if dist == "Gauss":
                    c_d = 2.2219
                elif dist == "Cauchy":  # Heavy-tailed distribution
                    c_d = 1.2071
                elif dist == "NegExp":  # Negative exponential (asymetric)
                    c_d = 3.4760
                else:
                    raise Exception(red('\nERROR! ') + 'Unknown distribution')
                num: int = len(data)
                sort_data = sorted(data)
                pairwisedifs: List[float] = []
                for (i, x_val) in enumerate(sort_data):
                    for y_val in sort_data[i + 1:]:
                        pairwisedifs.append(abs(x_val - y_val))
                k: int = int(num * (num / 2 + 1) / 4)
                return c_d * sorted(pairwisedifs)[k - 1]

            exclude_sets = {smpl: set() for smpl in raws[controls:]}
            vwrite(
                gray('Robust contamination removal: '
                     'Searching for contaminants...\n'))
            for tid in exclude_candidates:
                relfreq_ctrl: List[float] = [
                    accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
                    for ctrl in raws[:controls]
                ]
                relfreq_smpl: List[float] = [
                    accs[smpl][tid] / accs[smpl][ontology.ROOT]
                    for smpl in raws[controls:]
                ]
                relfreq: List[float] = relfreq_ctrl + relfreq_smpl
                crossover: List[bool]  # Crossover source (yes/no)
                # Just-controls contamination check
                if all([rf < EPS for rf in relfreq_smpl]):
                    vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    continue  # Go for next candidate
                # Critical contamination check
                if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Severe contamination check
                if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Mild contamination check
                if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Calculate median and MAD median but including controls
                mdn: float = statistics.median(relfreq)
                # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
                q_n: float = compute_qn(relfreq, dist="NegExp")
                # Calculate crossover in samples
                outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
                ordomag_lim: float = max(
                    relfreq_ctrl) * 10**ROBUST_XOVER_ORD_MAG
                crossover = [
                    rf > outlier_lim and rf > ordomag_lim
                    for rf in relfreq[controls:]
                ]
                # Crossover contamination check
                if any(crossover):
                    vwrite(
                        magenta('crossover:\t'), tid, ontology.get_name(tid),
                        green(f'lims: [{outlier_lim:.1g}]' +
                              ('<' if outlier_lim < ordomag_lim else '>') +
                              f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                        fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                        gray('crossover:'), blst2str(crossover), '\n')
                    # Exclude just for contaminated samples (not the source)
                    vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                    for i in range(len(raws[controls:])):
                        if not crossover[i]:
                            exclude_sets[raws[i + controls]].add(tid)
                        else:
                            vwrite(f' {raws[i + controls]}')
                    if all(crossover):  # Shared taxon contaminating control(s)
                        vwrite(' (', yellow('Shared crossover taxon!'), ')')
                        shared_crossover.add(tid)
                    vwrite('\n')
                    continue
                # Other contamination: remove from all samples
                vwrite(
                    gray('other cont:\t'), tid, ontology.get_name(tid),
                    green(f'lims: [{outlier_lim:.1g}]' +
                          ('<' if outlier_lim < ordomag_lim else '>') +
                          f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                    fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)
Exemplo n.º 9
0
    def control_analysis():
        """Perform last steps of control and shared controls analysis"""
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def robust_contamination_removal():
            """Implement robust contamination removal algorithm."""
            nonlocal exclude_sets, shared_crossover

            def compute_qn(data: List[float], dist: str = "Gauss") -> float:
                """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
                c_d: float  # Select d parameter depending on the distribution
                if dist == "Gauss":
                    c_d = 2.2219
                elif dist == "Cauchy":  # Heavy-tailed distribution
                    c_d = 1.2071
                elif dist == "NegExp":  # Negative exponential (asymetric)
                    c_d = 3.4760
                else:
                    raise Exception(red('\nERROR! ') + 'Unknown distribution')
                num: int = len(data)
                sort_data = sorted(data)
                pairwisedifs: List[float] = []
                for (i, x_val) in enumerate(sort_data):
                    for y_val in sort_data[i + 1:]:
                        pairwisedifs.append(abs(x_val - y_val))
                k: int = int(num * (num / 2 + 1) / 4)
                return c_d * sorted(pairwisedifs)[k - 1]

            exclude_sets = {smpl: set() for smpl in raws[controls:]}
            vwrite(
                gray('Robust contamination removal: '
                     'Searching for contaminants...\n'))
            for tid in exclude_candidates:
                relfreq_ctrl: List[float] = [
                    accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
                    for ctrl in raws[:controls]
                ]
                relfreq_smpl: List[float] = [
                    accs[smpl][tid] / accs[smpl][ontology.ROOT]
                    for smpl in raws[controls:]
                ]
                relfreq: List[float] = relfreq_ctrl + relfreq_smpl
                crossover: List[bool]  # Crossover source (yes/no)
                # Just-controls contamination check
                if all([rf < EPS for rf in relfreq_smpl]):
                    vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    continue  # Go for next candidate
                # Critical contamination check
                if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Severe contamination check
                if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Mild contamination check
                if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Calculate median and MAD median but including controls
                mdn: float = statistics.median(relfreq)
                # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
                q_n: float = compute_qn(relfreq, dist="NegExp")
                # Calculate crossover in samples
                outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
                ordomag_lim: float = max(
                    relfreq_ctrl) * 10**ROBUST_XOVER_ORD_MAG
                crossover = [
                    rf > outlier_lim and rf > ordomag_lim
                    for rf in relfreq[controls:]
                ]
                # Crossover contamination check
                if any(crossover):
                    vwrite(
                        magenta('crossover:\t'), tid, ontology.get_name(tid),
                        green(f'lims: [{outlier_lim:.1g}]' +
                              ('<' if outlier_lim < ordomag_lim else '>') +
                              f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                        fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                        gray('crossover:'), blst2str(crossover), '\n')
                    # Exclude just for contaminated samples (not the source)
                    vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                    for i in range(len(raws[controls:])):
                        if not crossover[i]:
                            exclude_sets[raws[i + controls]].add(tid)
                        else:
                            vwrite(f' {raws[i + controls]}')
                    if all(crossover):  # Shared taxon contaminating control(s)
                        vwrite(' (', yellow('Shared crossover taxon!'), ')')
                        shared_crossover.add(tid)
                    vwrite('\n')
                    continue
                # Other contamination: remove from all samples
                vwrite(
                    gray('other cont:\t'), tid, ontology.get_name(tid),
                    green(f'lims: [{outlier_lim:.1g}]' +
                          ('<' if outlier_lim < ordomag_lim else '>') +
                          f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                    fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)

        # Get taxids at this rank that are present in the control samples
        exclude_candidates: Set[Id] = set()
        for i in range(controls):
            exclude_candidates.update(taxids[raws[i]][rank])
        exclude_sets: Dict[Sample, Set[Id]]
        shared_crossover: Set[Id] = set()  # Shared taxa contaminating controls
        if controls and (len(raws) - controls >= ROBUST_MIN_SAMPLES):
            robust_contamination_removal()
        else:  # If this case, just apply strict control
            exclude_sets = {
                file: exclude_candidates
                for file in raws[controls::]
            }
        # Add explicit excluding taxa (if any) to exclude sets
        for exclude_set in exclude_sets.values():
            exclude_set.update(excluding)
        exclude_candidates.update(excluding)
        # Process each sample excluding control taxa
        for raw in raws[controls:]:
            output.write(
                gray('  Ctrl: From') + f' {raw} ' +
                gray(f'excluding {len(exclude_sets[raw])} ctrl taxa. '
                     f'Generating sample... '))
            ctrl_tree = TaxTree()
            ctrl_out = SampleDataById(['counts', 'scores', 'accs'])
            ctrl_tree.allin1(ontology=ontology,
                             counts=counts[raw],
                             scores=scores[raw],
                             min_taxa=mintaxas[raw],
                             min_rank=rank,
                             just_min_rank=True,
                             include=including,
                             exclude=exclude_sets[raw],
                             out=ctrl_out)
            ctrl_out.purge_counters()
            if ctrl_out.counts:  # Avoid adding empty samples
                sample = Sample(f'{raw}_{STR_CONTROL}_{rank.name.lower()}')
                samples.append(sample)
                counts[sample] = ctrl_out.get_counts()
                accs[sample] = ctrl_out.get_accs()
                scores[sample] = ctrl_out.get_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        def shared_ctrl_analysis():
            """Perform last steps of shared taxa analysis"""
            shared_ctrl_tree: TaxTree = TaxTree()
            shared_ctrl_out: SampleDataById = SampleDataById(
                ['shared', 'accs'])
            shared_ctrl_tree.allin1(ontology=ontology,
                                    counts=shared_ctrl_counts,
                                    scores=shared_ctrl_score,
                                    min_taxa=get_shared_mintaxa(),
                                    include=including,
                                    exclude=(exclude_candidates -
                                             shared_crossover),
                                    out=shared_ctrl_out)
            shared_ctrl_out.purge_counters()
            out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
            output.write(
                gray(f'  Ctrl-shared: Including {len(out_counts)}'
                     ' shared taxa. Generating sample... '))
            if out_counts:
                sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
                samples.append(sample)
                counts[Sample(sample)] = out_counts
                accs[Sample(sample)] = shared_ctrl_out.get_accs()
                scores[sample] = shared_ctrl_out.get_shared_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        # Shared-control taxa final analysis
        if shared_ctrl_counts:
            # Normalize scaled scores by total abundance
            shared_ctrl_score /= (+shared_ctrl_counts)
            # Get averaged abundance by number of samples minus ctrl samples
            shared_ctrl_counts //= (len(raws) - controls)
            shared_ctrl_analysis()
        else:
            output.write(
                gray('  Ctrl-shared: No taxa! ') + yellow('VOID') +
                gray(' sample.\n'))
Exemplo n.º 10
0
def read_kraken_output(
    output_file: Filename,
    scoring: Scoring = Scoring.KRAKEN,
    minscore: Score = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Kraken output file

    The first line of the file is only used to check that it has five
    tab-separated columns; reads are parsed from the second line on.
    Lines that cannot be parsed are reported, counted as errors and
    skipped. Unclassified reads are counted but not scored.

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
            (None disables the filter); it is compared with the k-mer
            percentage for Scoring.KRAKEN and with the single hit
            equivalent length for any other scoring

    Returns:
        log string, statistics, abundances counter, scores dict

    Raises:
        Exception: if the file is not found, the first line has not five
            columns, no sequence could be read, no sequence passed the
            filter, or the scoring is not supported.

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_kmerel: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in the first line
            # NOTE(review): this line is consumed here and never counted
            #  as a read; presumably Kraken output has no header line, so
            #  the first read would be dropped -- TODO confirm
            header = file.readline().split('\t')
            if len(header) != 5:
                print(
                    red('\nERROR! ') + 'Kraken output format of ',
                    yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'C/U, ID, taxid, length, list of mappings')
                print(magenta('Found:'), '\t'.join(header), end='')
                print(blue('HINT:'), 'Use Kraken or Kraken2 direct output.')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                output_line = raw_line.strip()
                try:
                    (_clas, _label, _tid, _length,
                     _maps) = output_line.split('\t')
                    # Paired reads have their lengths separated by '|'
                    length: int = sum(map(int, _length.split('|')))
                    num_read += 1
                    nt_read += length
                    if _clas == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    tid: Id = Id(_tid)
                    maps: List[str] = _maps.split()
                    if '|:|' in maps:  # Drop (first) paired-read separator
                        maps.remove('|:|')
                    mappings: Counter[Id] = col.Counter()
                    for pair in maps:
                        couple: List[str] = pair.split(':')
                        mappings[Id(couple[0])] += int(couple[1])
                    # From Kraken score get "single hit equivalent length"
                    shel: Score = Score(mappings[tid] + K_MER_SIZE)
                    score: Score = Score(mappings[tid] /
                                         sum(mappings.values()) *
                                         100)  # % relative to all k-mers
                except (ValueError, IndexError, ZeroDivisionError):
                    # Wrong number of columns, non-numeric field, mapping
                    #  without ':' or classified read without k-mer hits
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None:  # Decide if ignore read if low score
                    filter_value = (score
                                    if scoring is Scoring.KRAKEN else shel)
                    if filter_value < minscore:
                        continue
                all_scores.setdefault(tid, []).append(shel)
                all_kmerel.setdefault(tid, []).append(score)
                all_length.setdefault(tid, []).append(length)
    except FileNotFoundError as err:
        raise Exception(
            red('\nERROR! ') + f'Cannot read "{output_file}"') from err
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(tid_scores)
         for tid, tid_scores in all_scores.items()})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum(len(tid_scores) for tid_scores in all_scores.values())
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    scores2=all_kmerel,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores SHEL: min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Coverage(%): min = ') + f'{stat.sco2.mini:.1f},' +
        gray(' max = ') + f'{stat.sco2.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco2.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.KRAKEN:
        out_scores = {tid: Score(mean(all_kmerel[tid])) for tid in all_kmerel}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        # Mean SHEL as a percentage of the mean read length, per taxid
        mean_shel: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        mean_len: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(mean_shel[tid] / mean_len[tid] * 100)
            for tid in mean_shel
        }
    else:
        print(red('ERROR!'), f'kraken: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
Exemplo n.º 11
0
def process_output(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataByTaxId, SampleStats, Err]:
    """
    Load one Centrifuge/LMAT output file and build its taxonomy tree.

    Meant to be used as a worker that is usually called in parallel: the
    whole log is accumulated in a buffer and printed in one go at the end.

    Args:
        *args: positional parameters; args[0] is the file to process and
            args[1] tells whether the sample is a control.
        **kwargs: named parameters; the keys read here are 'debug',
            'taxonomy', 'scoring', 'lmat', 'root' and, depending on the
            control flag, either 'ctrlmintaxa'/'ctrlminscore' or
            'mintaxa'/'minscore'.

    Returns:
        sample name, taxonomy tree, sample data by taxid, sample
        statistics, and error code (Err.NO_ERROR, Err.VOID_CTRL or
        Err.VOID_SAMPLE)

    """
    # Start the clock for the elapsed-time report
    t_start: float = time.perf_counter()
    # Unpack positional and keyword parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        role: str = 'ctrl' if is_ctrl else 'sample'
        print(gray('Processing'), blue(role), target_file, gray('...'))
        sys.stdout.flush()
    taxonomy: Taxonomy = kwargs['taxonomy']
    # Control samples have their own thresholds
    mintaxa: int
    minscore: Score
    if is_ctrl:
        mintaxa = kwargs['ctrlmintaxa']
        minscore = kwargs['ctrlminscore']
    else:
        mintaxa = kwargs['mintaxa']
        minscore = kwargs['minscore']
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    scoring: Scoring = kwargs['scoring']
    lmat: bool = kwargs['lmat']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*items):
        """Append the items to the log only in verbose/debug mode"""
        if not debug:
            return
        output.write(' '.join(map(str, items)))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    # Pick the reader for the classifier and get abundances and scores
    read_method: Callable[[Filename, Scoring, Optional[Score]],  # Input
                          Tuple[str, SampleStats, Counter[TaxId],
                                Dict[TaxId, Score]]  # Output
                          ] = read_lmat_output if lmat else read_output
    log: str
    counts: Counter[TaxId]
    scores: Dict[TaxId, Score]
    log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Record in the statistics whether the sample is a control
    stat.is_ctrl = is_ctrl

    # Fold cellular_organisms reads into root when collapsing is active
    if taxonomy.collapse and counts[CELLULAR_ORGANISMS]:
        cell_reads: int = counts[CELLULAR_ORGANISMS]
        root_reads: int = counts[ROOT]
        vwrite(gray('Moving'), cell_reads,
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... '))
        if root_reads:
            # Both taxa had reads: they merge into one, and the new root
            #  score is the read-weighted average of both scores
            stat.num_taxa -= 1
            weighted_sum = (scores[CELLULAR_ORGANISMS] * cell_reads +
                            scores[ROOT] * root_reads)
            scores[ROOT] = weighted_sum / (cell_reads + root_reads)
        else:
            scores[ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ROOT] += cell_reads
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE

    # Drop the reads directly assigned to root, if requested
    if kwargs['root'] and counts[ROOT]:
        removed: int = counts[ROOT]
        vwrite(gray('Removing'), removed, gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt - removed)
        stat.num_taxa -= 1
        counts[ROOT] = 0
        scores[ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')

    # Grow the taxonomy tree from the raw data
    output.write(gray('Building from raw data... '))
    vwrite(gray('\n  Building taxonomy tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[TaxId]
    orphans: Set[TaxId]
    ancestors, orphans = taxonomy.get_ancestors(counts.keys())
    out = SampleDataByTaxId(['all'])
    tree.allin1(
        taxonomy=taxonomy,
        counts=counts,
        scores=scores,
        ancestors=ancestors,
        min_taxa=mintaxa,
        include=including,
        exclude=excluding,
        out=out,
    )
    out.purge_counters()
    vwrite(green('OK!'), '\n')

    if debug:
        # Report the taxids flagged as orphans by the taxonomy
        vwrite(gray('  Checking taxid loss (orphans)... '))
        if not orphans:
            vwrite(green('OK!\n'))
        else:
            for orphan in orphans:
                vwrite(yellow('Warning!'), f'Orphan taxid={orphan}\n')
            lost_seqs: int = sum(counts[orphan] for orphan in orphans)
            orphan_ratio: float = len(orphans) / len(counts)
            lost_ratio: float = lost_seqs / sum(counts.values())
            vwrite(
                yellow('WARNING!'), f'{len(orphans)} orphan taxids ('
                f'{orphan_ratio:.2%} of total)\n'
                f'{lost_seqs} orphan sequences ('
                f'{lost_ratio:.3%} of total)\n')
        # Without include/exclude filters, also report every taxid with
        #  reads on input but no counts in the output (plasmids typically)
        if not (excluding or including):
            vwrite(gray('  Additional checking of taxid loss... '))
            lost_taxids: int = 0
            for taxid in counts:
                if out.counts[taxid]:
                    continue
                lost_taxids += 1
                vwrite(yellow('Warning!'), f'Lost taxid={taxid}: '
                       f'{taxonomy.get_name(taxid)}\n')
            if not lost_taxids:
                vwrite(green('OK!\n'))
            else:
                vwrite(
                    yellow('WARNING!'), f'Lost {lost_taxids} taxids ('
                    f'{lost_taxids/len(counts):.2%} of total)'
                    '\n')

    # Closing message; a void result is reported through the error code
    error: Err
    if out.counts:
        kind_tag: str = ' ctrl ' if is_ctrl else ' sample '
        output.write(sample + blue(kind_tag) + green('OK!\n'))
        error = Err.NO_ERROR
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE

    # Timing results, then flush the whole log at once
    elapsed: float = time.perf_counter() - t_start
    output.write(
        gray('Load elapsed time: ') + f'{elapsed:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
Exemplo n.º 12
0
def read_output(
    output_file: Filename,
    scoring: Scoring = Scoring.SHEL,
    minscore: Score = None,
) -> Tuple[str, SampleStats, Counter[TaxId], Dict[TaxId, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification; reads
            scoring below it are ignored (None disables the filter)

    Returns:
        log string, statistics, abundances counter, scores dict

    Raises:
        Exception: if the file is not found, if no sequence can be read
            from it, if no sequence passes the filter, or if the
            requested scoring is not one of those handled here

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[TaxId, List[Score]] = {}
    all_length: Dict[TaxId, List[int]] = {}
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    # Ordinal of the read expected at the last unparseable line (if any)
    error_read: Optional[int] = None
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = output_line.split(
                        '\t')
                except ValueError:
                    print(
                        red('Error'), f'parsing line: ({output_line}) '
                        f'in {output_file}. Ignoring line!')
                    error_read = num_read + 1
                    continue
                tid = TaxId(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
                    shel = Score(float(_score)**0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(red('Error'), f'parsing score ({_score}) for query',
                          f'length ({_length}) for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                if minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                all_scores.setdefault(tid, []).append(shel)
                all_length.setdefault(tid, []).append(length)
    except FileNotFoundError as err:
        raise Exception(
            red('\nERROR! ') + f'Cannot read "{output_file}"') from err
    # If no read was accepted after the last unparseable line, that line
    #  was most likely the final one: the file is probably truncated
    if error_read == num_read + 1:
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[TaxId] = Counter(
        {tid: len(tid_scores)
         for tid, tid_scores in all_scores.items()})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') + f'Cannot read any sequence from"{output_file}"')
    filt_seqs: int = sum(len(tid_scores) for tid_scores in all_scores.values())
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    scores=all_scores,
                                    lens=all_length,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs)
    # Output statistics
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' + gray(' max = ') +
        f'{stat.sco.maxi:.1f},' + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(f'  {stat.num_taxa}' + gray(' taxa with assigned reads\n'))
    # Select score output
    out_scores: Dict[TaxId, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        # Mean score normalized by mean read length, as a percentage;
        #  both dicts are filled together so they share the same keys
        out_scores = {
            tid: Score(
                Score(mean(all_scores[tid])) / Score(mean(all_length[tid])) *
                100)
            for tid in all_scores
        }
    else:
        raise Exception(f'\n\033[91mERROR!\033[0m Unknown Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores
Exemplo n.º 13
0
def main():
    """Main entry point to Recentrifuge."""
    def configure_parser():
        """Argument Parser Configuration"""
        parser = argparse.ArgumentParser(
            description='Analyze results of metagenomic taxonomic classifiers',
            epilog=f'%(prog)s  - Release {__version__} - {__date__}' + LICENSE,
            formatter_class=argparse.RawDescriptionHelpFormatter)
        parser.add_argument(
            '-V',
            '--version',
            action='version',
            version=f'%(prog)s version {__version__} released in {__date__}')
        parser_in = parser.add_argument_group(
            'input', 'Define Recentrifuge input files and formats')
        parser_in.add_argument('-n',
                               '--nodespath',
                               action='store',
                               metavar='PATH',
                               default=TAXDUMP_PATH,
                               help=('path for the nodes information files '
                                     '(nodes.dmp and names.dmp from NCBI)'))
        parser_filein = parser_in.add_mutually_exclusive_group(required=True)
        parser_filein.add_argument(
            '-f',
            '--file',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('Centrifuge output files. If a single directory is entered, '
                  'every .out file inside will be taken as a different sample.'
                  ' Multiple -f is available to include several samples.'))
        parser_filein.add_argument(
            '-l',
            '--lmat',
            action='append',
            metavar='FILE',
            type=Filename,
            default=None,
            help=('LMAT output dir or file prefix. If just "." is entered, '
                  'every subdirectory under the current directory will be '
                  'taken as a sample and scanned looking for LMAT output files'
                  '. Multiple -l is available to include several samples.'))
        parser_filein.add_argument(
            '-k',
            '--clark',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('CLARK(S) output files. If a single directory is entered, '
                  'every .csv file inside will be taken as a different sample.'
                  ' Multiple -k is available to include several samples.'))
        parser_filein.add_argument(
            '-r',
            '--report',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('Centrifuge/Kraken report files '
                  '(multiple -r is available to include several samples)'))
        parser_out = parser.add_argument_group(
            'output', 'Related to the Recentrifuge output files')
        parser_out.add_argument(
            '-o',
            '--outhtml',
            action='store',
            metavar='FILE',
            type=Filename,
            help='HTML output file (if not given, the filename will be '
            'inferred from input files)')
        parser_out.add_argument(
            '-e',
            '--excel',
            action='store',
            metavar='OUTPUT_TYPE',
            choices=[str(excel) for excel in Excel],
            default=str(Excel(0)),
            help=(f'type of excel report to be generated, and can be one of '
                  f'{[str(excel) for excel in Excel]}'))
        parser_coarse = parser.add_argument_group(
            'tuning', 'Coarse tuning of algorithm parameters')
        parser_cross = parser_coarse.add_mutually_exclusive_group(
            required=False)
        parser_cross.add_argument(
            '-c',
            '--controls',
            action='store',
            metavar='CONTROLS_NUMBER',
            type=int,
            default=0,
            help=('this number of first samples will be treated as negative '
                  'controls; default is no controls'))
        parser_coarse.add_argument(
            '-s',
            '--scoring',
            action='store',
            metavar='SCORING',
            choices=[str(each_score) for each_score in Scoring],
            default=str(Scoring(0)),
            help=(f'type of scoring to be applied, and can be one of '
                  f'{[str(scoring) for scoring in Scoring]}'))
        parser_coarse.add_argument(
            '-y',
            '--minscore',
            action='store',
            metavar='NUMBER',
            type=lambda txt: Score(float(txt)),
            default=None,
            help=('minimum score/confidence of the classification of a read '
                  'to pass the quality filter; all pass by default'))
        parser_coarse.add_argument(
            '-m',
            '--mintaxa',
            action='store',
            metavar='INT',
            type=int,
            default=DEFMINTAXA,
            help='minimum taxa to avoid collapsing one level to the parent one'
        )
        parser_coarse.add_argument(
            '-x',
            '--exclude',
            action='append',
            metavar='TAXID',
            type=Id,
            default=[],
            help=('NCBI taxid code to exclude a taxon and all underneath '
                  '(multiple -x is available to exclude several taxid)'))
        parser_coarse.add_argument(
            '-i',
            '--include',
            action='append',
            metavar='TAXID',
            type=Id,
            default=[],
            help=('NCBI taxid code to include a taxon and all underneath '
                  '(multiple -i is available to include several taxid); '
                  'by default, all the taxa are considered for inclusion'))
        parser_cross.add_argument('-a',
                                  '--avoidcross',
                                  action='store_true',
                                  help='avoid cross analysis')
        parser_fine = parser.add_argument_group(
            'fine tuning', 'Fine tuning of algorithm parameters')
        parser_fine.add_argument(
            '-z',
            '--ctrlminscore',
            action='store',
            metavar='NUMBER',
            type=lambda txt: Score(float(txt)),
            default=None,
            help=('minimum score/confidence of the classification of a read '
                  'in control samples to pass the quality filter; if defaults '
                  'to "minscore"'))
        parser_fine.add_argument(
            '-w',
            '--ctrlmintaxa',
            action='store',
            metavar='INT',
            type=int,
            default=None,
            help='minimum taxa to avoid collapsing one level to the parent one'
            ' in control samples; it defaults to "mintaxa"')
        parser_fine.add_argument(
            '-u',
            '--summary',
            action='store',
            metavar='OPTION',
            choices=['add', 'only', 'avoid'],
            default='add',
            help=(
                'select to "add" summary samples to other samples, or to '
                '"only" show summary samples or to "avoid" summaries at all'))
        parser_fine.add_argument(
            '-t',
            '--takeoutroot',
            action='store_true',
            help='remove counts directly assigned to the "root" level')
        parser_fine.add_argument('--nokollapse',
                                 action='store_true',
                                 help='show the "cellular organisms" taxon')
        parser_mode = parser.add_argument_group('advanced',
                                                'Advanced modes of running')
        parser_mode.add_argument(
            '--dummy',  # hidden flag: just generate a dummy plot for JS debug
            action='store_true',
            help=argparse.SUPPRESS)
        parser_mode.add_argument(
            '-g',
            '--debug',
            action='store_true',
            help='increase output verbosity and perform additional checks')
        parser_mode.add_argument('--sequential',
                                 action='store_true',
                                 help='deactivate parallel processing')
        return parser

    def check_debug():
        """Check debugging mode"""
        if args.debug:
            print(blue('INFO:'), gray('Debugging mode activated'))
            print(blue('INFO:'), gray('Active parameters:'))
            for key, value in vars(args).items():
                if value:
                    print(gray(f'\t{key} ='), f'{value}')

    def select_inputs():
        """Choose right classifier, input and output files"""
        nonlocal process, scoring, input_files, plasmidfile, classifier

        if reports:
            classifier = Classifier.KRAKEN
            process = process_report
            input_files = reports
        elif clarks:
            classifier = Classifier.CLARK
            process = process_output
            input_files = clarks
            if len(clarks) == 1 and os.path.isdir(clarks[0]):
                select_clark_inputs(clarks)
        elif lmats:
            classifier = Classifier.LMAT
            scoring = Scoring.LMAT
            process = process_output
            input_files = lmats
            plasmidfile = Filename(os.path.join(args.nodespath, PLASMID_FILE))
            select_lmat_inputs(lmats)
        elif outputs:
            classifier = Classifier.CENTRIFUGE
            process = process_output
            input_files = outputs
            if len(outputs) == 1 and os.path.isdir(outputs[0]):
                select_centrifuge_inputs(outputs)

    def check_controls():
        """Check and info about the control samples"""
        if args.controls:
            if args.controls > len(input_files):
                print(red(' ERROR!'), gray('More controls than samples'))
                exit(1)
            print(gray('Control(s) sample(s) for subtractions:'))
            for i in range(args.controls):
                print(blue(f'\t{input_files[i]}'))

    def select_html_file():
        """HTML filename selection"""
        nonlocal htmlfile
        if lmats:  # Select case for dir name or filename prefix
            if os.path.isdir(lmats[0]):  # Dir name
                dirname = os.path.dirname(os.path.normpath(lmats[0]))
                if not dirname or dirname == '.':
                    basename = 'output'
                else:
                    basename = os.path.basename(dirname)
            else:  # Explicit path and file name prefix is provided
                dirname, basename = os.path.split(lmats[0])
            htmlfile = Filename(os.path.join(dirname, basename + HTML_SUFFIX))
        elif reports:
            htmlfile = Filename(reports[0].split('_mhl')[0] + HTML_SUFFIX)
        else:
            htmlfile = Filename(outputs[0].split('_mhl')[0] + HTML_SUFFIX)

    def read_samples():
        """Read samples"""
        print(gray('\nPlease, wait, processing files in parallel...\n'))
        # Enable parallelization with 'spawn' under known platforms
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            with mpctx.Pool(
                    processes=min(os.cpu_count(), len(input_files))) as pool:
                async_results = [
                    pool.apply_async(
                        process,
                        args=[
                            input_files[num],  # file name
                            True if num < args.controls else False
                        ],  # is ctrl?
                        kwds=kwargs) for num in range(len(input_files))
                ]
                for file, (sample, tree, out, stat,
                           err) in zip(input_files,
                                       [r.get() for r in async_results]):
                    if err is Err.NO_ERROR:
                        samples.append(sample)
                        trees[sample] = tree
                        taxids[sample] = out.get_taxlevels()
                        counts[sample] = out.counts
                        accs[sample] = out.accs
                        scores[sample] = out.scores
                        stats[sample] = stat
                    elif err is Err.VOID_CTRL:
                        print('There were void controls.', red('Aborting!'))
                        exit(1)
        else:  # sequential processing of each sample
            for num, file in enumerate(input_files):
                (sample, tree, out, stat,
                 err) = process(file, True if num < args.controls else False,
                                **kwargs)
                if err is Err.NO_ERROR:
                    samples.append(sample)
                    trees[sample] = tree
                    taxids[sample] = out.get_taxlevels()
                    counts[sample] = out.counts
                    accs[sample] = out.accs
                    scores[sample] = out.scores
                    stats[sample] = stat
                elif err is Err.VOID_CTRL:
                    print('There were void controls.', red('Aborting!'))
                    exit(1)
        raw_samples.extend(samples)  # Store raw sample names

    def analyze_samples():
        """Cross analysis of samples in parallel by taxlevel"""
        print(gray('Please, wait. Performing cross analysis in parallel...\n'))
        # Update kwargs with more parameters for the followings func calls
        kwargs.update({
            'taxids': taxids,
            'counts': counts,
            'scores': scores,
            'accs': accs,
            'raw_samples': raw_samples
        })
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')  # Important for OSX&Win
            with mpctx.Pool(processes=min(os.cpu_count(),
                                          len(Rank.selected_ranks))) as pool:
                async_results = [
                    pool.apply_async(process_rank, args=[level], kwds=kwargs)
                    for level in Rank.selected_ranks
                ]
                for level, (smpls, abunds, accumulators,
                            score) in zip(Rank.selected_ranks,
                                          [r.get() for r in async_results]):
                    samples.extend(smpls)
                    counts.update(abunds)
                    accs.update(accumulators)
                    scores.update(score)
        else:  # sequential processing of each selected rank
            for level in Rank.selected_ranks:
                (smpls, abunds, accumulators,
                 score) = process_rank(level, **kwargs)
                samples.extend(smpls)
                counts.update(abunds)
                accs.update(accumulators)
                scores.update(score)

    def summarize_samples():
        """Summary of samples in parallel by type of cross-analysis"""
        # timing initialization
        summ_start_time: float = time.perf_counter()
        print(gray('Please, wait. Generating summaries in parallel...'))
        # Update kwargs with more parameters for the followings func calls
        kwargs.update({'samples': samples})
        # Get list of set of samples to summarize (note pylint bug #776)
        # pylint: disable=unsubscriptable-object
        target_analysis: col.OrderedDict[str, None] = col.OrderedDict({
            f'{raw}_{study}': None
            for study in [STR_EXCLUSIVE, STR_CONTROL] for raw in raw_samples
            for smpl in samples if smpl.startswith(f'{raw}_{study}')
        })
        # pylint: enable=unsubscriptable-object
        # Add shared and control_shared analysis if they exist (are not void)
        for study in [STR_SHARED, STR_CONTROL_SHARED]:
            for smpl in samples:
                if smpl.startswith(study):
                    target_analysis[study] = None
                    break

        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            with mpctx.Pool(
                    processes=min(os.cpu_count(), len(input_files))) as pool:
                async_results = [
                    pool.apply_async(summarize_analysis,
                                     args=[analysis],
                                     kwds=kwargs)
                    for analysis in target_analysis
                ]
                for analysis, (summary, abund, acc,
                               score) in zip(target_analysis,
                                             [r.get() for r in async_results]):
                    if summary:  # Avoid adding empty samples
                        summaries.append(summary)
                        counts[summary] = abund
                        accs[summary] = acc
                        scores[summary] = score
        else:  # sequential processing of each selected rank
            for analysis in target_analysis:
                (summary, abund, acc,
                 score) = summarize_analysis(analysis, **kwargs)
                if summary:  # Avoid adding empty samples
                    summaries.append(summary)
                    counts[summary] = abund
                    accs[summary] = acc
                    scores[summary] = score
        # Timing results
        print(gray('Summary elapsed time:'),
              f'{time.perf_counter() - summ_start_time:.3g}', gray('sec'))

    def generate_krona():
        """Generate Krona plot with all the results via Krona 2.0 XML spec"""

        print(gray('\nBuilding the taxonomy multiple tree... '), end='')
        sys.stdout.flush()
        krona: KronaTree = KronaTree(
            samples,
            num_raw_samples=len(raw_samples),
            stats=stats,
            min_score=Score(
                min([
                    min(scores[sample].values()) for sample in samples
                    if len(scores[sample])
                ])),
            max_score=Score(
                max([
                    max(scores[sample].values()) for sample in samples
                    if len(scores[sample])
                ])),
            scoring=scoring,
        )
        polytree.grow(ontology=ncbi,
                      abundances=counts,
                      accs=accs,
                      scores=scores)
        print(green('OK!'))
        print(gray('Generating final plot (') + magenta(htmlfile) +
              gray(')... '),
              end='')
        sys.stdout.flush()
        polytree.toxml(ontology=ncbi, krona=krona)
        krona.tohtml(htmlfile, pretty=False)
        print(green('OK!'))

    def generate_excel():
        """Generate Excel with results via pandas DataFrame"""

        xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
        print(gray(f'Generating Excel {str(excel).lower()} summary (') +
              magenta(xlsx_name) + gray(')... '),
              end='')
        sys.stdout.flush()
        xlsxwriter = pd.ExcelWriter(xlsx_name)
        list_rows: List = []

        # Save raw samples basic statistics
        data_frame: pd.DataFrame = pd.DataFrame.from_dict(
            {raw: stats[raw].to_dict()
             for raw in raw_samples})
        data_frame.to_excel(xlsxwriter, sheet_name='_sample_stats')

        # Save taxid related statistics per sample
        if excel is Excel.FULL:
            polytree.to_items(ontology=ncbi, items=list_rows)
            # Generate the pandas DataFrame from items and export to Excel
            iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
            cols1 = pd.MultiIndex.from_product(iterable_1,
                                               names=['Samples', 'Stats'])
            iterable_2 = [['Details'], ['Rank', 'Name']]
            cols2 = pd.MultiIndex.from_product(iterable_2)
            cols = cols1.append(cols2)
            data_frame = pd.DataFrame.from_items(list_rows,
                                                 orient='index',
                                                 columns=cols)
            data_frame.index.names = ['Id']
            data_frame.to_excel(xlsxwriter, sheet_name=str(excel))
        elif excel is Excel.CMPLXCRUNCHER:
            target_ranks: List = [Rank.NO_RANK]
            if args.controls:  # if controls, add specific sheet for rank
                target_ranks.extend(Rank.selected_ranks)
            for rank in target_ranks:  # Once for no rank dependency (NO_RANK)
                indexes: List[int]
                sheet_name: str
                columns: List[str]
                if args.controls:
                    indexes = [
                        i for i in range(len(raw_samples), len(samples))
                        # Check if sample ends in _(STR_CONTROL)_(rank)
                        if (STR_CONTROL in samples[i].split('_')[-2:] and
                            rank.name.lower() in samples[i].split('_')[-1:])
                    ]
                    sheet_name = f'{STR_CONTROL}_{rank.name.lower()}'
                    columns = [
                        samples[i].replace(
                            '_' + STR_CONTROL + '_' + rank.name.lower(), '')
                        for i in indexes
                    ]
                if rank is Rank.NO_RANK:  # No rank dependency
                    indexes = list(range(len(raw_samples)))
                    sheet_name = f'raw_samples_{rank.name.lower()}'
                    columns = raw_samples
                list_rows = []
                polytree.to_items(ontology=ncbi,
                                  items=list_rows,
                                  sample_indexes=indexes)
                data_frame = pd.DataFrame.from_items(list_rows,
                                                     orient='index',
                                                     columns=columns)
                data_frame.index.names = ['Id']
                data_frame.to_excel(xlsxwriter, sheet_name=sheet_name)
        else:
            raise Exception(red('\nERROR!'), f'Unknown Excel option "{excel}"')
        xlsxwriter.save()
        print(green('OK!'))

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} - {__date__}'
          f' =-= by {__author__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    argparser = configure_parser()
    args = argparser.parse_args()
    outputs: List[Filename] = args.file
    reports: List[Filename] = args.report
    lmats: List[Filename] = args.lmat
    clarks: List[Filename] = args.clark
    input_files: List[Filename]
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    htmlfile: Filename = args.outhtml
    collapse: bool = not args.nokollapse
    excluding: Set[Id] = set(args.exclude)
    including: Set[Id] = set(args.include)
    scoring: Scoring = Scoring[args.scoring]
    excel: Excel = Excel[args.excel]

    check_debug()

    plasmidfile: Filename = None
    classifier: Classifier
    process: Callable[..., Tuple[Sample, TaxTree, SampleDataById, SampleStats,
                                 Err]]
    select_inputs()
    check_controls()
    if not htmlfile:
        select_html_file()

    # Load NCBI nodes, names and build children
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, collapse,
                              excluding, including, args.debug)

    # If dummy flag enabled, just create dummy krona and exit
    if args.dummy:
        _debug_dummy_plot(ncbi, htmlfile, scoring)
        exit(0)

    # Declare variables that will hold results for the samples analyzed
    trees: Dict[Sample, TaxTree] = {}
    counts: Dict[Sample, Counter[Id]] = {}
    accs: Dict[Sample, Counter[Id]] = {}
    taxids: Dict[Sample, TaxLevels] = {}
    scores: Dict[Sample, Dict[Id, Score]] = {}
    stats: Dict[Sample, SampleStats] = {}
    samples: List[Sample] = []
    raw_samples: List[Sample] = []

    # Define dictionary of parameters for methods to be called (to be extended)
    kwargs = {
        'controls':
        args.controls,
        'ctrlminscore': (args.ctrlminscore
                         if args.ctrlminscore is not None else args.minscore),
        'ctrlmintaxa':
        (args.ctrlmintaxa if args.ctrlmintaxa is not None else args.mintaxa),
        'debug':
        args.debug,
        'root':
        args.takeoutroot,
        'classifier':
        classifier,
        'minscore':
        args.minscore,
        'mintaxa':
        args.mintaxa,
        'scoring':
        scoring,
        'ontology':
        ncbi,
    }
    # The big stuff (done in parallel)
    read_samples()
    # Avoid cross analysis if just one report file or explicitly stated by flag
    if len(raw_samples) > 1 and not args.avoidcross:
        analyze_samples()
        if args.summary != 'avoid':
            summaries: List[Sample] = []
            summarize_samples()
            if args.summary == 'only':
                samples = raw_samples + summaries
            else:
                samples.extend(summaries)
    # Final result generation is done in sequential mode

    polytree: MultiTree = MultiTree(samples=samples)
    generate_krona()
    if _USE_PANDAS:
        generate_excel()
    else:
        print(yellow('WARNING!'),
              'Pandas not installed: Excel cannot be created.')

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
Exemplo n.º 14
0
def process_output(
        *args,
        **kwargs) -> Tuple[Sample, TaxTree, SampleDataById, SampleStats, Err]:
    """
    Process classifiers output files (to be usually called in parallel!).

    The classifier output is read with the reader matching the selected
    classifier, the CELLULAR_ORGANISMS counts are optionally moved to
    ROOT, the ROOT counts are optionally removed, and a taxonomic tree
    is built from the remaining counts and scores. The log collected on
    the way is printed to stdout in one go just before returning.

    Args:
        *args: positional arguments; args[0] is the classifier output
            file to process and args[1] tells if it is a control sample.
        **kwargs: parameters read by key: 'debug', 'ontology',
            'mintaxa', 'ctrlmintaxa', 'minscore', 'ctrlminscore',
            'scoring', 'classifier', 'genfmt' and 'root'.

    Returns:
        sample, tree, sample data by taxid, sample stats, error status

    Raises:
        Exception: if the classifier is not a supported one (readers
            may raise their own exceptions too).

    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    ontology: Ontology = kwargs['ontology']
    # Control samples use their own mintaxa and minscore thresholds
    mintaxa: Optional[int] = (kwargs['ctrlmintaxa']
                              if is_ctrl else kwargs['mintaxa'])
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Union[Tuple, Set[Id]] = ontology.including
    excluding: Union[Tuple, Set[Id]] = ontology.excluding
    scoring: Scoring = kwargs['scoring']
    classifier: Classifier = kwargs['classifier']
    genfmt: GenericFormat = kwargs['genfmt']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*items):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in items))

    def ratio(part: float, whole: float) -> float:
        """Get part/whole, or NaN (instead of failing) if whole is zero"""
        return part / whole if whole else float('nan')

    # The sample is named after the input file without its extension
    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read taxonomic classifier output files to get abundances
    read_method: Callable[  # Format: [[Input], Output]
        [Filename, Scoring, Optional[Score]],
        Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]]
    log: str
    stat: SampleStats
    counts: Counter[Id]
    scores: Dict[Id, Score]
    if classifier is Classifier.GENERIC:  # Direct call to generic method
        log, stat, counts, scores = read_generic_output(
            target_file, scoring, minscore, genfmt)
    else:  # Use read_method
        if classifier is Classifier.KRAKEN:
            read_method = read_kraken_output
        elif classifier is Classifier.CLARK:
            read_method = read_clark_output
        elif classifier is Classifier.LMAT:
            read_method = read_lmat_output
        elif classifier is Classifier.CENTRIFUGE:
            read_method = read_output
        else:
            # Single message string, so the exception prints cleanly
            raise Exception(
                red('\nERROR!') +
                f' taxclass: Unknown classifier "{classifier}".')
        log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Complete/Update fields in stats
    stat.is_ctrl = is_ctrl  # set control nature of the sample
    if mintaxa is not None:  # manual mintaxa has precedence over automatic
        stat.mintaxa = mintaxa
    else:  # update local value with the automatically guessed value
        mintaxa = stat.mintaxa
    # Move cellular_organisms counts to root, in case
    if ontology.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... \n'))
        if counts[ontology.ROOT]:
            # Both taxa present: the merged score is the count-weighted mean
            stat.decrease_filtered_taxids()
            scores[ontology.ROOT] = Score(
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS] +
                 scores[ontology.ROOT] * counts[ontology.ROOT]) /
                (counts[CELLULAR_ORGANISMS] + counts[ontology.ROOT]))
        else:
            scores[ontology.ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ontology.ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ontology.ROOT]:
        vwrite(gray('Removing'), counts[ontology.ROOT],
               gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt -
                                     counts[ontology.ROOT])
        stat.decrease_filtered_taxids()
        counts[ontology.ROOT] = 0
        scores[ontology.ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')

    # Building ontology tree
    output.write(
        gray('Building from raw data with mintaxa = ') + f'{mintaxa:_d}' +
        gray(' ... \n'))
    vwrite(gray('  Building ontology tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[Id]
    orphans: Set[Id]
    ancestors, orphans = ontology.get_ancestors(counts.keys())
    out = SampleDataById(['all'])
    tree.allin1(ontology=ontology,
                counts=counts,
                scores=scores,
                ancestors=ancestors,
                min_taxa=mintaxa,
                include=including,
                exclude=excluding,
                out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')

    # Stats: Complete final value for TaxIDs after tree building and folding
    final_taxids: int = len(out.counts) if out.counts is not None else 0
    stat.set_final_taxids(final_taxids)

    # Check for additional loss of reads (due to include/exclude and orphans)
    output.write(gray('  Check for more seqs lost ([in/ex]clude affects)... '))
    if out.counts is not None:
        discard: int = sum(counts.values()) - sum(out.counts.values())
        if discard:
            output.write(
                blue('\n  Info:') + f' {discard} ' +
                gray('additional seqs discarded (') +
                f'{ratio(discard, sum(counts.values())):.3%} ' +
                gray('of accepted)\n'))
        else:
            output.write(green('OK!\n'))
    else:
        output.write(red('No counts in sample tree!\n'))
    # Warn or give detailed stats about orphan taxid and orphan seqs
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('  Warning!'), gray('Orphan taxid'),
                       f'{orphan}\n')
                lost += counts[orphan]
            # Zero-safe ratios: counts may have been zeroed above
            vwrite(
                yellow('  WARNING!'), f'{len(orphans)} orphan taxids ('
                f'{ratio(len(orphans), len(counts)):.2%} of accepted)\n'
                f'    and {lost} orphan sequences ('
                f'{ratio(lost, sum(counts.values())):.3%} of accepted)\n')
        else:
            vwrite(green('OK!\n'))
    elif orphans:
        output.write(
            yellow('\n  Warning!') + f' {len(orphans)} orphan taxids' +
            gray(' (rerun with --debug for details)\n'))
    # Check the removal of TaxIDs (accumulation of leaves in parents)
    if debug and not excluding and including == {ontology.ROOT}:
        vwrite(gray('  Assess accumulation due to "folding the tree"...\n'))
        migrated: int = 0
        if out.counts is not None:
            for taxid in counts:
                if out.counts[taxid] == 0:
                    migrated += 1
                    vwrite(
                        blue('  Info:'),
                        gray(f'Folded TaxID {taxid} (') +
                        f'{ontology.get_name(taxid)}' + gray(') with ') +
                        f'{counts[taxid]}' + gray(' original seqs\n'))
        if migrated:
            # +counts keeps only the taxids with positive counts; it may be
            #  empty if all the counts were zeroed, so use zero-safe ratios
            taf: int = len(+counts)
            vwrite(
                blue('  INFO:'), f'{migrated} TaxIDs folded ('
                f'{ratio(migrated, taf):.2%} of TAF —TaxIDs after filtering—)'
                '\n')
            vwrite(
                blue('  INFO:'), f'Final assigned TaxIDs: {final_taxids} '
                f'(reduced to {ratio(final_taxids, taf):.2%} of '
                'number of TAF)\n')
        else:
            vwrite(blue('  INFO:'), gray('No migration!'), green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ') +
                     green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE

    # Timing results
    output.write(
        gray('Load elapsed time: ') +
        f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    # Print the whole log at once
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
Exemplo n.º 15
0
    def tohtml(
        self,
        filename: Filename,
        pretty: bool = False,
    ) -> None:
        """
        Write Krona HTML.

        The auxiliary files (images, logo and javascript library) are
        read from the directory of this module and their content is
        embedded in the document, together with the Krona XML held in
        self.krona. Both the auxiliary files and the output are handled
        as UTF-8, in agreement with the charset declared in the HTML.

        Args:
            filename: the name of the HTML output file.
            pretty: this parameter controls the layout of the XML code
                so that it is human readable for True (use for debug
                only because it uses a lot more of space and also has
                empty tags which are currently NOT SUPPORTED BY KRONA)
                and machine readable for False (default, saves space).

        Returns: None

        Raises:
            Exception: if self.chart is not a known Chart.

        """

        def read_aux(aux_path: str) -> str:
            """Get the whole text content of an auxiliary file"""
            with open(aux_path, 'r', encoding='utf-8') as file:
                return file.read()

        # Warn about use of pretty option
        if pretty:
            print(
                yellow('\nWARNING! Pretty XML uses empty tags which are'
                       ' UNSUPPORTED by Krona-JS!'))
            print(yellow('WARNING! Prepare for unexpected HTML results!'))

        # Read aux files
        path = os.path.dirname(os.path.realpath(__file__))
        hidden_image = read_aux(path + HIDDEN)
        loading_image = read_aux(path + LOADING)
        favicon = read_aux(path + FAVICON)
        path_logo: str  # The logo depends on the kind of chart
        if self.chart == Chart.TAXOMIC:
            path_logo = path + LOGO_RCF
        elif self.chart == Chart.GENOMIC:
            path_logo = path + LOGO_RGF
        else:
            raise Exception(f'ERROR! Unknown Chart "{self.chart}"')
        logo = read_aux(path_logo)
        script = read_aux(f'{path}/{JSLIB}')

        # Set root of HTML doc
        html_root = ETree.Element(  # type: ignore
            'html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml',
                            'xml:lang': 'en',
                            'lang': 'en'})
        # Prepare HTML file
        head = self.sub(html_root, 'head')
        self.sub(head, 'meta', {'charset': 'utf-8'})
        # The content of the aux files is used verbatim as href/src values
        self.sub(head, 'link', {'rel': 'shortcut icon', 'href': favicon})
        self.sub(
            head, 'link', {
                'rel': 'stylesheet',
                'href': 'https://fonts.googleapis.com/css?family=Ubuntu'
            })
        self.sub(head, 'script', {'id': 'notfound'},
                 'window.onload=function(){document.body.innerHTML=""}')
        self.sub(head, 'script', {
            'language': 'javascript',
            'type': 'text/javascript'
        }, script)  # Include javascript
        body = self.sub(html_root, 'body')
        self.sub(body, 'img', {
            'id': 'hiddenImage',
            'src': hidden_image,
            'style': 'display:none'
        })
        self.sub(body, 'img', {
            'id': 'loadingImage',
            'src': loading_image,
            'style': 'display:none'
        })
        self.sub(body, 'img', {
            'id': 'logo',
            'src': logo,
            'style': 'display:none'
        })
        self.sub(body, 'noscript', None,
                 'Javascript must be enabled to view this page.')

        div = self.sub(body, 'div', {'style': 'display:none'})
        div.append(self.krona)  # Include specific XML from samples
        # Write the HTML file (as UTF-8, the charset declared in the head)
        with open(filename, 'w', encoding='utf-8') as html_file:
            html_file.write(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n')  # pylint: disable=line-too-long
            if pretty:
                html_file.write(self.to_pretty_string(html_root))
            else:
                html_file.write(
                    ETree.tostring(
                        html_root,
                        encoding='unicode',
                        method='html',
                        short_empty_elements=False,
                    ))
Exemplo n.º 16
0
def read_output(
    output_file: Filename,
    scoring: Scoring = Scoring.SHEL,
    minscore: Score = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Centrifuge output file

    The first line of the file is discarded as header. For every other
    line, the taxid, the score and the query length are taken from the
    tab-separated columns 3, 4 and 7. Lines failing to parse are
    reported, counted as errors and ignored. Unclassified reads are
    just counted, and classified reads under minscore are discarded.

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    Raises:
        Exception: if the file is not found, if no sequence could be
            read from it, if no sequence passed the minscore filter,
            or if the scoring is not supported for Centrifuge.

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number or reads discarded due to error

    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = output_line.split(
                        '\t')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                tid = Id(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
                    shel = Score(float(_score)**0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(yellow('Failure'), f'parsing score ({_score}) for ',
                          f'query length {_length} for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                # Accumulate score and length of the accepted read by taxid
                all_scores.setdefault(tid, []).append(shel)
                all_length.setdefault(tid, []).append(length)
    except FileNotFoundError as err:
        raise Exception(red('\nERROR! ') +
                        f'Cannot read "{output_file}"') from err
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    # The abundance of a taxid is the number of its accepted reads
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum(len(tid_scores) for tid_scores in all_scores.values())
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    scores=all_scores,
                                    lens=all_length,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f}' + gray(', max = ') +
        f'{stat.sco.maxi:.1f}' + gray(', avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Length: min = ') + f'{stat.len.mini}' + gray(', max = ') +
        f'{stat.len.maxi}' + gray(', avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        # Mean score normalized by the mean length, as a percentage
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        print(red('ERROR!'), f' Centrifuge: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
Exemplo n.º 17
0
def read_generic_output(
    output_file: Filename,
    scoring: Scoring = Scoring.GENERIC,
    minscore: Score = None,
    genfmt: GenericFormat = None
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read an output file from a generic classifier

    Every line is split by the separator given by the type of format
    and the taxid, length and score are taken from the (1-based)
    columns genfmt.tid, genfmt.len and genfmt.sco. A first line which
    is too short or fails to parse is skipped as a header. Afterwards,
    lines failing to parse are reported, counted as errors and ignored,
    unless they get beyond 50% of the reads after 100 reads. Reads with
    taxid genfmt.unc are just counted as unclassified, and classified
    reads under minscore are discarded.

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
        genfmt: GenericFormat object specifying the files format

    Returns:
        log string, statistics, abundances counter, scores dict

    Raises:
        Exception: if genfmt is missing or its type is unknown, if the
            file is not found, if a line (other than a header) has less
            than GenericFormat.MIN_COLS columns, if no sequence could
            be read, if no sequence passed the minscore filter, or if
            the scoring is not supported.
        ValueError: if the rate of lines failing to parse makes the
            processing of the file unreliable.

    """
    # Initialization of variables
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number or reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    # Check format
    if not isinstance(genfmt, GenericFormat):
        raise Exception(
            red('\nERROR!') +
            ' Missing GenericFormat when reading a generic output.')
    try:
        with open(output_file, 'r') as file:
            # The separator only depends on the format: select it just once
            splitting: str
            if genfmt.typ is GenericType.CSV:
                splitting = ','
            elif genfmt.typ is GenericType.TSV:
                splitting = '\t'
            elif genfmt.typ is GenericType.SSV:
                splitting = ' '
            else:
                raise Exception(f'ERROR! Unknown GenericType {genfmt.typ}')
            # Main loop processing each file line
            for raw_line in file:
                raw_line = raw_line.strip(' \n\t')
                output_line: List[str] = raw_line.split(splitting)
                if len(output_line) < GenericFormat.MIN_COLS:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Not account for the header as an error
                    raise Exception(
                        red('\nERROR!') + ' Line ' + yellow(f'{output_line}') +
                        '\n\tin ' + yellow(f'{output_file}') + ' has < ' +
                        blue(f'{GenericFormat.MIN_COLS}') + ' required ' +
                        'columns.\n\tPlease check the file.')
                try:
                    tid: Id = Id(output_line[genfmt.tid - 1].strip(' "'))
                    length: int = int(output_line[genfmt.len - 1].strip(' "'))
                    if tid == genfmt.unc:  # Avoid read score for unclass reads
                        num_read += 1
                        nt_read += length
                        num_uncl += 1
                        continue
                    score: Score = Score(
                        float(output_line[genfmt.sco - 1].strip(' "')))
                except ValueError:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Not account for the header as a failure
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    if num_read > 100 and num_errors > 0.5 * num_read:
                        # The rate is a float: format it as a percentage
                        print(
                            red('ERROR!'),
                            'Unreliable file processing: rate of problematic'
                            f' reads is {num_errors/num_read:.0%}, beyond'
                            ' 50%, after 100 reads. Please check the format '
                            f'of the file "{output_file}".')
                        raise  # Propagate the parsing error
                    continue
                num_read += 1
                nt_read += length
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and score < minscore:
                    continue  # Discard read if low confidence
                # Accumulate score and length of the accepted read by taxid
                all_scores.setdefault(tid, []).append(score)
                all_length.setdefault(tid, []).append(length)
    except FileNotFoundError as err:
        raise Exception(red('\nERROR! ') +
                        f'Cannot read "{output_file}"') from err
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    # The abundance of a taxid is the number of its accepted reads
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum(len(tid_scores) for tid_scores in all_scores.values())
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' + gray(' max = ') +
        f'{stat.sco.maxi:.1f},' + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.GENERIC:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        # Mean score normalized by the mean length, as a percentage
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        raise Exception(
            red('\nERROR!') + f' Generic: Unsupported Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores