def read_report(report_file: str) -> Tuple[str, Counter[Id], Dict[Id, Rank]]:
    """
    Read Centrifuge/Kraken report file

    Args:
        report_file: report file name

    Returns:
        log string, abundances counter, taxlevel dict

    """
    # TODO: Discontinued method, to be erased in a future release
    output: io.StringIO = io.StringIO(newline='')
    abundances: Counter[Id] = col.Counter()
    level_dic = {}
    output.write(f'\033[90mLoading report file {report_file}...\033[0m')
    try:
        with open(report_file, 'r') as file:
            for report_line in file:
                _, _, taxnum, taxlev, _tid, _ = report_line.split('\t')
                tid = Id(_tid)
                abundances[tid] = int(taxnum)
                level_dic[tid] = Rank.centrifuge(taxlev)
    except KeyboardInterrupt:
        print(gray(' User'), yellow('interrupted!'))
        raise
    except Exception:
        print(red('ERROR!'), 'Cannot read "' + report_file + '"')
        raise
    else:
        output.write('\033[92m OK! \033[0m\n')
    return output.getvalue(), abundances, level_dic
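
# A minimal sketch of the tab-separated report line that read_report() above
# expects. The column layout (percentage, clade reads, direct reads, rank
# code, taxid, name) is assumed from the standard Kraken-style report; the
# sample values are illustrative only.
#
#   line = ' 45.20\t9086\t4542\tS\t562\t  Escherichia coli\n'
#   _, _, taxnum, taxlev, _tid, _ = line.split('\t')
#   # taxnum -> '4542' (reads assigned directly to this taxon)
#   # taxlev -> 'S'; Rank.centrifuge('S') presumably maps it to species rank
#   # _tid   -> '562' (NCBI taxid for E. coli)
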
def summarize_analysis(
        *args, **kwargs
) -> Tuple[Optional[Sample], Counter[Id], Counter[Id], Scores]:
    """
    Summarize for a cross-analysis (to be usually called in parallel!).
    """
    # Recover input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    # TODO: Delete the following comment lines in a future release
    # including = ontology.including  # See comment below for the reason
    # excluding = ontology.excluding  # in/excluding are not used anymore
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    output: io.StringIO = io.StringIO(newline='')

    # Declare/define variables
    summary_counts: Counter[Id] = col.Counter()
    summary_acc: Counter[Id] = col.Counter()
    summary_score: Scores = Scores({})
    summary: Optional[Sample] = None

    output.write(gray('Summary for ') + analysis + gray('... '))
    target_samples: List[Sample] = [smpl for smpl in samples
                                    if smpl.startswith(analysis)]
    assert len(target_samples) >= 1, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for smpl in target_samples:
        summary_counts += counts[smpl]
        summary_score.update(scores[smpl])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=summary_counts, scores=summary_score)
    tree.subtract()
    tree.shape()
    summary_counts.clear()
    summary_score.clear()
    # Avoid including/excluding here as get_taxa is not as 'clever' as allin1
    #  and taxa are already included/excluded in the derived samples
    tree.get_taxa(counts=summary_counts, accs=summary_acc,
                  scores=summary_score)
    summary_counts = +summary_counts  # remove counts <= 0

    if summary_counts:  # Avoid returning empty sample (summary would be None)
        summary = Sample(f'{analysis}_{STR_SUMMARY}')
        output.write(gray('(') + cyan(f'{len(target_samples)}') +
                     gray(' samples)') + green(' OK!\n'))
    else:
        output.write(yellow(' VOID\n'))

    # Print output and return
    print(output.getvalue(), end='')
    sys.stdout.flush()
    return summary, summary_counts, summary_acc, summary_score
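
# The tree pipeline used by summarize_analysis() above follows a fixed call
# order that a caller could reuse for any accumulated counters. The method
# names are real; the one-line descriptions are assumptions distilled from
# their usage here, not from their definitions:
#
#   tree = TaxTree()
#   tree.grow(ontology=ncbi, counts=acc_counts, scores=acc_scores)
#   tree.subtract()  # redistribute accumulated counts (assumed semantics)
#   tree.shape()     # prepare the tree for traversal (assumed semantics)
#   tree.get_taxa(counts=out_counts, accs=out_accs, scores=out_scores)
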
def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
    """Generate a mock Centrifuge output file from source file"""
    with open(out, 'w') as fout, open(args.file) as fcfg:
        vprint(gray('Generating'), blue(out), gray('file... '))
        fout.write(fcfg.readline())  # copy cfg output file header
        reads_written: int = 0
        for line in fcfg:
            tid = Id(line.split('\t')[2])
            if mock_layout[tid]:
                fout.write(line)
                mock_layout[tid] -= 1
                reads_written += 1
                if not sum(mock_layout.values()):
                    vprint(reads_written, 'reads', green('OK!\n'))
                    break
    if sum(mock_layout.values()):
        print(red('ERROR!\n'))
        print(gray('Incomplete read copy by taxid:'))
        mock_layout = +mock_layout  # Delete zero counts elements
        for tid in mock_layout:
            print(yellow(mock_layout[tid]), gray('reads missing for tid'),
                  tid, '(', cyan(ncbi.get_name(tid)), ')\n')
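
# A minimal usage sketch for mock_from_source() above: mock_layout maps each
# taxid to the number of reads wanted in the mock file, and reads are copied
# from the source Centrifuge output (args.file, taken from enclosing scope)
# until every requested count is exhausted. Taxids/counts are illustrative.
#
#   layout: Counter[Id] = col.Counter({Id('562'): 100, Id('9606'): 50})
#   mock_from_source(Filename('mock.out'), layout)
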
def read_clark_output(output_file: Filename,
                      scoring: Scoring = Scoring.CLARK_C,
                      minscore: Score = None,
                      ) -> Tuple[str, SampleStats,
                                 Counter[Id], Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_confs: Dict[Id, List[Score]] = {}
    all_gammas: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split(',')
            if len(header) != 8:
                print(red('\nERROR! ') + 'CLARK output format of ',
                      yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'ID,Length,Gamma,1st,score1,2nd,score2,conf')
                print(magenta('Found:'), ','.join(header), end='')
                print(blue('HINT:'), 'Use CLARK, CLARK-l, or CLARK-S '
                      'with full mode (', blue('-m 0'), ')')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_label, _length, _gamma, _tid1, _score1,
                     _tid2, _score2, _conf) = output_line.split(',')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = int(_length)
                    gamma: Score = Score(float(_gamma))
                    tid1: Id = Id(_tid1)
                    score1: Score = Score(float(_score1))
                    tid2: Id = Id(_tid2)
                    score2: Score = Score(float(_score2))
                    conf: Score = Score(float(_conf))
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                # Select tid and score between CLARK assignments 1 and 2
                tid: Id = tid1
                score: Score = score1
                if tid1 == UNCLASSIFIED:
                    if tid2 == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    else:  # Majority of read unclassified
                        tid = tid2
                        score = score2
                        conf = Score(1 - conf)  # Get CLARK's h2/(h1+h2)
                # From CLARK_C(S) score get "single hit equivalent length"
                shel: Score = Score(score + K_MER_SIZE)
                taxids.add(tid)  # Save all the selected tids (tid1 or tid2)
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.CLARK_C:
                        if conf < minscore:
                            continue
                    elif scoring is Scoring.CLARK_G:
                        if gamma < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_confs[tid].append(conf)
                except KeyError:
                    all_confs[tid] = [conf, ]
                try:
                    all_gammas[tid].append(gamma)
                except KeyError:
                    all_gammas[tid] = [gamma, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, scores2=all_confs, scores3=all_gammas,
        seq_read=num_read, seq_unclas=num_uncl, seq_filt=filt_seqs,
        tid_clas=len(taxids)
    )
    # Output statistics
    if num_errors:
        output.write(gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
                     gray('(Last error in read ') + red(f'{last_error_read}') +
                     gray(')\n'))
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
                 f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
                 f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
                 f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray('  Hit (score): min = ') + f'{stat.sco.mini:.1f},' +
                 gray(' max = ') + f'{stat.sco.maxi:.1f},' +
                 gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Conf. score: min = ') + f'{stat.sco2.mini:.1f},' +
                 gray(' max = ') + f'{stat.sco2.maxi:.1f},' +
                 gray(' avr = ') + f'{stat.sco2.mean:.1f}\n')
    output.write(gray('  Gamma score: min = ') + f'{stat.sco3.mini:.1f},' +
                 gray(' max = ') + f'{stat.sco3.maxi:.1f},' +
                 gray(' avr = ') + f'{stat.sco3.mean:.1f}\n')
    output.write(gray('  Read length: min = ') + f'{stat.len.mini},' +
                 gray(' max = ') + f'{stat.len.maxi},' +
                 gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.CLARK_C:
        out_scores = {tid: Score(mean(all_confs[tid]) * 100)
                      for tid in all_confs}
    elif scoring is Scoring.CLARK_G:
        out_scores = {tid: Score(mean(all_gammas[tid])) for tid in all_gammas}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'clark: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
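
# A small worked sketch (illustrative values) of the assignment selection in
# read_clark_output() above: CLARK full mode reports two candidate taxa per
# read with confidence conf = h1/(h1+h2). If the first assignment is
# unclassified but the second is not, tid2 is kept and the confidence is
# flipped to 1 - conf = h2/(h1+h2); the "single hit equivalent length" is
# then score + K_MER_SIZE (k is the classifier's k-mer size, e.g. 31 for a
# default CLARK database -- an assumption about the database build).
#
#   conf = 0.3              # h1/(h1+h2), with tid1 == UNCLASSIFIED
#   conf = 1 - conf         # 0.7 -> h2/(h1+h2), now referred to tid2
#   shel = 42 + K_MER_SIZE  # score2 = 42 k-mer hits -> equivalent length
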
def process_rank(*args, **kwargs
                 ) -> Tuple[List[Sample], Dict[Sample, UnionCounter],
                            Dict[Sample, Counter[Id]],
                            Dict[Sample, UnionScores]]:
    """
    Process results for a taxlevel (to be usually called in parallel!).
    """

    # Recover input and parameters
    rank: Rank = args[0]
    controls: int = kwargs['controls']
    mintaxas: Dict[Sample, int] = kwargs['mintaxas']
    ontology: Ontology = kwargs['ontology']
    including = ontology.including
    excluding = ontology.excluding
    taxids: Dict[Sample, TaxLevels] = kwargs['taxids']
    counts: Dict[Sample, UnionCounter] = kwargs['counts']
    accs: Dict[Sample, Counter[Id]] = kwargs['accs']
    scores: Dict[Sample, UnionScores] = kwargs['scores']
    raws: List[Sample] = kwargs['raw_samples']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args) -> None:
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    def fltlst2str(lst: List[float]) -> str:
        """Convert a list of floats into a nice string"""
        return '[' + gray((', '.join(f'{elm:.1g}' for elm in lst))) + ']'

    def blst2str(lst: List[bool]) -> str:
        """Convert a list of booleans into a nice string"""
        return ('[' + (', '.join(magenta('T') if elm else 'F'
                                 for elm in lst)) + ']')

    def get_shared_mintaxa() -> int:
        """Give a value of mintaxa for shared derived samples

        This value is currently the minimum of the mintaxa of all the
        (non control) raw samples.
        """
        return min([mintaxas[smpl] for smpl in raws[controls:]])

    # Declare/define variables
    samples: List[Sample] = []
    # pylint: disable = unused-variable
    shared_counts: SharedCounter = SharedCounter()
    shared_score: SharedCounter = SharedCounter()
    shared_ctrl_counts: SharedCounter = SharedCounter()
    shared_ctrl_score: SharedCounter = SharedCounter()
    # pylint: enable = unused-variable

    output.write(f'\033[90mAnalysis for taxonomic rank "'
                 f'\033[95m{rank.name.lower()}\033[90m":\033[0m\n')

    def cross_analysis(iteration, raw):
        """Cross analysis: exclusive and part of shared&ctrl"""
        nonlocal shared_counts, shared_score
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def partial_shared_update(i):
            """Perform shared and shared-control taxa partial evaluations"""
            nonlocal shared_counts, shared_score
            nonlocal shared_ctrl_counts, shared_ctrl_score
            if i == 0:  # 1st iteration: Initialize shared abundance and score
                shared_counts.update(sub_shared_counts)
                shared_score.update(sub_shared_score)
            elif i < controls:  # Just update shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
            elif i == controls:  # Initialize shared-control counters
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts.update(sub_shared_counts)
                shared_ctrl_score.update(sub_shared_score)
            elif controls:  # Both: Accumulate shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts &= sub_shared_counts
                shared_ctrl_score &= sub_shared_score
            else:  # Both: Accumulate shared abundance and score (no controls)
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score

        exclude: Set[Id] = set()
        # Get taxids at this rank that are present in the other samples
        for sample in (smpl for smpl in raws if smpl != raw):
            exclude.update(taxids[sample][rank])
        exclude.update(excluding)  # Add explicit excluding taxa if any
        output.write(f'  \033[90mExclusive: From \033[0m{raw}\033[90m '
                     f'excluding {len(exclude)} taxa. '
                     f'Generating sample...\033[0m')

        exclude_tree = TaxTree()
        exclude_out = SampleDataById(['counts', 'scores', 'accs'])
        exclude_tree.allin1(ontology=ontology,
                            counts=counts[raw],
                            scores=scores[raw],
                            min_taxa=mintaxas[raw],
                            min_rank=rank,
                            just_min_rank=True,
                            include=including,
                            exclude=exclude,
                            out=exclude_out)
        exclude_out.purge_counters()
        if exclude_out.counts:  # Avoid adding empty samples
            sample = Sample(f'{raw}_{STR_EXCLUSIVE}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = exclude_out.get_counts()
            accs[sample] = exclude_out.get_accs()
            scores[sample] = exclude_out.get_scores()
            output.write('\033[92m OK! \033[0m\n')
        else:
            output.write('\033[93m VOID \033[0m\n')

        # Get partial abundance and score for the shared analysis
        sub_shared_tree = TaxTree()
        sub_shared_out = SampleDataById(['shared', 'accs'])
        sub_shared_tree.allin1(ontology=ontology,
                               counts=counts[raw],
                               scores=scores[raw],
                               min_taxa=mintaxas[raw],
                               min_rank=rank,
                               just_min_rank=True,
                               include=including,
                               exclude=excluding,
                               out=sub_shared_out)
        sub_shared_out.purge_counters()
        # Scale scores by abundance
        sub_shared_counts: SharedCounter = sub_shared_out.get_shared_counts()
        sub_shared_score: SharedCounter = sub_shared_out.get_shared_scores()
        sub_shared_score *= sub_shared_counts
        partial_shared_update(iteration)

    def shared_analysis():
        """Perform last steps of shared taxa analysis"""
        shared_tree: TaxTree = TaxTree()
        shared_out: SampleDataById = SampleDataById(['shared', 'accs'])
        shared_tree.allin1(ontology=ontology,
                           counts=shared_counts,
                           scores=shared_score,
                           min_taxa=get_shared_mintaxa(),
                           include=including,
                           exclude=excluding,
                           out=shared_out)
        shared_out.purge_counters()
        out_counts: SharedCounter = shared_out.get_shared_counts()
        output.write(gray(f'  Shared: Including {len(out_counts)}'
                          ' shared taxa. Generating sample... '))
        if out_counts:
            sample = Sample(f'{STR_SHARED}_{rank.name.lower()}')
            samples.append(sample)
            counts[Sample(sample)] = out_counts
            accs[Sample(sample)] = shared_out.get_accs()
            scores[sample] = shared_out.get_shared_scores()
            output.write(green('OK!\n'))
        else:
            output.write(yellow('VOID\n'))

    def control_analysis():
        """Perform last steps of control and shared controls analysis"""
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def robust_contamination_removal():
            """Implement robust contamination removal algorithm."""
            nonlocal exclude_sets, shared_crossover

            def compute_qn(data: List[float], dist: str = "Gauss") -> float:
                """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
                c_d: float  # Select d parameter depending on the distribution
                if dist == "Gauss":
                    c_d = 2.2219
                elif dist == "Cauchy":  # Heavy-tailed distribution
                    c_d = 1.2071
                elif dist == "NegExp":  # Negative exponential (asymmetric)
                    c_d = 3.4760
                else:
                    raise Exception(red('\nERROR! ') + 'Unknown distribution')
                num: int = len(data)
                sort_data = sorted(data)
                pairwisedifs: List[float] = []
                for (i, x_val) in enumerate(sort_data):
                    for y_val in sort_data[i + 1:]:
                        pairwisedifs.append(abs(x_val - y_val))
                k: int = int(num * (num / 2 + 1) / 4)
                return c_d * sorted(pairwisedifs)[k - 1]

            exclude_sets = {smpl: set() for smpl in raws[controls:]}
            vwrite(gray('Robust contamination removal: '
                        'Searching for contaminants...\n'))
            for tid in exclude_candidates:
                relfreq_ctrl: List[float] = [accs[ctrl][tid]
                                             / accs[ctrl][ontology.ROOT]
                                             for ctrl in raws[:controls]]
                relfreq_smpl: List[float] = [accs[smpl][tid]
                                             / accs[smpl][ontology.ROOT]
                                             for smpl in raws[controls:]]
                relfreq: List[float] = relfreq_ctrl + relfreq_smpl
                crossover: List[bool]  # Crossover source (yes/no)
                # Just-controls contamination check
                if all([rf < EPS for rf in relfreq_smpl]):
                    vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    continue  # Go for next candidate
                # Critical contamination check
                if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Severe contamination check
                if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Mild contamination check
                if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Calculate median and MAD median but including controls
                mdn: float = statistics.median(relfreq)
                # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
                q_n: float = compute_qn(relfreq, dist="NegExp")
                # Calculate crossover in samples
                outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
                ordomag_lim: float = max(
                    relfreq_ctrl) * 10 ** ROBUST_XOVER_ORD_MAG
                crossover = [rf > outlier_lim and rf > ordomag_lim
                             for rf in relfreq[controls:]]
                # Crossover contamination check
                if any(crossover):
                    vwrite(magenta('crossover:\t'), tid,
                           ontology.get_name(tid),
                           green(f'lims: [{outlier_lim:.1g}]' +
                                 ('<' if outlier_lim < ordomag_lim else '>') +
                                 f'[{ordomag_lim:.1g}]'),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           gray('crossover:'), blst2str(crossover), '\n')
                    # Exclude just for contaminated samples (not the source)
                    vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                    for i in range(len(raws[controls:])):
                        if not crossover[i]:
                            exclude_sets[raws[i + controls]].add(tid)
                        else:
                            vwrite(f' {raws[i + controls]}')
                    if all(crossover):  # Shared taxon contaminating control(s)
                        vwrite(' (', yellow('Shared crossover taxon!'), ')')
                        shared_crossover.add(tid)
                    vwrite('\n')
                    continue
                # Other contamination: remove from all samples
                vwrite(gray('other cont:\t'), tid, ontology.get_name(tid),
                       green(f'lims: [{outlier_lim:.1g}]' +
                             ('<' if outlier_lim < ordomag_lim else '>') +
                             f'[{ordomag_lim:.1g}]'),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)

        # Get taxids at this rank that are present in the control samples
        exclude_candidates: Set[Id] = set()
        for i in range(controls):
            exclude_candidates.update(taxids[raws[i]][rank])
        exclude_sets: Dict[Sample, Set[Id]]
        shared_crossover: Set[Id] = set()  # Shared taxa contaminating controls
        if controls and (len(raws) - controls >= ROBUST_MIN_SAMPLES):
            robust_contamination_removal()
        else:  # In this case, just apply strict control
            exclude_sets = {file: exclude_candidates
                            for file in raws[controls::]}
        # Add explicit excluding taxa (if any) to exclude sets
        for exclude_set in exclude_sets.values():
            exclude_set.update(excluding)
        exclude_candidates.update(excluding)
        # Process each sample excluding control taxa
        for raw in raws[controls:]:
            output.write(gray('  Ctrl: From') + f' {raw} ' +
                         gray(f'excluding {len(exclude_sets[raw])} ctrl taxa. '
                              f'Generating sample... '))
            ctrl_tree = TaxTree()
            ctrl_out = SampleDataById(['counts', 'scores', 'accs'])
            ctrl_tree.allin1(ontology=ontology,
                             counts=counts[raw],
                             scores=scores[raw],
                             min_taxa=mintaxas[raw],
                             min_rank=rank,
                             just_min_rank=True,
                             include=including,
                             exclude=exclude_sets[raw],
                             out=ctrl_out)
            ctrl_out.purge_counters()
            if ctrl_out.counts:  # Avoid adding empty samples
                sample = Sample(f'{raw}_{STR_CONTROL}_{rank.name.lower()}')
                samples.append(sample)
                counts[sample] = ctrl_out.get_counts()
                accs[sample] = ctrl_out.get_accs()
                scores[sample] = ctrl_out.get_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        def shared_ctrl_analysis():
            """Perform last steps of shared taxa analysis"""
            shared_ctrl_tree: TaxTree = TaxTree()
            shared_ctrl_out: SampleDataById = SampleDataById(
                ['shared', 'accs'])
            shared_ctrl_tree.allin1(
                ontology=ontology,
                counts=shared_ctrl_counts,
                scores=shared_ctrl_score,
                min_taxa=get_shared_mintaxa(),
                include=including,
                exclude=(exclude_candidates - shared_crossover),
                out=shared_ctrl_out)
            shared_ctrl_out.purge_counters()
            out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
            output.write(gray(f'  Ctrl-shared: Including {len(out_counts)}'
                              ' shared taxa. Generating sample... '))
            if out_counts:
                sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
                samples.append(sample)
                counts[Sample(sample)] = out_counts
                accs[Sample(sample)] = shared_ctrl_out.get_accs()
                scores[sample] = shared_ctrl_out.get_shared_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        # Shared-control taxa final analysis
        if shared_ctrl_counts:
            # Normalize scaled scores by total abundance
            shared_ctrl_score /= (+shared_ctrl_counts)
            # Get averaged abundance by number of samples minus ctrl samples
            shared_ctrl_counts //= (len(raws) - controls)
            shared_ctrl_analysis()
        else:
            output.write(gray('  Ctrl-shared: No taxa! ') + yellow('VOID') +
                         gray(' sample.\n'))

    # Cross analysis iterating by output: exclusive and part of shared&ctrl
    for num_file, raw_sample_name in enumerate(raws):
        cross_analysis(num_file, raw_sample_name)

    # Shared taxa final analysis
    shared_counts = +shared_counts  # remove counts <= 0
    if shared_counts:
        # Normalize scaled scores by total abundance (after eliminating zeros)
        shared_score /= (+shared_counts)
        # Get averaged abundance by number of samples
        shared_counts //= len(raws)
        shared_analysis()
    else:
        output.write(gray('  Shared: No shared taxa! ') + yellow('VOID') +
                     gray(' sample.\n'))

    # Control sample subtraction
    if controls:
        control_analysis()

    # Print output and return
    print(output.getvalue())
    sys.stdout.flush()
    return samples, counts, accs, scores
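
# A self-contained sketch of the Qn scale estimator implemented by
# compute_qn() inside process_rank() above, on a tiny dataset: Qn is
# (roughly) the first quartile of all pairwise absolute differences, scaled
# by a distribution-dependent consistency constant. The constant and the k
# index mirror the code; the dataset is illustrative.
def qn_scale_demo() -> float:
    """Return Qn (with the Gauss constant) for a small sample of floats."""
    data = [1.0, 2.0, 4.0, 8.0, 16.0]
    c_d = 2.2219  # Gaussian consistency constant, as in compute_qn()
    sort_data = sorted(data)
    difs = [abs(x - y) for i, x in enumerate(sort_data)
            for y in sort_data[i + 1:]]  # all 10 pairwise differences
    k = int(len(data) * (len(data) / 2 + 1) / 4)  # k = 4 for 5 data points
    return c_d * sorted(difs)[k - 1]  # 4th smallest dif is 4.0 -> ~8.89
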
def read_kraken_output(output_file: Filename,
                       scoring: Scoring = Scoring.KRAKEN,
                       minscore: Score = None,
                       ) -> Tuple[str, SampleStats,
                                  Counter[Id], Dict[Id, Score]]:
    """
    Read Kraken output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_kmerel: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split('\t')
            if len(header) != 5:
                print(red('\nERROR! ') + 'Kraken output format of ',
                      yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'C/U, ID, taxid, length, list of mappings')
                print(magenta('Found:'), '\t'.join(header), end='')
                print(blue('HINT:'), 'Use Kraken or Kraken2 direct output.')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_clas, _label, _tid, _length,
                     _maps) = output_line.split('\t')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = sum(map(int, _length.split('|')))
                    num_read += 1
                    nt_read += length
                    if _clas == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    tid: Id = Id(_tid)
                    maps: List[str] = _maps.split()
                    try:
                        maps.remove('|:|')
                    except ValueError:
                        pass
                    mappings: Counter[Id] = col.Counter()
                    for pair in maps:
                        couple: List[str] = pair.split(':')
                        mappings[Id(couple[0])] += int(couple[1])
                    # From Kraken score get "single hit equivalent length"
                    shel: Score = Score(mappings[tid] + K_MER_SIZE)
                    score: Score = Score(mappings[tid] / sum(mappings.values())
                                         * 100)  # % relative to all k-mers
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.KRAKEN:
                        if score < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_kmerel[tid].append(score)
                except KeyError:
                    all_kmerel[tid] = [score, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ')
            + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, scores2=all_kmerel,
        seq_read=num_read, seq_unclas=num_uncl, seq_filt=filt_seqs,
        tid_clas=len(taxids)
    )
    # Output statistics
    if num_errors:
        output.write(gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
                     gray('(Last error in read ') + red(f'{last_error_read}') +
                     gray(')\n'))
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
                 f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
                 f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
                 f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray('  Scores SHEL: min = ') + f'{stat.sco.mini:.1f},' +
                 gray(' max = ') + f'{stat.sco.maxi:.1f},' +
                 gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Coverage(%): min = ') + f'{stat.sco2.mini:.1f},' +
                 gray(' max = ') + f'{stat.sco2.maxi:.1f},' +
                 gray(' avr = ') + f'{stat.sco2.mean:.1f}\n')
    output.write(gray('  Read length: min = ') + f'{stat.len.mini},' +
                 gray(' max = ') + f'{stat.len.maxi},' +
                 gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.KRAKEN:
        out_scores = {tid: Score(mean(all_kmerel[tid])) for tid in all_kmerel}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'kraken: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
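
# A worked sketch (illustrative values) of the k-mer mapping parse in
# read_kraken_output() above: Kraken lists 'taxid:count' pairs per read,
# with '|:|' separating the mates of a paired-end read.
#
#   _maps = '562:10 561:4 |:| 562:8 0:3'
#   maps = _maps.split(); maps.remove('|:|')
#   # mappings -> Counter({'562': 18, '561': 4, '0': 3})
#   # With tid = '562': shel = 18 + K_MER_SIZE (assumed to match the
#   # classifier's k, e.g. 35 for Kraken2), and the relative score is
#   # 18 / 25 * 100 = 72% of this read's k-mers voting for tid.
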
def process_output(*args, **kwargs
                   ) -> Tuple[Sample, TaxTree, SampleDataByTaxId,
                              SampleStats, Err]:
    """
    Process Centrifuge/LMAT output files (to be usually called in parallel!).
    """
    # Timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    taxonomy: Taxonomy = kwargs['taxonomy']
    mintaxa: int = kwargs['ctrlmintaxa'] if is_ctrl else kwargs['mintaxa']
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    scoring: Scoring = kwargs['scoring']
    lmat: bool = kwargs['lmat']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read Centrifuge/LMAT output files to get abundances
    read_method: Callable[[Filename, Scoring, Optional[Score]],  # Input
                          Tuple[str, SampleStats,
                                Counter[TaxId],
                                Dict[TaxId, Score]]  # Output
                          ]
    if lmat:
        read_method = read_lmat_output
    else:
        read_method = read_output
    log: str
    counts: Counter[TaxId]
    scores: Dict[TaxId, Score]
    log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Update field in stat about control nature of the sample
    stat.is_ctrl = is_ctrl
    # Move cellular_organisms counts to root, in case
    if taxonomy.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... '))
        if counts[ROOT]:
            stat.num_taxa -= 1
            scores[ROOT] = ((scores[CELLULAR_ORGANISMS]
                             * counts[CELLULAR_ORGANISMS]
                             + scores[ROOT] * counts[ROOT])
                            / (counts[CELLULAR_ORGANISMS] + counts[ROOT]))
        else:
            scores[ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ROOT]:
        vwrite(gray('Removing'), counts[ROOT], gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt - counts[ROOT])
        stat.num_taxa -= 1
        counts[ROOT] = 0
        scores[ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')
    # Building taxonomy tree
    output.write(gray('Building from raw data... '))
    vwrite(gray('\n  Building taxonomy tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[TaxId]
    orphans: Set[TaxId]
    ancestors, orphans = taxonomy.get_ancestors(counts.keys())
    out = SampleDataByTaxId(['all'])
    tree.allin1(taxonomy=taxonomy, counts=counts, scores=scores,
                ancestors=ancestors, min_taxa=mintaxa,
                include=including, exclude=excluding, out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')
    # Give stats about orphan taxid
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('Warning!'), f'Orphan taxid={orphan}\n')
                lost += counts[orphan]
            vwrite(yellow('WARNING!'),
                   f'{len(orphans)} orphan taxids ('
                   f'{len(orphans)/len(counts):.2%} of total)\n'
                   f'{lost} orphan sequences ('
                   f'{lost/sum(counts.values()):.3%} of total)\n')
        else:
            vwrite(green('OK!\n'))
    # Check the loss of taxids (plasmids typically) under some conditions
    if debug and not excluding and not including:
        vwrite(gray('  Additional checking of taxid loss... '))
        lost = 0
        for taxid in counts:
            if not out.counts[taxid]:
                lost += 1
                vwrite(yellow('Warning!'), f'Lost taxid={taxid}: '
                       f'{taxonomy.get_name(taxid)}\n')
        if lost:
            vwrite(yellow('WARNING!'), f'Lost {lost} taxids ('
                   f'{lost/len(counts):.2%} of total)\n')
        else:
            vwrite(green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ')
                     + green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE
    # Timing results
    output.write(gray('Load elapsed time: ') +
                 f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
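
# A quick numeric check (illustrative values) of the score merge performed
# above when 'cellular organisms' counts are collapsed into the root node:
# the root score becomes the read-count weighted mean of both nodes.
#
#   scores = {CELLULAR_ORGANISMS: 120.0, ROOT: 80.0}
#   counts = {CELLULAR_ORGANISMS: 300, ROOT: 100}
#   # merged: (120.0 * 300 + 80.0 * 100) / (300 + 100) = 110.0
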
def read_output(output_file: Filename,
                scoring: Scoring = Scoring.SHEL,
                minscore: Score = None,
                ) -> Tuple[str, SampleStats,
                           Counter[TaxId], Dict[TaxId, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[TaxId, List[Score]] = {}
    all_length: Dict[TaxId, List[int]] = {}
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    error_read: Optional[int] = None
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = \
                        output_line.split('\t')
                except ValueError:
                    print(red('Error'), f'parsing line: ({output_line}) '
                          f'in {output_file}. Ignoring line!')
                    error_read = num_read + 1
                    continue
                tid = TaxId(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
                    shel = Score(float(_score) ** 0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(red('Error'), f'parsing score ({_score}) for query',
                          f'length ({_length}) for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                elif minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if error_read == num_read + 1:  # Check if error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[TaxId] = Counter({tid: len(all_scores[tid])
                                      for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ')
            + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read,
        scores=all_scores, lens=all_length,
        seq_read=num_read, seq_unclas=num_uncl, seq_filt=filt_seqs
    )
    # Output statistics
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
                 f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
                 f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
                 f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' +
                 gray(' max = ') + f'{stat.sco.maxi:.1f},' +
                 gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Length: min = ') + f'{stat.len.mini},' +
                 gray(' max = ') + f'{stat.len.maxi},' +
                 gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(f'  {stat.num_taxa}' + gray(' taxa with assigned reads\n'))
    # Select score output
    out_scores: Dict[TaxId, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[TaxId, Score] = {tid: Score(mean(all_scores[tid]))
                                      for tid in all_scores}
        lengths: Dict[TaxId, Score] = {tid: Score(mean(all_length[tid]))
                                       for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        raise Exception(f'\n\033[91mERROR!\033[0m Unknown Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores
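
# Rationale sketch for the SHEL ("single hit equivalent length") used in
# read_output() above: Centrifuge scores a hit roughly as
# (hit_length - 15)**2, so sqrt(score) + 15 recovers an equivalent hit
# length in bases. Treating the score as invertible this way is the
# working assumption; the value below is illustrative.
#
#   _score = '4225'                          # (80 - 15)**2, an 80-base hit
#   shel = Score(float(_score) ** 0.5 + 15)  # -> 80.0
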
def main(): """Main entry point to Recentrifuge.""" def configure_parser(): """Argument Parser Configuration""" parser = argparse.ArgumentParser( description='Analyze results of metagenomic taxonomic classifiers', epilog=f'%(prog)s - Release {__version__} - {__date__}' + LICENSE, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument( '-V', '--version', action='version', version=f'%(prog)s version {__version__} released in {__date__}') parser_in = parser.add_argument_group( 'input', 'Define Recentrifuge input files and formats') parser_in.add_argument('-n', '--nodespath', action='store', metavar='PATH', default=TAXDUMP_PATH, help=('path for the nodes information files ' '(nodes.dmp and names.dmp from NCBI)')) parser_filein = parser_in.add_mutually_exclusive_group(required=True) parser_filein.add_argument( '-f', '--file', action='append', metavar='FILE', type=Filename, help=('Centrifuge output files. If a single directory is entered, ' 'every .out file inside will be taken as a different sample.' ' Multiple -f is available to include several samples.')) parser_filein.add_argument( '-l', '--lmat', action='append', metavar='FILE', type=Filename, default=None, help=('LMAT output dir or file prefix. If just "." is entered, ' 'every subdirectory under the current directory will be ' 'taken as a sample and scanned looking for LMAT output files' '. Multiple -l is available to include several samples.')) parser_filein.add_argument( '-k', '--clark', action='append', metavar='FILE', type=Filename, help=('CLARK(S) output files. If a single directory is entered, ' 'every .csv file inside will be taken as a different sample.' ' Multiple -k is available to include several samples.')) parser_filein.add_argument( '-r', '--report', action='append', metavar='FILE', type=Filename, help=('Centrifuge/Kraken report files ' '(multiple -r is available to include several samples)')) parser_out = parser.add_argument_group( 'output', 'Related to the Recentrifuge output files') parser_out.add_argument( '-o', '--outhtml', action='store', metavar='FILE', type=Filename, help='HTML output file (if not given, the filename will be ' 'inferred from input files)') parser_out.add_argument( '-e', '--excel', action='store', metavar='OUTPUT_TYPE', choices=[str(excel) for excel in Excel], default=str(Excel(0)), help=(f'type of excel report to be generated, and can be one of ' f'{[str(excel) for excel in Excel]}')) parser_coarse = parser.add_argument_group( 'tuning', 'Coarse tuning of algorithm parameters') parser_cross = parser_coarse.add_mutually_exclusive_group( required=False) parser_cross.add_argument( '-c', '--controls', action='store', metavar='CONTROLS_NUMBER', type=int, default=0, help=('this number of first samples will be treated as negative ' 'controls; default is no controls')) parser_coarse.add_argument( '-s', '--scoring', action='store', metavar='SCORING', choices=[str(each_score) for each_score in Scoring], default=str(Scoring(0)), help=(f'type of scoring to be applied, and can be one of ' f'{[str(scoring) for scoring in Scoring]}')) parser_coarse.add_argument( '-y', '--minscore', action='store', metavar='NUMBER', type=lambda txt: Score(float(txt)), default=None, help=('minimum score/confidence of the classification of a read ' 'to pass the quality filter; all pass by default')) parser_coarse.add_argument( '-m', '--mintaxa', action='store', metavar='INT', type=int, default=DEFMINTAXA, help='minimum taxa to avoid collapsing one level to the parent one' ) parser_coarse.add_argument( '-x', 
'--exclude', action='append', metavar='TAXID', type=Id, default=[], help=('NCBI taxid code to exclude a taxon and all underneath ' '(multiple -x is available to exclude several taxid)')) parser_coarse.add_argument( '-i', '--include', action='append', metavar='TAXID', type=Id, default=[], help=('NCBI taxid code to include a taxon and all underneath ' '(multiple -i is available to include several taxid); ' 'by default, all the taxa are considered for inclusion')) parser_cross.add_argument('-a', '--avoidcross', action='store_true', help='avoid cross analysis') parser_fine = parser.add_argument_group( 'fine tuning', 'Fine tuning of algorithm parameters') parser_fine.add_argument( '-z', '--ctrlminscore', action='store', metavar='NUMBER', type=lambda txt: Score(float(txt)), default=None, help=('minimum score/confidence of the classification of a read ' 'in control samples to pass the quality filter; if defaults ' 'to "minscore"')) parser_fine.add_argument( '-w', '--ctrlmintaxa', action='store', metavar='INT', type=int, default=None, help='minimum taxa to avoid collapsing one level to the parent one' ' in control samples; it defaults to "mintaxa"') parser_fine.add_argument( '-u', '--summary', action='store', metavar='OPTION', choices=['add', 'only', 'avoid'], default='add', help=( 'select to "add" summary samples to other samples, or to ' '"only" show summary samples or to "avoid" summaries at all')) parser_fine.add_argument( '-t', '--takeoutroot', action='store_true', help='remove counts directly assigned to the "root" level') parser_fine.add_argument('--nokollapse', action='store_true', help='show the "cellular organisms" taxon') parser_mode = parser.add_argument_group('advanced', 'Advanced modes of running') parser_mode.add_argument( '--dummy', # hidden flag: just generate a dummy plot for JS debug action='store_true', help=argparse.SUPPRESS) parser_mode.add_argument( '-g', '--debug', action='store_true', help='increase output verbosity and perform additional checks') parser_mode.add_argument('--sequential', action='store_true', help='deactivate parallel processing') return parser def check_debug(): """Check debugging mode""" if args.debug: print(blue('INFO:'), gray('Debugging mode activated')) print(blue('INFO:'), gray('Active parameters:')) for key, value in vars(args).items(): if value: print(gray(f'\t{key} ='), f'{value}') def select_inputs(): """Choose right classifier, input and output files""" nonlocal process, scoring, input_files, plasmidfile, classifier if reports: classifier = Classifier.KRAKEN process = process_report input_files = reports elif clarks: classifier = Classifier.CLARK process = process_output input_files = clarks if len(clarks) == 1 and os.path.isdir(clarks[0]): select_clark_inputs(clarks) elif lmats: classifier = Classifier.LMAT scoring = Scoring.LMAT process = process_output input_files = lmats plasmidfile = Filename(os.path.join(args.nodespath, PLASMID_FILE)) select_lmat_inputs(lmats) elif outputs: classifier = Classifier.CENTRIFUGE process = process_output input_files = outputs if len(outputs) == 1 and os.path.isdir(outputs[0]): select_centrifuge_inputs(outputs) def check_controls(): """Check and info about the control samples""" if args.controls: if args.controls > len(input_files): print(red(' ERROR!'), gray('More controls than samples')) exit(1) print(gray('Control(s) sample(s) for subtractions:')) for i in range(args.controls): print(blue(f'\t{input_files[i]}')) def select_html_file(): """HTML filename selection""" nonlocal htmlfile if lmats: # Select case for 
dir name or filename prefix
            if os.path.isdir(lmats[0]):  # Dir name
                dirname = os.path.dirname(os.path.normpath(lmats[0]))
                if not dirname or dirname == '.':
                    basename = 'output'
                else:
                    basename = os.path.basename(dirname)
            else:  # Explicit path and file name prefix is provided
                dirname, basename = os.path.split(lmats[0])
            htmlfile = Filename(os.path.join(dirname, basename + HTML_SUFFIX))
        elif reports:
            htmlfile = Filename(reports[0].split('_mhl')[0] + HTML_SUFFIX)
        else:
            htmlfile = Filename(outputs[0].split('_mhl')[0] + HTML_SUFFIX)

    def read_samples():
        """Read samples"""
        print(gray('\nPlease, wait, processing files in parallel...\n'))
        # Enable parallelization with the 'fork' start method under known
        # platforms ('fork' is not available under Windows)
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            with mpctx.Pool(
                    processes=min(os.cpu_count(), len(input_files))) as pool:
                async_results = [
                    pool.apply_async(
                        process,
                        args=[input_files[num],  # file name
                              num < args.controls],  # is ctrl?
                        kwds=kwargs) for num in range(len(input_files))
                ]
                for file, (sample, tree, out, stat, err) in zip(
                        input_files, [r.get() for r in async_results]):
                    if err is Err.NO_ERROR:
                        samples.append(sample)
                        trees[sample] = tree
                        taxids[sample] = out.get_taxlevels()
                        counts[sample] = out.counts
                        accs[sample] = out.accs
                        scores[sample] = out.scores
                        stats[sample] = stat
                    elif err is Err.VOID_CTRL:
                        print('There were void controls.', red('Aborting!'))
                        exit(1)
        else:  # sequential processing of each sample
            for num, file in enumerate(input_files):
                (sample, tree, out, stat,
                 err) = process(file, num < args.controls, **kwargs)
                if err is Err.NO_ERROR:
                    samples.append(sample)
                    trees[sample] = tree
                    taxids[sample] = out.get_taxlevels()
                    counts[sample] = out.counts
                    accs[sample] = out.accs
                    scores[sample] = out.scores
                    stats[sample] = stat
                elif err is Err.VOID_CTRL:
                    print('There were void controls.', red('Aborting!'))
                    exit(1)
        raw_samples.extend(samples)  # Store raw sample names

    def analyze_samples():
        """Cross analysis of samples in parallel by taxlevel"""
        print(gray('Please, wait. Performing cross analysis in parallel...\n'))
        # Update kwargs with more parameters for the following func calls
        kwargs.update({
            'taxids': taxids,
            'counts': counts,
            'scores': scores,
            'accs': accs,
            'raw_samples': raw_samples
        })
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')  # Explicit ctx (no 'fork' in Win)
            with mpctx.Pool(processes=min(os.cpu_count(),
                                          len(Rank.selected_ranks))) as pool:
                async_results = [
                    pool.apply_async(process_rank, args=[level], kwds=kwargs)
                    for level in Rank.selected_ranks
                ]
                for level, (smpls, abunds, accumulators, score) in zip(
                        Rank.selected_ranks,
                        [r.get() for r in async_results]):
                    samples.extend(smpls)
                    counts.update(abunds)
                    accs.update(accumulators)
                    scores.update(score)
        else:  # sequential processing of each selected rank
            for level in Rank.selected_ranks:
                (smpls, abunds,
                 accumulators, score) = process_rank(level, **kwargs)
                samples.extend(smpls)
                counts.update(abunds)
                accs.update(accumulators)
                scores.update(score)

    def summarize_samples():
        """Summary of samples in parallel by type of cross-analysis"""
        # timing initialization
        summ_start_time: float = time.perf_counter()
        print(gray('Please, wait. 
Generating summaries in parallel...'))
        # Update kwargs with more parameters for the following func calls
        kwargs.update({'samples': samples})
        # Get the ordered set of analyses to summarize (note pylint bug #776)
        # pylint: disable=unsubscriptable-object
        target_analysis: col.OrderedDict[str, None] = col.OrderedDict({
            f'{raw}_{study}': None
            for study in [STR_EXCLUSIVE, STR_CONTROL] for raw in raw_samples
            for smpl in samples if smpl.startswith(f'{raw}_{study}')
        })
        # pylint: enable=unsubscriptable-object
        # Add shared and control_shared analysis if they exist (are not void)
        for study in [STR_SHARED, STR_CONTROL_SHARED]:
            for smpl in samples:
                if smpl.startswith(study):
                    target_analysis[study] = None
                    break
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            with mpctx.Pool(
                    processes=min(os.cpu_count(), len(input_files))) as pool:
                async_results = [
                    pool.apply_async(summarize_analysis,
                                     args=[analysis],
                                     kwds=kwargs)
                    for analysis in target_analysis
                ]
                for analysis, (summary, abund, acc, score) in zip(
                        target_analysis, [r.get() for r in async_results]):
                    if summary:  # Avoid adding empty samples
                        summaries.append(summary)
                        counts[summary] = abund
                        accs[summary] = acc
                        scores[summary] = score
        else:  # sequential processing of each analysis
            for analysis in target_analysis:
                (summary, abund, acc,
                 score) = summarize_analysis(analysis, **kwargs)
                if summary:  # Avoid adding empty samples
                    summaries.append(summary)
                    counts[summary] = abund
                    accs[summary] = acc
                    scores[summary] = score
        # Timing results
        print(gray('Summary elapsed time:'),
              f'{time.perf_counter() - summ_start_time:.3g}', gray('sec'))

    def generate_krona():
        """Generate Krona plot with all the results via Krona 2.0 XML spec"""
        print(gray('\nBuilding the taxonomy multiple tree... '), end='')
        sys.stdout.flush()
        krona: KronaTree = KronaTree(
            samples,
            num_raw_samples=len(raw_samples),
            stats=stats,
            min_score=Score(
                min([
                    min(scores[sample].values()) for sample in samples
                    if len(scores[sample])
                ])),
            max_score=Score(
                max([
                    max(scores[sample].values()) for sample in samples
                    if len(scores[sample])
                ])),
            scoring=scoring,
        )
        polytree.grow(ontology=ncbi, abundances=counts, accs=accs,
                      scores=scores)
        print(green('OK!'))
        print(gray('Generating final plot (') + magenta(htmlfile) +
              gray(')... '), end='')
        sys.stdout.flush()
        polytree.toxml(ontology=ncbi, krona=krona)
        krona.tohtml(htmlfile, pretty=False)
        print(green('OK!'))

    def generate_excel():
        """Generate Excel with results via pandas DataFrame"""
        xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
        print(gray(f'Generating Excel {str(excel).lower()} summary (') +
              magenta(xlsx_name) + gray(')... 
'), end='')
        sys.stdout.flush()
        xlsxwriter = pd.ExcelWriter(xlsx_name)
        list_rows: List = []
        # Save raw samples basic statistics
        data_frame: pd.DataFrame = pd.DataFrame.from_dict(
            {raw: stats[raw].to_dict() for raw in raw_samples})
        data_frame.to_excel(xlsxwriter, sheet_name='_sample_stats')
        # Save taxid related statistics per sample
        if excel is Excel.FULL:
            polytree.to_items(ontology=ncbi, items=list_rows)
            # Generate the pandas DataFrame from items and export to Excel
            iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
            cols1 = pd.MultiIndex.from_product(iterable_1,
                                               names=['Samples', 'Stats'])
            iterable_2 = [['Details'], ['Rank', 'Name']]
            cols2 = pd.MultiIndex.from_product(iterable_2)
            cols = cols1.append(cols2)
            # Note: pd.DataFrame.from_items was removed in pandas 1.0, so
            # build the frame through from_dict instead
            data_frame = pd.DataFrame.from_dict(dict(list_rows),
                                                orient='index', columns=cols)
            data_frame.index.names = ['Id']
            data_frame.to_excel(xlsxwriter, sheet_name=str(excel))
        elif excel is Excel.CMPLXCRUNCHER:
            target_ranks: List = [Rank.NO_RANK]
            if args.controls:  # if controls, add specific sheet for rank
                target_ranks.extend(Rank.selected_ranks)
            for rank in target_ranks:  # Once for no rank dependency (NO_RANK)
                indexes: List[int]
                sheet_name: str
                columns: List[str]
                if args.controls:
                    indexes = [
                        i for i in range(len(raw_samples), len(samples))
                        # Check if sample ends in _(STR_CONTROL)_(rank)
                        if (STR_CONTROL in samples[i].split('_')[-2:]
                            and rank.name.lower() in samples[i].split('_')[-1:])
                    ]
                    sheet_name = f'{STR_CONTROL}_{rank.name.lower()}'
                    columns = [
                        samples[i].replace(
                            '_' + STR_CONTROL + '_' + rank.name.lower(), '')
                        for i in indexes
                    ]
                if rank is Rank.NO_RANK:  # No rank dependency
                    indexes = list(range(len(raw_samples)))
                    sheet_name = f'raw_samples_{rank.name.lower()}'
                    columns = raw_samples
                list_rows = []
                polytree.to_items(ontology=ncbi, items=list_rows,
                                  sample_indexes=indexes)
                data_frame = pd.DataFrame.from_dict(dict(list_rows),
                                                    orient='index',
                                                    columns=columns)
                data_frame.index.names = ['Id']
                data_frame.to_excel(xlsxwriter, sheet_name=sheet_name)
        else:
            raise Exception(red('\nERROR!'),
                            f'Unknown Excel option "{excel}"')
        xlsxwriter.save()
        print(green('OK!'))

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} - {__date__}'
          f' =-= by {__author__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    argparser = configure_parser()
    args = argparser.parse_args()
    outputs: List[Filename] = args.file
    reports: List[Filename] = args.report
    lmats: List[Filename] = args.lmat
    clarks: List[Filename] = args.clark
    input_files: List[Filename]
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    htmlfile: Filename = args.outhtml
    collapse: bool = not args.nokollapse
    excluding: Set[Id] = set(args.exclude)
    including: Set[Id] = set(args.include)
    scoring: Scoring = Scoring[args.scoring]
    excel: Excel = Excel[args.excel]

    check_debug()

    plasmidfile: Optional[Filename] = None
    classifier: Classifier
    process: Callable[..., Tuple[Sample, TaxTree, SampleDataById,
                                 SampleStats, Err]]
    select_inputs()
    check_controls()
    if not htmlfile:
        select_html_file()

    # Load NCBI nodes, names and build children
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, collapse,
                              excluding, including, args.debug)

    # If dummy flag enabled, just create dummy krona and exit
    if args.dummy:
        _debug_dummy_plot(ncbi, htmlfile, scoring)
        exit(0)

    # Declare variables that will hold results for the samples analyzed
    trees: Dict[Sample, TaxTree] = {}
    counts: Dict[Sample, Counter[Id]] = {}
    accs: Dict[Sample, Counter[Id]] = {}
    taxids: 
Dict[Sample, TaxLevels] = {} scores: Dict[Sample, Dict[Id, Score]] = {} stats: Dict[Sample, SampleStats] = {} samples: List[Sample] = [] raw_samples: List[Sample] = [] # Define dictionary of parameters for methods to be called (to be extended) kwargs = { 'controls': args.controls, 'ctrlminscore': (args.ctrlminscore if args.ctrlminscore is not None else args.minscore), 'ctrlmintaxa': (args.ctrlmintaxa if args.ctrlmintaxa is not None else args.mintaxa), 'debug': args.debug, 'root': args.takeoutroot, 'classifier': classifier, 'minscore': args.minscore, 'mintaxa': args.mintaxa, 'scoring': scoring, 'ontology': ncbi, } # The big stuff (done in parallel) read_samples() # Avoid cross analysis if just one report file or explicitly stated by flag if len(raw_samples) > 1 and not args.avoidcross: analyze_samples() if args.summary != 'avoid': summaries: List[Sample] = [] summarize_samples() if args.summary == 'only': samples = raw_samples + summaries else: samples.extend(summaries) # Final result generation is done in sequential mode polytree: MultiTree = MultiTree(samples=samples) generate_krona() if _USE_PANDAS: generate_excel() else: print(yellow('WARNING!'), 'Pandas not installed: Excel cannot be created.') # Timing results print(gray('Total elapsed time:'), time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
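# --- Illustrative sketch (not part of the pipeline) -------------------------
# The driver helpers above (read_samples, analyze_samples, summarize_samples)
# share one fan-out pattern: build a Pool from an explicit 'fork' context,
# submit one apply_async() job per work item, then zip the ordered inputs with
# the ordered results. This minimal, hedged example reproduces that pattern
# with a toy worker; the names _toy_worker and _example_fanout are
# hypothetical and not part of the package.
def _toy_worker(filename: str, is_ctrl: bool) -> str:
    """Stand-in for process()/process_rank(): just tags each input file."""
    return f'{filename}: {"ctrl" if is_ctrl else "sample"} OK'

def _example_fanout() -> None:
    import multiprocessing as mp
    import os

    files = ['ctrl.out', 'smpl1.out', 'smpl2.out']
    controls = 1  # like args.controls: the first N files are controls
    mpctx = mp.get_context('fork')  # as above; not available under Windows
    with mpctx.Pool(processes=min(os.cpu_count(), len(files))) as pool:
        async_results = [
            pool.apply_async(_toy_worker, args=[fil, num < controls])
            for num, fil in enumerate(files)]
        # apply_async results come back in submission order, so zip() pairs
        # each input with its result just like read_samples() does above
        for fil, res in zip(files, [r.get() for r in async_results]):
            print(fil, '->', res)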
def process_output( *args, **kwargs) -> Tuple[Sample, TaxTree, SampleDataById, SampleStats, Err]: """ Process classifiers output files (to be usually called in parallel!). """ # timing initialization start_time: float = time.perf_counter() # Recover input and parameters target_file: Filename = args[0] debug: bool = kwargs['debug'] is_ctrl: bool = args[1] if debug: print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'), target_file, gray('...')) sys.stdout.flush() ontology: Ontology = kwargs['ontology'] mintaxa: Optional[int] = (kwargs['ctrlmintaxa'] if is_ctrl else kwargs['mintaxa']) minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore'] including: Union[Tuple, Set[Id]] = ontology.including excluding: Union[Tuple, Set[Id]] = ontology.excluding scoring: Scoring = kwargs['scoring'] classifier: Classifier = kwargs['classifier'] genfmt: GenericFormat = kwargs['genfmt'] output: io.StringIO = io.StringIO(newline='') def vwrite(*args): """Print only if verbose/debug mode is enabled""" if kwargs['debug']: output.write(' '.join(str(item) for item in args)) sample: Sample = Sample(os.path.splitext(target_file)[0]) error: Err = Err.NO_ERROR # Read taxonomic classifier output files to get abundances read_method: Callable[ # Format: [[Input], Output] [Filename, Scoring, Optional[Score]], Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]] log: str stat: SampleStats counts: Counter[Id] scores: Dict[Id, Score] if classifier is Classifier.GENERIC: # Direct call to generic method log, stat, counts, scores = read_generic_output( target_file, scoring, minscore, genfmt) else: # Use read_method if classifier is Classifier.KRAKEN: read_method = read_kraken_output elif classifier is Classifier.CLARK: read_method = read_clark_output elif classifier is Classifier.LMAT: read_method = read_lmat_output elif classifier is Classifier.CENTRIFUGE: read_method = read_output else: raise Exception(red('\nERROR!'), f'taxclass: Unknown classifier "{classifier}".') log, stat, counts, scores = read_method(target_file, scoring, minscore) output.write(log) # Complete/Update fields in stats stat.is_ctrl = is_ctrl # set control nature of the sample if mintaxa is not None: # manual mintaxa has precedence over automatic stat.mintaxa = mintaxa else: # update local value with the automatically guessed value mintaxa = stat.mintaxa # Move cellular_organisms counts to root, in case if ontology.collapse and counts[CELLULAR_ORGANISMS]: vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS], gray('"CELLULAR_ORGANISMS" reads to "ROOT"... \n')) if counts[ontology.ROOT]: stat.decrease_filtered_taxids() scores[ontology.ROOT] = Score( (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS] + scores[ontology.ROOT] * counts[ontology.ROOT]) / (counts[CELLULAR_ORGANISMS] + counts[ontology.ROOT])) else: scores[ontology.ROOT] = scores[CELLULAR_ORGANISMS] counts[ontology.ROOT] += counts[CELLULAR_ORGANISMS] counts[CELLULAR_ORGANISMS] = 0 scores[CELLULAR_ORGANISMS] = NO_SCORE # Remove root counts, in case if kwargs['root'] and counts[ontology.ROOT]: vwrite(gray('Removing'), counts[ontology.ROOT], gray('"ROOT" reads... ')) stat.seq = stat.seq._replace(filt=stat.seq.filt - counts[ontology.ROOT]) stat.decrease_filtered_taxids() counts[ontology.ROOT] = 0 scores[ontology.ROOT] = NO_SCORE vwrite(green('OK!'), '\n') # Building ontology tree output.write( gray('Building from raw data with mintaxa = ') + f'{mintaxa:_d}' + gray(' ... \n')) vwrite(gray(' Building ontology tree with all-in-1... 
'))
    tree = TaxTree()
    ancestors: Set[Id]
    orphans: Set[Id]
    ancestors, orphans = ontology.get_ancestors(counts.keys())
    out = SampleDataById(['all'])
    tree.allin1(ontology=ontology, counts=counts, scores=scores,
                ancestors=ancestors, min_taxa=mintaxa,
                include=including, exclude=excluding, out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')
    # Stats: Complete final value for TaxIDs after tree building and folding
    final_taxids: int = len(out.counts) if out.counts is not None else 0
    stat.set_final_taxids(final_taxids)
    # Check for additional loss of reads (due to include/exclude and orphans)
    output.write(gray('  Check for more seqs lost ([in/ex]clude effects)... '))
    if out.counts is not None:
        discard: int = sum(counts.values()) - sum(out.counts.values())
        if discard:
            output.write(
                blue('\n  Info:') + f' {discard} ' +
                gray('additional seqs discarded (') +
                f'{discard/sum(counts.values()):.3%} ' +
                gray('of accepted)\n'))
        else:
            output.write(green('OK!\n'))
    else:
        output.write(red('No counts in sample tree!\n'))
    # Warn or give detailed stats about orphan taxids and orphan seqs
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('  Warning!'), gray('Orphan taxid'),
                       f'{orphan}\n')
                lost += counts[orphan]
            vwrite(
                yellow('  WARNING!'), f'{len(orphans)} orphan taxids ('
                f'{len(orphans)/len(counts):.2%} of accepted)\n'
                f'  and {lost} orphan sequences ('
                f'{lost/sum(counts.values()):.3%} of accepted)\n')
        else:
            vwrite(green('OK!\n'))
    elif orphans:
        output.write(
            yellow('\n  Warning!') + f' {len(orphans)} orphan taxids' +
            gray(' (rerun with --debug for details)\n'))
    # Check the removal of TaxIDs (accumulation of leaves in parents)
    if debug and not excluding and including == {ontology.ROOT}:
        vwrite(gray('  Assess accumulation due to "folding the tree"...\n'))
        migrated: int = 0
        if out.counts is not None:
            for taxid in counts:
                if out.counts[taxid] == 0:
                    migrated += 1
                    vwrite(
                        blue('  Info:'),
                        gray(f'Folded TaxID {taxid} (') +
                        f'{ontology.get_name(taxid)}' + gray(') with ') +
                        f'{counts[taxid]}' + gray(' original seqs\n'))
        if migrated:
            vwrite(
                blue('   INFO:'), f'{migrated} TaxIDs folded ('
                f'{migrated/len(+counts):.2%} of TAF, TaxIDs after filtering)'
                '\n')
            vwrite(
                blue('   INFO:'), f'Final assigned TaxIDs: {final_taxids} '
                f'(reduced to {final_taxids/len(+counts):.2%} of '
                'number of TAF)\n')
        else:
            vwrite(blue('   INFO:'), gray('No migration!'), green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ') +
                     green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE
    # Timing results
    output.write(
        gray('Load elapsed time: ') +
        f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
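# --- Illustrative sketch (not part of the pipeline) -------------------------
# When process_output() folds "cellular organisms" reads into the root taxon,
# the root score becomes the count-weighted mean of both scores. A minimal,
# hedged example with plain ints/floats standing in for the module's
# Counter/Score types (all names here are hypothetical):
def _example_weighted_fold() -> None:
    cell_counts, cell_score = 900, 120.0  # "cellular organisms" taxon
    root_counts, root_score = 100, 80.0   # "root" taxon
    if root_counts:
        # Count-weighted mean, exactly as in the block above
        root_score = ((cell_score * cell_counts + root_score * root_counts)
                      / (cell_counts + root_counts))
    else:  # nothing assigned to root yet: root inherits the score as-is
        root_score = cell_score
    root_counts += cell_counts
    print(root_counts, root_score)  # -> 1000 116.0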
def tohtml( self, filename: Filename, pretty: bool = False, ) -> None: """ Write Krona HTML. Args: filename: the name of the HTML output file. pretty: this parameter controls the layout of the XML code so that it is human readable for True (use for debug only because it uses a lot more of space and also has empty tags which are currently NOT SUPPORTED BY KRONA) and machine readable for False (default, saves space). Returns: None """ # Warn about use of pretty option if pretty: print( yellow(f'\nWARNING! Pretty XML uses empty tags which are' f' UNSUPPORTED by Krona-JS!')) print(yellow(f'WARNING! Prepare for unexpected HTML results!')) # Read aux files path = os.path.dirname(os.path.realpath(__file__)) with open(path + HIDDEN, 'r') as file: hidden_image = file.read() with open(path + LOADING, 'r') as file: loading_image = file.read() with open(path + FAVICON, 'r') as file: favicon = file.read() path_logo: str if self.chart == Chart.TAXOMIC: path_logo = path + LOGO_RCF elif self.chart == Chart.GENOMIC: path_logo = path + LOGO_RGF else: raise Exception(f'ERROR! Unknown Chart "{self.chart}"') with open(path_logo, 'r') as file: logo = file.read() with open(f'{path}/{JSLIB}', 'r') as file: script = file.read() # Set root of HTML doc html_root = ETree.Element( # type: ignore 'html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': 'en', 'lang': 'en'}) # Prepare HTML file head = self.sub(html_root, 'head') self.sub(head, 'meta', {'charset': 'utf-8'}) self.sub(head, 'link', {'rel': 'shortcut icon', 'href': favicon}) self.sub( head, 'link', { 'rel': 'stylesheet', 'href': 'https://fonts.googleapis.com/css?family=Ubuntu' }) self.sub(head, 'script', {'id': 'notfound'}, 'window.onload=function(){document.body.innerHTML=""}') self.sub(head, 'script', { 'language': 'javascript', 'type': 'text/javascript' }, script) # Include javascript body = self.sub(html_root, 'body') self.sub(body, 'img', { 'id': 'hiddenImage', 'src': hidden_image, 'style': 'display:none' }) self.sub(body, 'img', { 'id': 'loadingImage', 'src': loading_image, 'style': 'display:none' }) self.sub(body, 'img', { 'id': 'logo', 'src': logo, 'style': 'display:none' }) self.sub(body, 'noscript', None, 'Javascript must be enabled to view this page.') div = self.sub(body, 'div', {'style': 'display:none'}) div.append(self.krona) # Include specific XML from samples # Write the HTML file with open(filename, 'w') as html_file: html_file.write( '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n') # pylint: disable=line-too-long if pretty: html_file.write(self.to_pretty_string(html_root)) else: html_file.write( ETree.tostring( html_root, encoding='unicode', method='html', short_empty_elements=False, ))
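# --- Illustrative sketch (not part of the pipeline) -------------------------
# tohtml() above assembles the whole page as an ElementTree and serializes it
# with method='html', which writes explicit end tags for empty non-void
# elements (the self-closed form produced by the default XML serializer is
# what Krona-JS chokes on). This minimal, hedged example shows just that
# serialization difference; the helper name is hypothetical:
def _example_etree_html() -> None:
    import xml.etree.ElementTree as ETree

    root = ETree.Element('div', attrib={'style': 'display:none'})
    ETree.SubElement(root, 'script', {'type': 'text/javascript'})  # empty
    # Default XML serialization self-closes the empty tag: <script ... />
    print(ETree.tostring(root, encoding='unicode'))
    # The HTML method (with short_empty_elements=False, as passed above)
    # emits <script ...></script> instead, which the Krona JS expects
    print(ETree.tostring(root, encoding='unicode', method='html',
                         short_empty_elements=False))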
def read_output(
        output_file: Filename,
        scoring: Scoring = Scoring.SHEL,
        minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = output_line.split(
                        '\t')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                tid = Id(_tid)
                try:
                    # From the Centrifuge score, get the "single hit
                    # equivalent length" (SHEL)
                    shel = Score(float(_score)**0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(yellow('Failure'), f'parsing score ({_score}) for ',
                          f'query length {_length} for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid]) for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! 
') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, scores=all_scores,
        lens=all_length, seq_read=num_read, seq_unclas=num_uncl,
        seq_filt=filt_seqs, tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f}' +
        gray(', max = ') + f'{stat.sco.maxi:.1f}' + gray(', avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Length: min = ') + f'{stat.len.mini}' + gray(', max = ') +
        f'{stat.len.maxi}' + gray(', avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid])) for tid in all_scores}
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid])) for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'Centrifuge: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
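# --- Illustrative sketch (not part of the pipeline) -------------------------
# read_output() maps each raw Centrifuge hit score to a "single hit
# equivalent length" (SHEL = sqrt(score) + 15), keeps one list of SHELs per
# taxid, and finally reduces every list to a single per-taxid score. A
# minimal, hedged example of the SHEL and NORMA reductions with plain dicts
# in place of the module's Id/Score types (data values are made up):
def _example_centrifuge_scoring() -> None:
    from statistics import mean

    raw_hits = {'9606': [(4225.0, 120), (2025.0, 95)]}  # taxid: (score, len)
    all_scores = {tid: [s ** 0.5 + 15 for s, _ in hits]
                  for tid, hits in raw_hits.items()}
    all_length = {tid: [length for _, length in hits]
                  for tid, hits in raw_hits.items()}
    # Scoring.SHEL: mean of the transformed scores per taxid
    shel = {tid: mean(scores) for tid, scores in all_scores.items()}
    # Scoring.NORMA: mean score normalized by mean read length, as percent
    norma = {tid: mean(all_scores[tid]) / mean(all_length[tid]) * 100
             for tid in all_scores}
    print(shel)   # {'9606': 70.0}: mean of sqrt(4225)+15=80, sqrt(2025)+15=60
    print(norma)  # {'9606': 65.11...}: 70.0 / 107.5 * 100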
def read_generic_output(
        output_file: Filename,
        scoring: Scoring = Scoring.GENERIC,
        minscore: Optional[Score] = None,
        genfmt: Optional[GenericFormat] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read an output file from a generic classifier

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
        genfmt: GenericFormat object specifying the file format

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    # Initialization of variables
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    # Check format
    if not isinstance(genfmt, GenericFormat):
        raise Exception(red('\nERROR!'),
                        'Missing GenericFormat when reading a generic output.')
    try:
        with open(output_file, 'r') as file:
            # Main loop processing each file line
            for raw_line in file:
                raw_line = raw_line.strip(' \n\t')
                splitting: str
                if genfmt.typ is GenericType.CSV:
                    splitting = ','
                elif genfmt.typ is GenericType.TSV:
                    splitting = '\t'
                elif genfmt.typ is GenericType.SSV:
                    splitting = ' '
                else:
                    raise Exception(f'ERROR! Unknown GenericType {genfmt.typ}')
                output_line: List[str] = raw_line.split(splitting)
                if len(output_line) < GenericFormat.MIN_COLS:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Do not count the header as an error
                    raise Exception(
                        red('\nERROR!') + ' Line ' + yellow(f'{output_line}')
                        + '\n\tin ' + yellow(f'{output_file}') + ' has < ' +
                        blue(f'{GenericFormat.MIN_COLS}') + ' required ' +
                        'columns.\n\tPlease check the file.')
                try:
                    tid: Id = Id(output_line[genfmt.tid - 1].strip(' "'))
                    length: int = int(output_line[genfmt.len - 1].strip(' "'))
                    if tid == genfmt.unc:  # Avoid read score for unclass reads
                        num_read += 1
                        nt_read += length
                        num_uncl += 1
                        continue
                    score: Score = Score(
                        float(output_line[genfmt.sco - 1].strip(' "')))
                except ValueError:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Do not count the header as a failure
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    if num_read > 100 and num_errors > 0.5 * num_read:
                        print(red('ERROR!'),
                              'Unreliable file processing: rate of problematic'
                              f' reads is {num_errors/num_read:.1%}, beyond'
                              ' 50%, after 100 reads. Please check the format '
                              f'of the file "{output_file}".')
                        raise
                    else:
                        continue
                num_read += 1
                nt_read += length
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and score < minscore:
                    continue  # Discard read if low confidence
                try:
                    all_scores[tid].append(score)
                except KeyError:
                    all_scores[tid] = [score, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid]) for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, seq_read=num_read, seq_unclas=num_uncl,
        seq_filt=filt_seqs, tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' +
        gray(' max = ') + f'{stat.len.maxi},' + gray(' avr = ') +
        f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.GENERIC:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid])) for tid in all_scores}
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid])) for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        raise Exception(red('\nERROR!'),
                        f'Generic: Unsupported Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores
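# --- Illustrative sketch (not part of the pipeline) -------------------------
# read_generic_output() locates the taxid, length, and score fields by the
# 1-based column numbers declared in a GenericFormat, strips quotes and
# blanks, and treats a configurable token as "unclassified". A minimal,
# hedged example of that per-line logic with a toy TSV layout; the plain dict
# stands in for a real GenericFormat object, so its keys are assumptions:
def _example_generic_parse() -> None:
    genfmt = {'sep': '\t', 'tid': 1, 'len': 2, 'sco': 3, 'unc': '0'}
    lines = ['"562"\t150\t87.5', '0\t90\t0.0']  # classified + unclassified
    for raw_line in lines:
        cols = raw_line.strip(' \n\t').split(genfmt['sep'])
        tid = cols[genfmt['tid'] - 1].strip(' "')  # 1-based -> 0-based index
        length = int(cols[genfmt['len'] - 1].strip(' "'))
        if tid == genfmt['unc']:  # unclassified: count the read, skip score
            print(f'unclassified read of {length} nt')
            continue
        score = float(cols[genfmt['sco'] - 1].strip(' "'))
        print(f'taxid {tid}: length {length}, score {score}')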