def select_clark_inputs(clarks: List[Filename], ext: str = '.csv') -> None:
    """Replace a directory entry with the CLARK files found inside it.

    The first element of ``clarks`` is read as a directory name. The list is
    then emptied and refilled, in place, with the non-hidden entries of that
    directory whose name ends with ``ext``, sorted, and the result is printed.

    Args:
        clarks: list whose first element is the directory to scan; it is
            modified in place.
        ext: suffix that an entry name must have to be selected.
    """
    folder = clarks[0]
    clarks.clear()
    # For the current directory the bare name is kept, which avoids sample
    # names starting with just the dot
    keep_bare_name = folder == '.'
    with os.scandir(folder) as entries:
        for entry in entries:
            name = entry.name
            if name.startswith('.') or not name.endswith(ext):
                continue
            clarks.append(Filename(
                name if keep_bare_name else os.path.join(folder, name)))
    clarks.sort()
    print(gray(f'CLARK {ext} files to analyze:'), clarks)
def select_kraken_inputs(krakens: List[Filename], ext: str = '.krk') -> None:
    """Replace a directory entry with the Kraken files found inside it.

    The first element of ``krakens`` is taken as the directory to scan. The
    list is cleared and then filled, in place, with the sorted non-hidden
    entries of that directory whose name ends with ``ext``; the final list
    is printed.

    Args:
        krakens: list whose first element is the directory to scan; it is
            modified in place.
        ext: suffix that an entry name must have to be selected.
    """
    source_dir = krakens[0]
    krakens.clear()
    with os.scandir(source_dir) as scan:
        names = [item.name for item in scan
                 if not item.name.startswith('.') and item.name.endswith(ext)]
    if source_dir != '.':
        # The directory is prepended except for '.', which avoids sample
        # names starting with just the dot
        names = [os.path.join(source_dir, name) for name in names]
    krakens.extend(Filename(name) for name in names)
    krakens.sort()
    print(gray(f'Kraken {ext} files to analyze:'), krakens)
def select_centrifuge_inputs(outputs: List[Filename],
                             ext: str = '.out') -> None:
    """Replace a directory entry with the Centrifuge files found inside it.

    The first element of ``outputs`` is taken as the directory to scan. The
    list is cleared and refilled, in place, with the sorted non-hidden
    entries of that directory whose name ends with ``ext``; the final list
    is printed.

    Args:
        outputs: list whose first element is the directory to scan; it is
            modified in place.
        ext: suffix that an entry name must have to be selected.
    """
    base = outputs[0]
    outputs.clear()

    def sample_name(name: str) -> Filename:
        """Build the sample filename for an entry of the scanned directory"""
        if base == '.':
            # Avoid sample names starting with just the dot
            return Filename(name)
        return Filename(os.path.join(base, name))

    with os.scandir(base) as listing:
        outputs.extend(
            sample_name(item.name) for item in listing
            if item.name.endswith(ext) and not item.name.startswith('.'))
    outputs.sort()
    print(gray(f'Centrifuge {ext} files to analyze:'), outputs)
def select_inputs(): """Choose right classifier, input and output files""" nonlocal process, scoring, input_files, plasmidfile, classifier if reports: classifier = Classifier.KRAKEN process = process_report input_files = reports elif clarks: classifier = Classifier.CLARK process = process_output input_files = clarks if len(clarks) == 1 and os.path.isdir(clarks[0]): select_clark_inputs(clarks) elif lmats: classifier = Classifier.LMAT scoring = Scoring.LMAT process = process_output input_files = lmats plasmidfile = Filename(os.path.join(args.nodespath, PLASMID_FILE)) select_lmat_inputs(lmats) elif outputs: classifier = Classifier.CENTRIFUGE process = process_output input_files = outputs if len(outputs) == 1 and os.path.isdir(outputs[0]): select_centrifuge_inputs(outputs)
def krona_from_text(
        samples: List[Sample],
        outputs: Dict[Rank, Filename],
        htmlfile: Filename = Filename('Output' + HTML_SUFFIX),
):
    """Generate the Krona html file calling ktImportText.

    Superseded by krona.krona_from_xml().

    Args:
        samples: items appended first to the ktImportText command line.
        outputs: per-rank collection whose items are appended after the
            samples, following the order of Rank.selected_ranks.
        htmlfile: name of the html file passed to ktImportText with -o.

    A non-zero exit status of ktImportText is reported with a printed error
    message; it is not raised to the caller.
    """
    subprc = ["ktImportText"]
    subprc.extend(samples)
    try:
        # The list is fully built before extending, so a missing rank leaves
        # the command line without any per-rank item.
        # NOTE(review): that drops the items of every rank, not only the
        # missing one -- confirm this is the intended behaviour.
        subprc.extend([
            item
            for level in Rank.selected_ranks
            for item in outputs[level]
        ])
    except KeyError:
        pass
    subprc.extend(["-o", htmlfile])
    try:
        subprocess.run(subprc, check=True)
    except subprocess.CalledProcessError:
        print('\n\033[91mERROR!\033[0m ktImportText: ' +
              'returned a non-zero exit status (Krona plot built failed)')
def select_html_file(): """HTML filename selection""" nonlocal htmlfile if lmats: # Select case for dir name or filename prefix if os.path.isdir(lmats[0]): # Dir name dirname = os.path.dirname(os.path.normpath(lmats[0])) if not dirname or dirname == '.': basename = 'output' else: basename = os.path.basename(dirname) else: # Explicit path and file name prefix is provided dirname, basename = os.path.split(lmats[0]) htmlfile = Filename(os.path.join(dirname, basename + HTML_SUFFIX)) elif reports: htmlfile = Filename(reports[0].split('_mhl')[0] + HTML_SUFFIX) else: htmlfile = Filename(outputs[0].split('_mhl')[0] + HTML_SUFFIX)
def by_excel_file(dirname: Filename = None) -> None:
    """Do the job in case of Excel file with all the details.

    Every column of the Excel file (except 'RECENTRIFUGE MOCK') is a mock
    layout that produces one '<column name>.out' file in ``dirname``.

    Args:
        dirname: directory for the generated files; if None, the directory
            part of the Excel file path is used.
    """
    if dirname is None:
        dirname = Filename(os.path.dirname(xcel))
    if dirname:
        # An empty dirname means the current directory: nothing to create,
        # and os.makedirs('') would raise FileNotFoundError
        os.makedirs(dirname, exist_ok=True)
    # Expected index (taxids) in column after taxa name, and last row will
    #  be removed (reserved for sum of reads in Excel file)
    mock_df = pd.read_excel(xcel, index_col=1, skipfooter=1, dtype=str)
    del mock_df['RECENTRIFUGE MOCK']
    vprint(gray('Layout to generate the mock files:\n'), mock_df, '\n')
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed
    for name, series in mock_df.items():
        mock_layout: Counter[Id] = col.Counter(series.to_dict(dict))
        # In prev, series.to_dict(col.Counter) fails, so this is workaround
        test: Filename = Filename(os.path.join(dirname, name + '.out'))
        if file:
            mock_from_source(test, mock_layout)
        else:
            mock_from_scratch(test, mock_layout)
def select_lmat_inputs(lmats: List[Filename]) -> None:
    """Expand the '.' shortcut into the list of LMAT sample subdirectories.

    When ``lmats`` is exactly ['.'], it is emptied and refilled, in place,
    with the sorted names of the non-hidden subdirectories of the current
    directory, leaving out the one named like the base name of TAXDUMP_PATH.
    Any other list is left untouched. The resulting list is printed.

    Args:
        lmats: list of LMAT inputs; it may be modified in place.
    """
    if lmats == ['.']:
        lmats.clear()
        with os.scandir() as entries:
            lmats.extend(
                Filename(item.name) for item in entries
                if not item.name.startswith('.')
                and item.is_dir()
                and item.name != os.path.basename(TAXDUMP_PATH))
        lmats.sort()
    print(gray('LMAT subdirs to analyze:'), lmats)
def by_mock_files() -> None:
    """Generate one mock output file per mock layout file.

    A single directory in ``args.mock`` is first expanded to the '.mck'
    files it contains. Each layout is then read and written to a file named
    as the layout up to its first '.mck', plus '.out'; the reads are copied
    from a source file when ``args.file`` is set, or made from scratch.
    """
    layouts = args.mock
    if len(layouts) == 1 and os.path.isdir(layouts[0]):
        select_centrifuge_inputs(layouts, ext='.mck')
    generate = mock_from_source if args.file else mock_from_scratch
    for mock in layouts:
        mock_layout: Counter[Id] = read_mock_files(mock)
        target: Filename = Filename(mock.partition('.mck')[0] + '.out')
        generate(target, mock_layout)
def krona_from_xml(
        xmlfile: Filename,
        htmlfile: Filename = Filename('Output' + HTML_SUFFIX),
):
    """Run ktImportXML to build the Krona html file.

    Args:
        xmlfile: XML file passed as input to ktImportXML.
        htmlfile: name of the html file passed to ktImportXML with -o.

    A non-zero exit status of ktImportXML is reported with a printed error
    message; it is not raised to the caller.
    """
    command = ["ktImportXML", xmlfile, "-o", htmlfile]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        print('\n\033[91mERROR!\033[0m ktImportXML: '
              'returned a non-zero exit status (Krona plot built failed)')
def format_filename(fastq: Filename) -> Filename:
    """Auxiliary function to properly format the output filenames.

    The extension of the input name is replaced by '_rxtr', followed by
    '_incl' and/or '_excl' with the corresponding taxids joined by '_', and
    by the '.fastq' extension.

    Args:
        fastq: Complete filename of the fastq input file

    Returns:
        Filename of the rextracted fastq output file
    """
    fastq_filename, _ = os.path.splitext(fastq)
    output_list: List[str] = [fastq_filename, '_rxtr']
    # Taxids are sorted so the name does not depend on the iteration order
    #  of the collections, which is not stable between runs for sets of str
    if including:
        output_list.append('_incl')
        output_list.append('_'.join(sorted(including)))
    if excluding:
        output_list.append('_excl')
        output_list.append('_'.join(sorted(excluding)))
    output_list.append('.fastq')
    return Filename(''.join(output_list))
def select_inputs(): """Choose right input and output files""" nonlocal process, scoring, input_files, plasmidfile if outputs and len(outputs) == 1 and os.path.isdir(outputs[0]): select_centrifuge_inputs(outputs) if lmats: plasmidfile = Filename(os.path.join(args.nodespath, PLASMID_FILE)) select_lmat_inputs(lmats) # Select method and arguments depending on type of files to analyze if lmats: process = process_output input_files = lmats scoring = Scoring.LMAT elif reports: process = process_report input_files = reports else: process = process_output input_files = outputs
def read_lmat_output(
        output_file: Filename,
        scoring: Scoring = Scoring.LMAT,
        minscore: Score = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read LMAT output (iterate over all the output files)

    Args:
        output_file: output file name (prefix) or directory name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, sample statistics, abundances counter, scores dict

    Raises:
        Exception: if no output file is found or can be read, if no
            sequence is read, if no sequence passes the filter, or if the
            scoring is not supported.

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    nt_read: int = 0
    matchings: Counter[Match] = Counter()
    output_files: List[Filename] = []
    # Select files to process depending on if the output files are explicitly
    #  given or directory name is provided (all the output files there)
    if os.path.isdir(output_file):  # Just the directory name is provided
        dirname = os.path.normpath(output_file)
        for file in os.listdir(dirname):  # Add all LMAT output files in dir
            if ('_output' in file and file.endswith('.out')
                    and 'canVfin' not in file and 'pyLCA' not in file):
                output_files.append(Filename(file))
    else:  # Explicit path and file name prefix is given
        dirname, basename = os.path.split(output_file)
        # A bare prefix has an empty dirname; os.listdir('') would raise
        #  FileNotFoundError, so the current directory is listed instead
        for file in os.listdir(dirname or '.'):  # Add selected output files
            if (file.startswith(basename) and file.endswith('.out')
                    and 'canVfin' not in file and 'pyLCA' not in file):
                output_files.append(Filename(file))
    if not output_files:
        raise Exception(
            f'\n\033[91mERROR!\033[0m Cannot read from "{output_file}"')
    # Read LMAT output files
    for output_name in output_files:
        path: Filename = Filename(os.path.join(dirname, output_name))
        output.write(f'\033[90mLoading output file {path}...\033[0m')
        try:
            with open(path, 'r') as io_file:
                for seq in SeqIO.parse(io_file, "lmat"):
                    tid: Id = seq.annotations['final_taxid']
                    score: Score = seq.annotations['final_score']
                    match: Match = Match.lmat(seq.annotations['final_match'])
                    # Every read counts for the matching and nt statistics,
                    #  even if it is filtered out just below
                    matchings[match] += 1
                    length: int = len(seq)
                    nt_read += length
                    if minscore is not None and score < minscore:
                        continue  # Ignore read if low score
                    # NOTE(review): here the members are spelled DIRECTMATCH
                    #  and MULTIMATCH, while the statistics below use DIRECT
                    #  and MULTI -- confirm that Match defines both spellings
                    if match in [Match.DIRECTMATCH, Match.MULTIMATCH]:
                        all_scores.setdefault(tid, []).append(score)
                        all_length.setdefault(tid, []).append(length)
        except FileNotFoundError as err:
            raise Exception(
                red('\nERROR!') + f'Cannot read "{path}"') from err
        output.write(green('OK!\n'))
    abundances: Counter[Id] = Counter(
        {tid: len(scores) for tid, scores in all_scores.items()})
    # Basic output statistics
    read_seqs: int = sum(matchings.values())
    if read_seqs == 0:
        raise Exception(
            red('\nERROR! ') + f'Cannot read any sequence from"{output_file}"')
    filt_seqs: int = sum(len(scores) for scores in all_scores.values())
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, scores=all_scores,
        lens=all_length, seq_read=read_seqs, seq_filt=filt_seqs,
        seq_clas=matchings[Match.DIRECT] + matchings[Match.MULTI])
    output.write(
        gray(' Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
        + f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray(' Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray(' Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    multi_rel: float = matchings[Match.MULTI] / read_seqs
    direct_rel: float = matchings[Match.DIRECT] / read_seqs
    nodbhits_rel: float = matchings[Match.NODBHITS] / read_seqs
    tooshort_rel: float = matchings[Match.READTOOSHORT] / read_seqs
    lowscore_rel: float = matchings[Match.LOWSCORE] / read_seqs
    output.write(f'\033[90m DB Matching: '
                 f'Multi =\033[0m {multi_rel:.1%}\033[90m '
                 f'Direct =\033[0m {direct_rel:.1%}\033[90m '
                 f'ReadTooShort =\033[0m {tooshort_rel:.1%}\033[90m '
                 f'LowScore =\033[0m {lowscore_rel:.1%}\033[90m '
                 f'NoDbHits =\033[0m {nodbhits_rel:.1%}\033[90m\n')
    output.write(
        gray(' Scores: min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' +
        gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray(' Length: min = ') + f'{stat.len.mini},' +
        gray(' max = ') + f'{stat.len.maxi},' +
        gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(f' {stat.num_taxa}' + gray(' taxa with assigned reads\n'))
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.LMAT:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    else:
        print(red('ERROR!'), f' LMAT: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, abundances, out_scores
def generate_mock(
        ncbi: Taxonomy,
        file: Filename,
        rnd: int,
        mocks: List[Filename],
        xcel: Filename,
        debug: bool,
):
    """Generate mock Centrifuge output files.

    The layouts (number of reads per taxid) come from mock layout files,
    from an Excel file or, if neither is given, from the test Excel file
    located beside this module (test mode).

    Args:
        ncbi: taxonomy used to get the name and rank of each taxid.
        file: Centrifuge output file to copy reads from; if it is not set,
            the reads are generated from scratch.
        rnd: the random hit lengths are drawn from rnd + 1 upwards.
        mocks: mock layout files, or a single directory containing them.
        xcel: Excel file with one mock layout per column.
        debug: if set, progress details are printed.
    """

    def vprint(*args):
        """Print only if verbose/debug mode is enabled"""
        if debug:
            print(*args, end='')
            sys.stdout.flush()

    def read_mock_files(mock: Filename) -> Counter[Id]:
        """Read a mock layout (.mck) file"""
        mock_layout: Counter[Id] = col.Counter()
        with open(mock, 'r') as mck:
            vprint(gray('\nProcessing'), blue(mock), gray('file:\n'))
            for line in mck:
                # Comments are skipped, and so are blank lines, which would
                #  otherwise break the unpacking below
                if line.startswith('#') or not line.strip():
                    continue
                _tid, _num = line.split('\t')
                tid = Id(_tid)
                num = int(_num)
                mock_layout[tid] = num
                vprint(num, gray('\treads for taxid\t'), tid, '\t(',
                       cyan(ncbi.get_name(tid)), ')\n')
        return mock_layout

    def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
        """Generate a mock Centrifuge output file from source file"""
        with open(out, 'w') as fout, open(file) as fcfg:
            vprint(gray('Generating'), blue(out), gray('file... '))
            fout.write(fcfg.readline())  # copy cfg output file header
            reads_writen: int = 0
            for line in fcfg:
                tid = Id(line.split('\t')[2])
                if mock_layout[tid]:
                    fout.write(line)
                    mock_layout[tid] -= 1
                    reads_writen += 1
                    # Stop reading as soon as every taxid is complete
                    if not sum(mock_layout.values()):
                        vprint(reads_writen, 'reads', green('OK!\n'))
                        break
        if sum(mock_layout.values()):
            print(red('ERROR!\n'))
            print(gray('Incomplete read copy by taxid:'))
            mock_layout = +mock_layout  # Delete zero counts elements
            for tid in mock_layout:
                print(yellow(mock_layout[tid]), gray('reads missing for tid'),
                      tid, '(', cyan(ncbi.get_name(tid)), ')\n')

    def mock_from_scratch(out: Filename, mock_layout: Counter[Id]) -> None:
        """Generate a mock Centrifuge output file from scratch"""
        with open(out, 'w') as fout:
            vprint(gray('Generating'), blue(out), gray('file... '))
            fout.write('readID\tseqID\ttaxID\tscore\t2ndBestScore\t'
                       'hitLength\tqueryLength\tnumMatches\n')
            reads_writen: int = 0
            for numtid in mock_layout:
                tid = Id(numtid)  # Convert to Id the excel integer
                maxhl: int = random.randint(rnd + 1, MAX_HIT_LENGTH)
                rank: str = str(ncbi.get_rank(tid)).lower()
                for _ in range(int(mock_layout[numtid])):
                    hit_length = random.randint(rnd + 1, maxhl)
                    fout.write(f'test{reads_writen}\t{rank}\t'
                               f'{tid}\t{(hit_length - 15) ** 2}\t'
                               f'0\t{hit_length}\t{MAX_HIT_LENGTH}\t1\n')
                    reads_writen += 1
            vprint(reads_writen, 'reads', green('OK!\n'))
        if out == TEST_REXT_SMPL:  # Test mode: create mock FASTQ for smpl
            mock_fastq(reads_writen)

    def by_mock_files() -> None:
        """Do the job in case of mock files"""
        if len(mocks) == 1 and os.path.isdir(mocks[0]):
            select_centrifuge_inputs(mocks, ext='.mck')
        for mock in mocks:
            mock_layout: Counter[Id] = read_mock_files(mock)
            test: Filename = Filename(mock.split('.mck')[0] + '.out')
            if file:
                mock_from_source(test, mock_layout)
            else:
                mock_from_scratch(test, mock_layout)

    def by_excel_file(dirname: Filename = None) -> None:
        """Do the job in case of Excel file with all the details"""
        if dirname is None:
            dirname = Filename(os.path.dirname(xcel))
        if dirname:
            # An empty dirname means the current directory: nothing to
            #  create, and os.makedirs('') would raise FileNotFoundError
            os.makedirs(dirname, exist_ok=True)
        # Expected index (taxids) in column after taxa name, and last row will
        #  be removed (reserved for sum of reads in Excel file)
        mock_df = pd.read_excel(xcel, index_col=1, skipfooter=1, dtype=str)
        del mock_df['RECENTRIFUGE MOCK']
        vprint(gray('Layout to generate the mock files:\n'), mock_df, '\n')
        # DataFrame.items() replaces iteritems(), which pandas 2.0 removed
        for name, series in mock_df.items():
            # NOTE(review): the cells are read with dtype=str, and only
            #  mock_from_scratch() converts the counts with int() -- confirm
            #  that mock_from_source() gets numeric counts in this path
            mock_layout: Counter[Id] = col.Counter(series.to_dict(dict))
            # In prev, series.to_dict(col.Counter) fails, so this is workaround
            test: Filename = Filename(os.path.join(dirname, name + '.out'))
            if file:
                mock_from_source(test, mock_layout)
            else:
                mock_from_scratch(test, mock_layout)

    def mock_fastq(num_reads: int) -> None:
        """Write a mock FASTQ file with num_reads reads (test mode)"""

        def fastq_seqs(alphabet=single_letter_alphabet):
            """Generator function that creates mock fastq sequences """
            for seq in range(num_reads):
                yield SeqRecord(Seq('AGTC', alphabet),
                                id=f'test{seq}',
                                name=f'test{seq}',
                                description=f'test{seq}',
                                annotations={'quality': '@@@@'})

        print(gray('Writing'), magenta(f'{num_reads}'), gray('reads in'),
              TEST_REXT_FSTQ, gray('...'), end='', flush=True)
        SeqIO.write(fastq_seqs(), TEST_REXT_FSTQ, 'quickfastq')
        print(green(' OK!'))

    if mocks:
        by_mock_files()
    elif xcel:
        by_excel_file()
    else:  # Test mode
        path = os.path.dirname(os.path.realpath(__file__))
        xcel = Filename(os.path.join(path, TEST_MOCK_XLSX))
        vprint(gray('Test mode! Processing'), xcel, '\n')
        random.seed(18490)  # Fixed seed: reproducible test files
        by_excel_file(dirname=TEST_OUTPUT_DIR)
def main():
    """Main entry point to script.

    Parses the command line, selects the taxids to search for from the
    taxonomy and the include/exclude options, collects the reads of the
    Centrifuge output file assigned to those taxids (and passing the
    minimum score, if given), and writes the matching reads of the FASTQ
    file(s) to new FASTQ file(s) named by format_filename().
    """
    # Argument Parser Configuration
    parser = argparse.ArgumentParser(
        description='Extract reads following Centrifuge/Kraken output',
        epilog=f'%(prog)s  - {__author__} - {__date__}')
    parser.add_argument(
        '-V', '--version',
        action='version',
        version=f'%(prog)s release {__version__} ({__date__})')
    parser.add_argument(
        '-f', '--file',
        action='store',
        metavar='FILE',
        required=True,
        help='Centrifuge output file.')
    parser.add_argument(
        '-l', '--limit',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help=('Limit of FASTQ reads to extract. '
              'Default: no limit'))
    parser.add_argument(
        '-m', '--maxreads',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help=('Maximum number of FASTQ reads to search for the taxa. '
              'Default: no maximum'))
    parser.add_argument(
        '-n', '--nodespath',
        action='store',
        metavar='PATH',
        default=TAXDUMP_PATH,
        help=('path for the nodes information files (nodes.dmp and names.dmp' +
              ' from NCBI'))
    parser.add_argument(
        '-i', '--include',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath ' +
              '(multiple -i is available to include several taxid). ' +
              'By default all the taxa is considered for inclusion.'))
    parser.add_argument(
        '-x', '--exclude',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath ' +
              '(multiple -x is available to exclude several taxid)'))
    parser.add_argument(
        '-y', '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    filein = parser.add_mutually_exclusive_group(required=True)
    filein.add_argument(
        '-q', '--fastq',
        action='store',
        metavar='FILE',
        default=None,
        help='Single FASTQ file (no paired-ends)')
    filein.add_argument(
        '-1', '--mate1',
        action='store',
        metavar='FILE',
        default=None,
        help='Paired-ends FASTQ file for mate 1s '
             '(filename usually includes _1)')
    parser.add_argument(
        '-2', '--mate2',
        action='store',
        metavar='FILE',
        default=None,
        help='Paired-ends FASTQ file for mate 2s '
             '(filename usually includes _2)')

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    args = parser.parse_args()
    # The exclusive group only relates -q and -1, so the mates are checked
    #  here: with just one of them the FASTQ selection below would end up
    #  trying to open None
    if bool(args.mate1) != bool(args.mate2):
        parser.error('paired-ends mode needs both -1/--mate1 and -2/--mate2')
    output_file = args.file
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    excluding: Set[TaxId] = set(args.exclude)
    including: Set[TaxId] = set(args.include)
    fastq_1: Filename
    fastq_2: Filename = args.mate2
    if not fastq_2:
        fastq_1 = args.fastq
    else:
        fastq_1 = args.mate1

    # Load NCBI nodes, names and build children
    plasmidfile: Filename = None
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile,
                              False, excluding, including)

    # Build taxonomy tree
    print(gray('Building taxonomy tree...'), end='')
    sys.stdout.flush()
    tree = TaxTree()
    tree.grow(taxonomy=ncbi, look_ancestors=False)
    print(green(' OK!'))

    # Get the taxa
    print(gray('Filtering taxa...'), end='')
    sys.stdout.flush()
    ranks: Ranks = Ranks({})
    tree.get_taxa(ranks=ranks, include=including, exclude=excluding)
    print(green(' OK!'))
    taxids: Set[TaxId] = set(ranks)
    taxlevels: TaxLevels = Rank.ranks_to_taxlevels(ranks)
    num_taxlevels = Counter({rank: len(taxlevels[rank]) for rank in taxlevels})
    num_taxlevels = +num_taxlevels  # Delete zero counts elements

    # Statistics about including taxa
    print(f' {len(taxids)}\033[90m taxid selected in \033[0m', end='')
    print(f'{len(num_taxlevels)}\033[90m different taxonomical levels:\033[0m')
    for rank in num_taxlevels:
        print(f' Number of different {rank}: {num_taxlevels[rank]}')
    assert taxids, red('ERROR! No taxids to search for!')

    # Get the records
    records: List[SeqRecord] = []
    num_seqs: int = 0
    # timing initialization
    start_time_load: float = time.perf_counter()
    print(gray(f'Loading output file {output_file}...'), end='')
    sys.stdout.flush()
    try:
        # Plain 'r' is used: the 'U' flag was removed in Python 3.11, and
        #  text mode already translates newlines
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            # Counting from 1 leaves in num_seqs the number of records read
            for num_seqs, record in enumerate(
                    SeqIO.parse(file, 'centrifuge'), start=1):
                tid: TaxId = record.annotations['taxID']
                if tid not in taxids:
                    continue
                # Ignore read if low confidence
                score: Score = Score(record.annotations['score'])
                if args.minscore is not None and score < args.minscore:
                    continue
                records.append(record)
    except FileNotFoundError as err:
        raise Exception(
            red('ERROR!') + 'Cannot read "' + output_file + '"') from err
    print(green(' OK!'))

    # Basic records statistics
    print(
        gray(' Load elapsed time: ') +
        f'{time.perf_counter() - start_time_load:.3g}' + gray(' sec'))
    # An output file without records must not break the ratio
    matching_ratio: float = len(records) / num_seqs if num_seqs else 0.0
    print(f' \033[90mMatching reads: \033[0m{len(records):_d} \033[90m\t'
          f'(\033[0m{matching_ratio:.4%}\033[90m of sample)')
    sys.stdout.flush()

    # FASTQ sequence dealing
    # A set is used for the ids: each FASTQ record needs a membership test
    records_ids: Set[str] = {record.id for record in records}
    seqs1: List[SeqRecord] = []
    seqs2: List[SeqRecord] = []
    extracted: int = 0
    i: int = 0
    if fastq_2:
        print(
            f'\033[90mLoading FASTQ files {fastq_1} and {fastq_2}...\n'
            f'Mseqs: \033[0m', end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1, open(fastq_2, 'r') as file2:
                for i, (rec1, rec2) in enumerate(
                        zip(SeqIO.parse(file1, 'quickfastq'),
                            SeqIO.parse(file2, 'quickfastq'))):
                    # Stop when every id is found or a limit is reached
                    if not records_ids or (
                            args.maxreads and i >= args.maxreads
                    ) or (args.limit and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        seqs2.append(rec2)
                        extracted += 1
        except FileNotFoundError as err:
            raise Exception(
                '\n\033[91mERROR!\033[0m Cannot read FASTQ files') from err
    else:
        print(f'\033[90mLoading FASTQ files {fastq_1}...\n'
              f'Mseqs: \033[0m', end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1:
                for i, rec1 in enumerate(SeqIO.parse(file1, 'quickfastq')):
                    # Stop when every id is found or a limit is reached
                    if not records_ids or (
                            args.maxreads and i >= args.maxreads
                    ) or (args.limit and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        extracted += 1
        except FileNotFoundError as err:
            raise Exception(
                '\n\033[91mERROR!\033[0m Cannot read FASTQ file') from err
    print(cyan(f' {i/1e+6:.3g} Mseqs'), green('OK! '))

    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        Args:
            fastq: Complete filename of the fastq input file

        Returns:
            Filename of the rextracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        # Taxids are sorted so the name does not depend on the iteration
        #  order of the sets, which is not stable between runs
        if including:
            output_list.append('_incl')
            output_list.append('_'.join(sorted(including)))
        if excluding:
            output_list.append('_excl')
            output_list.append('_'.join(sorted(excluding)))
        output_list.append('.fastq')
        return Filename(''.join(output_list))

    filename1: Filename = format_filename(fastq_1)
    SeqIO.write(seqs1, filename1, 'quickfastq')
    print(gray('Wrote'), magenta(f'{len(seqs1)}'), gray('reads in'), filename1)
    if fastq_2:
        filename2: Filename = format_filename(fastq_2)
        SeqIO.write(seqs2, filename2, 'quickfastq')
        print(gray('Wrote'), magenta(f'{len(seqs2)}'), gray('reads in'),
              filename2)

    # Timing results
    print(gray('Total elapsed time:'), time.strftime(
        "%H:%M:%S", time.gmtime(time.time() - start_time)))
def main(): """Main entry point to Recentrifuge.""" def configure_parser(): """Argument Parser Configuration""" parser = argparse.ArgumentParser( description='Analyze results of metagenomic taxonomic classifiers', epilog=f'%(prog)s - Release {__version__} - {__date__}' + LICENSE, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument( '-V', '--version', action='version', version=f'%(prog)s version {__version__} released in {__date__}') parser_in = parser.add_argument_group( 'input', 'Define Recentrifuge input files and formats') parser_in.add_argument('-n', '--nodespath', action='store', metavar='PATH', default=TAXDUMP_PATH, help=('path for the nodes information files ' '(nodes.dmp and names.dmp from NCBI)')) parser_filein = parser_in.add_mutually_exclusive_group(required=True) parser_filein.add_argument( '-f', '--file', action='append', metavar='FILE', type=Filename, help=('Centrifuge output files. If a single directory is entered, ' 'every .out file inside will be taken as a different sample.' ' Multiple -f is available to include several samples.')) parser_filein.add_argument( '-l', '--lmat', action='append', metavar='FILE', type=Filename, default=None, help=('LMAT output dir or file prefix. If just "." is entered, ' 'every subdirectory under the current directory will be ' 'taken as a sample and scanned looking for LMAT output files' '. Multiple -l is available to include several samples.')) parser_filein.add_argument( '-k', '--clark', action='append', metavar='FILE', type=Filename, help=('CLARK(S) output files. If a single directory is entered, ' 'every .csv file inside will be taken as a different sample.' 
' Multiple -k is available to include several samples.')) parser_filein.add_argument( '-r', '--report', action='append', metavar='FILE', type=Filename, help=('Centrifuge/Kraken report files ' '(multiple -r is available to include several samples)')) parser_out = parser.add_argument_group( 'output', 'Related to the Recentrifuge output files') parser_out.add_argument( '-o', '--outhtml', action='store', metavar='FILE', type=Filename, help='HTML output file (if not given, the filename will be ' 'inferred from input files)') parser_out.add_argument( '-e', '--excel', action='store', metavar='OUTPUT_TYPE', choices=[str(excel) for excel in Excel], default=str(Excel(0)), help=(f'type of excel report to be generated, and can be one of ' f'{[str(excel) for excel in Excel]}')) parser_coarse = parser.add_argument_group( 'tuning', 'Coarse tuning of algorithm parameters') parser_cross = parser_coarse.add_mutually_exclusive_group( required=False) parser_cross.add_argument( '-c', '--controls', action='store', metavar='CONTROLS_NUMBER', type=int, default=0, help=('this number of first samples will be treated as negative ' 'controls; default is no controls')) parser_coarse.add_argument( '-s', '--scoring', action='store', metavar='SCORING', choices=[str(each_score) for each_score in Scoring], default=str(Scoring(0)), help=(f'type of scoring to be applied, and can be one of ' f'{[str(scoring) for scoring in Scoring]}')) parser_coarse.add_argument( '-y', '--minscore', action='store', metavar='NUMBER', type=lambda txt: Score(float(txt)), default=None, help=('minimum score/confidence of the classification of a read ' 'to pass the quality filter; all pass by default')) parser_coarse.add_argument( '-m', '--mintaxa', action='store', metavar='INT', type=int, default=DEFMINTAXA, help='minimum taxa to avoid collapsing one level to the parent one' ) parser_coarse.add_argument( '-x', '--exclude', action='append', metavar='TAXID', type=Id, default=[], help=('NCBI taxid code to exclude a taxon and 
all underneath ' '(multiple -x is available to exclude several taxid)')) parser_coarse.add_argument( '-i', '--include', action='append', metavar='TAXID', type=Id, default=[], help=('NCBI taxid code to include a taxon and all underneath ' '(multiple -i is available to include several taxid); ' 'by default, all the taxa are considered for inclusion')) parser_cross.add_argument('-a', '--avoidcross', action='store_true', help='avoid cross analysis') parser_fine = parser.add_argument_group( 'fine tuning', 'Fine tuning of algorithm parameters') parser_fine.add_argument( '-z', '--ctrlminscore', action='store', metavar='NUMBER', type=lambda txt: Score(float(txt)), default=None, help=('minimum score/confidence of the classification of a read ' 'in control samples to pass the quality filter; if defaults ' 'to "minscore"')) parser_fine.add_argument( '-w', '--ctrlmintaxa', action='store', metavar='INT', type=int, default=None, help='minimum taxa to avoid collapsing one level to the parent one' ' in control samples; it defaults to "mintaxa"') parser_fine.add_argument( '-u', '--summary', action='store', metavar='OPTION', choices=['add', 'only', 'avoid'], default='add', help=( 'select to "add" summary samples to other samples, or to ' '"only" show summary samples or to "avoid" summaries at all')) parser_fine.add_argument( '-t', '--takeoutroot', action='store_true', help='remove counts directly assigned to the "root" level') parser_fine.add_argument('--nokollapse', action='store_true', help='show the "cellular organisms" taxon') parser_mode = parser.add_argument_group('advanced', 'Advanced modes of running') parser_mode.add_argument( '--dummy', # hidden flag: just generate a dummy plot for JS debug action='store_true', help=argparse.SUPPRESS) parser_mode.add_argument( '-g', '--debug', action='store_true', help='increase output verbosity and perform additional checks') parser_mode.add_argument('--sequential', action='store_true', help='deactivate parallel processing') return parser 
def check_debug(): """Check debugging mode""" if args.debug: print(blue('INFO:'), gray('Debugging mode activated')) print(blue('INFO:'), gray('Active parameters:')) for key, value in vars(args).items(): if value: print(gray(f'\t{key} ='), f'{value}') def select_inputs(): """Choose right classifier, input and output files""" nonlocal process, scoring, input_files, plasmidfile, classifier if reports: classifier = Classifier.KRAKEN process = process_report input_files = reports elif clarks: classifier = Classifier.CLARK process = process_output input_files = clarks if len(clarks) == 1 and os.path.isdir(clarks[0]): select_clark_inputs(clarks) elif lmats: classifier = Classifier.LMAT scoring = Scoring.LMAT process = process_output input_files = lmats plasmidfile = Filename(os.path.join(args.nodespath, PLASMID_FILE)) select_lmat_inputs(lmats) elif outputs: classifier = Classifier.CENTRIFUGE process = process_output input_files = outputs if len(outputs) == 1 and os.path.isdir(outputs[0]): select_centrifuge_inputs(outputs) def check_controls(): """Check and info about the control samples""" if args.controls: if args.controls > len(input_files): print(red(' ERROR!'), gray('More controls than samples')) exit(1) print(gray('Control(s) sample(s) for subtractions:')) for i in range(args.controls): print(blue(f'\t{input_files[i]}')) def select_html_file(): """HTML filename selection""" nonlocal htmlfile if lmats: # Select case for dir name or filename prefix if os.path.isdir(lmats[0]): # Dir name dirname = os.path.dirname(os.path.normpath(lmats[0])) if not dirname or dirname == '.': basename = 'output' else: basename = os.path.basename(dirname) else: # Explicit path and file name prefix is provided dirname, basename = os.path.split(lmats[0]) htmlfile = Filename(os.path.join(dirname, basename + HTML_SUFFIX)) elif reports: htmlfile = Filename(reports[0].split('_mhl')[0] + HTML_SUFFIX) else: htmlfile = Filename(outputs[0].split('_mhl')[0] + HTML_SUFFIX) def read_samples(): """Read 
samples""" print(gray('\nPlease, wait, processing files in parallel...\n')) # Enable parallelization with 'spawn' under known platforms if platform.system() and not args.sequential: # Only for known systems mpctx = mp.get_context('fork') with mpctx.Pool( processes=min(os.cpu_count(), len(input_files))) as pool: async_results = [ pool.apply_async( process, args=[ input_files[num], # file name True if num < args.controls else False ], # is ctrl? kwds=kwargs) for num in range(len(input_files)) ] for file, (sample, tree, out, stat, err) in zip(input_files, [r.get() for r in async_results]): if err is Err.NO_ERROR: samples.append(sample) trees[sample] = tree taxids[sample] = out.get_taxlevels() counts[sample] = out.counts accs[sample] = out.accs scores[sample] = out.scores stats[sample] = stat elif err is Err.VOID_CTRL: print('There were void controls.', red('Aborting!')) exit(1) else: # sequential processing of each sample for num, file in enumerate(input_files): (sample, tree, out, stat, err) = process(file, True if num < args.controls else False, **kwargs) if err is Err.NO_ERROR: samples.append(sample) trees[sample] = tree taxids[sample] = out.get_taxlevels() counts[sample] = out.counts accs[sample] = out.accs scores[sample] = out.scores stats[sample] = stat elif err is Err.VOID_CTRL: print('There were void controls.', red('Aborting!')) exit(1) raw_samples.extend(samples) # Store raw sample names def analyze_samples(): """Cross analysis of samples in parallel by taxlevel""" print(gray('Please, wait. 
Performing cross analysis in parallel...\n')) # Update kwargs with more parameters for the followings func calls kwargs.update({ 'taxids': taxids, 'counts': counts, 'scores': scores, 'accs': accs, 'raw_samples': raw_samples }) if platform.system() and not args.sequential: # Only for known systems mpctx = mp.get_context('fork') # Important for OSX&Win with mpctx.Pool(processes=min(os.cpu_count(), len(Rank.selected_ranks))) as pool: async_results = [ pool.apply_async(process_rank, args=[level], kwds=kwargs) for level in Rank.selected_ranks ] for level, (smpls, abunds, accumulators, score) in zip(Rank.selected_ranks, [r.get() for r in async_results]): samples.extend(smpls) counts.update(abunds) accs.update(accumulators) scores.update(score) else: # sequential processing of each selected rank for level in Rank.selected_ranks: (smpls, abunds, accumulators, score) = process_rank(level, **kwargs) samples.extend(smpls) counts.update(abunds) accs.update(accumulators) scores.update(score) def summarize_samples(): """Summary of samples in parallel by type of cross-analysis""" # timing initialization summ_start_time: float = time.perf_counter() print(gray('Please, wait. 
Generating summaries in parallel...')) # Update kwargs with more parameters for the followings func calls kwargs.update({'samples': samples}) # Get list of set of samples to summarize (note pylint bug #776) # pylint: disable=unsubscriptable-object target_analysis: col.OrderedDict[str, None] = col.OrderedDict({ f'{raw}_{study}': None for study in [STR_EXCLUSIVE, STR_CONTROL] for raw in raw_samples for smpl in samples if smpl.startswith(f'{raw}_{study}') }) # pylint: enable=unsubscriptable-object # Add shared and control_shared analysis if they exist (are not void) for study in [STR_SHARED, STR_CONTROL_SHARED]: for smpl in samples: if smpl.startswith(study): target_analysis[study] = None break if platform.system() and not args.sequential: # Only for known systems mpctx = mp.get_context('fork') with mpctx.Pool( processes=min(os.cpu_count(), len(input_files))) as pool: async_results = [ pool.apply_async(summarize_analysis, args=[analysis], kwds=kwargs) for analysis in target_analysis ] for analysis, (summary, abund, acc, score) in zip(target_analysis, [r.get() for r in async_results]): if summary: # Avoid adding empty samples summaries.append(summary) counts[summary] = abund accs[summary] = acc scores[summary] = score else: # sequential processing of each selected rank for analysis in target_analysis: (summary, abund, acc, score) = summarize_analysis(analysis, **kwargs) if summary: # Avoid adding empty samples summaries.append(summary) counts[summary] = abund accs[summary] = acc scores[summary] = score # Timing results print(gray('Summary elapsed time:'), f'{time.perf_counter() - summ_start_time:.3g}', gray('sec')) def generate_krona(): """Generate Krona plot with all the results via Krona 2.0 XML spec""" print(gray('\nBuilding the taxonomy multiple tree... 
'), end='') sys.stdout.flush() krona: KronaTree = KronaTree( samples, num_raw_samples=len(raw_samples), stats=stats, min_score=Score( min([ min(scores[sample].values()) for sample in samples if len(scores[sample]) ])), max_score=Score( max([ max(scores[sample].values()) for sample in samples if len(scores[sample]) ])), scoring=scoring, ) polytree.grow(ontology=ncbi, abundances=counts, accs=accs, scores=scores) print(green('OK!')) print(gray('Generating final plot (') + magenta(htmlfile) + gray(')... '), end='') sys.stdout.flush() polytree.toxml(ontology=ncbi, krona=krona) krona.tohtml(htmlfile, pretty=False) print(green('OK!')) def generate_excel(): """Generate Excel with results via pandas DataFrame""" xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx') print(gray(f'Generating Excel {str(excel).lower()} summary (') + magenta(xlsx_name) + gray(')... '), end='') sys.stdout.flush() xlsxwriter = pd.ExcelWriter(xlsx_name) list_rows: List = [] # Save raw samples basic statistics data_frame: pd.DataFrame = pd.DataFrame.from_dict( {raw: stats[raw].to_dict() for raw in raw_samples}) data_frame.to_excel(xlsxwriter, sheet_name='_sample_stats') # Save taxid related statistics per sample if excel is Excel.FULL: polytree.to_items(ontology=ncbi, items=list_rows) # Generate the pandas DataFrame from items and export to Excel iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]] cols1 = pd.MultiIndex.from_product(iterable_1, names=['Samples', 'Stats']) iterable_2 = [['Details'], ['Rank', 'Name']] cols2 = pd.MultiIndex.from_product(iterable_2) cols = cols1.append(cols2) data_frame = pd.DataFrame.from_items(list_rows, orient='index', columns=cols) data_frame.index.names = ['Id'] data_frame.to_excel(xlsxwriter, sheet_name=str(excel)) elif excel is Excel.CMPLXCRUNCHER: target_ranks: List = [Rank.NO_RANK] if args.controls: # if controls, add specific sheet for rank target_ranks.extend(Rank.selected_ranks) for rank in target_ranks: # Once for no rank dependency (NO_RANK) 
indexes: List[int] sheet_name: str columns: List[str] if args.controls: indexes = [ i for i in range(len(raw_samples), len(samples)) # Check if sample ends in _(STR_CONTROL)_(rank) if (STR_CONTROL in samples[i].split('_')[-2:] and rank.name.lower() in samples[i].split('_')[-1:]) ] sheet_name = f'{STR_CONTROL}_{rank.name.lower()}' columns = [ samples[i].replace( '_' + STR_CONTROL + '_' + rank.name.lower(), '') for i in indexes ] if rank is Rank.NO_RANK: # No rank dependency indexes = list(range(len(raw_samples))) sheet_name = f'raw_samples_{rank.name.lower()}' columns = raw_samples list_rows = [] polytree.to_items(ontology=ncbi, items=list_rows, sample_indexes=indexes) data_frame = pd.DataFrame.from_items(list_rows, orient='index', columns=columns) data_frame.index.names = ['Id'] data_frame.to_excel(xlsxwriter, sheet_name=sheet_name) else: raise Exception(red('\nERROR!'), f'Unknown Excel option "{excel}"') xlsxwriter.save() print(green('OK!')) # timing initialization start_time: float = time.time() # Program header print(f'\n=-= {sys.argv[0]} =-= v{__version__} - {__date__}' f' =-= by {__author__} =-=\n') sys.stdout.flush() # Parse arguments argparser = configure_parser() args = argparser.parse_args() outputs: List[Filename] = args.file reports: List[Filename] = args.report lmats: List[Filename] = args.lmat clarks: List[Filename] = args.clark input_files: List[Filename] nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE)) namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE)) htmlfile: Filename = args.outhtml collapse: bool = not args.nokollapse excluding: Set[Id] = set(args.exclude) including: Set[Id] = set(args.include) scoring: Scoring = Scoring[args.scoring] excel: Excel = Excel[args.excel] check_debug() plasmidfile: Filename = None classifier: Classifier process: Callable[..., Tuple[Sample, TaxTree, SampleDataById, SampleStats, Err]] select_inputs() check_controls() if not htmlfile: select_html_file() # Load NCBI nodes, 
names and build children ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, collapse, excluding, including, args.debug) # If dummy flag enabled, just create dummy krona and exit if args.dummy: _debug_dummy_plot(ncbi, htmlfile, scoring) exit(0) # Declare variables that will hold results for the samples analyzed trees: Dict[Sample, TaxTree] = {} counts: Dict[Sample, Counter[Id]] = {} accs: Dict[Sample, Counter[Id]] = {} taxids: Dict[Sample, TaxLevels] = {} scores: Dict[Sample, Dict[Id, Score]] = {} stats: Dict[Sample, SampleStats] = {} samples: List[Sample] = [] raw_samples: List[Sample] = [] # Define dictionary of parameters for methods to be called (to be extended) kwargs = { 'controls': args.controls, 'ctrlminscore': (args.ctrlminscore if args.ctrlminscore is not None else args.minscore), 'ctrlmintaxa': (args.ctrlmintaxa if args.ctrlmintaxa is not None else args.mintaxa), 'debug': args.debug, 'root': args.takeoutroot, 'classifier': classifier, 'minscore': args.minscore, 'mintaxa': args.mintaxa, 'scoring': scoring, 'ontology': ncbi, } # The big stuff (done in parallel) read_samples() # Avoid cross analysis if just one report file or explicitly stated by flag if len(raw_samples) > 1 and not args.avoidcross: analyze_samples() if args.summary != 'avoid': summaries: List[Sample] = [] summarize_samples() if args.summary == 'only': samples = raw_samples + summaries else: samples.extend(summaries) # Final result generation is done in sequential mode polytree: MultiTree = MultiTree(samples=samples) generate_krona() if _USE_PANDAS: generate_excel() else: print(yellow('WARNING!'), 'Pandas not installed: Excel cannot be created.') # Timing results print(gray('Total elapsed time:'), time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
def generate_excel():
    """Generate Excel with results via pandas DataFrame.

    A '_sample_stats' sheet with the raw samples statistics is always
    written; the remaining sheets depend on the selected Excel option.

    Raises:
        Exception: if the selected Excel option is not a known one.
    """
    xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
    print(gray(f'Generating Excel {str(excel).lower()} summary (') +
          magenta(xlsx_name) + gray(')... '), end='')
    sys.stdout.flush()
    # Validate the option before opening the writer, so nothing is written
    if excel is not Excel.FULL and excel is not Excel.CMPLXCRUNCHER:
        raise Exception(red('\nERROR!'), f'Unknown Excel option "{excel}"')
    list_rows: List = []
    # The context manager saves and closes the workbook, also on errors
    with pd.ExcelWriter(xlsx_name) as xlsxwriter:
        # Save raw samples basic statistics
        data_frame: pd.DataFrame = pd.DataFrame.from_dict(
            {raw: stats[raw].to_dict() for raw in raw_samples})
        data_frame.to_excel(xlsxwriter, sheet_name='_sample_stats')
        # Save taxid related statistics per sample
        if excel is Excel.FULL:
            polytree.to_items(taxonomy=ncbi, items=list_rows)
            # Generate the pandas DataFrame from items and export to Excel
            iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
            cols1 = pd.MultiIndex.from_product(iterable_1,
                                               names=['Samples', 'Stats'])
            iterable_2 = [['Details'], ['Rank', 'Name']]
            cols2 = pd.MultiIndex.from_product(iterable_2)
            cols = cols1.append(cols2)
            # DataFrame.from_items was removed from pandas: build the
            #  frame from an (insertion-ordered) dict of the items
            data_frame = pd.DataFrame.from_dict(dict(list_rows),
                                                orient='index',
                                                columns=cols)
            data_frame.index.names = ['TaxId']
            data_frame.to_excel(xlsxwriter, sheet_name=str(excel))
        else:  # Excel.CMPLXCRUNCHER
            target_ranks: List = [Rank.NO_RANK]
            if args.controls:
                target_ranks = [
                    Rank.SPECIES,
                    Rank.GENUS,  # Ranks of interest
                    Rank.FAMILY,
                    Rank.ORDER
                ]  # for cmplxcruncher
            for rank in target_ranks:  # Once for no rank dependency
                indexes: List[int]
                sheet_name: str
                columns: List[str]
                rank_name: str = rank.name.lower()
                if args.controls:
                    # Derived samples are placed after the raw ones
                    indexes = [
                        i for i in range(len(raw_samples), len(samples))
                        if (samples[i].startswith(STR_CONTROL)
                            and rank_name in samples[i])
                    ]
                    sheet_name = f'{STR_CONTROL}_{rank_name}'
                    columns = [samples[i].split('_')[2] for i in indexes]
                else:  # No rank dependency
                    indexes = list(range(len(raw_samples)))
                    sheet_name = f'raw_samples_{rank_name}'
                    columns = [samples[i].split('_')[0] for i in indexes]
                list_rows = []
                polytree.to_items(taxonomy=ncbi,
                                  items=list_rows,
                                  sample_indexes=indexes)
                data_frame = pd.DataFrame.from_dict(dict(list_rows),
                                                    orient='index',
                                                    columns=columns)
                data_frame.index.names = ['TaxId']
                data_frame.to_excel(xlsxwriter, sheet_name=sheet_name)
    print(green('OK!'))
def main():
    """Main entry point to script.

    Parses the command line, loads the NCBI taxonomy and generates mock
    Centrifuge output files from either mock layout (.mck) files or an
    Excel file with the layout.
    """

    def vprint(*args):
        """Print only if verbose/debug mode is enabled"""
        if debug:
            print(*args, end='')
            sys.stdout.flush()

    def configure_parser():
        """Argument Parser Configuration"""
        parser = argparse.ArgumentParser(
            description='Generate mock samples for Recentrifuge testing',
            epilog=f'%(prog)s - Release {__version__} - {__date__}' +
            LICENSE,
            formatter_class=argparse.RawDescriptionHelpFormatter)
        parser_mode = parser.add_mutually_exclusive_group(required=True)
        parser_mode.add_argument(
            '-f', '--file',
            action='store',
            metavar='FILE',
            type=Filename,
            help='Explicit source: Centrifuge output file as source')
        parser_mode.add_argument(
            '-r', '--random',
            action='store',
            metavar='MHL',
            type=int,
            default=15,
            help=('Random score generated. Please provide the minimum hit '
                  'length (mhl) of the classification; 15 by default'))
        parser.add_argument(
            '-g', '--debug',
            action='store_true',
            help='increase output verbosity and perform additional checks')
        parser_input = parser.add_mutually_exclusive_group(required=True)
        parser_input.add_argument(
            '-m', '--mock',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('Mock files to be read for mock Centrifuge sequences layout.'
                  ' If a single directory is entered, every .mck file inside '
                  'will be taken as a different sample. '
                  'Multiple -m is available to include several samples.'))
        if _USE_PANDAS:  # The Excel option is offered only with pandas
            parser_input.add_argument('-x', '--xcel',
                                      action='store',
                                      metavar='FILE',
                                      type=Filename,
                                      help='Excel file with the mock layout.')
        parser.add_argument('-n', '--nodespath',
                            action='store',
                            metavar='PATH',
                            default=TAXDUMP_PATH,
                            help=('path for the nodes information files '
                                  '(nodes.dmp and names.dmp from NCBI)'))
        parser.add_argument(
            '-V', '--version',
            action='version',
            version=f'%(prog)s release {__version__} ({__date__})')
        return parser

    def check_debug():
        """Check debugging mode"""
        if args.debug:
            print(gray('INFO: Debugging mode activated\n'))

    def read_mock_files(mock: Filename) -> Counter[Id]:
        """Read a mock layout (.mck) file.

        Each data line holds a taxid and a number of reads separated by a
        tab; lines starting with '#' and blank lines are skipped.
        """
        mock_layout: Counter[Id] = col.Counter()
        with open(mock, 'r') as file:
            vprint(gray('\nProcessing'), blue(mock), gray('file:\n'))
            for line in file:
                if line.startswith('#') or not line.strip():
                    continue
                _tid, _num = line.split('\t')
                tid = Id(_tid)
                num = int(_num)
                mock_layout[tid] = num
                vprint(num, gray('\treads for taxid\t'), tid, '\t(',
                       cyan(ncbi.get_name(tid)), ')\n')
        return mock_layout

    def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
        """Generate a mock Centrifuge output file from source file.

        Lines of the source are copied until the number of reads requested
        for every taxid is reached; missing reads are reported at the end.
        Note that mock_layout is consumed (decremented) in the process.
        """
        # Reads still to be copied; kept updated instead of summing the
        #  whole layout again for every read written
        pending = sum(mock_layout.values())
        with open(out, 'w') as fout, open(args.file) as fcfg:
            vprint(gray('Generating'), blue(out), gray('file... '))
            fout.write(fcfg.readline())  # copy cfg output file header
            reads_writen: int = 0
            for line in fcfg:
                tid = Id(line.split('\t')[2])
                if mock_layout[tid]:
                    fout.write(line)
                    mock_layout[tid] -= 1
                    reads_writen += 1
                    pending -= 1
                    if not pending:
                        vprint(reads_writen, 'reads', green('OK!\n'))
                        break
        if pending:
            print(red('ERROR!\n'))
            print(gray('Incomplete read copy by taxid:'))
            mock_layout = +mock_layout  # Delete zero counts elements
            for tid in mock_layout:
                print(yellow(mock_layout[tid]),
                      gray('reads missing for tid'), tid, '(',
                      cyan(ncbi.get_name(tid)), ')\n')

    def mock_from_scratch(out: Filename, mock_layout: Counter[Id]) -> None:
        """Generate a mock Centrifuge output file from scratch.

        Hit lengths are drawn at random above the minimum hit length given
        with -r/--random, with a per-taxid random upper limit.
        """
        with open(out, 'w') as fout:
            vprint(gray('Generating'), blue(out), gray('file... '))
            fout.write('readID\tseqID\ttaxID\tscore\t2ndBestScore\t'
                       'hitLength\tqueryLength\tnumMatches\n')
            reads_writen: int = 0
            for numtid in mock_layout:
                tid = Id(numtid)  # Convert to Id the excel integer
                maxhl: int = random.randint(args.random + 1,
                                            MAX_HIT_LENGTH)
                rank: str = str(ncbi.get_rank(tid)).lower()
                for _ in range(int(mock_layout[numtid])):
                    hit_length = random.randint(args.random + 1, maxhl)
                    fout.write(f'test{reads_writen}\t{rank}\t'
                               f'{tid}\t{(hit_length-15)**2}\t'
                               f'0\t{hit_length}\t{MAX_HIT_LENGTH}\t1\n')
                    reads_writen += 1
            vprint(reads_writen, 'reads', green('OK!\n'))

    def by_mock_files() -> None:
        """Do the job in case of mock files"""
        if len(args.mock) == 1 and os.path.isdir(args.mock[0]):
            # Expand in place the directory into its .mck files
            select_centrifuge_inputs(args.mock, ext='.mck')
        for mock in args.mock:
            mock_layout: Counter[Id] = read_mock_files(mock)
            test: Filename = Filename(mock.split('.mck')[0] + '.out')
            if args.file:
                mock_from_source(test, mock_layout)
            else:
                mock_from_scratch(test, mock_layout)

    def by_excel_file() -> None:
        """Do the job in case of Excel file with all the details"""
        dirname = os.path.dirname(args.xcel)
        # Expected index (taxids) in column after taxa name, and last row
        #  will be removed (reserved for sum of reads in Excel file)
        # 'skipfooter' is the current spelling of the former 'skip_footer'
        mock_df = pd.read_excel(args.xcel, index_col=1, skipfooter=1,
                                dtype=str)
        del mock_df['RECENTRIFUGE MOCK']
        vprint(gray('Layout to generate the mock files:\n'), mock_df, '\n')
        # DataFrame.items() replaces the removed DataFrame.iteritems()
        for name, series in mock_df.items():
            mock_layout: Counter[Id] = col.Counter(series.to_dict(dict))
            # In prev, series.to_dict(col.Counter) fails, so this is
            #  workaround
            test: Filename = Filename(os.path.join(dirname, name + '.out'))
            if args.file:
                mock_from_source(test, mock_layout)
            else:
                mock_from_scratch(test, mock_layout)

    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} - {__date__}'
          f' =-= by {__author__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    argparser = configure_parser()
    args = argparser.parse_args()
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    debug: bool = args.debug

    check_debug()

    # Load NCBI nodes, names and build children
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, None, False)

    if args.mock:
        by_mock_files()
    elif getattr(args, 'xcel', None):  # Option not defined without pandas
        by_excel_file()
def generate_excel():
    """Generate Excel with results via pandas DataFrame.

    A '_sample_stats' sheet with the raw samples statistics is always
    written; the remaining sheets depend on the selected Excel option.

    Raises:
        Exception: if the selected Excel option is not a known one.
    """
    xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
    print(gray(f'Generating Excel {str(excel).lower()} summary (') +
          magenta(xlsx_name) + gray(')... '), end='')
    sys.stdout.flush()
    # Validate the option before opening the writer, so nothing is written
    if excel is not Excel.FULL and excel is not Excel.CMPLXCRUNCHER:
        raise Exception(red('\nERROR!'), f'Unknown Excel option "{excel}"')
    list_rows: List = []
    # The context manager saves and closes the workbook, also on errors
    with pd.ExcelWriter(xlsx_name) as xlsxwriter:
        # Save raw samples basic statistics
        data_frame: pd.DataFrame = pd.DataFrame.from_dict(
            {raw: stats[raw].to_dict() for raw in raw_samples})
        data_frame.to_excel(xlsxwriter, sheet_name='_sample_stats')
        # Save taxid related statistics per sample
        if excel is Excel.FULL:
            polytree.to_items(ontology=ncbi, items=list_rows)
            # Generate the pandas DataFrame from items and export to Excel
            iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
            cols1 = pd.MultiIndex.from_product(iterable_1,
                                               names=['Samples', 'Stats'])
            iterable_2 = [['Details'], ['Rank', 'Name']]
            cols2 = pd.MultiIndex.from_product(iterable_2)
            cols = cols1.append(cols2)
            # DataFrame.from_items was removed from pandas: build the
            #  frame from an (insertion-ordered) dict of the items
            data_frame = pd.DataFrame.from_dict(dict(list_rows),
                                                orient='index',
                                                columns=cols)
            data_frame.index.names = ['Id']
            data_frame.to_excel(xlsxwriter, sheet_name=str(excel))
        else:  # Excel.CMPLXCRUNCHER
            target_ranks: List = [Rank.NO_RANK]
            if args.controls:  # if controls, add specific sheet for rank
                target_ranks.extend(Rank.selected_ranks)
            for rank in target_ranks:  # Once for no rank dependency
                indexes: List[int]
                sheet_name: str
                columns: List[str]
                rank_name: str = rank.name.lower()
                if rank is Rank.NO_RANK:  # No rank dependency
                    indexes = list(range(len(raw_samples)))
                    sheet_name = f'raw_samples_{rank_name}'
                    columns = raw_samples
                else:  # Only reached with controls (see target_ranks)
                    indexes = [
                        i for i in range(len(raw_samples), len(samples))
                        # Check if sample ends in _(STR_CONTROL)_(rank)
                        if (STR_CONTROL in samples[i].split('_')[-2:]
                            and rank_name in samples[i].split('_')[-1:])
                    ]
                    sheet_name = f'{STR_CONTROL}_{rank_name}'
                    columns = [
                        samples[i].replace(
                            '_' + STR_CONTROL + '_' + rank_name, '')
                        for i in indexes
                    ]
                list_rows = []
                polytree.to_items(ontology=ncbi,
                                  items=list_rows,
                                  sample_indexes=indexes)
                data_frame = pd.DataFrame.from_dict(dict(list_rows),
                                                    orient='index',
                                                    columns=columns)
                data_frame.index.names = ['Id']
                data_frame.to_excel(xlsxwriter, sheet_name=sheet_name)
    print(green('OK!'))
from recentrifuge.config import TEST_INPUT_DIR, TEST_OUTPUT_DIR, MOCK_XLSX from recentrifuge.config import REXTRACT_TEST_SAMPLE, REXTRACT_TEST_FASTQ from recentrifuge.config import gray, blue, green, red, yellow, cyan, magenta from recentrifuge.taxonomy import Taxonomy # optional package pandas (to read Excel with mock layout) _USE_PANDAS = True try: import pandas as pd except ImportError: pd = None _USE_PANDAS = False MAX_HIT_LENGTH: int = 200 # Max hit length for random score generation TEST_MOCK_XLSX = os.path.join(TEST_INPUT_DIR, MOCK_XLSX) TEST_XCEL = Filename( os.path.join(os.path.dirname(os.path.realpath(__file__)), TEST_MOCK_XLSX)) TEST_REXT_SMPL = os.path.join(TEST_OUTPUT_DIR, REXTRACT_TEST_SAMPLE) TEST_REXT_FSTQ = os.path.join(TEST_OUTPUT_DIR, REXTRACT_TEST_FASTQ) def generate_mock( ncbi: Taxonomy, file: Filename, rnd: int, mocks: List[Filename], xcel: Filename, debug: bool, ): def vprint(*args): """Print only if verbose/debug mode is enabled""" if debug: