def tax_file_to_dict(path, out_dict):
    with open(path) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            # .replace('_nofilter', '') so that reads from allfilter and nofilter
            # have the same name, but reads from fwd and rev don't.
            read = path.replace('_nofilter', '') + fields[0]
            tax_string = fields[1] if len(fields) > 1 else 'n_Unclassified'
            tax, tax_wranks = parse_tax_string(tax_string)
            out_dict[read] = tax_wranks

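# Example usage: a minimal sketch, assuming a two-column tab-separated
# ".wranks" file that maps read names to semicolon-delimited taxonomy strings
# (the file name below is hypothetical). Note that the dict key is the file
# path concatenated with the read name, so the same read coming from the
# allfilter and the nofilter file gets the same key, while reads from the
# fwd and rev files do not:
#
#   read_tax = {}
#   tax_file_to_dict('sample1/sample1.fwd.tax.wranks', read_tax)
#   # read_tax['sample1/sample1.fwd.tax.wranksread1'] -> per-rank taxonomy,
#   # as returned by parse_tax_string; reads with no taxonomy field are
#   # stored as 'n_Unclassified'.
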
def add_features(abunds, tax, tax_wranks, tax_nofilter, tax_nofilter_wranks):
    unclass_list, unclass_list_wranks = parse_tax_string('n_Unclassified')
    for feat in abunds:
        if feat not in tax:
            assert feat not in tax_wranks
            assert feat not in tax_nofilter
            assert feat not in tax_nofilter_wranks
            tax[feat] = unclass_list
            tax_wranks[feat] = unclass_list_wranks
            tax_nofilter[feat] = unclass_list
            tax_nofilter_wranks[feat] = unclass_list_wranks

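# Example usage: a sketch assuming the taxonomy dicts come from
# parse_tax_table and the abundance dict from parse_orf_table, as in main
# below. After the call, every feature present in the abundance table has an
# entry in all four taxonomy dicts (defaulting to Unclassified), which the
# later write_row_dict calls rely on:
#
#   add_features(orfs['abundances'], orf_tax, orf_tax_wranks,
#                orf_tax_nofilter, orf_tax_nofilter_wranks)
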
def main(args):
    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path, override={'$projectdir': args.project_path})
    nokegg, nocog, nopfam, doublepass = map(int, [perlVars['$nokegg'], perlVars['$nocog'],
                                                  perlVars['$nopfam'], perlVars['$doublepass']])

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise
        elif args.sqm2anvio or args.force_overwrite: # We know what we are doing.
            pass
        else:
            print('\nThe directory {} already exists. Please remove it or use a different output name.\n'.format(args.output_dir))
            exit(1)

    ### Calculate tables and write results.
    prefix = args.output_dir + '/' + perlVars['$projectname'] + '.'

    ### Functions
    if not args.sqm2anvio:
        # Were custom annotation databases used in this project?
        methods = [f.split('.')[-1] for f in listdir(perlVars['$resultpath'])
                   if len(f.split('.')) > 2 and f.split('.')[-2] == 'fun3']
        customMethods = [method for method in methods if method not in ('kegg', 'cog', 'pfam', 'wranks')]

        # Parse ORF table.
        sampleNames, orfs, kegg, cog, pfam, custom = parse_orf_table(perlVars['$mergedfile'],
                                                                     nokegg, nocog, nopfam,
                                                                     args.trusted_functions,
                                                                     args.ignore_unclassified,
                                                                     customMethods)

        # Round aggregated functional abundances.
        # We can have non-integer abundances bc of the way we split counts in ORFs with multiple KEGGs.
        # We round the aggregates to the closest integer for convenience.
        kegg['abundances'] = {k: a.round().astype(int) for k, a in kegg['abundances'].items()}
        cog['abundances'] = {k: a.round().astype(int) for k, a in cog['abundances'].items()}
        pfam['abundances'] = {k: a.round().astype(int) for k, a in pfam['abundances'].items()}

        #write_row_dict(sampleNames, orfs['tpm'], prefix + 'orf.tpm.tsv')
        if not nokegg:
            write_row_dict(['Name', 'Path'], kegg['info'], prefix + 'KO.names.tsv')
            write_row_dict(sampleNames, kegg['abundances'], prefix + 'KO.abund.tsv')
            write_row_dict(sampleNames, kegg['bases'], prefix + 'KO.bases.tsv')
            write_row_dict(sampleNames, kegg['tpm'], prefix + 'KO.tpm.tsv')
            if 'copyNumber' in kegg:
                write_row_dict(sampleNames, kegg['copyNumber'], prefix + 'KO.copyNumber.tsv')
        if not nocog:
            write_row_dict(['Name', 'Path'], cog['info'], prefix + 'COG.names.tsv')
            write_row_dict(sampleNames, cog['abundances'], prefix + 'COG.abund.tsv')
            write_row_dict(sampleNames, cog['bases'], prefix + 'COG.bases.tsv')
            write_row_dict(sampleNames, cog['tpm'], prefix + 'COG.tpm.tsv')
            if 'copyNumber' in cog:
                write_row_dict(sampleNames, cog['copyNumber'], prefix + 'COG.copyNumber.tsv')
            write_row_dict(sampleNames, {'COG0468': cog['coverages']['COG0468']}, prefix + 'RecA.tsv')
        if not nopfam:
            write_row_dict(sampleNames, pfam['abundances'], prefix + 'PFAM.abund.tsv')
            write_row_dict(sampleNames, pfam['bases'], prefix + 'PFAM.bases.tsv')
            write_row_dict(sampleNames, pfam['tpm'], prefix + 'PFAM.tpm.tsv')
            if 'copyNumber' in pfam:
                write_row_dict(sampleNames, pfam['copyNumber'], prefix + 'PFAM.copyNumber.tsv')
        for method, d in custom.items():
            write_row_dict(['Name'], d['info'], prefix + method + '.names.tsv')
            write_row_dict(sampleNames, d['abundances'], prefix + method + '.abund.tsv')
            write_row_dict(sampleNames, d['bases'], prefix + method + '.bases.tsv')
            write_row_dict(sampleNames, d['tpm'], prefix + method + '.tpm.tsv')
            if 'copyNumber' in d:
                write_row_dict(sampleNames, d['copyNumber'], prefix + method + '.copyNumber.tsv')

    else:
        # Not super beautiful code. Just read the orf names and create a fake orf dict
        # since we need to know the names of all the orfs to create the taxonomy output.
        orfs = {'abundances': read_orf_names(perlVars['$mergedfile'])}

    ### Taxonomy.
    fun_prefix = perlVars['$fun3tax_blastx'] if doublepass else perlVars['$fun3tax']
    orf_tax, orf_tax_wranks = parse_tax_table(fun_prefix + '.wranks')
    orf_tax_nofilter, orf_tax_nofilter_wranks = parse_tax_table(fun_prefix + '.noidfilter.wranks')

    # Add ORFs not present in the input tax file.
    unclass_list, unclass_list_wranks = parse_tax_string('n_Unclassified')
    for orf in orfs['abundances']:
        if orf not in orf_tax:
            assert orf not in orf_tax_wranks
            assert orf not in orf_tax_nofilter
            assert orf not in orf_tax_nofilter_wranks
            orf_tax[orf] = unclass_list
            orf_tax_wranks[orf] = unclass_list_wranks
            orf_tax_nofilter[orf] = unclass_list
            orf_tax_nofilter_wranks[orf] = unclass_list_wranks

    orf_tax_prokfilter, orf_tax_prokfilter_wranks = {}, {}
    for orf in orf_tax:
        tax = orf_tax[orf]
        tax_nofilter = orf_tax_nofilter[orf]
        if 'Bacteria' in (tax[0], tax_nofilter[0]) or 'Archaea' in (tax[0], tax_nofilter[0]):
            # We check both taxonomies.
            orf_tax_prokfilter[orf] = tax
            orf_tax_prokfilter_wranks[orf] = orf_tax_wranks[orf]
        else:
            orf_tax_prokfilter[orf] = tax_nofilter
            orf_tax_prokfilter_wranks[orf] = orf_tax_nofilter_wranks[orf]

    contig_abunds, contig_tax, contig_tax_wranks = parse_contig_table(perlVars['$contigtable'])

    write_row_dict(TAXRANKS, orf_tax, prefix + 'orf.tax.allfilter.tsv')
    write_row_dict(TAXRANKS, contig_tax, prefix + 'contig.tax.tsv')

    if not args.sqm2anvio:
        fna_blastx = perlVars['$fna_blastx'] if doublepass else None
        write_orf_seqs(orfs['abundances'].keys(), perlVars['$aafile'], fna_blastx,
                       perlVars['$rnafile'], perlVars['$trnafile'] + '.fasta',
                       prefix + 'orf.sequences.tsv')
        write_contig_seqs(perlVars['$contigsfna'], prefix + 'contig.sequences.tsv')

        write_row_dict(TAXRANKS, orf_tax_nofilter, prefix + 'orf.tax.nofilter.tsv')
        write_row_dict(TAXRANKS, orf_tax_prokfilter, prefix + 'orf.tax.prokfilter.tsv')

        ### Bins
        if not int(perlVars['$nobins']):
            bin_tpm, bin_tax, bin_tax_wranks = parse_bin_table(perlVars['$bintable'])
            write_row_dict(TAXRANKS, bin_tax, prefix + 'bin.tax.tsv')

        for idx, rank in enumerate(TAXRANKS):
            tax_abunds_orfs = aggregate_tax_abunds(orfs['abundances'], orf_tax_prokfilter_wranks, idx)
            write_row_dict(sampleNames, tax_abunds_orfs, prefix + '{}.prokfilter.abund.tsv'.format(rank))
            #write_row_dict(sampleNames, normalize_abunds(tax_abunds_orfs, 100), prefix + '{}.prokfilter.percent.tsv'.format(rank))

            tax_abunds_contigs = aggregate_tax_abunds(contig_abunds, contig_tax_wranks, idx)
            write_row_dict(sampleNames, tax_abunds_contigs, prefix + '{}.allfilter.abund.tsv'.format(rank))

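# Worked illustration of the prokfilter rule in main above (made-up values):
# the strict (allfilter) taxonomy is kept whenever either annotation places
# the ORF in Bacteria or Archaea, and the unfiltered taxonomy is used
# otherwise, so identity filters are only enforced for prokaryotes:
#
#   tax[0]          tax_nofilter[0]    prokfilter result
#   'Bacteria'      'Bacteria'      -> tax           (filtered)
#   'Unclassified'  'Archaea'       -> tax           (filtered)
#   'Eukaryota'     'Eukaryota'     -> tax_nofilter  (unfiltered)
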
def main(args):
    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise
        elif args.force_overwrite:
            pass
        else:
            print('\nThe directory {} already exists. Please remove it or use a different output name.\n'.format(args.output_dir))
            exit(1)

    ### Project name and samples.
    project_name = args.project_path.strip('/').split('/')[-1]
    output_prefix = project_name #args.output_dir.strip('/').split('/')[-1]

    samples = defaultdict(int)
    with open('{}/{}.out.mappingstat'.format(args.project_path, project_name)) as infile:
        for line in infile:
            if line.startswith('#'):
                continue
            sample, file_, total_reads, reads_with_hits_to_nr = line.strip().split('\t')
            samples[sample] += int(total_reads)

    ### Parse taxonomy.
    def tax_file_to_dict(path, out_dict):
        with open(path) as infile:
            for line in infile:
                fields = line.strip().split('\t')
                # .replace('_nofilter', '') so that reads from allfilter and nofilter
                # have the same name, but reads from fwd and rev don't.
                read = path.replace('_nofilter', '') + fields[0]
                tax_string = fields[1] if len(fields) > 1 else 'n_Unclassified'
                tax, tax_wranks = parse_tax_string(tax_string)
                out_dict[read] = tax_wranks

    tax_dict = {filt: {rank: {sample: defaultdict(int) for sample in samples}
                       for rank in TAXRANKS}
                for filt in TAXFILTERS}

    for sample in samples:
        read_tax = {'nofilter': {}, 'allfilter': {}, 'prokfilter': {}}

        ### Parse nofilter taxonomy.
        nofilter_tax_files = [f for f in listdir('{}/{}'.format(args.project_path, sample))
                              if f.endswith('.tax_nofilter.wranks')]
        for tax_file in nofilter_tax_files:
            path = '{}/{}/{}'.format(args.project_path, sample, tax_file)
            tax_file_to_dict(path, read_tax['nofilter'])

        ### Parse taxonomy with filters.
        allfilter_tax_files = [f for f in listdir('{}/{}'.format(args.project_path, sample))
                               if f.endswith('.tax.wranks')]
        for tax_file in allfilter_tax_files:
            path = '{}/{}/{}'.format(args.project_path, sample, tax_file)
            tax_file_to_dict(path, read_tax['allfilter'])

        assert read_tax['nofilter'].keys() == read_tax['allfilter'].keys()

        ### Generate taxonomy with filters only for prokaryotes.
        for read in read_tax['nofilter']:
            if 'k_Bacteria' in (read_tax['nofilter'][read][0], read_tax['allfilter'][read][0]) \
                    or 'k_Archaea' in (read_tax['nofilter'][read][0], read_tax['allfilter'][read][0]):
                read_tax['prokfilter'][read] = read_tax['allfilter'][read]
            else:
                read_tax['prokfilter'][read] = read_tax['nofilter'][read]

        ### Aggregate counts from the same taxa.
        for filt in TAXFILTERS:
            for read, tax in read_tax[filt].items():
                for i, rank in enumerate(TAXRANKS):
                    tax_dict[filt][rank][sample][tax[i]] += 1

    ### Add unclassified and write results.
    for filt in TAXFILTERS:
        if filt == 'nofilter':
            continue
        for i, rank in enumerate(TAXRANKS):
            dict_to_write = tax_dict[filt][rank]
            for sample, taxa in dict_to_write.items():
                classified_reads = sum(taxa.values())
                total_reads = samples[sample]
                dict_to_write[sample][parse_tax_string('n_Unclassified')[1][i]] += (total_reads - classified_reads)
            DataFrame.from_dict(dict_to_write).fillna(0).to_csv(
                '{}/{}.{}.{}.abund.tsv'.format(args.output_dir, output_prefix, rank, filt), sep='\t')

    ### Parse functions.
    # Is there any custom annotation method apart from kegg and COG?
    custom_methods = [f.split('.')[-1].replace('fun', '') for f in listdir(args.project_path)
                      if 'fun' in f.split('.')[-1]
                      and not f.endswith('funcog') and not f.endswith('funkegg')]
    for method in custom_methods:
        FUNMETHODS[method] = method

    fun_dict = {method: {sample: defaultdict(float) for sample in samples} for method in FUNMETHODS}
    for sample in samples:
        for method in FUNMETHODS:
            fun_files = [f for f in listdir('{}/{}'.format(args.project_path, sample)) if f.endswith(method)]
            for fun_file in fun_files:
                with open('{}/{}/{}'.format(args.project_path, sample, fun_file)) as infile:
                    infile.readline()
                    infile.readline()
                    for line in infile:
                        fields = line.strip().split('\t')
                        while len(fields) < 3:
                            fields.append('Unclassified')
                        funs = fields[2] if args.trusted_functions else fields[1]
                        funs = funs.split(';')
                        for fun in funs:
                            fun_dict[method][sample][fun] += 1 / len(funs) # Split the counts in multi-kegg annotations.

    for method, method_name in FUNMETHODS.items():
        dict_to_write = fun_dict[method]
        for sample, funs in dict_to_write.items():
            classified_reads = sum(funs.values())
            total_reads = samples[sample]
            dict_to_write[sample]['Unclassified'] += (total_reads - classified_reads)
        DataFrame.from_dict(dict_to_write).fillna(0).to_csv(
            '{}/{}.{}.abund.tsv'.format(args.output_dir, output_prefix, method_name), sep='\t')

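# Output sketch (illustrative values only): DataFrame.from_dict with its
# default orientation treats the outer dict keys as columns, so each
# dict_to_write in main above becomes a table with one column per sample and
# one row per function ID, with gaps zero-filled by fillna(0):
#
#                  sample1  sample2
#   K00001            12.0      3.0
#   K00002             0.0      7.5
#   Unclassified    9871.5   8550.0
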
def main(args):
    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise
        elif args.force_overwrite:
            pass
        else:
            print('\nThe directory {} already exists. Please remove it or use a different output name.\n'.format(args.output_dir))
            exit(1)

    ### Project name and samples.
    project_name = args.project_path.strip('/').split('/')[-1]
    output_prefix = project_name #args.output_dir.strip('/').split('/')[-1]

    samples = defaultdict(int)
    samples_orfs = defaultdict(int)
    with open('{}/{}.out.mappingstat'.format(args.project_path, project_name)) as infile:
        for line in infile:
            if line.startswith('#'):
                continue
            # *total_hits since the longreads output will have one more column.
            sample, file_, total_reads, reads_with_hits_to_nr, *total_hits = line.strip().split('\t')
            if total_hits:
                # In longreads mode we can have more than one ORF per read.
                total_orfs = 0
                with open('{}/{}/{}.nt.fasta'.format(args.project_path, sample, sample)) as fastafile:
                    for fasta_line in fastafile:
                        if fasta_line.startswith('>'):
                            total_orfs += 1
                samples_orfs[sample] = total_orfs
                longreads = True
            else:
                longreads = False
            samples[sample] += int(total_reads)

    ### Parse taxonomy.
    def tax_file_to_dict(path, out_dict):
        with open(path) as infile:
            for line in infile:
                fields = line.strip().split('\t')
                # .replace('_nofilter', '') so that reads from allfilter and nofilter
                # have the same name, but reads from fwd and rev don't.
                read = path.replace('_nofilter', '') + fields[0]
                if not longreads:
                    tax_string = fields[1] if len(fields) > 1 else 'n_Unclassified'
                else:
                    tax_string = fields[1] if fields[1] else 'n_Unclassified'
                tax, tax_wranks = parse_tax_string(tax_string)
                out_dict[read] = tax_wranks

    tax_dict = {filt: {rank: {sample: defaultdict(int) for sample in samples}
                       for rank in TAXRANKS}
                for filt in TAXFILTERS}

    for sample in samples:
        read_tax = {'nofilter': {}, 'allfilter': {}, 'prokfilter': {}}

        ### Parse nofilter taxonomy.
        if not longreads:
            nofilter_tax_files = [f for f in listdir('{}/{}'.format(args.project_path, sample))
                                  if f.endswith('.tax_nofilter.wranks')]
        else:
            nofilter_tax_files = ['readconsensus_nofilter.txt']
        for tax_file in nofilter_tax_files:
            path = '{}/{}/{}'.format(args.project_path, sample, tax_file)
            tax_file_to_dict(path, read_tax['nofilter'])

        ### Parse taxonomy with filters.
        if not longreads:
            allfilter_tax_files = [f for f in listdir('{}/{}'.format(args.project_path, sample))
                                   if f.endswith('.tax.wranks')]
        else:
            allfilter_tax_files = ['readconsensus.txt']
        for tax_file in allfilter_tax_files:
            path = '{}/{}/{}'.format(args.project_path, sample, tax_file)
            tax_file_to_dict(path, read_tax['allfilter'])

        assert read_tax['nofilter'].keys() == read_tax['allfilter'].keys()

        ### Generate taxonomy with filters only for prokaryotes.
        for read in read_tax['nofilter']:
            if 'k_Bacteria' in (read_tax['nofilter'][read][0], read_tax['allfilter'][read][0]) \
                    or 'k_Archaea' in (read_tax['nofilter'][read][0], read_tax['allfilter'][read][0]):
                read_tax['prokfilter'][read] = read_tax['allfilter'][read]
            else:
                read_tax['prokfilter'][read] = read_tax['nofilter'][read]

        ### Aggregate counts from the same taxa.
        for filt in TAXFILTERS:
            for read, tax in read_tax[filt].items():
                for i, rank in enumerate(TAXRANKS):
                    tax_dict[filt][rank][sample][tax[i]] += 1

    ### Add unclassified and write results.
    for filt in TAXFILTERS:
        #if filt == 'nofilter':
        #    continue
        for i, rank in enumerate(TAXRANKS):
            dict_to_write = tax_dict[filt][rank]
            for sample, taxa in dict_to_write.items():
                classified_reads = sum(taxa.values())
                total_reads = samples[sample]
                dict_to_write[sample][parse_tax_string('n_Unclassified')[1][i]] += (total_reads - classified_reads)
            DataFrame.from_dict(dict_to_write).fillna(0).to_csv(
                '{}/{}.{}.{}.abund.tsv'.format(args.output_dir, output_prefix, rank, filt), sep='\t')

    ### Parse functions.
    # Is there any custom annotation method apart from kegg and COG?
    custom_methods = [f.split('.')[-1].replace('fun', '') for f in listdir(args.project_path)
                      if 'fun' in f.split('.')[-1]
                      and not f.endswith('funcog') and not f.endswith('funkegg')]
    for method in custom_methods:
        FUNMETHODS[method] = method

    found_methods = set()
    fun_dict = {method: {sample: defaultdict(float) for sample in samples} for method in FUNMETHODS}
    for sample in samples:
        for method in FUNMETHODS:
            fun_files = [f for f in listdir('{}/{}'.format(args.project_path, sample)) if f.endswith(method)]
            for fun_file in fun_files:
                found_methods.add(method)
                with open('{}/{}/{}'.format(args.project_path, sample, fun_file)) as infile:
                    infile.readline()
                    infile.readline()
                    for line in infile:
                        fields = line.strip().split('\t')
                        while len(fields) < 3:
                            fields.append('Unclassified')
                        funs = fields[2] if args.trusted_functions else fields[1]
                        funs = funs.split(';')
                        for fun in funs:
                            fun_dict[method][sample][fun] += 1 / len(funs) # Split the counts in multi-kegg annotations.

    for method, method_name in FUNMETHODS.items():
        dict_to_write = fun_dict[method]
        for sample, funs in dict_to_write.items():
            classified_reads = sum(funs.values())
            if not longreads:
                total_reads = samples[sample]
            else:
                # In longreads mode we can have more than one ORF per read. For taxonomy
                # we get the consensus for each read, but for functions we count ORFs
                # independently.
                total_reads = samples_orfs[sample]
            dict_to_write[sample]['Unclassified'] += (total_reads - classified_reads)
        DataFrame.from_dict(dict_to_write).fillna(0).to_csv(
            '{}/{}.{}.abund.tsv'.format(args.output_dir, output_prefix, method_name), sep='\t')

    # Write function names and hierarchy paths for kegg/cog.
    for method in found_methods:
        if method not in ('kegg', 'cogs'):
            continue
        method_name = FUNMETHODS[method]
        function_info = 'keggfun2.txt' if method == 'kegg' else 'coglist.txt'
        with open('{}/{}'.format(data_dir, function_info)) as infile, \
             open('{}/{}.{}.names.tsv'.format(args.output_dir, output_prefix, method_name), 'w') as outfile:
            infile.readline() # Burn headers.
            info = {}
            for line in infile:
                if method == 'kegg':
                    fun_id, gene_name, fun_name, path = line.strip().split('\t')
                else:
                    line = line.strip().split('\t')
                    if len(line) == 3:
                        fun_id, fun_name, path = line
                    else: # UGH!
                        fun_id, fun_name = line
                        path = '{} (path not available)'.format(fun_id)
                info[fun_id] = (fun_name, path)
            allFuns = sorted({fun for sample in fun_dict[method] for fun in fun_dict[method][sample]})
            outfile.write('\tName\tPath\n')
            for fun in allFuns:
                if fun == 'Unclassified':
                    continue
                if fun in info:
                    outfile.write('{}\t{}\t{}\n'.format(fun, info[fun][0], info[fun][1]))
                else:
                    outfile.write('{}\t{} (name not available)\t{} (path not available)\n'.format(fun, fun, fun))
            outfile.write('Unclassified\tUnclassified\tUnclassified\n')

    # Write function names for extra methods.
    for method in found_methods:
        if method in ('kegg', 'cogs'):
            continue
        method_name = FUNMETHODS[method]
        written = set()
        with open('{}/{}.out.allreads.fun{}'.format(args.project_path, project_name, method)) as infile, \
             open('{}/{}.{}.names.tsv'.format(args.output_dir, output_prefix, method_name), 'w') as outfile:
            infile.readline() # Burn headers.
            infile.readline()
            outfile.write('\tName\n')
            for line in infile:
                # Explicitly strip just '\n' so we don't remove tabs when there are empty fields.
                line = line.strip('\n').split('\t')
                ID = line[0]
                if ID not in written:
                    written.add(ID)
                    outfile.write('{}\t{}\n'.format(ID, line[-1]))
            outfile.write('Unclassified\tUnclassified\n')

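# A minimal driver sketch for main(args). The option names below are
# assumptions inferred from the attributes used above (args.project_path,
# args.output_dir, args.trusted_functions, args.force_overwrite); the real
# script may define its command line differently:
#
#   import argparse
#   parser = argparse.ArgumentParser(description='Aggregate per-read results into tables')
#   parser.add_argument('project_path', help='path to the project directory')
#   parser.add_argument('output_dir', help='directory for the output tables')
#   parser.add_argument('--trusted-functions', action='store_true',
#                       help='use only trusted (consensus) annotations')
#   parser.add_argument('--force-overwrite', action='store_true',
#                       help='write into an existing output directory')
#   main(parser.parse_args())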