def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # parse command line
    db = RefSeqDatabase()
    nt = NCBITree()
    with open(args.input, 'r') if args.input != '-' else sys.stdin as inf:
        fasta_gen = FASTA(inf)
        assembly_version = os.path.basename(args.input).split('_genomic')[0]  # currently unused
        with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
            for header, sequence in fasta_gen.read():
                if '.cluster' in header:
                    header = header.replace('.cluster', '_cluster')
                ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(header.split('_cluster')[0])
                if ncbi_tid:
                    ncbi_tid = ncbi_tid[0]
                    organism = nt.gg_lineage(ncbi_tid)
                    # Trim the lineage down to the species label.
                    genus_species = organism.split(';')[-1].replace('s__', '')
                    outf.write('>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, genus_species))
                    outf.write(sequence + '\n')
                else:
                    outf.write('>ref|%s|\n' % header)
                    outf.write(sequence + '\n')
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # parse command line
    db = RefSeqDatabase()
    nt = NCBITree()
    with open(args.input, 'r') if args.input != '-' else sys.stdin as inf:
        fasta_gen = FASTA(inf)
        assembly_version = os.path.basename(args.input).split('_genomic')[0]
        with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
            for header, sequence in fasta_gen.read():
                if '.cluster' in header:
                    header = header.replace('.cluster', '_cluster')
                ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(header.split('_cluster')[0])
                if ncbi_tid:
                    ncbi_tid = ncbi_tid[0]
                    organism = nt.gg_lineage(ncbi_tid)
                    # genus_species = organism.split(';')[-1]
                    # genus_species = genus_species.replace('s__', '')
                    outf.write('>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, organism))
                    outf.write(sequence + '\n')
                else:
                    outf.write('>ref|%s|\n' % header)
                    outf.write(sequence + '\n')
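# The two main() variants above emit the same pipe-delimited FASTA header; the
# first trims the lineage to its species field, the second keeps the full
# lineage. A minimal sketch of the formatting step with invented values (the
# tid and accession below are illustrative only, not real lookups):
example_tid = 562
example_header = 'NC_012345.1_cluster001'
example_species = 'Escherichia_coli'
print('>ncbi_tid|%d|ref|%s|organism|%s|' % (example_tid, example_header, example_species))
# -> >ncbi_tid|562|ref|NC_012345.1_cluster001|organism|Escherichia_coli|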
def list_organisms(ofus, hclus, nt_cat, typetable, outpath, cut_h):
    bgc_dd = defaultdict(list)
    for value, key in hclus.itertuples(index=True):
        key = str('%05d' % key)
        bgc_dd[key].extend([value])
    ofu_list = ofus.split(',')
    i = 0
    # Preload the database and tree
    db = RefSeqDatabase()
    nt = NCBITree()
    for ofu in ofu_list:
        ofu = str(ofu)
        # Strip an optional 'ofu'/'ofu_' prefix. Check the longer prefix first:
        # the original checked startswith('ofu') first, which also matches
        # 'ofu_' and left a stray underscore in the key.
        if ofu.startswith('ofu_'):
            ofu_n = ofu.replace('ofu_', '')
        elif ofu.startswith('ofu'):
            ofu_n = ofu.replace('ofu', '')
        else:
            ofu_n = ofu
        bgcs = bgc_dd[ofu_n]
        name_dict = defaultdict(list)
        with suppress_stdout():
            for bgc in bgcs:
                refseqid = bgc  # fallback; only overwritten in the RefSeq branch below
                if bgc.startswith('ncbi_tid'):
                    ncbi_tid = bgc.split('|')[1]
                    if ncbi_tid == 'na':
                        name = bgc.split('|')[3]
                    else:
                        ncbi_tid = int(ncbi_tid)
                        name = nt.green_genes_lineage(ncbi_tid, depth=8, depth_force=True)
                elif '|genbank|' in bgc:
                    gbk_id = bgc.split('|')[3].split('_cluster')[0]
                    if nt_cat == '-':
                        sys.exit('Genbank ID BGC headers require an NT Catalog for annotation... see --help')
                    tid, organism = genbank_id_to_tid(gbk_id, nt_cat)
                    name = organism
                else:
                    refseqid = '_'.join(bgc.split('_')[:2])
                    name = refseq_to_name(refseqid, db=db, nt=nt)
                if typetable is not False:
                    ctype = typetable.filter(like=bgc, axis=0)
                    ctype = str(ctype.iloc[0, 0])
                else:
                    ctype = 'NA'
                if bgc == name:
                    name_dict[bgc] = [ctype, refseqid]
                else:
                    name_dict[bgc] = [ctype, name]
        ofu_file = ''.join(['ofu', ofu_n, '_id', cut_h, '.txt'])
        with open(os.path.join(outpath, ofu_file), 'w') as outf:
            outdf = pd.DataFrame.from_dict(name_dict, orient='index')
            outdf.columns = ['predicted_type', 'organism']
            outdf.to_csv(outf, sep='\t')
        i += 1
    print('\nOrganism information for %d OFUs written to file.\n' % i)
    return bgc_dd
def genbank_id_to_tid(gbk_id, nt_cat):
    # Default to 'not found'. The original reset these defaults on every
    # non-matching catalog row, so the return value depended on whether the
    # *last* row matched rather than on whether *any* row matched.
    tid = 'na'
    organism = 'k__None;p__None;c__None;o__None;f__None;g__None;s__None;t__None'
    nt = NCBITree()
    with open(nt_cat, 'r') as nt_catalog:
        reader = csv.reader(nt_catalog, delimiter='\t')
        next(reader)  # skip the header row
        for line in reader:
            if line[1] == gbk_id:
                tid = int(line[2])
                organism = nt.green_genes_lineage(tid, depth=8, depth_force=True)
                break
    return tid, organism
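# A self-contained sketch of the catalog lookup genbank_id_to_tid() performs,
# using an invented two-row catalog written to a temp file. The lineage lookup
# is stubbed out, since it needs a real NCBITree; all IDs here are made up.
import csv
import os
import tempfile

def _lookup_tid(gbk_id, catalog_path):
    """Return the taxon ID for gbk_id from a tab-delimited catalog, or 'na'."""
    tid = 'na'
    with open(catalog_path) as fh:
        reader = csv.reader(fh, delimiter='\t')
        next(reader)  # header row
        for line in reader:
            if line[1] == gbk_id:
                tid = int(line[2])
                break
    return tid

with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as tmp:
    tmp.write('col0\tgenbank_id\tncbi_tid\n')
    tmp.write('x\tAB123456\t562\n')
assert _lookup_tid('AB123456', tmp.name) == 562
assert _lookup_tid('ZZ000000', tmp.name) == 'na'
os.remove(tmp.name)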
def refseq_to_name(refseq_id, db=RefSeqDatabase(), nt=NCBITree()):
    ncbi_tid = db.get_ncbi_tid_from_refseq_accession(refseq_id)
    if ncbi_tid:
        ncbi_tid = ncbi_tid[0]
        organism = nt.green_genes_lineage(ncbi_tid, depth=8, depth_force=True)
    else:
        # If DOJO fails to find the tid, just return the RefSeq accession ID.
        organism = refseq_id
    return organism
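# Design note on refseq_to_name(): the db=RefSeqDatabase() and nt=NCBITree()
# defaults are evaluated once, at import time, so every call that omits them
# shares one preloaded database and tree instead of reloading per call. The
# same applies to tid_to_name() below. Callers that batch many lookups (as
# list_organisms() does) pass their own instances explicitly:
#
#     db = RefSeqDatabase()
#     nt = NCBITree()
#     for refseq_id in refseq_ids:  # refseq_ids is a hypothetical iterable
#         name = refseq_to_name(refseq_id, db=db, nt=nt)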
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)
    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print('Found the samfile "%s". Skipping the alignment phase for this file.' % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))
    if run_lca:
        tree = NCBITree()
        # Validate depth before indexing; the original indexed first, so an
        # out-of-range depth raised IndexError before the ValueError could fire.
        ranks = list(tree.lineage_ranks.keys())
        if depth < 1 or depth > len(ranks):
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)
        rank_name = ranks[depth - 1]
        begin, end = extract_ncbi_tid.split(',')
        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                        # Fold each additional alignment into the running LCA.
                        lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid
            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
            taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)
        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
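# A minimal, self-contained sketch (toy parent map, not the repo's NCBITree)
# of the per-query LCA fold used in shogun_bt2_lca() above: the first
# alignment seeds lca_map[qname], and each further alignment is merged via the
# lowest common ancestor, so a query hitting two sister species resolves to
# their shared genus. All names below are invented for illustration.
def _toy_lca(a, b, parent):
    """Return the lowest common ancestor of a and b in a toy parent map."""
    ancestors = set()
    while a is not None:
        ancestors.add(a)
        a = parent.get(a)
    while b is not None:
        if b in ancestors:
            return b
        b = parent.get(b)
    return None

parent = {'species_1': 'genus_A', 'species_2': 'genus_A', 'genus_A': None}
lca_map = {}
for qname, hit in [('query_1', 'species_1'), ('query_1', 'species_2')]:
    if qname in lca_map and lca_map[qname] != hit:
        lca_map[qname] = _toy_lca(lca_map[qname], hit, parent)
    elif qname not in lca_map:
        lca_map[qname] = hit
assert lca_map['query_1'] == 'genus_A'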
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # parse command line
    db = RefSeqDatabase()
    nt = NCBITree()
    with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
        if args.assembly != '-':
            ncbi_tid = db.get_ncbi_tid_from_assembly_accession_version(args.assembly)[0]
        elif args.refseq != '-':
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession(args.refseq)[0]
        elif args.tid != '-':
            ncbi_tid = int(args.tid)
        else:
            # Without an assembly, refseq, or tid argument, ncbi_tid would be
            # unbound below, so fail with a usage message instead.
            parser.error('an assembly, refseq, or tid argument is required')
        organism = nt.green_genes_lineage(ncbi_tid)
        # genus_species = organism.split(';')[-1]
        # genus_species = genus_species.replace('s__', '')
        outf.write('>ncbi_tid|%d|organism|%s\n' % (ncbi_tid, organism))
        outf.write('\n')
def list_organism_ofus(orgs, nt_cat, hclus, height, outpath):
    bgc_dd = defaultdict(list)
    for value, key in hclus.itertuples(index=True):
        key = str('%05d' % key)
        key = ''.join(['ofu', key])
        bgc_dd[key].extend([value])
    orgs_list = orgs.split(',')
    i = len(orgs_list)
    ofu_dict = defaultdict(list)
    # Preload the database and tree
    db = RefSeqDatabase()
    nt = NCBITree()
    for org in orgs_list:
        org_ofu_dup = []
        for ofu_num, ofu_orgs in bgc_dd.items():
            for ofu_org in ofu_orgs:
                if org in ofu_org:
                    if org.startswith('BGC'):
                        name = org
                    else:
                        name = identify_organism(org, nt_cat, db=db, nt=nt)
                    if ofu_num not in org_ofu_dup:
                        org_ofu_dup.append(ofu_num)
                        ofu_dict[name].extend([ofu_num])
    height = str(height)
    ofu_file = ''.join(['OFUs_from_similarity_level', height, '.txt'])
    outdf = pd.DataFrame.from_dict(ofu_dict, orient='index')
    if not outdf.empty:
        with open(os.path.join(outpath, ofu_file), 'w') as outf:
            outdf.to_csv(outf, sep='\t', header=False)
        print('\nOFU assignments written to file for the %d organism ID entries given.\n' % i)
    else:
        print('\nNo OFU assignments found; check the RefSeq / Genbank / MIBiG identifier and format\n')
    return ofu_dict
def tid_to_name(ncbi_tid, nt=NCBITree()):
    organism = nt.green_genes_lineage(ncbi_tid, depth=8, depth_force=True)
    return organism
def main():
    parser = make_arg_parser()
    args = parser.parse_args()
    # Both branches of the original ran the same GCF loop; only the warning
    # differed, so warn first and loop once.
    if os.getcwd().split('/')[-1] != 'antismash_results':
        print('\nNOTE:\nProgram may fail if you are not in a proper antismash_results directory\n')
    for rdir in os.listdir('.'):
        if rdir.startswith('GCF'):
            parse_products(rdir)
    if args.compile:
        if 'compiled_cluster_types' not in os.listdir('.'):
            os.mkdir('compiled_cluster_types')
        outfilename = 'compiled_cluster_types.csv'
        with open(os.path.join('compiled_cluster_types', outfilename), 'w') as outf:
            for cdir in os.listdir('.'):
                if cdir.startswith('GCF') and 'cluster_types' in os.listdir(cdir):
                    outf = compile_types(cdir, outf)
        # The with block closes outf; the original's explicit close() was redundant.
    if args.annotate:
        if not args.compile:
            print('\nSORRY:\nAnnotation only available with compiled option (-c)\n')
            quit()
        # Preload the database and tree
        db = RefSeqDatabase()
        nt = NCBITree()
        strain_label = []
        with open(os.path.join('compiled_cluster_types', 'compiled_cluster_types.csv')) as intab:
            odf = pd.read_csv(intab, index_col=0, header=None)
            refseq_list = list(odf.index)
            for refseq_id in refseq_list:
                organism = refseq_to_name(refseq_id, db=db, nt=nt)
                ncbi_tid = str(refseq_to_tid(refseq_id, db=db))
                if ncbi_tid == organism:
                    # Sometimes DOJO can't look up the refseq accession; in
                    # this case, just keep the refseq ID as the label.
                    strain_label.append(refseq_id)
                else:
                    strain_label.append('ncbi_tid|%s|ref|%s|organism|%s' % (ncbi_tid, refseq_id, organism))
            odf.index = strain_label
            odf.columns = ['cluster_type']
            an_outn = 'annotated_cluster_types.csv'
            with open(os.path.join('compiled_cluster_types', an_outn), 'w') as an_outf:
                odf.to_csv(an_outf)
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map, extract_ncbi_tid, depth, threads):
    verify_make_dir(output)
    fna_files = [os.path.join(input, filename) for filename in os.listdir(input) if filename.endswith('.fna')]
    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))
    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')
    sam_files = [os.path.join(output, filename) for filename in os.listdir(output) if filename.endswith('.sam')]
    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                    lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid
        # Annotate with lineages; null values are filtered out downstream.
        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)
    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val), lca_maps[basename])
    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)
    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)
    # Reverse the reference map (lineage -> accessions) to feed into embalmer.
    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])
    references_faidx = pyfaidx.Fasta(reference_fasta)
    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():
            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')
            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))
            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' % (record.name, record.seq))
            embalmer_align(queries_fna_filename, references_fna_filename, output_filename)
            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)
            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)
    os.rmdir(tmpdir)
    # Build a query-by-NCBI_TID DataFrame.
    sparse_ncbi_dict = defaultdict(dict)
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match.
            # np.int/np.float were removed in NumPy 1.24; the builtins behave the same here.
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])
    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
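# reverse_collision_dict() is defined elsewhere in the repo; from how its
# output is consumed above (each lineage key maps to the list of query
# headers that resolved to it), a plausible sketch is a dict inversion that
# collects collisions. The _sketch suffix marks this as an assumption, not
# the repo's actual implementation.
from collections import defaultdict

def reverse_collision_dict_sketch(d):
    """Invert {qname: lineage} into {lineage: [qnames]}."""
    inverted = defaultdict(list)
    for qname, lineage in d.items():
        inverted[lineage].append(qname)
    return dict(inverted)

assert reverse_collision_dict_sketch({'q1': 'g__A', 'q2': 'g__A'}) == {'g__A': ['q1', 'q2']}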
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # parse command line
    method = args.method
    height = 1 - (args.height / 100)
    with open(args.input, 'r') as inf:
        if args.clusterme:
            print('...performing hierarchical clustering, tree cut at height of %s...\n' % args.height)
            hclus = process_hierarchy(inf, height, method)
        else:
            hclus = pd.read_csv(inf, sep=',', header=0, index_col=0)
    size = hclus.max(0)[0]  # the total number of clustered OFUs (depends on height cut)
    print('\n...Preparing OFU profile for %s OFUs...\n' % size)
    size += 1
    fill = outer(size)
    dd = defaultdict(fill)  # initialize the dict with all zeros
    # Collapse into an OFU reference table, strains vs OFUs
    if args.clusterme:
        hclus.to_csv('hcsv_temp.csv')
        with open('hcsv_temp.csv', 'r') as inf2:
            df = cluster_ofus(inf2, dd)
    else:
        with open(args.input, 'r') as inf2:
            df = cluster_ofus(inf2, dd)
    j = 0
    k = 0
    if args.annotate:
        # Preload the database and tree
        db = RefSeqDatabase()
        nt = NCBITree()
        strain_label = []
        refseq_list = list(df.index)
        for refseq_id in refseq_list:
            if refseq_id.startswith('ncbi_tid'):
                ncbi_tid = refseq_id.split('|')[1]
                if ncbi_tid == 'na':
                    # No taxon ID: fall back to the Genbank portion of the
                    # header. (The original deferred this append to a later
                    # branch that read a possibly stale `organism` first.)
                    genbank = '|'.join(refseq_id.split('_')[1].split('|')[2:4])
                    strain_label.append(genbank)
                    j += 1
                else:
                    ncbi_tid = int(ncbi_tid)
                    organism = nt.green_genes_lineage(ncbi_tid, depth=8, depth_force=True)
                    if organism == 'k__;p__;c__;o__;f__;g__;s__;t__':
                        # Lookup produced an empty lineage; label with the tid alone.
                        strain_label.append('|'.join(['ncbi_tid', str(ncbi_tid)]))
                        k += 1
                    else:
                        strain_label.append(organism)
            else:
                # TODO: Finish the regex for refseq id
                # p = re.compile(r"N\w\_[\w+\d+]*\.\d")
                # m = p.search(refseq_id)  # searches using the regex defined above
                # refseq_id_extract = ''.join(m)
                organism = refseq_to_name(refseq_id, db=db, nt=nt)
                ncbi_tid = str(refseq_to_tid(refseq_id, db=db))
                if ncbi_tid == organism:
                    # Sometimes DOJO can't look up the refseq accession; in
                    # this case, just return refseq.
                    strain_label.append(refseq_id)
                elif args.taxonomy:
                    strain_label.append(organism)
                elif args.ncbitid:
                    strain_label.append(ncbi_tid)
                elif organism.endswith('None') or organism.endswith('t__'):
                    genus_species = organism.split(';')[-2]
                    strain_label.append('ncbi_tid|%s|ref|%s|organism|%s' % (ncbi_tid, refseq_id, genus_species))
                else:
                    strain = organism.split(';')[-1]
                    strain_label.append('ncbi_tid|%s|ref|%s|organism|%s' % (ncbi_tid, refseq_id, strain))
        df.index = strain_label
        df.sort_index(axis=0, inplace=True)
        if j > 0 or k > 0:
            print('Note: Organism information was not obtained for all clusters:\n')
            if j > 0:
                print('%s clusters had no NCBI tid...\n' % j)
            if k > 0:
                print('%s clusters did not match a full named taxonomy annotation\n' % k)
    with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
        df.to_csv(outf)
    print('...all done, cleaning up...\n')
    # hcsv_temp.csv only exists when clustering ran above; the unconditional
    # remove in the original raised FileNotFoundError otherwise.
    if args.clusterme and os.path.exists('hcsv_temp.csv'):
        os.remove('hcsv_temp.csv')
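# outer() is defined elsewhere in the repo; given that dd is described above
# as "initialized with all zeros" and sized to the OFU count, a plausible
# sketch is a closure handed to defaultdict so each new strain key gets its
# own zero vector. The _sketch suffix marks this as an assumption, not the
# repo's actual implementation.
def outer_sketch(size):
    def fill():
        return [0] * size
    return fill

# dd = defaultdict(outer_sketch(size))  # dd['new_strain'] -> [0, 0, ..., 0]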