def subset_fasta():
    """Command-line entry point: write a filtered subset of a FASTA file.

    Arguments are taken from ``make_arg_parser()``:

    * ``args.input``  -- path of the FASTA file to read.
    * ``args.keep``   -- forwarded unchanged to ``filter_fasta``.
    * ``args.output`` -- destination path; standard output is used when it
      is empty.

    The input file is read twice: a first pass counts the records, a second
    pass feeds them, together with that count, to ``filter_fasta``.
    """
    args = make_arg_parser().parse_args()

    # First pass: count the records.
    with open(args.input) as inf:
        num_reads = sum(1 for _ in FASTA(inf).read())

    # Second pass: filter and write.
    with open(args.input) as inf:
        filtered_fasta_gen = filter_fasta(FASTA(inf).read(), num_reads, args.keep)

        # Only close a handle that was opened here. Wrapping sys.stdout in a
        # ``with`` statement would close the interpreter's standard output.
        outf = open(args.output, 'w') if args.output else sys.stdout
        try:
            for title, data in filtered_fasta_gen:
                outf.write('>%s\n%s\n' % (title, data))
        finally:
            if outf is not sys.stdout:
                outf.close()
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth, depth_force):
    """Annotate a FASTA file and build a Bowtie2 database from the result.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``,
    where ``<name>`` is the input file name without its last extension, or
    ``stdin`` when ``input`` is ``'-'``. When both files already exist the
    annotation phase is skipped. ``bowtie2_build`` is then called with the
    annotated FASTA and ``<output>/bt2/<name>``, and its return value is
    printed.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input.
        output: Output directory (created through ``verify_make_dir``).
        annotater: ``'refseq'`` selects ``RefSeqAnnotater``, ``'nt'`` selects
            ``NTAnnotater``; any other value selects ``GIAnnotater``.
        extract_id: Forwarded to the annotater constructor.
        prefixes: Forwarded to the RefSeq/NT annotater constructors.
        depth: Forwarded to the annotater constructor.
        depth_force: Forwarded to the annotater constructor.
    """
    import sys  # local import: only needed for the stdin branch below

    verify_make_dir(output)

    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')

    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()
        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree,
                                              depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree,
                                          depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree,
                                          depth=depth, depth_force=depth_force)

        # '-' is named 'stdin' above, so it must also be *read* from stdin
        # rather than opened as a file literally called '-'. The input is
        # opened before the outputs so that a missing input does not leave
        # empty output files behind (which would make the next run skip the
        # annotation phase).
        inf = sys.stdin if input == '-' else open(input)
        try:
            with open(outf_fasta, 'w') as output_fna, open(outf_map, 'w') as output_map:
                inf_fasta = FASTA(inf)
                for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                    output_fna.write(lines_fna)
                    output_map.write(lines_map)
        finally:
            # Never close the process-wide standard input.
            if inf is not sys.stdin:
                inf.close()
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
def main():
    """Rewrite FASTA headers with an NCBI taxonomy id and lineage.

    Command-line arguments come from ``make_arg_parser()``; ``args.input``
    and ``args.output`` are file paths, with ``'-'`` selecting standard
    input / standard output respectively.

    For every record, ``.cluster`` in the header is replaced by ``_cluster``
    and the part of the header before ``_cluster`` is looked up with
    ``RefSeqDatabase.get_ncbi_tid_from_refseq_accession_version``. When the
    lookup returns a truthy result, its first element is used as the
    taxonomy id and the record is written as
    ``>ncbi_tid|<tid>|ref|<header>|organism|<lineage>|`` where ``<lineage>``
    is ``NCBITree.gg_lineage(tid)``. Otherwise it is written as
    ``>ref|<header>|``. The sequence follows on the next line.
    """
    args = make_arg_parser().parse_args()

    db = RefSeqDatabase()
    nt = NCBITree()

    # Open the streams by hand so that only real files are closed afterwards;
    # using sys.stdin / sys.stdout as context managers would close them for
    # the whole process.
    inf = sys.stdin if args.input == '-' else open(args.input, 'r')
    try:
        fasta_gen = FASTA(inf)
        outf = sys.stdout if args.output == '-' else open(args.output, 'w')
        try:
            for header, sequence in fasta_gen.read():
                # str.replace is a no-op when '.cluster' is absent.
                header = header.replace('.cluster', '_cluster')
                ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(
                    header.split('_cluster')[0])
                if ncbi_tid:
                    ncbi_tid = ncbi_tid[0]
                    organism = nt.gg_lineage(ncbi_tid)
                    outf.write('>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, organism))
                else:
                    outf.write('>ref|%s|\n' % (header))
                outf.write(sequence + '\n')
        finally:
            if outf is not sys.stdout:
                outf.close()
    finally:
        if inf is not sys.stdin:
            inf.close()
def main():
    """Rewrite FASTA headers with an NCBI taxonomy id and organism name.

    Command-line arguments come from ``make_arg_parser()``; ``args.input``
    and ``args.output`` are file paths, with ``'-'`` selecting standard
    input / standard output respectively.

    For every record, ``.cluster`` in the header is replaced by ``_cluster``
    and the part of the header before ``_cluster`` is looked up with
    ``RefSeqDatabase.get_ncbi_tid_from_refseq_accession_version``. When the
    lookup returns a truthy result, its first element is used as the
    taxonomy id and the record is written as
    ``>ncbi_tid|<tid>|ref|<header>|organism|<name>|``. ``<name>`` is the
    last ``;``-separated field of ``NCBITree.gg_lineage(tid)`` with every
    ``s__`` removed. Otherwise the record is written as ``>ref|<header>|``.
    The sequence follows on the next line.
    """
    args = make_arg_parser().parse_args()

    db = RefSeqDatabase()
    nt = NCBITree()

    # Open the streams by hand so that only real files are closed afterwards;
    # using sys.stdin / sys.stdout as context managers would close them for
    # the whole process.
    inf = sys.stdin if args.input == '-' else open(args.input, 'r')
    try:
        fasta_gen = FASTA(inf)
        outf = sys.stdout if args.output == '-' else open(args.output, 'w')
        try:
            for header, sequence in fasta_gen.read():
                # str.replace is a no-op when '.cluster' is absent.
                header = header.replace('.cluster', '_cluster')
                ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(
                    header.split('_cluster')[0])
                if ncbi_tid:
                    ncbi_tid = ncbi_tid[0]
                    organism = nt.gg_lineage(ncbi_tid)
                    # Keep only the last lineage field, without 's__'.
                    genus_species = organism.split(';')[-1].replace('s__', '')
                    outf.write('>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, genus_species))
                else:
                    outf.write('>ref|%s|\n' % (header))
                outf.write(sequence + '\n')
        finally:
            if outf is not sys.stdout:
                outf.close()
    finally:
        if inf is not sys.stdin:
            inf.close()
def extract_genome_lengths(input, map, output):
    """Accumulate sequence lengths per map key and write them as TSV.

    One line is pulled from ``map`` for every FASTA record read from
    ``input``. The key is the second tab-separated field of that line with
    every ``'; '`` collapsed to ``';'``. The lengths of all sequences that
    share a key are summed, and one ``key<TAB>total`` line per key is
    written to ``output``.

    Args:
        input: Handle passed to ``FASTA``.
        map: Iterator of map lines, advanced with ``next`` once per record.
        output: Object with a ``write`` method.
    """
    totals = defaultdict(int)
    for _, sequence in FASTA(input).read():
        columns = next(map).rstrip().split('\t')
        key = columns[1].replace('; ', ';')
        totals[key] += len(sequence)

    for key, total in totals.items():
        output.write('%s\t%s\n' % (key, total))
def filter_dusted_fasta(input, threshold, output):
    """Hard-mask FASTA records and keep those with enough unmasked bases.

    Every character of a sequence that is not an upper-case ASCII letter is
    replaced by ``N``. A record is written to ``output`` (header line, then
    the masked sequence) only when the sequence is non-empty, has at least
    one non-``N`` character, and its fraction of non-``N`` characters is
    strictly greater than ``threshold``.

    Args:
        input: Handle passed to ``FASTA``.
        threshold: Minimum (exclusive) fraction of non-``N`` characters.
        output: Object with a ``write`` method.
    """
    # Compile once instead of re-resolving the pattern for every record.
    not_upper = re.compile('[^A-Z]')
    for title, seq in FASTA(input).read():
        # Mask once and reuse the result for both the test and the output.
        masked = not_upper.sub('N', seq)
        if not masked:
            continue
        # float() keeps the ratio a true division on every Python version.
        hits = float(len(masked) - masked.count('N'))
        if hits and hits / len(masked) > threshold:
            output.write('>%s\n' % title)
            output.write('%s\n' % masked)
def filter_dusted_fasta(input, threshold, output):
    """Hard-mask FASTA records and keep those with enough unmasked bases.

    Every character of a sequence that is not an upper-case ASCII letter is
    replaced by ``N``. A record is written to ``output`` (header line, then
    the masked sequence) only when the sequence is non-empty, has at least
    one non-``N`` character, and its fraction of non-``N`` characters is
    strictly greater than ``threshold``.

    Args:
        input: Handle passed to ``FASTA``.
        threshold: Minimum (exclusive) fraction of non-``N`` characters.
        output: Object with a ``write`` method.
    """
    # Compile once instead of re-resolving the pattern for every record.
    non_upper_pattern = re.compile('[^A-Z]')
    for title, seq in FASTA(input).read():
        # The masked sequence is computed a single time and reused for the
        # output line rather than being substituted again.
        masked = non_upper_pattern.sub('N', seq)
        length = len(masked)
        if not length:
            continue
        # float() keeps the ratio a true division on every Python version.
        hits = float(length - masked.count('N'))
        if hits and hits / length > threshold:
            output.write('>%s\n' % title)
            output.write('%s\n' % masked)
def compile_ofu_dnasequences(inf_d, bgc, dna_outf):
    """Copy the FASTA records whose header mentions ``bgc`` to ``dna_outf``.

    ``.cluster`` in a header is rewritten to ``_cluster`` before the
    substring test, and the rewritten header is the one that is written.

    Args:
        inf_d: Handle passed to ``FASTA``.
        bgc: Value whose ``str()`` form is searched for in each header.
        dna_outf: Object with a ``write`` method.

    Returns:
        ``dna_outf``, to allow chaining.
    """
    wanted = str(bgc)
    for title, seq in FASTA(inf_d).read():
        # replace() leaves headers without '.cluster' untouched.
        title = title.replace('.cluster', '_cluster')
        if wanted not in title:
            continue
        dna_outf.write('>' + title + '\n')
        dna_outf.write(seq + '\n')
    return dna_outf
def compile_ofu_sequences(inf_m, bgc, aa_outf):
    """Copy the FASTA records whose header mentions ``bgc`` to ``aa_outf``.

    ``.cluster`` in a header is rewritten to ``_cluster`` before the
    substring test, and the rewritten header is the one that is written.

    Args:
        inf_m: Handle passed to ``FASTA``.
        bgc: Value whose ``str()`` form is searched for in each header.
        aa_outf: Object with a ``write`` method.

    Returns:
        ``aa_outf``, to allow chaining.
    """
    wanted = str(bgc)
    for title, seq in FASTA(inf_m).read():
        # replace() leaves headers without '.cluster' untouched.
        title = title.replace('.cluster', '_cluster')
        if wanted not in title:
            continue
        aa_outf.write('>' + title + '\n')
        aa_outf.write(seq + '\n')
    return aa_outf
def cmd_open_fasta(fastas):
    """Open one or more FASTA files and yield a record reader for each.

    For every path in ``fastas`` a progress message is echoed, the file is
    opened with ``click.open_file`` and the result of ``FASTA(...).read()``
    is yielded. If anything in that sequence raises an ``Exception``, the
    error is echoed to stderr and iteration continues with the next path.
    """
    for path in fastas:
        try:
            click.echo('Opening "%s"' % path)
            # NOTE(review): the handle is not closed here; presumably the
            # yielded reader still needs it while the caller iterates --
            # confirm who is responsible for closing it.
            reader = FASTA(click.open_file(path))
            yield reader.read()
        except Exception as exc:
            click.echo('Could not open FASTA "%s": %s' % (path, exc), err=True)
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth, depth_force):
    """Annotate a FASTA file and build a Bowtie2 database from the result.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``,
    where ``<name>`` is the input file name without its last extension, or
    ``stdin`` when ``input`` is ``'-'``. When both files already exist the
    annotation phase is skipped. ``bowtie2_build`` is then called with the
    annotated FASTA and ``<output>/bt2/<name>``, and its return value is
    printed.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input.
        output: Output directory (created through ``verify_make_dir``).
        annotater: ``'refseq'`` selects ``RefSeqAnnotater``, ``'nt'`` selects
            ``NTAnnotater``; any other value selects ``GIAnnotater``.
        extract_id: Forwarded to the annotater constructor.
        prefixes: Forwarded to the RefSeq/NT annotater constructors.
        depth: Forwarded to the annotater constructor.
        depth_force: Forwarded to the annotater constructor.
    """
    import sys  # local import: only needed for the stdin branch below

    verify_make_dir(output)

    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')

    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()
        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(
                extract_id, prefixes, db, tree,
                depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(
                extract_id, prefixes, db, tree,
                depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(
                extract_id, db, tree,
                depth=depth, depth_force=depth_force)

        # '-' is named 'stdin' above, so it must also be *read* from stdin
        # rather than opened as a file literally called '-'. Opening the
        # input first means a missing input cannot leave empty output files
        # behind, which would make the next run skip the annotation phase.
        inf = sys.stdin if input == '-' else open(input)
        try:
            with open(outf_fasta, 'w') as output_fna, open(outf_map, 'w') as output_map:
                inf_fasta = FASTA(inf)
                for lines_fna, lines_map in annotater_class.annotate(
                        inf_fasta.read()):
                    output_fna.write(lines_fna)
                    output_map.write(lines_map)
        finally:
            # Never close the process-wide standard input.
            if inf is not sys.stdin:
                inf.close()
    else:
        print(
            "Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file."
            % (outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
def build_cluster_map(inf, bread='ref|,|'):
    """Group cluster identifiers found in FASTA headers by reference key.

    ``bread`` holds the two comma-separated delimiters handed to
    ``find_between`` to cut the reference out of each header (after
    ``.cluster`` has been rewritten to ``_cluster``). The reference is split
    on ``_``: its first three pieces, re-joined with ``_``, form the key and
    its last two pieces, re-joined with ``_``, form the value.

    Args:
        inf: Handle passed to ``FASTA``.
        bread: ``'<begin>,<end>'`` delimiters for ``find_between``.

    Returns:
        ``defaultdict(set)`` mapping each key to the set of its values.
    """
    begin, end = bread.split(',')
    cluster_map = defaultdict(set)
    for title, _ in FASTA(inf).read():
        # replace() leaves headers without '.cluster' untouched.
        title = title.replace('.cluster', '_cluster')
        pieces = find_between(title, begin, end).split('_')
        cluster_map['_'.join(pieces[:3])].add('_'.join(pieces[-2:]))
    return cluster_map
def build_cluster_map(inf, bread='ref|,|'):
    """Group cluster identifiers found in FASTA headers by reference key.

    ``bread`` holds the two comma-separated delimiters handed to
    ``find_between`` to cut the reference out of each header (after
    ``.cluster`` has been rewritten to ``_cluster``). The reference is split
    on ``_``: its first three pieces, re-joined with ``_``, form the key and
    its last piece is the value.

    Args:
        inf: Handle passed to ``FASTA``.
        bread: ``'<begin>,<end>'`` delimiters for ``find_between``.

    Returns:
        ``defaultdict(set)`` mapping each key to the set of its values.
    """
    begin, end = bread.split(',')
    cluster_map = defaultdict(set)
    for title, _ in FASTA(inf).read():
        # replace() leaves headers without '.cluster' untouched.
        title = title.replace('.cluster', '_cluster')
        pieces = find_between(title, begin, end).split('_')
        cluster_map['_'.join(pieces[:3])].add(pieces[-1])
    return cluster_map
def _concatenate_fasta_dir(source_dir, destination):
    """Write every FASTA record of every file in ``source_dir`` to ``destination``."""
    with open(destination, 'w') as outfile:
        # os.listdir gives no ordering guarantee; sort so that the combined
        # file is reproducible from run to run.
        for name in sorted(os.listdir(source_dir)):
            with open(os.path.join(source_dir, name), 'r') as infile:
                for header, sequence in FASTA(infile).read():
                    outfile.write('>' + header + '\n')
                    outfile.write(sequence + '\n')


def compile_files(outpath):
    """Merge the per-file FASTA outputs under ``outpath`` into two files.

    * every file in ``<outpath>/antismash_db_protein_seqs`` is concatenated
      into ``<outpath>/asDB_protein_seqs.mpfa``;
    * every file in ``<outpath>/antismash_db_dna_seqs`` is concatenated into
      ``<outpath>/asDB_dna_seqs.fna``.

    Records are re-emitted as a ``>header`` line followed by the sequence on
    one line. Both destination files are overwritten.

    Args:
        outpath: Directory containing the two source sub-directories.

    Returns:
        None.
    """
    _concatenate_fasta_dir(
        os.path.join(outpath, 'antismash_db_protein_seqs'),
        os.path.join(outpath, 'asDB_protein_seqs.mpfa'))
    _concatenate_fasta_dir(
        os.path.join(outpath, 'antismash_db_dna_seqs'),
        os.path.join(outpath, 'asDB_dna_seqs.fna'))
    return None
def main():
    """Prefix FASTA headers with an NCBI taxonomy id and assembly version.

    Command-line arguments come from ``make_arg_parser()``; ``args.input``
    and ``args.output`` are file paths, with ``'-'`` selecting standard
    input / standard output respectively.

    The assembly version is the base name of ``args.input`` up to the first
    ``_genomic``. For each record the first whitespace-separated token of
    the header is looked up with
    ``RefSeqDatabase.get_ncbi_tid_from_refseq_accession_version`` and the
    record is written as ``>ncbi_tid|<tid>|assembly|<version>|ref|<header>``
    followed by the sequence on the next line.
    """
    args = make_arg_parser().parse_args()

    db = RefSeqDatabase()

    # Open the streams by hand so that only real files are closed afterwards;
    # using sys.stdin / sys.stdout as context managers would close them for
    # the whole process.
    inf = sys.stdin if args.input == '-' else open(args.input, 'r')
    try:
        fasta_gen = FASTA(inf)
        assembly_version = os.path.basename(args.input).split('_genomic')[0]
        outf = sys.stdout if args.output == '-' else open(args.output, 'w')
        try:
            for header, sequence in fasta_gen.read():
                # NOTE(review): the lookup result is indexed unconditionally;
                # an accession the database cannot resolve would presumably
                # raise here -- confirm that this is the intended behaviour.
                ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(header.split()[0])[0]
                outf.write('>ncbi_tid|%d|assembly|%s|ref|%s\n' % (ncbi_tid, assembly_version, header))
                outf.write(sequence + '\n')
        finally:
            if outf is not sys.stdout:
                outf.close()
    finally:
        if inf is not sys.stdin:
            inf.close()
def main():
    """Prefix FASTA headers with an NCBI taxonomy id and assembly version.

    Command-line arguments come from ``make_arg_parser()``; ``args.input``
    and ``args.output`` are file paths, with ``'-'`` selecting standard
    input / standard output respectively.

    The assembly version is the base name of ``args.input`` up to the first
    ``_genomic``. For each record the first whitespace-separated token of
    the header is looked up with
    ``RefSeqDatabase.get_ncbi_tid_from_refseq_accession_version`` and the
    record is written as ``>ncbi_tid|<tid>|assembly|<version>|ref|<header>``
    followed by the sequence on the next line.
    """
    args = make_arg_parser().parse_args()

    db = RefSeqDatabase()

    # Open the streams by hand so that only real files are closed afterwards;
    # using sys.stdin / sys.stdout as context managers would close them for
    # the whole process.
    inf = sys.stdin if args.input == '-' else open(args.input, 'r')
    try:
        fasta_gen = FASTA(inf)
        assembly_version = os.path.basename(args.input).split('_genomic')[0]
        outf = sys.stdout if args.output == '-' else open(args.output, 'w')
        try:
            for header, sequence in fasta_gen.read():
                # NOTE(review): the lookup result is indexed unconditionally;
                # an accession the database cannot resolve would presumably
                # raise here -- confirm that this is the intended behaviour.
                ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(
                    header.split()[0])[0]
                outf.write('>ncbi_tid|%d|assembly|%s|ref|%s\n' %
                           (ncbi_tid, assembly_version, header))
                outf.write(sequence + '\n')
        finally:
            if outf is not sys.stdout:
                outf.close()
    finally:
        if inf is not sys.stdin:
            inf.close()
def pluckify_mibig(inf, outaa, gbk_dd):
    """Re-header MIBiG FASTA records using per-cluster metadata.

    Each header is split on ``|``; its first field is the cluster id used to
    index ``gbk_dd``. Element 0 of that entry is the taxonomy id and element
    1 the organism. Records whose taxonomy id equals the string ``'None'``
    are skipped. The remaining records are written to ``outaa`` as::

        >ncbi_tid|<tid>|mibig|<id>.1_cluster001_<f1>_<f2>|genbank|<f6>|organism|<organism>

    where ``<fN>`` is the N-th ``|``-separated header field, followed by the
    sequence on its own line.

    Args:
        inf: Handle passed to ``FASTA``.
        outaa: Object with a ``write`` method.
        gbk_dd: Mapping from cluster id to an indexable entry.

    Returns:
        None.
    """
    for header, sequence in FASTA(inf).read():
        fields = header.split('|')
        bgc_id = fields[0]
        entry = gbk_dd[bgc_id]
        tid = entry[0]
        if tid == 'None':
            continue
        # Read the organism only after the skip test, as element 1 is not
        # needed for skipped records.
        organism = entry[1]
        title = ''.join([
            '>', 'ncbi_tid|', str(tid),
            '|mibig|', bgc_id, '.1_cluster001', '_', fields[1], '_', fields[2],
            '|genbank|', fields[6],
            '|organism|', organism, '\n',
        ])
        outaa.write(title)
        outaa.write(sequence + '\n')
    return None
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    """Annotate a FASTA file and build a compressed UTree database from it.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``,
    where ``<name>`` is the input file name without its last extension, or
    ``stdin`` when ``input`` is ``'-'``. When both files already exist the
    annotation phase is skipped.

    The tree is then built under ``<output>/utree``. If ``<name>.ctr``
    already exists nothing is done. Otherwise ``<name>.utr`` is built with
    ``utree_build`` (unless it already exists), compressed to ``<name>.ctr``
    with ``utree_compress`` and then removed. The return values of both
    helpers are printed.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input.
        output: Output directory (created through ``verify_make_dir``).
        annotater: ``'refseq'``, ``'nt'`` or ``'ncbi'`` select
            ``RefSeqAnnotater``, ``NTAnnotater`` or ``NCBIAnnotater``; any
            other value selects ``GIAnnotater``.
        extract_id: Forwarded to the annotater constructor.
        threads: Forwarded to ``utree_build``.
        prefixes: Forwarded to the RefSeq/NT annotater constructors.
        depth: Forwarded to the annotater constructor.
        depth_force: Forwarded to the annotater constructor.
    """
    import sys  # local import: only needed for the stdin branch below

    verify_make_dir(output)

    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')

    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()
        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree,
                                              depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree,
                                          depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            annotater_class = NCBIAnnotater(extract_id, tree,
                                            depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree,
                                          depth=depth, depth_force=depth_force)

        # '-' is named 'stdin' above, so it must also be *read* from stdin
        # rather than opened as a file literally called '-'. Opening the
        # input first means a missing input cannot leave empty output files
        # behind, which would make the next run skip the annotation phase.
        inf = sys.stdin if input == '-' else open(input)
        try:
            with open(outf_fasta, 'w') as output_fna, open(outf_map, 'w') as output_map:
                inf_fasta = FASTA(inf)
                for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                    output_fna.write(lines_fna)
                    output_map.write(lines_map)
        finally:
            # Never close the process-wide standard input.
            if inf is not sys.stdin:
                inf.close()
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        # Reuse an uncompressed tree left over from an earlier run.
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    """Annotate a FASTA file and build a compressed UTree database from it.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``,
    where ``<name>`` is the input file name without its last extension, or
    ``stdin`` when ``input`` is ``'-'``. When both files already exist the
    annotation phase is skipped.

    The tree is then built under ``<output>/utree``. If ``<name>.ctr``
    already exists nothing is done. Otherwise ``<name>.utr`` is built with
    ``utree_build`` (unless it already exists), compressed to ``<name>.ctr``
    with ``utree_compress`` and then removed. The return values of both
    helpers are printed.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input.
        output: Output directory (created through ``verify_make_dir``).
        annotater: ``'refseq'`` selects ``RefSeqAnnotater``, ``'nt'`` selects
            ``NTAnnotater``; any other value selects ``GIAnnotater``.
        extract_id: Forwarded to the annotater constructor.
        threads: Forwarded to ``utree_build``.
        prefixes: Forwarded to the RefSeq/NT annotater constructors.
        depth: Forwarded to the annotater constructor.
        depth_force: Forwarded to the annotater constructor.
    """
    import sys  # local import: only needed for the stdin branch below

    verify_make_dir(output)

    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')

    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()
        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree,
                                              depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree,
                                          depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree,
                                          depth=depth, depth_force=depth_force)

        # '-' is named 'stdin' above, so it must also be *read* from stdin
        # rather than opened as a file literally called '-'. Opening the
        # input first means a missing input cannot leave empty output files
        # behind, which would make the next run skip the annotation phase.
        inf = sys.stdin if input == '-' else open(input)
        try:
            with open(outf_fasta, 'w') as output_fna, open(outf_map, 'w') as output_map:
                inf_fasta = FASTA(inf)
                for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                    output_fna.write(lines_fna)
                    output_map.write(lines_map)
        finally:
            # Never close the process-wide standard input.
            if inf is not sys.stdin:
                inf.close()
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        # Reuse an uncompressed tree left over from an earlier run.
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """Prepend NCBI taxonomy ids to the FASTA headers that can be resolved.

    The accession is cut out of each ``>``-prefixed header with
    ``find_between`` using the two delimiters in ``extract_refseq_id``. When
    its first two characters are an accepted prefix and the database lookup
    returns a truthy result, the header becomes
    ``>ncbi_tid|<tid>|<original header>``. Every record is written to
    ``output``, whether or not it was annotated.

    Args:
        input: Handle passed to ``FASTA``.
        output: Object with a ``write`` method.
        extract_refseq_id: ``'<begin>,<end>'`` delimiters for
            ``find_between``.
        prefixes: Comma-separated accession prefixes; if one of them is
            ``*``, every key of ``db.refseq_prefix_mapper`` is accepted.
    """
    db = RefSeqDatabase()

    requested = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')

    # check for the glob prefix
    if '*' in requested:
        prefix_set = set(db.refseq_prefix_mapper.keys())
    else:
        prefix_set = set(requested)

    for title, seq in FASTA(input).read():
        out_title = '>' + title
        accession = find_between(out_title, begin, end)
        if accession[:2] in prefix_set:
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(accession)
            if ncbi_tid:
                out_title = '>ncbi_tid|%d|%s' % (ncbi_tid[0], title)
        output.write('%s\n%s\n' % (out_title, seq))
def annotate_fasta(input, output, extract_refseq_id, prefixes, depth, depth_force):
    """Annotate a FASTA file and write the annotated FASTA plus its map.

    The results go to ``<output>/<name>.annotated.fna`` and
    ``<output>/<name>.annotated.map``, where ``<name>`` is the input file
    name without its last extension, or ``stdin`` when ``input`` is ``'-'``.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input.
        output: Output directory (created through ``verify_make_dir``).
        extract_refseq_id: Forwarded to ``refseq_annotater``.
        prefixes: Forwarded to ``refseq_annotater``.
        depth: Forwarded to ``refseq_annotater``.
        depth_force: Forwarded to ``refseq_annotater``.
    """
    verify_make_dir(output)

    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    path_fna = os.path.join(output, output_fn + '.annotated.fna')
    path_map = os.path.join(output, output_fn + '.annotated.map')

    # Open the input by hand so that only a real file is closed afterwards;
    # using sys.stdin as a context manager would close it for the whole
    # process.
    inf = sys.stdin if input == '-' else open(input, 'r')
    try:
        with open(path_fna, 'w') as output_fna, open(path_map, 'w') as output_map:
            inf_fasta = FASTA(inf)
            annotater = refseq_annotater(inf_fasta.read(), prefixes, extract_refseq_id,
                                         depth=depth, depth_force=depth_force)
            for lines_fna, lines_map in annotater:
                output_fna.write(lines_fna)
                output_map.write(lines_map)
    finally:
        if inf is not sys.stdin:
            inf.close()
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """Prepend NCBI taxonomy ids to the FASTA headers that can be resolved.

    The accession is cut out of each ``>``-prefixed header with
    ``find_between`` using the two delimiters in ``extract_refseq_id``. When
    its first two characters are an accepted prefix and the database lookup
    returns a truthy result, the header becomes
    ``>ncbi_tid|<tid>|<original header>``. Every record is written to
    ``output``, whether or not it was annotated.

    Args:
        input: Handle passed to ``FASTA``.
        output: Object with a ``write`` method.
        extract_refseq_id: ``'<begin>,<end>'`` delimiters for
            ``find_between``.
        prefixes: Comma-separated accession prefixes; if one of them is
            ``*``, every key of ``db.refseq_prefix_mapper`` is accepted.
    """
    db = RefSeqDatabase()

    prefix_list = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')

    # check for the glob prefix
    accepted = set(db.refseq_prefix_mapper.keys()) if '*' in prefix_list else set(prefix_list)

    for title, seq in FASTA(input).read():
        header_line = '>' + title
        accession_version = find_between(header_line, begin, end)
        if accession_version[:2] in accepted:
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(
                accession_version)
            if ncbi_tid:
                header_line = '>ncbi_tid|%d|%s' % (ncbi_tid[0], title)
        output.write('%s\n%s\n' % (header_line, seq))
def soft_mask2hard_mask(input, output):
    """Rewrite a FASTA file with every non-upper-case character set to ``N``.

    Each record is written to ``output`` as its unchanged ``>title`` line
    followed by the sequence in which every character outside ``A``-``Z``
    has been replaced by ``N``.

    Args:
        input: Handle passed to ``FASTA``.
        output: Object with a ``write`` method.
    """
    for header, sequence in FASTA(input).read():
        output.write('>%s\n' % header)
        hard_masked = re.sub('[^A-Z]', 'N', sequence)
        output.write('%s\n' % hard_masked)
#!/usr/bin/env python
# Tag FASTA headers with a ``kraken:taxid`` field taken from an
# accession-to-taxid table. Input and output paths are read from the
# ``snakemake`` object that is expected to be present in the global scope.

# NOTE(review): ipdb is not referenced anywhere below -- possibly a debugging
# leftover; confirm before removing.
import ipdb
from ninja_utils.parsers import FASTA
import csv

# Build the lookup from the tab-separated mapping file: first column is the
# key, second column the taxid. The first row is discarded (presumably a
# header row).
with open(snakemake.input.mapping) as mapping_file:
    rows = csv.reader(mapping_file, delimiter='\t')
    next(rows)
    accession2taxid = {row[0]: row[1] for row in rows}

# Records whose title (up to the first '.') is missing from the lookup are
# dropped; the others are written with the taxid appended to the title.
with open(snakemake.output[0], 'w') as outf, open(snakemake.input.fasta[0]) as fasta_file:
    for title, seq in FASTA(fasta_file).read():
        taxid = accession2taxid.get(title.split(".")[0])
        if taxid is not None:
            outf.write('>%s|kraken:taxid|%s\n%s\n' % (title, taxid, seq))