def main(): parser = make_arg_parser() args = parser.parse_args() db = RefSeqDatabase() nt = NCBITree() # parse command line with open(args.input, 'r') if args.input != '-' else sys.stdin as inf: fasta_gen = FASTA(inf) assembly_version = os.path.basename(args.input).split('_genomic')[0] with open(args.output, 'w') if args.output != '-' else sys.stdout as outf: for header, sequence in fasta_gen.read(): if '.cluster' in header: header = header.replace('.cluster','_cluster') else: pass ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(header.split('_cluster')[0]) if ncbi_tid: ncbi_tid = ncbi_tid[0] organism = nt.gg_lineage(ncbi_tid) genus_species = organism.split(';')[-1] genus_species = genus_species.replace('s__','') outf.write('>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, genus_species)) outf.write(sequence+'\n') else: outf.write('>ref|%s|\n' % (header)) outf.write(sequence+'\n')
def main(): parser = make_arg_parser() args = parser.parse_args() db = RefSeqDatabase() nt = NCBITree() # parse command line with open(args.input, 'r') if args.input != '-' else sys.stdin as inf: fasta_gen = FASTA(inf) assembly_version = os.path.basename(args.input).split('_genomic')[0] with open(args.output, 'w') if args.output != '-' else sys.stdout as outf: for header, sequence in fasta_gen.read(): if '.cluster' in header: header = header.replace('.cluster','_cluster') else: pass ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(header.split('_cluster')[0]) if ncbi_tid: ncbi_tid = ncbi_tid[0] organism = nt.gg_lineage(ncbi_tid) # genus_species = organism.split(';')[-1] # genus_species = genus_species.replace('s__','') outf.write('>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, organism)) outf.write(sequence+'\n') else: outf.write('>ref|%s|\n' % (header)) outf.write(sequence+'\n')