def refseq_get_ftp_links_from_file(input, output):
    """Resolve taxon names read from *input* to NCBI taxonomy ids and write
    their RefSeq FTP links to *output* as CSV.

    Args:
        input: iterable of taxon-name lines (e.g. an open text file).
        output: writable file-like object; receives a header row followed by
            'ncbi_tid,gg_lineage,ftp_link' rows.
    """
    db = RefSeqDatabase()
    tree = NCBITree()

    # Collect the NCBI taxonomy ids for every cleaned name in the input.
    ncbi_tid_set = set()
    for line in input:
        # Strip annotations that would prevent an exact name lookup.
        # NOTE(review): replacing 'cf' removes that substring anywhere in the
        # name, not only the 'cf.' qualifier — confirm this is intended.
        name = line.replace(' unclassified', '').replace('cf', '').strip()
        for row in db.yield_ncbi_tid_row_from_name(name):
            ncbi_tid_set.add(row[0])

    # How many total strains are there in HMP?
    # TODO: Switch the tree around - predecessor and successors
    ncbi_tid_successors = set()
    for ncbi_tid in ncbi_tid_set:
        for pred in tree.tree.predecessors_iter(ncbi_tid):
            if pred not in ncbi_tid_set:
                ncbi_tid_successors.add(pred)
    ncbi_tid_set |= ncbi_tid_successors

    output.write('ncbi_tid,gg_lineage,ftp_link\n')
    for ncbi_tid in ncbi_tid_set:
        for ftp_link in db.yield_ftp_links(ncbi_tid):
            output.write('%s,%s,%s\n' % (ncbi_tid, tree.gg_lineage(ncbi_tid), ftp_link))
def download_refseq_all(verbose):
    """Download every RefSeq genome (Bacteria, Viruses, Archaea) that has both
    an FTP link and a RefSeq version, using a 4-worker process pool.

    Args:
        verbose: currently unused; kept for interface compatibility.
    """
    pool = multiprocessing.Pool(processes=4)
    rf = RefSeqDatabase()
    data = rf.get_blaze()
    tree = NCBITree()
    specified_kingdoms = {'k__Bacteria', 'k__Viruses', 'k__Archaea'}
    # BUGFIX: Python 'and' between two column predicates evaluates the
    # truthiness of the first expression and returns a single operand,
    # silently dropping the second filter. Element-wise conjunction on
    # column expressions requires '&' with parenthesized operands.
    ftp_view = data.tree[(data.tree.ftp != '') & (data.tree.refseq_version != '')]
    ftp_links = yield_ftp_links(ftp_view, specified_kingdoms, tree)
    pool.map(download_ftp_link, ftp_links)
    print('Done')
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth, depth_force):
    """Annotate a FASTA file with taxonomy and build a Bowtie2 database.

    Writes '<name>.annotated.fna' and '<name>.annotated.map' into *output*
    (skipping annotation when both already exist), then builds the Bowtie2
    index under '<output>/bt2/'.

    Args:
        input: path to the input FASTA, or '-' for stdin naming.
        output: output directory (created if missing).
        annotater: one of 'refseq', 'nt', 'ncbi'; anything else uses the GI
            annotater.
        extract_id, prefixes, depth, depth_force: forwarded to the chosen
            annotater class.
    """
    verify_make_dir(output)
    # Derive the output basename ('stdin' when reading from a pipe).
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])
    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()
        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            # Consistency fix: shogun_utree_db accepts the 'ncbi' annotater;
            # support it here too instead of silently falling back to GI.
            annotater_class = NCBIAnnotater(extract_id, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)
        with open(outf_fasta, 'w') as output_fna, open(outf_map, 'w') as output_map, open(input) as inf:
            inf_fasta = FASTA(inf)
            for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                output_fna.write(lines_fna)
                output_map.write(lines_map)
    else:
        print(
            "Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file."
            % (outf_fasta, outf_map))
    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """Rewrite FASTA titles, prepending 'ncbi_tid|<tid>|' when the RefSeq
    accession.version embedded in the title resolves to an NCBI taxonomy id.

    NOTE(review): an identical 'refseq_annotate' is defined again later in
    this file and shadows this one — one of the two copies should be removed.

    Args:
        input: file-like FASTA source.
        output: writable file-like object for the annotated FASTA.
        extract_refseq_id: 'begin,end' delimiters around the accession.
        prefixes: comma-separated RefSeq prefixes, or '*' for all known ones.
    """
    db = RefSeqDatabase()
    # check for the glob prefix
    prefixes = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')
    if '*' in prefixes:
        prefix_set = set(db.refseq_prefix_mapper.keys())
    else:
        prefix_set = set(prefixes)
    inf_fasta = FASTA(input)
    for title, seq in inf_fasta.read():
        title = '>' + title
        refseq_accession_version = find_between(title, begin, end)
        # Only the two-character RefSeq prefix (e.g. 'NC') is matched.
        if refseq_accession_version[:2] in prefix_set:
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(refseq_accession_version)
            if ncbi_tid:
                title = '>ncbi_tid|%d|%s' % (ncbi_tid[0], title[1:])
        output.write('%s\n%s\n' % (title, seq))
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """Rewrite FASTA titles, prepending 'ncbi_tid|<tid>|' when the RefSeq
    accession.version embedded in the title resolves to an NCBI taxonomy id.

    NOTE(review): this is a byte-for-byte duplicate of an earlier
    'refseq_annotate' in this file; this later definition wins. Remove one.

    Args:
        input: file-like FASTA source.
        output: writable file-like object for the annotated FASTA.
        extract_refseq_id: 'begin,end' delimiters around the accession.
        prefixes: comma-separated RefSeq prefixes, or '*' for all known ones.
    """
    db = RefSeqDatabase()
    # check for the glob prefix
    prefixes = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')
    # '*' expands to every prefix the database knows about.
    prefix_set = set(db.refseq_prefix_mapper.keys()) if '*' in prefixes else set(prefixes)
    inf_fasta = FASTA(input)
    for title, seq in inf_fasta.read():
        title = '>' + title
        refseq_accession_version = find_between(title, begin, end)
        # Only the two-character RefSeq prefix (e.g. 'NC') is matched.
        if refseq_accession_version[:2] in prefix_set:
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(
                refseq_accession_version)
            if ncbi_tid:
                title = '>ncbi_tid|%d|%s' % (ncbi_tid[0], title[1:])
        output.write('%s\n%s\n' % (title, seq))
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    """Annotate a FASTA file with taxonomy and build a compressed UTree
    (.ctr) database.

    Skips the annotation step when '<name>.annotated.fna' and
    '<name>.annotated.map' already exist in *output*, and skips tree
    construction when the compressed tree already exists under
    '<output>/utree/'.
    """
    verify_make_dir(output)
    # Verify the FASTA is annotated
    output_fn = 'stdin' if input == '-' else '.'.join(str(os.path.basename(input)).split('.')[:-1])
    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if os.path.isfile(outf_fasta) and os.path.isfile(outf_map):
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))
    else:
        tree = NCBITree()
        db = RefSeqDatabase()
        # Select the annotater implementation; anything unrecognized falls
        # back to the GI annotater.
        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            annotater_class = NCBIAnnotater(extract_id, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)
        with open(outf_fasta, 'w') as output_fna, open(outf_map, 'w') as output_map, open(input) as inf:
            for lines_fna, lines_map in annotater_class.annotate(FASTA(inf).read()):
                output_fna.write(lines_fna)
                output_map.write(lines_map)
    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        # Reuse an existing uncompressed tree if one is lying around.
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
def download_refseq(output, prefixes, kingdoms):
    """Stream RefSeq release genomes for the requested kingdoms into one
    FASTA file, keeping only records whose accession prefix is selected.

    Args:
        output: destination path (opened in binary mode via click).
        prefixes: comma-separated RefSeq accession prefixes, or '*' for all
            prefixes known to the database.
        kingdoms: comma-separated kingdom names (archaea, bacteria, fungi,
            viral, protozoa), or '*' for all of them.
    """
    url_dict = defaultdict(str, zip(
        ('archaea', 'bacteria', 'fungi', 'viral', 'protozoa'),
        ('ftp://ftp.ncbi.nlm.nih.gov/refseq/release/archaea',
         'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/bacteria',
         'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/fungi',
         'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral',
         'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/protozoa')))
    kingdoms = kingdoms.split(',')
    if '*' in kingdoms:
        urls = url_dict.values()
    else:
        # Unknown kingdom names map to '' via the defaultdict.
        urls = [url_dict[_] for _ in kingdoms]
    db = RefSeqDatabase()
    # check for the glob prefix
    prefixes = prefixes.split(',')
    if '*' in prefixes:
        prefix_set = {str.encode(_) for _ in db.refseq_prefix_mapper.keys()}
    else:
        prefix_set = {str.encode(_) for _ in prefixes}
    # BUGFIX: raw string with escaped dots — the original pattern's bare '.'
    # matched any character. Compiled once outside the loop.
    pattern_cat = re.compile(r'[a-zA-Z0-9.-]*\.genomic\.fna\.gz')
    with click.open_file(output, 'wb') as outf:
        for url in urls:
            # Request the listing of the directory
            req = urllib.request.Request(url)
            string = urllib.request.urlopen(req).read().decode('utf-8')
            # Grab every *.genomic.fna.gz filename from the listing
            filelist = pattern_cat.findall(string)
            for file in filelist:
                req_file = urllib.request.Request('%s/%s' % (url, file))
                # BUGFIX: urlopen has no mode argument; the original passed
                # 'rb' as the 'data' parameter, which is invalid.
                with urllib.request.urlopen(req_file) as ftp_stream:
                    fasta_fh = line_bytestream_gzip(ftp_stream)
                    for title, seq in binary_fasta(fasta_fh, db, prefix_set):
                        outf.write(b'>%s\n%s\n' % (title, seq))
def test(self):
    """Smoke test: constructing and creating the RefSeq database must not raise."""
    database = RefSeqDatabase()
    database._create()
    # Trivial assertion — the real check is that the calls above succeed.
    assert_equals(None, None)