Exemplo n.º 1
0
def refseq_get_ftp_links_from_file(input, output):
    """Write a CSV of FTP links for the names listed in ``input``.

    Each line of ``input`` is cleaned (the substrings ``' unclassified'`` and
    ``'cf'`` are removed, surrounding whitespace is stripped) and looked up
    with ``RefSeqDatabase.yield_ncbi_tid_row_from_name``; the first column of
    every returned row is collected as an NCBI taxonomy id. The ids are then
    extended with their direct predecessors in ``NCBITree().tree`` and, for
    every resulting id, one ``ncbi_tid,gg_lineage,ftp_link`` row is written
    per link yielded by ``RefSeqDatabase.yield_ftp_links``.

    Args:
        input: Iterable of text lines, one name per line.
        output: Writable text stream that receives the CSV (header included).
    """
    db = RefSeqDatabase()
    tree = NCBITree()

    ncbi_tid_set = set()
    for line in input:
        # NOTE(review): 'cf' is removed wherever it occurs in the line, not
        # only as a standalone "cf." qualifier -- confirm this is intended.
        name = line.replace(' unclassified', '').replace('cf', '').strip()
        ncbi_tid_set.update(
            row[0] for row in db.yield_ncbi_tid_row_from_name(name))

    # Open question from the original author:
    # how many total strains are there in HMP?
    #
    # TODO Switch the tree around - predecessor and successors
    ncbi_tid_successors = set()
    for ncbi_tid in ncbi_tid_set:
        # No membership filter is needed here: ids that are already known are
        # absorbed by the union below.
        ncbi_tid_successors.update(tree.tree.predecessors_iter(ncbi_tid))

    ncbi_tid_set = ncbi_tid_set | ncbi_tid_successors

    output.write('ncbi_tid,gg_lineage,ftp_link\n')
    for ncbi_tid in ncbi_tid_set:
        for ftp_link in db.yield_ftp_links(ncbi_tid):
            output.write('%s,%s,%s\n' %
                         (ncbi_tid, tree.gg_lineage(ncbi_tid), ftp_link))
def download_refseq_all(verbose):
    """Download every RefSeq entry that has both an FTP link and a version.

    Rows of ``data.tree`` whose ``ftp`` and ``refseq_version`` columns are both
    non-empty are handed to ``yield_ftp_links`` together with the kingdoms of
    interest, and the resulting links are passed to ``download_ftp_link``
    through a pool of four worker processes.

    Args:
        verbose: Currently unused; kept so existing callers keep working.
    """
    # The pool is created before the database objects, as in the original, so
    # the order in which workers and database handles come into existence is
    # unchanged. The ``with`` block guarantees the workers are released even
    # if one of the steps below raises.
    with multiprocessing.Pool(processes=4) as pool:
        rf = RefSeqDatabase()
        data = rf.get_blaze()
        tree = NCBITree()
        specified_kingdoms = {'k__Bacteria', 'k__Viruses', 'k__Archaea'}

        # Python's ``and`` cannot be overloaded, so it never builds an
        # element-wise conjunction of two column comparisons; ``&`` is the
        # operator for combining boolean column expressions.
        has_ftp = data.tree.ftp != ''
        has_version = data.tree.refseq_version != ''
        ftp_view = data.tree[has_ftp & has_version]
        ftp_links = yield_ftp_links(ftp_view, specified_kingdoms, tree)

        pool.map(download_ftp_link, ftp_links)
    print('Done')
Exemplo n.º 3
0
def download_refseq_all(verbose):
    """Download all RefSeq records that carry an FTP link and a version.

    The ``data.tree`` table is restricted to rows with a non-empty ``ftp`` and
    a non-empty ``refseq_version``; ``yield_ftp_links`` turns that view (plus
    the selected kingdoms and the taxonomy tree) into links, which four worker
    processes feed to ``download_ftp_link``.

    Args:
        verbose: Currently unused; retained for interface compatibility.
    """
    # Creating the pool first mirrors the original ordering; using it as a
    # context manager ensures the worker processes are cleaned up on both the
    # success and the failure path.
    with multiprocessing.Pool(processes=4) as pool:
        rf = RefSeqDatabase()
        data = rf.get_blaze()
        tree = NCBITree()
        specified_kingdoms = {'k__Bacteria', 'k__Viruses', 'k__Archaea'}

        # ``a and b`` is evaluated via truthiness and yields one operand, not
        # a combined row mask; ``&`` combines the two comparisons row by row.
        ftp_present = data.tree.ftp != ''
        version_present = data.tree.refseq_version != ''
        ftp_view = data.tree[ftp_present & version_present]
        ftp_links = yield_ftp_links(ftp_view, specified_kingdoms, tree)

        pool.map(download_ftp_link, ftp_links)
    print('Done')
Exemplo n.º 4
0
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth,
                  depth_force):
    """Annotate a FASTA file and build a Bowtie2 database from it.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``.
    If both files already exist the annotation phase is skipped. The Bowtie2
    index is then built under ``<output>/bt2/<name>``.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read from standard input
            (output files are then named after ``stdin``).
        output: Output directory; created through ``verify_make_dir``.
        annotater: ``'refseq'`` or ``'nt'``; any other value selects
            ``GIAnnotater``.
        extract_id: Passed through to the annotater class.
        prefixes: Passed through to the RefSeq and NT annotaters.
        depth: Passed through to the annotater class.
        depth_force: Passed through to the annotater class.
    """
    import sys  # function-scope import: only needed for the '-' input case

    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id,
                                              prefixes,
                                              db,
                                              tree,
                                              depth=depth,
                                              depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id,
                                          prefixes,
                                          db,
                                          tree,
                                          depth=depth,
                                          depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id,
                                          db,
                                          tree,
                                          depth=depth,
                                          depth_force=depth_force)

        # '-' was already treated as "stdin" when naming the outputs, so read
        # from the real standard input instead of opening a file called '-'.
        # The input is opened before the outputs are created: a missing input
        # must not leave empty output files behind, because their mere
        # existence makes the next run skip the annotation phase.
        inf = sys.stdin if input == '-' else open(input)
        try:
            with open(outf_fasta, 'w') as output_fna, \
                    open(outf_map, 'w') as output_map:
                inf_fasta = FASTA(inf)
                for lines_fna, lines_map in annotater_class.annotate(
                        inf_fasta.read()):
                    output_fna.write(lines_fna)
                    output_map.write(lines_map)
        finally:
            # Never close the process-wide standard input.
            if inf is not sys.stdin:
                inf.close()
    else:
        print(
            "Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file."
            % (outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
Exemplo n.º 5
0
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """Prefix FASTA titles with their NCBI taxonomy id.

    Records are read from ``input`` through ``FASTA``. Only records whose
    accession (the text between the two delimiters given in
    ``extract_refseq_id``) starts with an accepted two-character prefix are
    written to ``output``. When the database returns a taxonomy id for the
    accession, the title is rewritten as ``>ncbi_tid|<id>|<original title>``.

    Args:
        input: Source handed to ``FASTA``.
        output: Writable text stream receiving ``title\\nsequence\\n`` pairs.
        extract_refseq_id: Two delimiters separated by a comma.
        prefixes: Comma-separated prefixes; ``'*'`` accepts every prefix known
            to ``db.refseq_prefix_mapper``.
    """
    db = RefSeqDatabase()

    requested = prefixes.split(',')
    left_delim, right_delim = extract_refseq_id.split(',')

    # A '*' entry acts as a glob over all prefixes the database knows.
    if '*' in requested:
        accepted = set(db.refseq_prefix_mapper.keys())
    else:
        accepted = set(requested)

    for header, sequence in FASTA(input).read():
        header = '>' + header
        accession = find_between(header, left_delim, right_delim)
        if accession[:2] not in accepted:
            continue
        tid_row = db.get_ncbi_tid_from_refseq_accession_version(accession)
        if tid_row:
            header = '>ncbi_tid|%d|%s' % (tid_row[0], header[1:])
        output.write('%s\n%s\n' % (header, sequence))
Exemplo n.º 6
0
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """Filter FASTA records by accession prefix and tag them with a tax id.

    For every record read from ``input`` the accession is taken from the
    title, between the two comma-separated delimiters in
    ``extract_refseq_id``. Records whose accession does not start with an
    accepted two-character prefix are dropped. The rest are written to
    ``output``, with the title rewritten to ``>ncbi_tid|<id>|<title>``
    whenever the database lookup returns a result.

    Args:
        input: Source handed to ``FASTA``.
        output: Writable text stream for the selected records.
        extract_refseq_id: ``'<begin>,<end>'`` delimiters around the accession.
        prefixes: Comma-separated accepted prefixes; a ``'*'`` entry selects
            all keys of ``db.refseq_prefix_mapper``.
    """
    db = RefSeqDatabase()

    prefix_list = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')

    # check for the glob prefix
    use_all = '*' in prefix_list
    prefix_set = set(db.refseq_prefix_mapper.keys() if use_all else prefix_list)

    records = FASTA(input)
    for raw_title, seq in records.read():
        title = '>' + raw_title
        accession_version = find_between(title, begin, end)
        if accession_version[:2] in prefix_set:
            lookup = db.get_ncbi_tid_from_refseq_accession_version(
                accession_version)
            if lookup:
                title = '>ncbi_tid|%d|%s' % (lookup[0], title[1:])
            output.write('%s\n%s\n' % (title, seq))
Exemplo n.º 7
0
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    """Annotate a FASTA file and build a compressed UTree database from it.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``;
    if both already exist the annotation phase is skipped. The tree is then
    built as ``<output>/utree/<name>.utr``, compressed to ``<name>.ctr`` and
    the uncompressed file is removed. An existing ``.ctr`` file short-circuits
    the build entirely.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read from standard input
            (output files are then named after ``stdin``).
        output: Output directory; created through ``verify_make_dir``.
        annotater: ``'refseq'``, ``'nt'`` or ``'ncbi'``; any other value
            selects ``GIAnnotater``.
        extract_id: Passed through to the annotater class.
        threads: Passed to ``utree_build``.
        prefixes: Passed through to the RefSeq and NT annotaters.
        depth: Passed through to the annotater class.
        depth_force: Passed through to the annotater class.
    """
    import sys  # function-scope import: only needed for the '-' input case

    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            annotater_class = NCBIAnnotater(extract_id, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        # '-' was already treated as "stdin" when naming the outputs, so read
        # from the real standard input instead of opening a file called '-'.
        # The input is opened before the outputs are created: a missing input
        # must not leave empty output files behind, because their mere
        # existence makes the next run skip the annotation phase.
        inf = sys.stdin if input == '-' else open(input)
        try:
            with open(outf_fasta, 'w') as output_fna, open(outf_map, 'w') as output_map:
                inf_fasta = FASTA(inf)
                for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                    output_fna.write(lines_fna)
                    output_map.write(lines_map)
        finally:
            # Never close the process-wide standard input.
            if inf is not sys.stdin:
                inf.close()
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        # Reuse an uncompressed tree left over from an earlier run.
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
Exemplo n.º 8
0
def download_refseq(output, prefixes, kingdoms):
    """Stream RefSeq genomic FASTA records for the given kingdoms to ``output``.

    For every selected kingdom the release directory listing is fetched, all
    ``*.genomic.fna.gz`` file names are extracted from it, and each file is
    streamed through ``line_bytestream_gzip`` and ``binary_fasta``. The
    resulting ``(title, seq)`` byte pairs are written as FASTA records.

    Args:
        output: Destination passed to ``click.open_file`` in binary mode.
        prefixes: Comma-separated RefSeq prefixes; ``'*'`` selects every key
            of ``db.refseq_prefix_mapper``.
        kingdoms: Comma-separated subset of ``archaea``, ``bacteria``,
            ``fungi``, ``viral`` and ``protozoa``; ``'*'`` selects all of them.

    Raises:
        ValueError: If ``kingdoms`` names an unknown kingdom.
    """
    base_url = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/'
    url_dict = {
        kingdom: base_url + kingdom
        for kingdom in ('archaea', 'bacteria', 'fungi', 'viral', 'protozoa')
    }

    kingdoms = kingdoms.split(',')
    if '*' in kingdoms:
        urls = list(url_dict.values())
    else:
        # Fail before any file is opened or downloaded. Previously an unknown
        # kingdom silently became an empty URL and only failed (also with a
        # ValueError) once the request for it was built.
        unknown = [name for name in kingdoms if name not in url_dict]
        if unknown:
            raise ValueError('Unknown kingdom(s): %s; expected any of: %s' %
                             (', '.join(unknown), ', '.join(url_dict)))
        urls = [url_dict[name] for name in kingdoms]

    db = RefSeqDatabase()
    # check for the glob prefix
    prefixes = prefixes.split(',')
    if '*' in prefixes:
        prefix_set = {str.encode(_) for _ in db.refseq_prefix_mapper.keys()}
    else:
        prefix_set = {str.encode(_) for _ in prefixes}

    # File names ending in ".genomic.fna.gz". The dots are escaped so they
    # only match literal dots, and the pattern is compiled once for all URLs.
    pattern_genomic = re.compile(r'[a-zA-Z0-9.-]*\.genomic\.fna\.gz')

    with click.open_file(output, 'wb') as outf:
        for url in urls:
            # Request the listing of the directory
            with urllib.request.urlopen(url) as listing_stream:
                listing = listing_stream.read().decode('utf-8')

            for filename in pattern_genomic.findall(listing):
                # The second positional argument of urlopen() is the request
                # payload, not a file mode, so no mode string is passed.
                file_url = '%s/%s' % (url, filename)
                with urllib.request.urlopen(file_url) as ftp_stream:
                    fasta_fh = line_bytestream_gzip(ftp_stream)
                    for title, seq in binary_fasta(fasta_fh, db, prefix_set):
                        outf.write(b'>%s\n%s\n' % (title, seq))
Exemplo n.º 9
0
 def test(self):
     """Smoke test: build a ``RefSeqDatabase`` and call its ``_create``."""
     database = RefSeqDatabase()
     database._create()
     # NOTE(review): this assertion is always true; the test only checks that
     # the two calls above do not raise.
     assert_equals(None, None)