Пример #1
0
def get_random_ncbi_ids_set(count, db, keyword_search):
    """Fetch a window of NCBI IDs for a search and pick ``count`` of them.

    A window of ``count * 10`` search results is requested from Entrez,
    starting at a random offset between 1 and 10000. The resulting ID
    list is handed to ``get_random_ids`` together with ``count``.

    Args:
        count: number of IDs requested.
        db: name of the Entrez database to search.
        keyword_search: search term passed to ``Entrez.esearch``.

    Returns:
        Whatever ``get_random_ids`` returns for the fetched ID list.

    Raises:
        ConfigurationError: if ``config.ENTREZ_EMAIL`` is empty or unset.
    """

    logging.info('Fetching {} NCBI IDs for the {} database'.format(count, db))

    if not config.ENTREZ_EMAIL:
        msg = (
            "ENTREZ_EMAIL environnment variable undefined. "
            "You must define one with a valid email to use this program."
        )
        logging.error(msg)
        raise ConfigurationError(msg)

    Entrez.email = config.ENTREZ_EMAIL

    # NOTE(review): a random offset larger than the total number of hits
    # presumably yields an empty IdList -- confirm get_random_ids copes.
    retstart = random.randint(1, 10000)
    # Over-fetch so that there is a pool to choose from.
    retmax = count * 10

    handle = Entrez.esearch(
        db=db, retmax=retmax, retstart=retstart, term=keyword_search
    )
    try:
        pub_search = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    return get_random_ids(pub_search['IdList'], count)
Пример #2
0
    def search_info(self, term, options=''):
        """Run a nucleotide search and return a summary of the result.

        The search is issued with ``usehistory="y"`` and the returned
        ``WebEnv`` / ``QueryKey`` values are included in the summary.

        Args:
            term: search term.
            options: extra text appended verbatim to ``term``.

        Returns:
            A dict with the keys ``"Count"`` (int), ``"IdList"``,
            ``"ReturnMax"`` (int), ``"WebEnv"`` and ``"QueryKey"``.

        Raises:
            AssertionError: if the reported ``RetMax`` differs from the
                number of IDs actually returned.
        """
        Entrez.email = self.email
        search_handle = Entrez.esearch(db="nucleotide",
                                       term=term + options,
                                       usehistory="y")
        try:
            search_results = Entrez.read(search_handle)
        finally:
            # Release the connection even when the response cannot be parsed.
            search_handle.close()

        webenv = search_results["WebEnv"]
        query_key = search_results["QueryKey"]

        gi_list = search_results["IdList"]
        retmax = int(search_results["RetMax"])
        count = int(search_results["Count"])

        # Sanity check on the server response.
        assert retmax == len(gi_list)

        # Named ``summary`` so the builtin ``dict`` is not shadowed.
        summary = {"Count": count, "IdList": gi_list, "ReturnMax": retmax,
                   "WebEnv": webenv, "QueryKey": query_key}
        return summary
Пример #3
0
def fetch_ncbi(ID, db, fmt='fasta', output_path='.'):
    """Fetch a ncbi entity and write it to disk.

    The record is saved as ``<output_path>/<fmt>/<db>/<ID>.<fmt>``;
    missing directories are created.

    Args:
        ID: NCBI ID
        db: Entrez database to fetch from
        fmt: file format; used as the Entrez ``rettype``, as a directory
            name and as the file extension
        output_path: path to output downloaded files
    """

    logging.info('Fetching NCBI ID {} with format {}...'.format(ID, fmt))

    filename = '{id}.{fmt}'.format(id=ID, fmt=fmt)

    db_fmt_path = os.path.join(output_path, fmt, db)
    output_file = os.path.join(db_fmt_path, filename)

    os.makedirs(db_fmt_path, exist_ok=True)

    handle = Entrez.efetch(db=db, id=ID, rettype=fmt, retmode="text")
    try:
        response = handle.read()
    finally:
        # The fetch handle was previously never closed.
        handle.close()

    with open(output_file, 'w') as f:
        f.write(response)
Пример #4
0
def fetch_ncbi(ID, db, fmt='fasta', output_path='.'):
    """Fetch a ncbi entity and write it to disk.

    The record is saved as ``<output_path>/<fmt>/<db>/<ID>.<fmt>``;
    missing directories are created.

    Args:
        ID: NCBI ID
        db: Entrez database to fetch from
        fmt: file format; used as the Entrez ``rettype``, as a directory
            name and as the file extension
        output_path: path to output downloaded files
    """

    logging.info('Fetching NCBI ID {} with format {}...'.format(ID, fmt))

    filename = '{id}.{fmt}'.format(id=ID, fmt=fmt)

    db_fmt_path = os.path.join(output_path, fmt, db)
    output_file = os.path.join(db_fmt_path, filename)

    os.makedirs(db_fmt_path, exist_ok=True)

    handle = Entrez.efetch(db=db, id=ID, rettype=fmt, retmode="text")
    try:
        response = handle.read()
    finally:
        # The fetch handle was previously never closed.
        handle.close()

    with open(output_file, 'w') as f:
        f.write(response)
Пример #5
0
    def search(self, organism='', options='', batch_size=50, progress=True):
        """Search the nucleotide database and return the parsed records.

        The search is run with ``usehistory="y"``; all hits are then
        downloaded in batches into ``<organism>.tseqxml`` in the current
        directory. When that file already exists the download is skipped
        and the existing file is parsed instead.

        Args:
            organism: search term; also used as the stem of the cache
                file name.
            options: extra text appended verbatim to ``organism`` in the
                search term.
            batch_size: number of records requested per ``efetch`` call.
            progress: when true, drive a progress bar during download.

        Returns:
            The result of ``soup.find_all('tseq')`` on the downloaded
            file, or ``None`` when the search has no hits.

        Raises:
            AssertionError: if the reported ``RetMax`` differs from the
                number of IDs actually returned.
        """
        Entrez.email = self.email
        search_handle = Entrez.esearch(db="nucleotide",
                                       term=organism + options,
                                       usehistory="y")
        try:
            search_results = Entrez.read(search_handle)
        finally:
            # Release the connection even when the response cannot be parsed.
            search_handle.close()

        webenv = search_results["WebEnv"]
        query_key = search_results["QueryKey"]

        gi_list = search_results["IdList"]
        retmax = int(search_results["RetMax"])
        count = int(search_results["Count"])

        if count == 0:
            return None

        # Sanity check on the server response.
        assert retmax == len(gi_list)

        # Download data referred to in the previous search in batches.
        tinyseqxml = organism + '.tseqxml'
        if not os.path.isfile(tinyseqxml):
            widgets = [tinyseqxml + "\t", Percentage(), ' ',
                       Bar(marker=RotatingMarker()),
                       ' ', ETA(), ' ', FileTransferSpeed(unit='r')]
            # NOTE(review): if a batch fails midway, the partial file stays
            # on disk and the next call will skip the download because the
            # file exists.
            with open(tinyseqxml, "w") as out_handle:
                pbar = ProgressBar(widgets=widgets,
                                   maxval=count,
                                   term_width=int(self.columns) - 20)
                for start in range(0, count, batch_size):
                    fetch_handle = Entrez.efetch(db="nucleotide",
                                                 rettype="fasta",
                                                 retmode="xml",
                                                 retstart=start,
                                                 retmax=batch_size,
                                                 webenv=webenv,
                                                 query_key=query_key)
                    try:
                        data = fetch_handle.read()
                    finally:
                        fetch_handle.close()
                    out_handle.write(data)
                    status = (start + 1) / float(count)
                    # Start the bar lazily, on the first completed batch.
                    if progress and pbar.seconds_elapsed == 0:
                        pbar.start()
                    if status > 0.001 and progress:
                        # Re-read the width on every update.
                        pbar.term_width = int(self.columns) - 20
                        pbar.update(status * count)
                if progress:
                    pbar.finish()

        # At this point, the file may be a stream of multiple XML documents
        # (one per batch), so it must not be parsed as one single XML
        # document; BeautifulSoup with the "lxml" parser is used instead.
        start = CURRENT_MILLI_TIME()
        with open(tinyseqxml, "r") as xml_handle:
            soup = BeautifulSoup(xml_handle, "lxml")
        end = CURRENT_MILLI_TIME()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Finished Parsing XML in " + str(end - start) + " ms")

        return soup.find_all('tseq')
Пример #6
0
 def server_info(self):
     """Return the parsed ``Entrez.einfo`` record for the nucleotide database.

     ``Entrez.email`` is set from ``self.email`` before the request.
     """
     Entrez.email = self.email
     handle = Entrez.einfo(db="nucleotide")
     try:
         return Entrez.read(handle)
     finally:
         # Release the connection even when the response cannot be parsed.
         handle.close()
Пример #7
0
def fetch_count_from_api(term):
    """Return the number of PubMed records matching ``term``.

    The search is issued with ``rettype="count"``. Errors raised by the
    request or by parsing propagate to the caller unchanged.

    Args:
        term: PubMed search term.

    Returns:
        The ``"Count"`` field of the parsed search result, as an int.
    """
    handle = Entrez.esearch(db="pubmed", term=term, rettype="count")
    try:
        search_result = Entrez.read(handle)
    finally:
        # The handle was previously never closed.
        handle.close()
    # The parsed result used to be discarded, leaving the function
    # returning None.
    return int(search_result["Count"])
Пример #8
0
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 11:18:38 2016

@author: tanfan.zjh
"""
from Bio_Eutils import Entrez, Medline
Entrez.email = "*****@*****.**"

# Search PubMed for the PMIDs of records matching a free-text term.
handle = Entrez.esearch(db="pubmed", retmax=100000, term="Arabidopsis Thaliana")
try:
    pub_search = Entrez.read(handle)
finally:
    # Release the connection even when the response cannot be parsed.
    handle.close()
# Single-argument print(...) behaves the same on Python 2 and 3.
print(pub_search)
# The string literal below is a disabled follow-up step (fetching the
# matching entries in Medline format and printing them); it is never executed.
'''
# Fetch matching entries
handle = Entrez.efetch(db='pubmed', id=pub_search['IdList'], retmax=20, rettype="medline", retmode="text")
pub_items = Medline.parse(handle)

# Work with it
for pub_item in pub_items:
    print "*" * 10
    print "%s - %s." % (
        pub_item.get("TI","?"),
        ", ".join(pub_item.get("AB","?"))
        )

handle.close()
'''