Пример #1
0
def get_genbank_accession_from_ucsc_name(db):
    """Queries NCBI EUtils for the GenBank accession of a UCSC asseembly name
    """
    global time_ncbi
    t0 = time_ms()
    logger.info('Fetching GenBank accession from NCBI EUtils for: ' + db)

    eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    esearch = eutils + 'esearch.fcgi?retmode=json'
    esummary = eutils + 'esummary.fcgi?retmode=json'

    asm_search = esearch + '&db=assembly&term=' + db

    # Example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=assembly&retmode=json&term=panTro4
    data = json.loads(request(asm_search))
    id_list = data['esearchresult']['idlist']
    if len(id_list) > 0:
        assembly_uid = id_list[0]
    else:
        unfound_dbs.append(db)
        return ''
    asm_summary = esummary + '&db=assembly&id=' + assembly_uid

    # Example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&db=assembly&id=255628
    data = json.loads(request(asm_summary))
    result = data['result'][assembly_uid]
    acc = result['assemblyaccession']  # Accession.version

    # Return GenBank accession if it's default, else find and return it
    if "GCA_" not in acc:
        acc = result['synonym']['genbank']

    time_ncbi += time_ms() - t0
    return acc
Пример #2
0
def fetch_from_ensembl_genomes():
    """Queries MySQL servers hosted by Ensembl Genomes

    To connect via Terminal (e.g. to debug), run:
    mysql --user=anonymous --host=mysql-eg-publicsql.ebi.ac.uk --port=4157 -A
    """
    global time_ensembl
    t0 = time_ms()
    logger.info('Entering fetch_from_ensembl_genomes')
    connection = db_connect(host='mysql-eg-publicsql.ebi.ac.uk',
                            user='******',
                            port=4157)
    logger.info('Connected to Ensembl Genomes database')

    cursor = connection.cursor()

    db_map = {}
    org_map = {}

    # Get a list of databases we want to query for karyotype data
    cursor.execute('show databases like "%core_%"')
    for row in cursor.fetchall():
        db = row[0]
        if 'collection' in db:
            continue
        name_slug = db.split('_core')[0].replace('_', '-')
        db_map[db] = name_slug
    db_tuples = [item for item in db_map.items()]

    cursor.close()

    # Take the list of DBs we want to query for karyotype data,
    # split it into 100 smaller lists,
    # then launch a new thread for each of those small new DB lists
    # to divide up the work of querying remote DBs.
    num_threads = 100
    db_tuples_lists = chunkify(db_tuples, num_threads)
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        for result in pool.map(query_ensembl_karyotype_db, db_tuples_lists):
            for db_tuple in result:
                name_slug, asm_data = db_tuple
                if name_slug in org_map:
                    org_map[name_slug].append(asm_data)
                else:
                    org_map[name_slug] = [asm_data]

    logger.info('before exiting with clause')

    time_ensembl += time_ms() - t0
    return org_map
Пример #3
0
def fetch_from_ucsc():
    """Queries MySQL instances hosted by UCSC Genome Browser

    To connect via Terminal (e.g. to debug), run:
    mysql --user=genome --host=genome-mysql.soe.ucsc.edu -A
    """
    global time_ucsc
    t0 = time_ms()
    logger.info('Entering fetch_from_ucsc')
    connection = db_connect(host='genome-mysql.soe.ucsc.edu', user='******')
    logger.info('Connected to UCSC database')
    cursor = connection.cursor()

    db_map = {}
    org_map = {}

    cursor.execute('use hgcentral')
    cursor.execute('''
      SELECT name, scientificName FROM dbDb
        WHERE active = 1
    ''')
    rows = cursor.fetchall()

    for row in rows:
        db = row[0]
        # e.g. H**o sapiens -> h**o-sapiens
        name_slug = row[1].lower().replace(' ', '-')
        db_map[db] = name_slug

    db_tuples = [item for item in db_map.items()]

    # Take the list of DBs we want to query for cytoBandIdeo data,
    # split it into 30 smaller lists,
    # then launch a new thread for each of those small new DB lists
    # to divide up the work of querying remote DBs.
    num_threads = 30
    db_tuples_lists = chunkify(db_tuples, num_threads)
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        for result in pool.map(query_ucsc_cytobandideo_db, db_tuples_lists):
            if result is None:
                continue
            asm_data = result
            if name_slug in org_map:
                org_map[name_slug].append(asm_data)
            else:
                org_map[name_slug] = [asm_data]

    time_ucsc += time_ms() - t0
    return org_map