def get_genbank_accession_from_ucsc_name(db):
    """Query NCBI EUtils for the GenBank accession of a UCSC assembly name.

    Two requests are made: an ESearch against the ``assembly`` database to
    resolve ``db`` to an assembly UID, then an ESummary for that UID.

    Args:
        db: UCSC assembly (database) name, e.g. ``panTro4``.  It is
            appended to the query URL as-is.

    Returns:
        The GenBank accession string taken from the summary record, or
        ``''`` when ESearch returns no IDs for ``db``.  In the latter case
        ``db`` is also appended to the module-level ``unfound_dbs`` list.

    Side effects:
        Adds the elapsed time (as measured by ``time_ms``) to the
        module-level ``time_ncbi`` counter on every exit path, including
        the "not found" early return and exceptions.
    """
    global time_ncbi
    t0 = time_ms()
    try:
        logger.info('Fetching GenBank accession from NCBI EUtils for: ' + db)

        eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        esearch = eutils + 'esearch.fcgi?retmode=json'
        esummary = eutils + 'esummary.fcgi?retmode=json'

        asm_search = esearch + '&db=assembly&term=' + db

        # Example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=assembly&retmode=json&term=panTro4
        data = json.loads(request(asm_search))
        id_list = data['esearchresult']['idlist']
        if not id_list:
            # Remember assemblies NCBI does not know about.
            unfound_dbs.append(db)
            return ''
        # Only the first search hit is used.
        assembly_uid = id_list[0]

        asm_summary = esummary + '&db=assembly&id=' + assembly_uid

        # Example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&db=assembly&id=255628
        data = json.loads(request(asm_summary))
        result = data['result'][assembly_uid]
        acc = result['assemblyaccession']  # Accession.version

        # Return GenBank accession if it's default, else find and return it
        if "GCA_" not in acc:
            acc = result['synonym']['genbank']

        return acc
    finally:
        # Previously the early "not found" return skipped this accounting,
        # so time spent on failed lookups was never recorded.
        time_ncbi += time_ms() - t0
def fetch_from_ensembl_genomes():
    """Query MySQL servers hosted by Ensembl Genomes for karyotype data.

    To connect via Terminal (e.g. to debug), run:
    mysql --user=anonymous --host=mysql-eg-publicsql.ebi.ac.uk --port=4157 -A

    The function first lists the databases whose name matches
    ``%core_%`` (skipping any containing ``collection``), derives an
    organism slug from each database name, and then fans the per-database
    work out to ``query_ensembl_karyotype_db`` in a thread pool.

    Returns:
        dict mapping organism name slug -> list of the ``asm_data`` values
        returned by the workers for that slug.

    Side effects:
        Adds the elapsed time (as measured by ``time_ms``) to the
        module-level ``time_ensembl`` counter.
    """
    global time_ensembl
    t0 = time_ms()
    logger.info('Entering fetch_from_ensembl_genomes')

    connection = db_connect(host='mysql-eg-publicsql.ebi.ac.uk', user='******', port=4157)
    logger.info('Connected to Ensembl Genomes database')

    db_map = {}
    org_map = {}

    # This connection is used only to list databases; it is not handed to
    # the workers (only the DB tuples are mapped below), so release it and
    # its cursor as soon as the listing is done, even if the query fails.
    try:
        cursor = connection.cursor()
        try:
            # Get a list of databases we want to query for karyotype data
            cursor.execute('show databases like "%core_%"')
            for row in cursor.fetchall():
                db = row[0]
                if 'collection' in db:
                    continue
                # Slug = text before '_core', underscores turned into hyphens
                name_slug = db.split('_core')[0].replace('_', '-')
                db_map[db] = name_slug
        finally:
            cursor.close()
    finally:
        connection.close()

    db_tuples = list(db_map.items())

    # Take the list of DBs we want to query for karyotype data,
    # split it into 100 smaller lists,
    # then launch a new thread for each of those small new DB lists
    # to divide up the work of querying remote DBs.
    num_threads = 100
    db_tuples_lists = chunkify(db_tuples, num_threads)

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        for result in pool.map(query_ensembl_karyotype_db, db_tuples_lists):
            # Each worker result is iterated as (name_slug, asm_data) pairs.
            for name_slug, asm_data in result:
                org_map.setdefault(name_slug, []).append(asm_data)
        logger.info('before exiting with clause')

    time_ensembl += time_ms() - t0
    return org_map
def fetch_from_ucsc():
    """Query MySQL instances hosted by UCSC Genome Browser.

    To connect via Terminal (e.g. to debug), run:
    mysql --user=genome --host=genome-mysql.soe.ucsc.edu -A

    The function reads the active assemblies from ``hgcentral.dbDb``,
    derives an organism slug from each scientific name, and then fans the
    per-database work out to ``query_ucsc_cytobandideo_db`` in a thread
    pool.

    Returns:
        dict mapping organism name slug -> list of non-``None`` worker
        results.  See the NOTE(review) in the result loop: the key used is
        currently not derived from the result itself.

    Side effects:
        Adds the elapsed time (as measured by ``time_ms``) to the
        module-level ``time_ucsc`` counter.
    """
    global time_ucsc
    t0 = time_ms()
    logger.info('Entering fetch_from_ucsc')

    connection = db_connect(host='genome-mysql.soe.ucsc.edu', user='******')
    logger.info('Connected to UCSC database')

    db_map = {}
    org_map = {}

    # This connection is used only to list assemblies; it is not handed to
    # the workers (only the DB tuples are mapped below), so release it and
    # its cursor once the rows are fetched, even if a query fails.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute('use hgcentral')
            cursor.execute(''' SELECT name, scientificName FROM dbDb WHERE active = 1 ''')
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    for row in rows:
        db = row[0]
        # e.g. Homo sapiens -> homo-sapiens
        name_slug = row[1].lower().replace(' ', '-')
        db_map[db] = name_slug

    db_tuples = list(db_map.items())

    # Take the list of DBs we want to query for cytoBandIdeo data,
    # split it into 30 smaller lists,
    # then launch a new thread for each of those small new DB lists
    # to divide up the work of querying remote DBs.
    num_threads = 30
    db_tuples_lists = chunkify(db_tuples, num_threads)

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        for result in pool.map(query_ucsc_cytobandideo_db, db_tuples_lists):
            if result is None:
                continue
            # NOTE(review): `name_slug` is not taken from `result`; it still
            # holds the value bound by the final iteration of the row loop
            # above, so every result is filed under that single slug (and it
            # is unbound if `rows` was empty).  The correct key presumably
            # has to come from the worker's return value -- confirm the
            # return shape of query_ucsc_cytobandideo_db and fix.  Behaviour
            # is intentionally left unchanged here.
            org_map.setdefault(name_slug, []).append(result)

    time_ucsc += time_ms() - t0
    return org_map