class DbTest(unittest.TestCase):

    def setUp(self):
        db.db_init('sqlite:///unit_test.db')
        db.metadata.drop_all(bind=db.connection)
        db.metadata.create_all(db.engine)
        self.r1 = RefSNP(1000, '1')
        self.r2 = RefSNP(1001, '2')
        a1 = Allele('A', 'A', 100190109)
        a1.allele_count = 5000
        a1.total_count = 11000
        self.r1.put_allele(a1)
        a2 = Allele('A', 'G', 100190109)
        a2.allele_count = 6000
        a2.total_count = 11000
        self.r1.put_allele(a2)

    def test_bulk_insert(self):
        result = db.bulk_insert([self.r1, self.r2], db.ref_snps)
        self.assertEqual(2, result.rowcount)
        result = db.bulk_insert(self.r1.alleles, db.alleles)
        self.assertEqual(2, result.rowcount)

    def test_delete_chromosome(self):
        result = db.bulk_insert([self.r1, self.r2], db.ref_snps)
        result = db.bulk_insert(self.r1.alleles, db.alleles)
        RefSNP.delete_chromosomes(["1", "2"], db.connection)
        select_query = db.ref_snps.select().where(db.ref_snps.c.chromosome == '1')
        one_row = db.connection.execute(select_query).fetchone()
        self.assertIsNone(one_row)

    def test_default_init(self):
        db.default_init()
        self.assertTrue(True)
def fetch_snp_file(json_file, queue, min_maf=0):
    """
    Fetch a NIH refSNP file, then open it and add RefSNP objects to the work queue.

    :param json_file: NIH file to download via FTP
    :param queue: work queue for RefSNP objects
    :param min_maf: minimum minor allele frequency. SNPs with a lower MAF will not be saved to the database.
    :return:
    """
    # Not sure if ftplib is thread-safe, so use an FTP login per call
    ftp = ftp_login()
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    download_path = DOWNLOAD_DIR + json_file
    download_needed = True
    if os.path.exists(download_path):
        # Compare the remote MD5 with the local copy to avoid re-downloading
        md5 = []
        ftp.retrlines('RETR ' + json_file + ".md5", md5.append)
        if md5:
            md5 = md5[0].split(" ")[0]
            print("FTP MD5 of %s: %s" % (json_file, md5))
            block_size = 65536
            hasher = hashlib.md5()
            with open(download_path, 'rb') as afile:
                buf = afile.read(block_size)
                while len(buf) > 0:
                    hasher.update(buf)
                    buf = afile.read(block_size)
            local_md5 = hasher.hexdigest()
            print("Local File MD5: %s" % local_md5)
            if local_md5 == md5:
                print("MD5 matches local copy. Skipping download.")
                download_needed = False
    if download_needed:
        with open(download_path, 'wb') as f:
            ftp.retrbinary('RETR ' + json_file, f.write)
    with bz2.BZ2File(download_path, 'rb') as f_in:
        chromosome = chromosome_from_filename(json_file)
        for line in f_in:
            snp = RefSNP.from_nih_json(line, chromosome)
            if snp.total_count > 0 and snp.alleles:
                if 0 <= min_maf <= snp.maf:
                    queue.put(snp)
    return True
def load_snps_db(self, min_freq, max_snps):
    """
    Load SNPs from the DB and store them as SNPTuples. Also output a map file for plink.

    :param min_freq: minimum minor allele frequency
    :param max_snps: maximum number of SNPs to load
    :return:
    """
    invalid_count = 0
    snps_result = db.connection.execute(
        "Select r.id, chromosome, maf, total_count, deleted, inserted, position, allele_count "
        "from ref_snps r "
        "join alleles a on r.id = a.ref_snp_id "
        "and r.maf >= %f and r.total_count >= %i" % (min_freq, MIN_TOTAL_COUNT)
    )
    current_snp_id = -1
    snp = None
    for snp_row in snps_result:
        if snp_row["id"] != current_snp_id:
            if snp and snp.valid_for_plink():
                if self.snp_count >= max_snps - 1:
                    print("Hit max_snps size of %i. Stopping loading snps." % max_snps, flush=True)
                    break
                self.add_snp_tuple(snp)
                if self.snp_count % 100000 == 0:
                    print("Loaded %i snps. %s" %
                          (self.snp_count, datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
                          flush=True)
            else:
                invalid_count += 1
            # Otherwise this row starts a new snp
            snp = RefSNP.from_row_proxy(snp_row)
        # Add joined allele data every time
        snp.put_allele(Allele.from_row_proxy(snp_row))
        current_snp_id = snp_row["id"]
    self.add_snp_tuple(snp)
    print("Skipped Invalid: %i" % invalid_count, flush=True)
    print("Total Loaded: %i" % len(self.ordered_snps), flush=True)
def load_via_sql():
    start = datetime.now()
    db.default_init()
    print("%s SQL loading started" % start.strftime("%Y-%m-%d %H:%M:%S"))
    query = db.ref_snps.select().where(db.ref_snps.columns.maf >= MIN_FREQ)
    print(str(query.compile(db.engine, compile_kwargs={"literal_binds": True})))
    result = db.connection.execute(query)
    ref_snps = {}
    for row in result:
        snp = RefSNP.from_row_proxy(row)
        ref_snps[snp.id] = snp
    print("%s RefSNP Query complete" % datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    allele_query = "Select ref_snp_id, deleted, inserted, position, allele_count from ref_snps r " \
                   "join alleles a on r.id = a.ref_snp_id and r.maf >= %f" % MIN_FREQ
    result = db.connection.execute(allele_query)
    for row in result:
        allele = Allele.from_row_proxy(row)
        ref_snps[row.ref_snp_id].put_allele(allele)
    end = datetime.now()
    print("%s DB loading finished. %s elapsed" % (end.strftime("%Y-%m-%d %H:%M:%S"), str(end - start)))
    print("%i snps loaded from the DB" % len(ref_snps))
    return ref_snps
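# Hedged usage sketch (illustrative, not part of the original source): load every RefSNP
# at or above MIN_FREQ into memory and report how many alleles the first one carries.
# The helper name _example_load_via_sql is hypothetical, and it assumes the database that
# db.default_init() configures inside load_via_sql() has already been populated.
def _example_load_via_sql():
    ref_snps = load_via_sql()
    if ref_snps:
        first_snp = next(iter(ref_snps.values()))
        print("RefSNP %s carries %i alleles" % (first_snp.id, len(first_snp.alleles)))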
def download_ref_snps(chromosome_list, num_workers=2, append_mode=False, min_maf=0):
    """
    Downloads all RefSNP data from NIH's FTP site. Requires ~250 GB of disk space.
    """
    ftp = ftp_login()
    file_list = []
    # Get a list of files available for download
    ftp.retrlines('NLST', file_list.append)
    search_pattern = DBSNP_JSON_PATTERN % ".*"
    if chromosome_list:
        chromosome_match = "(" + "|".join(chromosome_list).replace(" ", "") + ")"
        search_pattern = DBSNP_JSON_PATTERN % chromosome_match
    json_for_dl = [f for f in file_list if re.search(search_pattern, f)]
    if not append_mode:
        print("Removing old data from DB.")
        if not chromosome_list:
            try:
                print("No chromosome list specified. Clearing entire DB.")
                db.ref_snps.drop(db.engine)
                db.alleles.drop(db.engine)
            except Exception:
                print('INFO - Exception raised dropping tables. Possibly tables do not exist. Continuing on.')
        else:
            RefSNP.delete_chromosomes(chromosome_list, db.connection)
    # Create schema if missing
    db.metadata.create_all(db.engine)
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor, \
            multiprocessing.Manager() as m:
        q = m.Queue(10000)
        # Start the load operations and save a map of Future to filename
        future_to_file = {executor.submit(fetch_snp_file, json_file, q, min_maf): json_file
                          for json_file in json_for_dl}
        # Sleep a bit to wait for download processes to load the queue
        time.sleep(10)
        count_inserted = 0
        while any(not f.done() for f in future_to_file.keys()):
            try:
                count_inserted += write_snps_to_db(q)
                if count_inserted > 0:
                    print("Inserted %i refSNPs." % count_inserted)
                time.sleep(2)  # Wait for more items in the queue
            except Exception as e:
                print("Exception writing snps to DB.")
                print(e)
                for f in future_to_file.keys():
                    f.cancel()
                raise e
        for future in future_to_file.keys():
            filename = future_to_file[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (filename, exc))
            else:
                print('Successfully downloaded %s and loaded into db.' % filename)
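# Hedged usage sketch (illustrative, not from the original source): a minimal CLI entry
# point wiring download_ref_snps() to command-line arguments. The flag names
# (--chromosomes, --workers, --append, --min-maf) are assumptions for this example, and
# it assumes db.default_init() is the appropriate way to initialize the connection here,
# as it is in load_via_sql() and the unit tests.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description="Download NIH dbSNP refSNP JSON files and load them into the local DB.")
    parser.add_argument('--chromosomes', nargs='*', default=None,
                        help="Chromosomes to download, e.g. 1 2 X. Default: all.")
    parser.add_argument('--workers', type=int, default=2,
                        help="Number of download processes.")
    parser.add_argument('--append', action='store_true',
                        help="Append to existing data instead of clearing it first.")
    parser.add_argument('--min-maf', type=float, default=0,
                        help="Skip SNPs below this minor allele frequency.")
    args = parser.parse_args()

    db.default_init()
    download_ref_snps(args.chromosomes, num_workers=args.workers,
                      append_mode=args.append, min_maf=args.min_maf)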