Example #1
class DbTest(unittest.TestCase):
    def setUp(self):
        # Rebuild the schema in a scratch SQLite file before each test
        db.db_init('sqlite:///unit_test.db')
        db.metadata.drop_all(bind=db.connection)
        db.metadata.create_all(db.engine)
        self.r1 = RefSNP(1000, '1')  # snp on chromosome 1; gets two alleles below
        self.r2 = RefSNP(1001, '2')  # snp on chromosome 2; no alleles attached
        a1 = Allele('A', 'A', 100190109)
        a1.allele_count = 5000
        a1.total_count = 11000
        self.r1.put_allele(a1)
        a2 = Allele('A', 'G', 100190109)
        a2.allele_count = 6000
        a2.total_count = 11000
        self.r1.put_allele(a2)

    def test_bulk_insert(self):
        result = db.bulk_insert([self.r1, self.r2], db.ref_snps)
        self.assertEqual(2, result.rowcount)
        result = db.bulk_insert(self.r1.alleles, db.alleles)
        self.assertEqual(2, result.rowcount)

    def test_delete_chromosome(self):
        db.bulk_insert([self.r1, self.r2], db.ref_snps)
        db.bulk_insert(self.r1.alleles, db.alleles)
        RefSNP.delete_chromosomes(["1", "2"], db.connection)
        select_query = db.ref_snps.select().where(db.ref_snps.c.chromosome == '1')
        one_row = db.connection.execute(select_query).fetchone()
        self.assertIsNone(one_row)


    def test_default_init(self):
        # Smoke test: passes as long as default_init() does not raise
        db.default_init()
        self.assertTrue(True)
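
These tests follow the standard unittest layout, so the module can be run directly. A minimal runner sketch, assuming the project's own modules provide db, RefSNP and Allele (the import paths here are hypothetical):

import unittest
# from snp_db import db                 # hypothetical import path
# from ref_snp import RefSNP, Allele    # hypothetical import path

if __name__ == '__main__':
    unittest.main()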
Example #2
def fetch_snp_file(json_file, queue, min_maf=0):
    """
    Fetch a NIH refSNP file then open it and add RefSNP objects to the work queue.
    :param json_file: NIH file to download via FTP
    :param queue: work queue for RefSNP objects
    :param min_maf minimum minor allele frequency. SNPs with a lower MAF will not be saved to database.
    :return:
    """
    # Not sure if ftplib is threadsafe so use a ftp login per call
    ftp = ftp_login()
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    download_path = DOWNLOAD_DIR + json_file
    download_needed = True
    if os.path.exists(download_path):
        # Compare the FTP-published MD5 with the local copy to decide whether to re-download
        md5_lines = []
        ftp.retrlines('RETR ' + json_file + ".md5", md5_lines.append)
        remote_md5 = md5_lines[0].split(" ")[0] if md5_lines else None
        print("FTP MD5 of %s: %s" % (json_file, remote_md5))
        block_size = 65536
        hasher = hashlib.md5()
        with open(download_path, 'rb') as afile:
            buf = afile.read(block_size)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(block_size)
        local_md5 = hasher.hexdigest()
        print("Local file MD5: %s" % local_md5)
        if local_md5 == remote_md5:
            print("MD5 matches local copy. Skipping download.")
            download_needed = False
    if download_needed:
        with open(download_path, 'wb') as f:
            ftp.retrbinary('RETR ' + json_file, f.write)
    with bz2.BZ2File(download_path, 'rb') as f_in:
        chromosome = chromosome_from_filename(json_file)
        for line in f_in:
            snp = RefSNP.from_nih_json(line, chromosome)
            if snp.total_count > 0 and snp.alleles:
                # Keep the SNP only if its minor allele frequency clears the requested floor
                if 0 <= min_maf <= snp.maf:
                    queue.put(snp)
    return True
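
fetch_snp_file relies on an ftp_login() helper that is not shown above. A plausible sketch, assuming anonymous FTP to NIH's dbSNP server; the host and directory below are assumptions, not taken from the source:

import ftplib

def ftp_login():
    # Anonymous login; the exact JSON directory varies by dbSNP release (assumed path below)
    ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov')
    ftp.login()
    ftp.cwd('/snp/latest_release/JSON')
    return ftp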
Example #3
    def load_snps_db(self, min_freq, max_snps):
        """
        Load SNPs from the DB and store them as SNPTuples. Also output a map file for plink.
        :param min_freq: minimum minor allele frequency
        :param max_snps: maximum number of SNPs to load
        :return:
        """

        invalid_count = 0
        snps_result = db.connection.execute(
            "SELECT r.id, chromosome, maf, total_count, deleted, inserted, position, allele_count "
            "FROM ref_snps r "
            "JOIN alleles a ON r.id = a.ref_snp_id "
            "WHERE r.maf >= %f AND r.total_count >= %i "
            "ORDER BY r.id" % (min_freq, MIN_TOTAL_COUNT)
        )
        current_snp_id = -1
        snp = None
        for snp_row in snps_result:
            if snp_row["id"] != current_snp_id:
                # Finished the previous snp's rows; record it if usable
                if snp is not None:
                    if snp.valid_for_plink():
                        if self.snp_count >= max_snps - 1:
                            print("Hit max_snps size of %i. Stopping loading snps." % max_snps, flush=True)
                            break
                        self.add_snp_tuple(snp)
                        if self.snp_count % 100000 == 0:
                            print("Loaded %i snps. %s" % (self.snp_count, datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
                                  flush=True)
                    else:
                        invalid_count += 1
                # Start a new snp from this row
                snp = RefSNP.from_row_proxy(snp_row)

            # Add the joined allele data on every row
            snp.put_allele(Allele.from_row_proxy(snp_row))
            current_snp_id = snp_row["id"]
        # Record the final snp, which the loop above never flushes on its own
        if snp and snp.valid_for_plink():
            self.add_snp_tuple(snp)
        print("Skipped Invalid:        %i" % invalid_count, flush=True)
        print("Total Loaded:           %i" % len(self.ordered_snps), flush=True)
Example #4
def load_via_sql():
    start = datetime.now()
    db.default_init()
    print("%s SQL loading started" % start.strftime("%Y-%m-%d %H:%M:%S"))
    query = db.ref_snps.select().where(db.ref_snps.columns.maf >= MIN_FREQ)
    print(str(query.compile(db.engine, compile_kwargs={"literal_binds": True})))
    result = db.connection.execute(query)
    ref_snps = {}  # snp id -> RefSNP
    for row in result:
        snp = RefSNP.from_row_proxy(row)
        ref_snps[snp.id] = snp
    print("%s RefSNP Query complete" % datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    allele_query = "Select ref_snp_id, deleted, inserted, position, allele_count from ref_snps r  " \
                   "join alleles a on r.id = a.ref_snp_id and r.maf >= %f" \
                   " " % MIN_FREQ
    result = db.connection.execute(allele_query)
    for row in result:
        allele = Allele.from_row_proxy(row)
        ref_snps[row.ref_snp_id].put_allele(allele)
    end = datetime.now()
    print("%s DB loading finished. %s elapsed" % (end.strftime("%Y-%m-%d %H:%M:%S"), str(end - start)))
    print("%i snps loaded from the DB" % len(ref_snps))
    return ref_snps
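
The allele query above interpolates MIN_FREQ with % formatting, which works for a trusted numeric constant; bound parameters are the more robust pattern. A sketch of the same query in SQLAlchemy 1.x style (matching the RowProxy accessors used above):

from sqlalchemy import text

allele_query = text(
    "SELECT ref_snp_id, deleted, inserted, position, allele_count "
    "FROM ref_snps r JOIN alleles a ON r.id = a.ref_snp_id "
    "WHERE r.maf >= :min_freq"
)
result = db.connection.execute(allele_query, min_freq=MIN_FREQ)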
Example #5
def download_ref_snps(chromosome_list,
                      num_workers=2,
                      append_mode=False,
                      min_maf=0):
    """ Downloads all RefSNP data from NIH's FTP site. Requires ~250 GB of disk space
    """
    ftp = ftp_login()
    file_list = []
    # Get a list of files for download
    ftp.retrlines('NLST', file_list.append)
    search_pattern = DBSNP_JSON_PATTERN % ".*"
    if chromosome_list:
        chromosome_match = "(" + "|".join(chromosome_list).replace(" ", "") + ")"
        search_pattern = DBSNP_JSON_PATTERN % chromosome_match
    json_for_dl = [f for f in file_list if re.search(search_pattern, f)]
    if not append_mode:
        print("Removing old data from DB.")
        if not chromosome_list:
            try:
                print("No chromosome list specified. Clearing entire DB.")
                db.ref_snps.drop(db.engine)
                db.alleles.drop(db.engine)
            except Exception:
                print('INFO - Exception raised dropping tables. Possibly the tables do not exist. Continuing on.')
        else:
            RefSNP.delete_chromosomes(chromosome_list, db.connection)
    # Create schema if missing
    db.metadata.create_all(db.engine)
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor, \
            multiprocessing.Manager() as m:
        q = m.Queue(10000)
        # Start the load operations and save map of Future to filename
        future_to_file = {
            executor.submit(fetch_snp_file, json_file, q, min_maf): json_file
            for json_file in json_for_dl
        }
        # Sleep a bit to wait for download threads to load the queue
        time.sleep(10)
        count_inserted = 0
        while any(not f.done() for f in future_to_file.keys()):
            try:
                count_inserted += write_snps_to_db(q)
                if count_inserted > 0:
                    print("Inserted %i refSNPs." % count_inserted)
                time.sleep(2)  # Wait for more items in the queue
            except Exception as e:
                print("Exception writing snps to DB.")
                print(e)
                for f in future_to_file.keys():
                    f.cancel()
                raise e
        # Drain snps queued between the last poll and the final future completing
        count_inserted += write_snps_to_db(q)
        for future in future_to_file.keys():
            filename = future_to_file[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (filename, exc))
            else:
                print('Successfully downloaded %s and loaded into db.' %
                      filename)
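
The polling loop depends on a write_snps_to_db(q) helper that is not shown. A plausible sketch that drains the queue and reuses db.bulk_insert from the tests above; the batch_size default is an assumption:

import queue

def write_snps_to_db(q, batch_size=10000):
    # Drain whatever RefSNP objects are currently queued, up to batch_size
    snps = []
    try:
        while len(snps) < batch_size:
            snps.append(q.get_nowait())
    except queue.Empty:
        pass
    if not snps:
        return 0
    db.bulk_insert(snps, db.ref_snps)
    for snp in snps:
        if snp.alleles:
            db.bulk_insert(snp.alleles, db.alleles)
    return len(snps)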