예제 #1
0
def load(db, query=None):
    """Open the on-disk bcolz arrays that `query` refers to.

    Args:
        db: path to the sqlite database. It is also handed to
            get_bcolz_dir() to locate the directory holding the arrays.
        query: object tested with ``in`` against genotype-column names and
            sample names (presumably the query text -- confirm against the
            caller). When None, nothing is filtered out and every
            column/sample is opened, like the query-less ``load(db)``.

    Returns:
        dict mapping each selected genotype column name to a list built by
        walking the samples in order: the opened (read-only) bcolz array for
        a selected sample, or None as a place-holder for a sample that the
        query does not mention.
    """
    t0 = time.time()
    load_all = query is None

    conn = sqlite3.connect(db)
    try:
        cur = conn.cursor()

        gt_cols = get_gt_cols(cur)
        samples = get_samples(cur)
        bcpath = get_bcolz_dir(db)

        carrays = {}
        n = 0
        for gtc in gt_cols:
            if not (load_all or gtc in query):
                continue
            carrays[gtc] = []
            for s in samples:
                if not (load_all or s in query
                        or fix_sample_name(s) in query):
                    # need to add anyway as place-holder
                    carrays[gtc].append(None)
                    continue
                path = "%s/%s/%s" % (bcpath, s, gtc)
                # NOTE(review): a selected sample whose array directory is
                # missing gets no entry at all (not even None), which shifts
                # the position of every later sample -- confirm callers
                # tolerate this.
                if os.path.exists(path):
                    carrays[gtc].append(bcolz.open(path, mode="r"))
                    n += 1
    finally:
        # the connection is only needed to read column and sample names.
        conn.close()

    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load %d arrays\n"
                         % (time.time() - t0, n))
    return carrays
예제 #2
0
def load(db, query=None):
    """Open the on-disk bcolz arrays that `query` refers to.

    Column and sample names are read through the metadata object returned
    by ``database.get_session_metadata(db)``.

    Args:
        db: database identifier passed to database.get_session_metadata()
            and to get_bcolz_dir().
        query: object tested with ``in`` against genotype-column names and
            sample names (presumably the query text -- confirm against the
            caller). When None, nothing is filtered out and every
            column/sample is opened.

    Returns:
        dict mapping each selected genotype column name to a list built by
        walking the samples in order: the opened (read-only) bcolz array for
        a selected sample, or None as a place-holder for a sample that the
        query does not mention.
    """
    import database

    t0 = time.time()
    load_all = query is None

    # NOTE(review): `conn` is not used below and is not closed here because
    # ownership of the session is not visible from this function.
    conn, metadata = database.get_session_metadata(db)

    gt_cols = get_gt_cols(metadata)
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        if not (load_all or gtc in query):
            continue
        carrays[gtc] = []
        for s in samples:
            if not (load_all or s in query or fix_sample_name(s) in query):
                # need to add anyway as place-holder
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            # NOTE(review): a selected sample whose array directory is
            # missing gets no entry at all (not even None), which shifts the
            # position of every later sample -- confirm callers tolerate it.
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1

    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load %d arrays\n"
                         % (time.time() - t0, n))
    return carrays
예제 #3
0
    def __init__(self, db, include_gt_cols=False,
                 out_format=DefaultRowFormat(None),
                 variant_id_getter=None):
        """Bind the object to the database at `db` and cache sample lookups.

        Args:
            db: path to the database file; must exist on disk.
            include_gt_cols: stored as-is on the instance.
            out_format: row formatter; kept as ``self.formatter`` and its
                ``predicate`` seeds ``self.predicates``.
            variant_id_getter: stored as-is on the instance.
        """
        assert os.path.exists(db), "%s does not exist." % db

        # plain state taken from the arguments / fixed starting values.
        self.db = db
        self.query_executed = False
        self.for_browser = False
        self.include_gt_cols = include_gt_cols
        self.variant_id_getter = variant_id_getter

        # open the database before anything below queries it
        # (presumably this is what provides self.conn and self.c -- confirm).
        self._connect_to_database()

        # genotype column names are read from the database rather than
        # being hard-coded.
        self.gt_cols = util.get_gt_cols(self.conn)

        # sample-table column names; required for --gt-filter wildcards.
        self._collect_sample_table_columns()

        # sample ids collected per --gt-filter clause.
        self.sample_info = collections.defaultdict(list)

        # name -> index, e.g. self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indices(self.c)
        # index -> name, e.g. self.idx_to_sample[323] -> NA20814
        self.idx_to_sample = util.map_indices_to_samples(self.c)
        # the same two lookups, but yielding sample objects.
        self.idx_to_sample_object = util.map_indices_to_sample_objects(self.c)
        self.sample_to_sample_object = util.map_samples_to_sample_objects(self.c)

        self.formatter = out_format
        self.predicates = [out_format.predicate]
        self.sample_show_fields = [
            "variant_samples",
            "het_samples",
            "hom_alt_samples",
        ]
예제 #4
0
def load(db):
    """Open every bcolz array that exists on disk for `db`.

    Args:
        db: path to the sqlite database. It is also handed to
            get_bcolz_dir() to locate the directory holding the arrays.

    Returns:
        dict mapping each genotype column name to a list of read-only bcolz
        arrays, one per sample whose array directory exists (samples with
        no directory are skipped, so the list may be shorter than the
        sample list).
    """
    t0 = time.time()

    conn = sqlite3.connect(db)
    try:
        cur = conn.cursor()

        gt_cols = get_gt_cols(cur)
        samples = get_samples(cur)
        bcpath = get_bcolz_dir(db)

        carrays = {}
        for gtc in gt_cols:
            carrays[gtc] = []
            for s in samples:
                path = "%s/%s/%s" % (bcpath, s, gtc)
                if os.path.exists(path):
                    carrays[gtc].append(bcolz.open(path, mode="r"))
    finally:
        # the connection is only needed to read column and sample names.
        conn.close()

    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load arrays\n"
                         % (time.time() - t0))
    return carrays
예제 #5
0
def create(db, cols=None):
    """Write one bcolz array per (sample, genotype column) for `db`.

    Every row of the ``variants`` table is read once; each requested
    genotype column is decompressed with decomp() and its per-sample values
    are appended to that sample's array under the directory returned by
    get_bcolz_dir(db).

    Args:
        db: path to the sqlite database.
        cols: names of the genotype columns to index. When None, every
            column listed in gt_cols_types except 'gts' is indexed.

    Returns:
        None. Returns early, without creating any array, when the database
        has no variants or no samples.

    Raises:
        ValueError: if none of `cols` is a genotype column of the database.
        Any error raised while the arrays are being filled is re-raised
        after the array directories created so far have been removed.
    """
    if cols is None:
        cols = [x[0] for x in gt_cols_types if x[0] != 'gts']
        sys.stderr.write(
            "indexing all columns except 'gts'; to index that column, "
            "run gemini bcolz_index %s --cols gts\n" % db)

    conn = sqlite3.connect(db)
    try:
        cur = conn.cursor()
        gt_cols = [x for x in get_gt_cols(cur) if x in cols]
        samples = get_samples(cur)
        bcpath = get_bcolz_dir(db)

        mkdir(bcpath)

        nv = get_n_variants(cur)

        sys.stderr.write("loading %i variants for %i samples into bcolz\n"
                         % (nv, len(samples)))

        if nv == 0 or len(samples) == 0:
            return
        if not gt_cols:
            # without this the select below would have no columns at all.
            raise ValueError(
                "none of the requested columns %r are genotype columns in %s"
                % (cols, db))

        carrays = {}
        tmps = {}
        try:
            dtypes = dict(gt_cols_types)
            for gtc in gt_cols:
                carrays[gtc] = []
                tmps[gtc] = []

                dt = dtypes[gtc]
                for s in samples:
                    mkdir("%s/%s" % (bcpath, s))
                    carrays[gtc].append(
                        bcolz.carray(np.empty(0, dtype=dt),
                                     expectedlen=nv,
                                     rootdir="%s/%s/%s" % (bcpath, s, gtc),
                                     chunklen=16384 * 8,
                                     mode="w"))
                    tmps[gtc].append([])

            t0 = time.time()
            # number of rows buffered in memory between writes to disk.
            step = 200000

            # stand-in values for a row whose column holds no data.
            empty = [-1] * len(samples)
            select = "select %s from variants" % ", ".join(gt_cols)
            for i, row in enumerate(cur.execute(select)):
                # write the buffers out every `step` rows and on the last row.
                flush_now = (i > 0 and i % step == 0) or i == nv - 1
                for j, gt_col in enumerate(gt_cols):
                    vals = decomp(row[j])
                    # len() rather than truthiness: vals may be an array.
                    if vals is None or len(vals) == 0:  # empty gt_phred_ll
                        vals = empty
                    col_tmps = tmps[gt_col]
                    col_arrays = carrays[gt_col]
                    for isamp, pending in enumerate(col_tmps):
                        pending.append(vals[isamp])
                        if flush_now:
                            col_arrays[isamp].append(pending)
                            col_tmps[isamp] = []
                            col_arrays[isamp].flush()

                if i % step == 0 and i > 0:
                    # clamp so a zero-length interval cannot divide by zero.
                    elapsed = max(time.time() - t0, 1e-9)
                    sys.stderr.write("at %.1fM (%.0f rows / second)\n"
                                     % (i / 1000000., i / elapsed))

            # clamp: a ZeroDivisionError here would trigger the cleanup below
            # and delete an index that was written successfully.
            elapsed = max(time.time() - t0, 1e-9)
            sys.stderr.write("loaded %d variants at %.1f / second\n"
                             % (len(carrays[gt_cols[-1]][0]), nv / elapsed))
        except:
            # deliberately bare (also catches KeyboardInterrupt): on any
            # error we remove the dirs so we can't have weird problems,
            # then re-raise.
            for k, li in carrays.items():
                for i, ca in enumerate(li):
                    if i < 5:
                        sys.stderr.write("removing: %s\n" % (ca.rootdir,))
                    if i == 5:
                        sys.stderr.write(
                            "not reporting further removals for %s\n" % k)
                    ca.flush()
                    shutil.rmtree(ca.rootdir)
            raise
    finally:
        conn.close()
예제 #6
0
def create(db, cols=None):
    """Write one bcolz array per (sample, genotype column) for `db`.

    Every row of the ``variants`` table is read once; each requested
    genotype column is decompressed with decomp() and its per-sample values
    are appended to that sample's array under the directory returned by
    get_bcolz_dir(db). Values are buffered in memory and written out every
    `step` rows, where `step` shrinks as the number of samples grows.

    Args:
        db: path to the sqlite database.
        cols: names of the genotype columns to index. When None, every
            column listed in gt_cols_types except 'gts' is indexed.

    Returns:
        None. Returns early, without creating any array, when the database
        has no variants or no samples.

    Raises:
        ValueError: if none of `cols` is a genotype column of the database.
        Any error raised while the arrays are being filled is re-raised
        after the array directories created so far have been removed.
    """
    if cols is None:
        cols = [x[0] for x in gt_cols_types if x[0] != 'gts']
        sys.stderr.write(
            "indexing all columns except 'gts'; to index that column, "
            "run gemini bcolz_index %s --cols gts\n" % db)

    conn = sqlite3.connect(db)
    try:
        cur = conn.cursor()
        gt_cols = [x for x in get_gt_cols(cur) if x in cols]
        samples = get_samples(cur)
        bcpath = get_bcolz_dir(db)

        mkdir(bcpath)

        nv = get_n_variants(cur)

        sys.stderr.write("loading %i variants for %i samples into bcolz\n" %
                         (nv, len(samples)))

        if nv == 0 or len(samples) == 0:
            return
        if not gt_cols:
            # without this the select below would have no columns at all.
            raise ValueError(
                "none of the requested columns %r are genotype columns in %s"
                % (cols, db))

        carrays = {}
        tmps = {}
        try:
            dtypes = dict(gt_cols_types)
            for gtc in gt_cols:
                carrays[gtc] = []
                tmps[gtc] = []

                dt = dtypes[gtc]
                for s in samples:
                    mkdir("%s/%s" % (bcpath, s))
                    carrays[gtc].append(
                        bcolz.carray(np.empty(0, dtype=dt),
                                     expectedlen=nv,
                                     rootdir="%s/%s/%s" % (bcpath, s, gtc),
                                     chunklen=16384 * 8,
                                     mode="w"))
                    tmps[gtc].append([])

            t0 = time.time()
            # scale step by number of samples to limit memory use.
            # `//` keeps step an integer on every Python version.
            step = max(100, 2000000 // len(samples))
            sys.stderr.write("step-size: %i\n" % step)

            # stand-in values for a row whose column holds no data.
            empty = [-1] * len(samples)
            select = "select %s from variants" % ", ".join(gt_cols)
            for i, row in enumerate(cur.execute(select)):
                # write the buffers out every `step` rows and on the last row.
                flush_now = (i > 0 and i % step == 0) or i == nv - 1
                for j, gt_col in enumerate(gt_cols):
                    vals = decomp(row[j])
                    # len() rather than truthiness: vals may be an array.
                    if vals is None or len(vals) == 0:  # empty gt_phred_ll
                        vals = empty
                    col_tmps = tmps[gt_col]
                    col_arrays = carrays[gt_col]
                    for isamp, pending in enumerate(col_tmps):
                        pending.append(vals[isamp])
                        if flush_now:
                            col_arrays[isamp].append(pending)
                            col_tmps[isamp] = []
                            col_arrays[isamp].flush()

                if i % step == 0 and i > 0:
                    # clamp so a zero-length interval cannot divide by zero.
                    elapsed = max(time.time() - t0, 1e-9)
                    sys.stderr.write("at %.1fM (%.0f rows / second)\n"
                                     % (i / 1000000., i / elapsed))

            # clamp: a ZeroDivisionError here would trigger the cleanup below
            # and delete an index that was written successfully.
            elapsed = max(time.time() - t0, 1e-9)
            sys.stderr.write("loaded %d variants at %.1f / second\n"
                             % (len(carrays[gt_cols[-1]][0]), nv / elapsed))
        except:
            # deliberately bare (also catches KeyboardInterrupt): on any
            # error we remove the dirs so we can't have weird problems,
            # then re-raise.
            for k, li in carrays.items():
                for i, ca in enumerate(li):
                    if i < 5:
                        sys.stderr.write("removing: %s\n" % (ca.rootdir,))
                    if i == 5:
                        sys.stderr.write(
                            "not reporting further removals for %s\n" % k)
                    ca.flush()
                    shutil.rmtree(ca.rootdir)
            raise
    finally:
        conn.close()