def __init__(self, db):
    """
    Set up a query object for the database `db`.

    Records `db`, resets the query_executed / for_browser flags, calls
    self._connect_to_database(), and then builds the two sample lookup
    tables from the cursor self.c.
    """
    self.for_browser = False
    self.query_executed = False
    self.db = db

    # The lookups below read self.c, so the connection must come first.
    # (presumably _connect_to_database() is what creates self.c -- confirm)
    self._connect_to_database()

    # sample name -> index, e.g. self.sample_to_idx[NA20814] -> 323
    self.sample_to_idx = util.map_samples_to_indicies(self.c)
    # index -> sample name, e.g. self.idx_to_sample[323] -> NA20814
    self.idx_to_sample = util.map_indicies_to_samples(self.c)
def get_query(args, c):
    """
    Execute a user-defined query passed in via the command line.

    args -- parsed command-line options; args.query (the SQL text) and
            args.use_header are read here.
    c    -- database cursor handed through to the query helpers.

    A query whose whitespace-separated pieces never contain "gt" is sent
    to apply_basic_query(); any other query is sent to
    apply_query_w_genotype_select().
    """
    query_pieces = args.query.split()

    # '"gt" in s' is true whenever 's.startswith("gt")' is, so a single
    # substring test is enough to spot a genotype column reference.
    if any("gt" in s for s in query_pieces):
        # NOTE(review): the return value is discarded here.  If
        # apply_query_w_genotype_select() is a generator, nothing is
        # executed until a caller iterates it -- confirm who consumes it.
        apply_query_w_genotype_select(c, args.query, args.use_header)
    else:
        apply_basic_query(c, args)
def __init__(self, db, include_gt_cols=False):
    """
    Set up a query object for the existing database file `db`.

    db              -- path to the database; it must already exist.
    include_gt_cols -- stored as self.include_gt_cols (default False).

    Raises AssertionError if `db` does not exist on disk.
    """
    # Raise explicitly instead of using an `assert` statement: asserts are
    # removed under `python -O`, which would silently skip this check.
    # The exception type and message are kept for existing callers.
    if not os.path.exists(db):
        raise AssertionError("%s does not exist." % db)

    self.db = db
    self.query_executed = False
    self.for_browser = False
    self.include_gt_cols = include_gt_cols

    # The lookups below read self.c, so the connection must come first.
    # (presumably _connect_to_database() is what creates self.c -- confirm)
    self._connect_to_database()

    # map sample names to indices. e.g. self.sample_to_idx[NA20814] -> 323
    self.sample_to_idx = util.map_samples_to_indicies(self.c)
    # and vice versa. e.g., self.idx_to_sample[323] -> NA20814
    self.idx_to_sample = util.map_indicies_to_samples(self.c)
def get_query(args, c):
    """
    Run the query the user supplied on the command line.

    args -- parsed command-line options; args.query holds the SQL text and
            the whole object is forwarded to the helper that runs it.
    c    -- database cursor used for the sample lookup and the query.

    When some whitespace-separated piece of the query mentions "gt", the
    SQL is first split up by refine_sql() and run by apply_refined_query();
    otherwise it goes straight to apply_query().
    """
    sample_to_idx = util.map_samples_to_indicies(c)
    pieces = args.query.split()

    if (any(piece.startswith("gt") for piece in pieces) or
            any("gt" in piece for piece in pieces)):
        refined = refine_sql(args.query, sample_to_idx)
        tokens, select_cols, main_where, gts_where = refined
        apply_refined_query(c, tokens, select_cols,
                            main_where, gts_where, args)
        return

    # no genotype columns anywhere in the query
    apply_query(c, args)
def apply_query_w_genotype_select(c, query, use_header):
    """
    Execute a query that contains gt* columns only in the SELECT.

    This is a generator; nothing runs until it is iterated.

    c          -- database cursor; used for the sample lookup, to execute
                  the query, and then iterated for the result rows.
    query      -- the user's SQL text.
    use_header -- when true, the first item yielded is a header
                  OrderedDict mapping each column name to itself.

    Yields one OrderedDict per result row, keyed by reported column.
    """
    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    (select_cols, all_cols_new, all_cols_orig, gt_col_map) = \
        _split_select(query, sample_to_idx)

    query = add_gt_cols_to_query(query.lower())
    c.execute(query)

    # the non-genotype columns that the executed query actually returned.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        # NOTE(review): unlike filter_query(), "*" is NOT removed from
        # all_cols_orig here, so it may show up in the header below --
        # confirm whether that is intended.
        all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        h = list(all_query_cols) + \
            list(oset(all_cols_orig) - oset(select_cols))
        # builtin zip() behaves the same as itertools.izip() here
        yield OrderedDict(zip(h, h))

    report_cols = all_query_cols + \
        list(oset(all_cols_new) - oset(select_cols))

    for row in c:
        # These four locals look unused, but the eval() below resolves
        # genotype column expressions against them by name.  Do not rename
        # them or move the eval() into another scope.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if col.startswith(("gt", "GT")):
                # SECURITY: eval() runs text derived from the user's query.
                # Only safe while the query comes from a trusted operator.
                fields[col] = eval(col.strip())
            else:
                fields[col] = row[col]
        yield fields
def filter_query(c, query, gt_filter, use_header):
    """
    Execute a base SQL query while filtering the returned rows on the
    genotype-specific columns.

    For example:
    --gt_filter "(gt_types.1478PC0011 == 1 or gt_types.1478PC0012 == 1)"

    This is a generator; nothing runs until it is iterated.

    c          -- database cursor; used for the sample lookup, to execute
                  the query, and then iterated for the result rows.
    query      -- the user's SQL text.
    gt_filter  -- the raw genotype filter expression supplied by the user.
    use_header -- when true, the first item yielded is a header
                  OrderedDict mapping each column name to itself.

    Yields one OrderedDict per result row for which the filter is true.
    """

    def correct_genotype_filter(gt_filter, sample_to_idx):
        """
        Convert a "raw" genotype filter supplied by the user into a
        filter that can be eval()'ed.  Specifically, convert each _named_
        genotype index to a _numerical_ genotype index so that the
        appropriate value can be extracted for the sample from the
        unpacked genotype arrays.

        For example, converts:
        --gt-filter "(gt_types.1478PC0011 == 1)"
        to
        (gt_types[11] == 1)
        """
        corrected_gt_filter = []
        # Split on whitespace only.  A character class such as [\s+] would
        # also treat a literal "+" as a delimiter and silently drop
        # addition operators from the filter expression.
        tokens = re.split(r"\s+", gt_filter)
        for token in tokens:
            if "gt" in token or "GT" in token:
                corrected = _correct_genotype_col(token, sample_to_idx)
                corrected_gt_filter.append(corrected)
            else:
                corrected_gt_filter.append(token)
        return " ".join(corrected_gt_filter)

    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    gt_filter = correct_genotype_filter(gt_filter, sample_to_idx)
    (select_cols, all_cols_new, all_cols_orig, gt_col_map) = \
        _split_select(query, sample_to_idx)

    query = add_gt_cols_to_query(query.lower())
    c.execute(query)

    # the non-genotype columns that the executed query actually returned.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        all_cols_orig.remove("*")
        all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        h = list(all_query_cols) + \
            list(oset(all_cols_orig) - oset(select_cols))
        # builtin zip() behaves the same as itertools.izip() here
        yield OrderedDict(zip(h, h))

    report_cols = all_query_cols + \
        list(oset(all_cols_new) - oset(select_cols))

    for row in c:
        # These four locals look unused, but both eval() calls below
        # resolve genotype expressions against them by name.  Do not
        # rename them or move the eval() calls into another scope.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        # SECURITY: eval() runs the user-supplied filter text.  Only safe
        # while the filter comes from a trusted operator.
        if not eval(gt_filter):
            continue

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if col.startswith(("gt", "GT")):
                # SECURITY: same caveat as above, for column expressions.
                fields[col] = eval(col.strip())
            else:
                fields[col] = row[col]
        yield fields