def __init__(self, *inargs, **inkwargs):
    """Set up the annotator: parse arguments, resolve module paths, and load its configuration."""
    try:
        script_path = os.path.abspath(sys.modules[self.__module__].__file__)
        # Placeholders populated later by argument parsing / DB setup.
        self.primary_input_path = None
        self.secondary_paths = None
        self.output_dir = None
        self.output_basename = None
        self.plain_output = None
        self.job_conf_path = None
        self.logger = None
        self.dbconn = None
        self.cursor = None
        self._define_cmd_parser()
        self.args = cravat.util.get_args(self.cmd_arg_parser, inargs, inkwargs)
        self.parse_cmd_args(inargs, inkwargs)
        # Optional attributes that the caller may have supplied via parsed args.
        self.status_writer = getattr(self.args, "status_writer", None)
        live_mode = getattr(self.args, "live", False)
        self.supported_chroms = set(cannonical_chroms)
        if live_mode:
            # Live mode skips file-system, logger, and config setup entirely.
            return
        script_fname = os.path.basename(script_path)
        # Module name is the script file name minus its extension.
        self.module_name = script_fname.rsplit(".", 1)[0] if "." in script_fname else script_fname
        self.annotator_name = self.module_name
        self.module_dir = os.path.dirname(script_path)
        self.annotator_dir = os.path.dirname(script_path)
        self.data_dir = os.path.join(self.module_dir, "data")
        self._setup_logger()
        self.conf = ConfigLoader(self.job_conf_path).get_module_conf(self.module_name)
        self._verify_conf()
        self._id_col_name = self.conf["output_columns"][0]["name"]
        if "logging_level" in self.conf:
            self.logger.setLevel(self.conf["logging_level"].upper())
        self.annotator_display_name = (
            self.conf["title"]
            if "title" in self.conf
            else os.path.basename(self.module_dir).upper()
        )
        self.annotator_version = self.conf["version"] if "version" in self.conf else ""
    except Exception as e:
        self._log_exception(e)
def __init__(self, *inargs, **inkwargs):
    """Set up the annotator: parse arguments, resolve module paths, and load its configuration."""
    try:
        script_path = os.path.abspath(sys.modules[self.__module__].__file__)
        # Placeholders populated later by argument parsing / DB setup.
        self.primary_input_path = None
        self.secondary_paths = None
        self.output_dir = None
        self.output_basename = None
        self.plain_output = None
        self.job_conf_path = None
        self.logger = None
        self.dbconn = None
        self.cursor = None
        self._define_cmd_parser()
        self.args = cravat.util.get_args(self.cmd_arg_parser, inargs, inkwargs)
        self.parse_cmd_args(inargs, inkwargs)
        # Optional attributes that the caller may have supplied via parsed args.
        self.status_writer = getattr(self.args, 'status_writer', None)
        live_mode = getattr(self.args, 'live', False)
        if live_mode:
            # Live mode skips file-system, logger, and config setup entirely.
            return
        script_fname = os.path.basename(script_path)
        # Module name is the script file name minus its extension.
        self.module_name = script_fname.rsplit('.', 1)[0] if '.' in script_fname else script_fname
        self.annotator_name = self.module_name
        self.module_dir = os.path.dirname(script_path)
        self.annotator_dir = os.path.dirname(script_path)
        self.data_dir = os.path.join(self.module_dir, 'data')
        self._setup_logger()
        self.conf = ConfigLoader(self.job_conf_path).get_module_conf(self.module_name)
        self._verify_conf()
        self._id_col_name = self.conf['output_columns'][0]['name']
        if 'logging_level' in self.conf:
            self.logger.setLevel(self.conf['logging_level'].upper())
        self.annotator_display_name = (
            self.conf['title']
            if 'title' in self.conf
            else os.path.basename(self.module_dir).upper()
        )
        self.annotator_version = self.conf['version'] if 'version' in self.conf else ''
    except Exception as e:
        self._log_exception(e)
def __init__(self, cmd_args):
    """Initialize from command-line args: configure logging, load module conf, and open the DB."""
    self.module_name = get_caller_name(cmd_args[0])
    self.parse_cmd_args(cmd_args)
    self._setup_logger()
    self.conf = ConfigLoader().get_module_conf(self.module_name)
    self.fix_col_names()
    # Database handles are created by _open_db_connection() below.
    self.dbconn = None
    self.cursor = None
    self.cursor_w = None
    self._open_db_connection()
    # check() decides whether annotate() needs to run at all.
    self.should_run_annotate = self.check()
def __init__(self, cmd_args, status_writer, live=False):
    """Initialize the mapper: record run mode, resolve module paths, parse args, and load conf."""
    self.live = live
    self.t = time.time()  # start timestamp for runtime reporting
    self.status_writer = status_writer
    script_path = cmd_args[0]
    script_fname = os.path.basename(script_path)
    # Module name is the script file name minus its extension.
    if '.' in script_fname:
        self.module_name = '.'.join(script_fname.split('.')[:-1])
    else:
        self.module_name = script_fname
    self.module_dir = os.path.dirname(script_path)
    self.mapper_dir = os.path.dirname(script_path)
    # Placeholders populated by argument parsing and later setup steps.
    self.cmd_parser = None
    self.cmd_args = None
    self.input_path = None
    self.input_dir = None
    self.reader = None
    self.output_dir = None
    self.output_base_fname = None
    self.crx_path = None
    self.crg_path = None
    self.crt_path = None
    self.crx_writer = None
    self.crg_writer = None
    self.crt_writer = None
    self.gene_sources = []
    self.gene_info = {}
    self._define_main_cmd_args()
    self._define_additional_cmd_args()
    self._parse_cmd_args(cmd_args)
    self._setup_logger()
    self.conf = ConfigLoader().get_module_conf(self.module_name)
    self.cravat_version = pkg_resources.get_distribution('open-cravat').version
def __init__(self, cmd_args, status_writer):
    """Initialize the annotator: resolve paths, parse args, create the output dir, and load conf."""
    try:
        self.status_writer = status_writer
        self.logger = None
        script_path = cmd_args[0]
        script_fname = os.path.basename(script_path)
        # Annotator name is the script file name minus its extension.
        if '.' in script_fname:
            self.annotator_name = '.'.join(script_fname.split('.')[:-1])
        else:
            self.annotator_name = script_fname
        self.annotator_dir = os.path.dirname(script_path)
        self.data_dir = os.path.join(self.annotator_dir, 'data')
        # Placeholders filled in by parse_cmd_args().
        self.primary_input_path = None
        self.secondary_paths = None
        self.output_dir = None
        self.output_basename = None
        self.plain_output = None
        self.job_conf_path = None
        self.parse_cmd_args(cmd_args)
        # Create the output directory if it is missing.
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self._setup_logger()
        self.conf = ConfigLoader(self.job_conf_path).get_module_conf(self.annotator_name)
        self._verify_conf()
        self._id_col_name = self.conf['output_columns'][0]['name']
        if 'logging_level' in self.conf:
            self.logger.setLevel(self.conf['logging_level'].upper())
        self.annotator_display_name = (
            self.conf['title']
            if 'title' in self.conf
            else os.path.basename(self.annotator_dir).upper()
        )
        self.annotator_version = self.conf['version'] if 'version' in self.conf else ''
        self.dbconn = None
        self.cursor = None
    except Exception as e:
        self._log_exception(e)
def __init__(self, cmd_args):
    """Initialize the annotator from its own module file: resolve paths, parse args, load conf."""
    try:
        self.logger = None
        script_path = os.path.abspath(sys.modules[self.__module__].__file__)
        script_fname = os.path.basename(script_path)
        # Annotator name is the script file name minus its extension.
        if "." in script_fname:
            self.annotator_name = ".".join(script_fname.split(".")[:-1])
        else:
            self.annotator_name = script_fname
        self.annotator_dir = os.path.dirname(script_path)
        self.data_dir = os.path.join(self.annotator_dir, "data")
        # Placeholders filled in by parse_cmd_args().
        self.primary_input_path = None
        self.secondary_paths = None
        self.output_dir = None
        self.output_basename = None
        self.plain_output = None
        self.job_conf_path = None
        self.parse_cmd_args(cmd_args)
        # Create the output directory if it is missing.
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self._setup_logger()
        self.conf = ConfigLoader(self.job_conf_path).get_module_conf(self.annotator_name)
        self._verify_conf()
        self._id_col_name = self.conf["output_columns"][0]["name"]
        if "logging_level" in self.conf:
            self.logger.setLevel(self.conf["logging_level"].upper())
        self.annotator_display_name = (
            self.conf["title"]
            if "title" in self.conf
            else os.path.basename(self.annotator_dir).upper()
        )
        # NOTE: unlike sibling variants, annotator_version stays unset when conf has no "version".
        if "version" in self.conf:
            self.annotator_version = self.conf["version"]
        self.logger.info("Initialized %s" % self.annotator_name)
        self.dbconn = None
        self.cursor = None
    except Exception as e:
        self._log_exception(e)
def __init__(self, *inargs, **inkwargs):
    """Initialize the mapper from flexible args: parse options, resolve module paths, load conf."""
    # Placeholders populated by argument parsing and later setup steps.
    self.cmd_parser = None
    self.input_path = None
    self.input_dir = None
    self.reader = None
    self.output_dir = None
    self.output_base_fname = None
    self.crx_path = None
    self.crg_path = None
    self.crt_path = None
    self.crx_writer = None
    self.crg_writer = None
    self.crt_writer = None
    self._define_main_cmd_args()
    self._define_additional_cmd_args()
    self._parse_cmd_args(inargs, inkwargs)
    # Optional attributes that the caller may have supplied via parsed args.
    self.status_writer = getattr(self.args, "status_writer", None)
    self.live = getattr(self.args, "live", False)
    self.t = time.time()  # start timestamp for runtime reporting
    script_path = self.args.script_path
    script_fname = os.path.basename(script_path)
    # Module name is the script file name minus its extension.
    if "." in script_fname:
        self.module_name = ".".join(script_fname.split(".")[:-1])
    else:
        self.module_name = script_fname
    self.module_dir = os.path.dirname(script_path)
    self.mapper_dir = os.path.dirname(script_path)
    self.gene_sources = []
    self.gene_info = {}
    self._setup_logger()
    self.conf = ConfigLoader().get_module_conf(self.module_name)
    self.cravat_version = pkg_resources.get_distribution("open-cravat").version
class CravatReport:
    """Base class for report generators.

    Reads an aggregated result sqlite DB (via aiosqlite and a CravatFilter),
    builds per-level column metadata, and streams filtered rows to the
    subclass-provided write_preface/write_header/write_table_row hooks.
    """

    def __init__(self, *inargs, **inkwargs):
        # Per-level state; keys are level names such as "variant" / "gene".
        self.cf = None
        self.filtertable = "filter"
        self.colinfo = {}
        self.colnos = {}
        self.newcolnos = {}
        self.var_added_cols = []
        self.summarizing_modules = []
        self.columngroups = {}
        self.column_subs = {}
        self.column_sub_allow_partial_match = {}
        self.colname_conversion = {}
        self.warning_msgs = []
        self.colnames_to_display = {}
        self.colnos_to_display = {}
        self.display_select_columns = {}
        self.extracted_cols = {}
        self.conn = None
        self.levels_to_write = None
        self.parse_cmd_args(inargs, inkwargs)
        global parser
        # Removes the "-t" option from the module-level argparse parser.
        # NOTE(review): relies on argparse private attributes (_action_groups,
        # _actions) and mutates the shared global parser — confirm intent.
        for ag in parser._action_groups:
            if ag.title == "optional arguments":
                for a in ag._actions:
                    if "-t" in a.option_strings:
                        ag._actions.remove(a)
        self._setup_logger()

    def parse_cmd_args(self, inargs, inkwargs):
        """Parse reporter options and derive paths, filter settings, and confs."""
        parsed_args = cravat.util.get_args(parser, inargs, inkwargs)
        self.parsed_args = parsed_args
        if parsed_args.md is not None:
            constants.custom_modules_dir = parsed_args.md
        self.dbpath = parsed_args.dbpath
        self.filterpath = parsed_args.filterpath
        self.filtername = parsed_args.filtername
        self.filterstring = parsed_args.filterstring
        self.filtersql = parsed_args.filtersql
        self.filter = parsed_args.filter
        self.confs = {}
        # Output dir defaults to the directory holding the result DB.
        if parsed_args.output_dir is not None:
            self.output_dir = parsed_args.output_dir
        else:
            self.output_dir = os.path.dirname(self.dbpath)
        self.savepath = parsed_args.savepath
        # A bare file name for savepath is placed inside output_dir.
        if self.savepath is not None and os.path.dirname(self.savepath) == "":
            self.savepath = os.path.join(self.output_dir, self.savepath)
        self.confpath = parsed_args.confpath
        self.conf = ConfigLoader(job_conf_path=self.confpath)
        self.module_name = parsed_args.module_name
        if self.module_name in self.conf._all:
            self.confs.update(self.conf._all[self.module_name])
        if self.conf is not None:
            self.module_conf = self.conf.get_module_conf(self.module_name)
        else:
            self.module_conf = None
        if hasattr(parsed_args, "reporttypes"):
            self.report_types = parsed_args.reporttypes
        if hasattr(parsed_args, "conf") and parsed_args.conf is not None:
            self.confs.update(parsed_args.conf)
        if parsed_args.confs is not None:
            # --confs arrives as a quoted JSON string; normalize quotes first.
            confs = parsed_args.confs.lstrip("'").rstrip("'").replace("'", '"')
            # NOTE(review): self.confs was initialized to {} above, so the
            # None branch here looks unreachable — confirm.
            if self.confs is None:
                self.confs = json.loads(confs)
            else:
                self.confs.update(json.loads(confs))
        # Chooses filter.
        if self.filter is None:
            if self.confs is not None and "filter" in self.confs:
                self.filter = self.confs["filter"]
        local = au.mic.get_local()
        # Fall back to a package-provided filter when nothing else set one.
        if (self.filter is None and self.filterpath is None
                and self.filtername is None and self.filterstring is None
                and parsed_args.package is not None
                and parsed_args.package in local
                and "filter" in local[parsed_args.package].conf):
            self.filter = local[parsed_args.package].conf["filter"]
        # [:-7] strips the ".sqlite" suffix from the DB file name.
        self.output_basename = os.path.basename(self.dbpath)[:-7]
        status_fname = "{}.status.json".format(self.output_basename)
        self.status_fpath = os.path.join(self.output_dir, status_fname)
        self.nogenelevelonvariantlevel = parsed_args.nogenelevelonvariantlevel
        # Recover the original input file paths from the result DB's info table.
        if parsed_args.inputfiles is None and parsed_args.dbpath is not None:
            db = sqlite3.connect(parsed_args.dbpath)
            c = db.cursor()
            q = 'select colval from info where colkey="_input_paths"'
            c.execute(q)
            r = c.fetchone()
            if r is not None:
                parsed_args.inputfiles = []
                s = r[0]
                # Stored value may use single quotes / unescaped backslashes;
                # massage it into valid JSON before parsing.
                if " " in s:
                    s = s.replace("'", '"')
                    s = s.replace("\\", "\\\\\\\\")
                s = json.loads(s)
                for k in s:
                    input_path = s[k]
                    parsed_args.inputfiles.append(input_path)
            c.close()
            db.close()
        if hasattr(parsed_args, "status_writer"):
            self.status_writer = parsed_args.status_writer
        else:
            self.status_writer = None
        self.concise_report = parsed_args.concise_report
        self.extract_columns_multilevel = self.get_standardized_module_option(
            self.confs.get("extract-columns", {}))
        self.args = parsed_args

    def should_write_level(self, level):
        """Return True when `level` should be emitted (no restriction, or listed)."""
        if self.levels_to_write is None:
            return True
        elif level in self.levels_to_write:
            return True
        else:
            return False

    async def prep(self):
        """Connect to the DB and load the filter; log and re-raise on failure."""
        try:
            await self.connect_db()
            await self.load_filter()
        except Exception as e:
            if hasattr(self, "cf"):
                await self.cf.close_db()
            # Exceptions flagged with notraceback=True are logged quietly.
            if not hasattr(e, "notraceback") or e.notraceback != True:
                import traceback
                traceback.print_exc()
                self.logger.error(e)
            else:
                if hasattr(self, "logger"):
                    write_log_msg(self.logger, e)
            # Mark as already reported so upstream handlers can skip re-logging.
            e.handled = True
            raise

    def _setup_logger(self):
        """Create the module and error loggers unless no_log is set."""
        if hasattr(self, "no_log") and self.no_log:
            return
        try:
            self.logger = logging.getLogger("cravat." + self.module_name)
        except Exception as e:
            self._log_exception(e)
        self.error_logger = logging.getLogger("error." + self.module_name)
        self.unique_excs = []

    async def get_db_conn(self):
        """Return a lazily-created aiosqlite connection to the result DB (or None)."""
        if self.dbpath is None:
            return None
        if self.conn is None:
            self.conn = await aiosqlite.connect(self.dbpath)
        return self.conn

    async def exec_db(self, func, *args, **kwargs):
        """Run `func` with a fresh cursor, always closing the cursor afterwards."""
        conn = await self.get_db_conn()
        cursor = await conn.cursor()
        try:
            ret = await func(*args, conn=conn, cursor=cursor, **kwargs)
        except:
            await cursor.close()
            raise
        await cursor.close()
        return ret

    def _log_exception(self, e, halt=True):
        """Re-raise `e` when halt is True; otherwise log it if a logger exists."""
        if halt:
            raise e
        else:
            if self.logger:
                self.logger.exception(e)

    async def getjson(self, level):
        """Return the first row of `level` as a JSON string, or None if absent.

        NOTE(review): the return inside the loop means only the first row is
        ever serialized — confirm this is intentional.
        """
        ret = None
        if await self.exec_db(self.table_exists, level) == False:
            return ret
        for row in await self.cf.exec_db(self.cf.getiterator, level):
            row = self.substitute_val(level, row)
            return json.dumps(row)

    def substitute_val(self, level, row):
        """Apply report substitutions (e.g. SO-term display names) to `row` in place."""
        for sub in self.column_subs.get(level, []):
            value = row[sub.index]
            if value is None or value == "":
                continue
            if (level == "variant" and sub.module == "base"
                    and sub.col == "all_mappings"):
                # all_mappings is JSON; substitute the SO terms in field 2.
                mappings = json.loads(row[sub.index])
                for gene in mappings:
                    for i in range(len(mappings[gene])):
                        sos = mappings[gene][i][2].split(",")
                        sos = [sub.subs.get(so, so) for so in sos]
                        mappings[gene][i][2] = ",".join(sos)
                value = json.dumps(mappings)
            elif level == "gene" and sub.module == "base" and sub.col == "all_so":
                # all_so entries look like "XXX(count)"; first 3 chars are the SO code.
                vals = []
                for i, so_count in enumerate(value.split(",")):
                    so = so_count[:3]
                    so = sub.subs.get(so, so)
                    so_count = so + so_count[3:]
                    vals.append(so_count)
                value = ",".join(vals)
            else:
                value = sub.subs.get(value, value)
            row[sub.index] = value
        return row

    def process_datarow(self, args):
        """Transform one raw DB row into report order with substitutions applied.

        NOTE(review): references num_total_cols, colnos_to_skip, and
        hugo_present, which are not defined in this scope — this method looks
        broken/dead (run_level carries its own inlined copy of this logic).
        """
        datarow = args[0]
        should_skip_some_cols = args[1]
        level = args[2]
        gene_summary_datas = args[3]
        if datarow is None:
            return None
        datarow = list(datarow)
        if should_skip_some_cols:
            datarow = [
                datarow[colno] for colno in range(num_total_cols)
                if colno not in colnos_to_skip
            ]
        if level == "variant":
            # adds gene level data to variant level.
            if self.nogenelevelonvariantlevel == False and hugo_present:
                hugo = datarow[self.colnos["variant"]["base__hugo"]]
                loop = asyncio.get_event_loop()
                future = asyncio.ensure_future(self.cf.get_gene_row(hugo), loop)
                generow = future.result()
                if generow is None:
                    datarow.extend(
                        [None for i in range(len(self.var_added_cols))])
                else:
                    datarow.extend([
                        generow[self.colnos["gene"][colname]]
                        for colname in self.var_added_cols
                    ])
        elif level == "gene":
            # adds summary data to gene level.
            hugo = datarow[0]
            for mi, _, _ in self.summarizing_modules:
                module_name = mi.name
                [gene_summary_data, cols] = gene_summary_datas[module_name]
                if (hugo in gene_summary_data
                        and gene_summary_data[hugo] is not None
                        and len(gene_summary_data[hugo]) == len(cols)):
                    datarow.extend(
                        [gene_summary_data[hugo][col["name"]] for col in cols])
                else:
                    datarow.extend([None for v in cols])
        # re-orders data row.
        new_datarow = []
        colnos = self.colnos[level]
        for colname in [
                col["col_name"] for col in self.colinfo[level]["columns"]
        ]:
            if colname in self.colname_conversion[level]:
                newcolname = self.colname_conversion[level][colname]
                if newcolname in colnos:
                    colno = colnos[newcolname]
                else:
                    self.logger.info(
                        "column name does not exist in data: {}".format(
                            colname))
                    continue
            else:
                colno = colnos[colname]
            value = datarow[colno]
            new_datarow.append(value)
        # does report substitution.
        new_datarow = self.substitute_val(level, new_datarow)
        # Flatten the all_mappings JSON into a readable "; "-joined string.
        if hasattr(self, "keep_json_all_mapping") == False and level == "variant":
            colno = self.colnos["variant"]["base__all_mappings"]
            all_map = json.loads(new_datarow[colno])
            newvals = []
            for hugo in all_map:
                for maprow in all_map[hugo]:
                    [protid, protchange, so, transcript, rnachange] = maprow
                    if protid == None:
                        protid = "(na)"
                    if protchange == None:
                        protchange = "(na)"
                    if rnachange == None:
                        rnachange = "(na)"
                    newval = (transcript + ":" + hugo + ":" + protid + ":" +
                              so + ":" + protchange + ":" + rnachange)
                    newvals.append(newval)
            newvals.sort()
            newcell = "; ".join(newvals)
            new_datarow[colno] = newcell
        return new_datarow

    def get_extracted_header_columns(self, level):
        """Return the column dicts of `level` that are selected for display."""
        cols = []
        for col in self.colinfo[level]["columns"]:
            if col["col_name"] in self.colnames_to_display[level]:
                cols.append(col)
        return cols

    async def run_level(self, level):
        """Stream all filtered rows of `level` through the subclass write hooks."""
        ret = await self.exec_db(self.table_exists, level)
        if ret == False:
            return
        if self.should_write_level(level) == False:
            return
        gene_summary_datas = {}
        if level == "variant":
            await self.cf.exec_db(self.cf.make_filtered_uid_table)
        elif level == "gene":
            await self.cf.exec_db(self.cf.make_filtered_hugo_table)
            # Collect per-gene summary data from each summarizing module and
            # compute category lists for single/multi columns.
            for mi, o, cols in self.summarizing_modules:
                if hasattr(o, "build_gene_collection"):
                    msg = "Obsolete module [{}] for gene level summarization. Update the module to get correct gene level summarization.".format(
                        mi.name)
                    self.warning_msgs.append(msg)
                    if self.args.silent == False:
                        print("===Warning: {}".format(msg))
                    gene_summary_data = {}
                else:
                    gene_summary_data = await o.get_gene_summary_data(self.cf)
                gene_summary_datas[mi.name] = [gene_summary_data, cols]
                for col in cols:
                    if "category" in col and col["category"] in [
                            "single", "multi"
                    ]:
                        # Find the colinfo entry matching this summary column.
                        for i in range(len(self.colinfo[level]["columns"])):
                            colinfo_col = self.colinfo[level]["columns"][i]
                            if mi.name in ["hg38", "tagsampler"]:
                                grp_name = "base"
                            else:
                                grp_name = mi.name
                            if colinfo_col[
                                    "col_name"] == grp_name + "__" + col[
                                        "name"]:
                                break
                        cats = []
                        for hugo in gene_summary_data:
                            val = gene_summary_data[hugo][col["name"]]
                            if len(colinfo_col["reportsub"]) > 0:
                                if val in colinfo_col["reportsub"]:
                                    val = colinfo_col["reportsub"][val]
                            if val not in cats:
                                cats.append(val)
                        self.colinfo[level]["columns"][i]["col_cats"] = cats
        self.write_preface(level)
        self.extracted_cols[level] = self.get_extracted_header_columns(level)
        self.write_header(level)
        if level == "variant":
            hugo_present = "base__hugo" in self.colnos["variant"]
        datacols, datarows = await self.cf.exec_db(
            self.cf.get_filtered_iterator, level)
        num_total_cols = len(datacols)
        colnos_to_skip = []
        if level == "gene":
            for colno in range(len(datacols)):
                if datacols[colno] in constants.legacy_gene_level_cols_to_skip:
                    colnos_to_skip.append(colno)
        should_skip_some_cols = len(colnos_to_skip) > 0
        if level == "variant" and self.args.separatesample:
            write_variant_sample_separately = True
            sample_newcolno = self.newcolnos["variant"]["base__samples"]
        else:
            write_variant_sample_separately = False
        colnos = self.colnos[level]
        all_mappings_newcolno = self.newcolnos["variant"]["base__all_mappings"]
        cols = self.colinfo[level]["columns"]
        # NOTE(review): json_colnos is computed but not used below — confirm.
        json_colnos = []
        for i in range(len(cols)):
            col = cols[i]
            if col["table"] == True:
                json_colnos.append(i)
        for datarow in datarows:
            if datarow is None:
                continue
            datarow = list(datarow)
            if should_skip_some_cols:
                datarow = [
                    datarow[colno] for colno in range(num_total_cols)
                    if colno not in colnos_to_skip
                ]
            if level == "variant":
                # adds gene level data to variant level.
                if self.nogenelevelonvariantlevel == False and hugo_present:
                    hugo = datarow[self.colnos["variant"]["base__hugo"]]
                    generow = await self.cf.get_gene_row(hugo)
                    if generow is None:
                        datarow.extend(
                            [None for i in range(len(self.var_added_cols))])
                    else:
                        datarow.extend([
                            generow[self.colnos["gene"][colname]]
                            for colname in self.var_added_cols
                        ])
            elif level == "gene":
                # adds summary data to gene level.
                hugo = datarow[0]
                for mi, _, _ in self.summarizing_modules:
                    module_name = mi.name
                    [gene_summary_data, cols] = gene_summary_datas[module_name]
                    if (hugo in gene_summary_data
                            and gene_summary_data[hugo] is not None
                            and len(gene_summary_data[hugo]) == len(cols)):
                        datarow.extend([
                            gene_summary_data[hugo][col["name"]]
                            for col in cols
                        ])
                    else:
                        datarow.extend([None for v in cols])
            # re-orders data row.
            new_datarow = []
            for colname in [
                    col["col_name"] for col in self.colinfo[level]["columns"]
            ]:
                if colname in self.colname_conversion[level]:
                    oldcolname = self.colname_conversion[level][colname]
                    if oldcolname in colnos:
                        colno = colnos[oldcolname]
                    else:
                        self.logger.info(
                            "column name does not exist in data: {}".format(
                                oldcolname))
                        continue
                else:
                    colno = colnos[colname]
                value = datarow[colno]
                new_datarow.append(value)
            # does report substitution.
            new_datarow = self.substitute_val(level, new_datarow)
            # Flatten the all_mappings JSON into a readable "; "-joined string.
            if hasattr(
                    self,
                    "keep_json_all_mapping") == False and level == "variant":
                all_map = json.loads(new_datarow[all_mappings_newcolno])
                newvals = []
                for hugo in all_map:
                    for maprow in all_map[hugo]:
                        [protid, protchange, so, transcript, rnachange] = maprow
                        if protid == None:
                            protid = "(na)"
                        if protchange == None:
                            protchange = "(na)"
                        if rnachange == None:
                            rnachange = "(na)"
                        newval = (transcript + ":" + hugo + ":" + protid +
                                  ":" + so + ":" + protchange + ":" +
                                  rnachange)
                        newvals.append(newval)
                newvals.sort()
                newcell = "; ".join(newvals)
                new_datarow[all_mappings_newcolno] = newcell
            if write_variant_sample_separately:
                samples = new_datarow[sample_newcolno]
                if samples is not None:
                    samples = samples.split(";")
                    for sample in samples:
                        # NOTE(review): sample_datarow aliases new_datarow (no
                        # copy); each iteration overwrites the samples cell of
                        # the same list — confirm this is intentional.
                        sample_datarow = new_datarow
                        sample_datarow[sample_newcolno] = sample
                        self.write_table_row(
                            self.get_extracted_row(sample_datarow))
                else:
                    self.write_table_row(self.get_extracted_row(new_datarow))
            else:
                self.write_table_row(self.get_extracted_row(new_datarow))

    async def store_mapper(self, conn=None, cursor=None):
        """Read the mapper name from the info table; defaults to "hg38"."""
        # conn = await self.get_db_conn()
        # cursor = await conn.cursor()
        q = 'select colval from info where colkey="_mapper"'
        await cursor.execute(q)
        r = await cursor.fetchone()
        if r is None:
            self.mapper_name = "hg38"
        else:
            self.mapper_name = r[0].split(":")[0]
        # await cursor.close()
        # await conn.close()

    async def run(self, tab="all"):
        """Generate the report: build column info, write each level, update status."""
        try:
            start_time = time.time()
            if not (hasattr(self, "no_log") and self.no_log):
                self.logger.info("started: %s" %
                                 time.asctime(time.localtime(start_time)))
            if self.cf.filter:
                s = f"filter:\n{yaml.dump(self.filter)}"
                self.logger.info(s)
            if self.module_conf is not None and self.status_writer is not None:
                if self.parsed_args.do_not_change_status == False:
                    self.status_writer.queue_status_update(
                        "status",
                        "Started {} ({})".format(self.module_conf["title"],
                                                 self.module_name),
                    )
            # setup() returning False aborts the run.
            if self.setup() == False:
                await self.close_db()
                return
            if tab == "all":
                # First pass builds column info for every level, second pass writes.
                for level in await self.cf.exec_db(self.cf.get_result_levels):
                    self.level = level
                    if await self.exec_db(self.table_exists, level):
                        await self.exec_db(self.make_col_info, level)
                for level in await self.cf.exec_db(self.cf.get_result_levels):
                    self.level = level
                    if await self.exec_db(self.table_exists, level):
                        await self.run_level(level)
            else:
                if tab in ["variant", "gene"]:
                    for level in ["variant", "gene"]:
                        if await self.exec_db(self.table_exists, level):
                            await self.exec_db(self.make_col_info, level)
                else:
                    await self.exec_db(self.make_col_info, tab)
                # NOTE(review): `level` is unbound here when tab is not
                # "variant"/"gene" — confirm this should be `tab`.
                self.level = level
                await self.run_level(tab)
            await self.close_db()
            if self.module_conf is not None and self.status_writer is not None:
                if self.parsed_args.do_not_change_status == False:
                    self.status_writer.queue_status_update(
                        "status",
                        "Finished {} ({})".format(self.module_conf["title"],
                                                  self.module_name),
                    )
            end_time = time.time()
            if not (hasattr(self, "no_log") and self.no_log):
                self.logger.info("finished: {0}".format(
                    time.asctime(time.localtime(end_time))))
                run_time = end_time - start_time
                self.logger.info("runtime: {0:0.3f}".format(run_time))
            ret = self.end()
        except:
            # On any failure: close the DB, mark the job failed, log, re-raise.
            await self.close_db()
            if self.module_conf is not None and self.status_writer is not None:
                if self.parsed_args.do_not_change_status == False:
                    self.status_writer.queue_status_update(
                        "status",
                        "Failed {} ({})".format(self.module_conf["title"],
                                                self.module_name),
                    )
            end_time = time.time()
            if not (hasattr(self, "no_log") and self.no_log):
                self.logger.info("finished: {0}".format(
                    time.asctime(time.localtime(end_time))))
                run_time = end_time - start_time
                self.logger.info("runtime: {0:0.3f}".format(run_time))
            raise
        return ret

    async def get_variant_colinfo(self):
        """Build and return column info for the variant and gene levels."""
        self.setup()
        level = "variant"
        if await self.exec_db(self.table_exists, level):
            await self.exec_db(self.make_col_info, level)
        level = "gene"
        if await self.exec_db(self.table_exists, level):
            await self.exec_db(self.make_col_info, level)
        return self.colinfo

    # Subclass hooks: format-specific reporters override these.
    def setup(self):
        pass

    def end(self):
        pass

    def write_preface(self, level):
        pass

    def write_header(self, level):
        pass

    def write_table_row(self, row):
        pass

    def get_extracted_row(self, row):
        """Return `row` restricted to the displayed columns of the current level."""
        if self.display_select_columns[self.level]:
            filtered_row = [
                row[colno] for colno in self.colnos_to_display[self.level]
            ]
        else:
            filtered_row = row
        return filtered_row

    def add_conditional_to_colnames_to_display(self, level, column, module_name):
        """Record `column` as displayable for `level`, honoring extract/concise settings."""
        col_name = column["col_name"]
        # Explicit extract-columns list wins; otherwise concise mode hides
        # col_hidden columns; otherwise include everything.
        if (level in self.extract_columns_multilevel
                and len(self.extract_columns_multilevel[level]) > 0):
            if col_name in self.extract_columns_multilevel[level]:
                incl = True
            else:
                incl = False
        elif self.concise_report:
            if "col_hidden" in column and column["col_hidden"] == True:
                incl = False
            else:
                incl = True
        else:
            incl = True
        if incl and col_name not in self.colnames_to_display[level]:
            # Mapper and tagsampler columns are presented under the "base" group.
            if module_name == self.mapper_name:
                self.colnames_to_display[level].append(
                    col_name.replace(module_name + "__", "base__"))
            elif module_name == "tagsampler":
                self.colnames_to_display[level].append(
                    col_name.replace(module_name + "__", "base__"))
            else:
                self.colnames_to_display[level].append(col_name)

    async def make_col_info(self, level, conn=None, cursor=None):
        """Build column groups, column metadata, ordering, and report subs for `level`."""
        self.colnames_to_display[level] = []
        await self.exec_db(self.store_mapper)
        cravat_conf = self.conf.get_cravat_conf()
        if "report_module_order" in cravat_conf:
            priority_colgroupnames = cravat_conf["report_module_order"]
        else:
            priority_colgroupnames = [
                "base", "hg38", "hg19", "hg18", "tagsampler"
            ]
        # level-specific column groups
        self.columngroups[level] = []
        sql = "select name, displayname from " + level + "_annotator"
        await cursor.execute(sql)
        rows = await cursor.fetchall()
        for row in rows:
            (name, displayname) = row
            self.columngroups[level].append({
                "name": name,
                "displayname": displayname,
                "count": 0
            })
        # level-specific column names
        header_table = level + "_header"
        coldefs = []
        sql = "select col_def from " + header_table
        await cursor.execute(sql)
        for row in await cursor.fetchall():
            coljson = row[0]
            coldef = ColumnDefinition({})
            coldef.from_json(coljson)
            coldefs.append(coldef)
        columns = []
        self.colnos[level] = {}
        colcount = 0
        # level-specific column details
        for coldef in coldefs:
            self.colnos[level][coldef.name] = colcount
            colcount += 1
            # Fill missing category lists from the distinct values in the DB.
            if coldef.category in ["single", "multi"] and len(
                    coldef.categories) == 0:
                sql = "select distinct {} from {}".format(coldef.name, level)
                await cursor.execute(sql)
                rs = await cursor.fetchall()
                for r in rs:
                    coldef.categories.append(r[0])
            [colgrpname, _] = coldef.name.split("__")
            column = coldef.get_colinfo()
            columns.append(column)
            self.add_conditional_to_colnames_to_display(
                level, column, colgrpname)
            for columngroup in self.columngroups[level]:
                if columngroup["name"] == colgrpname:
                    columngroup["count"] += 1
        # adds gene level columns to variant level.
        if (self.nogenelevelonvariantlevel == False and level == "variant"
                and await self.exec_db(self.table_exists, "gene")):
            modules_to_add = []
            q = "select name from gene_annotator"
            await cursor.execute(q)
            gene_annotators = [v[0] for v in await cursor.fetchall()]
            modules_to_add = [m for m in gene_annotators if m != "base"]
            for module in modules_to_add:
                cols = []
                q = 'select col_def from gene_header where col_name like "{}__%"'.format(
                    module)
                await cursor.execute(q)
                rs = await cursor.fetchall()
                for r in rs:
                    cd = ColumnDefinition({})
                    cd.from_json(r[0])
                    cols.append(cd)
                q = 'select displayname from gene_annotator where name="{}"'.format(
                    module)
                await cursor.execute(q)
                r = await cursor.fetchone()
                displayname = r[0]
                self.columngroups[level].append({
                    "name": module,
                    "displayname": displayname,
                    "count": len(cols)
                })
                for coldef in cols:
                    self.colnos[level][coldef.name] = colcount
                    colcount += 1
                    if (coldef.category in ["category", "multicategory"]
                            and len(coldef.categories) == 0):
                        sql = "select distinct {} from {}".format(
                            coldef.name, level)
                        await cursor.execute(sql)
                        rs = await cursor.fetchall()
                        for r in rs:
                            coldef.categories.append(r[0])
                    column = coldef.get_colinfo()
                    columns.append(column)
                    self.add_conditional_to_colnames_to_display(
                        level, column, module)
                    self.var_added_cols.append(coldef.name)
        # Gene level summary columns
        if level == "gene":
            q = "select name from variant_annotator"
            await cursor.execute(q)
            done_var_annotators = [v[0] for v in await cursor.fetchall()]
            self.summarizing_modules = []
            local_modules = au.get_local_module_infos_of_type("annotator")
            local_modules.update(
                au.get_local_module_infos_of_type("postaggregator"))
            summarizer_module_names = []
            for module_name in done_var_annotators:
                if module_name in [
                        "base",
                        "hg19",
                        "hg18",
                        "extra_vcf_info",
                        "extra_variant_info",
                ]:
                    continue
                if module_name not in local_modules:
                    if self.args.silent == False and module_name != 'original_input':
                        print(
                            " [{}] module does not exist in the system. Gene level summary for this module is skipped."
                            .format(module_name))
                    continue
                module = local_modules[module_name]
                if "can_summarize_by_gene" in module.conf:
                    summarizer_module_names.append(module_name)
            local_modules[self.mapper_name] = au.get_local_module_info(
                self.mapper_name)
            # The mapper always summarizes first.
            summarizer_module_names = [self.mapper_name
                                       ] + summarizer_module_names
            for module_name in summarizer_module_names:
                mi = local_modules[module_name]
                sys.path = sys.path + [os.path.dirname(mi.script_path)]
                # Instantiate the module to call its get_gene_summary_data later.
                if module_name in done_var_annotators:
                    annot_cls = util.load_class(mi.script_path,
                                                "CravatAnnotator")
                elif module_name == self.mapper_name:
                    annot_cls = util.load_class(mi.script_path, "Mapper")
                cmd = {
                    "script_path": mi.script_path,
                    "input_file": "__dummy__",
                    "output_dir": self.output_dir,
                }
                annot = annot_cls(cmd)
                cols = mi.conf["gene_summary_output_columns"]
                columngroup = {
                    "name": mi.name,
                    "displayname": mi.title,
                    "count": len(cols),
                }
                self.columngroups[level].append(columngroup)
                for col in cols:
                    coldef = ColumnDefinition(col)
                    coldef.name = columngroup["name"] + "__" + coldef.name
                    coldef.genesummary = True
                    column = coldef.get_colinfo()
                    columns.append(column)
                    self.add_conditional_to_colnames_to_display(
                        level, column, mi.name)
                self.summarizing_modules.append([mi, annot, cols])
                for col in cols:
                    fullname = module_name + "__" + col["name"]
                    self.colnos[level][fullname] = len(self.colnos[level])
        # re-orders columns groups.
        colgrps = self.columngroups[level]
        newcolgrps = []
        for priority_colgrpname in priority_colgroupnames:
            for colgrp in colgrps:
                if colgrp["name"] == priority_colgrpname:
                    # Mapper/tagsampler columns are folded into the first
                    # (base) group rather than kept as separate groups.
                    if colgrp["name"] in [self.mapper_name, "tagsampler"]:
                        newcolgrps[0]["count"] += colgrp["count"]
                    else:
                        newcolgrps.append(colgrp)
                    break
        colpos = 0
        for colgrp in newcolgrps:
            colgrp["lastcol"] = colpos + colgrp["count"]
            colpos = colgrp["lastcol"]
        # Non-priority groups follow, sorted by display name.
        colgrpnames = [
            v["displayname"] for v in colgrps
            if v["name"] not in priority_colgroupnames
        ]
        colgrpnames.sort()
        for colgrpname in colgrpnames:
            for colgrp in colgrps:
                if colgrp["displayname"] == colgrpname:
                    colgrp["lastcol"] = colpos + colgrp["count"]
                    newcolgrps.append(colgrp)
                    colpos += colgrp["count"]
                    break
        # re-orders columns.
        self.colname_conversion[level] = {}
        new_columns = []
        self.newcolnos[level] = {}
        newcolno = 0
        new_colnames_to_display = []
        for colgrp in newcolgrps:
            colgrpname = colgrp["name"]
            for col in columns:
                colname = col["col_name"]
                [grpname, _] = colname.split("__")
                if colgrpname == "base" and grpname in [
                        self.mapper_name, "tagsampler"
                ]:
                    # Rename mapper/tagsampler columns to base__*, remembering
                    # the original name in colname_conversion for row lookup.
                    newcolname = "base__" + colname.split("__")[1]
                    self.colname_conversion[level][newcolname] = colname
                    col["col_name"] = newcolname
                    new_columns.append(col)
                    self.newcolnos[level][newcolname] = newcolno
                    if newcolname in self.colnames_to_display[level]:
                        new_colnames_to_display.append(newcolname)
                elif grpname == colgrpname:
                    new_columns.append(col)
                    self.newcolnos[level][colname] = newcolno
                    if colname in self.colnames_to_display[level]:
                        new_colnames_to_display.append(colname)
                else:
                    continue
                newcolno += 1
        self.colinfo[level] = {"colgroups": newcolgrps, "columns": new_columns}
        self.colnames_to_display[level] = new_colnames_to_display
        # report substitution
        if level in ["variant", "gene"]:
            reportsubtable = level + "_reportsub"
            if await self.exec_db(self.table_exists, reportsubtable):
                q = "select * from {}".format(reportsubtable)
                await cursor.execute(q)
                reportsub = {
                    r[0]: json.loads(r[1])
                    for r in await cursor.fetchall()
                }
                self.column_subs[level] = []
                for i, column in enumerate(new_columns):
                    module, col = column["col_name"].split("__")
                    if module == self.mapper_name:
                        module = "base"
                    if module in reportsub and col in reportsub[module]:
                        self.column_subs[level].append(
                            SimpleNamespace(
                                module=module,
                                col=col,
                                index=i,
                                subs=reportsub[module][col],
                            ))
                        new_columns[i]["reportsub"] = reportsub[module][col]
        # display_select_columns
        if (level in self.extract_columns_multilevel
                and len(self.extract_columns_multilevel[level]) > 0
            ) or self.concise_report:
            self.display_select_columns[level] = True
        else:
            self.display_select_columns[level] = False
        # column numbers to display
        colno = 0
        self.colnos_to_display[level] = []
        for colgroup in self.colinfo[level]["colgroups"]:
            count = colgroup["count"]
            if count == 0:
                continue
            for col in self.colinfo[level]["columns"][colno:colno + count]:
                module_col_name = col["col_name"]
                if module_col_name in self.colnames_to_display[level]:
                    include_col = True
                else:
                    include_col = False
                if include_col:
                    self.colnos_to_display[level].append(colno)
                colno += 1

    def get_standardized_module_option(self, v):
        """Normalize a module option string into dict/list/bool as appropriate.

        "a:x,y.b:z" -> {"a": ["x","y"], "b": ["z"]}; "x,y" -> ["x","y"];
        "true"/"false" -> bool; anything else is returned unchanged.
        """
        tv = type(v)
        if tv == str:
            if ":" in v:
                v0 = {}
                for v1 in v.split("."):
                    if ":" in v1:
                        v1toks = v1.split(":")
                        if len(v1toks) == 2:
                            level = v1toks[0]
                            v2s = v1toks[1].split(",")
                            v0[level] = v2s
                v = v0
            elif "," in v:
                v = [val for val in v.split(",") if val != ""]
            if v == "true":
                v = True
            elif v == "false":
                v = False
        return v

    async def connect_db(self, dbpath=None):
        """Validate the DB path, exiting the process when it is missing.

        NOTE(review): despite the name, this only records/validates the path;
        the actual connection is opened lazily by get_db_conn().
        """
        if dbpath != None:
            self.dbpath = dbpath
        if self.dbpath == None:
            sys.stderr.write("Provide a path to aggregator output")
            exit()
        if os.path.exists(self.dbpath) == False:
            sys.stderr.write(self.dbpath + " does not exist.")
            exit()

    async def close_db(self):
        """Close the aiosqlite connection and the filter's DB, if open."""
        if hasattr(self, "conn") and self.conn is not None:
            await self.conn.close()
            self.conn = None
        if self.cf is not None:
            await self.cf.close_db()
            self.cf = None

    async def load_filter(self):
        """Create the CravatFilter and load the configured filter into it."""
        self.cf = await CravatFilter.create(dbpath=self.dbpath)
        await self.cf.exec_db(self.cf.loadfilter,
                              filter=self.filter,
                              filterpath=self.filterpath,
                              filtername=self.filtername,
                              filterstring=self.filterstring,
                              filtersql=self.filtersql,
                              includesample=self.args.includesample,
                              excludesample=self.args.excludesample)

    async def table_exists(self, tablename, conn=None, cursor=None):
        """Return True when `tablename` exists in the result DB."""
        sql = ("select name from sqlite_master where " +
               'type="table" and name="' + tablename + '"')
        await cursor.execute(sql)
        row = await cursor.fetchone()
        if row == None:
            ret = False
        else:
            ret = True
        return ret
class CravatReport:
    """Base class for report generators that read an aggregator sqlite
    result database and emit a formatted report.

    Subclasses override ``setup``, ``end``, ``write_preface``,
    ``write_header`` and ``write_table_row`` to produce a concrete output
    format; this class drives column discovery (``make_col_info``),
    filtering and row iteration (``run_level``) and report-value
    substitution (``substitute_val``).
    """

    def __init__(self, cmd_args, status_writer=None):
        self.status_writer = status_writer
        # Uses the module-level argparse parser; the '-t' (report types)
        # option is stripped here because the report type is already fixed
        # for a concrete reporter instance.
        global parser
        for ag in parser._action_groups:
            if ag.title == 'optional arguments':
                for a in ag._actions:
                    if '-t' in a.option_strings:
                        ag._actions.remove(a)
        self.parse_cmd_args(parser, cmd_args)
        self.cursor = None
        self.cf = None  # CravatFilter instance, set in load_filter()
        self.filtertable = 'filter'
        # Per-level ('variant'/'gene'/...) column metadata, filled by
        # make_col_info().
        self.colinfo = {}
        self.colnos = {}      # level -> {colname: position in raw data row}
        self.newcolnos = {}   # level -> {colname: position after re-ordering}
        self.var_added_cols = []        # gene-level columns appended to variant rows
        self.summarizing_modules = []   # [module_info, annotator_instance, cols]
        self.columngroups = {}
        self.column_subs = {}           # level -> {colno: substitution mapping}
        self.column_sub_allow_partial_match = {}
        self.colname_conversion = {}    # level -> {new colname: original colname}
        self._setup_logger()
        self.warning_msgs = []

    def parse_cmd_args(self, parser, cmd_args):
        """Parse command-line arguments and derive paths/config attributes.

        Also recovers the original input file list from the result db's
        ``info`` table when ``--inputfiles`` was not given.
        """
        cmd_args = clean_args(cmd_args)
        parsed_args = parser.parse_args(cmd_args)
        self.parsed_args = parsed_args
        self.dbpath = parsed_args.dbpath
        self.filterpath = parsed_args.filterpath
        self.filtername = parsed_args.filtername
        self.filterstring = parsed_args.filterstring
        self.confs = None
        if parsed_args.confs is not None:
            # --confs arrives shell-quoted with single quotes; normalize to
            # valid JSON before parsing.
            confs = parsed_args.confs.lstrip('\'').rstrip('\'').replace(
                "'", '"')
            self.confs = json.loads(confs)
            # NOTE(review): self.filter is only assigned when --confs is
            # provided; load_filter() reads self.filter unconditionally —
            # confirm callers always pass confs or set filter elsewhere.
            if 'filter' in self.confs:
                self.filter = self.confs['filter']
            else:
                self.filter = None
        if parsed_args.output_dir is not None:
            self.output_dir = parsed_args.output_dir
        else:
            self.output_dir = os.path.dirname(self.dbpath)
        self.savepath = parsed_args.savepath
        # A bare filename save path is placed inside the output directory.
        if self.savepath is not None and os.path.dirname(self.savepath) == '':
            self.savepath = os.path.join(self.output_dir, self.savepath)
        self.confpath = parsed_args.confpath
        self.conf = ConfigLoader(job_conf_path=self.confpath)
        self.module_name = parsed_args.module_name
        if self.conf is not None:
            self.module_conf = self.conf.get_module_conf(self.module_name)
        else:
            self.module_conf = None
        if hasattr(parsed_args, 'reporttypes'):
            self.report_types = parsed_args.reporttypes
        # Strips the '.sqlite' suffix (7 chars) from the db filename.
        self.output_basename = os.path.basename(self.dbpath)[:-7]
        status_fname = '{}.status.json'.format(self.output_basename)
        self.status_fpath = os.path.join(self.output_dir, status_fname)
        self.nogenelevelonvariantlevel = parsed_args.nogenelevelonvariantlevel
        if parsed_args.inputfiles is None and parsed_args.dbpath is not None:
            # Recover input paths recorded by the aggregator in the info
            # table (stored as a python-repr dict, hence quote swapping).
            db = sqlite3.connect(parsed_args.dbpath)
            c = db.cursor()
            q = 'select colval from info where colkey="_input_paths"'
            c.execute(q)
            r = c.fetchone()
            if r is not None:
                parsed_args.inputfiles = []
                s = r[0]
                # NOTE(review): the next two statements are dead — s is
                # immediately overwritten by the json.loads(r[0]...) call
                # below regardless of the ' ' check. Left as-is.
                if ' ' in s:
                    s = s.replace("'", '"')
                s = json.loads(r[0].replace("'", '"'))
                for k in s:
                    input_path = s[k]
                    parsed_args.inputfiles.append(input_path)
        self.args = parsed_args

    async def prep(self):
        """Open the result database and load the row filter."""
        await self.connect_db()
        await self.load_filter()

    def _setup_logger(self):
        # Skip logger creation entirely when the instance opts out of
        # logging via the no_log attribute.
        if hasattr(self, 'no_log') and self.no_log:
            return
        try:
            self.logger = logging.getLogger('cravat.' + self.module_name)
        except Exception as e:
            self._log_exception(e)
        self.error_logger = logging.getLogger('error.' + self.module_name)
        self.unique_excs = []

    def _log_exception(self, e, halt=True):
        """Re-raise *e* when halt is True; otherwise log it and continue."""
        if halt:
            raise e
        else:
            if self.logger:
                self.logger.exception(e)

    async def getjson(self, level):
        """Return the first filtered row of *level* as a JSON string.

        Returns None when the level's table does not exist.
        """
        ret = None
        if await self.table_exists(level) == False:
            return ret
        for row in await self.cf.getiterator(level):
            row = self.substitute_val(level, row)
            # NOTE(review): returns inside the loop — only the first row is
            # serialized; presumably intentional for a single-row preview.
            return json.dumps(row)

    def substitute_val(self, level, row):
        """Apply per-column report substitutions (in place) to *row*.

        Partial-match columns hold pre-compiled regex keys (see
        make_col_info); exact-match columns hold plain value mappings.
        """
        if level in self.column_subs:
            column_sub_dict = self.column_subs[level]
            column_sub_allow_partial_match = self.column_sub_allow_partial_match[
                level]
            for colno in column_sub_dict:
                column_sub = column_sub_dict[colno]
                value = row[colno]
                if value is not None:
                    if column_sub_allow_partial_match[colno]:
                        # Regex substitution within the cell text.
                        for target, substitution in column_sub.items():
                            value = target.sub(substitution, value)
                    else:
                        # Whole-value replacement.
                        if value in column_sub:
                            value = column_sub[value]
                    row[colno] = value
        return row

    def process_datarow(self, args):
        """Transform one raw db row into an output row.

        *args* is [datarow, should_skip_some_cols, level,
        gene_summary_datas].  Mirrors the per-row body of run_level.

        NOTE(review): this method references names that are not defined in
        its scope (num_total_cols, colnos_to_skip, hugo_present) — they are
        locals of run_level. As written it would raise NameError if called;
        confirm whether it is still used anywhere.
        """
        datarow = args[0]
        should_skip_some_cols = args[1]
        level = args[2]
        gene_summary_datas = args[3]
        if datarow is None:
            return None
        datarow = list(datarow)
        if should_skip_some_cols:
            datarow = [
                datarow[colno] for colno in range(num_total_cols)
                if colno not in colnos_to_skip
            ]
        if level == 'variant':
            # adds gene level data to variant level.
            if self.nogenelevelonvariantlevel == False and hugo_present:
                hugo = datarow[self.colnos['variant']['base__hugo']]
                loop = asyncio.get_event_loop()
                future = asyncio.ensure_future(self.cf.get_gene_row(hugo),
                                               loop)
                generow = future.result()
                if generow is None:
                    datarow.extend(
                        [None for i in range(len(self.var_added_cols))])
                else:
                    datarow.extend([
                        generow[self.colnos['gene'][colname]]
                        for colname in self.var_added_cols
                    ])
        elif level == 'gene':
            # adds summary data to gene level.
            hugo = datarow[0]
            for mi, _, _ in self.summarizing_modules:
                module_name = mi.name
                [gene_summary_data, cols] = gene_summary_datas[module_name]
                if hugo in gene_summary_data and gene_summary_data[
                        hugo] is not None and len(
                            gene_summary_data[hugo]) == len(cols):
                    datarow.extend(
                        [gene_summary_data[hugo][col['name']] for col in cols])
                else:
                    datarow.extend([None for v in cols])
        # re-orders data row.
        new_datarow = []
        colnos = self.colnos[level]
        for colname in [
                col['col_name'] for col in self.colinfo[level]['columns']
        ]:
            if colname in self.colname_conversion[level]:
                # Column was renamed during re-ordering; look up the data
                # under its original name.
                newcolname = self.colname_conversion[level][colname]
                if newcolname in colnos:
                    colno = colnos[newcolname]
                else:
                    self.logger.info(
                        'column name does not exist in data: {}'.format(
                            colname))
                    continue
            else:
                colno = colnos[colname]
            value = datarow[colno]
            new_datarow.append(value)
        # does report substitution.
        new_datarow = self.substitute_val(level, new_datarow)
        if hasattr(self,
                   'keep_json_all_mapping') == False and level == 'variant':
            # Flatten the base__all_mappings JSON blob into a readable
            # semicolon-joined string.
            colno = self.colnos['variant']['base__all_mappings']
            all_map = json.loads(new_datarow[colno])
            newvals = []
            for hugo in all_map:
                for maprow in all_map[hugo]:
                    [protid, protchange, so, transcript, rnachange] = maprow
                    if protid == None:
                        protid = '(na)'
                    if protchange == None:
                        protchange = '(na)'
                    if rnachange == None:
                        rnachange = '(na)'
                    newval = transcript + ':' + hugo + ':' + protid + ':' + so + ':' + protchange + ':' + rnachange
                    newvals.append(newval)
            newvals.sort()
            newcell = '; '.join(newvals)
            new_datarow[colno] = newcell
        return new_datarow

    async def run_level(self, level):
        """Write the report section for one result level.

        Builds gene summary data (gene level), streams filtered rows,
        augments/re-orders each row and hands it to write_table_row.
        """
        ret = await self.table_exists(level)
        if ret == False:
            return
        gene_summary_datas = {}
        if level == 'variant':
            await self.cf.make_filtered_uid_table()
        elif level == 'gene':
            await self.cf.make_filtered_hugo_table()
            for mi, o, cols in self.summarizing_modules:
                if hasattr(o, 'build_gene_collection'):
                    # Old-style summarizer API is no longer supported; emit
                    # an empty summary and warn.
                    msg = 'Obsolete module [{}] for gene level summarization. Update the module to get correct gene level summarization.'.format(
                        mi.name)
                    self.warning_msgs.append(msg)
                    print('===Warning: {}'.format(msg))
                    gene_summary_data = {}
                else:
                    gene_summary_data = await o.get_gene_summary_data(self.cf)
                gene_summary_datas[mi.name] = [gene_summary_data, cols]
                for col in cols:
                    if 'category' in col and col['category'] in [
                            'single', 'multi'
                    ]:
                        # Collect the distinct (post-substitution) values of
                        # this summary column as its category list.
                        for i in range(len(self.colinfo[level]['columns'])):
                            colinfo_col = self.colinfo[level]['columns'][i]
                            # Mapper/tagsampler columns were folded into the
                            # 'base' group during column re-ordering.
                            if mi.name in ['hg38', 'tagsampler']:
                                grp_name = 'base'
                            else:
                                grp_name = mi.name
                            if colinfo_col[
                                    'col_name'] == grp_name + '__' + col[
                                        'name']:
                                break
                        cats = []
                        for hugo in gene_summary_data:
                            val = gene_summary_data[hugo][col['name']]
                            if len(colinfo_col['reportsub']) > 0:
                                if val in colinfo_col['reportsub']:
                                    val = colinfo_col['reportsub'][val]
                            if val not in cats:
                                cats.append(val)
                        self.colinfo[level]['columns'][i]['col_cats'] = cats
        self.write_preface(level)
        self.write_header(level)
        if level == 'variant':
            hugo_present = 'base__hugo' in self.colnos['variant']
        datacols, datarows = await self.cf.get_filtered_iterator(level)
        num_total_cols = len(datacols)
        colnos_to_skip = []
        if level == 'gene':
            # Drop legacy gene-level columns that are no longer reported.
            for colno in range(len(datacols)):
                if datacols[colno] in constants.legacy_gene_level_cols_to_skip:
                    colnos_to_skip.append(colno)
        should_skip_some_cols = len(colnos_to_skip) > 0
        if level == 'variant' and self.args.separatesample:
            write_variant_sample_separately = True
            sample_newcolno = self.newcolnos['variant']['base__samples']
        else:
            write_variant_sample_separately = False
        colnos = self.colnos[level]
        newcolnos = self.newcolnos[level]
        all_mappings_newcolno = self.newcolnos['variant']['base__all_mappings']
        for datarow in datarows:
            if datarow is None:
                continue
            datarow = list(datarow)
            if should_skip_some_cols:
                datarow = [
                    datarow[colno] for colno in range(num_total_cols)
                    if colno not in colnos_to_skip
                ]
            if level == 'variant':
                # adds gene level data to variant level.
                if self.nogenelevelonvariantlevel == False and hugo_present:
                    hugo = datarow[self.colnos['variant']['base__hugo']]
                    generow = await self.cf.get_gene_row(hugo)
                    if generow is None:
                        datarow.extend(
                            [None for i in range(len(self.var_added_cols))])
                    else:
                        datarow.extend([
                            generow[self.colnos['gene'][colname]]
                            for colname in self.var_added_cols
                        ])
            elif level == 'gene':
                # adds summary data to gene level.
                hugo = datarow[0]
                for mi, _, _ in self.summarizing_modules:
                    module_name = mi.name
                    [gene_summary_data, cols] = gene_summary_datas[module_name]
                    if hugo in gene_summary_data and gene_summary_data[
                            hugo] is not None and len(
                                gene_summary_data[hugo]) == len(cols):
                        datarow.extend([
                            gene_summary_data[hugo][col['name']]
                            for col in cols
                        ])
                    else:
                        datarow.extend([None for v in cols])
            # re-orders data row.
            new_datarow = []
            for colname in [
                    col['col_name'] for col in self.colinfo[level]['columns']
            ]:
                if colname in self.colname_conversion[level]:
                    oldcolname = self.colname_conversion[level][colname]
                    if oldcolname in colnos:
                        colno = colnos[oldcolname]
                    else:
                        self.logger.info(
                            'column name does not exist in data: {}'.format(
                                oldcolname))
                        continue
                else:
                    colno = colnos[colname]
                value = datarow[colno]
                new_datarow.append(value)
            # does report substitution.
            new_datarow = self.substitute_val(level, new_datarow)
            if hasattr(
                    self,
                    'keep_json_all_mapping') == False and level == 'variant':
                # Flatten base__all_mappings JSON into a readable string.
                all_map = json.loads(new_datarow[all_mappings_newcolno])
                newvals = []
                for hugo in all_map:
                    for maprow in all_map[hugo]:
                        [protid, protchange, so, transcript,
                         rnachange] = maprow
                        if protid == None:
                            protid = '(na)'
                        if protchange == None:
                            protchange = '(na)'
                        if rnachange == None:
                            rnachange = '(na)'
                        newval = transcript + ':' + hugo + ':' + protid + ':' + so + ':' + protchange + ':' + rnachange
                        newvals.append(newval)
                newvals.sort()
                newcell = '; '.join(newvals)
                new_datarow[all_mappings_newcolno] = newcell
            if write_variant_sample_separately:
                # One output row per sample in the semicolon-joined
                # base__samples cell.
                samples = new_datarow[sample_newcolno]
                if samples is not None:
                    samples = samples.split(';')
                    for sample in samples:
                        # NOTE(review): sample_datarow aliases new_datarow
                        # (no copy) — each write mutates the same list;
                        # confirm this is acceptable for all writers.
                        sample_datarow = new_datarow
                        sample_datarow[sample_newcolno] = sample
                        self.write_table_row(sample_datarow)
                else:
                    self.write_table_row(new_datarow)
            else:
                self.write_table_row(new_datarow)

    async def store_mapper(self):
        """Read the mapper module name from the info table (default hg38)."""
        q = 'select colval from info where colkey="_mapper"'
        await self.cursor.execute(q)
        r = await self.cursor.fetchone()
        if r is None:
            self.mapper_name = 'hg38'
        else:
            # Stored as 'name:version'; keep only the name.
            self.mapper_name = r[0].split(':')[0]

    async def run(self, tab='all'):
        """Generate the report for one tab or all levels; returns end()."""
        start_time = time.time()
        if not (hasattr(self, 'no_log') and self.no_log):
            self.logger.info('started: %s' %
                             time.asctime(time.localtime(start_time)))
        if self.module_conf is not None and self.status_writer is not None:
            if self.parsed_args.do_not_change_status == False:
                self.status_writer.queue_status_update(
                    'status',
                    'Started {} ({})'.format(self.module_conf['title'],
                                             self.module_name))
        # A subclass setup() returning False aborts the run.
        if self.setup() == False:
            return
        if tab == 'all':
            # Column info for every level must exist before any level is
            # written (variant rows pull gene-level columns).
            for level in await self.cf.get_result_levels():
                if await self.table_exists(level):
                    await self.make_col_info(level)
            for level in await self.cf.get_result_levels():
                if await self.table_exists(level):
                    await self.run_level(level)
        else:
            if tab in ['variant', 'gene']:
                for level in ['variant', 'gene']:
                    if await self.table_exists(level):
                        await self.make_col_info(level)
            else:
                await self.make_col_info(tab)
            await self.run_level(tab)
        if self.module_conf is not None and self.status_writer is not None:
            if self.parsed_args.do_not_change_status == False:
                self.status_writer.queue_status_update(
                    'status',
                    'Finished {} ({})'.format(self.module_conf['title'],
                                              self.module_name))
        end_time = time.time()
        if not (hasattr(self, 'no_log') and self.no_log):
            self.logger.info('finished: {0}'.format(
                time.asctime(time.localtime(end_time))))
            run_time = end_time - start_time
            self.logger.info('runtime: {0:0.3f}'.format(run_time))
        ret = self.end()
        return ret

    async def get_variant_colinfo(self):
        """Build and return column info for the variant and gene levels."""
        self.setup()
        level = 'variant'
        if await self.table_exists(level):
            await self.make_col_info(level)
        level = 'gene'
        if await self.table_exists(level):
            await self.make_col_info(level)
        return self.colinfo

    # Subclass hooks: concrete reporters override these to emit output.
    def setup(self):
        pass

    def end(self):
        pass

    def write_preface(self, level):
        pass

    def write_header(self, level):
        pass

    def write_table_row(self, row):
        pass

    async def make_col_info(self, level):
        """Assemble column groups, column definitions, ordering and report
        substitutions for *level*, storing the result in self.colinfo.
        """
        await self.store_mapper()
        cravat_conf = self.conf.get_cravat_conf()
        if 'report_module_order' in cravat_conf:
            priority_colgroupnames = cravat_conf['report_module_order']
        else:
            priority_colgroupnames = [
                'base', 'hg38', 'hg19', 'hg18', 'tagsampler'
            ]
        # level-specific column groups
        self.columngroups[level] = []
        sql = 'select name, displayname from ' + level + '_annotator'
        await self.cursor.execute(sql)
        rows = await self.cursor.fetchall()
        for row in rows:
            (name, displayname) = row
            self.columngroups[level].append({
                'name': name,
                'displayname': displayname,
                'count': 0
            })
        # level-specific column names
        header_table = level + '_header'
        coldefs = []
        sql = 'select col_def from ' + header_table
        await self.cursor.execute(sql)
        for row in await self.cursor.fetchall():
            coljson = row[0]
            coldef = ColumnDefinition({})
            coldef.from_json(coljson)
            coldefs.append(coldef)
        columns = []
        self.colnos[level] = {}
        colcount = 0
        # level-specific column details
        for coldef in coldefs:
            self.colnos[level][coldef.name] = colcount
            colcount += 1
            # Fill missing category lists from the distinct values present
            # in the data.  NOTE(review): SQL here is built by string
            # concatenation; column/table names come from the result db
            # itself, but treat as trusted-input-only.
            if coldef.category in ['single', 'multi'] and len(
                    coldef.categories) == 0:
                sql = 'select distinct {} from {}'.format(coldef.name, level)
                await self.cursor.execute(sql)
                rs = await self.cursor.fetchall()
                for r in rs:
                    coldef.categories.append(r[0])
            [colgrpname, colonlyname] = coldef.name.split('__')
            column = coldef.get_colinfo()
            columns.append(column)
            for columngroup in self.columngroups[level]:
                if columngroup['name'] == colgrpname:
                    columngroup['count'] += 1
        # adds gene level columns to variant level.
        if self.nogenelevelonvariantlevel == False and level == 'variant' and await self.table_exists(
                'gene'):
            modules_to_add = []
            q = 'select name from gene_annotator'
            await self.cursor.execute(q)
            gene_annotators = [v[0] for v in await self.cursor.fetchall()]
            modules_to_add = [m for m in gene_annotators if m != 'base']
            for module in modules_to_add:
                if not module in gene_annotators:
                    continue
                cols = []
                q = 'select col_def from gene_header where col_name like "{}__%"'.format(
                    module)
                await self.cursor.execute(q)
                rs = await self.cursor.fetchall()
                for r in rs:
                    cd = ColumnDefinition({})
                    cd.from_json(r[0])
                    cols.append(cd)
                q = 'select displayname from gene_annotator where name="{}"'.format(
                    module)
                await self.cursor.execute(q)
                r = await self.cursor.fetchone()
                displayname = r[0]
                self.columngroups[level].append({
                    'name': module,
                    'displayname': displayname,
                    'count': len(cols)
                })
                for coldef in cols:
                    self.colnos[level][coldef.name] = colcount
                    colcount += 1
                    if coldef.category in ['category', 'multicategory'
                                           ] and len(coldef.categories) == 0:
                        sql = 'select distinct {} from {}'.format(
                            coldef.name, level)
                        await self.cursor.execute(sql)
                        rs = await self.cursor.fetchall()
                        for r in rs:
                            coldef.categories.append(r[0])
                    column = coldef.get_colinfo()
                    columns.append(column)
                    self.var_added_cols.append(coldef.name)
        # Gene level summary columns
        if level == 'gene':
            q = 'select name from variant_annotator'
            await self.cursor.execute(q)
            done_var_annotators = [v[0] for v in await self.cursor.fetchall()]
            self.summarizing_modules = []
            local_modules = au.get_local_module_infos_of_type('annotator')
            local_modules.update(
                au.get_local_module_infos_of_type('postaggregator'))
            summarizer_module_names = []
            for module_name in done_var_annotators:
                if module_name in [
                        'base', 'hg19', 'hg18', 'extra_vcf_info',
                        'extra_variant_info'
                ]:
                    continue
                if module_name not in local_modules:
                    print(
                        ' [{}] module does not exist in the system. Gene level summary for this module is skipped.'
                        .format(module_name))
                    continue
                module = local_modules[module_name]
                if 'can_summarize_by_gene' in module.conf:
                    summarizer_module_names.append(module_name)
            # The mapper always summarizes first.
            local_modules[self.mapper_name] = au.get_local_module_info(
                self.mapper_name)
            summarizer_module_names = [self.mapper_name
                                       ] + summarizer_module_names
            for module_name in summarizer_module_names:
                mi = local_modules[module_name]
                sys.path = sys.path + [os.path.dirname(mi.script_path)]
                if module_name in done_var_annotators:
                    annot_cls = util.load_class(mi.script_path,
                                                'CravatAnnotator')
                elif module_name == self.mapper_name:
                    annot_cls = util.load_class(mi.script_path, 'Mapper')
                # Instantiate with a dummy input so only summary methods run.
                annot = annot_cls(
                    [mi.script_path, '__dummy__', '-d', self.output_dir], {})
                '''
                cols = conf['gene_summary_output_columns']
                columngroup = {}
                columngroup['name'] = os.path.basename(mi.script_path).split('.')[0]
                columngroup['displayname'] = conf['title']
                columngroup['count'] = len(cols)
                '''
                cols = mi.conf['gene_summary_output_columns']
                columngroup = {
                    'name': mi.name,
                    'displayname': mi.title,
                    'count': len(cols),
                }
                self.columngroups[level].append(columngroup)
                for col in cols:
                    coldef = ColumnDefinition(col)
                    coldef.name = columngroup['name'] + '__' + coldef.name
                    coldef.genesummary = True
                    column = coldef.get_colinfo()
                    columns.append(column)
                self.summarizing_modules.append([mi, annot, cols])
                for col in cols:
                    fullname = module_name + '__' + col['name']
                    self.colnos[level][fullname] = len(self.colnos[level])
        # re-orders columns groups.
        colgrps = self.columngroups[level]
        newcolgrps = []
        for priority_colgrpname in priority_colgroupnames:
            for colgrp in colgrps:
                if colgrp['name'] == priority_colgrpname:
                    # Mapper/tagsampler groups are merged into the first
                    # (base) group rather than listed separately.
                    if colgrp['name'] in [self.mapper_name, 'tagsampler']:
                        newcolgrps[0]['count'] += colgrp['count']
                    else:
                        newcolgrps.append(colgrp)
                    break
        colpos = 0
        for colgrp in newcolgrps:
            colgrp['lastcol'] = colpos + colgrp['count']
            colpos = colgrp['lastcol']
        # Remaining groups follow, alphabetical by display name.
        colgrpnames = [
            v['displayname'] for v in colgrps
            if v['name'] not in priority_colgroupnames
        ]
        colgrpnames.sort()
        for colgrpname in colgrpnames:
            for colgrp in colgrps:
                if colgrp['displayname'] == colgrpname:
                    colgrp['lastcol'] = colpos + colgrp['count']
                    newcolgrps.append(colgrp)
                    colpos += colgrp['count']
                    break
        # re-orders columns.
        self.colname_conversion[level] = {}
        new_columns = []
        self.newcolnos[level] = {}
        newcolno = 0
        for colgrp in newcolgrps:
            colgrpname = colgrp['name']
            for col in columns:
                colname = col['col_name']
                [grpname, oricolname] = colname.split('__')
                if colgrpname == 'base' and grpname in [
                        self.mapper_name, 'tagsampler'
                ]:
                    # Mapper/tagsampler columns are renamed into the base
                    # group; remember the original name for data lookup.
                    newcolname = 'base__' + colname.split('__')[1]
                    self.colname_conversion[level][newcolname] = colname
                    col['col_name'] = newcolname
                    new_columns.append(col)
                    self.newcolnos[level][newcolname] = newcolno
                    #self.colnos[level][newcolname] = colno
                    #del self.colnos[level][oldcolname]
                elif grpname == colgrpname:
                    new_columns.append(col)
                    self.newcolnos[level][colname] = newcolno
                else:
                    continue
                newcolno += 1
        self.colinfo[level] = {'colgroups': newcolgrps, 'columns': new_columns}
        # report substitution
        if level in ['variant', 'gene']:
            reportsubtable = level + '_reportsub'
            if await self.table_exists(reportsubtable):
                q = 'select * from {}'.format(reportsubtable)
                await self.cursor.execute(q)
                rs = await self.cursor.fetchall()
                self.report_substitution = {}
                for r in rs:
                    module = r[0]
                    sub = json.loads(r[1])
                    self.report_substitution[module] = sub
                self.column_subs[level] = {}
                self.column_sub_allow_partial_match[level] = {}
                for i in range(len(new_columns)):
                    column = new_columns[i]
                    [module, col] = column['col_name'].split('__')
                    # Mapper columns use the base module's substitutions.
                    if module in [self.mapper_name]:
                        module = 'base'
                    if module in self.report_substitution:
                        sub = self.report_substitution[module]
                        if col in sub:
                            if module in [
                                    'base', self.mapper_name
                            ] and col in ['all_mappings', 'all_so']:
                                # Composite cells: substitute each token via
                                # word-boundary regexes.
                                allow_partial_match = True
                                self.column_subs[level][i] = {
                                    re.compile(fr'\b{key}\b'): val
                                    for key, val in sub[col].items()
                                }
                            else:
                                allow_partial_match = False
                                self.column_subs[level][i] = sub[col]
                            self.column_sub_allow_partial_match[level][
                                i] = allow_partial_match
                            new_columns[i]['reportsub'] = sub[col]

    async def connect_db(self, dbpath=None):
        """Open the aggregator sqlite db asynchronously; exits on bad path."""
        if dbpath != None:
            self.dbpath = dbpath
        if self.dbpath == None:
            sys.stderr.write('Provide a path to aggregator output')
            exit()
        if os.path.exists(self.dbpath) == False:
            sys.stderr.write(self.dbpath + ' does not exist.')
            exit()
        self.conn = await aiosqlite3.connect(self.dbpath)
        self.cursor = await self.conn.cursor()

    async def load_filter(self):
        """Create the CravatFilter and load the configured row filter."""
        self.cf = await CravatFilter.create(dbpath=self.dbpath)
        await self.cf.loadfilter(filter=self.filter,
                                 filterpath=self.filterpath,
                                 filtername=self.filtername,
                                 filterstring=self.filterstring)

    async def table_exists(self, tablename):
        """Return True if *tablename* exists in the result database."""
        # NOTE(review): SQL built by concatenation; tablename values are
        # internal level names, not user input — confirm before widening use.
        sql = 'select name from sqlite_master where ' + \
            'type="table" and name="' + tablename + '"'
        await self.cursor.execute(sql)
        row = await self.cursor.fetchone()
        if row == None:
            ret = False
        else:
            ret = True
        return ret
class CravatReport: def __init__(self, cmd_args, status_writer): self.status_writer = status_writer self.parse_cmd_args(cmd_args) self.cursor = None self.cf = None self.filtertable = 'filter' self.colinfo = {} self.colnos = {} self.ord_cols = {} self.var_added_cols = [] self.summarizing_modules = [] self.columngroups = {} self.column_subs = {} self._setup_logger() async def prep(self): await self.connect_db() await self.load_filter() def _setup_logger(self): if hasattr(self, 'no_log') and self.no_log: return try: self.logger = logging.getLogger('cravat.' + self.module_name) except Exception as e: self._log_exception(e) self.error_logger = logging.getLogger('error.' + self.module_name) self.unique_excs = [] def _log_exception(self, e, halt=True): if halt: raise e else: if self.logger: self.logger.exception(e) async def getjson(self, level): ret = None if await self.table_exists(level) == False: return ret for row in await self.cf.getiterator(level): row = self.substitute_val(level, row) return json.dumps(row) def substitute_val(self, level, row): if level in self.column_subs: column_sub_level = self.column_subs[level] for i in self.column_subs[level]: column_sub_i = column_sub_level[i] value = row[i] if value is not None: if value in column_sub_i: row[i] = column_sub_i[value] return row async def run_level(self, level): if await self.table_exists(level): if level == 'variant': await self.cf.make_filtered_uid_table() elif level == 'gene': await self.cf.make_filtered_hugo_table() gene_summary_datas = {} for mi, o, cols in self.summarizing_modules: gene_summary_data = await o.get_gene_summary_data(self.cf) gene_summary_datas[mi.name] = [gene_summary_data, cols] self.write_preface(level) self.write_header(level) if level == 'variant': hugo_present = 'base__hugo' in self.colnos['variant'] for row in await self.cf.get_filtered_iterator(level): row = list(row) if level == 'variant': if hugo_present: hugo = row[self.colnos['variant']['base__hugo']] generow = await 
self.cf.get_gene_row(hugo) for colname in self.var_added_cols: if generow == None: colval = None else: colval = generow[self.colnos['gene'][colname]] row.append(colval) elif level == 'gene': hugo = row[0] for mi, _, _ in self.summarizing_modules: module_name = mi.name [gene_summary_data, cols] = gene_summary_datas[module_name] if hugo in gene_summary_data: row.extend([ gene_summary_data[hugo][col['name']] for col in cols ]) else: row.extend([None for v in cols]) row = self.substitute_val(level, row) if hasattr(self, 'keep_json_all_mapping' ) == False and level == 'variant': colno = self.colnos['variant']['base__all_mappings'] all_map = json.loads(row[colno]) newvals = [] for hugo in all_map: for maprow in all_map[hugo]: [protid, protchange, so, transcript, rnachange] = maprow if protid == None: protid = '(na)' if protchange == None: protchange = '(na)' if rnachange == None: rnachange = '(na)' newval = transcript + ':' + hugo + ':' + protid + ':' + so + ':' + protchange + ':' + rnachange newvals.append(newval) newvals.sort() newcell = '; '.join(newvals) row[colno] = newcell newrow = [] for colname in self.ord_cols[level]: colno = self.colnos[level][colname] value = row[colno] newrow.append(value) self.write_table_row(newrow) async def run(self, tab='all'): start_time = time.time() if not (hasattr(self, 'no_log') and self.no_log): self.logger.info('started: %s' % time.asctime(time.localtime(start_time))) if self.module_conf is not None: self.status_writer.queue_status_update( 'status', 'Started {} ({})'.format(self.module_conf['title'], self.module_name)) self.setup() if tab == 'all': for level in await self.cf.get_result_levels(): if await self.table_exists(level): await self.make_col_info(level) for level in await self.cf.get_result_levels(): if await self.table_exists(level): await self.run_level(level) else: if tab in ['variant', 'gene']: for level in ['variant', 'gene']: if await self.table_exists(level): await self.make_col_info(level) else: await 
self.make_col_info(tab) await self.run_level(tab) if self.module_conf is not None: self.status_writer.queue_status_update( 'status', 'Finished {} ({})'.format(self.module_conf['title'], self.module_name)) end_time = time.time() if not (hasattr(self, 'no_log') and self.no_log): self.logger.info('finished: {0}'.format( time.asctime(time.localtime(end_time)))) run_time = end_time - start_time self.logger.info('runtime: {0:0.3f}'.format(run_time)) ret = self.end() return ret async def get_variant_colinfo(self): self.setup() level = 'variant' if await self.table_exists(level): await self.make_col_info(level) level = 'gene' if await self.table_exists(level): await self.make_col_info(level) return self.colinfo def setup(self): pass def end(self): pass def write_preface(self, level): pass def write_header(self, level): pass def write_table_row(self, row): pass async def make_col_info(self, level): cravat_conf = self.conf.get_cravat_conf() if 'report_module_order' in cravat_conf: priority_colgroups = cravat_conf['report_module_order'] else: priority_colgroups = ['base', 'hg19', 'hg18', 'tagsampler'] # ordered column groups self.columngroups[level] = [] sql = 'select name, displayname from ' + level + '_annotator' await self.cursor.execute(sql) rows = await self.cursor.fetchall() for priority_colgroup in priority_colgroups: for row in rows: colgroup = row[0] if colgroup == priority_colgroup: (name, displayname) = row self.columngroups[level].append({ 'name': name, 'displayname': displayname, 'count': 0 }) for row in rows: colgroup = row[0] if colgroup in priority_colgroups: pass else: (name, displayname) = row self.columngroups[level].append({ 'name': name, 'displayname': displayname, 'count': 0 }) # ordered column names sql = 'select * from ' + level + '_header' await self.cursor.execute(sql) columns = [] unordered_rows = await self.cursor.fetchall() rows = [] self.ord_cols[level] = [] for group in priority_colgroups: for row in unordered_rows: [col_group, col_name] = 
row[0].split('__') if col_group == group: rows.append(row) self.ord_cols[level].append(row[0]) for row in unordered_rows: [col_group, col_name] = row[0].split('__') if col_group not in priority_colgroups: rows.append(row) self.ord_cols[level].append(row[0]) # unordered column numbers self.colnos[level] = {} colcount = 0 for row in unordered_rows: self.colnos[level][row[0]] = colcount colcount += 1 # ordered column details for row in rows: (colname, coltitle, col_type) = row[:3] col_cats = json.loads(row[3]) if len(row) > 3 and row[3] else [] col_width = row[4] if len(row) > 4 else None col_desc = row[5] if len(row) > 5 else None col_hidden = bool(row[6]) if len(row) > 6 else False col_ctg = row[7] if len(row) > 7 else None if col_ctg in ['single', 'multi'] and len(col_cats) == 0: sql = 'select distinct {} from {}'.format(colname, level) await self.cursor.execute(sql) rs = await self.cursor.fetchall() for r in rs: col_cats.append(r[0]) col_filterable = bool(row[8]) if len(row) > 8 else True link_format = row[9] if len(row) > 9 else None column = { 'col_name': colname, 'col_title': coltitle, 'col_type': col_type, 'col_cats': col_cats, 'col_width': col_width, 'col_desc': col_desc, 'col_hidden': col_hidden, 'col_ctg': col_ctg, 'col_filterable': col_filterable, 'link_format': link_format, } columns.append(column) groupname = colname.split('__')[0] for columngroup in self.columngroups[level]: if columngroup['name'] == groupname: columngroup['count'] += 1 if level == 'variant' and await self.table_exists('gene'): modules_to_add = [] q = 'select name from gene_annotator' await self.cursor.execute(q) gene_annotators = [v[0] for v in await self.cursor.fetchall()] k = 'add_gene_module_to_variant' if self.conf.has_key(k): modules_to_add = self.conf.get_val(k) for module in gene_annotators: module_info = au.get_local_module_info(module) if module_info == None: continue module_conf = module_info.conf if 'add_to_variant_level' in module_conf: if 
module_conf['add_to_variant_level'] == True: modules_to_add.append(module) for module in modules_to_add: if not module in gene_annotators: continue mi = au.get_local_module_info(module) cols = mi.conf['output_columns'] self.columngroups[level].append({ 'name': mi.name, 'displayname': mi.title, 'count': len(cols) }) for col in cols: colname = mi.name + '__' + col['name'] self.colnos[level][colname] = colcount self.ord_cols[level].append(colname) colcount += 1 col_type = col['type'] col_cats = col.get('categories', []) col_width = col.get('width') col_desc = col.get('desc') col_hidden = col.get('hidden', False) col_ctg = col.get('category', None) if col_ctg in ['category', 'multicategory' ] and len(col_cats) == 0: sql = 'select distinct {} from {}'.format( colname, level) await self.cursor.execute(sql) rs = await self.cursor.fetchall() for r in rs: col_cats.append(r[0]) col_filterable = col.get('filterable', True) col_link_format = col.get('link_format') column = { 'col_name': colname, 'col_title': col['title'], 'col_type': col_type, 'col_cats': col_cats, 'col_width': col_width, 'col_desc': col_desc, 'col_hidden': col_hidden, 'col_ctg': col_ctg, 'col_filterable': col_filterable, 'col_link_format': col_link_format, } columns.append(column) self.var_added_cols.append(colname) # Gene level summary columns if level == 'gene': q = 'select name from variant_annotator' await self.cursor.execute(q) done_var_annotators = [v[0] for v in await self.cursor.fetchall()] self.summarizing_modules = [] local_modules = au.get_local_module_infos_of_type('annotator') for module_name in local_modules: mi = local_modules[module_name] conf = mi.conf if 'can_summarize_by_gene' in conf and module_name in done_var_annotators: sys.path = sys.path + [os.path.dirname(mi.script_path)] annot_cls = util.load_class('CravatAnnotator', mi.script_path) annot = annot_cls([mi.script_path, '__dummy__'], {}) cols = conf['gene_summary_output_columns'] for col in cols: col['name'] = col['name'] columngroup = 
{} columngroup['name'] = conf['name'] columngroup['displayname'] = conf['title'] columngroup['count'] = len(cols) self.columngroups[level].append(columngroup) for col in cols: col_type = col['type'] col_cats = col.get('categories', []) col_ctg = col.get('category', None) if col_type in ['category', 'multicategory' ] and len(col_cats) == 0: sql = 'select distinct {} from {}'.format( colname, level) await self.cursor.execute(sql) rs = await self.cursor.fetchall() for r in rs: col_cats.append(r[0]) col_filterable = col.get('filterable', True) col_link_format = col.get('link_format') column = { 'col_name': conf['name'] + '__' + col['name'], 'col_title': col['title'], 'col_type': col_type, 'col_cats': col_cats, 'col_width': col.get('width'), 'col_desc': col.get('desc'), 'col_hidden': col.get('hidden', False), 'col_ctg': col_ctg, 'col_filterable': col_filterable, 'col_link_format': col_link_format, } columns.append(column) self.summarizing_modules.append([mi, annot, cols]) for col in cols: fullname = module_name + '__' + col['name'] self.ord_cols[level].append(fullname) self.colnos[level][fullname] = len(self.colnos[level]) colno = 0 for colgroup in self.columngroups[level]: colno += colgroup['count'] colgroup['lastcol'] = colno self.colinfo[level] = { 'colgroups': self.columngroups[level], 'columns': columns } # report substitution if level in ['variant', 'gene']: reportsubtable = level + '_reportsub' if await self.table_exists(reportsubtable): q = 'select * from {}'.format(reportsubtable) await self.cursor.execute(q) rs = await self.cursor.fetchall() self.report_substitution = {} for r in rs: module = r[0] sub = json.loads(r[1]) self.report_substitution[module] = sub self.column_subs[level] = {} columns = self.colinfo[level]['columns'] for i in range(len(columns)): column = columns[i] [module, col] = column['col_name'].split('__') if module in self.report_substitution: sub = self.report_substitution[module] if col in sub: self.column_subs[level][i] = sub[col] 
# NOTE(review): the next statement is the tail of a method whose `def` lies
# above this excerpt; it records the report substitution on the column info
# just built. Its true nesting depth cannot be seen from here.
self.colinfo[level]['columns'][i][
    'reportsub'] = sub[col]

def parse_cmd_args(self, cmd_args):
    """Parse reporter command-line arguments and derive output settings.

    cmd_args: argv-style list; cmd_args[0] (the program name) is skipped
    before parsing. Sets dbpath/filter/save/conf attributes on self and
    computes the output basename, directory, and status-file path from
    the aggregator database path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dbpath', help='Path to aggregator output')
    parser.add_argument('-f',
                        dest='filterpath',
                        default=None,
                        help='Path to filter file')
    parser.add_argument('-F',
                        dest='filtername',
                        default=None,
                        help='Name of filter (stored in aggregator output)')
    parser.add_argument('--filterstring',
                        dest='filterstring',
                        default=None,
                        help='Filter in JSON')
    parser.add_argument('-s',
                        dest='savepath',
                        default=None,
                        help='Path to save file')
    parser.add_argument('-c', dest='confpath', help='path to a conf file')
    parser.add_argument('-t',
                        dest='reporttypes',
                        nargs='+',
                        default=None,
                        help='report types')
    parser.add_argument('--module-name',
                        dest='module_name',
                        default=None,
                        help='report module name')
    parsed_args = parser.parse_args(cmd_args[1:])
    self.parsed_args = parsed_args
    self.dbpath = parsed_args.dbpath
    self.filterpath = parsed_args.filterpath
    self.filtername = parsed_args.filtername
    self.filterstring = parsed_args.filterstring
    self.savepath = parsed_args.savepath
    self.confpath = parsed_args.confpath
    self.conf = ConfigLoader(job_conf_path=self.confpath)
    self.module_name = parsed_args.module_name
    # NOTE(review): self.conf was just assigned a ConfigLoader instance, so
    # it can never be None here — the else branch below is dead code.
    if self.conf is not None:
        self.module_conf = self.conf.get_module_conf(self.module_name)
    else:
        self.module_conf = None
    self.report_types = parsed_args.reporttypes
    # Strip a 7-character suffix from the db filename — presumably
    # '.sqlite'; TODO confirm against how the aggregator names its output.
    self.output_basename = os.path.basename(self.dbpath)[:-7]
    self.output_dir = os.path.dirname(self.dbpath)
    status_fname = '{}.status.json'.format(self.output_basename)
    self.status_fpath = os.path.join(self.output_dir, status_fname)

async def connect_db(self, dbpath=None):
    """Open the aggregator SQLite database and create a cursor.

    dbpath: optional override for self.dbpath. Exits the process (after
    writing to stderr) when no path is set or the file does not exist.
    NOTE(review): `!= None` / `== None` / `== False` should be
    `is not None` / `is None` / `not ...`, and calling exit() from a
    library method is heavy-handed — raising would be kinder to callers.
    """
    if dbpath != None:
        self.dbpath = dbpath
    if self.dbpath == None:
        sys.stderr.write('Provide a path to aggregator output')
        exit()
    if os.path.exists(self.dbpath) == False:
        sys.stderr.write(self.dbpath + ' does not exist.')
        exit()
    self.conn = await aiosqlite3.connect(self.dbpath)
    self.cursor = await self.conn.cursor()

# NOTE(review): this dangling 'async' keyword pairs with the
# 'def load_filter' that begins on the next source line.
async
async def load_filter(self):
    """Create and load the variant/gene filter for this report.

    Filter criteria may come from a filter file (-f), a named filter
    stored in the aggregator output (-F), or a JSON string
    (--filterstring); CravatFilter resolves which one applies.
    """
    self.cf = await CravatFilter.create(dbpath=self.dbpath)
    await self.cf.loadfilter(
        filterpath=self.filterpath,
        filtername=self.filtername,
        filterstring=self.filterstring,
    )

async def table_exists(self, tablename):
    """Return True if table `tablename` exists in the result database.

    Fix: the original interpolated `tablename` directly into the SQL
    string, which breaks (and permits SQL injection) when the name
    contains a quote; a bound parameter is used instead. Also replaces
    the `row == None` / flag-variable pattern with a direct boolean.
    """
    sql = "select name from sqlite_master where type='table' and name=?"
    await self.cursor.execute(sql, (tablename,))
    row = await self.cursor.fetchone()
    return row is not None
def __init__(self, dir_path, name=None):
    """Inspect a module directory and record what it contains.

    Collects paths and existence flags for the module script, conf file,
    data/ and test/ directories, readme, and help.html; loads the module
    conf (when present) and caches commonly used conf values on self.

    dir_path: path to the module directory.
    name: module name; defaults to the basename of dir_path.
    """
    self.directory = dir_path
    if name is None:
        self.name = os.path.basename(self.directory)
    else:
        self.name = name
    # The module's script is expected at <dir>/<name>.py.
    self.script_path = os.path.join(self.directory, self.name + '.py')
    #if importlib.util.find_spec('cython') is not None:
    #    pyx_path = self.script_path + 'x'
    #    if os.path.exists(pyx_path):
    #        self.script_path = pyx_path
    self.script_exists = os.path.exists(self.script_path)
    self.conf_path = os.path.join(self.directory, self.name + '.yml')
    self.conf_exists = os.path.exists(self.conf_path)
    # A module "exists" when its conf file is present...
    self.exists = self.conf_exists
    # ...unless an install was started and never finished. The
    # startofinstall/endofinstall files appear to be installer marker
    # files — assumption, TODO confirm against the installer code.
    startofinstall_path = os.path.join(self.directory, 'startofinstall')
    if os.path.exists(startofinstall_path):
        endofinstall_path = os.path.join(self.directory, 'endofinstall')
        if os.path.exists(endofinstall_path):
            self.exists = True
        else:
            self.exists = False
    self.data_dir = os.path.join(dir_path, 'data')
    self.data_dir_exists = os.path.isdir(self.data_dir)
    self.has_data = self.data_dir_exists \
        and len(os.listdir(self.data_dir)) > 0
    self.test_dir = os.path.join(dir_path, 'test')
    self.test_dir_exists = os.path.isdir(self.test_dir)
    # A runnable test needs both an 'input' and a 'key' file.
    self.has_test = self.test_dir_exists \
        and os.path.isfile(os.path.join(self.test_dir, 'input')) \
        and os.path.isfile(os.path.join(self.test_dir, 'key'))
    self.readme_path = os.path.join(self.directory, self.name + '.md')
    self.readme_exists = os.path.exists(self.readme_path)
    if self.readme_exists:
        with open(self.readme_path) as f:
            self.readme = f.read()
    else:
        self.readme = ''
    self.helphtml_path = os.path.join(self.directory, 'help.html')
    self.helphtml_exists = os.path.exists(self.helphtml_path)
    self.conf = {}
    if self.conf_exists:
        # Imported here rather than at module top — presumably to avoid a
        # circular import at load time; TODO confirm.
        from cravat.config_loader import ConfigLoader
        conf = ConfigLoader()
        self.conf = conf.get_module_conf(self.name)
    # self.conf may still be {} past this point; all reads use .get().
    self.type = self.conf.get('type')
    self.version = self.conf.get('version')
    self.description = self.conf.get('description')
    self.hidden = self.conf.get('hidden', False)
    dev_dict = self.conf.get('developer')
    if not (type(dev_dict) == dict):
        dev_dict = {}
    self.developer = get_developer_dict(**dev_dict)
    if 'type' not in self.conf:
        self.conf['type'] = 'unknown'
    # NOTE(review): overwrites the .get('type') assignment above; with the
    # default just inserted, self.type is now never None.
    self.type = self.conf['type']
    self.level = self.conf.get('level')
    self.input_format = self.conf.get('input_format')
    self.secondary_module_names = list(self.conf.get('secondary_inputs', {}))
    # Output suffix: annotators get a level-specific extension; every
    # other module type gets '<name>.<type>'.
    if self.type == 'annotator':
        if self.level == 'variant':
            self.output_suffix = self.name + '.var'
        elif self.level == 'gene':
            self.output_suffix = self.name + '.gen'
    else:
        self.output_suffix = self.name + '.' + self.type
    self.title = self.conf.get('title', self.name)
    # Filled in later (not computed here).
    self.disk_size = None
    self.tags = self.conf.get('tags', [])
    self.datasource = str(self.conf.get('datasource', ''))
    self.smartfilters = self.conf.get('smartfilters')
    self.groups = self.conf.get('groups')