def ma_run(partition):
    """Query Mutation Assessor for the non-synonymous variants of a partition.

    Scans the partition's VEP results file, collects the ids of the variants
    whose consequence types match ``so.NON_SYNONYMOUS``, asks Mutation
    Assessor about each of them and writes one tab-separated line
    (variant id, uniprot, fi_score) per answer into
    ``<base_path>/<index zero-padded to 8 digits>.ma``. When that file already
    exists and ``config.consequences_overwrite`` is false, the query step is
    skipped and the existing file is reused.

    :param partition: dict providing ``project`` (itself a dict with ``id``
        and ``db``), ``index``, ``base_path`` and ``vep_path``. The key
        ``ma_path`` is set to the results file path before the partition is
        sent through the ``results`` port.
    """
    log = task.logger
    config = GlobalConfig(task.conf)

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    offline = "yes"  # TODO: deprecate online mode

    if offline == "yes":
        log.info("Running Mutation assessor in local mode.")
        ma = MaLocal(config.ma_cache_path)
    else:
        log.info("Running Mutation assessor using web services.")
        from intogensm.ma.service import MaService
        # NOTE(review): this branch used an undefined name `conf`, which would
        # raise NameError. `task.conf` (the mapping given to GlobalConfig) is
        # assumed to be what was meant -- confirm it exposes "cache_path".
        ma = MaService(project["assembly"], cache_path=os.path.join(task.conf["cache_path"], "ma.db"))

    try:
        results_path = os.path.join(partition["base_path"], "{0:08d}.ma".format(partition["index"]))

        if not os.path.exists(results_path) or config.consequences_overwrite:
            log.info("Querying Mutation assessor for 'missense_variant' consequences ...")

            projdb = ProjectDb(project["db"])
            try:
                # First pass: ids of the variants having at least one
                # non-synonymous consequence. A set is used because the same
                # variant id can show up on several VEP lines.
                missense_variants = set()
                with open(partition["vep_path"], "r") as f:
                    for line in f:
                        fields = line.rstrip().split("\t")
                        var_id = int(fields[0])
                        ctypes = fields[3].split(",")
                        if so.match(ctypes, so.NON_SYNONYMOUS):
                            missense_variants.add(var_id)

                # Second pass: query MA once per selected variant.
                with open(results_path, "w") as mf:
                    for var_id in missense_variants:
                        var = projdb.get_variant(var_id)

                        start, _end, ref, alt = var_to_tab(var)

                        r = ma.get(var.chr, var.strand, start, ref, alt, var_id)
                        if r is not None:
                            tsv.write_line(mf, var_id, r.uniprot, r.fi_score, null_value="-")
            finally:
                # Release the project database even if a query fails.
                projdb.close()
        else:
            log.warn("Skipping MA, results already exist.")
            log.debug("MA results: {0}".format(results_path))
    finally:
        # Always release the Mutation Assessor backend.
        ma.close()

    # Send results to the next module
    partition["ma_path"] = results_path
    results_port.send(partition)
def update_db(project):
    """Load the per-partition and per-project results into the project database.

    For every partition in ``project["partitions"]`` the VEP results
    (``vep_path``) are stored as consequences, each variant is annotated with
    the external references returned by the variation xrefs database, and the
    transcript functional impacts (``tfi_path``) are applied to the stored
    consequences. Afterwards the variant-gene functional impacts
    (``project["gfi_path"]``) are added and each gene is updated with the
    xrefs accumulated from its variants. The changes are committed once at
    the end; on error nothing is committed explicitly, but both databases are
    still closed.

    :param project: dict providing ``id``, ``db``, ``partitions`` and
        ``gfi_path``. The ``partitions`` key is removed before the project is
        sent through the ``projects_out`` port.
    """
    log = task.logger
    config = GlobalConfig(task.conf)

    projects_port = task.ports("projects_out")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    partitions = project["partitions"]

    if not os.path.exists(config.vardb_path):
        log.warn("Database for variation external references not found")
        # The original formatted conf["vardb_path"], but `conf` is not defined
        # here; report the very path that was just checked instead.
        log.debug("> {0}".format(config.vardb_path))

    def ctype(value):
        # Consequence types are stored as a comma separated list.
        return value.split(",")

    # Column parsers for the VEP results file.
    vep_types = (int, str, str, ctype, str, str, str, float, float)

    # Columns picked from the transcript functional impacts file and their parsers.
    tfi_types = (int, str, str, int, float, float, int, float, float, int, float, float, int)
    tfi_columns = [0, 1, 3, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17]

    # Column parsers for the variant-gene functional impacts file.
    gfi_types = (int, str, float, int, str)

    varxdb = VarXrefsDb(config.vardb_path)
    varxdb.open()
    try:
        projdb = ProjectDb(project["db"])
        try:
            plen = len(partitions)

            # gene id -> set of "source:xref" strings collected from its variants
            gene_xrefs = defaultdict(set)

            for part in partitions:
                log.info("Updating database with partition data ({0} out of {1}) ...".format(part["index"] + 1, plen))

                log.info(" VEP results ...")

                with open(part["vep_path"], "r") as vf:
                    for fields in tsv.lines(vf, vep_types, null_value="-"):
                        (
                            var_id,
                            gene,
                            transcript,
                            consequences,
                            protein_pos,
                            aa_change,
                            protein,
                            sift_score,
                            pph2_score,
                        ) = fields

                        var = projdb.get_variant(var_id)

                        xrefs = varxdb.get_xrefs(var.chr, var.start, var.ref, var.alt, var.strand)

                        if xrefs is not None:
                            xrefs = ["{0}:{1}".format(source, xref) for source, xref in xrefs]
                            gene_xrefs[gene].update(xrefs)

                            # An empty list is stored as "no xrefs".
                            if len(xrefs) == 0:
                                xrefs = None

                        projdb.update_variant(Variant(id=var_id, xrefs=xrefs))

                        projdb.add_consequence(
                            Consequence(
                                var=Variant(id=var_id),
                                transcript=transcript,
                                gene=gene,
                                ctypes=consequences,
                                protein_pos=protein_pos,
                                aa_change=aa_change,
                                protein=protein,
                            )
                        )

                log.info(" Transcript functional impacts ...")

                with open(part["tfi_path"], "r") as f:
                    for fields in tsv.lines(f, tfi_types, columns=tfi_columns, null_value="-"):
                        (
                            var_id,
                            transcript,
                            uniprot,
                            impact,
                            sift_score,
                            sift_tfic,
                            sift_class,
                            pph2_score,
                            pph2_tfic,
                            pph2_class,
                            ma_score,
                            ma_tfic,
                            ma_class,
                        ) = fields

                        # (a leftover debugging `print fields` was removed here)

                        projdb.update_consequence(
                            Consequence(
                                var=Variant(id=var_id),
                                transcript=transcript,
                                uniprot=uniprot,
                                sift_score=sift_score,
                                sift_tfic=sift_tfic,
                                sift_tfic_class=sift_class,
                                pph2_score=pph2_score,
                                pph2_tfic=pph2_tfic,
                                pph2_tfic_class=pph2_class,
                                ma_score=ma_score,
                                ma_tfic=ma_tfic,
                                ma_tfic_class=ma_class,
                                impact=impact,
                            )
                        )

            log.info("Updating variant-gene functional impacts ...")

            with open(project["gfi_path"], "r") as f:
                for var_id, gene, impact, coding_region, prot_changes in tsv.lines(f, gfi_types, null_value="-"):
                    projdb.add_affected_gene(
                        AffectedGene(
                            var=Variant(id=var_id),
                            gene_id=gene,
                            impact=impact,
                            coding_region=coding_region,
                            prot_changes=prot_changes,
                        )
                    )

            log.info("Updating database with gene external variant references ...")

            for gene, xrefs in gene_xrefs.items():
                projdb.update_gene(Gene(id=gene, xrefs=xrefs))

            # Commit only once everything has been loaded successfully.
            projdb.commit()
        finally:
            projdb.close()
    finally:
        varxdb.close()

    # The partitions are no longer needed by the next module.
    del project["partitions"]

    projects_port.send(project)