def combination_recurrences(projects_set):
    log = task.logger
    conf = task.conf

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug(" > {0}".format(db_path))

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    create_db(conn)

    log.info("Combining recurrences ...")

    c = conn.cursor()

    sample_total = 0

    project_ids = []
    for project in projects:
        project_ids += [project["id"]]

        log.info(" Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        project_sample_total = projdb.get_total_affected_samples()
        sample_total += project_sample_total

        log.info(" Total samples = {0}".format(project_sample_total))

        log.info(" Variant genes ...")

        count = 0
        for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
            var = afg.var
            rec = afg.rec

            if rec.sample_freq is None:
                log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
                continue

            start, end, ref, alt = var_to_tab(var)

            try:
                c.execute("INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                          (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)))
                var_id = c.lastrowid
            except sqlite3.IntegrityError:
                c.execute("SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                          (var.chr, var.strand, start, ref, alt))
                r = c.fetchone()
                var_id = r[0]

            try:
                c.execute("INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                          (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq))
            except sqlite3.IntegrityError:
                c.execute("""
                    UPDATE variant_genes
                    SET sample_freq=sample_freq + ?
                    WHERE var_id=? AND gene_id=?""",
                          (rec.sample_freq, var_id, afg.gene_id))

            count += 1

        log.info(" {0} variant genes".format(count))

        log.info(" Genes ...")

        count = 0
        for gene in projdb.genes(join_xrefs=True, join_rec=True):
            rec = gene.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)",
                          (gene.id, rec.sample_freq))
            else:
                c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?",
                          (rec.sample_freq, gene.id))

            count += 1

        log.info(" {0} genes".format(count))

        log.info(" Pathways ...")

        count = 0
        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)",
                          (pathway.id, rec.sample_freq))
            else:
                c.execute("UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?",
                          (rec.sample_freq, pathway.id))

            count += 1

        log.info(" {0} pathways".format(count))

        projdb.close()

    log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

    if sample_total > 0:
        c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))

    c.close()
    conn.commit()

    log.info("Saving results ...")

    c = conn.cursor()

    base_path = get_combination_path(conf, "recurrences")

    log.info(" Variant genes ...")

    with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS",
                       "SAMPLE_FREQ", "SAMPLE_PROP", "PROT_CHANGES", "XREFS")
        for r in c.execute("SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"):
            strand, ref, alt = r["strand"], r["ref"], r["alt"]
            allele = "{0}/{1}".format(ref, alt)
            tsv.write_line(f, r["chr"], strand, r["start"], allele, r["gene_id"],
                           r["impact"], TransFIC.class_name(r["impact"]),
                           r["sample_freq"], r["sample_prop"], r["prot_changes"], r["xrefs"],
                           null_value="-")

    log.info(" Genes ...")

    with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
            tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    log.info(" Pathways ...")

    with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
            tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    conn.close()

    remove_temp(task, db_path)
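
# For reference, a minimal sketch of the temporary SQLite schema that create_db(conn) is
# assumed to provide, inferred from the INSERT/SELECT/UPDATE statements used above:
# the duplicate-variant and duplicate-(var_id, gene_id) paths rely on uniqueness
# constraints raising sqlite3.IntegrityError, and sample_prop is filled in later.
# The real create_db lives elsewhere in the pipeline and may define extra columns or
# indexes; this helper is only an illustration, not the pipeline's implementation.
def _create_db_sketch(conn):
    c = conn.cursor()
    c.executescript("""
        CREATE TABLE IF NOT EXISTS variants (
            var_id  INTEGER PRIMARY KEY,
            chr     TEXT, strand TEXT, start INTEGER, ref TEXT, alt TEXT, xrefs TEXT,
            UNIQUE (chr, strand, start, ref, alt));

        CREATE TABLE IF NOT EXISTS variant_genes (
            var_id INTEGER, gene_id TEXT, impact INTEGER, coding_region INTEGER,
            prot_changes TEXT, sample_freq INTEGER, sample_prop REAL,
            PRIMARY KEY (var_id, gene_id));

        CREATE TABLE IF NOT EXISTS genes (
            gene_id TEXT PRIMARY KEY, sample_freq INTEGER, sample_prop REAL);

        CREATE TABLE IF NOT EXISTS pathways (
            pathway_id TEXT PRIMARY KEY, sample_freq INTEGER, sample_prop REAL);
    """)
    c.close()
    conn.commit()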
def pack_results(project):
    log = task.logger
    conf = task.conf

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_path = project["path"]
    temp_path = project["temp_path"]

    dest_path = os.path.join(project_path, "results.zip")

    sigdb = SigDb(conf["sigdb_path"])
    sigdb.open()

    projdb = ProjectDb(project["db"])

    projres = ProjectResults(project)

    gene_sym = projdb.get_gene_symbols()

    total_samples = projdb.get_total_affected_samples()

    log.info("Compressing files ...")

    arc = None
    try:
        arc = Archive(dest_path, mode="w", fmt="zip")

        log.info(" Variant genes ...")

        with ArcFile(task, arc, project_id, "variant_genes", "w") as vf:
            write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "SYMBOL",
                       "VAR_IMPACT", "VAR_IMPACT_DESC", "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

            for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
                var = afg.var
                rec = afg.rec

                start, end, ref, alt = var_to_tab(var)

                xrefs = [xref for xref in var.xrefs]
                if sigdb.exists_variant(var.chr, start):
                    xrefs += ["I:1"]
                xrefs = ",".join(xrefs)

                intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

                write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
                           afg.gene_id, gene_sym.get(afg.gene_id),
                           afg.impact, TransFIC.class_name(afg.impact),
                           rec.sample_freq or 0, total_samples, rec.sample_prop or 0,
                           afg.coding_region, afg.prot_changes, intogen_driver, xrefs)

        log.info(" Variant samples ...")

        with ArcFile(task, arc, project_id, "variant_samples", "w") as vf:
            write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLES")

            for var in projdb.variants(join_samples=True):
                start, end, ref, alt = var_to_tab(var)

                write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
                           ",".join([s.name for s in var.samples]))

        log.info(" Consequences ...")

        with ArcFile(task, arc, project_id, "consequences", "w") as cf:
            write_line(cf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT",
                       "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                       "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                       "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                       "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                       "IMPACT", "IMPACT_CLASS")

            for csq in projdb.consequences(join_variant=True):
                var = csq.var
                start, end, ref, alt = var_to_tab(var)
                allele = "{0}/{1}".format(ref, alt)

                uniprot = protein = protein_pos = aa_change = None
                sift_score = sift_tfic = sift_tfic_class = None
                pph2_score = pph2_tfic = pph2_tfic_class = None
                ma_score = ma_tfic = ma_tfic_class = None

                if so.match(csq.ctypes, so.ONCODRIVEFM):
                    uniprot, protein = csq.uniprot, csq.protein

                if so.match(csq.ctypes, so.NON_SYNONYMOUS):
                    protein_pos, aa_change = csq.protein_pos, csq.aa_change
                    sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
                    pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
                    ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

                write_line(cf, project_id, var.chr, var.strand, start, allele, csq.transcript,
                           ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                           uniprot, protein, protein_pos, aa_change,
                           sift_score, sift_tfic, sift_tfic_class,
                           pph2_score, pph2_tfic, pph2_tfic_class,
                           ma_score, ma_tfic, ma_tfic_class,
                           csq.impact, TransFIC.class_name(csq.impact))

        log.info(" Genes ...")

        with ArcFile(task, arc, project_id, "genes", "w") as gf:
            write_line(gf, "PROJECT_ID", "GENE_ID", "SYMBOL", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                       "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                       "INTOGEN_DRIVER", "XREFS")

            for gene in projdb.genes(join_xrefs=True, join_rec=True):
                if gene.rec.sample_freq is not None and gene.rec.sample_freq > 0:
                    intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0
                    write_line(gf, project_id, gene.id, gene.symbol,
                               gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                               gene.rec.sample_freq, total_samples, gene.rec.sample_prop or 0,
                               gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue,
                               gene.clust_exc_cause, gene.clust_coords,
                               intogen_driver, ",".join(gene.xrefs))

        log.info(" Pathways ...")

        with ArcFile(task, arc, project_id, "pathways", "w") as pf:
            write_line(pf, "PROJECT_ID", "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                       "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

            for pathway in projdb.pathways(join_rec=True):
                if pathway.rec.sample_freq is not None and pathway.rec.sample_freq > 0:
                    write_line(pf, project_id, pathway.id, pathway.gene_count,
                               pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
                               pathway.rec.sample_freq or 0, total_samples, pathway.rec.sample_prop or 0,
                               pathway.rec.gene_freq or 0, pathway.gene_count, pathway.rec.gene_prop or 0)

        skip_oncodrivefm = conf.get("skip_oncodrivefm", False, dtype=bool)

        if not skip_oncodrivefm:
            log.info(" Genes per sample functional impact ...")

            with ArcFile(task, arc, project_id, "fimpact.gitools.tdm", "w") as f:
                write_line(f, "SAMPLE", "GENE_ID",
                           "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                           "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                           "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

                for fields in projdb.sample_gene_fimpacts():
                    (gene, sample,
                     sift_score, sift_tfic, sift_tfic_class,
                     pph2_score, pph2_tfic, pph2_tfic_class,
                     ma_score, ma_tfic, ma_tfic_class) = fields
                    write_line(f, sample, gene,
                               sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                               pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                               ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class))

        log.info("Saving project configuration ...")

        with ArcFile(task, arc, project_id, "project", "w") as f:
            names = ["PROJECT_ID", "ASSEMBLY", "SAMPLES_TOTAL"]
            values = [project_id, project["assembly"], total_samples]
            names, values = projres.get_annotations_to_save(conf, project["annotations"], names=names, values=values)
            tsv.write_line(f, *names)
            tsv.write_line(f, *values, null_value="-")

    finally:
        if arc is not None:
            arc.close()
        projdb.close()
        sigdb.close()
def datasets(project):
    log = task.logger
    conf = task.conf

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_path = project["path"]
    temp_path = project["temp_path"]

    datasets_path = get_website_results_path(project_path)
    if not os.path.exists(datasets_path):
        os.makedirs(datasets_path)

    sigdb = SigDb(conf["sigdb_path"])
    sigdb.open()

    projdb = ProjectDb(project["db"])

    gene_sym = projdb.get_gene_symbols()

    total_samples = projdb.get_total_affected_samples()

    log.info("Exporting variant genes ...")

    vf = open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log)
    tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS",
                   "SAMPLE_FREQ", "SAMPLE_PROP", "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

    sf = open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log)
    tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

    count = 0
    for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
        var = afg.var
        rec = afg.rec

        start, end, ref, alt = var_to_tab(var)
        allele = "{0}/{1}".format(ref, alt)

        xrefs = [xref for xref in var.xrefs]
        if sigdb.exists_variant(var.chr, start):
            xrefs += ["I:1"]
        xrefs = ",".join(xrefs)

        intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

        tsv.write_line(vf, var.id, var.chr, var.strand, start, allele, afg.gene_id,
                       afg.impact, TransFIC.class_name(afg.impact),
                       rec.sample_freq, rec.sample_prop,
                       afg.coding_region, afg.prot_changes, intogen_driver, xrefs,
                       null_value=r"\N")

        for sample in var.samples:
            tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value=r"\N")

        count += 1

    vf.close()
    sf.close()

    log.info(" {0} variant genes".format(count))

    log.info("Exporting consequences ...")

    cf = open_dataset(project_id, project_path, datasets_path, "consequence", "w", log)
    tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT", "GENE_ID", "SYMBOL",
                   "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                   "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                   "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                   "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                   "IMPACT", "IMPACT_CLASS")

    count = 0
    for csq in projdb.consequences(join_variant=True):
        var = csq.var
        start, end, ref, alt = var_to_tab(var)
        allele = "{0}/{1}".format(ref, alt)

        uniprot = protein = protein_pos = aa_change = None
        sift_score = sift_tfic = sift_tfic_class = None
        pph2_score = pph2_tfic = pph2_tfic_class = None
        ma_score = ma_tfic = ma_tfic_class = None

        if so.match(csq.ctypes, so.ONCODRIVEFM):
            uniprot, protein = csq.uniprot, csq.protein

        if so.match(csq.ctypes, so.NON_SYNONYMOUS):
            protein_pos, aa_change = csq.protein_pos, csq.aa_change
            sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
            pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
            ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

        tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript,
                       ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                       uniprot, protein, protein_pos, aa_change,
                       sift_score, sift_tfic, sift_tfic_class,
                       pph2_score, pph2_tfic, pph2_tfic_class,
                       ma_score, ma_tfic, ma_tfic_class,
                       csq.impact, TransFIC.class_name(csq.impact),
                       null_value=r"\N")

        count += 1

    cf.close()

    log.info(" {0} consequences".format(count))

    log.info("Exporting genes ...")

    gf = open_dataset(project_id, project_path, datasets_path, "gene", "w", log)
    tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                   "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                   "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")

    for gene in projdb.genes(join_rec=True):
        rec = gene.rec

        if rec.sample_freq is None or rec.sample_freq == 0:
            continue

        intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

        tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                       gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue,
                       gene.clust_exc_cause, gene.clust_coords,
                       rec.sample_freq or 0, rec.sample_prop or 0,
                       intogen_driver, null_value=r"\N")

    gf.close()

    log.info("Exporting pathways ...")

    pf = open_dataset(project_id, project_path, datasets_path, "pathway", "w", log)
    tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                   "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

    for pathway in projdb.pathways(join_rec=True):
        rec = pathway.rec

        if rec.sample_freq is None or rec.sample_freq == 0:
            continue

        tsv.write_line(pf, pathway.id, pathway.gene_count,
                       pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
                       rec.sample_freq or 0, rec.sample_prop or 0,
                       rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0,
                       null_value=r"\N")

    pf.close()

    skip_oncodrivefm = conf.get("skip_oncodrivefm", False, dtype=bool)

    if not skip_oncodrivefm:
        log.info("Exporting genes per sample functional impact ...")

        with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
            tsv.write_line(f, "GENE_ID", "SAMPLE",
                           "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                           "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                           "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

            for fields in projdb.sample_gene_fimpacts():
                (gene, sample,
                 sift_score, sift_tfic, sift_tfic_class,
                 pph2_score, pph2_tfic, pph2_tfic_class,
                 ma_score, ma_tfic, ma_tfic_class) = fields
                tsv.write_line(f, gene, sample,
                               sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                               pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                               ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class),
                               null_value=r"\N")

    projdb.close()
    sigdb.close()

    log.info("Saving project configuration ...")

    projres = ProjectResults(project)

    with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
        names = ["ASSEMBLY", "SAMPLES_TOTAL"]
        values = [project["assembly"], total_samples]
        names, values = projres.get_annotations_to_save(conf, project["annotations"], names=names, values=values)
        tsv.write_line(f, *names)
        tsv.write_line(f, *values, null_value=r"\N")

    projects_port = task.ports("projects_out")
    projects_port.send(project)
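
# The website datasets written above use the MySQL LOAD DATA convention of "\N" for NULL
# fields (hence null_value=r"\N"). A minimal sketch of reading one of these files back
# into Python is given below; it assumes plain tab-separated text where parameter lines
# emitted by tsv.write_param look like "## KEY=VALUE" (e.g. SAMPLE_TOTAL) followed by a
# single header row. The exact parameter-line format is an assumption about the tsv
# helper, so adjust the "##" handling to whatever it actually emits.
def _read_dataset_sketch(path):
    params, header, rows = {}, None, []
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if line.startswith("##"):
                # parameter line, e.g. "## SAMPLE_TOTAL=123"
                key, _, value = line[2:].strip().partition("=")
                params[key] = value
                continue
            fields = line.split("\t")
            if header is None:
                header = fields  # first non-parameter line is the column header
            else:
                rows.append([None if v == r"\N" else v for v in fields])
    return params, header, rows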