def retrieve_data(self, projdb):
    """
    Collect the OncodriveCLUST input data from the project database.

    Consequences whose transcript has no known CDS length are ignored. The
    remaining ones are classified as NON_SYN when their types match
    so.ONCODRIVECLUST, as SYN when they match so.SYNONYMOUS, and are skipped
    otherwise. For every (class, gene, sample name) combination only the
    consequence found on the longest transcript is kept.

    :param projdb: project database providing consequences()
    :return: dict mapping (class, gene, sample name) to
             (transcript, transcript length, protein position)
    """
    cds_len = self.load_cds_len(self.paths.data_ensembl_gene_transcripts_path())

    self.logger.info("Retrieving gene alterations for OncodriveCLUST ...")

    # Checked in order: the first matching group decides the class
    classes = (
        (so.ONCODRIVECLUST, NON_SYN),
        (so.SYNONYMOUS, SYN),
    )
    ctype_filter = {ProjectDb.CSQ_CTYPES: so.ONCODRIVECLUST | so.SYNONYMOUS}

    longest = {}
    for csq in projdb.consequences(join_samples=True, filters=ctype_filter):
        tx = csq.transcript
        if tx not in cds_len:
            continue
        tx_len = cds_len[tx]

        for ctype_group, label in classes:
            if so.match(csq.ctypes, ctype_group):
                cls = label
                break
        else:
            # No group matched: move on to the next consequence
            continue

        candidate = (tx, tx_len, csq.protein_pos)
        for sample in csq.var.samples:
            key = (cls, csq.gene, sample.name)
            current = longest.get(key)
            # Replace only when the new transcript is strictly longer
            if current is None or tx_len > current[1]:
                longest[key] = candidate

    return longest
def retrieve_data(projdb, cds_len):
    """
    Gather, per class, gene and sample, the consequence of the longest transcript.

    A consequence is NON_SYN when its types match so.PROTEIN_AFFECTING and
    SYN when they match so.SYNONYMOUS; anything else, as well as any
    consequence whose transcript is missing from cds_len, is discarded.

    :param projdb: project database providing consequences()
    :param cds_len: mapping from transcript to CDS length
    :return: dict mapping (class, gene, sample name) to
             (transcript, transcript length, protein position)
    """
    ctype_filter = {ProjectDb.CSQ_CTYPES: so.PROTEIN_AFFECTING | so.SYNONYMOUS}

    best = {}
    for csq in projdb.consequences(join_samples=True, filters=ctype_filter):
        if csq.transcript not in cds_len:
            continue
        length = cds_len[csq.transcript]

        # Protein affecting takes precedence over synonymous
        is_non_syn = so.match(csq.ctypes, so.PROTEIN_AFFECTING)
        if not is_non_syn and not so.match(csq.ctypes, so.SYNONYMOUS):
            continue
        kind = NON_SYN if is_non_syn else SYN

        for sample in csq.var.samples:
            key = (kind, csq.gene, sample.name)
            known = best.get(key)
            # Keep the stored entry unless this transcript is strictly longer
            if known is not None and not (length > known[1]):
                continue
            best[key] = (csq.transcript, length, csq.protein_pos)

    return best
def ma_run(partition):
    """
    Query Mutation Assessor for the non synonymous variants of a partition.

    The first column of each VEP result line is read as the variant id and
    the fourth one as a comma separated list of consequence types. Variants
    having a consequence matching so.NON_SYNONYMOUS are queried and one line
    (variant id, uniprot, functional impact score) is written per variant with
    a result into "<base_path>/<index>.ma". That path is stored in
    partition["ma_path"] and the partition is sent through the "results" port.

    The query is skipped when the results file already exists and the
    "consequences_overwrite" configuration option is false.

    The Mutation Assessor client and the project database are closed even
    when a query or a file operation fails.

    :param partition: dict with the keys "project", "index", "base_path" and "vep_path"
    """
    log = task.logger
    conf = task.conf

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    if conf["offline"] == "yes":
        log.info("Running Mutation assessor in local mode.")
        ma = MaLocal(conf["ma_cache_path"])
    else:
        log.info("Running Mutation assessor using web services.")
        ma = MaService(project["assembly"], cache_path=os.path.join(conf["cache_path"], "ma.db"))

    try:
        results_path = os.path.join(partition["base_path"], "{0:08d}.ma".format(partition["index"]))

        if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):

            log.info("Querying Mutation assessor for 'missense_variant' consequences ...")

            projdb = ProjectDb(project["db"])
            try:
                # A set, so that a variant with several matching consequences is queried once
                missense_variants = set()

                with open(partition["vep_path"], "r") as f:
                    for line in f:
                        fields = line.rstrip().split("\t")
                        var_id = int(fields[0])
                        ctypes = fields[3].split(",")
                        if so.match(ctypes, so.NON_SYNONYMOUS):
                            missense_variants.add(var_id)

                with open(results_path, "w") as mf:
                    for var_id in missense_variants:
                        var = projdb.get_variant(var_id)

                        start, _end, ref, alt = var_to_tab(var)

                        r = ma.get(var.chr, var.strand, start, ref, alt, var_id)
                        if r is not None:
                            tsv.write_line(mf, var_id, r.uniprot, r.fi_score, null_value="-")
            finally:
                projdb.close()
        else:
            log.warn("Skipping MA, results already exist.")
            log.debug("MA results: {0}".format(results_path))
    finally:
        ma.close()

    # Send results to the next module
    partition["ma_path"] = results_path
    results_port.send(partition)
def oncodriveclust(project):
    """
    Compute and save the quality control data of OncodriveCLUST for a project.

    Genes and samples are numbered in order of appearance and followed through
    four stages:

    - source: every gene and sample found in the consequences
    - selected: those with a consequence matching so.PROTEIN_AFFECTING
    - filter: selected ones whose gene is accepted by the configured filter
    - threshold: filtered ones whose gene has at least samples_threshold
      selected samples

    Genes with a consequence matching so.SYNONYMOUS are tracked separately.
    The resulting dictionary is saved with
    ProjectResults.save_quality_control("oncodriveclust", ...).

    :param project: dict with at least the keys "id" and "db"
    """
    log = task.logger
    conf = task.conf

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    source_genes = {}    # gene id -> gene index
    syn_genes = set()
    selected_genes = set()
    filter_genes = set()
    threshold_genes = set()

    source_samples = {}  # sample name -> sample index
    selected_samples = set()
    filter_samples = set()
    threshold_samples = set()

    selected_gene_sample_count = {}  # number of samples for each selected gene
    filter_gene_sample_count = {}    # number of samples per each gene passing the filter

    # get configuration
    samples_threshold, _, _, filt = get_oncodriveclust_configuration(log, conf, project)

    log.info("Retrieving gene alterations ...")

    data = set()  # (gene id, sample index) pairs having a protein affecting consequence

    projdb = ProjectDb(project["db"])
    try:
        # No consequence type filter: every gene and sample seen is registered as source
        for csq in projdb.consequences(join_samples=True):
            is_selected = so.match(csq.ctypes, so.PROTEIN_AFFECTING)
            is_synonymous = so.match(csq.ctypes, so.SYNONYMOUS)

            # The index has to be looked up for known genes too; otherwise the
            # index of a gene seen in a previous iteration would be used here
            gene_index = source_genes.setdefault(csq.gene, len(source_genes))

            if is_selected:
                selected_genes.add(gene_index)

            if is_synonymous:
                syn_genes.add(gene_index)

            for sample in csq.var.samples:
                # Same as for genes: never rely on a value left by another sample
                sample_index = source_samples.setdefault(sample.name, len(source_samples))

                if is_selected:
                    selected_samples.add(sample_index)
                    data.add((csq.gene, sample_index))
    finally:
        projdb.close()

    log.info("Counting selected, filtered and threshold ...")

    # calculate selected and filter counts
    data2 = set()
    for gene, sample_index in data:
        gene_index = source_genes[gene]
        selected_gene_sample_count[gene_index] = selected_gene_sample_count.get(gene_index, 0) + 1

        if filt.valid(gene):
            data2.add((gene_index, sample_index))
            filter_genes.add(gene_index)
            filter_samples.add(sample_index)
            filter_gene_sample_count[gene_index] = filter_gene_sample_count.get(gene_index, 0) + 1

    # calculate threshold counts
    for gene_index, sample_index in data2:
        if selected_gene_sample_count[gene_index] >= samples_threshold:
            threshold_genes.add(gene_index)
            threshold_samples.add(sample_index)

    log.info("Counting significant genes ...")

    # significance of q-values: sig_count[k] is the number of genes passing the
    # threshold whose q-value is lower or equal than sig_thresholds[k]
    sig_thresholds = [0.0, 0.001, 0.005] + [i / 100.0 for i in range(1, 11)] + [1.0]
    sig_count = [0] * len(sig_thresholds)

    projdb = ProjectDb(project["db"])
    try:
        for gene in projdb.genes():
            if gene.id not in source_genes or source_genes[gene.id] not in threshold_genes:
                continue

            # NOTE(review): this reads the OncodriveFM q-value although this is
            # the OncodriveCLUST quality control; confirm whether clust_qvalue
            # was intended.
            qvalue = gene.fm_qvalue
            if qvalue is None:
                # A gene without q-value is not significant at any threshold.
                # Comparing None with a float counts it everywhere in Python 2
                # and raises TypeError in Python 3.
                continue

            i = 0
            while i < len(sig_thresholds) and qvalue > sig_thresholds[i]:
                i += 1

            for j in range(i, len(sig_count)):
                sig_count[j] += 1
    finally:
        projdb.close()

    source_genes_count = len(source_genes)
    syn_genes_count = len(syn_genes)
    selected_genes_count = len(selected_genes)
    filter_genes_count = len(filter_genes)
    threshold_genes_count = len(threshold_genes)

    source_samples_count = len(source_samples)
    selected_samples_count = len(selected_samples)
    filter_samples_count = len(filter_samples)
    threshold_samples_count = len(threshold_samples)

    # Genes passing the filter, the ones with more samples first
    sorted_filter_genes = sorted(filter_genes, reverse=True, key=lambda gi: filter_gene_sample_count[gi])

    qc_data = dict(
        source=dict(
            genes=sorted(source_genes.keys(), key=lambda k: source_genes[k]),
            genes_count=source_genes_count,
            genes_lost_count=max(0, source_genes_count - syn_genes_count - threshold_genes_count),
            samples=sorted(source_samples.keys(), key=lambda k: source_samples[k]),
            samples_count=source_samples_count,
        ),
        # Kept at the top level (not inside "source") to preserve the saved layout
        samples_lost_count=max(0, source_samples_count - threshold_samples_count),
        synonymous=dict(
            genes=sorted(syn_genes),
            genes_count=syn_genes_count,
            ratio=(float(syn_genes_count) / selected_genes_count) if selected_genes_count > 0 else 0,
        ),
        selected=dict(
            genes=sorted(selected_genes),
            genes_count=selected_genes_count,
            genes_lost=sorted(set(source_genes.values()) - syn_genes - selected_genes),
            genes_lost_count=max(0, source_genes_count - syn_genes_count - selected_genes_count),
            samples=sorted(selected_samples),
            samples_count=selected_samples_count,
            samples_lost=sorted(set(source_samples.values()) - selected_samples),
            samples_lost_count=max(0, source_samples_count - selected_samples_count),
        ),
        filter=dict(
            genes=sorted_filter_genes,
            genes_count=filter_genes_count,
            genes_lost=sorted(selected_genes - filter_genes),
            genes_lost_count=max(0, selected_genes_count - filter_genes_count),
            genes_sample_count=[filter_gene_sample_count[gene_index] for gene_index in sorted_filter_genes],
            samples=sorted(filter_samples),
            samples_count=filter_samples_count,
            samples_lost=sorted(selected_samples - filter_samples),
            samples_lost_count=max(0, selected_samples_count - filter_samples_count),
        ),
        threshold=dict(
            genes=sorted(threshold_genes),
            genes_count=threshold_genes_count,
            genes_lost=sorted(filter_genes - threshold_genes),
            genes_lost_count=max(0, filter_genes_count - threshold_genes_count),
            samples=sorted(threshold_samples),
            samples_count=threshold_samples_count,
            samples_threshold=samples_threshold,
            samples_lost=sorted(filter_samples - threshold_samples),
            samples_lost_count=max(0, filter_samples_count - threshold_samples_count),
        ),
        # The first threshold (0.0) is only a sentinel and is not reported
        results=dict(sig_thresholds=sig_thresholds[1:], sig_count=sig_count[1:]),
    )

    project_results = ProjectResults(project)
    project_results.save_quality_control("oncodriveclust", qc_data)
def pack_datasets(project):
    """
    Pack the results of a project into "<project path>/results.zip".

    Nothing is done when config.results.create_zip is false. Otherwise the
    following files are written into the archive: variant genes, variant
    samples, consequences, genes, pathways, the functional impact per gene and
    sample (unless config.skip_oncodrivefm is set) and the project
    configuration.

    The archive and both databases are closed whatever happens, and a failure
    closing one of them does not prevent closing the others.

    :param project: dict with the keys "id", "path", "db", "assembly" and "annotations"
    """
    log = task.logger

    config = GlobalConfig(task.conf)

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    if not config.results.create_zip:
        log.info("Creation of the results compressed file is deactivated. Skipped.")
        return

    dest_path = os.path.join(project["path"], "results.zip")

    sigdb = SigDb(config.sigdb_path)
    sigdb.open()

    projdb = None
    arc = None
    try:
        projdb = ProjectDb(project["db"])

        projres = ProjectResults(project)

        gene_sym = projdb.get_gene_symbols()

        total_samples = projdb.get_total_affected_samples()

        log.info("Compressing files ...")

        arc = Archive(dest_path, mode="w", fmt="zip")

        log.info(" Variant genes ...")

        with ArcFile(task, arc, project_id, "variant_genes", "w") as vf:
            write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "SYMBOL",
                       "VAR_IMPACT", "VAR_IMPACT_DESC", "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

            for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
                var = afg.var
                rec = afg.rec

                start, _end, ref, alt = var_to_tab(var)

                # Variants known by the signatures database get an extra xref
                xrefs = list(var.xrefs)
                if sigdb.exists_variant(var.chr, start):
                    xrefs.append("I:1")
                xrefs = ",".join(xrefs)

                intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

                write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
                           afg.gene_id, gene_sym.get(afg.gene_id),
                           afg.impact, TransFIC.class_name(afg.impact),
                           rec.sample_freq or 0, total_samples, rec.sample_prop or 0,
                           afg.coding_region, afg.prot_changes, intogen_driver, xrefs)

        log.info(" Variant samples ...")

        with ArcFile(task, arc, project_id, "variant_samples", "w") as vf:
            write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLES")

            for var in projdb.variants(join_samples=True):
                start, _end, ref, alt = var_to_tab(var)
                write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
                           ",".join([s.name for s in var.samples]))

        log.info(" Consequences ...")

        with ArcFile(task, arc, project_id, "consequences", "w") as cf:
            write_line(cf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT", "GENE_ID", "SYMBOL",
                       "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                       "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                       "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                       "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                       "IMPACT", "IMPACT_CLASS")

            for csq in projdb.consequences(join_variant=True):
                var = csq.var
                start, _end, ref, alt = var_to_tab(var)
                allele = "{0}/{1}".format(ref, alt)

                # Protein and score columns are only filled in for the
                # consequence types they apply to
                uniprot = protein = protein_pos = aa_change = None
                sift_score = sift_tfic = sift_tfic_class = None
                pph2_score = pph2_tfic = pph2_tfic_class = None
                ma_score = ma_tfic = ma_tfic_class = None

                if so.match(csq.ctypes, so.ONCODRIVEFM):
                    uniprot, protein = csq.uniprot, csq.protein

                if so.match(csq.ctypes, so.NON_SYNONYMOUS):
                    protein_pos, aa_change = csq.protein_pos, csq.aa_change
                    sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
                    pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
                    ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

                write_line(cf, project_id, var.chr, var.strand, start, allele,
                           csq.transcript, ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                           uniprot, protein, protein_pos, aa_change,
                           sift_score, sift_tfic, sift_tfic_class,
                           pph2_score, pph2_tfic, pph2_tfic_class,
                           ma_score, ma_tfic, ma_tfic_class,
                           csq.impact, TransFIC.class_name(csq.impact))

        log.info(" Genes ...")

        with ArcFile(task, arc, project_id, "genes", "w") as gf:
            write_line(gf, "PROJECT_ID", "GENE_ID", "SYMBOL", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                       "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                       "INTOGEN_DRIVER", "XREFS")

            for gene in projdb.genes(join_xrefs=True, join_rec=True):
                # Only genes affected in at least one sample are exported
                if gene.rec.sample_freq is not None and gene.rec.sample_freq > 0:
                    intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0
                    write_line(gf, project_id, gene.id, gene.symbol, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                               gene.rec.sample_freq, total_samples, gene.rec.sample_prop or 0,
                               gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue,
                               gene.clust_exc_cause, gene.clust_coords,
                               intogen_driver, ",".join(gene.xrefs))

        log.info(" Pathways ...")

        with ArcFile(task, arc, project_id, "pathways", "w") as pf:
            write_line(pf, "PROJECT_ID", "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                       "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

            for pathway in projdb.pathways(join_rec=True):
                # Only pathways affected in at least one sample are exported
                if pathway.rec.sample_freq is not None and pathway.rec.sample_freq > 0:
                    write_line(pf, project_id, pathway.id, pathway.gene_count,
                               pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
                               pathway.rec.sample_freq or 0, total_samples, pathway.rec.sample_prop or 0,
                               pathway.rec.gene_freq or 0, pathway.gene_count, pathway.rec.gene_prop or 0)

        if not config.skip_oncodrivefm:
            log.info(" Genes per sample functional impact ...")

            with ArcFile(task, arc, project_id, "fimpact.gitools.tdm", "w") as f:
                write_line(f, "SAMPLE", "GENE_ID",
                           "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                           "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                           "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

                for fields in projdb.sample_gene_fimpacts():
                    (gene, sample,
                     sift_score, sift_tfic, sift_tfic_class,
                     pph2_score, pph2_tfic, pph2_tfic_class,
                     ma_score, ma_tfic, ma_tfic_class) = fields

                    # Note that the sample goes first in this file
                    write_line(f, sample, gene,
                               sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                               pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                               ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class))

        log.info("Saving project configuration ...")

        with ArcFile(task, arc, project_id, "project", "w") as f:
            names = ["PROJECT_ID", "ASSEMBLY", "SAMPLES_TOTAL"]
            values = [project_id, project["assembly"], total_samples]
            names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"],
                                                            names=names, values=values)
            tsv.write_line(f, *names)
            tsv.write_line(f, *values, null_value="-")
    finally:
        # Release in reverse order of creation. The nesting guarantees that an
        # error closing one resource still lets the remaining ones be closed.
        try:
            if arc is not None:
                arc.close()
        finally:
            try:
                if projdb is not None:
                    projdb.close()
            finally:
                sigdb.close()
def quality_control(log, conf, project, filt):
    """
    Compute the quality control data of OncodriveFM for a project.

    Genes and samples are numbered in order of appearance and followed through
    four stages:

    - source: every gene and sample found in the consequences
    - selected: those with a consequence matching so.ONCODRIVEFM
    - filter: selected ones whose gene is accepted by filt
    - threshold: filtered ones whose gene has at least samples_threshold
      selected samples

    :param log: logger, handed over to get_threshold
    :param conf: configuration, handed over to get_threshold
    :param project: dict with at least the key "db"
    :param filt: genes filter providing valid(gene), or None to accept every gene
    :return: dict with the sections source, selected, filter, threshold and results
    """
    # (sample id, gene) -> whether any of its consequences is selected
    data = {}

    projdb = ProjectDb(project["db"])
    try:
        # No consequence type filter: every gene and sample seen is registered as source
        for csq in projdb.consequences(join_samples=True, join_ctypes=True):
            is_selected = so.match(csq.ctypes, so.ONCODRIVEFM)

            var = csq.var
            for sample in var.samples:
                key = (sample.id, csq.gene)
                if key not in data:
                    data[key] = is_selected
                else:
                    data[key] = data[key] or is_selected
    finally:
        projdb.close()

    source_genes = {}    # gene id -> gene index
    selected_genes = set()
    filter_genes = set()
    threshold_genes = set()

    selected_gene_sample_count = {}  # number of samples for each selected gene (by gene id)
    filter_gene_sample_count = {}    # number of samples per gene (by gene index)

    source_samples = {}  # sample id -> sample index
    selected_samples = set()
    filter_samples = set()
    threshold_samples = set()

    for (sample, gene), is_selected in data.items():
        if sample in source_samples:
            sample_index = source_samples[sample]
        else:
            source_samples[sample] = sample_index = len(source_samples)

        if is_selected:
            selected_samples.add(sample_index)
            increment(selected_gene_sample_count, gene)

    # The threshold depends on the number of selected samples, so it can only
    # be resolved after the first pass
    samples_threshold = get_threshold(log, conf, project, "oncodrivefm.genes.threshold",
                                      ONCODRIVEFM_GENES_THRESHOLD, len(selected_samples))

    for (sample, gene), is_selected in data.items():
        if gene not in source_genes:
            source_genes[gene] = len(source_genes)
        gi = source_genes[gene]

        sample_index = source_samples[sample]

        if is_selected and (filt is None or filt.valid(gene)):
            filter_samples.add(sample_index)
            increment(filter_gene_sample_count, gi)
            if selected_gene_sample_count[gene] >= samples_threshold:
                threshold_samples.add(sample_index)

    for gene, sample_count in selected_gene_sample_count.items():
        gi = source_genes[gene]
        selected_genes.add(gi)
        if filt is None or filt.valid(gene):
            filter_genes.add(gi)
            if sample_count >= samples_threshold:
                threshold_genes.add(gi)

    # significance of q-values: sig_count[k] is the number of genes passing the
    # threshold whose q-value is lower or equal than sig_thresholds[k]
    sig_thresholds = [0.0, 0.001, 0.005] + [i / 100.0 for i in range(1, 11)] + [1.0]
    sig_count = [0] * len(sig_thresholds)

    projdb = ProjectDb(project["db"])
    try:
        for gene in projdb.genes():
            if gene.id not in source_genes or source_genes[gene.id] not in threshold_genes:
                continue

            qvalue = gene.fm_qvalue
            if qvalue is None:
                # A gene without q-value is not significant at any threshold.
                # Comparing None with a float counts it everywhere in Python 2
                # and raises TypeError in Python 3.
                continue

            i = 0
            while i < len(sig_thresholds) and qvalue > sig_thresholds[i]:
                i += 1

            for j in range(i, len(sig_count)):
                sig_count[j] += 1
    finally:
        projdb.close()

    source_samples_count = len(source_samples)
    selected_samples_count = len(selected_samples)
    filter_samples_count = len(filter_samples)
    threshold_samples_count = len(threshold_samples)

    source_genes_count = len(source_genes)
    selected_genes_count = len(selected_genes)
    filter_genes_count = len(filter_genes)
    threshold_genes_count = len(threshold_genes)

    # Genes passing the filter, the ones with more samples first
    sorted_filter_genes = sorted(filter_genes, reverse=True, key=lambda gi: filter_gene_sample_count[gi])

    qc_data = dict(
        source=dict(
            genes=sorted(source_genes.keys(), key=lambda k: source_genes[k]),
            genes_count=source_genes_count,
            genes_lost_count=max(0, source_genes_count - threshold_genes_count),
            samples=sorted(source_samples.keys(), key=lambda k: source_samples[k]),
            samples_count=source_samples_count),
        # Kept at the top level (not inside "source") to preserve the returned layout
        samples_lost_count=max(0, source_samples_count - threshold_samples_count),
        selected=dict(
            genes=sorted(selected_genes),
            genes_count=selected_genes_count,
            genes_lost=sorted(set(source_genes.values()) - selected_genes),
            genes_lost_count=max(0, source_genes_count - selected_genes_count),
            samples=sorted(selected_samples),
            samples_count=selected_samples_count,
            samples_lost=sorted(set(source_samples.values()) - selected_samples),
            samples_lost_count=max(0, source_samples_count - selected_samples_count)),
        filter=dict(
            genes=sorted_filter_genes,
            genes_count=filter_genes_count,
            genes_lost=sorted(selected_genes - filter_genes),
            genes_lost_count=max(0, selected_genes_count - filter_genes_count),
            genes_sample_count=[filter_gene_sample_count[gi] for gi in sorted_filter_genes],
            samples=sorted(filter_samples),
            samples_count=filter_samples_count,
            samples_lost=sorted(selected_samples - filter_samples),
            samples_lost_count=max(0, selected_samples_count - filter_samples_count)),
        threshold=dict(
            genes=sorted(threshold_genes),
            genes_count=threshold_genes_count,
            genes_lost=sorted(filter_genes - threshold_genes),
            genes_lost_count=max(0, filter_genes_count - threshold_genes_count),
            samples=sorted(threshold_samples),
            samples_count=threshold_samples_count,
            samples_threshold=samples_threshold,
            samples_lost=sorted(filter_samples - threshold_samples),
            samples_lost_count=max(0, filter_samples_count - threshold_samples_count)),
        # The first threshold (0.0) is only a sentinel and is not reported
        results=dict(
            sig_thresholds=sig_thresholds[1:],
            sig_count=sig_count[1:])
    )

    return qc_data
def create_datasets(project):
    """
    Export the results of a project as tabulated datasets.

    The datasets (variant genes, variant samples, consequences, genes,
    pathways, functional impact per gene and sample unless
    config.skip_oncodrivefm is set, and the project configuration) are created
    with open_dataset under the project results path. Missing values are
    written as a backslash followed by N. Finally the project is sent through
    the "projects_out" port.

    Every dataset file and both databases are closed even when the export fails.

    :param project: dict with the keys "id", "path", "db", "assembly" and "annotations"
    """
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_path = project["path"]

    datasets_path = paths.project_results_path(project_path)
    ensure_path_exists(datasets_path)

    # A literal backslash followed by N. It has to be escaped: "\N" alone
    # starts a named unicode escape and is a syntax error in Python 3.
    null_value = "\\N"

    sigdb = SigDb(config.sigdb_path)
    sigdb.open()

    projdb = None
    try:
        projdb = ProjectDb(project["db"])

        gene_sym = projdb.get_gene_symbols()

        total_samples = projdb.get_total_affected_samples()

        log.info("Exporting variant genes ...")

        count = 0
        with open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log) as vf:
            tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
            tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE",
                           "GENE_ID", "IMPACT", "IMPACT_CLASS",
                           "SAMPLE_FREQ", "SAMPLE_PROP",
                           "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

            with open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log) as sf:
                tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

                for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
                    var = afg.var
                    rec = afg.rec

                    start, _end, ref, alt = var_to_tab(var)

                    allele = "{0}/{1}".format(ref, alt)

                    # Variants known by the signatures database get an extra xref
                    xrefs = list(var.xrefs)
                    if sigdb.exists_variant(var.chr, start):
                        xrefs.append("I:1")
                    xrefs = ",".join(xrefs)

                    intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

                    tsv.write_line(vf, var.id, var.chr, var.strand, start, allele,
                                   afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
                                   rec.sample_freq, rec.sample_prop,
                                   afg.coding_region, afg.prot_changes,
                                   intogen_driver, xrefs, null_value=null_value)

                    # NOTE(review): the samples are written once per affected
                    # gene, so a variant affecting several genes is repeated in
                    # this file; confirm that this is expected.
                    for sample in var.samples:
                        tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value=null_value)

                    count += 1

        log.info(" {0} variant genes".format(count))

        log.info("Exporting consequences ...")

        count = 0
        with open_dataset(project_id, project_path, datasets_path, "consequence", "w", log) as cf:
            tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID",
                           "CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                           "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                           "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                           "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                           "IMPACT", "IMPACT_CLASS")

            for csq in projdb.consequences(join_variant=True):
                var = csq.var
                start, _end, ref, alt = var_to_tab(var)

                allele = "{0}/{1}".format(ref, alt)

                # Protein and score columns are only filled in for the
                # consequence types they apply to
                uniprot = protein = protein_pos = aa_change = None
                sift_score = sift_tfic = sift_tfic_class = None
                pph2_score = pph2_tfic = pph2_tfic_class = None
                ma_score = ma_tfic = ma_tfic_class = None

                if so.match(csq.ctypes, so.ONCODRIVEFM):
                    uniprot, protein = csq.uniprot, csq.protein

                if so.match(csq.ctypes, so.NON_SYNONYMOUS):
                    protein_pos, aa_change = csq.protein_pos, csq.aa_change
                    sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
                    pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
                    ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

                tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript,
                               ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                               uniprot, protein, protein_pos, aa_change,
                               sift_score, sift_tfic, sift_tfic_class,
                               pph2_score, pph2_tfic, pph2_tfic_class,
                               ma_score, ma_tfic, ma_tfic_class,
                               csq.impact, TransFIC.class_name(csq.impact), null_value=null_value)
                count += 1

        log.info(" {0} consequences".format(count))

        log.info("Exporting genes ...")

        with open_dataset(project_id, project_path, datasets_path, "gene", "w", log) as gf:
            tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
            tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                           "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                           "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")

            for gene in projdb.genes(join_rec=True):
                rec = gene.rec

                # Only genes affected in at least one sample are exported
                if rec.sample_freq is None or rec.sample_freq == 0:
                    continue

                intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

                tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                               gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue,
                               gene.clust_exc_cause, gene.clust_coords,
                               rec.sample_freq or 0, rec.sample_prop or 0,
                               intogen_driver, null_value=null_value)

        log.info("Exporting pathways ...")

        with open_dataset(project_id, project_path, datasets_path, "pathway", "w", log) as pf:
            tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
            tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                           "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

            for pathway in projdb.pathways(join_rec=True):
                rec = pathway.rec

                # Only pathways affected in at least one sample are exported
                if rec.sample_freq is None or rec.sample_freq == 0:
                    continue

                tsv.write_line(pf, pathway.id, pathway.gene_count,
                               pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
                               rec.sample_freq or 0, rec.sample_prop or 0,
                               rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0,
                               null_value=null_value)

        if not config.skip_oncodrivefm:
            log.info("Exporting genes per sample functional impact ...")

            with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
                tsv.write_line(f, "GENE_ID", "SAMPLE",
                               "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                               "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                               "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

                for fields in projdb.sample_gene_fimpacts():
                    (gene, sample,
                     sift_score, sift_tfic, sift_tfic_class,
                     pph2_score, pph2_tfic, pph2_tfic_class,
                     ma_score, ma_tfic, ma_tfic_class) = fields

                    tsv.write_line(f, gene, sample,
                                   sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                                   pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                                   ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class),
                                   null_value=null_value)
    finally:
        # The nesting guarantees that an error closing the project database
        # still lets the signatures database be closed
        try:
            if projdb is not None:
                projdb.close()
        finally:
            sigdb.close()

    log.info("Saving project configuration ...")

    projres = ProjectResults(project)

    with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
        names = ["ASSEMBLY", "SAMPLES_TOTAL"]
        values = [project["assembly"], total_samples]
        names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"],
                                                        names=names, values=values)
        tsv.write_line(f, *names)
        tsv.write_line(f, *values, null_value=null_value)

    projects_port = task.ports("projects_out")
    projects_port.send(project)
def fimpact_run(partition):
    """
    Calculate the functional impact of the consequences of a partition.

    The Mutation Assessor results (partition["ma_path"]) and the VEP results
    (partition["vep_path"]) are read and one line per consequence is written
    into "<base_path>/<index>.tfi" with the protein change, the coding region
    flag, the impact and, for each predictor (SIFT, PolyPhen2 and Mutation
    Assessor), its score, TransFIC value, TransFIC class and impact. The path
    is stored in partition["tfi_path"] and the partition is sent through the
    "results" port.

    TransFIC is only calculated for consequences matching so.NON_SYNONYMOUS.
    The other consequence types get fixed impacts and scores.

    :param partition: dict with the keys "project", "index", "base_path",
                      "ma_path" and "vep_path"
    """
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    ma_uniprot = {}
    ma_scores = {}

    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=paths.data_transfic_path())

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

    with open(tfi_path, "w") as cf:
        with open(partition["vep_path"], "r") as f:
            for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
                (var_id, gene, transcript, ct,
                 protein_pos, aa_change, protein,
                 sift_score, pph2_score) = fields

                ct = (ct or "").split(",")

                # Invert sift score
                if sift_score is not None:
                    sift_score = 1.0 - sift_score

                ma_score = None

                uniprot = ma_uniprot.get(var_id)

                # None means not assigned; the transcript impact falls back to
                # TransFIC.UNKNOWN_IMPACT_CLASS in that case
                sift_impact = pph2_impact = ma_impact = None

                coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

                sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

                # Remembered to build the protein change further down
                ct_type = None
                if so.match(ct, so.NON_SYNONYMOUS):  # missense
                    ct_type = TransFIC.CT_NON_SYNONYMOUS
                    ma_score = ma_scores.get(var_id)

                    (sift_tfic, sift_class,
                     pph2_tfic, pph2_class,
                     ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

                    sift_impact = sift_class if sift_class in IMPACT_CLASSES else sift_impact
                    pph2_impact = pph2_class if pph2_class in IMPACT_CLASSES else pph2_impact
                    ma_impact = ma_class if ma_class in IMPACT_CLASSES else ma_impact
                elif so.match(ct, so.STOP):  # stop
                    sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.FRAMESHIFT):  # frameshift
                    # Without this the "fs" protein change below was never generated
                    ct_type = TransFIC.CT_FRAMESHIFT
                    sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.SPLICE_JUNCTION):  # splice junction
                    # Without this the "r.spl?" protein change below was never generated
                    ct_type = "splice"
                    sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.SPLICE_REGION):  # splice region
                    ct_type = "splice"
                    sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.SYNONYMOUS):  # synonymous
                    sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                    sift_score = pph2_score = 0.0
                    ma_score = -2
                else:
                    sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

                # try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
                prot_change = None
                if ct_type == TransFIC.CT_FRAMESHIFT:
                    if protein_pos is None:
                        prot_change = "fs"
                    else:
                        prot_change = "fs {0}".format(protein_pos)
                elif ct_type == "splice":
                    prot_change = "r.spl?"
                elif protein_pos is not None and aa_change is not None:
                    rc = ReContext()
                    if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                        # A missing second group is written as "="
                        prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                    elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                        prot_change = "{0} {1}".format(aa_change, protein_pos)
                    else:
                        log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                            gene, protein, protein_pos, aa_change, ", ".join(ct)))

                # The impact is taken by trust priority: ma, pph2, sift
                tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

                tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
                               sift_score, sift_tfic, sift_class, sift_impact,
                               pph2_score, pph2_tfic, pph2_class, pph2_impact,
                               ma_score, ma_tfic, ma_class, ma_impact,
                               null_value="-")

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    results_port.send(partition)
def fimpact_run(partition):
    """
    Calculate the functional impact of the consequences of a partition.

    The Mutation Assessor results (partition["ma_path"]) and the VEP results
    (partition["vep_path"]) are read and two files are written:

    - "<base_path>/<index>.tfi": one line per consequence with the scores,
      TransFIC values and TransFIC classes of SIFT, PolyPhen2 and Mutation
      Assessor, plus the impact of the consequence.
    - "<base_path>/<index>.gfi": one line per (variant, gene) with the
      aggregated impact, the coding region flag and the protein changes.

    Both paths are stored in the partition ("tfi_path" and "gfi_path"), which
    is then sent through the "results" port. The output files are closed even
    when the calculation fails.

    :param partition: dict with the keys "project", "index", "base_path",
                      "ma_path" and "vep_path"
    """
    log = task.logger
    conf = task.conf

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    ma_uniprot = {}
    ma_scores = {}

    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=os.path.join(conf["data_path"], "TransFIC"))

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

    # (variant id, gene) -> attributes aggregated from all its consequences
    aff_gene_attrs = {}

    with open(tfi_path, "w") as cf:
        with open(partition["vep_path"], "r") as f:
            for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
                (var_id, gene, transcript, ct,
                 protein_pos, aa_change, protein,
                 sift_score, pph2_score) = fields

                ct = ct.split(",") if ct is not None else []

                # Invert sift score
                if sift_score is not None:
                    sift_score = 1.0 - sift_score

                ma_score = None

                uniprot = ma_uniprot.get(var_id)

                # None means not assigned yet; it may be taken afterwards from
                # the class calculated by TransFIC
                sift_impact = pph2_impact = ma_impact = None

                coding_region = so.match(ct, so.CODING_REGION)

                calculate_transfic = True

                ct_type = None
                if so.match(ct, so.NON_SYNONYMOUS):  # missense
                    ct_type = TransFIC.CT_NON_SYNONYMOUS
                    ma_score = ma_scores.get(var_id)
                elif so.match(ct, so.STOP):  # stop
                    ct_type = TransFIC.CT_STOP
                    sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.FRAMESHIFT):  # frameshift
                    ct_type = TransFIC.CT_FRAMESHIFT
                    sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.SPLICE):  # splice
                    ct_type = "splice"
                    if so.match(ct, so.SPLICE_JUNCTION):
                        sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    else:
                        sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
                    calculate_transfic = False
                elif so.match(ct, so.SYNONYMOUS):  # synonymous
                    ct_type = TransFIC.CT_SYNONYMOUS
                    sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                    sift_score = pph2_score = 0.0
                    ma_score = -2
                else:
                    sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                    calculate_transfic = False

                if calculate_transfic:
                    (sift_tfic, sift_class,
                     pph2_tfic, pph2_class,
                     ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

                    # if the impact was not preassigned get it from the transFIC calculated class
                    sift_impact = sift_class if sift_impact is None and sift_class in IMPACT_CLASSES else sift_impact
                    pph2_impact = pph2_class if pph2_impact is None and pph2_class in IMPACT_CLASSES else pph2_impact
                    ma_impact = ma_class if ma_impact is None and ma_class in IMPACT_CLASSES else ma_impact
                else:
                    sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

                aff_gene = (var_id, gene)

                # update aggregated impact for all the predictors
                update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
                update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
                update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

                # update whether the affected gene is a coding region or not
                update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
                            update=lambda prev_value, value: prev_value or value)

                # aggregate protein changes per affected_gene
                # try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
                prot_change = None
                if ct_type == TransFIC.CT_FRAMESHIFT:
                    if protein_pos is None:
                        prot_change = "fs"
                    else:
                        prot_change = "fs {0}".format(protein_pos)
                elif ct_type == "splice":
                    prot_change = "r.spl?"
                elif protein_pos is not None and aa_change is not None:
                    rc = ReContext()
                    if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                        # A missing second group is written as "="
                        prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                    elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                        prot_change = "{0} {1}".format(aa_change, protein_pos)
                    else:
                        log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                            gene, protein, protein_pos, aa_change, ", ".join(ct)))

                if prot_change is not None:
                    update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
                                new=lambda value: {value},
                                update=lambda prev_value, value: prev_value | {value})

                # The impact is taken by trust priority: ma, pph2, sift
                impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

                tsv.write_line(cf, var_id, transcript, uniprot,
                               sift_score, sift_tfic, sift_class,
                               pph2_score, pph2_tfic, pph2_class,
                               ma_score, ma_tfic, ma_class,
                               impact, null_value="-")

    log.info("Saving variant impacts ...")

    gfi_path = os.path.join(partition["base_path"], "{0:08d}.gfi".format(partition["index"]))

    with open(gfi_path, "w") as vf:
        for aff_gene, attrs in aff_gene_attrs.items():
            var_id, gene = aff_gene

            # get the impact by trust priority: ma, pph2, sift
            impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = 1 if attrs.get("coding_region", False) else 0

            prot_changes = attrs.get("prot_changes")
            prot_changes = ",".join(prot_changes) if prot_changes is not None else None

            tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    partition["gfi_path"] = gfi_path
    results_port.send(partition)