def projects(project):
    """Compute and store the recurrences of a single project.

    Recurrences are always computed for the affected (variant) genes and,
    unless the ``variants_only`` configuration flag is set, also for genes
    and pathways. The changes are committed and the project is forwarded
    through the ``projects_out`` port.

    A project without affected samples is skipped: a warning is logged and
    nothing is sent downstream.

    :param project: project descriptor; the keys ``id`` and ``db`` are read.
    """
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    projdb = ProjectDb(project["db"])
    try:
        total_samples = projdb.get_total_affected_samples()

        if total_samples == 0:
            log.warn("There are no samples, recurrences cannot be calculated.")
            return

        log.info("Calculating project recurrences for variant genes ...")
        projdb.compute_affected_genes_recurrences(total_samples)

        if not conf.get("variants_only", False):
            log.info("Calculating project recurrences for genes ...")
            projdb.compute_gene_recurrences(total_samples)

            log.info("Calculating project recurrences for pathways ...")
            projdb.compute_pathway_recurrences(total_samples)

        projdb.commit()
    finally:
        # Release the database handle on every path, including failures
        # raised by any of the computations above.
        projdb.close()

    projects_out_port.send(project)
def ma_run(partition):
    """Query Mutation Assessor for the non-synonymous variants of a partition.

    The variant identifiers are read from the partition VEP file (first
    column is the variant id, fourth column the comma separated consequence
    types). For every variant whose consequence types match
    ``so.NON_SYNONYMOUS`` the Mutation Assessor backend is queried and the
    hits are written to ``<base_path>/<index>.ma``.

    The query is skipped when the results file already exists and the
    ``consequences_overwrite`` configuration flag is explicitly false.

    :param partition: partition descriptor; the keys ``project``, ``index``,
        ``base_path`` and ``vep_path`` are read and ``ma_path`` is set before
        the partition is sent through the ``results`` port.
    """
    log = task.logger
    conf = task.conf

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    if conf["offline"] == "yes":
        log.info("Running Mutation assessor in local mode.")
        ma = MaLocal(conf["ma_cache_path"])
    else:
        log.info("Running Mutation assessor using web services.")
        ma = MaService(project["assembly"], cache_path=os.path.join(conf["cache_path"], "ma.db"))

    try:
        results_path = os.path.join(partition["base_path"], "{0:08d}.ma".format(partition["index"]))

        if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):

            log.info("Querying Mutation assessor for 'missense_variant' consequences ...")

            projdb = ProjectDb(project["db"])
            try:
                # Collect the ids first so every variant is queried only once
                # even if it has several matching consequences.
                missense_variants = set()
                with open(partition["vep_path"], "r") as f:
                    for line in f:
                        fields = line.rstrip().split("\t")
                        var_id = int(fields[0])
                        ctypes = fields[3].split(",")
                        if so.match(ctypes, so.NON_SYNONYMOUS):
                            missense_variants.add(var_id)

                with open(results_path, "w") as mf:
                    for var_id in missense_variants:
                        var = projdb.get_variant(var_id)

                        start, end, ref, alt = var_to_tab(var)

                        r = ma.get(var.chr, var.strand, start, ref, alt, var_id)
                        if r is not None:
                            tsv.write_line(mf, var_id, r.uniprot, r.fi_score, null_value="-")
            finally:
                projdb.close()
        else:
            log.warn("Skipping MA, results already exist.")
            log.debug("MA results: {0}".format(results_path))
    finally:
        # The backend is closed even when the query fails half way.
        ma.close()

    # Send results to the next module
    partition["ma_path"] = results_path
    results_port.send(partition)
def datasets(projects_set):
    """Append the summary row of a group of projects to its combination file.

    The total number of affected samples is accumulated over all the
    projects of the group and a row with the group names, that total and the
    comma separated project ids is appended to
    ``<combination_path>/<normalized classifier id>.tsv``. The header line is
    written first when the file does not exist yet.

    :param projects_set: ``(classifier, projects)`` pair; ``classifier`` is a
        dict from which ``id``, ``name`` and the ``group_*name`` keys are
        read, ``projects`` is an iterable of project descriptors with the
        keys ``id`` and ``db``.
    """
    log = task.logger
    conf = task.conf

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    group_file_prefix = normalize_id(classifier_id)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Reading number of samples per project ...")

    project_ids = []
    total_samples = 0

    for project in projects:
        project_id = project["id"]
        project_ids.append(project_id)

        log.info(" Project {0}".format(project_id))

        projdb = ProjectDb(project["db"])
        try:
            num_samples = projdb.get_total_affected_samples()
        finally:
            projdb.close()

        total_samples += num_samples

        log.debug(" {0} samples".format(num_samples))

    log.debug(" {0} samples in total".format(total_samples))

    log.info("Updating ...")

    combination_path = get_combination_path(conf)

    path = os.path.join(combination_path, "{0}.tsv".format(group_file_prefix))

    # Append mode creates the file when missing, so a single open is enough
    # for both the header and the data row.
    write_header = not os.path.exists(path)

    with open(path, "a") as f:
        if write_header:
            tsv.write_line(f, "NAME", "SHORT_NAME", "LONG_NAME", "SAMPLES_TOTAL", "PROJECT_IDS")

        tsv.write_line(f, group_name, group_short_name, group_long_name, total_samples, ",".join(project_ids))
def update_db(project):
    """Load the OncodriveCLUST results of a project into its database.

    The ``oncodriveclust`` entry is removed from the project descriptor. If
    its results file does not exist a warning is logged and the project is
    NOT forwarded. Otherwise the excluded gene causes and the per gene
    results are stored, the changes are committed and the project is sent
    through the ``projects_out`` port.

    :param project: project descriptor; the keys ``id``, ``db``,
        ``temp_path`` and ``oncodriveclust`` (a dict with ``results``) are
        read.
    """
    log = task.logger

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    oclust = project["oncodriveclust"]
    del project["oncodriveclust"]

    if not os.path.exists(oclust["results"]):
        log.warn("No results have been found. Skipping it.")
        return

    log.info("Updating the project database ...")

    projdb = ProjectDb(project["db"])
    try:
        exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

        log.info(" Excluded gene causes ...")
        log.debug(" > {0}".format(exc_path))

        count = 0
        with tsv.open(exc_path, "r") as exf:
            for gene, cause in tsv.lines(exf, (str, str), header=True):
                projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
                count += 1

        log.debug(" {0} genes excluded".format(count))

        log.info(" OncodriveCLUST results ...")

        with tsv.open(oclust["results"], "r") as f:
            types = (str, str, float, float, float)
            columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")
            for gene, coords, zscore, pvalue, qvalue in tsv.lines(f, types, columns=columns, header=True, null_value="NA"):
                # Genes that have results overwrite any exclusion cause
                # stored by the previous step.
                projdb.update_gene(Gene(id=gene, clust_coords=coords, clust_zscore=zscore,
                                        clust_pvalue=pvalue, clust_qvalue=qvalue,
                                        clust_exc_cause=ProjectDb.NO_GENE_EXC))

        projdb.commit()
    finally:
        # Close the handle even when one of the files cannot be parsed.
        projdb.close()

    projects_out_port.send(project)
def variants(project):
    """Save the variant counts of a project as quality control data.

    The counts returned by ``ProjectDb.count_variants()`` are stored under
    the ``variants`` quality control key of the project results.

    :param project: project descriptor; the keys ``id`` and ``db`` are read.
    """
    log = task.logger

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    log.info("Calculating number of variants processed in each step ...")

    proj_res = ProjectResults(project)

    projdb = ProjectDb(project["db"])
    try:
        counts = projdb.count_variants()
        proj_res.save_quality_control("variants", counts)
    finally:
        # Do not leak the database handle if counting or saving fails.
        projdb.close()
def oncodriveclust(project):
    """Compute and save the OncodriveCLUST quality control of a project.

    Genes and samples are indexed in order of first appearance while the
    consequences of the project are scanned. The function then counts how
    many of them survive each step (selected, gene filter, samples
    threshold), counts the genes below a fixed list of q-value cut-offs and
    saves everything under the ``oncodriveclust`` quality control key.

    :param project: project descriptor; the keys ``id`` and ``db`` are read.
    """
    log = task.logger
    conf = task.conf

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    source_genes = {}  # gene id --> gene index
    syn_genes = set()
    selected_genes = set()
    filter_genes = set()
    threshold_genes = set()

    source_samples = {}  # sample name --> sample index
    selected_samples = set()
    filter_samples = set()
    threshold_samples = set()

    selected_gene_sample_count = {}  # number of samples for each selected gene
    filter_gene_sample_count = {}  # number of samples per each gene passing the filter

    # get configuration

    samples_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

    log.info("Retrieving gene alterations ...")

    data = set()  # (gene id, sample index) pairs of the selected consequences

    projdb = ProjectDb(project["db"])
    try:
        for csq in projdb.consequences(join_samples=True):
            is_selected = so.match(csq.ctypes, so.PROTEIN_AFFECTING)
            is_synonymous = so.match(csq.ctypes, so.SYNONYMOUS)

            # Always look the index up: assigning it only for unseen genes
            # would reuse the index of the previous consequence whenever a
            # gene shows up again.
            gene_index = source_genes.setdefault(csq.gene, len(source_genes))

            if is_selected:
                selected_genes.add(gene_index)

            if is_synonymous:
                syn_genes.add(gene_index)

            for sample in csq.var.samples:
                # Same as above for samples already seen.
                sample_index = source_samples.setdefault(sample.name, len(source_samples))

                if is_selected:
                    selected_samples.add(sample_index)
                    data.add((csq.gene, sample_index))
    finally:
        projdb.close()

    log.info("Counting selected, filtered and threshold ...")

    # calculate selected and filter counts

    data2 = set()  # (gene index, sample index) pairs passing the gene filter

    for gene, sample_index in data:
        gene_index = source_genes[gene]
        selected_gene_sample_count[gene_index] = selected_gene_sample_count.get(gene_index, 0) + 1

        if filt.valid(gene):
            data2.add((gene_index, sample_index))
            filter_genes.add(gene_index)
            filter_samples.add(sample_index)
            filter_gene_sample_count[gene_index] = filter_gene_sample_count.get(gene_index, 0) + 1

    # calculate threshold counts

    for gene_index, sample_index in data2:
        if selected_gene_sample_count[gene_index] >= samples_threshold:
            threshold_genes.add(gene_index)
            threshold_samples.add(sample_index)

    log.info("Counting significant genes ...")

    # significance of q-values

    sig_thresholds = [0.0, 0.001, 0.005] + [i / 100.0 for i in range(1, 11)] + [1.0]
    sig_count = [0] * len(sig_thresholds)

    projdb = ProjectDb(project["db"])
    try:
        for gene in projdb.genes():
            if gene.id not in source_genes or source_genes[gene.id] not in threshold_genes:
                continue

            # NOTE(review): this reads the OncodriveFM q-value although the
            # report is about OncodriveCLUST; confirm whether clust_qvalue
            # was intended.
            qvalue = gene.fm_qvalue
            if qvalue is None:
                # A gene without q-value can not be significant. Comparing
                # None with a float would count it at every cut-off on
                # Python 2 and raise TypeError on Python 3.
                continue

            # Index of the first cut-off that is >= the q-value; the gene
            # counts for that cut-off and for all the larger ones.
            i = 0
            while i < len(sig_thresholds) and qvalue > sig_thresholds[i]:
                i += 1

            for j in range(i, len(sig_count)):
                sig_count[j] += 1
    finally:
        projdb.close()

    source_genes_count = len(source_genes)
    syn_genes_count = len(syn_genes)
    selected_genes_count = len(selected_genes)
    filter_genes_count = len(filter_genes)
    threshold_genes_count = len(threshold_genes)

    source_samples_count = len(source_samples)
    selected_samples_count = len(selected_samples)
    filter_samples_count = len(filter_samples)
    threshold_samples_count = len(threshold_samples)

    # Genes passing the filter, most recurrent first
    sorted_filter_genes = sorted(filter_genes, reverse=True, key=lambda gi: filter_gene_sample_count[gi])

    qc_data = dict(
        source=dict(
            genes=sorted(source_genes.keys(), key=lambda k: source_genes[k]),
            genes_count=source_genes_count,
            genes_lost_count=max(0, source_genes_count - syn_genes_count - threshold_genes_count),
            samples=sorted(source_samples.keys(), key=lambda k: source_samples[k]),
            samples_count=source_samples_count,
        ),
        # NOTE(review): this key lives at the top level and not inside
        # "source" like genes_lost_count does; kept as is because the
        # consumers of the report are not visible from here.
        samples_lost_count=max(0, source_samples_count - threshold_samples_count),
        synonymous=dict(
            genes=sorted(syn_genes),
            genes_count=syn_genes_count,
            ratio=(float(syn_genes_count) / selected_genes_count) if selected_genes_count > 0 else 0,
        ),
        selected=dict(
            genes=sorted(selected_genes),
            genes_count=selected_genes_count,
            genes_lost=sorted(set(source_genes.values()) - syn_genes - selected_genes),
            genes_lost_count=max(0, source_genes_count - syn_genes_count - selected_genes_count),
            samples=sorted(selected_samples),
            samples_count=selected_samples_count,
            samples_lost=sorted(set(source_samples.values()) - selected_samples),
            samples_lost_count=max(0, source_samples_count - selected_samples_count),
        ),
        filter=dict(
            genes=sorted_filter_genes,
            genes_count=filter_genes_count,
            genes_lost=sorted(selected_genes - filter_genes),
            genes_lost_count=max(0, selected_genes_count - filter_genes_count),
            genes_sample_count=[filter_gene_sample_count[gene_index] for gene_index in sorted_filter_genes],
            samples=sorted(filter_samples),
            samples_count=filter_samples_count,
            samples_lost=sorted(selected_samples - filter_samples),
            samples_lost_count=max(0, selected_samples_count - filter_samples_count),
        ),
        threshold=dict(
            genes=sorted(threshold_genes),
            genes_count=threshold_genes_count,
            genes_lost=sorted(filter_genes - threshold_genes),
            genes_lost_count=max(0, filter_genes_count - threshold_genes_count),
            samples=sorted(threshold_samples),
            samples_count=threshold_samples_count,
            samples_threshold=samples_threshold,
            samples_lost=sorted(filter_samples - threshold_samples),
            samples_lost_count=max(0, filter_samples_count - threshold_samples_count),
        ),
        # The 0.0 cut-off is only a sentinel and it is not reported
        results=dict(sig_thresholds=sig_thresholds[1:], sig_count=sig_count[1:]),
    )

    project_results = ProjectResults(project)
    project_results.save_quality_control("oncodriveclust", qc_data)
def quality_control(log, conf, project, filt):
    """Build the OncodriveFM quality control report of a project.

    Every (sample, gene) pair found in the consequences of the project is
    flagged as selected when at least one of its consequences matches
    ``so.ONCODRIVEFM``. The function then counts the genes and samples that
    survive each step (selected, gene filter, samples threshold) and the
    genes below a fixed list of q-value cut-offs.

    :param log: logger handed over to ``get_threshold``.
    :param conf: configuration handed over to ``get_threshold``.
    :param project: project descriptor; the key ``db`` is read.
    :param filt: gene filter exposing ``valid(gene)``, or ``None`` to accept
        every gene.
    :return: dict with the sections ``source``, ``selected``, ``filter``,
        ``threshold`` and ``results`` plus a top level ``samples_lost_count``.
    """
    data = {}  # (sample id, gene id) --> is selected ?

    projdb = ProjectDb(project["db"])
    try:
        for csq in projdb.consequences(join_samples=True, join_ctypes=True):
            is_selected = so.match(csq.ctypes, so.ONCODRIVEFM)
            for sample in csq.var.samples:
                key = (sample.id, csq.gene)
                # A pair is selected as soon as one consequence is selected
                data[key] = data.get(key, False) or is_selected
    finally:
        projdb.close()

    source_genes = {}  # gene id --> gene index
    selected_genes = set()
    filter_genes = set()
    threshold_genes = set()

    selected_gene_sample_count = {}  # number of samples for each selected gene
    filter_gene_sample_count = {}  # number of samples per gene

    source_samples = {}  # sample id --> sample index
    selected_samples = set()
    filter_samples = set()
    threshold_samples = set()

    # First pass: index the samples and count samples per selected gene; the
    # threshold below depends on the number of selected samples.
    for (sample, gene), is_selected in data.items():
        sample_index = source_samples.setdefault(sample, len(source_samples))

        if is_selected:
            selected_samples.add(sample_index)
            increment(selected_gene_sample_count, gene)

    samples_threshold = get_threshold(log, conf, project,
                                      "oncodrivefm.genes.threshold", ONCODRIVEFM_GENES_THRESHOLD,
                                      len(selected_samples))

    # Second pass: index the genes and collect the samples passing the
    # filter and the threshold.
    for (sample, gene), is_selected in data.items():
        if gene not in source_genes:
            source_genes[gene] = len(source_genes)

        gi = source_genes[gene]
        sample_index = source_samples[sample]

        if not is_selected:
            continue

        if filt is None or filt.valid(gene):
            filter_samples.add(sample_index)
            increment(filter_gene_sample_count, gi)
            if selected_gene_sample_count[gene] >= samples_threshold:
                threshold_samples.add(sample_index)

    for gene, sample_count in selected_gene_sample_count.items():
        gi = source_genes[gene]
        selected_genes.add(gi)
        if filt is None or filt.valid(gene):
            filter_genes.add(gi)
            if sample_count >= samples_threshold:
                threshold_genes.add(gi)

    # significance of q-values

    sig_thresholds = [0.0, 0.001, 0.005] + [i / 100.0 for i in range(1, 11)] + [1.0]
    sig_count = [0] * len(sig_thresholds)

    projdb = ProjectDb(project["db"])
    try:
        for gene in projdb.genes():
            if gene.id not in source_genes or source_genes[gene.id] not in threshold_genes:
                continue

            qvalue = gene.fm_qvalue
            if qvalue is None:
                # A gene without q-value can not be significant. Comparing
                # None with a float would count it at every cut-off on
                # Python 2 and raise TypeError on Python 3.
                continue

            # Index of the first cut-off that is >= the q-value; the gene
            # counts for that cut-off and for all the larger ones.
            i = 0
            while i < len(sig_thresholds) and qvalue > sig_thresholds[i]:
                i += 1

            for j in range(i, len(sig_count)):
                sig_count[j] += 1
    finally:
        projdb.close()

    source_samples_count = len(source_samples)
    selected_samples_count = len(selected_samples)
    filter_samples_count = len(filter_samples)
    threshold_samples_count = len(threshold_samples)

    source_genes_count = len(source_genes)
    selected_genes_count = len(selected_genes)
    filter_genes_count = len(filter_genes)
    threshold_genes_count = len(threshold_genes)

    # Genes passing the filter, most recurrent first
    sorted_filter_genes = sorted(filter_genes, reverse=True, key=lambda gi: filter_gene_sample_count[gi])

    qc_data = dict(
        source=dict(
            genes=sorted(source_genes.keys(), key=lambda k: source_genes[k]),
            genes_count=source_genes_count,
            genes_lost_count=max(0, source_genes_count - threshold_genes_count),
            samples=sorted(source_samples.keys(), key=lambda k: source_samples[k]),
            samples_count=source_samples_count),
        # NOTE(review): this key lives at the top level and not inside
        # "source" like genes_lost_count does; kept as is because the
        # consumers of the report are not visible from here.
        samples_lost_count=max(0, source_samples_count - threshold_samples_count),
        selected=dict(
            genes=sorted(selected_genes),
            genes_count=selected_genes_count,
            genes_lost=sorted(set(source_genes.values()) - selected_genes),
            genes_lost_count=max(0, source_genes_count - selected_genes_count),
            samples=sorted(selected_samples),
            samples_count=selected_samples_count,
            samples_lost=sorted(set(source_samples.values()) - selected_samples),
            samples_lost_count=max(0, source_samples_count - selected_samples_count)),
        filter=dict(
            genes=sorted_filter_genes,
            genes_count=filter_genes_count,
            genes_lost=sorted(selected_genes - filter_genes),
            genes_lost_count=max(0, selected_genes_count - filter_genes_count),
            genes_sample_count=[filter_gene_sample_count[gi] for gi in sorted_filter_genes],
            samples=sorted(filter_samples),
            samples_count=filter_samples_count,
            samples_lost=sorted(selected_samples - filter_samples),
            samples_lost_count=max(0, selected_samples_count - filter_samples_count)),
        threshold=dict(
            genes=sorted(threshold_genes),
            genes_count=threshold_genes_count,
            genes_lost=sorted(filter_genes - threshold_genes),
            genes_lost_count=max(0, filter_genes_count - threshold_genes_count),
            samples=sorted(threshold_samples),
            samples_count=threshold_samples_count,
            samples_threshold=samples_threshold,
            samples_lost=sorted(filter_samples - threshold_samples),
            samples_lost_count=max(0, filter_samples_count - threshold_samples_count)),
        # The 0.0 cut-off is only a sentinel and it is not reported
        results=dict(
            sig_thresholds=sig_thresholds[1:],
            sig_count=sig_count[1:])
    )

    return qc_data
def pack_results(project):
    """Export the results of a project into ``<project path>/results.zip``.

    The archive gets one tab separated member per table: variant genes,
    variant samples, consequences, genes, pathways, the per sample and gene
    functional impacts (unless ``skip_oncodrivefm`` is configured) and the
    project configuration.

    :param project: project descriptor; the keys ``id``, ``path``, ``db``,
        ``assembly`` and ``annotations`` are read.
    """
    log = task.logger
    conf = task.conf

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    dest_path = os.path.join(project["path"], "results.zip")

    sigdb = SigDb(conf["sigdb_path"])
    sigdb.open()

    # Everything opened from here on is released by the single finally
    # clause below, including failures that happen before the archive exists.
    projdb = None
    arc = None
    try:
        projdb = ProjectDb(project["db"])

        projres = ProjectResults(project)

        gene_sym = projdb.get_gene_symbols()

        total_samples = projdb.get_total_affected_samples()

        log.info("Compressing files ...")

        arc = Archive(dest_path, mode="w", fmt="zip")

        log.info(" Variant genes ...")

        with ArcFile(task, arc, project_id, "variant_genes", "w") as vf:
            write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "SYMBOL",
                       "VAR_IMPACT", "VAR_IMPACT_DESC", "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

            for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
                var = afg.var
                rec = afg.rec

                start, end, ref, alt = var_to_tab(var)

                # Copy the references so the variant itself is not modified
                xrefs = list(var.xrefs)
                if sigdb.exists_variant(var.chr, start):
                    xrefs += ["I:1"]
                xrefs = ",".join(xrefs)

                intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

                write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
                           afg.gene_id, gene_sym.get(afg.gene_id),
                           afg.impact, TransFIC.class_name(afg.impact),
                           rec.sample_freq or 0, total_samples, rec.sample_prop or 0,
                           afg.coding_region, afg.prot_changes, intogen_driver, xrefs)

        log.info(" Variant samples ...")

        with ArcFile(task, arc, project_id, "variant_samples", "w") as vf:
            write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLES")

            for var in projdb.variants(join_samples=True):
                start, end, ref, alt = var_to_tab(var)
                write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
                           ",".join([s.name for s in var.samples]))

        log.info(" Consequences ...")

        with ArcFile(task, arc, project_id, "consequences", "w") as cf:
            write_line(cf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT",
                       "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                       "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                       "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                       "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                       "IMPACT", "IMPACT_CLASS")

            for csq in projdb.consequences(join_variant=True):
                var = csq.var
                start, end, ref, alt = var_to_tab(var)
                allele = "{0}/{1}".format(ref, alt)

                # Protein level fields stay empty unless the consequence
                # types make them meaningful.
                uniprot = protein = protein_pos = aa_change = None
                sift_score = sift_tfic = sift_tfic_class = None
                pph2_score = pph2_tfic = pph2_tfic_class = None
                ma_score = ma_tfic = ma_tfic_class = None

                if so.match(csq.ctypes, so.ONCODRIVEFM):
                    uniprot, protein = csq.uniprot, csq.protein

                if so.match(csq.ctypes, so.NON_SYNONYMOUS):
                    protein_pos, aa_change = csq.protein_pos, csq.aa_change
                    sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
                    pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
                    ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

                write_line(cf, project_id, var.chr, var.strand, start, allele,
                           csq.transcript, ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                           uniprot, protein, protein_pos, aa_change,
                           sift_score, sift_tfic, sift_tfic_class,
                           pph2_score, pph2_tfic, pph2_tfic_class,
                           ma_score, ma_tfic, ma_tfic_class,
                           csq.impact, TransFIC.class_name(csq.impact))

        log.info(" Genes ...")

        with ArcFile(task, arc, project_id, "genes", "w") as gf:
            write_line(gf, "PROJECT_ID", "GENE_ID", "SYMBOL", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                       "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                       "INTOGEN_DRIVER", "XREFS")

            for gene in projdb.genes(join_xrefs=True, join_rec=True):
                # Only the genes altered in at least one sample are exported
                if gene.rec.sample_freq is not None and gene.rec.sample_freq > 0:
                    intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0
                    write_line(gf, project_id, gene.id, gene.symbol,
                               gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                               gene.rec.sample_freq, total_samples, gene.rec.sample_prop or 0,
                               gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue,
                               gene.clust_exc_cause, gene.clust_coords,
                               intogen_driver, ",".join(gene.xrefs))

        log.info(" Pathways ...")

        with ArcFile(task, arc, project_id, "pathways", "w") as pf:
            write_line(pf, "PROJECT_ID", "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                       "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

            for pathway in projdb.pathways(join_rec=True):
                # Only the pathways altered in at least one sample are exported
                if pathway.rec.sample_freq is not None and pathway.rec.sample_freq > 0:
                    write_line(pf, project_id, pathway.id, pathway.gene_count,
                               pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
                               pathway.rec.sample_freq or 0, total_samples, pathway.rec.sample_prop or 0,
                               pathway.rec.gene_freq or 0, pathway.gene_count, pathway.rec.gene_prop or 0)

        skip_oncodrivefm = conf.get("skip_oncodrivefm", False, dtype=bool)

        if not skip_oncodrivefm:
            log.info(" Genes per sample functional impact ...")

            with ArcFile(task, arc, project_id, "fimpact.gitools.tdm", "w") as f:
                write_line(f, "SAMPLE", "GENE_ID",
                           "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                           "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                           "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

                for fields in projdb.sample_gene_fimpacts():
                    (gene, sample,
                     sift_score, sift_tfic, sift_tfic_class,
                     pph2_score, pph2_tfic, pph2_tfic_class,
                     ma_score, ma_tfic, ma_tfic_class) = fields

                    # The database yields (gene, sample, ...) while the file
                    # has the sample in the first column.
                    write_line(f, sample, gene,
                               sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                               pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                               ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class))

        log.info("Saving project configuration ...")

        with ArcFile(task, arc, project_id, "project", "w") as f:
            names = ["PROJECT_ID", "ASSEMBLY", "SAMPLES_TOTAL"]
            values = [project_id, project["assembly"], total_samples]
            names, values = projres.get_annotations_to_save(conf, project["annotations"], names=names, values=values)
            tsv.write_line(f, *names)
            tsv.write_line(f, *values, null_value="-")
    finally:
        if arc is not None:
            arc.close()
        if projdb is not None:
            projdb.close()
        sigdb.close()
def prepare_files(project):
    """Write the OncodriveCLUST input files of a project.

    Two data files (non-synonymous and synonymous alterations) and a file
    with the genes excluded from the analysis and the cause of the exclusion
    are saved into the temporary path of the project. A copy of the project
    descriptor extended with an ``oncodriveclust`` entry is then sent through
    the ``projects_out`` port.

    :param project: project descriptor; the keys ``id``, ``db`` and
        ``temp_path`` are read. It is not modified.
    """
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    # Not used afterwards; the construction is kept in case it has side
    # effects (TODO confirm and remove otherwise).
    project_results = ProjectResults(project)

    mutations_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

    log.info("Loading transcripts CDS length ...")

    cds_len = load_cds_len(conf)

    log.info("Retrieving gene alterations ...")

    projdb = ProjectDb(project["db"])
    try:
        data = retrieve_data(projdb, cds_len)
    finally:
        projdb.close()

    data_paths = [
        os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
        os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

    log.info("Saving data ...")
    log.debug("> {0}".format(data_paths[NON_SYN]))
    log.debug("> {0}".format(data_paths[SYN]))

    gene_sample_count = {}  # number of samples per gene with non-synonymous alterations

    df = []
    try:
        # Opened one by one so the files already open get closed if a later
        # open fails.
        for path in data_paths:
            df.append(tsv.open(path, "w"))

        for key, value in data.items():
            findex, gene, sample = key
            transcript, transcript_len, protein_pos = value

            if findex == NON_SYN:
                gene_sample_count[gene] = gene_sample_count.get(gene, 0) + 1

                # NOTE(review): the gene filter is applied to the
                # non-synonymous rows only; confirm that synonymous rows are
                # meant to bypass it.
                if genes_filter_enabled and not filt.valid(gene):
                    continue

            tsv.write_line(df[findex], gene, sample, protein_pos)
    finally:
        for f in df:
            f.close()

    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))

    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if genes_filter_enabled and not filt.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < mutations_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if causes:
                # A gene may be excluded for both reasons; the cause codes
                # are concatenated without separator.
                tsv.write_line(exf, gene, "".join(causes))

    log.info("Sending project ...")

    projects_out_port.send(dict(project,
                                oncodriveclust=dict(
                                    data_paths=data_paths,
                                    mutations_threshold=mutations_threshold,
                                    genes_filter_enabled=genes_filter_enabled,  # not used
                                    genes_filter=genes_filter)))  # not used
def _run_oncodrivefm_combine(log, method, output_path, name, definitions, input_files, what):
    """Run ``oncodrivefm-combine`` and raise if it exits with an error.

    The command is executed from an argument list, without a shell, so
    names and paths containing quotes or spaces reach the tool unchanged.

    :param log: logger used to trace the command.
    :param method: value of the ``-m`` option.
    :param output_path: value of the ``-o`` option.
    :param name: value of the ``-n`` option.
    :param definitions: ``(key, value)`` pairs, each passed as ``-D key=value``.
    :param input_files: paths of the files to combine.
    :param what: description of the combined data used in the error message.
    :raises Exception: when the command returns a non zero exit code.
    """
    args = ["oncodrivefm-combine", "-m", method, "-o", output_path, "-n", name]
    for key, value in definitions:
        args += ["-D", "{0}={1}".format(key, value)]
    args += ["--output-format", "tsv.gz"]
    args += input_files

    # Readable form of the command, only used for logging and reporting
    cmd = " ".join(args)

    log.debug(cmd)

    ret_code = subprocess.call(args)
    if ret_code != 0:
        raise Exception("OncodriveFM error while combining {0}:\n{1}".format(what, cmd))


def combination_oncodrivefm(projects_set):
    """Combine the OncodriveFM results of a group of projects.

    The gene p-values and pathway z-scores of every project are exported to
    temporary files which are then combined with ``oncodrivefm-combine``
    (``median-empirical`` for genes, ``median-zscore`` for pathways). The
    temporary folder is removed when both combinations succeed.

    :param projects_set: ``(classifier, projects)`` pair; ``classifier`` is a
        dict from which ``id``, ``name``, ``group_values`` and the
        ``group_*name`` keys are read, ``projects`` is an iterable of project
        descriptors with the keys ``id`` and ``db``.
    :raises Exception: when any of the combinations fails.
    """
    log = task.logger
    conf = task.conf

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Exporting project data ...")

    base_path = make_temp_dir(task, suffix=".{0}".format(group_file_prefix))

    log.debug("> {0}".format(base_path))

    gene_files = []
    pathway_files = []

    for project in projects:
        project_id = project["id"]

        log.info(" Project {0}:".format(project_id))

        projdb = ProjectDb(project["db"])
        try:
            log.info(" Genes ...")

            count = 0
            file_path = os.path.join(base_path, "{0}-genes.tsv".format(project_id))
            gene_files.append(file_path)
            with open(file_path, "w") as f:
                tsv.write_param(f, "classifier", classifier_id)
                tsv.write_param(f, "group_id", group_name)
                tsv.write_param(f, "slice", project_id)
                tsv.write_line(f, "GENE_ID", "PVALUE")
                for gene in projdb.genes():
                    if gene.fm_pvalue is not None:
                        tsv.write_line(f, gene.id, gene.fm_pvalue, null_value="-")
                        count += 1

            log.info(" {0} genes".format(count))

            log.info(" Pathways ...")

            count = 0
            file_path = os.path.join(base_path, "{0}-pathways.tsv".format(project_id))
            pathway_files.append(file_path)
            with open(file_path, "w") as f:
                tsv.write_param(f, "classifier", classifier_id)
                tsv.write_param(f, "group_id", group_name)
                tsv.write_param(f, "slice", project_id)
                tsv.write_line(f, "PATHWAY_ID", "ZSCORE")
                for pathway in projdb.pathways():
                    if pathway.fm_zscore is not None:
                        tsv.write_line(f, pathway.id, pathway.fm_zscore, null_value="-")
                        count += 1

            log.info(" {0} pathways".format(count))
        finally:
            projdb.close()

    log.info("Combining ...")

    combination_path = get_combination_path(conf, "oncodrivefm")

    definitions = [
        ("classifier", classifier_id),
        ("group_id", group_name),
        ("group_short_name", group_short_name),
        ("group_long_name", group_long_name)]

    log.info(" Genes ...")

    _run_oncodrivefm_combine(log, "median-empirical", combination_path,
                             "gene-{0}".format(group_file_prefix),
                             definitions, gene_files, "gene pvalues")

    log.info(" Pathways ...")

    _run_oncodrivefm_combine(log, "median-zscore", combination_path,
                             "pathway-{0}".format(group_file_prefix),
                             definitions, pathway_files, "pathway zscores")

    # Only reached on success, so the exported files survive a failure
    remove_temp(task, base_path)
def end():
    """Store the OncodriveFM results of every project in its database.

    ``task.context`` maps each project id to the list of project descriptors
    collected for it. For every descriptor the excluded gene causes and the
    gene / pathway results listed in its ``oncodrivefm`` entry are stored;
    the per sample and gene functional impacts are reloaded only once, from
    the first descriptor. The first descriptor of each project is finally
    sent through the ``projects_out`` port.
    """
    log = task.logger

    projects_out_port = task.ports("projects_out")

    log.info("Updating the projects database ...")

    for project_id, projects in task.context.items():

        log.info("[{0}]".format(project_id))

        for index, project in enumerate(projects):
            projdb = ProjectDb(project["db"])
            try:
                if index == 0:
                    log.info(" Functional impact ...")

                    projdb.delete_sample_gene_fimpact()

                    with tsv.open(project["sample_gene_fi_data"], "r") as f:
                        types = (int, str, float, float, int, float, float, int, float, float, int)
                        for fields in tsv.lines(f, types, header=True, null_value="-"):
                            projdb.add_sample_gene_fimpact(*fields)

                # The entry is removed so it does not travel downstream
                ofm = project["oncodrivefm"]
                del project["oncodrivefm"]

                exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

                log.info(" Excluded gene causes ...")
                log.debug(" > {0}".format(exc_path))

                count = 0
                with tsv.open(exc_path, "r") as exf:
                    for gene, cause in tsv.lines(exf, (str, str), header=True):
                        projdb.update_gene(Gene(id=gene, fm_exc_cause=cause))
                        count += 1

                log.debug(" {0} genes excluded".format(count))

                for feature, results_path in ofm:

                    log.info(" {0} ...".format(feature))
                    log.debug(" > {0}".format(results_path))

                    if feature == "genes":
                        with tsv.open(results_path, "r") as f:
                            count = 0
                            for gene, pvalue, qvalue in tsv.lines(f, (str, float, float), header=True):
                                # Genes that have results overwrite any
                                # exclusion cause stored above.
                                projdb.update_gene(Gene(id=gene, fm_pvalue=pvalue, fm_qvalue=qvalue,
                                                        fm_exc_cause=ProjectDb.NO_GENE_EXC))
                                count += 1
                            log.info(" {0} genes".format(count))
                    elif feature == "pathways":
                        with tsv.open(results_path, "r") as f:
                            count = 0
                            for pathway, zscore, pvalue, qvalue in tsv.lines(f, (str, float, float, float), header=True):
                                projdb.update_pathway(Pathway(id=pathway, fm_zscore=zscore,
                                                              fm_pvalue=pvalue, fm_qvalue=qvalue))
                                count += 1
                            log.info(" {0} pathways".format(count))

                projdb.commit()
            finally:
                # Close the handle even when a results file can not be read
                projdb.close()

        projects_out_port.send(projects[0])
def scan_files(project):
    """Create the variants database of a project from its input files.

    A fresh database is built in a temporary file, loaded with the genes,
    the pathways and the variants parsed from every input file (archives are
    expanded by ``archived_files``) and finally copied to ``project["db"]``,
    replacing any previous database. The project is then sent through the
    ``liftover_projects`` port for the ``hg18`` assembly or through
    ``projects_out`` for ``hg19``.

    :param project: project descriptor; the keys ``id``, ``db``,
        ``assembly`` and ``files`` are read.
    :raises InternalError: when an input path is not absolute.
    :raises Exception: for an unexpected assembly, a missing input file, an
        input path that is not a file or a source without variants.
    """
    log = task.logger
    conf = task.conf

    projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

    project_id = project["id"]
    projdb_path = project["db"]
    assembly = project["assembly"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    if assembly == "hg18":
        out_port = liftover_projects_port
    elif assembly == "hg19":
        out_port = projects_port
    else:
        raise Exception("Unexpected assembly: {0}".format(assembly))

    # The database is always rebuilt from scratch
    if os.path.exists(projdb_path):
        os.remove(projdb_path)

    log.info("Creating variants database ...")

    projdb_tmp_path = make_temp_file(task, suffix=".db")

    log.debug(projdb_tmp_path)

    projdb = ProjectDb(projdb_tmp_path)
    try:
        projdb.create()

        log.info("Loading genes ...")

        projdb.load_genes(get_data_ensembl_genes_path(conf))

        log.info("Loading pathways ...")

        projdb.load_pathways(
            get_data_kegg_def_path(conf),
            get_data_kegg_ensg_map_path(conf))

        log.info("Parsing variants ...")

        for file_path in project["files"]:
            if not os.path.isabs(file_path):
                raise InternalError("Non absolute path found: {0}".format(file_path))

            if not os.path.exists(file_path):
                raise Exception("Input file not found: {0}".format(file_path))

            if not os.path.isfile(file_path):
                raise Exception("Not a file: {0}".format(file_path))

            for container_name, path, name, ext, f in archived_files(file_path):
                fname = os.path.join(path, name + ext)
                if container_name is not None:
                    source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
                else:
                    source_name = name + ext

                log.info("=> {0} ...".format(source_name))

                sample_id = os.path.basename(name)

                # The extension is matched case-insensitively, so the parser
                # type derived from it is lower-cased as well (".VCF" selects
                # the "vcf" parser).
                lower_ext = ext.lower()
                if lower_ext in _SUPPORTED_EXTENSIONS:
                    parser_type = lower_ext[1:]
                else:
                    parser_type = "tab"

                parser = create_variants_parser(parser_type, f, source_name, sample_id)

                source_id = projdb.add_source(source_name)

                var_ids = set()
                for var in parser:
                    # Store the source lines read by the parser up to this
                    # variant before the variant itself.
                    for line_num, text in parser.read_lines():
                        projdb.add_source_line(source_id, line_num, text)

                    var_id = projdb.add_variant(var, source_id=source_id, line_num=parser.get_line_num())
                    var_ids.add(var_id)

                # Lines remaining after the last variant
                for line_num, text in parser.read_lines():
                    projdb.add_source_line(source_id, line_num, text)

                num_variants = len(var_ids)
                log.info(" {0} variants".format(num_variants))

                if num_variants == 0:
                    raise Exception("No variants found in source '{}'. "
                                    "Please check the documentation for the expected input for '{}' format.".format(
                                        source_name, parser.name))

        projdb.commit()
    finally:
        # Close the handle even when parsing fails
        projdb.close()

    log.info("Copying variants database ...")

    log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

    shutil.copy(projdb_tmp_path, projdb_path)

    remove_temp(task, projdb_tmp_path)

    out_port.send(project)
def _combination_merge_project(c, projdb, log):
    """Add one project's variant gene, gene and pathway frequencies to the combination tables."""
    log.info(" Variant genes ...")

    count = 0
    for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
        var = afg.var
        rec = afg.rec

        if rec.sample_freq is None:
            log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
            continue

        start, end, ref, alt = var_to_tab(var)

        # An IntegrityError on insert is taken to mean that the variant was
        # already stored by a previous project, so its id is looked up.
        try:
            c.execute("INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                      (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)))
            var_id = c.lastrowid
        except sqlite3.IntegrityError:
            c.execute("SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                      (var.chr, var.strand, start, ref, alt))
            r = c.fetchone()
            var_id = r[0]

        # Same idea for the (variant, gene) pair: accumulate when it exists.
        try:
            c.execute("INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                      (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq))
        except sqlite3.IntegrityError:
            c.execute("""
                UPDATE variant_genes
                SET sample_freq=sample_freq + ?
                WHERE var_id=? AND gene_id=?""",
                      (rec.sample_freq, var_id, afg.gene_id))

        count += 1

    log.info(" {0} variant genes".format(count))

    log.info(" Genes ...")

    count = 0
    for gene in projdb.genes(join_xrefs=True, join_rec=True):
        rec = gene.rec

        if rec.sample_freq is None:
            continue

        c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
        r = c.fetchone()
        if r[0] == 0:
            c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)",
                      (gene.id, rec.sample_freq))
        else:
            c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?",
                      (rec.sample_freq, gene.id))
        count += 1

    log.info(" {0} genes".format(count))

    log.info(" Pathways ...")

    count = 0
    for pathway in projdb.pathways(join_rec=True):
        rec = pathway.rec

        if rec.sample_freq is None:
            continue

        c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
        r = c.fetchone()
        if r[0] == 0:
            c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)",
                      (pathway.id, rec.sample_freq))
        else:
            c.execute("UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?",
                      (rec.sample_freq, pathway.id))
        count += 1

    log.info(" {0} pathways".format(count))


def combination_recurrences(projects_set):
    """Combine the recurrences of a group of projects and save them as TSV files.

    The sample frequencies of every project are summed per variant gene, per
    gene and per pathway in a temporary sqlite database. When the total
    number of samples is greater than zero, each frequency is also divided by
    that total to get a proportion. The results are written to
    ``variant_gene-<prefix>.tsv.gz``, ``gene-<prefix>.tsv.gz`` and
    ``pathway-<prefix>.tsv.gz`` in the "recurrences" combination path.

    :param projects_set: pair ``(classifier, projects)``; ``classifier`` is a
        dict describing the group and ``projects`` is an iterable of project
        dicts (their ``id`` and ``db`` keys are read).
    """
    log = task.logger
    conf = task.conf

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug(" > {0}".format(db_path))

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row

        create_db(conn)

        log.info("Combining recurrences ...")

        c = conn.cursor()

        sample_total = 0

        project_ids = []
        for project in projects:
            project_ids.append(project["id"])

            log.info(" Project {0}:".format(project["id"]))

            projdb = ProjectDb(project["db"])
            try:
                project_sample_total = projdb.get_total_affected_samples()

                sample_total += project_sample_total

                log.info(" Total samples = {0}".format(project_sample_total))

                _combination_merge_project(c, projdb, log)
            finally:
                # Do not leak the project database if merging fails.
                projdb.close()

        log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

        if sample_total > 0:
            # Bound as a float so that the division is a real division.
            total = (float(sample_total),)
            c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL) / ?", total)
            c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL) / ?", total)
            c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL) / ?", total)

        c.close()
        conn.commit()

        log.info("Saving results ...")

        c = conn.cursor()

        base_path = get_combination_path(conf, "recurrences")

        # Parameters written at the top of each of the three output files.
        header_params = [
            ("classifier", classifier["id"]),
            ("group_id", group_name),
            ("group_short_name", group_short_name),
            ("group_long_name", group_long_name),
            ("projects", ",".join(project_ids)),
            ("SAMPLE_TOTAL", sample_total)]

        log.info(" Variant genes ...")

        with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
            for param_name, param_value in header_params:
                tsv.write_param(f, param_name, param_value)
            tsv.write_line(f, "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS",
                           "SAMPLE_FREQ", "SAMPLE_PROP", "PROT_CHANGES", "XREFS")
            for r in c.execute("SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"):
                strand, ref, alt = r["strand"], r["ref"], r["alt"]
                allele = "{0}/{1}".format(ref, alt)
                tsv.write_line(f, r["chr"], strand, r["start"], allele,
                               r["gene_id"], r["impact"], TransFIC.class_name(r["impact"]),
                               r["sample_freq"], r["sample_prop"], r["prot_changes"], r["xrefs"],
                               null_value="-")

        log.info(" Genes ...")

        with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
            for param_name, param_value in header_params:
                tsv.write_param(f, param_name, param_value)
            tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
            for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
                tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

        log.info(" Pathways ...")

        with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
            for param_name, param_value in header_params:
                tsv.write_param(f, param_name, param_value)
            tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
            for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
                tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")
    finally:
        # Close the connection on every path, including failures.
        conn.close()

    remove_temp(task, db_path)
def prepare_files(project):
    """Write the OncodriveFM input files of a project and emit one job per feature and slice.

    The functional impact data retrieved from the project database is saved
    twice in ``project["temp_path"]``: as ``oncodrivefm-data.tdm`` (the three
    scores only) and as ``sample_gene-fimpact.tsv.gz`` (scores plus TransFIC
    values and classes). The path of the latter is stored in
    ``project["sample_gene_fi_data"]``. A third file,
    ``oncodrivefm-excluded-cause.tsv``, lists the genes that fail the filter
    and/or have fewer mutated samples than the threshold.

    For each of the "genes" and "pathways" features and each of the SIFT,
    PPH2 and MA slices, a copy of the project carrying an ``oncodrivefm``
    configuration dict is sent to the ``projects_out`` port. When the data
    contains no samples, a warning is logged and nothing is sent.

    :param project: project dict; the keys read here are ``id``, ``db`` and
        ``temp_path``.
    """
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    # NOTE(review): this instance is never used below; kept only in case
    # constructing it has side effects — confirm and remove.
    project_results = ProjectResults(project)

    projdb = ProjectDb(project["db"])
    try:
        log.info("Retrieving functional impact scores for genes ...")

        data = retrieve_data(projdb)
    finally:
        projdb.close()

    # save data matrix

    dst_path = os.path.join(project["temp_path"], "oncodrivefm-data.tdm")
    sgfi_path = os.path.join(project["temp_path"], "sample_gene-fimpact.tsv.gz")
    project["sample_gene_fi_data"] = sgfi_path

    log.info("Saving functional impact scores ...")
    log.debug("> {0}".format(dst_path))

    # Both files are context-managed so neither is leaked if a write fails.
    with open(dst_path, "w") as f, tsv.open(sgfi_path, "w") as sgff:
        tsv.write_line(f, "SAMPLE", "GENE", "SIFT", "PPH2", "MA")
        tsv.write_line(sgff, "SAMPLE", "GENE",
                       "SIFT_SCORE", "SIFT_TFIC", "SIFT_TFIC_CLASS",
                       "PPH2_SCORE", "PPH2_TFIC", "PPH2_TFIC_CLASS",
                       "MA_SCORE", "MA_TFIC", "MA_TFIC_CLASS")

        # items() rather than iteritems() so this runs on Python 2 and 3.
        for key, values in data.items():
            sample, gene = key

            (sift_score, sift_tfic, sift_tfic_class,
             pph2_score, pph2_tfic, pph2_tfic_class,
             ma_score, ma_tfic, ma_tfic_class) = values

            tsv.write_line(f, sample, gene, sift_score, pph2_score, ma_score)
            tsv.write_line(sgff, sample, gene,
                           sift_score, sift_tfic, sift_tfic_class,
                           pph2_score, pph2_tfic, pph2_tfic_class,
                           ma_score, ma_tfic, ma_tfic_class, null_value="-")

    # count samples

    samples = set()
    gene_sample_count = {}
    for sample, gene in data.keys():
        samples.add(sample)
        gene_sample_count[gene] = gene_sample_count.get(gene, 0) + 1

    num_samples = len(samples)

    if num_samples == 0:
        log.warn("There are no samples data, skipping OncodriveFM for this project")
        return

    (num_cores, estimator,
     genes_num_samplings, genes_threshold, genes_filter_enabled, genes_filter, filt,
     pathways_num_samplings, pathways_threshold) = get_oncodrivefm_configuration(
        log, conf, project, num_samples)

    # Create a dataset with information on why some genes are not considered
    # for calculation in OncodriveFM. There are basically two possible reasons:
    # - It does not pass the filter
    # - There are less samples mutated than the threshold

    exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))

    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if genes_filter_enabled and not filt.valid(gene):
                causes.append(ProjectDb.GENE_EXC_FILTER)
            if sample_count < genes_threshold:
                causes.append(ProjectDb.GENE_EXC_THRESHOLD)
            if causes:
                tsv.write_line(exf, gene, "".join(causes))

    ofm = dict(
        data=dst_path,
        num_cores=num_cores,
        estimator=estimator)

    for slice_name in ["SIFT", "PPH2", "MA"]:
        projects_out_port.send(dict(project,
            oncodrivefm=dict(ofm,
                feature="genes",
                slice=slice_name,
                num_samplings=genes_num_samplings,
                threshold=genes_threshold,
                filter_enabled=genes_filter_enabled,
                filter=genes_filter)))

    # The pathways jobs carry the genes filter settings too.
    for slice_name in ["SIFT", "PPH2", "MA"]:
        projects_out_port.send(dict(project,
            oncodrivefm=dict(ofm,
                feature="pathways",
                slice=slice_name,
                num_samplings=pathways_num_samplings,
                threshold=pathways_threshold,
                filter_enabled=genes_filter_enabled,
                filter=genes_filter)))
# Marker written for missing values in the exported datasets: a backslash
# followed by "N". The backslash is escaped because a bare "\N" starts a
# named-character escape in unicode (and all Python 3) string literals.
_DATASETS_NULL = "\\N"


def _datasets_export_variant_genes(projdb, sigdb, open_ds, total_samples):
    """Write the "variant_gene" and "variant-samples" datasets; return the number of variant genes."""
    count = 0
    with open_ds("variant_gene") as vf:
        tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
        tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE",
                       "GENE_ID", "IMPACT", "IMPACT_CLASS",
                       "SAMPLE_FREQ", "SAMPLE_PROP",
                       "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

        with open_ds("variant-samples") as sf:
            tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

            for afg in projdb.affected_genes(join_variant=True, join_samples=True,
                                             join_xrefs=True, join_rec=True):
                var = afg.var
                rec = afg.rec

                start, end, ref, alt = var_to_tab(var)

                allele = "{0}/{1}".format(ref, alt)

                # Variants known to the signatures database get an extra xref.
                xrefs = list(var.xrefs)
                if sigdb.exists_variant(var.chr, start):
                    xrefs.append("I:1")
                xrefs = ",".join(xrefs)

                intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

                tsv.write_line(vf, var.id, var.chr, var.strand, start, allele,
                               afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
                               rec.sample_freq, rec.sample_prop,
                               afg.coding_region, afg.prot_changes,
                               intogen_driver, xrefs, null_value=_DATASETS_NULL)

                for sample in var.samples:
                    tsv.write_line(sf, var.id, var.chr, var.strand, start, allele,
                                   sample.name, null_value=_DATASETS_NULL)

                count += 1
    return count


def _datasets_export_consequences(projdb, open_ds, gene_sym):
    """Write the "consequence" dataset; return the number of consequences."""
    count = 0
    with open_ds("consequence") as cf:
        tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID",
                       "CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                       "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                       "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                       "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                       "IMPACT", "IMPACT_CLASS")

        for csq in projdb.consequences(join_variant=True):
            var = csq.var
            start, end, ref, alt = var_to_tab(var)

            allele = "{0}/{1}".format(ref, alt)

            # Protein and score columns stay empty unless the consequence
            # types match the corresponding term sets below.
            uniprot = protein = protein_pos = aa_change = None
            sift_score = sift_tfic = sift_tfic_class = None
            pph2_score = pph2_tfic = pph2_tfic_class = None
            ma_score = ma_tfic = ma_tfic_class = None

            if so.match(csq.ctypes, so.ONCODRIVEFM):
                uniprot, protein = csq.uniprot, csq.protein

            if so.match(csq.ctypes, so.NON_SYNONYMOUS):
                protein_pos, aa_change = csq.protein_pos, csq.aa_change
                sift_score, sift_tfic, sift_tfic_class = (
                    csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class))
                pph2_score, pph2_tfic, pph2_tfic_class = (
                    csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class))
                ma_score, ma_tfic, ma_tfic_class = (
                    csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class))

            tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript,
                           ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                           uniprot, protein, protein_pos, aa_change,
                           sift_score, sift_tfic, sift_tfic_class,
                           pph2_score, pph2_tfic, pph2_tfic_class,
                           ma_score, ma_tfic, ma_tfic_class,
                           csq.impact, TransFIC.class_name(csq.impact), null_value=_DATASETS_NULL)
            count += 1
    return count


def _datasets_export_genes(projdb, sigdb, open_ds, total_samples):
    """Write the "gene" dataset, skipping genes without a sample frequency."""
    with open_ds("gene") as gf:
        tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
        tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                       "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                       "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")

        for gene in projdb.genes(join_rec=True):
            rec = gene.rec

            if rec.sample_freq is None or rec.sample_freq == 0:
                continue

            intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

            tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                           gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue,
                           gene.clust_exc_cause, gene.clust_coords,
                           rec.sample_freq or 0, rec.sample_prop or 0,
                           intogen_driver, null_value=_DATASETS_NULL)


def _datasets_export_pathways(projdb, open_ds, total_samples):
    """Write the "pathway" dataset, skipping pathways without a sample frequency."""
    with open_ds("pathway") as pf:
        tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
        tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                       "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            if rec.sample_freq is None or rec.sample_freq == 0:
                continue

            tsv.write_line(pf, pathway.id, pathway.gene_count,
                           pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
                           rec.sample_freq or 0, rec.sample_prop or 0,
                           rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0,
                           null_value=_DATASETS_NULL)


def _datasets_export_gene_sample_fimpacts(projdb, open_ds):
    """Write the "gene_sample-fimpact" dataset."""
    with open_ds("gene_sample-fimpact") as f:
        tsv.write_line(f, "GENE_ID", "SAMPLE",
                       "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                       "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                       "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

        for fields in projdb.sample_gene_fimpacts():
            (gene, sample,
             sift_score, sift_tfic, sift_tfic_class,
             pph2_score, pph2_tfic, pph2_tfic_class,
             ma_score, ma_tfic, ma_tfic_class) = fields

            tsv.write_line(f, gene, sample,
                           sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                           pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                           ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class),
                           null_value=_DATASETS_NULL)


def datasets(project):
    """Export the results of a project as datasets and forward the project.

    The datasets are written under the path returned by
    ``get_website_results_path(project["path"])``, which is created when
    missing: "variant_gene", "variant-samples", "consequence", "gene",
    "pathway", "gene_sample-fimpact" (unless the ``skip_oncodrivefm``
    configuration option is set) and "project.tsv". Missing values are
    written as a backslash followed by "N". The project is then sent to the
    ``projects_out`` port.

    :param project: project dict; the keys read here are ``id``, ``path``,
        ``db``, ``assembly`` and ``annotations``.
    """
    log = task.logger
    conf = task.conf

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_path = project["path"]

    datasets_path = get_website_results_path(project_path)
    if not os.path.exists(datasets_path):
        os.makedirs(datasets_path)

    def open_ds(name):
        # Every dataset is opened for writing with the same project settings.
        return open_dataset(project_id, project_path, datasets_path, name, "w", log)

    sigdb = SigDb(conf["sigdb_path"])
    sigdb.open()
    try:
        projdb = ProjectDb(project["db"])
        try:
            gene_sym = projdb.get_gene_symbols()

            total_samples = projdb.get_total_affected_samples()

            log.info("Exporting variant genes ...")

            count = _datasets_export_variant_genes(projdb, sigdb, open_ds, total_samples)

            log.info(" {0} variant genes".format(count))

            log.info("Exporting consequences ...")

            count = _datasets_export_consequences(projdb, open_ds, gene_sym)

            log.info(" {0} consequences".format(count))

            log.info("Exporting genes ...")

            _datasets_export_genes(projdb, sigdb, open_ds, total_samples)

            log.info("Exporting pathways ...")

            _datasets_export_pathways(projdb, open_ds, total_samples)

            skip_oncodrivefm = conf.get("skip_oncodrivefm", False, dtype=bool)
            if not skip_oncodrivefm:
                log.info("Exporting genes per sample functional impact ...")

                _datasets_export_gene_sample_fimpacts(projdb, open_ds)
        finally:
            # Both databases are released even when an export fails.
            projdb.close()
    finally:
        sigdb.close()

    log.info("Saving project configuration ...")

    projres = ProjectResults(project)

    with open_ds("project.tsv") as f:
        names = ["ASSEMBLY", "SAMPLES_TOTAL"]
        values = [project["assembly"], total_samples]
        names, values = projres.get_annotations_to_save(
            conf, project["annotations"], names=names, values=values)
        tsv.write_line(f, *names)
        tsv.write_line(f, *values, null_value=_DATASETS_NULL)

    projects_port = task.ports("projects_out")
    projects_port.send(project)
def liftover(project):
    """Convert the variant positions of a project from hg18 to hg19 with liftOver.

    Every variant is written as a BED line (``"chr"`` + chromosome, start,
    start + length of the reference allele, variant id) to a temporary file,
    and the configured liftOver binary is run on it with the
    ``hg18ToHg19.over.chain`` chain. Variants reported as unmapped get their
    start set to ``None``; the remaining ones get the start reported by
    liftOver. The previous assembly is kept in ``project["from_assembly"]``,
    ``project["assembly"]`` becomes ``"hg19"`` and the project is sent to the
    ``lifted_projects`` port.

    :param project: project dict; the keys read here are ``id``, ``db``,
        ``temp_path`` and ``assembly``.
    :raises Exception: if liftOver exits with a non-zero status.
    """
    log = task.logger
    conf = task.conf

    lifted_project_port = task.ports("lifted_projects")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    log.info("Preparing liftOver files ...")

    in_path = make_temp_file(task, suffix=".bed")
    out_path = make_temp_file(task, suffix=".bed")
    unmapped_path = os.path.join(project["temp_path"], "liftover_unmapped.bed")

    projdb = ProjectDb(project["db"])
    try:
        with open(in_path, "w") as in_file:
            for var in projdb.variants(order_by="position"):
                in_file.write(tsv.line_text(
                    "chr" + var.chr, var.start, var.start + len(var.ref), var.id))

        log.info("Running liftOver ...")

        project["from_assembly"] = project["assembly"]
        project["assembly"] = "hg19"

        # NOTE(review): the command is run through the shell, so paths that
        # contain spaces or shell metacharacters will break it.
        cmd = " ".join([
            conf["liftover_bin"],
            in_path,
            os.path.join(conf["liftover_chain_path"], "hg18ToHg19.over.chain"),
            out_path,
            unmapped_path])

        log.debug(cmd)

        # A failed run must not be followed by database updates based on
        # missing or stale output files.
        exit_code = subprocess.call(cmd, shell=True)
        if exit_code != 0:
            raise Exception("liftOver failed with exit code {0}: {1}".format(exit_code, cmd))

        log.info("Annotating unmapped variants ...")

        count = 0
        with open(unmapped_path, "r") as f:
            for line in f:
                if line.lstrip().startswith("#"):
                    continue
                fields = line.rstrip().split("\t")
                var_id = int(fields[3])
                projdb.update_variant_start(var_id, start=None)
                count += 1

        log.info(" {0} unmapped variants annotated".format(count))

        log.info("Updating variants ...")

        count = 0
        with open(out_path, "r") as f:
            for line in f:
                fields = line.rstrip().split("\t")
                _chrom, start, _end, var_id = fields
                # Pass integers, as the unmapped branch does for the id.
                projdb.update_variant_start(int(var_id), start=int(start))
                count += 1

        log.info(" {0} variants".format(count))

        remove_temp(task, in_path, out_path)

        projdb.commit()
    finally:
        projdb.close()

    lifted_project_port.send(project)