def post_process_genes(self, data, position, states, output_path):
    """Write a per-gene summary of the HMM state assignments to ``output_path``.

    Every entry of ``position`` is paired with the entry of ``states`` at the
    same index. For each gene produced by ``tnseq_tools.Genes`` the sites
    falling in each of the four states (0-3) are counted and a call is made:

    * ``"N/A"`` when the gene has no sites (``gene.n == 0``);
    * ``"ES"`` when every site is in state 0, or when the number of state-0
      sites reaches ``int(E + 3 * sqrt(V))`` with ``E``/``V`` taken from
      ``tnseq_tools.ExpectedRuns`` / ``tnseq_tools.VarR`` for ``gene.n`` sites;
    * otherwise the label of the most frequent state (ties resolve to the
      higher state number).

    Args:
        data: read counts; only ``data > 0`` is used here, plus whatever
            ``tnseq_tools.Genes`` does with it.
        position: site coordinates, indexable in parallel with ``states``.
        states: one state number per site.
        output_path: path of the tab-separated file to (over)write.

    The file starts with a ``#HMM - Genes`` header followed by one row per
    gene: orf, name, description, n, the four state counts, ``gene.theta()``,
    mean of the non-zero reads, and the call.
    """
    # enumerate() over states keeps the original contract: position must be
    # at least as long as states, otherwise an IndexError is raised.
    pos2state = {position[t]: state for t, state in enumerate(states)}

    # Fraction of entries in data that are non-zero.
    theta = numpy.mean(data > 0)

    G = tnseq_tools.Genes(
        self.ctrldata,
        self.annotation_path,
        data=data,
        position=position,
        ignoreCodon=False,
    )

    # presumably essential / growth-defect / non-essential / growth-advantage
    # -- confirm against the HMM documentation.
    num2label = {0: "ES", 1: "GD", 2: "NE", 3: "GA"}

    # The context manager guarantees the file is closed even if a gene fails
    # part-way through; it is opened only after the inputs above are ready.
    with open(output_path, "w") as output:
        output.write("#HMM - Genes\n")
        for gene in G:
            reads_nz = [c for c in gene.reads.flatten() if c > 0]
            avg_read_nz = numpy.average(reads_nz) if len(reads_nz) > 0 else 0

            # Tally how many of the gene's sites fall in each state.
            statedist = {}
            for st in (pos2state[p] for p in gene.position):
                statedist[st] = statedist.get(st, 0) + 1

            n0 = statedist.get(0, 0)
            n1 = statedist.get(1, 0)
            n2 = statedist.get(2, 0)
            n3 = statedist.get(3, 0)

            if gene.n > 0:
                E = tnseq_tools.ExpectedRuns(gene.n, 1.0 - theta)
                V = tnseq_tools.VarR(gene.n, 1.0 - theta)
                # The equality test comes first so the threshold is never
                # evaluated for genes whose sites are all in state 0.
                if n0 == gene.n or n0 >= int(E + (3 * math.sqrt(V))):
                    S = "ES"
                else:
                    # max() over (count, state) pairs: the largest count wins,
                    # ties go to the higher state number.
                    S = num2label[max([(n0, 0), (n1, 1), (n2, 2), (n3, 3)])[1]]
            else:
                S = "N/A"

            output.write(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%1.4f\t%1.2f\t%s\n"
                % (gene.orf, gene.name, gene.desc, gene.n, n0, n1, n2, n3,
                   gene.theta(), avg_read_nz, S)
            )
def Run(self):
    """Run the Tn5 gaps analysis and write the per-gene table to ``self.output``.

    Steps, as performed below:

    1. Load the datasets in ``self.ctrldata``, combine replicates with
       ``self.replicates``, zero out sites below ``self.minread`` and reduce
       the result to a 0/1 insertion indicator per site.
    2. Obtain the runs of the indicator from ``tnseq_tools.runs_w_info`` and,
       for every gene a run touches, keep the run with the largest
       intersection together with its length and p-value
       (``1 - GumbelCDF(run length, u, B)``).
    3. Adjust the p-values with ``stat_tools.BH_fdr_correction`` and call a
       gene ``Essential`` when its adjusted p-value is below 0.05.
    4. Write a commented header and one tab-separated row per gene (sorted by
       orf), close ``self.output``, then register the file and finish.

    Genes never touched by a run keep the defaults: overlap 0, run length 0
    and p-value 1.
    """
    self.transit_message("Starting Tn5 gaps method")
    start_time = time.time()

    self.transit_message("Getting data (May take a while)")

    # Combine all wigs
    (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
    combined = tnseq_tools.combine_replicates(data, method=self.replicates)
    combined[combined < self.minread] = 0
    # counts aliases combined: the array is binarised in place.
    counts = combined
    counts[counts > 0] = 1
    num_sites = counts.size

    genes_obj = tnseq_tools.Genes(
        self.ctrldata,
        self.annotation_path,
        ignoreCodon=self.ignoreCodon,
        nterm=self.NTerminus,
        cterm=self.CTerminus,
        data=data,
        position=position,
    )

    # Fraction of sites with / without an insertion.
    pins = numpy.mean(counts)
    pnon = 1.0 - pins

    # Expected maximum run length; only reported in the header.
    exprunmax = tnseq_tools.ExpectedRuns(num_sites, pnon)

    # Get the runs
    self.transit_message("Getting non-insertion runs in genome")
    run_arr = tnseq_tools.runs_w_info(counts)
    pos_hash = transit_tools.get_pos_hash(self.annotation_path)

    # Finally, calculate the results
    self.transit_message("Running Tn5 gaps method")
    results_per_gene = {}
    for gene in genes_obj.genes:
        # Trailing defaults: overlap size 0, run length 0, p-value 1.
        results_per_gene[gene.orf] = [
            gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r, 0, 0, 1
        ]

    N = len(run_arr)
    count = 0
    accum = 0
    self.progress_range(N)
    for run in run_arr:
        accum += run['length']
        count += 1
        genes = tnseq_tools.get_genes_in_range(pos_hash, run['start'], run['end'])
        for gene_orf in genes:
            gene = genes_obj[gene_orf]
            inter_sz = self.intersect_size(
                [run['start'], run['end']], [gene.start, gene.end]) + 1
            run_len = run['length']

            # Gumbel scale/location; evaluated only when a run actually
            # overlaps a gene, exactly as before.
            B = 1.0 / math.log(1.0 / pnon)
            u = math.log(num_sites * pins, 1.0 / pnon)
            pval = 1.0 - tnseq_tools.GumbelCDF(run['length'], u, B)

            # Keep only the run with the largest intersection for this gene.
            curr_inter_sz = results_per_gene[gene.orf][6]
            if inter_sz > curr_inter_sz:
                results_per_gene[gene.orf] = [
                    gene.orf, gene.name, gene.desc, gene.k,
                    gene.n, gene.r, inter_sz, run_len, pval
                ]

        # Update Progress
        text = "Running Tn5Gaps method... %1.1f%%" % (100.0 * count / N)
        self.progress_update(text, count)

    rows = list(results_per_gene.values())

    # With no runs at all there is no mean run length; report 0.0 instead of
    # dividing by zero.
    exp_run_len = float(accum) / N if N else 0.0

    # None means "no significant gene seen yet". The previous float('inf')
    # sentinel made the "%d" header line raise OverflowError whenever no gene
    # was significant.
    min_sig_len = None
    sig_genes_count = 0
    pvals = [row[-1] for row in rows]
    padj = stat_tools.BH_fdr_correction(pvals)
    for i, row in enumerate(rows):
        significant = padj[i] < 0.05
        if significant:
            sig_genes_count += 1
            sig_len = row[7]  # length of the run kept for this gene
            if min_sig_len is None:
                min_sig_len = sig_len
            else:
                min_sig_len = min(min_sig_len, sig_len)
        row.append(padj[i])
        row.append('Essential' if significant else 'Non-essential')
    rows.sort(key=lambda r: r[0])

    # Output results
    # NOTE(review): under Python 3 the .encode('utf-8') calls below make these
    # fields render as b'...' in the header. Left unchanged here -- confirm
    # the target interpreter before removing them.
    self.output.write("#Tn5 Gaps\n")
    if self.wxobj:
        self.output.write(
            "#GUI with: ctrldata=%s, annotation=%s, output=%s\n"
            % (",".join(self.ctrldata).encode('utf-8'),
               self.annotation_path.encode('utf-8'),
               self.output.name.encode('utf-8')))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))

    self.output.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8'))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#Essential gene count: %d\n" % (sig_genes_count))
    self.output.write("#Minimum reads: %d\n" % (self.minread))
    self.output.write("#Replicate combination method: %s\n" % (self.replicates))
    if min_sig_len is None:
        self.output.write("#Minimum significant run length: N/A\n")
    else:
        self.output.write("#Minimum significant run length: %d\n" % (min_sig_len))
    self.output.write("#Expected run length: %1.5f\n" % (exp_run_len))
    self.output.write("#Expected max run length: %s\n" % (exprunmax))
    # columns is not defined in this method; it is expected to exist at
    # module level.
    self.output.write("#%s\n" % "\t".join(columns))

    for res in rows:
        self.output.write(
            "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%1.5f\t%1.5f\t%s\n"
            % (res[0], res[1], res[2], res[3], res[4], res[5],
               res[6], res[7], res[8], res[9], res[10]))
    self.output.close()

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Tn5 Gaps")
    self.finish()
    self.transit_message("Finished Tn5Gaps Method")