Exemplo n.º 1
0
    def Run(self):
        """Run the Tn5 gaps analysis and write the per-gene results file.

        The control datasets in ``self.ctrldata`` are loaded, the replicates
        are combined with ``self.replicates`` and the combined counts are
        binarized (sites with fewer than ``self.minread`` reads count as
        empty).  For every run returned by ``tnseq_tools.runs_w_info`` the
        genes it touches are looked up, and each gene keeps the run with the
        largest intersection together with that run's p-value
        (``1 - GumbelCDF``).  P-values are adjusted with
        ``stat_tools.BH_fdr_correction`` and genes with an adjusted p-value
        below 0.05 are called 'Essential'.

        The report is written to ``self.output``, which is always closed
        before the method returns or propagates a write error.
        """
        self.transit_message("Starting Tn5 gaps method")
        start_time = time.time()

        self.transit_message("Getting data (May take a while)")

        # Combine all wigs and reduce them to a 0/1 insertion indicator.
        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)
        counts = tnseq_tools.combine_replicates(data, method=self.replicates)
        counts[counts < self.minread] = 0
        counts[counts > 0] = 1
        num_sites = counts.size

        genes_obj = tnseq_tools.Genes(self.ctrldata,
                                      self.annotation_path,
                                      ignoreCodon=self.ignoreCodon,
                                      nterm=self.NTerminus,
                                      cterm=self.CTerminus,
                                      data=data,
                                      position=position)

        pins = numpy.mean(counts)
        pnon = 1.0 - pins

        # Expected maximum run length, reported in the file header.
        exprunmax = tnseq_tools.ExpectedRuns(num_sites, pnon)

        # Get the runs
        self.transit_message("Getting non-insertion runs in genome")
        run_arr = tnseq_tools.runs_w_info(counts)
        pos_hash = transit_tools.get_pos_hash(self.annotation_path)

        # Finally, calculate the results
        self.transit_message("Running Tn5 gaps method")
        # Row layout: orf, name, desc, k, n, r, intersection size,
        # run length, p-value.
        results_per_gene = {
            gene.orf: [
                gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r, 0, 0, 1
            ]
            for gene in genes_obj.genes
        }

        N = len(run_arr)
        accum = 0
        self.progress_range(N)
        for count, run in enumerate(run_arr, 1):
            run_len = run['length']
            accum += run_len
            genes = tnseq_tools.get_genes_in_range(pos_hash, run['start'],
                                                   run['end'])
            for gene_orf in genes:
                gene = genes_obj[gene_orf]
                inter_sz = self.intersect_size([run['start'], run['end']],
                                               [gene.start, gene.end]) + 1

                # Keep only the run with the largest intersection per gene.
                if inter_sz > results_per_gene[gene.orf][6]:
                    B = 1.0 / math.log(1.0 / pnon)
                    u = math.log(num_sites * pins, 1.0 / pnon)
                    pval = 1.0 - tnseq_tools.GumbelCDF(run_len, u, B)
                    results_per_gene[gene.orf] = [
                        gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r,
                        inter_sz, run_len, pval
                    ]

            # Update Progress
            text = "Running Tn5Gaps method... %1.1f%%" % (100.0 * count / N)
            self.progress_update(text, count)

        rows = list(results_per_gene.values())
        # Guard against an empty run list instead of dividing by zero.
        exp_run_len = float(accum) / N if N else 0.0

        min_sig_len = float('inf')
        sig_genes_count = 0
        padj = stat_tools.BH_fdr_correction([row[-1] for row in rows])
        for row, qval in zip(rows, padj):
            significant = qval < 0.05
            if significant:
                sig_genes_count += 1
                # row[-2] is still the run length here (nothing appended yet).
                min_sig_len = min(min_sig_len, row[-2])
            row.append(qval)
            row.append('Essential' if significant else 'Non-essential')
        rows.sort(key=lambda row: row[0])

        # Output results
        try:
            self.output.write("#Tn5 Gaps\n")
            if self.wxobj:
                # Values are interpolated as text; encoding them to bytes
                # first would write b'...' literals under Python 3.
                self.output.write(
                    "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                    (",".join(self.ctrldata), self.annotation_path,
                     self.output.name))
            else:
                self.output.write("#Console: python %s\n" % " ".join(sys.argv))

            self.output.write("#Data: %s\n" % (",".join(self.ctrldata)))
            self.output.write("#Annotation path: %s\n" % self.annotation_path)
            self.output.write("#Time: %s\n" % (time.time() - start_time))
            self.output.write("#Essential gene count: %d\n" %
                              (sig_genes_count))
            self.output.write("#Minimum reads: %d\n" % (self.minread))
            self.output.write("#Replicate combination method: %s\n" %
                              (self.replicates))
            if sig_genes_count:
                self.output.write("#Minimum significant run length: %d\n" %
                                  (min_sig_len))
            else:
                # No significant gene: min_sig_len is still infinity, which
                # "%d" cannot format (it raises OverflowError).
                self.output.write("#Minimum significant run length: N/A\n")
            self.output.write("#Expected run length: %1.5f\n" % (exp_run_len))
            self.output.write("#Expected max run length: %s\n" % (exprunmax))
            self.output.write("#%s\n" % "\t".join(columns))

            for res in rows:
                self.output.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%1.5f\t%1.5f\t%s\n" %
                    (res[0], res[1], res[2], res[3], res[4], res[5], res[6],
                     res[7], res[8], res[9], res[10]))
        finally:
            # Release the file handle even if a write fails part-way.
            self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Tn5 Gaps")
        self.finish()
        self.transit_message("Finished Tn5Gaps Method")
Exemplo n.º 2
0
    def Run(self):
        """Run the HMM analysis and write the site-level and gene-level files.

        The control datasets in ``self.ctrldata`` are loaded, optionally
        normalized (``self.normalization``) and LOESS-corrected
        (``self.LOESS``), and the replicates are combined with
        ``self.replicates``.  A four-state model (labels ES, GD, NE, GA) with
        geometric emissions is set up, decoded with ``self.viterbi`` and the
        per-site state probabilities are derived from the forward/backward
        passes.

        One line per site is written to ``self.output`` (always closed before
        the method continues or propagates a write error), after which
        ``self.post_process_genes`` writes the gene-level file next to it
        with ``_genes`` inserted before the file extension.
        """
        import os  # local import: only needed to derive the genes file path

        self.transit_message("Starting HMM Method")

        # Get data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)
        (K, _num_sites) = data.shape

        # Normalize data
        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             _factors) = norm_tools.normalize_data(data, self.normalization,
                                                   self.ctrldata,
                                                   self.annotation_path)

        # Do LOESS
        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        pos_hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        if len(self.ctrldata) > 1:
            self.transit_message("Combining Replicates as '%s'" %
                                 self.replicates)
        O = tnseq_tools.combine_replicates(
            data, method=self.replicates
        ) + 1  # Adding 1 because of the shifted geometric in scipy

        # Parameters
        Nstates = 4
        label = {0: "ES", 1: "GD", 2: "NE", 3: "GA"}

        reads = O - 1
        reads_nz = sorted(reads[reads != 0])
        size = len(reads_nz)
        # Mean of the non-zero reads with the top 5% left out.
        mean_r = numpy.average(reads_nz[:int(0.95 * size)])
        mu = numpy.array([1 / 0.99, 0.01 * mean_r + 2, mean_r, mean_r * 5.0])
        L = 1.0 / mu
        # Emission probability distributions, one geometric pmf per state.
        B = [scipy.stats.geom(L[i]).pmf for i in range(Nstates)]

        pins = self.calculate_pins(O - 1)
        pins_obs = sum(1 for rd in O if rd >= 2) / float(len(O))
        pnon = 1.0 - pins

        # Smallest r (capped at 99) for which pnon**r drops below 0.01.
        for r in range(100):
            if pnon**r < 0.01:
                break

        # Log-space transition matrix: 'a' on the diagonal, 'b' elsewhere.
        p_mid = B[Nstates // 2](1)
        a = math.log1p(-p_mid**r)
        b = r * math.log(p_mid) + math.log(1.0 / 3)  # change to Nstates-1?
        A = numpy.zeros((Nstates, Nstates))
        for i in range(Nstates):
            A[i] = [b] * Nstates
            A[i][i] = a

        PI = numpy.zeros(Nstates)  # Initial state distribution
        PI[0] = 0.7
        PI[1:] = 0.3 / (Nstates - 1)

        self.progress_range(self.maxiterations)

        # Viterbi decoding (takes the log-space transition matrix).
        (Q_opt, _delta, _Q) = self.viterbi(A, B, PI, O)

        # Alpha pass (takes the transition matrix in probability space).
        (_log_prob_obs, alpha,
         C) = self.forward_procedure(numpy.exp(A), B, PI, O)

        # Beta pass
        beta = self.backward_procedure(numpy.exp(A), B, PI, O, C)

        T = len(O)
        total = 0
        state2count = dict.fromkeys(range(Nstates), 0)
        for t in range(T):
            state2count[Q_opt[t]] += 1
            total += 1

        try:
            self.output.write("#HMM - Sites\n")
            self.output.write("# Tn-HMM\n")

            if self.wxobj:
                # Values are interpolated as text; encoding them to bytes
                # first would write b'...' literals under Python 3.
                self.output.write(
                    "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                    (",".join(self.ctrldata), self.annotation_path,
                     self.output.name))
            else:
                self.output.write("#Console: python3 %s\n" %
                                  " ".join(sys.argv))

            self.output.write("# \n")
            self.output.write("# Mean:\t%2.2f\n" % (numpy.average(reads_nz)))
            self.output.write("# Median:\t%2.2f\n" % numpy.median(reads_nz))
            self.output.write("# Normalization:\t%s\n" % self.normalization)
            self.output.write("# LOESS Correction:\t%s\n" % str(self.LOESS))
            self.output.write("# pins (obs):\t%f\n" % pins_obs)
            self.output.write("# pins (est):\t%f\n" % pins)
            self.output.write("# Run length (r):\t%d\n" % r)
            self.output.write("# State means:\n")
            self.output.write("#    %s\n" % "   ".join(
                ["%s: %8.4f" % (label[i], mu[i]) for i in range(Nstates)]))
            self.output.write("# Self-Transition Prob:\n")
            self.output.write("#    %s\n" % "   ".join(
                ["%s: %2.4e" % (label[i], A[i][i]) for i in range(Nstates)]))
            self.output.write("# State Emission Parameters (theta):\n")
            self.output.write("#    %s\n" % "   ".join(
                ["%s: %1.4f" % (label[i], L[i]) for i in range(Nstates)]))
            # Terminate the heading like the other section headings so the
            # values land on their own line.
            self.output.write("# State Distributions:\n")
            self.output.write("#    %s\n" % "   ".join([
                "%s: %2.2f%%" % (label[i], state2count[i] * 100.0 / total)
                for i in range(Nstates)
            ]))

            states = [int(Q_opt[t]) for t in range(T)]
            for t in range(T):
                s_lab = label.get(states[t], "Unknown State")
                joint = alpha[:, t] * beta[:, t]
                gamma_t = joint / numpy.sum(joint)
                genes_at_site = pos_hash.get(position[t], [""])
                genestr = ""
                if not (len(genes_at_site) == 1 and not genes_at_site[0]):
                    genestr = ",".join([
                        "%s_(%s)" % (g, rv2info.get(g, "-")[0])
                        for g in genes_at_site
                    ])

                self.output.write("%s\t%s\t%s\t%s\t%s\n" %
                                  (int(position[t]), int(O[t]) - 1, "\t".join(
                                      ["%-9.2e" % g
                                       for g in gamma_t]), s_lab, genestr))
        finally:
            # Release the file handle even if a write fails part-way.
            self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Finished HMM - Sites Method")
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="HMM - Sites")

        # Gene Files
        self.transit_message("Creating HMM Genes Level Output")
        # splitext only looks at the final path component, so names without
        # an extension or with dots in a directory name are handled too.
        (root, ext) = os.path.splitext(self.output.name)
        genes_path = root + "_genes" + ext

        tempObs = numpy.zeros((1, len(O)))
        tempObs[0, :] = O - 1
        self.post_process_genes(tempObs, position, states, genes_path)

        self.transit_message("Adding File: %s" % (genes_path))
        self.add_file(path=genes_path, filetype="HMM - Genes")
        self.finish()
        self.transit_message("Finished HMM Method")
Exemplo n.º 3
0
    def Run(self):
        """Run the Tn5 gaps analysis and write the per-gene results file.

        The control datasets in ``self.ctrldata`` are loaded, the replicates
        are combined with ``self.replicates`` and the combined counts are
        binarized (sites with fewer than ``self.minread`` reads count as
        empty).  For every run returned by ``tnseq_tools.runs_w_info`` the
        genes it touches are looked up, and each gene keeps the run with the
        largest intersection together with that run's p-value
        (``1 - GumbelCDF``).  P-values are adjusted with
        ``stat_tools.BH_fdr_correction`` and genes with an adjusted p-value
        below 0.05 are called 'Essential'.

        The report is written to ``self.output``, which is always closed
        before the method returns or propagates a write error.
        """
        self.transit_message("Starting Tn5 gaps method")
        start_time = time.time()

        self.transit_message("Getting data (May take a while)")

        # Combine all wigs and reduce them to a 0/1 insertion indicator.
        (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
        counts = tnseq_tools.combine_replicates(data, method=self.replicates)
        counts[counts < self.minread] = 0
        counts[counts > 0] = 1
        num_sites = counts.size

        genes_obj = tnseq_tools.Genes(self.ctrldata, self.annotation_path,
                                      ignoreCodon=self.ignoreCodon,
                                      nterm=self.NTerminus,
                                      cterm=self.CTerminus,
                                      data=data, position=position)

        pins = numpy.mean(counts)
        pnon = 1.0 - pins

        # Expected maximum run length, reported in the file header.
        exprunmax = tnseq_tools.ExpectedRuns(num_sites, pnon)

        # Get the runs
        self.transit_message("Getting non-insertion runs in genome")
        run_arr = tnseq_tools.runs_w_info(counts)
        pos_hash = transit_tools.get_pos_hash(self.annotation_path)

        # Finally, calculate the results
        self.transit_message("Running Tn5 gaps method")
        # Row layout: orf, name, desc, k, n, r, intersection size,
        # run length, p-value.
        results_per_gene = {
            gene.orf: [gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r, 0, 0, 1]
            for gene in genes_obj.genes
        }

        N = len(run_arr)
        accum = 0
        self.progress_range(N)
        for count, run in enumerate(run_arr, 1):
            run_len = run['length']
            accum += run_len
            genes = tnseq_tools.get_genes_in_range(pos_hash, run['start'], run['end'])
            for gene_orf in genes:
                gene = genes_obj[gene_orf]
                inter_sz = self.intersect_size([run['start'], run['end']], [gene.start, gene.end]) + 1

                # Keep only the run with the largest intersection per gene.
                if inter_sz > results_per_gene[gene.orf][6]:
                    B = 1.0 / math.log(1.0 / pnon)
                    u = math.log(num_sites * pins, 1.0 / pnon)
                    pval = 1.0 - tnseq_tools.GumbelCDF(run_len, u, B)
                    results_per_gene[gene.orf] = [gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r, inter_sz, run_len, pval]

            # Update Progress
            text = "Running Tn5Gaps method... %1.1f%%" % (100.0 * count / N)
            self.progress_update(text, count)

        rows = list(results_per_gene.values())
        # Guard against an empty run list instead of dividing by zero.
        exp_run_len = float(accum) / N if N else 0.0

        min_sig_len = float('inf')
        sig_genes_count = 0
        padj = stat_tools.BH_fdr_correction([row[-1] for row in rows])
        for row, qval in zip(rows, padj):
            significant = qval < 0.05
            if significant:
                sig_genes_count += 1
                # row[-2] is still the run length here (nothing appended yet).
                min_sig_len = min(min_sig_len, row[-2])
            row.append(qval)
            row.append('Essential' if significant else 'Non-essential')
        rows.sort(key=lambda row: row[0])

        # Output results
        try:
            self.output.write("#Tn5 Gaps\n")
            if self.wxobj:
                # Values are interpolated as text; encoding them to bytes
                # first would write b'...' literals under Python 3.
                self.output.write("#GUI with: ctrldata=%s, annotation=%s, output=%s\n" % (",".join(self.ctrldata), self.annotation_path, self.output.name))
            else:
                self.output.write("#Console: python %s\n" % " ".join(sys.argv))

            self.output.write("#Data: %s\n" % (",".join(self.ctrldata)))
            self.output.write("#Annotation path: %s\n" % self.annotation_path)
            self.output.write("#Time: %s\n" % (time.time() - start_time))
            self.output.write("#Essential gene count: %d\n" % (sig_genes_count))
            self.output.write("#Minimum reads: %d\n" % (self.minread))
            self.output.write("#Replicate combination method: %s\n" % (self.replicates))
            if sig_genes_count:
                self.output.write("#Minimum significant run length: %d\n" % (min_sig_len))
            else:
                # No significant gene: min_sig_len is still infinity, which
                # "%d" cannot format (it raises OverflowError).
                self.output.write("#Minimum significant run length: N/A\n")
            self.output.write("#Expected run length: %1.5f\n" % (exp_run_len))
            self.output.write("#Expected max run length: %s\n" % (exprunmax))
            self.output.write("#%s\n" % "\t".join(columns))

            for res in rows:
                self.output.write("%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%1.5f\t%1.5f\t%s\n" % (res[0], res[1], res[2], res[3], res[4], res[5], res[6], res[7], res[8], res[9], res[10]))
        finally:
            # Release the file handle even if a write fails part-way.
            self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Tn5 Gaps")
        self.finish()
        self.transit_message("Finished Tn5Gaps Method")