def test_nonorm(self):
    """Check that "nonorm" returns factors equal to 1 and leaves each dataset mean at its raw_means value."""
    raw_counts, _sites = tnseq_tools.get_data(all_data_list)
    normalized, factors = norm_tools.normalize_data(raw_counts, "nonorm")

    expected_factors = numpy.array([1.])
    self.assertTrue((factors == expected_factors).all())

    # Compare every dataset's mean with the corresponding entry of raw_means.
    for idx in range(len(all_data_list)):
        self.assertEqual(numpy.mean(normalized[idx]), raw_means[idx])
def test_nonorm(self):
    """The "nonorm" method must act as a no-op: unit factors, unchanged means."""
    counts, _ = tnseq_tools.get_data(all_data_list)
    norm_counts, norm_factors = norm_tools.normalize_data(counts, "nonorm")

    factors_are_one = (norm_factors == numpy.array([1.])).all()
    self.assertTrue(factors_are_one)

    dataset_count = len(all_data_list)
    for dataset in range(dataset_count):
        observed_mean = numpy.mean(norm_counts[dataset])
        self.assertEqual(observed_mean, raw_means[dataset])
def Run(self):
    """Normalize the input counts and write them to ``self.outfile``.

    When ``self.combined_wig`` is True the first entry of ``self.ctrldata`` is
    read as a combined wig and a tab-separated table of all datasets is
    written; otherwise the datasets are loaded with ``tnseq_tools.get_data``
    and a single wig track is written from the first data row only.
    """
    self.transit_message("Starting Normalization")

    infile = self.infile
    outputPath = self.outfile  # output file exists, should I require -overwrite flag?

    # Determine the reference genome header from the input file; the
    # placeholder is kept when no "variableStep" line is found.
    line2 = "variableStep chrom="  # unknown
    with open(infile) as handle:
        for line in handle:
            if line.startswith("variableStep"):
                line2 = line.rstrip()
                break

    # Evaluate the flag once, keeping the original exact comparison.
    combined = self.combined_wig == True
    if combined:
        (sites, data, files) = tnseq_tools.read_combined_wig(self.ctrldata[0])
    else:
        (data, sites) = tnseq_tools.get_data(self.ctrldata)
    (data, factors) = norm_tools.normalize_data(data, self.normalization)

    # Single pre-formatted argument prints the same text on Python 2 and 3.
    print("writing %s" % outputPath)

    # The context manager closes the output even if a write fails.
    with open(outputPath, "w") as out:
        out.write("# %s normalization of %s\n" % (self.normalization, infile))
        if combined:
            for f in files:
                out.write("#File: %s\n" % f)
            for i in range(len(sites)):
                out.write('\t'.join([str(sites[i])] + ["%0.1f" % x for x in list(data[..., i])]) + "\n")
        else:
            out.write(line2 + "\n")
            for j in range(len(sites)):
                # Only the first dataset row is written, truncated to int.
                out.write("%s %s\n" % (sites[j], int(data[0, j])))

    self.finish()
    self.transit_message("Finished Normalization")
def __init__(self, parent, dataset_list=None, annotation="H37Rv.prot_table", gene="", scale=None, feature_hashes=None, feature_data=None):
    """Initialize the frame: load annotation and datasets, normalize, set up widgets.

    Arguments:
        parent: Parent object; passed to view_trash.MainFrame.__init__ and updateFunc.
        dataset_list (list): Dataset paths; defaults to ["H37Rv_Sassetti_glycerol.wig"].
        annotation (str): Annotation path given to transit_tools.get_gene_info
            and transit_tools.get_pos_hash.
        gene (str): If non-empty, placed in the search box and searched for.
        scale (list): Per-dataset scale; a falsy value means 150 for each dataset.
        feature_hashes (list): Stored as self.feature_hashes; defaults to a new list.
        feature_data (list): Stored as self.feature_data; defaults to a new list.
    """
    # None sentinels replace mutable default lists: the feature lists are
    # stored on the instance, so a shared default would leak between frames.
    if dataset_list is None:
        dataset_list = ["H37Rv_Sassetti_glycerol.wig"]
    if feature_hashes is None:
        feature_hashes = []
    if feature_data is None:
        feature_data = []

    view_trash.MainFrame.__init__(self, parent)

    self.parent = parent
    self.size = wx.Size(1500, 800)
    self.start = 1
    self.end = 10000

    self.orf2data = transit_tools.get_gene_info(annotation)
    self.hash = transit_tools.get_pos_hash(annotation)
    self.features = []

    # Data to facilitate search: lower-cased gene name -> list of orf ids,
    # and lower-cased orf id -> original orf id.
    self.name2id = {}
    for orf, (name, desc, start, end, strand) in self.orf2data.items():
        self.name2id.setdefault(name.lower(), []).append(orf)
    self.lowerid2id = {x.lower(): x for x in self.orf2data.keys()}

    self.labels = [fetch_name(d) for d in dataset_list]
    (self.fulldata, self.position) = tnseq_tools.get_data(dataset_list)

    # Save normalized data
    (self.fulldata_norm, self.factors) = norm_tools.normalize_data(self.fulldata, method="nzmean")
    self.wasNorm = False

    self.feature_hashes = feature_hashes
    self.feature_data = feature_data

    if not scale:
        scale = [150] * len(dataset_list)
    self.scale = scale
    self.globalScale = False

    self.datasetChoice.SetItems(self.labels)
    self.datasetChoice.SetSelection(0)

    if gene:
        self.searchText.SetValue(gene)
        self.searchFunc(gene)

    self.updateFunc(parent)
    self.Fit()
def test_TTR(self):
    """Check that "TTR" gives factors that are not all 1 and changes every dataset mean."""
    dataset_count = len(all_data_list)
    raw_counts, _sites = tnseq_tools.get_data(all_data_list)
    normalized, factors = norm_tools.normalize_data(raw_counts, "TTR")

    factors_all_one = (factors == numpy.ones(dataset_count)).all()
    self.assertFalse(factors_all_one)

    # Each normalized mean must differ from the corresponding raw_means entry.
    for idx in range(dataset_count):
        self.assertNotEqual(numpy.mean(normalized[idx]), raw_means[idx])
def convertToIGV(self, dataset_list, annotationPath, path, normchoice=None):
    """Write the (optionally normalized) datasets to ``path`` as an IGV table.

    Arguments:
        dataset_list (list): Dataset paths loaded with tnseq_tools.get_data.
        annotationPath (str): Annotation path; its fetch_name() value is
            written in the chromosome column.
        path (str): Output file path.
        normchoice (str): Normalization method; a falsy value means "nonorm".
    """
    if not normchoice:
        normchoice = "nonorm"

    (fulldata, position) = tnseq_tools.get_data(dataset_list)
    (fulldata, factors) = norm_tools.normalize_data(fulldata, normchoice, dataset_list, annotationPath)
    position = position.astype(int)

    # The context manager closes the file even if a write raises.
    with open(path, "w") as output:
        output.write("#Converted to IGV with TRANSIT.\n")
        if normchoice != "nonorm":
            output.write("#Reads normalized using '%s'\n" % normchoice)

        output.write("#Files:\n#%s\n" % "\n#".join(dataset_list))
        dataset_names = "\t".join([transit_tools.fetch_name(D) for D in dataset_list])
        output.write("#Chromosome\tStart\tEnd\tFeature\t%s\tTAs\n" % dataset_names)
        chrom = transit_tools.fetch_name(annotationPath)

        # One row per site: coordinates, a "TA<pos>" feature name, then one
        # count per dataset.
        for i, pos in enumerate(position):
            counts = "\t".join(["%1.1f" % fulldata[j][i] for j in range(len(fulldata))])
            output.write("%s\t%s\t%s\tTA%s\t%s\t1\n" % (chrom, pos, pos + 1, pos, counts))
def Run(self):
    """Normalize the control data and write the result to ``self.outfile``.

    With ``self.combined_wig`` set to True, ``self.ctrldata[0]`` is read as a
    combined wig and all datasets are written as tab-separated columns.
    Otherwise the wigs are loaded with ``tnseq_tools.get_data`` and only the
    first data row is written as a wig track.
    """
    self.transit_message("Starting Normalization")

    infile = self.infile
    outputPath = self.outfile  # output file exists, should I require -overwrite flag?

    # Take the "variableStep" header from the input file if it has one;
    # otherwise the placeholder below is written.
    line2 = "variableStep chrom="  # unknown
    with open(infile) as source:
        for line in source:
            if line.startswith("variableStep"):
                line2 = line.rstrip()
                break

    is_combined = self.combined_wig == True  # same exact comparison as before
    if is_combined:
        (sites, data, files) = tnseq_tools.read_combined_wig(self.ctrldata[0])
    else:
        (data, sites) = tnseq_tools.get_data(self.ctrldata)
    (data, factors) = norm_tools.normalize_data(data, self.normalization)

    # Formatted as one string so the output is identical on Python 2 and 3.
    print("writing %s" % outputPath)

    # "with" guarantees the output file is closed on error as well.
    with open(outputPath, "w") as sink:
        sink.write("# %s normalization of %s\n" % (self.normalization, infile))
        if is_combined:
            for f in files:
                sink.write("#File: %s\n" % f)
            for i in range(len(sites)):
                row = [str(sites[i])] + ["%0.1f" % x for x in list(data[..., i])]
                sink.write('\t'.join(row) + "\n")
        else:
            sink.write(line2 + "\n")
            for j in range(len(sites)):
                # Only row 0 of the data is emitted, as an integer.
                sink.write("%s %s\n" % (sites[j], int(data[0, j])))

    self.finish()
    self.transit_message("Finished Normalization")
def test_TTR(self):
    """TTR normalization must not return all-ones factors nor keep any raw mean."""
    n_datasets = len(all_data_list)
    counts, _ = tnseq_tools.get_data(all_data_list)
    norm_counts, norm_factors = norm_tools.normalize_data(counts, "TTR")

    unit_factors = numpy.ones(n_datasets)
    self.assertFalse((norm_factors == unit_factors).all())

    for dataset in range(n_datasets):
        observed_mean = numpy.mean(norm_counts[dataset])
        self.assertNotEqual(observed_mean, raw_means[dataset])
def Run(self):
    """Export the control datasets to ``self.output`` as an IGV table.

    Loads and normalizes ``self.ctrldata``, writes header comments (method,
    factors, file list) and one row per site, then closes ``self.output``.
    """
    self.transit_message("Starting IGV Export")
    start_time = time.time()

    # Get orf data
    self.transit_message("Getting Data")
    (raw_counts, position) = tnseq_tools.get_data(self.ctrldata)
    (fulldata, factors) = norm_tools.normalize_data(
        raw_counts, self.normalization, self.ctrldata, self.annotation_path)
    position = position.astype(int)

    pos_hash = transit_tools.get_pos_hash(self.annotation_path)
    gene_info = transit_tools.get_gene_info(self.annotation_path)

    self.transit_message("Normalizing")
    self.output.write("#Converted to IGV with TRANSIT.\n")
    if self.normalization != "nonorm":
        self.output.write("#Reads normalized using '%s'\n" % self.normalization)
        # Scalar factors go on one tab-separated line; otherwise each entry
        # is itself iterated and comma-joined.
        if type(factors[0]) == type(0.0):
            factor_text = "\t".join(["%s" % f for f in factors.flatten()])
        else:
            factor_text = " ".join([",".join(["%s" % bx for bx in b]) for b in factors])
        self.output.write("#Normalization Factors: %s\n" % factor_text)

    self.output.write("#Files:\n")
    for f in self.ctrldata:
        self.output.write("#%s\n" % f)

    dataset_str = "\t".join([transit_tools.fetch_name(F) for F in self.ctrldata])
    self.output.write("#Chromosome\tStart\tEnd\tFeature\t%s\tTAs\n" % dataset_str)
    chrom = transit_tools.fetch_name(self.annotation_path)

    (K, N) = fulldata.shape
    self.progress_range(N)
    for i, pos in enumerate(position):
        counts = "\t".join(["%1.1f" % row[i] for row in fulldata])
        self.output.write("%s\t%s\t%s\tTA%s\t%s\t1\n" % (chrom, pos, pos + 1, pos, counts))

        # Update progress
        text = "Running Export Method... %5.1f%%" % (100.0 * i / N)
        self.progress_update(text, i)

    self.output.close()
    self.transit_message("")  # Printing empty line to flush stdout
    self.finish()
    self.transit_message("Finished Export")
def Run(self):
    """Export per-gene mean counts for the control datasets to ``self.output``.

    Writes header comments (method, factors, file list) followed by one row
    per gene: orf, name, number of TA sites, and the mean of ``gene.reads``
    along axis 1 (zeros when the gene has no sites).
    """
    self.transit_message("Starting Gene Mean Counts Export")
    start_time = time.time()

    # Get orf data
    self.transit_message("Getting Data")
    (raw_counts, position) = tnseq_tools.get_data(self.ctrldata)
    (fulldata, factors) = norm_tools.normalize_data(
        raw_counts, self.normalization, self.ctrldata, self.annotation_path)
    position = position.astype(int)

    pos_hash = transit_tools.get_pos_hash(self.annotation_path)
    gene_info = transit_tools.get_gene_info(self.annotation_path)

    self.transit_message("Normalizing")
    self.output.write("#Summarized to Mean Gene Counts with TRANSIT.\n")
    if self.normalization != "nonorm":
        self.output.write("#Reads normalized using '%s'\n" % self.normalization)
        if type(factors[0]) == type(0.0):
            factor_text = "\t".join(["%s" % f for f in factors.flatten()])
        else:
            factor_text = " ".join([",".join(["%s" % bx for bx in b]) for b in factors])
        self.output.write("#Normalization Factors: %s\n" % factor_text)

    self.output.write("#Files:\n")
    for f in self.ctrldata:
        self.output.write("#%s\n" % f)

    K, Nsites = fulldata.shape

    # Get Gene objects
    G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization)
    N = len(G)
    self.progress_range(N)

    dataset_header = "\t".join([transit_tools.fetch_name(D) for D in self.ctrldata])
    self.output.write("#Orf\tName\tNumber of TA sites\t%s\n" % dataset_header)
    for i, gene in enumerate(G):
        # Genes without sites get one zero per dataset.
        means = numpy.mean(gene.reads, 1) if gene.n > 0 else numpy.zeros(K)
        data_str = "\t".join(["%1.2f" % (value) for value in means])
        self.output.write("%s\t%s\t%s\t%s\n" % (gene.orf, gene.name, gene.n, data_str))

        # Update progress
        text = "Running Export Method... %5.1f%%" % (100.0 * i / N)
        self.progress_update(text, i)

    self.output.close()
    self.transit_message("")  # Printing empty line to flush stdout
    self.finish()
    self.transit_message("Finished Export")
def Run(self):
    """Export the control datasets to ``self.output`` in CombinedWig format.

    Writes header comments, then one row per site: coordinate, one value per
    dataset, and the "orf (name)" annotations found for that coordinate.
    """
    self.transit_message("Starting Combined Wig Export")
    start_time = time.time()

    # Get orf data
    self.transit_message("Getting Data")
    (raw_counts, position) = tnseq_tools.get_data(self.ctrldata)
    (fulldata, factors) = norm_tools.normalize_data(
        raw_counts, self.normalization, self.ctrldata, self.annotation_path)
    position = position.astype(int)

    pos_hash = transit_tools.get_pos_hash(self.annotation_path)
    rv2info = transit_tools.get_gene_info(self.annotation_path)

    self.transit_message("Normalizing")
    self.output.write("#Converted to CombinedWig with TRANSIT.\n")
    self.output.write("#normalization method: %s\n" % self.normalization)
    if self.normalization != "nonorm":
        if type(factors[0]) == type(0.0):
            factor_text = "\t".join(["%s" % f for f in factors.flatten()])
        else:
            factor_text = " ".join([",".join(["%s" % bx for bx in b]) for b in factors])
        self.output.write("#Normalization Factors: %s\n" % factor_text)

    (K, N) = fulldata.shape
    for f in self.ctrldata:
        self.output.write("#File: %s\n" % f)
    self.output.write("#TAcoord\t%s\n" % ('\t'.join(self.ctrldata)))

    for i, pos in enumerate(position):
        column = fulldata[:, i]
        if self.normalization == 'nonorm':
            vals = "\t".join(["%d" % c for c in column])  # no decimals if raw counts
        else:
            vals = "\t".join(["%1.1f" % c for c in column])
        annotations = ",".join(
            ["%s (%s)" % (orf, rv2info.get(orf, ["-"])[0]) for orf in pos_hash.get(pos, [])])
        self.output.write("%d\t%s\t%s\n" % (pos, vals, annotations))

        # Update progress
        text = "Running Export Method... %5.1f%%" % (100.0 * i / N)
        self.progress_update(text, i)

    self.output.close()
    self.transit_message("")  # Printing empty line to flush stdout
    self.finish()
    self.transit_message("Finished Export")
def preprocess_data(self, position, data):
    """Apply the configured normalization and optional LOESS correction.

    Arguments:
        position: Passed to stat_tools.loess_correction for every row.
        data: Two-dimensional array; rows are corrected in place by LOESS.

    Returns:
        The processed data. This is the input object itself when
        normalization is "nonorm".
    """
    num_rows, _num_cols = data.shape

    if self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        all_files = self.ctrldata + self.expdata
        data, _factors = norm_tools.normalize_data(
            data, self.normalization, all_files, self.annotation_path)

    if self.LOESS:
        self.transit_message("Performing LOESS Correction")
        for row in range(num_rows):
            data[row] = stat_tools.loess_correction(position, data[row])

    return data
def preprocess_data(self, position, data):
    """Normalize ``data`` (unless "nonorm") and LOESS-correct it if enabled.

    Returns the resulting array. With normalization "nonorm" and LOESS
    disabled the input is returned untouched.
    """
    dataset_count = data.shape[0]
    # Keep the original requirement that data is exactly two-dimensional.
    (_, _) = data.shape

    wants_normalization = self.normalization != "nonorm"
    if wants_normalization:
        self.transit_message("Normalizing using: %s" % self.normalization)
        normalized = norm_tools.normalize_data(
            data, self.normalization, self.ctrldata + self.expdata, self.annotation_path)
        (data, factors) = normalized

    if not self.LOESS:
        return data

    self.transit_message("Performing LOESS Correction")
    for k in range(dataset_count):
        data[k] = stat_tools.loess_correction(position, data[k])
    return data
def convertToCombinedWig(dataset_list, annotationPath, outputPath, normchoice="nonorm"):
    """Normalizes the input datasets and outputs the result in CombinedWig format.

    Arguments:
        dataset_list (list): List of paths to datasets in .wig format
        annotationPath (str): Path to annotation in .prot_table or GFF3 format.
        outputPath (str): Desired output path.
        normchoice (str): Choice for normalization method.
    """
    (fulldata, position) = tnseq_tools.get_data(dataset_list)
    (fulldata, factors) = norm_tools.normalize_data(fulldata, normchoice, dataset_list, annotationPath)
    position = position.astype(int)

    pos_hash = get_pos_hash(annotationPath)
    rv2info = get_gene_info(annotationPath)

    # The context manager closes the file even if a write raises.
    with open(outputPath, "w") as output:
        output.write("#Converted to CombinedWig with TRANSIT.\n")
        if normchoice != "nonorm":
            output.write("#Reads normalized using '%s'\n" % normchoice)
            # Scalar factors go on one tab-separated line; otherwise each
            # entry is itself iterated and comma-joined.
            if type(factors[0]) == type(0.0):
                output.write("#Normalization Factors: %s\n" % "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                output.write("#Normalization Factors: %s\n" % " ".join([",".join(["%s" % bx for bx in b]) for b in factors]))

        output.write("#Files:\n")
        for f in dataset_list:
            output.write("#%s\n" % f)

        # One row per site: coordinate, one value per dataset, then the
        # "orf (name)" annotations found for that coordinate.
        for i, pos in enumerate(position):
            counts = "\t".join(["%1.1f" % c for c in fulldata[:, i]])
            annotations = ",".join(
                ["%s (%s)" % (orf, rv2info.get(orf, ["-"])[0]) for orf in pos_hash.get(pos, [])])
            output.write("%d\t%s\t%s\n" % (pos, counts, annotations))
def Run(self):
    """Run the ANOVA analysis and write a tab-separated report to ``self.output``.

    Reads the combined wig and samples metadata, normalizes the counts,
    keeps the selected conditions, computes per-gene means and ANOVA
    p/q-values, and writes one row per gene present in the means table.
    """
    self.transit_message("Starting Anova analysis")
    start_time = time.time()

    self.transit_message("Getting Data")
    (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

    self.transit_message("Normalizing using: %s" % self.normalization)
    (data, factors) = norm_tools.normalize_data(data, self.normalization)

    conditionsByFile, _, _, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata)
    conditions = self.wigs_to_conditions(conditionsByFile, filenamesInCombWig)
    conditionsList = self.select_conditions(
        conditions, self.included_conditions, self.ignored_conditions, orderingMetadata)
    data, conditions, _, _ = self.filter_wigs_by_conditions2(data, conditions, conditionsList)

    genes = tnseq_tools.read_genes(self.annotation_path)
    TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
    RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(
        genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
    MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions)

    self.transit_message("Running Anova")
    pvals, qvals, run_status = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap, conditions)

    self.transit_message("Adding File: %s" % (self.output))
    heads = ("Rv Gene TAs".split() +
             ["Mean_%s" % x for x in conditionsList] +
             ["LFC_%s" % x for x in conditionsList] +
             "pval padj".split() + ["status"])

    # The context manager closes the report even if a write raises.
    with open(self.output, "w") as report:
        report.write("#Console: python3 %s\n" % " ".join(sys.argv))
        report.write("#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n" % (self.normalization, self.NTerminus, self.CTerminus, self.PC))
        report.write('#' + '\t'.join(heads) + EOL)
        for gene in genes:
            Rv = gene["rv"]
            # Genes without an entry in the means table are skipped.
            if Rv not in MeansByRv:
                continue
            means = [MeansByRv[Rv][c] for c in conditionsList]
            LFCs = self.calcLFCs(means, self.PC)
            vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                    ["%0.2f" % x for x in means] +
                    ["%0.3f" % x for x in LFCs] +
                    ["%f" % x for x in [pvals[Rv], qvals[Rv]]] +
                    [run_status[Rv]])
            report.write('\t'.join(vals) + EOL)

    self.transit_message("Finished Anova analysis")
    self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
def convertToGeneCountSummary(dataset_list, annotationPath, outputPath, normchoice="nonorm"):
    """Normalizes the input datasets and outputs a per-gene mean count summary.

    One row is written per gene: orf, name, number of TA sites and, for each
    dataset, the mean of the gene's reads (0.00 when the gene has no sites).

    Arguments:
        dataset_list (list): List of paths to datasets in .wig format
        annotationPath (str): Path to annotation in .prot_table or GFF3 format.
        outputPath (str): Desired output path.
        normchoice (str): Choice for normalization method.
    """
    (fulldata, position) = tnseq_tools.get_data(dataset_list)
    (fulldata, factors) = norm_tools.normalize_data(fulldata, normchoice, dataset_list, annotationPath)
    # Only the dataset count K is taken from the normalized matrix; the
    # per-gene values come from the Genes object built below.
    (K, N) = fulldata.shape

    # The context manager closes the file even if a write raises.
    with open(outputPath, "w") as output:
        output.write("#Summarized to Mean Gene Counts with TRANSIT.\n")
        if normchoice != "nonorm":
            output.write("#Reads normalized using '%s'\n" % normchoice)
            if type(factors[0]) == type(0.0):
                output.write("#Normalization Factors: %s\n" % "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                output.write("#Normalization Factors: %s\n" % " ".join([",".join(["%s" % bx for bx in b]) for b in factors]))

        output.write("#Files:\n")
        for f in dataset_list:
            output.write("#%s\n" % f)

        # Get Gene objects
        G = tnseq_tools.Genes(dataset_list, annotationPath, norm=normchoice)

        dataset_header = "\t".join([os.path.basename(D) for D in dataset_list])
        output.write("#Orf\tName\tNumber of TA sites\t%s\n" % dataset_header)
        for gene in G:
            if gene.n > 0:
                data_str = "\t".join(["%1.2f" % (M) for M in numpy.mean(gene.reads, 1)])
            else:
                data_str = "\t".join(["%1.2f" % (Z) for Z in numpy.zeros(K)])
            output.write("%s\t%s\t%s\t%s\n" % (gene.orf, gene.name, gene.n, data_str))
def refresh(self):
    """Recompute the normalized data and rebuild the file list and plots.

    Any exception raised along the way is reported on stdout (message, then
    exception type, file name and line number) instead of being propagated.
    """
    try:
        self.plots_list = []
        self.statsListCtrl.DeleteAllItems()
        (self.normdata, factors) = norm_tools.normalize_data(self.data, self.norm)
        self.updateFiles()
        self.addPlots()
        self.statsListCtrl.Select(0)
        self.refreshPlots()
    except Exception as e:
        # Deliberately best-effort: report and carry on rather than raise.
        # Formatted as one string so the text is the same on Python 2 and 3
        # (the former print statement is a syntax error on Python 3).
        print("%s Error: %s" % (self.qc_prefix, e))
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
def refresh(self):
    """Re-normalize the data and rebuild the stats list and plots.

    Exceptions are not propagated: the error message and the exception
    type, file name and line number are printed to stdout instead.
    """
    try:
        self.plots_list = []
        self.statsListCtrl.DeleteAllItems()
        normalized = norm_tools.normalize_data(self.data, self.norm)
        (self.normdata, factors) = normalized
        self.updateFiles()
        self.addPlots()
        self.statsListCtrl.Select(0)
        self.refreshPlots()
    except Exception as err:
        print(self.qc_prefix, "Error:", err)
        err_type, _err_value, err_tb = sys.exc_info()
        source_file = os.path.basename(err_tb.tb_frame.f_code.co_filename)
        print(err_type, source_file, err_tb.tb_lineno)
def Run(self):
    """Write the control datasets to ``self.output`` as an IGV table and close it.

    The header lists the normalization method and factors (unless "nonorm")
    and the input files; each data row holds one site with a count per dataset.
    """
    self.transit_message("Starting IGV Export")
    start_time = time.time()

    # Get orf data
    self.transit_message("Getting Data")
    (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
    (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization, self.ctrldata, self.annotation_path)
    position = position.astype(int)

    site_to_genes = transit_tools.get_pos_hash(self.annotation_path)
    orf_info = transit_tools.get_gene_info(self.annotation_path)

    self.transit_message("Normalizing")
    self.output.write("#Converted to IGV with TRANSIT.\n")

    normalized = self.normalization != "nonorm"
    if normalized:
        self.output.write("#Reads normalized using '%s'\n" % self.normalization)
        scalar_factors = type(factors[0]) == type(0.0)
        if scalar_factors:
            pieces = ["%s" % f for f in factors.flatten()]
            self.output.write("#Normalization Factors: %s\n" % "\t".join(pieces))
        else:
            pieces = [",".join(["%s" % bx for bx in b]) for b in factors]
            self.output.write("#Normalization Factors: %s\n" % " ".join(pieces))

    self.output.write("#Files:\n")
    for path in self.ctrldata:
        self.output.write("#%s\n" % path)

    names = [transit_tools.fetch_name(F) for F in self.ctrldata]
    self.output.write("#Chromosome\tStart\tEnd\tFeature\t%s\tTAs\n" % "\t".join(names))
    chrom = transit_tools.fetch_name(self.annotation_path)

    (K, N) = fulldata.shape
    self.progress_range(N)
    row_template = "%s\t%s\t%s\tTA%s\t%s\t1\n"
    for i, pos in enumerate(position):
        per_dataset = ["%1.1f" % fulldata[j][i] for j in range(len(fulldata))]
        self.output.write(row_template % (chrom, pos, pos + 1, pos, "\t".join(per_dataset)))

        # Update progress
        percent_done = 100.0 * i / N
        self.progress_update("Running Export Method... %5.1f%%" % percent_done, i)

    self.output.close()
    self.transit_message("")  # Printing empty line to flush stdout
    self.finish()
    self.transit_message("Finished Export")
def Run(self):
    """Write the control datasets to ``self.output`` in CombinedWig format and close it.

    Raw ("nonorm") values are written without decimals, normalized values
    with one decimal; each row ends with the "orf (name)" annotations
    looked up for the site.
    """
    self.transit_message("Starting Combined Wig Export")
    start_time = time.time()

    # Get orf data
    self.transit_message("Getting Data")
    (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
    (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization, self.ctrldata, self.annotation_path)
    position = position.astype(int)

    site_to_orfs = transit_tools.get_pos_hash(self.annotation_path)
    orf_info = transit_tools.get_gene_info(self.annotation_path)

    self.transit_message("Normalizing")
    self.output.write("#Converted to CombinedWig with TRANSIT.\n")
    self.output.write("#normalization method: %s\n" % self.normalization)
    if self.normalization != "nonorm":
        scalar_factors = type(factors[0]) == type(0.0)
        if scalar_factors:
            pieces = ["%s" % f for f in factors.flatten()]
            self.output.write("#Normalization Factors: %s\n" % "\t".join(pieces))
        else:
            pieces = [",".join(["%s" % bx for bx in b]) for b in factors]
            self.output.write("#Normalization Factors: %s\n" % " ".join(pieces))

    (K, N) = fulldata.shape
    for path in self.ctrldata:
        self.output.write("#File: %s\n" % path)
    self.output.write("#TAcoord\t%s\n" % ('\t'.join(self.ctrldata)))

    for i, pos in enumerate(position):
        # no decimals if raw counts
        value_format = "%1.1f" if self.normalization != 'nonorm' else "%d"
        vals = "\t".join([value_format % c for c in fulldata[:, i]])

        labels = []
        for orf in site_to_orfs.get(pos, []):
            labels.append("%s (%s)" % (orf, orf_info.get(orf, ["-"])[0]))
        self.output.write("%d\t%s\t%s\n" % (pos, vals, ",".join(labels)))

        # Update progress
        percent_done = 100.0 * i / N
        self.progress_update("Running Export Method... %5.1f%%" % percent_done, i)

    self.output.close()
    self.transit_message("")  # Printing empty line to flush stdout
    self.finish()
    self.transit_message("Finished Export")
def Run(self):
    """Run the ANOVA analysis and write a tab-separated report to ``self.output``.

    Reads the combined wig, normalizes it, drops the ignored conditions,
    computes per-gene means and ANOVA p/q-values, and writes one row per
    gene present in the means table.
    """
    self.transit_message("Starting Anova analysis")

    self.transit_message("Getting Data")
    (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

    self.transit_message("Normalizing using: %s" % self.normalization)
    (data, factors) = norm_tools.normalize_data(data, self.normalization)

    conditions = self.wigs_to_conditions(
        self.read_samples_metadata(self.metadata), filenamesInCombWig)
    data, conditions = self.filter_by_conditions_blacklist(
        data, conditions, self.ignored_conditions)

    genes = tnseq_tools.read_genes(self.annotation_path)
    TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
    RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap)
    MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions)

    self.transit_message("Running Anova")
    pvals, qvals = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap, conditions)

    self.transit_message("Adding File: %s" % (self.output))

    # Sort the distinct conditions so the column order is the same on every
    # run; plain set iteration order is not guaranteed to be stable.
    conditionsList = sorted(set(conditions))

    # The context manager closes the report even if a write raises.
    with open(self.output, "w") as report:
        header = "Rv Gene TAs".split() + conditionsList + "pval padj".split()
        report.write('\t'.join(header) + EOL)
        for gene in genes:
            Rv = gene["rv"]
            # Genes without an entry in the means table are skipped.
            if Rv not in MeansByRv:
                continue
            vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                    ["%0.1f" % MeansByRv[Rv][c] for c in conditionsList] +
                    ["%f" % x for x in [pvals[Rv], qvals[Rv]]])
            report.write('\t'.join(vals) + EOL)

    self.transit_message("Finished Anova analysis")
def Run(self):
    """Run the HMM method on the control data and write the per-site and per-gene outputs.

    Steps, in order: load (and optionally normalize / LOESS-correct) the data,
    combine replicates into one observation vector, derive the geometric
    emission parameters and the transition matrix, run Viterbi plus the
    forward/backward passes, write the sites report to ``self.output`` and
    close it, then write the genes report via ``self.post_process_genes``.
    """
    self.transit_message("Starting HMM Method")
    start_time = time.time()  # not read again in this method

    #Get data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
    (K, N) = data.shape

    # Normalize data
    if self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata, self.annotation_path)

    # Do LOESS
    if self.LOESS:
        self.transit_message("Performing LOESS Correction")
        for j in range(K):
            data[j] = stat_tools.loess_correction(position, data[j])

    hash = transit_tools.get_pos_hash(self.annotation_path)
    rv2info = transit_tools.get_gene_info(self.annotation_path)

    # The message is only shown for multiple datasets; the combination
    # itself always runs because O is needed below.
    if len(self.ctrldata) > 1:
        self.transit_message("Combining Replicates as '%s'" % self.replicates)
    O = tnseq_tools.combine_replicates(data, method=self.replicates) + 1  # Adding 1 to because of shifted geometric in scipy

    #Parameters
    Nstates = 4
    # State labels; presumably Essential / Growth-Defect / Non-Essential /
    # Growth-Advantage -- TODO confirm.
    label = {0: "ES", 1: "GD", 2: "NE", 3: "GA"}

    reads = O - 1
    reads_nz = sorted(reads[reads != 0])
    size = len(reads_nz)
    # Mean of the lowest 95% of the sorted non-zero reads.
    mean_r = numpy.average(reads_nz[:int(0.95 * size)])
    # Per-state means (reported below as "State means").
    mu = numpy.array([1 / 0.99, 0.01 * mean_r + 2, mean_r, mean_r * 5.0])
    #mu = numpy.array([1/0.99, 0.1 * mean_r + 2, mean_r, mean_r*5.0])
    # Geometric parameters (reported below as "State Emission Parameters (theta)").
    L = 1.0 / mu
    B = []  # Emission Probability Distributions
    for i in range(Nstates):
        B.append(scipy.stats.geom(L[i]).pmf)

    pins = self.calculate_pins(O - 1)
    # Fraction of observations with O >= 2, i.e. at least one read.
    pins_obs = sum([1 for rd in O if rd >= 2]) / float(len(O))
    pnon = 1.0 - pins
    pnon_obs = 1.0 - pins_obs  # not read again in this method

    # Smallest r in 0..99 with pnon**r < 0.01; r stays at 99 if none qualifies.
    for r in range(100):
        if pnon**r < 0.01:
            break

    # Transition matrix in log space, with p = B[Nstates/2](1):
    # diagonal a = log(1 - p**r), off-diagonal b = log(p**r / 3).
    A = numpy.zeros((Nstates, Nstates))
    a = math.log1p(-B[int(Nstates / 2)](1)**r)
    b = r * math.log(B[int(Nstates / 2)](1)) + math.log(1.0 / 3)  # change to Nstates-1?
    for i in range(Nstates):
        A[i] = [b] * Nstates
        A[i][i] = a

    PI = numpy.zeros(Nstates)  # Initial state distribution
    PI[0] = 0.7
    PI[1:] = 0.3 / (Nstates - 1)

    self.progress_range(self.maxiterations)

    ###############
    ### VITERBI ###
    # Viterbi receives A as built above (log values); the forward and
    # backward passes below receive numpy.exp(A).
    (Q_opt, delta, Q) = self.viterbi(A, B, PI, O)
    ###############

    ##################
    ### ALPHA PASS ###
    (log_Prob_Obs, alpha, C) = self.forward_procedure(numpy.exp(A), B, PI, O)
    ##################

    #################
    ### BETA PASS ###
    beta = self.backward_procedure(numpy.exp(A), B, PI, O, C)
    #################

    # Count how many sites were assigned to each state.
    T = len(O)
    total = 0
    state2count = dict.fromkeys(range(Nstates), 0)
    for t in range(T):
        state = Q_opt[t]
        state2count[state] += 1
        total += 1

    self.output.write("#HMM - Sites\n")
    self.output.write("# Tn-HMM\n")

    if self.wxobj:
        members = sorted([attr for attr in dir(self) if not callable(getattr(self, attr)) and not attr.startswith("__")])
        # memberstr is built but never written anywhere in this method.
        memberstr = ""
        for m in members:
            memberstr += "%s = %s, " % (m, getattr(self, m))
        # NOTE(review): on Python 3 the .encode('utf-8') results are bytes, so
        # "%s" would render them as b'...' -- confirm the target interpreter.
        self.output.write("#GUI with: ctrldata=%s, annotation=%s, output=%s\n" % (",".join(self.ctrldata).encode('utf-8'), self.annotation_path.encode('utf-8'), self.output.name.encode('utf-8')))
    else:
        self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

    self.output.write("# \n")
    self.output.write("# Mean:\t%2.2f\n" % (numpy.average(reads_nz)))
    self.output.write("# Median:\t%2.2f\n" % numpy.median(reads_nz))
    self.output.write("# Normalization:\t%s\n" % self.normalization)
    self.output.write("# LOESS Correction:\t%s\n" % str(self.LOESS))
    self.output.write("# pins (obs):\t%f\n" % pins_obs)
    self.output.write("# pins (est):\t%f\n" % pins)
    self.output.write("# Run length (r):\t%d\n" % r)
    self.output.write("# State means:\n")
    self.output.write("# %s\n" % " ".join(["%s: %8.4f" % (label[i], mu[i]) for i in range(Nstates)]))
    self.output.write("# Self-Transition Prob:\n")
    self.output.write("# %s\n" % " ".join(["%s: %2.4e" % (label[i], A[i][i]) for i in range(Nstates)]))
    self.output.write("# State Emission Parameters (theta):\n")
    self.output.write("# %s\n" % " ".join(["%s: %1.4f" % (label[i], L[i]) for i in range(Nstates)]))
    # NOTE(review): unlike the other section headers this string has no
    # trailing newline, so the next write continues on the same output line.
    self.output.write("# State Distributions:")
    self.output.write("# %s\n" % " ".join(["%s: %2.2f%%" % (label[i], state2count[i] * 100.0 / total) for i in range(Nstates)]))

    states = [int(Q_opt[t]) for t in range(T)]
    last_orf = ""  # not read again in this method
    for t in range(T):
        s_lab = label.get(states[t], "Unknown State")
        # Product of alpha and beta for this site, normalized to sum to 1.
        gamma_t = (alpha[:, t] * beta[:, t]) / numpy.sum(alpha[:, t] * beta[:, t])
        genes_at_site = hash.get(position[t], [""])
        genestr = ""
        # Leave genestr empty when the lookup gave only the [""] placeholder.
        if not (len(genes_at_site) == 1 and not genes_at_site[0]):
            genestr = ",".join(["%s_(%s)" % (g, rv2info.get(g, "-")[0]) for g in genes_at_site])

        # O was shifted by +1 above, so the original read count is O[t] - 1.
        self.output.write("%s\t%s\t%s\t%s\t%s\n" % (int(position[t]), int(O[t]) - 1, "\t".join(["%-9.2e" % g for g in gamma_t]), s_lab, genestr))

    self.output.close()

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Finished HMM - Sites Method")
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="HMM - Sites")

    #Gene Files
    self.transit_message("Creating HMM Genes Level Output")
    # Insert "_genes" before the last extension of the sites output name.
    genes_path = ".".join(self.output.name.split(".")[:-1]) + "_genes." + self.output.name.split(".")[-1]

    # Single-row matrix holding the un-shifted observations.
    tempObs = numpy.zeros((1, len(O)))
    tempObs[0, :] = O - 1
    self.post_process_genes(tempObs, position, states, genes_path)

    self.transit_message("Adding File: %s" % (genes_path))
    self.add_file(path=genes_path, filetype="HMM - Genes")
    self.finish()
    self.transit_message("Finished HMM Method")
def Run(self):
    """Run the Binomial method and write per-gene essentiality calls.

    Reads the control datasets in ``self.ctrldata``, optionally normalizes
    them, and then samples, for ``self.samples + self.burnin`` iterations,
    the per-gene insertion density ``theta``, the per-gene indicator ``Z``
    and the hyperparameters ``rho0``/``Kp0``/``rho1``/``Kp1`` (random-walk
    proposals accepted with a log-uniform test).  Post-burn-in averages of
    ``Z`` are compared with the thresholds returned by
    ``stat_tools.bayesian_ess_thresholds`` to label each gene, and one
    tab-separated row per gene is written to ``self.output``, which is
    closed before returning.
    """

    def log_beta_sum(x, strength, mean):
        # Sum of log Beta(x; strength*mean, strength*(1-mean)) densities.
        return numpy.sum(
            numpy.log(
                scipy.stats.beta.pdf(x, strength * mean,
                                     strength * (1 - mean))))

    self.transit_message("Starting Binomial Method")
    sample_size = self.samples + self.burnin
    self.progress_range(sample_size)

    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                        wxobj=self.wxobj)

    if self.normalization and self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data,
                                                    self.normalization,
                                                    self.ctrldata,
                                                    self.annotation_path)

    G = tnseq_tools.Genes(self.ctrldata,
                          self.annotation_path,
                          minread=1,
                          reps=self.replicates,
                          ignoreCodon=self.ignoreCodon,
                          nterm=self.NTerminus,
                          cterm=self.CTerminus,
                          data=data,
                          position=position)

    # Parameters
    self.transit_message("Setting Parameters")
    Ngenes = len(G)
    numReps = len(self.ctrldata)

    theta = numpy.zeros((Ngenes, sample_size))
    theta[:, 0] = 0.10

    rho0 = numpy.zeros(sample_size)
    rho0[0] = 0.5
    Kp0 = numpy.zeros(sample_size)
    Kp0[0] = 10

    rho1 = numpy.zeros(sample_size)
    rho1[0] = 0.10
    Kp1 = numpy.zeros(sample_size)
    Kp1[0] = 3

    Z = numpy.zeros((Ngenes, sample_size))
    pz1 = numpy.zeros(sample_size)

    w1 = scipy.stats.beta.rvs(self.alpha_w, self.beta_w)
    W1 = numpy.zeros(sample_size)
    W1[0] = w1

    self.transit_message("Setting Initial Values")
    # K[g]: number of positive read-count entries for gene g;
    # N[g]: total number of read-count entries for gene g.
    K = numpy.array([int(numpy.sum(gene.reads > 0)) for gene in G])
    N = numpy.array([gene.reads.size for gene in G])

    for g in range(Ngenes):
        if N[g] == 0:
            theta[g][0] = 0.5
        else:
            density = K[g] / float(N[g])
            if density == 0 or density == 1:
                # Both degenerate densities start at the same small value.
                theta[g][0] = 0.001
            else:
                theta[g][0] = density
        Z[g][0] = scipy.stats.bernoulli.rvs(1 - theta[g][0])

    acc_p0 = 0
    acc_k0 = 0
    acc_p1 = 0
    acc_k1 = 0

    # Standard deviations of the random-walk proposals.
    rho0c_std = 0.010
    kp0c_std = 1.40
    rho1c_std = 0.009
    kp1c_std = 1.1

    # log(0) is expected for rejected proposals; silence the warning only
    # for the duration of the sampler and restore the previous setting
    # afterwards, even on error.
    with numpy.errstate(divide='ignore'):
        for i in range(1, sample_size):
            i0 = Z[:, i - 1] == 0
            i1 = Z[:, i - 1] == 1

            theta[i0, i] = scipy.stats.beta.rvs(
                Kp0[i - 1] * rho0[i - 1] + K[i0],
                Kp0[i - 1] * (1 - rho0[i - 1]) + N[i0] - K[i0])
            theta[i1, i] = scipy.stats.beta.rvs(
                Kp1[i - 1] * rho1[i - 1] + K[i1],
                Kp1[i - 1] * (1 - rho1[i - 1]) + N[i1] - K[i1])

            rho0_c = rho0[i - 1] + scipy.stats.norm.rvs(0, rho0c_std)
            Kp0_c = Kp0[i - 1] + scipy.stats.norm.rvs(0, kp0c_std)

            if rho0_c <= 0:
                rho0[i] = rho0[i - 1]
            else:
                fc = log_beta_sum(rho0_c, self.M0, self.pi0)
                f0 = log_beta_sum(rho0[i - 1], self.M0, self.pi0)
                fc += log_beta_sum(theta[i0, i], Kp0[i - 1], rho0_c)
                f0 += log_beta_sum(theta[i0, i], Kp0[i - 1], rho0[i - 1])
                if numpy.log(scipy.stats.uniform.rvs()) < fc - f0:
                    rho0[i] = rho0_c
                    acc_p0 += 1
                else:
                    rho0[i] = rho0[i - 1]

            if Kp0_c <= 0:
                Kp0[i] = Kp0[i - 1]
            else:
                # NOTE(review): b0 is passed positionally, which
                # scipy.stats.gamma.pdf interprets as `loc` - confirm
                # that this is the intended parameterization.
                fc = numpy.log(
                    scipy.stats.gamma.pdf(Kp0_c, self.a0, self.b0))
                f0 = numpy.log(
                    scipy.stats.gamma.pdf(Kp0[i - 1], self.a0, self.b0))
                fc += log_beta_sum(theta[i0, i], Kp0_c, rho0[i])
                f0 += log_beta_sum(theta[i0, i], Kp0[i - 1], rho0[i])
                if numpy.log(scipy.stats.uniform.rvs()) < fc - f0:
                    Kp0[i] = Kp0_c
                    acc_k0 += 1
                else:
                    Kp0[i] = Kp0[i - 1]

            rho1_c = rho1[i - 1] + scipy.stats.norm.rvs(0, rho1c_std)
            Kp1_c = Kp1[i - 1] + scipy.stats.norm.rvs(0, kp1c_std)

            if rho1_c <= 0:
                rho1[i] = rho1[i - 1]
            else:
                fc = log_beta_sum(rho1_c, self.M1, self.pi1)
                f1 = log_beta_sum(rho1[i - 1], self.M1, self.pi1)
                fc += log_beta_sum(theta[i1, i], Kp1[i - 1], rho1_c)
                f1 += log_beta_sum(theta[i1, i], Kp1[i - 1], rho1[i - 1])
                if numpy.log(scipy.stats.uniform.rvs()) < fc - f1:
                    rho1[i] = rho1_c
                    acc_p1 += 1
                else:
                    rho1[i] = rho1[i - 1]

            if Kp1_c <= 0:
                Kp1[i] = Kp1[i - 1]
            else:
                # NOTE(review): same positional `loc` caveat as for b0.
                fc = numpy.log(
                    scipy.stats.gamma.pdf(Kp1_c, self.a1, self.b1))
                f1 = numpy.log(
                    scipy.stats.gamma.pdf(Kp1[i - 1], self.a1, self.b1))
                fc += log_beta_sum(theta[i1, i], Kp1_c, rho1[i])
                f1 += log_beta_sum(theta[i1, i], Kp1[i - 1], rho1[i])
                if numpy.log(scipy.stats.uniform.rvs()) < fc - f1:
                    Kp1[i] = Kp1_c
                    acc_k1 += 1
                else:
                    Kp1[i] = Kp1[i - 1]

            g0 = scipy.stats.beta.pdf(theta[:, i], Kp0[i] * rho0[i],
                                      Kp0[i] * (1 - rho0[i])) * (1 - w1)
            g1 = scipy.stats.beta.pdf(theta[:, i], Kp1[i] * rho1[i],
                                      Kp1[i] * (1 - rho1[i])) * (w1)
            p1 = g1 / (g0 + g1)
            p1 = numpy.nan_to_num(p1)

            try:
                Z[:, i] = scipy.stats.bernoulli.rvs(p1)
            except ValueError:
                # Invalid probabilities: dump the offending entries and
                # abort.  file.write() takes a single string, so each
                # diagnostic line is formatted before writing.
                inan = numpy.isnan(p1)
                sys.stderr.write("K=\t%s\n" % (K[inan],))
                sys.stderr.write("N=\t%s\n" % (N[inan],))
                sys.stderr.write("theta=%s\n" % (theta[inan, i],))
                sys.exit(1)
            pz1[i] = p1[0]

            i1 = Z[:, i] == 1
            n1 = numpy.sum(i1)
            w1 = scipy.stats.beta.rvs(self.alpha_w + n1,
                                      self.beta_w + Ngenes - n1)
            W1[i] = w1

            # Update progress
            text = "Running Binomial Method... %5.1f%%" % (
                100.0 * (i + 1) / (sample_size))
            self.progress_update(text, i)

    z_bar = numpy.apply_along_axis(numpy.mean, 1, Z[:, self.burnin:])
    theta_bar = numpy.apply_along_axis(numpy.mean, 1,
                                       theta[:, self.burnin:])
    (ess_threshold,
     noness_threshold) = stat_tools.bayesian_ess_thresholds(z_bar)

    self.output.write("#Binomial\n")
    if self.wxobj:
        # Paths are formatted as text; encoding them to bytes first would
        # put a b'...' repr into the header on Python 3.
        self.output.write(
            "#GUI with: ctrldata=%s, annotation=%s, output=%s, samples=%s, burnin=%s\n"
            % (",".join(self.ctrldata), self.annotation_path,
               self.output.name, self.samples, self.burnin))
    else:
        self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

    self.output.write("#Thresholds: (%1.5f, %1.5f)\n" %
                      (ess_threshold, noness_threshold))
    self.output.write("#rho0 Acceptance Rate:\t%f%%\n" %
                      ((100.0 * acc_p0) / sample_size))
    self.output.write("#Kp0 Acceptance Rate:\t%f%%\n" %
                      ((100.0 * acc_k0) / sample_size))
    self.output.write("#rho1 Acceptance Rate:\t%f%%\n" %
                      ((100.0 * acc_p1) / sample_size))
    self.output.write("#Kp1 Acceptance Rate:\t%f%%\n" %
                      ((100.0 * acc_k1) / sample_size))
    self.output.write(
        "#Hyperparameters rho: \t%1.2f\t%3.1f\t%1.2f\t%3.1f\n" %
        (self.pi0, self.M0, self.pi1, self.M1))
    self.output.write(
        "#Hyperparameters Kp: \t%3.1f\t%3.1f\t%3.1f\t%3.1f\n" %
        (self.a0, self.b0, self.a1, self.b1))
    self.output.write("#Hyperparameters W: \t%1.3f\t%1.3f\n" %
                      (self.alpha_w, self.beta_w))
    self.output.write("#%s\n" % "\t".join(columns))

    data = []
    for g, gene in enumerate(G):
        # The non-essential test is applied last, so it wins if both
        # threshold comparisons hold.
        c = "Uncertain"
        if z_bar[g] > ess_threshold:
            c = "Essential"
        if z_bar[g] < noness_threshold:
            c = "Non-Essential"
        data.append(
            "%s\t%s\t%s\t%1.1f\t%d\t%d\t%d\t%f\t%f\t%s" %
            (gene.orf, gene.name, gene.desc, K[g] / float(numReps),
             N[g] / numReps, K[g], N[g], theta_bar[g], z_bar[g], c))

    data.sort()
    for row in data:
        self.output.write("%s\n" % row)
    self.output.close()

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Binomial")
    self.finish()
    self.transit_message("Finished Binomial Method")
# Library-assignment strings for the control / experimental replicates;
# empty strings when library handling is switched off.
if DO_LIB:
    ctrl_lib_str = "ABAB"
    exp_lib_str = "AAABBB"
else:
    ctrl_lib_str = ""
    exp_lib_str = ""

Kctrl = len(ctrldata)
Kexp = len(expdata)

# Load control and experimental datasets together, then TTR-normalize them.
(data, position) = transit_tools.get_validated_data(ctrldata + expdata)
(K, N) = data.shape
(data, factors) = norm_tools.normalize_data(data, "TTR", ctrldata + expdata,
                                            annotation)
G = tnseq_tools.Genes(ctrldata + expdata,
                      annotation,
                      data=data,
                      position=position)

gene = G[i]

# Parenthesised single-argument print calls behave identically on
# Python 2 and Python 3.
print("\n\n")
print("#" * 100)
print("# (%s) NEW TEST: %s" % (DO_LIB, gene))
print("#" * 100)
print("")

# Boolean mask with one True entry per gene.n.
ii = numpy.ones(gene.n) == 1
def Run(self):
    """Run the Genetic Interactions (GI) analysis and write the results.

    Loads the four groups of datasets (``ctrldataA``, ``expdataA``,
    ``ctrldataB``, ``expdataB``), optionally normalizes and
    LOESS-corrects them, samples posterior means per gene with
    ``stat_tools.sample_trunc_norm_post``, derives the log fold changes and
    their difference (delta-logFC), and writes one tab-separated row per
    gene to ``self.output``.  Genes are flagged either by an adjusted
    probability of delta-logFC lying inside the ROPE (``self.doBFDR`` /
    ``self.doFWER``) or by the HDI not overlapping the ROPE.
    """

    def genes_for(rows):
        # Gene container built from a slice of the combined data matrix.
        return tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=rows,
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)

    self.transit_message("Starting Genetic Interactions Method")
    start_time = time.time()
    self.output.write("#GI\n")

    wiglist = (self.ctrldataA + self.expdataA + self.ctrldataB +
               self.expdataB)
    Na1 = len(self.ctrldataA)
    Nb1 = len(self.expdataA)
    Na2 = len(self.ctrldataB)

    # Get data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(wiglist,
                                                        wxobj=self.wxobj)

    # Normalize data if specified
    if self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data,
                                                    self.normalization,
                                                    wiglist,
                                                    self.annotation_path)

    # Do LOESS correction if specified (one pass per dataset row).
    if self.LOESS:
        self.transit_message("Performing LOESS Correction")
        for j in range(len(data)):
            data[j] = stat_tools.loess_correction(position, data[j])

    # Get Gene objects for each condition; slices follow wiglist order.
    G_A1 = genes_for(data[:Na1])
    G_B1 = genes_for(data[Na1:(Na1 + Nb1)])
    G_A2 = genes_for(data[(Na1 + Nb1):(Na1 + Nb1 + Na2)])
    G_B2 = genes_for(data[(Na1 + Nb1 + Na2):])

    means_list_a1 = []
    means_list_b1 = []
    means_list_a2 = []
    means_list_b2 = []

    var_list_a1 = []
    var_list_a2 = []
    var_list_b1 = []
    var_list_b2 = []

    # Base priors on empirical observations across genes.
    for gene in sorted(G_A1):
        if gene.n > 1:
            A1_data = G_A1[gene.orf].reads.flatten()
            B1_data = G_B1[gene.orf].reads.flatten()
            A2_data = G_A2[gene.orf].reads.flatten()
            B2_data = G_B2[gene.orf].reads.flatten()

            means_list_a1.append(numpy.mean(A1_data))
            var_list_a1.append(numpy.var(A1_data))

            means_list_b1.append(numpy.mean(B1_data))
            var_list_b1.append(numpy.var(B1_data))

            means_list_a2.append(numpy.mean(A2_data))
            var_list_a2.append(numpy.var(A2_data))

            means_list_b2.append(numpy.mean(B2_data))
            var_list_b2.append(numpy.var(B2_data))

    # Priors
    mu0_A1 = scipy.stats.trim_mean(means_list_a1, 0.01)
    mu0_B1 = scipy.stats.trim_mean(means_list_b1, 0.01)
    mu0_A2 = scipy.stats.trim_mean(means_list_a2, 0.01)
    mu0_B2 = scipy.stats.trim_mean(means_list_b2, 0.01)

    s20_A1 = scipy.stats.trim_mean(var_list_a1, 0.01)
    s20_B1 = scipy.stats.trim_mean(var_list_b1, 0.01)
    s20_A2 = scipy.stats.trim_mean(var_list_a2, 0.01)
    s20_B2 = scipy.stats.trim_mean(var_list_b2, 0.01)

    k0 = 1.0
    nu0 = 1.0
    alpha = 0.05  # HDI covers 1 - alpha of the posterior samples.
    data = []
    postprob = []

    count = 0
    N = len(G_A1)
    self.progress_range(N)
    # Perform actual analysis
    for gene in G_A1:

        # If there is some data
        if gene.n > 0:
            A1_data = G_A1[gene.orf].reads.flatten()
            B1_data = G_B1[gene.orf].reads.flatten()
            A2_data = G_A2[gene.orf].reads.flatten()
            B2_data = G_B2[gene.orf].reads.flatten()

            #            Time-1   Time-2
            #
            #  Strain-A     A       C
            #
            #  Strain-B     B       D

            try:
                muA1_post, varA1_post = stat_tools.sample_trunc_norm_post(
                    A1_data, self.samples, mu0_A1, s20_A1, k0, nu0)
                muB1_post, varB1_post = stat_tools.sample_trunc_norm_post(
                    B1_data, self.samples, mu0_B1, s20_B1, k0, nu0)
                muA2_post, varA2_post = stat_tools.sample_trunc_norm_post(
                    A2_data, self.samples, mu0_A2, s20_A2, k0, nu0)
                muB2_post, varB2_post = stat_tools.sample_trunc_norm_post(
                    B2_data, self.samples, mu0_B2, s20_B2, k0, nu0)
            except Exception:
                # Best-effort fallback: if sampling fails for this gene,
                # use flat posteriors (all ones -> logFC of zero).
                muA1_post = varA1_post = numpy.ones(self.samples)
                muB1_post = varB1_post = numpy.ones(self.samples)
                muA2_post = varA2_post = numpy.ones(self.samples)
                muB2_post = varB2_post = numpy.ones(self.samples)

            logFC_A_post = numpy.log2(muA2_post / muA1_post)
            logFC_B_post = numpy.log2(muB2_post / muB1_post)
            delta_logFC_post = logFC_B_post - logFC_A_post

            # Get Bounds of the HDI
            l_logFC_A, u_logFC_A = stat_tools.HDI_from_MCMC(
                logFC_A_post, 1 - alpha)
            l_logFC_B, u_logFC_B = stat_tools.HDI_from_MCMC(
                logFC_B_post, 1 - alpha)
            l_delta_logFC, u_delta_logFC = stat_tools.HDI_from_MCMC(
                delta_logFC_post, 1 - alpha)

            mean_logFC_A = numpy.mean(logFC_A_post)
            mean_logFC_B = numpy.mean(logFC_B_post)
            mean_delta_logFC = numpy.mean(delta_logFC_post)

            # Is HDI significantly different than ROPE?
            not_HDI_overlap_bit = (l_delta_logFC > self.rope or
                                   u_delta_logFC < -self.rope)

            # Probability of posterior overlapping with ROPE
            probROPE = numpy.mean(
                numpy.logical_and(delta_logFC_post >= 0.0 - self.rope,
                                  delta_logFC_post <= 0.0 + self.rope))

        # If there is no data, assume empty defaults
        else:
            muA1_post = varA1_post = numpy.ones(self.samples)
            muB1_post = varB1_post = numpy.ones(self.samples)
            muA2_post = varA2_post = numpy.ones(self.samples)
            muB2_post = varB2_post = numpy.ones(self.samples)
            logFC_A_post = numpy.log2(muA2_post / muA1_post)
            logFC_B_post = numpy.log2(muB2_post / muB1_post)
            delta_logFC_post = logFC_B_post - logFC_A_post

            mean_logFC_A = 0
            mean_logFC_B = 0
            mean_delta_logFC = 0
            l_logFC_A = 0
            u_logFC_A = 0
            l_logFC_B = 0
            u_logFC_B = 0
            l_delta_logFC = 0
            u_delta_logFC = 0
            probROPE = 1.0
            # A zero-width HDI at 0 always lies inside the ROPE; set the
            # flag explicitly so it is never undefined or carried over
            # from the previous gene.
            not_HDI_overlap_bit = False

        if numpy.isnan(l_logFC_A):
            l_logFC_A = -10
            u_logFC_A = 10
        if numpy.isnan(l_logFC_B):
            l_logFC_B = -10
            u_logFC_B = 10
        if numpy.isnan(l_delta_logFC):
            l_delta_logFC = -10
            u_delta_logFC = 10

        postprob.append(probROPE)
        data.append((gene.orf, gene.name, gene.n, numpy.mean(muA1_post),
                     numpy.mean(muA2_post), numpy.mean(muB1_post),
                     numpy.mean(muB2_post), mean_logFC_A, mean_logFC_B,
                     mean_delta_logFC, l_delta_logFC, u_delta_logFC,
                     probROPE, not_HDI_overlap_bit))

        text = "Running GI Method... %2.0f%%" % (100.0 * (count + 1) / N)
        self.progress_update(text, count)
        # max(..., 1) avoids a ZeroDivisionError when there is one gene.
        self.transit_message_inplace("Running Export Method... %1.1f%%" %
                                     (100.0 * count / max(N - 1, 1)))
        count += 1

    # Sort rows by probROPE so they line up with the sorted probabilities.
    data.sort(key=lambda x: x[-2])

    if self.doBFDR or not self.doFWER:
        postprob = numpy.array(postprob)
        postprob.sort()
        bfdr = numpy.cumsum(postprob) / numpy.arange(1, len(postprob) + 1)
        adjusted_prob = bfdr
        adjusted_label = "BFDR"
    else:
        # Reached only when doFWER is set and doBFDR is not.
        # NOTE(review): FWER_Bayes is assumed to be defined at module
        # level elsewhere in this file - confirm.
        fwer = FWER_Bayes(postprob)
        fwer.sort()
        adjusted_prob = fwer
        adjusted_label = "FWER"

    # If not using adjustment for classification, sort correctly
    if not self.doBFDR and not self.doFWER:
        sorted_index = numpy.argsort([d[-1]
                                      for d in data])[::-1][:len(data)]
        adjusted_prob = [adjusted_prob[ii] for ii in sorted_index]
        data = [data[ii] for ii in sorted_index]

    # Print output
    if self.wxobj:
        self.output.write(
            "#GUI with: norm=%s, samples=%s, includeZeros=%s, output=%s\n" %
            (self.normalization, self.samples, self.includeZeros,
             self.output.name.encode('utf-8')))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))
    self.output.write("#Control Data-A: %s\n" %
                      (",".join(self.ctrldataA).encode('utf-8')))
    self.output.write("#Control Data-B: %s\n" %
                      (",".join(self.ctrldataB).encode('utf-8')))
    self.output.write("#Experimental Data-A: %s\n" %
                      (",".join(self.expdataA).encode('utf-8')))
    self.output.write("#Experimental Data-B: %s\n" %
                      (",".join(self.expdataB).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" %
                      (self.annotation_path.encode('utf-8')))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#%s\n" % "\t".join(columns))

    if self.doBFDR or self.doFWER:
        self.output.write(
            "# Significant interactions are those whose adjusted probability of the delta-logFC falling within ROPE is < 0.05 (Adjusted using %s)\n"
            % (adjusted_label))
    else:
        self.output.write(
            "# Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n"
        )
    self.output.write("#\n")

    # Write column names
    self.output.write(
        "#ORF\tName\tNumber of TA Sites\tMean count (Strain A Time 1)\tMean count (Strain A Time 2)\tMean count (Strain B Time 1)\tMean count (Strain B Time 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tProb. of delta-logFC being within ROPE\tAdjusted Probability (%s)\tIs HDI outside ROPE?\tType of Interaction\n"
        % adjusted_label)

    # Write gene results
    use_adjusted = self.doBFDR or self.doFWER
    for i, row in enumerate(data):
        (orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post,
         mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC,
         l_delta_logFC, u_delta_logFC, probROPE,
         not_HDI_overlap_bit) = row

        if use_adjusted:
            significant = adjusted_prob[i] < 0.05
        else:
            significant = not_HDI_overlap_bit

        type_of_interaction = "No Interaction"
        if significant:
            type_of_interaction = self.classify_interaction(
                mean_delta_logFC, mean_logFC_B, mean_logFC_A)

        new_row = tuple(
            list(row[:-1]) +
            [adjusted_prob[i], not_HDI_overlap_bit, type_of_interaction])
        self.output.write(
            "%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.8f\t%1.8f\t%s\t%s\n"
            % new_row)

    # Close (and thereby flush) the results before registering the file.
    self.output.close()

    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="GI")
    self.finish()
    self.transit_message("Finished Genetic Interactions Method")
def Run(self):
    """Run the resampling comparison and write the results.

    Loads the control (``self.ctrldata``) and experimental
    (``self.expdata``) datasets, optionally normalizes and
    LOESS-corrects them, runs ``stat_tools.resampling`` per gene on the
    pseudocount-shifted read counts, applies
    ``stat_tools.BH_fdr_correction`` to the two-tailed p-values and writes
    one tab-separated row per gene to ``self.output``, which is closed
    before returning.  When ``self.doHistogram`` is set, a PNG histogram
    per gene is saved in a ``<output name>_histograms`` directory next to
    the output file.
    """
    plt = None
    if self.doHistogram:
        # matplotlib is only needed for histograms; if it cannot be
        # loaded, carry on without them instead of failing the run.
        try:
            import matplotlib.pyplot as plt
        except Exception:
            print("Error: cannot do histograms")
            self.doHistogram = False

    self.transit_message("Starting resampling Method")
    start_time = time.time()

    if self.doHistogram:
        histPath = os.path.join(
            os.path.dirname(self.output.name),
            transit_tools.fetch_name(self.output.name) + "_histograms")
        if not os.path.isdir(histPath):
            os.makedirs(histPath)
    else:
        histPath = ""

    Kctrl = len(self.ctrldata)

    # Get orf data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(self.ctrldata +
                                                        self.expdata,
                                                        wxobj=self.wxobj)
    (K, N) = data.shape

    if self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(
            data, self.normalization, self.ctrldata + self.expdata,
            self.annotation_path)

    if self.LOESS:
        self.transit_message("Performing LOESS Correction")
        for j in range(K):
            data[j] = stat_tools.loess_correction(position, data[j])

    G = tnseq_tools.Genes(self.ctrldata + self.expdata,
                          self.annotation_path,
                          ignoreCodon=self.ignoreCodon,
                          nterm=self.NTerminus,
                          cterm=self.CTerminus,
                          data=data,
                          position=position)

    # Resampling
    data = []
    N = len(G)
    count = 0
    self.progress_range(N)
    for gene in G:
        count += 1
        if gene.k == 0 or gene.n == 0:
            # Nothing to compare: neutral statistics, p-values of 1.
            (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,
             pval_2tail, testlist, data1,
             data2) = (0, 0, 0, 0, 1.00, 1.00, 1.00, [], [0], [0])
        else:
            if not self.includeZeros:
                # Keep only columns with a positive summed count.
                ii = numpy.sum(gene.reads, 0) > 0
            else:
                ii = numpy.ones(gene.n) == 1

            # The first Kctrl rows are control, the rest experimental.
            data1 = gene.reads[:Kctrl, ii].flatten() + self.pseudocount
            data2 = gene.reads[Kctrl:, ii].flatten() + self.pseudocount

            (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,
             pval_2tail, testlist) = stat_tools.resampling(
                 data1,
                 data2,
                 S=self.samples,
                 testFunc=stat_tools.F_mean_diff_flat,
                 adaptive=self.adaptive)

        if self.doHistogram:
            # len() works for lists and arrays alike.
            if len(testlist) > 0:
                n, bins, patches = plt.hist(testlist,
                                            density=1,
                                            facecolor='c',
                                            alpha=0.75,
                                            bins=100)
            else:
                n, bins, patches = plt.hist([0, 0],
                                            density=1,
                                            facecolor='c',
                                            alpha=0.75,
                                            bins=100)
            plt.xlabel('Delta Mean')
            plt.ylabel('Probability')
            plt.title('%s - Histogram of Delta Mean' % gene.orf)
            plt.axvline(test_obs,
                        color='r',
                        linestyle='dashed',
                        linewidth=3)
            plt.grid(True)
            genePath = os.path.join(histPath, gene.orf + ".png")
            if not os.path.exists(histPath):
                os.makedirs(histPath)
            plt.savefig(genePath)
            plt.clf()

        sum1 = numpy.sum(data1)
        sum2 = numpy.sum(data2)
        data.append([
            gene.orf, gene.name, gene.desc, gene.n, mean1, mean2, sum1,
            sum2, test_obs, log2FC, pval_2tail
        ])

        # Update progress
        text = "Running Resampling Method... %5.1f%%" % (100.0 * count / N)
        self.progress_update(text, count)

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Performing Benjamini-Hochberg Correction")
    data.sort()
    qval = stat_tools.BH_fdr_correction([row[-1] for row in data])

    self.output.write("#Resampling\n")
    if self.wxobj:
        self.output.write(
            "#GUI with: norm=%s, samples=%s, pseudocounts=%1.2f, adaptive=%s, histogram=%s, includeZeros=%s, output=%s\n"
            % (self.normalization, self.samples, self.pseudocount,
               self.adaptive, self.doHistogram, self.includeZeros,
               self.output.name.encode('utf-8')))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))
    self.output.write("#Control Data: %s\n" %
                      (",".join(self.ctrldata).encode('utf-8')))
    self.output.write("#Experimental Data: %s\n" %
                      (",".join(self.expdata).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" %
                      (self.annotation_path.encode('utf-8')))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#%s\n" % "\t".join(columns))

    for i, row in enumerate(data):
        (orf, name, desc, n, mean1, mean2, sum1, sum2, test_obs, log2FC,
         pval_2tail) = row
        self.output.write(
            "%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.1f\t%1.2f\t%1.1f\t%1.5f\t%1.5f\n"
            % (orf, name, desc, n, mean1, mean2, log2FC, sum1, sum2,
               test_obs, pval_2tail, qval[i]))
    self.output.close()

    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Resampling")
    self.finish()
    self.transit_message("Finished resampling Method")
def Run(self):
    """Run the Genetic Interactions (GI) analysis and write the results.

    Loads the four groups of datasets (``ctrldataA``, ``ctrldataB``,
    ``expdataA``, ``expdataB``), optionally normalizes and
    LOESS-corrects them, samples posterior means per gene with
    ``stat_tools.sample_trunc_norm_post``, derives the log fold changes and
    their difference (delta-logFC), and writes one tab-separated row per
    gene to ``self.output``.  ``self.signif`` selects how interactions are
    called: ``"HDI"`` (HDI does not overlap the ROPE) or ``"prob"`` /
    ``"BFDR"`` / ``"FWER"`` (probability of lying within the ROPE,
    unadjusted or adjusted, below 0.05).
    """

    def genes_for(rows):
        # Gene container built from a slice of the combined data matrix.
        return tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=rows,
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)

    # Exact membership; a substring test against "prob BFDR FWER" would
    # also accept fragments and is easy to break with a typo.
    prob_methods = ("prob", "BFDR", "FWER")

    self.transit_message("Starting Genetic Interactions Method")
    self.output.write("#GI\n")

    wiglist = (self.ctrldataA + self.ctrldataB + self.expdataA +
               self.expdataB)
    Na1 = len(self.ctrldataA)
    Nb1 = len(self.ctrldataB)
    Na2 = len(self.expdataA)

    # Get data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(wiglist,
                                                        wxobj=self.wxobj)

    # Normalize data if specified
    if self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data,
                                                    self.normalization,
                                                    wiglist,
                                                    self.annotation_path)

    # Do LOESS correction if specified (one pass per dataset row).
    if self.LOESS:
        self.transit_message("Performing LOESS Correction")
        for j in range(len(data)):
            data[j] = stat_tools.loess_correction(position, data[j])

    # Get Gene objects for each condition; slices follow wiglist order.
    G_A1 = genes_for(data[:Na1])
    G_B1 = genes_for(data[Na1:(Na1 + Nb1)])
    G_A2 = genes_for(data[(Na1 + Nb1):(Na1 + Nb1 + Na2)])
    G_B2 = genes_for(data[(Na1 + Nb1 + Na2):])

    means_list_a1 = []
    means_list_b1 = []
    means_list_a2 = []
    means_list_b2 = []

    var_list_a1 = []
    var_list_a2 = []
    var_list_b1 = []
    var_list_b2 = []

    # Base priors on empirical observations across genes.
    for gene in sorted(G_A1):
        if gene.n > 1:
            A1_data = G_A1[gene.orf].reads.flatten()
            B1_data = G_B1[gene.orf].reads.flatten()
            A2_data = G_A2[gene.orf].reads.flatten()
            B2_data = G_B2[gene.orf].reads.flatten()

            means_list_a1.append(numpy.mean(A1_data))
            var_list_a1.append(numpy.var(A1_data))

            means_list_b1.append(numpy.mean(B1_data))
            var_list_b1.append(numpy.var(B1_data))

            means_list_a2.append(numpy.mean(A2_data))
            var_list_a2.append(numpy.var(A2_data))

            means_list_b2.append(numpy.mean(B2_data))
            var_list_b2.append(numpy.var(B2_data))

    # Priors
    mu0_A1 = scipy.stats.trim_mean(means_list_a1, 0.01)
    mu0_B1 = scipy.stats.trim_mean(means_list_b1, 0.01)
    mu0_A2 = scipy.stats.trim_mean(means_list_a2, 0.01)
    mu0_B2 = scipy.stats.trim_mean(means_list_b2, 0.01)

    s20_A1 = scipy.stats.trim_mean(var_list_a1, 0.01)
    s20_B1 = scipy.stats.trim_mean(var_list_b1, 0.01)
    s20_A2 = scipy.stats.trim_mean(var_list_a2, 0.01)
    s20_B2 = scipy.stats.trim_mean(var_list_b2, 0.01)

    k0 = 1.0
    nu0 = 1.0
    alpha = 0.05  # HDI covers 1 - alpha of the posterior samples.
    data = []

    count = 0
    N = len(G_A1)
    self.progress_range(N)
    # Perform actual analysis
    for gene in G_A1:

        # If there is some data
        if gene.n > 0:
            A1_data = G_A1[gene.orf].reads.flatten()
            B1_data = G_B1[gene.orf].reads.flatten()
            A2_data = G_A2[gene.orf].reads.flatten()
            B2_data = G_B2[gene.orf].reads.flatten()

            #            Time-1   Time-2
            #
            #  Strain-A     A       C
            #
            #  Strain-B     B       D

            try:
                muA1_post, varA1_post = stat_tools.sample_trunc_norm_post(
                    A1_data, self.samples, mu0_A1, s20_A1, k0, nu0)
                muB1_post, varB1_post = stat_tools.sample_trunc_norm_post(
                    B1_data, self.samples, mu0_B1, s20_B1, k0, nu0)
                muA2_post, varA2_post = stat_tools.sample_trunc_norm_post(
                    A2_data, self.samples, mu0_A2, s20_A2, k0, nu0)
                muB2_post, varB2_post = stat_tools.sample_trunc_norm_post(
                    B2_data, self.samples, mu0_B2, s20_B2, k0, nu0)
            except Exception:
                # Best-effort fallback: if sampling fails for this gene,
                # use flat posteriors (all ones -> logFC of zero).
                muA1_post = varA1_post = numpy.ones(self.samples)
                muB1_post = varB1_post = numpy.ones(self.samples)
                muA2_post = varA2_post = numpy.ones(self.samples)
                muB2_post = varB2_post = numpy.ones(self.samples)

            logFC_A_post = numpy.log2(muA2_post / muA1_post)
            logFC_B_post = numpy.log2(muB2_post / muB1_post)
            delta_logFC_post = logFC_B_post - logFC_A_post

            # Get Bounds of the HDI
            l_logFC_A, u_logFC_A = stat_tools.HDI_from_MCMC(
                logFC_A_post, 1 - alpha)
            l_logFC_B, u_logFC_B = stat_tools.HDI_from_MCMC(
                logFC_B_post, 1 - alpha)
            l_delta_logFC, u_delta_logFC = stat_tools.HDI_from_MCMC(
                delta_logFC_post, 1 - alpha)

            mean_logFC_A = numpy.mean(logFC_A_post)
            mean_logFC_B = numpy.mean(logFC_B_post)
            mean_delta_logFC = numpy.mean(delta_logFC_post)

            # Is HDI significantly different than ROPE? (i.e. no overlap)
            not_HDI_overlap_bit = (l_delta_logFC > self.rope or
                                   u_delta_logFC < -self.rope)

            # Probability of posterior overlapping with ROPE
            probROPE = numpy.mean(
                numpy.logical_and(delta_logFC_post >= 0.0 - self.rope,
                                  delta_logFC_post <= 0.0 + self.rope))

        # If there is no data, assume empty defaults
        else:
            muA1_post = varA1_post = numpy.ones(self.samples)
            muB1_post = varB1_post = numpy.ones(self.samples)
            muA2_post = varA2_post = numpy.ones(self.samples)
            muB2_post = varB2_post = numpy.ones(self.samples)
            logFC_A_post = numpy.log2(muA2_post / muA1_post)
            logFC_B_post = numpy.log2(muB2_post / muB1_post)
            delta_logFC_post = logFC_B_post - logFC_A_post

            mean_logFC_A = 0
            mean_logFC_B = 0
            mean_delta_logFC = 0
            l_logFC_A = 0
            u_logFC_A = 0
            l_logFC_B = 0
            u_logFC_B = 0
            l_delta_logFC = 0
            u_delta_logFC = 0
            probROPE = 1.0
            # A zero-width HDI at 0 always lies inside the ROPE; set the
            # flag explicitly so it is never undefined or carried over
            # from the previous gene.
            not_HDI_overlap_bit = False

        if numpy.isnan(l_logFC_A):
            l_logFC_A = -10
            u_logFC_A = 10
        if numpy.isnan(l_logFC_B):
            l_logFC_B = -10
            u_logFC_B = 10
        if numpy.isnan(l_delta_logFC):
            l_delta_logFC = -10
            u_delta_logFC = 10

        data.append((gene.orf, gene.name, gene.n, numpy.mean(muA1_post),
                     numpy.mean(muA2_post), numpy.mean(muB1_post),
                     numpy.mean(muB2_post), mean_logFC_A, mean_logFC_B,
                     mean_delta_logFC, l_delta_logFC, u_delta_logFC,
                     probROPE, not_HDI_overlap_bit))

        text = "Running GI Method... %2.0f%%" % (100.0 * (count + 1) / N)
        self.progress_update(text, count)
        # max(..., 1) avoids a ZeroDivisionError when there is one gene.
        self.transit_message_inplace("Running Export Method... %1.1f%%" %
                                     (100.0 * count / max(N - 1, 1)))
        count += 1

    # Rows must be sorted by probROPE for the cumulative BFDR below.
    probcol = -2  # probROPEs
    data.sort(key=lambda x: x[probcol])
    sortedprobs = numpy.array([x[probcol] for x in data])

    # BFDR method: Newton M.A., Noueiry A., Sarkar D., Ahlquist P. (2004).
    # Detecting differential gene expression with a semiparametric
    # hierarchical mixture method. Biostatistics, 5:155-176.
    if self.signif == "BFDR":
        # Running mean of the sorted probabilities, same order as `data`.
        bfdr = numpy.cumsum(sortedprobs) / numpy.arange(
            1,
            len(sortedprobs) + 1)
        adjusted_prob = bfdr
        adjusted_label = "BFDR"
    elif self.signif == "FWER":
        adjusted_prob = stat_tools.FWER_Bayes(sortedprobs)
        adjusted_label = "FWER"
    else:
        # No adjustment requested: report the raw probabilities.
        adjusted_prob = sortedprobs
        adjusted_label = "un"

    # Print output
    if self.wxobj:
        # Paths are formatted as text; encoding them to bytes first would
        # put a b'...' repr into the header on Python 3.
        self.output.write(
            "#GUI with: norm=%s, samples=%s, includeZeros=%s, output=%s\n" %
            (self.normalization, self.samples, self.includeZeros,
             self.output.name))
    else:
        self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

    # Format explicitly: truncating str(now) at the last '.' chops a
    # digit whenever the microsecond field happens to be zero.
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    self.output.write("#Date: " + now + "\n")
    self.output.write("#Control Data-A: %s\n" % (",".join(self.ctrldataA)))
    self.output.write("#Control Data-B: %s\n" % (",".join(self.ctrldataB)))
    self.output.write("#Experimental Data-A: %s\n" %
                      (",".join(self.expdataA)))
    self.output.write("#Experimental Data-B: %s\n" %
                      (",".join(self.expdataB)))
    self.output.write("#Annotation path: %s\n" % (self.annotation_path))
    self.output.write("#ROPE=%s, method for significance=%s\n" %
                      (self.rope, self.signif))

    if self.signif == "HDI":
        self.output.write(
            "#Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n"
        )
    elif self.signif in prob_methods:
        self.output.write(
            "#Significant interactions are those whose %s-adjusted probability of the delta-logFC falling within ROPE is < 0.05.\n"
            % (adjusted_label))

    # Write column names (redundant with self.columns)
    self.output.write(
        "#ORF\tName\tNumber of TA Sites\tMean count (Strain A Condition 1)\tMean count (Strain A Condition 2)\tMean count (Strain B Condition 1)\tMean count (Strain B Condition 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tIs HDI outside ROPE?\tProb. of delta-logFC being within ROPE\t%s-Adjusted Probability\tType of Interaction\n"
        % adjusted_label)

    # Write gene results
    for i, row in enumerate(data):
        (orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post,
         mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC,
         l_delta_logFC, u_delta_logFC, probROPE,
         not_HDI_overlap_bit) = row

        interaction = self.classify_interaction(mean_delta_logFC,
                                                mean_logFC_B, mean_logFC_A)
        type_of_interaction = "No Interaction"
        if self.signif in prob_methods and adjusted_prob[i] < 0.05:
            type_of_interaction = interaction
        if self.signif == "HDI" and not_HDI_overlap_bit:
            type_of_interaction = interaction

        new_row = tuple(
            list(row[:-2]) + [
                not_HDI_overlap_bit, probROPE, adjusted_prob[i],
                type_of_interaction
            ])
        self.output.write(
            "%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%s\t%1.8f\t%1.8f\t%s\n"
            % new_row)

    # Close (and thereby flush) the results before registering the file.
    self.output.close()

    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="GI")
    self.finish()
    self.transit_message("Finished Genetic Interactions Method")
def test_normalization(self):
    """Check that TTR normalization does not return all-ones factors.

    Loads every dataset in ``all_data_list``, normalizes with the "TTR"
    method and asserts that the returned factors are not all equal to 1.
    """
    counts, _sites = tnseq_tools.get_data(all_data_list)
    _normalized, factors = norm_tools.normalize_data(counts, "TTR")

    unit_factors = numpy.ones(len(all_data_list))
    every_factor_is_one = (factors == unit_factors).all()
    self.assertFalse(every_factor_is_one)
if DO_LIB: ctrl_lib_str = "ABAB" exp_lib_str = "AAABBB" else: ctrl_lib_str = "" exp_lib_str = "" Kctrl = len(ctrldata) Kexp = len(expdata) (data, position) = transit_tools.get_validated_data(ctrldata+expdata) (K,N) = data.shape (data, factors) = norm_tools.normalize_data(data, "TTR", ctrldata+expdata, annotation) G = tnseq_tools.Genes(ctrldata + expdata, annotation, data=data, position=position) gene = G[i] print("\n\n") print("#"*100) print("# (%s) NEW TEST: %s" % (DO_LIB, gene)) print("#"*100) print("") ii = numpy.ones(gene.n) == 1
def Run(self):
    """Run the Example method and write one summary row per gene.

    Loads the control datasets, optionally normalizes them, and for every
    gene writes a tab-separated row with its ORF id, name, description,
    ``k``, ``n``, the mean of its reads, and the sum of its reads divided by
    ``k``. Rows are written in sorted order after a commented header; the
    output is then closed and registered through ``add_file``.

    Header values are formatted as text. The previous ``.encode('utf-8')``
    calls produced ``bytes``, which ``%s`` renders as ``b'...'`` on
    Python 3 (the interpreter this method names in its console header).
    """
    self.transit_message("Starting Example Method")
    start_time = time.time()

    # Get orf data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(
        self.ctrldata, wxobj=self.wxobj)

    if self.normalization and self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, _factors) = norm_tools.normalize_data(
            data, self.normalization, self.ctrldata, self.annotation_path)

    G = tnseq_tools.Genes(self.ctrldata,
                          self.annotation_path,
                          minread=1,
                          reps=self.replicates,
                          ignoreCodon=self.ignoreCodon,
                          nterm=self.NTerminus,
                          cterm=self.CTerminus,
                          data=data,
                          position=position)

    rows = []
    N = len(G)
    self.progress_range(N)
    for count, gene in enumerate(G, 1):
        # Guard the empty cases so neither statistic is computed on nothing.
        mean = 0.0 if gene.n == 0 else numpy.mean(gene.reads)
        nzmean = 0.0 if gene.k == 0 else numpy.sum(gene.reads) / float(gene.k)
        rows.append("%s\t%s\t%s\t%s\t%s\t%1.2f\t%1.2f\n" %
                    (gene.orf, gene.name, gene.desc, gene.k, gene.n, mean,
                     nzmean))

        # Update Progress
        text = "Running Example Method... %5.1f%%" % (100.0 * count / N)
        self.progress_update(text, count)

    self.output.write("#Example\n")
    if self.wxobj:
        self.output.write(
            "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
            (",".join(self.ctrldata), self.annotation_path,
             self.output.name))
    else:
        self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

    self.output.write("#Data: %s\n" % ",".join(self.ctrldata))
    self.output.write("#Annotation path: %s\n" % self.annotation_path)
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#%s\n" % "\t".join(columns))

    # Rows are plain strings, so this orders them lexicographically.
    rows.sort()
    for line in rows:
        self.output.write(line)
    self.output.close()

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Example")
    self.finish()
    self.transit_message("Finished Example Method")
def Run(self):
    """Run the Griffin method and write one result row per gene.

    Loads the control datasets, optionally normalizes them, and for each
    gene computes an expected run length (``tnseq_tools.ExpectedRuns``) and
    a p-value as ``1 - GumbelCDF(gene.r, u, B)``. Genes with ``n == 0`` get
    an expected run of 0.0 and a p-value of 1.0. P-values are adjusted with
    ``stat_tools.BH_fdr_correction`` and the sorted results are written to
    ``self.output``, which is then closed and registered via ``add_file``.
    """
    self.transit_message("Starting Griffin Method")
    start_time = time.time()

    # Get orf data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(
        self.ctrldata, wxobj=self.wxobj)

    if self.normalization and self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, _factors) = norm_tools.normalize_data(
            data, self.normalization, self.ctrldata, self.annotation_path)

    G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, minread=1,
                          reps=self.replicates, ignoreCodon=self.ignoreCodon,
                          nterm=self.NTerminus, cterm=self.CTerminus,
                          data=data, position=position)

    N = len(G)
    self.progress_range(N)
    pins = G.global_theta()
    pnon = 1.0 - pins
    results = []
    for count, gene in enumerate(G):
        if gene.n == 0:
            results.append([gene, 0.0, 1.000])
        else:
            # Gumbel parameters are computed per gene, only when needed, so
            # a dataset where every gene has n == 0 never evaluates the logs.
            B = 1.0 / math.log(1.0 / pnon)
            u = math.log(gene.n * pins, 1.0 / pnon)
            exprun = tnseq_tools.ExpectedRuns(gene.n, pnon)
            pval = 1.0 - tnseq_tools.GumbelCDF(gene.r, u, B)
            results.append([gene, exprun, pval])

        text = "Running Griffin Method... %5.1f%%" % (100.0 * (count + 1) / (N))
        self.progress_update(text, count)

    # Append the adjusted p-value to every [gene, exprun, pval] row.
    pvals = [row[-1] for row in results]
    for row, adjusted in zip(results, stat_tools.BH_fdr_correction(pvals)):
        row.append(adjusted)
    # Rows start with the gene object, so ordering relies on how gene
    # objects compare.
    results.sort()

    self.output.write("#Griffin\n")
    # NOTE(review): on Python 3 the .encode('utf-8') values below are bytes
    # and "%s" renders them as b'...'; confirm the target interpreter
    # before changing the header format.
    if self.wxobj:
        self.output.write(
            "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
            (",".join(self.ctrldata).encode('utf-8'),
             self.annotation_path.encode('utf-8'),
             self.output.name.encode('utf-8')))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))

    self.output.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8'))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#%s\n" % "\t".join(columns))

    for (gene, exprun, pval, padj) in results:
        self.output.write(
            "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%1.1f\t%1.5f\t%1.5f\n" %
            (gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r, gene.s,
             gene.t, exprun, pval, padj))

    self.output.close()

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Griffin")
    self.finish()
    self.transit_message("Finished Griffin Method")
def Run(self):
    """Run the rank-product method comparing experimental vs control data.

    Loads control and experimental datasets together, optionally normalizes
    them, and computes per-replicate mean counts for every gene. The log2
    fold-changes are ranked within each replicate and combined into an
    observed rank product per gene. ``self.samples`` random rank
    permutations provide the null distribution from which an E-value, a
    ``q_paper`` value (E-value divided by the gene's rank) and a p-value
    are derived. Results are written to ``self.output``, which is then
    closed and registered via ``add_file``.
    """
    self.transit_message("Starting rankproduct Method")
    start_time = time.time()

    Kctrl = len(self.ctrldata)
    Kexp = len(self.expdata)

    # Get orf data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(
        self.ctrldata + self.expdata, wxobj=self.wxobj)
    if self.normalization != "none":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, _factors) = norm_tools.normalize_data(
            data, self.normalization, self.ctrldata + self.expdata,
            self.annotation_path)

    # The first Kctrl rows are control replicates, the rest experimental.
    Gctrl = tnseq_tools.Genes(self.ctrldata + self.expdata,
                              self.annotation_path,
                              ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus,
                              cterm=self.CTerminus,
                              data=data[:Kctrl, :],
                              position=position)

    Gexp = tnseq_tools.Genes(self.ctrldata + self.expdata,
                             self.annotation_path,
                             ignoreCodon=self.ignoreCodon,
                             nterm=self.NTerminus,
                             cterm=self.CTerminus,
                             data=data[Kctrl:, :],
                             position=position)

    Ngenes = len(Gctrl)

    # Get the average counts for all the genes, in each replicate.
    # Columns stay zero for genes whose reads are empty or all zero.
    meanCtrl = numpy.zeros((Kctrl, Ngenes))
    meanExp = numpy.zeros((Kexp, Ngenes))
    for i in range(Ngenes):
        if numpy.any(Gctrl[i].reads):
            meanCtrl[:, i] = numpy.mean(Gctrl[i].reads, 1)
        if numpy.any(Gexp[i].reads):
            meanExp[:, i] = numpy.mean(Gexp[i].reads, 1)

    # Calculate a logFC2 between Experimental and Control,
    # then its rank within each replicate and the observed rank product.
    logFC2 = numpy.log2((meanExp + 0.0001) / (meanCtrl + 0.0001))
    rank = numpy.array([scipy.stats.rankdata(Lvec) for Lvec in logFC2])
    obsRP = numpy.power(numpy.prod(rank, 0), 1.0 / Kctrl)

    # Null distribution: rank products of independently permuted ranks.
    permutations = numpy.zeros((self.samples, Ngenes))
    tempranks = numpy.array(
        [numpy.arange(1, Ngenes + 1) for rep in range(Kctrl)])
    for s in range(self.samples):
        rankperm = numpy.array(
            [numpy.random.permutation(tr) for tr in tempranks])
        permutations[s] = numpy.power(numpy.prod(rankperm, 0), 1.0 / Kctrl)

    # Rank of each gene's observed rank product (1 = smallest). argsort
    # alone returns the sorting permutation, not per-gene ranks, which made
    # q_paper divide by an unrelated number.
    rankRP = scipy.stats.rankdata(obsRP, method="ordinal")

    # rankproduct
    data = []
    self.progress_range(Ngenes)
    for i, gene in enumerate(Gctrl):
        count = i + 1
        meanctrl = numpy.mean(Gctrl[i].reads)
        meanexp = numpy.mean(Gexp[i].reads)
        log2fc = numpy.log2((meanexp + 0.0001) / (meanctrl + 0.0001))
        # Number of permuted rank products at least as small as observed.
        countbetter = numpy.sum(permutations <= obsRP[i])
        pval = countbetter / float(self.samples * Ngenes)
        e_val = countbetter / float(self.samples)
        q_paper = e_val / float(rankRP[i])
        data.append([
            gene.orf, gene.name, gene.desc, gene.n, meanctrl, meanexp,
            log2fc, obsRP[i], e_val, q_paper, pval
        ])

        # Update Progress
        text = "Running rankproduct Method... %5.1f%%" % (100.0 * count /
                                                          Ngenes)
        self.progress_update(text, count)

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Performing Benjamini-Hochberg Correction")
    data.sort()
    # NOTE(review): q_bh is computed but never written to the output below;
    # confirm whether a BH-adjusted column was intended.
    q_bh = stat_tools.BH_fdr_correction([row[-1] for row in data])

    self.output.write("#RankProduct\n")
    # NOTE(review): on Python 3 the .encode('utf-8') values below are bytes
    # and "%s" renders them as b'...'; confirm the target interpreter.
    if self.wxobj:
        self.output.write(
            "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
            (",".join(self.ctrldata).encode('utf-8'),
             self.annotation_path.encode('utf-8'),
             self.output.name.encode('utf-8')))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))

    self.output.write("#Data: %s\n" %
                      (",".join(self.ctrldata).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" %
                      self.annotation_path.encode('utf-8'))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    # NOTE(review): sibling methods join `columns` with tabs here; verify
    # that `columns` is already a string in this module.
    self.output.write("#%s\n" % (columns))

    for row in data:
        (orf, name, desc, n, mean1, mean2, log2FCgene, obsRPgene, e_val,
         q_paper, pval) = row
        self.output.write(
            "%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.8f\t%1.1f\t%1.8f\n" %
            (orf, name, desc, n, mean1, mean2, log2FCgene, obsRPgene, e_val,
             q_paper))
    self.output.close()

    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="RankProduct")
    self.finish()
    self.transit_message("Finished rankproduct Method")
def Run(self):
    """Run the Mann-Whitney U-test method comparing two conditions.

    Loads control and experimental datasets together, optionally applies
    normalization and LOESS correction, and for each gene runs a two-sided
    ``scipy.stats.mannwhitneyu`` on the flattened control vs experimental
    reads. Genes with ``k == 0`` or ``n == 0`` get zeros and a p-value of
    1.0, as do genes for which the test raises ``ValueError``. P-values are
    adjusted with ``stat_tools.BH_fdr_correction`` and the results written
    to ``self.output``, which is then closed and registered via
    ``add_file``.
    """
    self.transit_message("Starting Mann-Whitney U-test Method")
    start_time = time.time()

    Kctrl = len(self.ctrldata)

    # Get orf data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(
        self.ctrldata + self.expdata, wxobj=self.wxobj)
    (K, N) = data.shape

    if self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, _factors) = norm_tools.normalize_data(
            data, self.normalization, self.ctrldata + self.expdata,
            self.annotation_path)

    if self.LOESS:
        self.transit_message("Performing LOESS Correction")
        for j in range(K):
            data[j] = stat_tools.loess_correction(position, data[j])

    G = tnseq_tools.Genes(self.ctrldata + self.expdata, self.annotation_path,
                          ignoreCodon=self.ignoreCodon, nterm=self.NTerminus,
                          cterm=self.CTerminus, data=data, position=position)

    # u-test
    data = []
    N = len(G)
    self.progress_range(N)
    for count, gene in enumerate(G, 1):
        if gene.k == 0 or gene.n == 0:
            (mean1, mean2, log2FC, u_stat, pval_2tail) = (0, 0, 0, 0.0, 1.00)
        else:
            # Column mask: either only sites with at least one read across
            # all rows, or every site of the gene.
            if not self.includeZeros:
                ii = numpy.sum(gene.reads, 0) > 0
            else:
                ii = numpy.ones(gene.n) == 1

            data1 = gene.reads[:Kctrl, ii].flatten()
            data2 = gene.reads[Kctrl:, ii].flatten()
            try:
                u_stat, pval_2tail = scipy.stats.mannwhitneyu(
                    data1, data2, alternative="two-sided")
            except ValueError:
                # The test could not be computed for these samples.
                u_stat, pval_2tail = 0.0, 1.00

            mean1 = numpy.mean(data1) if len(data1) > 0 else 0
            mean2 = numpy.mean(data2) if len(data2) > 0 else 0

            try:
                # Only adjust log2FC if one of the means is zero
                if mean1 > 0 and mean2 > 0:
                    log2FC = math.log((mean2) / (mean1), 2)
                else:
                    log2FC = math.log((mean2 + 1.0) / (mean1 + 1.0), 2)
            except (ValueError, ArithmeticError):
                # Numeric failures only (e.g. log of a non-positive ratio);
                # a bare except would also swallow KeyboardInterrupt.
                log2FC = 0.0

        # ["Orf","Name","Desc","Sites","Mean Ctrl","Mean Exp","log2FC",
        #  "U-Statistic","p-value","Adj. p-value"]
        data.append([gene.orf, gene.name, gene.desc, gene.n, mean1, mean2,
                     log2FC, u_stat, pval_2tail])

        # Update Progress
        text = "Running Mann-Whitney U-test Method... %1.1f%%" % (100.0 * count / N)
        self.progress_update(text, count)

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Performing Benjamini-Hochberg Correction")
    # Sort before correcting so qval[i] lines up with the written row i.
    data.sort()
    qval = stat_tools.BH_fdr_correction([row[-1] for row in data])

    self.output.write("#utest\n")
    # NOTE(review): on Python 3 the .encode('utf-8') values below are bytes
    # and "%s" renders them as b'...'; confirm the target interpreter.
    if self.wxobj:
        self.output.write(
            "#GUI with: norm=%s, includeZeros=%s, output=%s\n" %
            (self.normalization, self.includeZeros,
             self.output.name.encode('utf-8')))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))
    self.output.write("#Control Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
    self.output.write("#Experimental Data: %s\n" % (",".join(self.expdata).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" % (self.annotation_path.encode('utf-8')))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#%s\n" % "\t".join(columns))

    for i, row in enumerate(data):
        (orf, name, desc, n, mean1, mean2, log2FC, u_stat, pval_2tail) = row
        self.output.write(
            "%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.2f\t%1.5f\t%1.5f\n" %
            (orf, name, desc, n, mean1, mean2, log2FC, u_stat, pval_2tail,
             qval[i]))
    self.output.close()

    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="utest")
    self.finish()
    self.transit_message("Finished Mann-Whitney U-test Method")
def Run(self):
    """Run the Gumbel method and write a per-gene call to ``self.output``.

    Loads the control datasets, optionally normalizes them, and keeps the
    genes accepted by ``self.good_orf`` for sampling. Each iteration of the
    sampling loop proposes a new ``phi`` (Gaussian step from the previous
    value), accepts or rejects it, draws a new class vector ``Z`` with
    ``self.sample_Z`` and draws ``w1`` from a Beta distribution. After
    ``self.burnin`` iterations every ``self.trim``-th iteration is stored
    until ``self.samples`` samples exist.

    The mean of the stored ``Z`` values per gene (``zbar``) is compared with
    the thresholds from ``stat_tools.bayesian_ess_thresholds`` to assign the
    call "E", "U", "NE" or "S"; genes rejected by ``good_orf`` get
    ``zbar = -1.0`` and therefore "S".

    Returns early (after logging messages, without writing output) if a
    ``ValueError`` is raised inside the sampling loop.
    """
    self.status_message("Starting Gumbel Method")

    #Set Default parameter values
    w1 = 0.15
    w0 = 1.0 - w1
    ALPHA = 1
    BETA = 1
    ALPHA_w = 600
    BETA_w = 3400
    mu_c = 0
    acctot = 0.0
    phi_start = 0.3
    sigma_c = 0.01

    start_time = time.time()

    self.progress_range(self.samples+self.burnin)

    #Get orf data
    self.transit_message("Reading Annotation")

    #Validate data has empty sites
    #(status, genome) = transit_tools.validate_wig_format(self.ctrldata, wxobj=self.wxobj)
    #if status <2: tn_used = "himar1"
    #else: tn_used = "tn5"

    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
    (K,N) = data.shape

    if self.normalization and self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata, self.annotation_path)

    G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, minread=self.minread, reps=self.replicates, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data, position=position)

    ii_good = numpy.array([self.good_orf(g) for g in G]) # Gets index of the genes that can be analyzed

    # Per-gene statistics restricted to the analyzable genes. Note that K
    # and N are rebound here and no longer hold the data shape.
    K = G.local_insertions()[ii_good]
    N = G.local_sites()[ii_good]
    R = G.local_runs()[ii_good]
    S = G.local_gap_span()[ii_good]
    T = G.local_gene_span()[ii_good]

    self.transit_message("Doing Regression")
    mu_s, temp, sigma_s = stat_tools.regress(R, S) # Linear regression to estimate mu_s, sigma_s for span data
    mu_r, temp, sigma_r = stat_tools.regress(S, R) # Linear regression to estimate mu_r, sigma_r for run data

    N_GENES = len(G)
    N_GOOD = sum(ii_good)

    self.transit_message("Setting Initial Class")
    # One row per analyzable gene, one column per stored sample.
    Z_sample = numpy.zeros((N_GOOD, self.samples))
    Z = [self.classify(g.n, g.r, 0.5) for g in G if self.good_orf(g)]
    Z_sample[:,0] = Z
    N_ESS = numpy.sum(Z_sample[:,0] == 1)

    phi_sample = numpy.zeros(self.samples) #[]
    phi_sample[0] = phi_start
    phi_old = phi_start
    phi_new = 0.00

    # Per-gene factor passed unchanged to sample_Z on every iteration.
    SIG = numpy.array([self.sigmoid(g.s, g.t) * scipy.stats.norm.pdf(g.r, mu_r*g.s, sigma_r) for g in G if self.good_orf(g)])

    # idxG,idxN = -1,0
    # for i in range(len(G)):
    #   if G[i].name=="glf": idxG = i
    #   if ii_good[i]==True: idxN += 1 # could do sum(ii_good[:idxG])

    # i indexes the next sample slot to fill (slot 0 holds the initial
    # state); count is the total number of iterations performed.
    i = 1; count = 0;
    while i < self.samples:

        try:
            # PHI
            acc = 1.0
            phi_new = phi_old + random.gauss(mu_c, sigma_c)
            i0 = Z_sample[:,i-1] == 0
            # Reject proposals outside (0, 1], or when the change in F_non
            # over the genes with Z == 0 in the last stored sample falls
            # below the log of a uniform draw.
            if phi_new > 1 or phi_new <= 0 or (self.F_non(phi_new, N[i0], R[i0]) - self.F_non(phi_old, N[i0], R[i0])) < math.log(random.uniform(0,1)):
                phi_new = phi_old
                acc = 0.0
                flag = 0

            # Z
            Z = self.sample_Z(phi_new, w1, N, R, S, T, mu_s, sigma_s, SIG)

            # w1
            N_ESS = sum(Z == 1)
            w1 = scipy.stats.beta.rvs(N_ESS + ALPHA_w, N_GOOD - N_ESS + BETA_w)

            count +=1
            acctot+=acc

            # Store a sample only after burn-in and only every trim-th
            # iteration.
            if (count > self.burnin) and (count % self.trim == 0):
                phi_sample[i] = phi_new
                Z_sample[:,i] = Z
                i+=1

        except ValueError as e:
            self.transit_message("Error: %s" % e)
            self.transit_message("This is likely to have been caused by poor data (e.g. too sparse).")
            self.transit_message("If the density of the dataset is too low, the Gumbel method will not work.")
            self.transit_message("Quitting.")
            return

        # print i,phi_new,w1,G[idxG].name,N[idxN],R[idxN],Z[idxN]

        phi_old = phi_new
        #Update progress
        text = "Running Gumbel Method... %5.1f%%" % (100.0*(count+1)/(self.samples+self.burnin))
        self.progress_update(text, count)

    # Mean of the stored Z values for each analyzable gene.
    ZBAR = numpy.apply_along_axis(numpy.mean, 1, Z_sample)
    (ess_t, non_t) = stat_tools.bayesian_ess_thresholds(ZBAR)

    #Orf k n r s zbar
    self.output.write("#Gumbel\n")
    if self.wxobj:
        # NOTE(review): members/memberstr are built but never used below.
        members = sorted([attr for attr in dir(self) if not callable(getattr(self,attr)) and not attr.startswith("__")])
        memberstr = ""
        for m in members:
            memberstr += "%s = %s, " % (m, getattr(self, m))
        self.output.write("#GUI with: ctrldata=%s, annotation=%s, output=%s, samples=%s, minread=%s, trim=%s\n" % (",".join(self.ctrldata).encode('utf-8'), self.annotation_path.encode('utf-8'), self.output.name.encode('utf-8'), self.samples, self.minread, self.trim))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))

    self.output.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8'))
    self.output.write("#FDR Corrected thresholds: %f, %f\n" % (ess_t, non_t))
    self.output.write("#MH Acceptance-Rate:\t%2.2f%%\n" % (100.0*acctot/count))
    self.output.write("#Total Iterations Performed:\t%d\n" % count)
    self.output.write("#Sample Size:\t%d\n" % i)
    self.output.write("#phi estimate:\t%f\n" % numpy.average(phi_sample))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#%s\n" % "\t".join(columns))

    # i is reused as the index into ZBAR, which only has entries for the
    # genes accepted by good_orf.
    i = 0
    data = []
    for g in G:
        if not self.good_orf(g):
            zbar = -1.0
        else:
            zbar = ZBAR[i]
            i+=1
        if zbar > ess_t:
            call = "E"
        elif non_t <= zbar <= ess_t:
            call = "U"
        elif 0 <= zbar < non_t:
            call = "NE"
        else:
            # Reached for zbar < 0, i.e. genes rejected by good_orf.
            call = "S"
        data.append("%s\t%s\t%s\t%d\t%d\t%d\t%d\t%f\t%s\n" % (g.orf, g.name, g.desc, g.k, g.n, g.r, g.s, zbar, call))
    data.sort()
    for line in data:
        self.output.write(line)
    self.output.close()

    self.transit_message("") # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Gumbel")
    self.finish()
    self.transit_message("Finished Gumbel Method")
def Run(self):
    """Run the Gumbel method and write a per-gene call to ``self.output``.

    Loads the control datasets, optionally normalizes them, and keeps the
    genes accepted by ``self.good_orf`` for sampling. Each iteration of the
    sampling loop proposes a new ``phi`` (Gaussian step from the previous
    value), accepts or rejects it, draws a new class vector ``Z`` with
    ``self.sample_Z`` and draws ``w1`` from a Beta distribution. After
    ``self.burnin`` iterations every ``self.trim``-th iteration is stored
    until ``self.samples`` samples exist.

    The mean of the stored ``Z`` values per gene (``zbar``) is compared with
    the thresholds from ``stat_tools.bayesian_ess_thresholds`` to assign the
    call "E", "U", "NE" or "S"; genes rejected by ``good_orf`` get
    ``zbar = -1.0`` and therefore "S".

    Returns early (after logging messages, without writing output) if a
    ``ValueError`` is raised inside the sampling loop.
    """
    self.status_message("Starting Gumbel Method")

    #Set Default parameter values
    w1 = 0.15
    w0 = 1.0 - w1
    ALPHA = 1
    BETA = 1
    ALPHA_w = 600
    BETA_w = 3400
    mu_c = 0
    acctot = 0.0
    phi_start = 0.3
    sigma_c = 0.01

    start_time = time.time()

    self.progress_range(self.samples + self.burnin)

    #Get orf data
    self.transit_message("Reading Annotation")

    #Validate data has empty sites
    #(status, genome) = transit_tools.validate_wig_format(self.ctrldata, wxobj=self.wxobj)
    #if status <2: tn_used = "himar1"
    #else: tn_used = "tn5"

    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                        wxobj=self.wxobj)
    (K, N) = data.shape

    if self.normalization and self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data,
                                                    self.normalization,
                                                    self.ctrldata,
                                                    self.annotation_path)

    G = tnseq_tools.Genes(self.ctrldata,
                          self.annotation_path,
                          minread=self.minread,
                          reps=self.replicates,
                          ignoreCodon=self.ignoreCodon,
                          nterm=self.NTerminus,
                          cterm=self.CTerminus,
                          data=data,
                          position=position)

    ii_good = numpy.array(
        [self.good_orf(g)
         for g in G])  # Gets index of the genes that can be analyzed

    # Per-gene statistics restricted to the analyzable genes. Note that K
    # and N are rebound here and no longer hold the data shape.
    K = G.local_insertions()[ii_good]
    N = G.local_sites()[ii_good]
    R = G.local_runs()[ii_good]
    S = G.local_gap_span()[ii_good]
    T = G.local_gene_span()[ii_good]

    self.transit_message("Doing Regression")
    mu_s, temp, sigma_s = stat_tools.regress(
        R, S)  # Linear regression to estimate mu_s, sigma_s for span data
    mu_r, temp, sigma_r = stat_tools.regress(
        S, R)  # Linear regression to estimate mu_r, sigma_r for run data

    N_GENES = len(G)
    N_GOOD = sum(ii_good)

    self.transit_message("Setting Initial Class")
    # One row per analyzable gene, one column per stored sample.
    Z_sample = numpy.zeros((N_GOOD, self.samples))
    Z = [self.classify(g.n, g.r, 0.5) for g in G if self.good_orf(g)]
    Z_sample[:, 0] = Z
    N_ESS = numpy.sum(Z_sample[:, 0] == 1)

    phi_sample = numpy.zeros(self.samples)  #[]
    phi_sample[0] = phi_start
    phi_old = phi_start
    phi_new = 0.00

    # Per-gene factor passed unchanged to sample_Z on every iteration.
    SIG = numpy.array([
        self.sigmoid(g.s, g.t) *
        scipy.stats.norm.pdf(g.r, mu_r * g.s, sigma_r) for g in G
        if self.good_orf(g)
    ])

    # idxG,idxN = -1,0
    # for i in range(len(G)):
    #   if G[i].name=="glf": idxG = i
    #   if ii_good[i]==True: idxN += 1 # could do sum(ii_good[:idxG])

    # i indexes the next sample slot to fill (slot 0 holds the initial
    # state); count is the total number of iterations performed.
    i = 1
    count = 0
    while i < self.samples:

        try:
            # PHI
            acc = 1.0
            phi_new = phi_old + random.gauss(mu_c, sigma_c)
            i0 = Z_sample[:, i - 1] == 0
            # Reject proposals outside (0, 1], or when the change in F_non
            # over the genes with Z == 0 in the last stored sample falls
            # below the log of a uniform draw.
            if phi_new > 1 or phi_new <= 0 or (
                    self.F_non(phi_new, N[i0], R[i0]) -
                    self.F_non(phi_old, N[i0], R[i0])) < math.log(
                        random.uniform(0, 1)):
                phi_new = phi_old
                acc = 0.0
                flag = 0

            # Z
            Z = self.sample_Z(phi_new, w1, N, R, S, T, mu_s, sigma_s, SIG)

            # w1
            N_ESS = sum(Z == 1)
            w1 = scipy.stats.beta.rvs(N_ESS + ALPHA_w,
                                      N_GOOD - N_ESS + BETA_w)

            count += 1
            acctot += acc

            # Store a sample only after burn-in and only every trim-th
            # iteration.
            if (count > self.burnin) and (count % self.trim == 0):
                phi_sample[i] = phi_new
                Z_sample[:, i] = Z
                i += 1

        except ValueError as e:
            self.transit_message("Error: %s" % e)
            self.transit_message(
                "This is likely to have been caused by poor data (e.g. too sparse)."
            )
            self.transit_message(
                "If the density of the dataset is too low, the Gumbel method will not work."
            )
            self.transit_message("Quitting.")
            return

        # print i,phi_new,w1,G[idxG].name,N[idxN],R[idxN],Z[idxN]

        phi_old = phi_new
        #Update progress
        text = "Running Gumbel Method... %5.1f%%" % (
            100.0 * (count + 1) / (self.samples + self.burnin))
        self.progress_update(text, count)

    # Mean of the stored Z values for each analyzable gene.
    ZBAR = numpy.apply_along_axis(numpy.mean, 1, Z_sample)
    (ess_t, non_t) = stat_tools.bayesian_ess_thresholds(ZBAR)

    #Orf k n r s zbar
    self.output.write("#Gumbel\n")
    if self.wxobj:
        # NOTE(review): members/memberstr are built but never used below.
        members = sorted([
            attr for attr in dir(self)
            if not callable(getattr(self, attr)) and
            not attr.startswith("__")
        ])
        memberstr = ""
        for m in members:
            memberstr += "%s = %s, " % (m, getattr(self, m))
        self.output.write(
            "#GUI with: ctrldata=%s, annotation=%s, output=%s, samples=%s, minread=%s, trim=%s\n"
            % (",".join(self.ctrldata).encode('utf-8'),
               self.annotation_path.encode('utf-8'),
               self.output.name.encode('utf-8'), self.samples, self.minread,
               self.trim))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))

    self.output.write("#Data: %s\n" %
                      (",".join(self.ctrldata).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" %
                      self.annotation_path.encode('utf-8'))
    self.output.write("#FDR Corrected thresholds: %f, %f\n" %
                      (ess_t, non_t))
    self.output.write("#MH Acceptance-Rate:\t%2.2f%%\n" %
                      (100.0 * acctot / count))
    self.output.write("#Total Iterations Performed:\t%d\n" % count)
    self.output.write("#Sample Size:\t%d\n" % i)
    self.output.write("#phi estimate:\t%f\n" % numpy.average(phi_sample))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#%s\n" % "\t".join(columns))

    # i is reused as the index into ZBAR, which only has entries for the
    # genes accepted by good_orf.
    i = 0
    data = []
    for g in G:
        if not self.good_orf(g):
            zbar = -1.0
        else:
            zbar = ZBAR[i]
            i += 1
        if zbar > ess_t:
            call = "E"
        elif non_t <= zbar <= ess_t:
            call = "U"
        elif 0 <= zbar < non_t:
            call = "NE"
        else:
            # Reached for zbar < 0, i.e. genes rejected by good_orf.
            call = "S"
        data.append(
            "%s\t%s\t%s\t%d\t%d\t%d\t%d\t%f\t%s\n" %
            (g.orf, g.name, g.desc, g.k, g.n, g.r, g.s, zbar, call))
    data.sort()
    for line in data:
        self.output.write(line)
    self.output.close()

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Gumbel")
    self.finish()
    self.transit_message("Finished Gumbel Method")
def Run(self):
    """Run the ZINB analysis and write a per-gene results table.

    Exits with status 1 (after reporting through ``transit_error``) when the
    R packages "MASS" or "pscl" are not installed. Otherwise reads the
    combined wig, normalizes it, maps samples to conditions, covariates and
    interactions from the samples metadata, filters samples by condition,
    computes per-gene statistics and runs ``self.run_zinb``.

    The output file (path ``self.output``) contains one row per gene with
    the Rv id, gene name, number of TA sites, ``Mean_``, ``NZmean_`` and
    ``NZperc_`` columns for each statistic group, the p-value, the adjusted
    p-value and the run status.

    The group ordering uses ``functools.cmp_to_key`` and the header is built
    from lists, so the method works on both Python 2.7 and Python 3
    (``sorted(seq, cmp)`` and ``list + map(...)`` fail on Python 3).
    """
    import functools  # local import: only needed to adapt the comparator

    self.transit_message("Starting ZINB analysis")
    start_time = time.time()
    packnames = ("MASS", "pscl")
    r_packages_needed = [x for x in packnames if not rpackages.isinstalled(x)]
    if (len(r_packages_needed) > 0):
        self.transit_error(
            "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))"
            % ({'0': '"{0}"'.format('", "'.join(r_packages_needed))}))
        sys.exit(1)

    self.transit_message("Getting Data")
    (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

    self.transit_message("Normalizing using: %s" % self.normalization)
    (data, _factors) = norm_tools.normalize_data(data, self.normalization)

    condition_name = self.condition
    conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(
        self.metadata, self.covars, self.interactions,
        condition_name=condition_name)

    ## [Condition] in the order of files in combined wig
    conditions = self.wigs_to_conditions(conditionsByFile, filenamesInCombWig)
    ## [Covariate] in the order of files in combined wig
    covariates = self.wigs_to_covariates(covariatesByFileList, filenamesInCombWig)
    ## [Interaction] in the order of files in combined wig
    interactions = self.wigs_to_interactions(interactionsByFileList, filenamesInCombWig)
    data, conditions, covariates, interactions = self.filter_wigs_by_conditions(
        data,
        conditions,
        covariates=covariates,
        interactions=interactions,
        ignored_conditions=self.ignored_conditions,
        included_conditions=self.included_conditions)

    genes = tnseq_tools.read_genes(self.annotation_path)

    TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
    RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(
        genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
    statsByRv, statGroupNames = self.stats_by_rv(
        data, RvSiteindexesMap, genes, conditions, interactions)
    LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

    self.transit_message("Running ZINB")
    pvals, qvals, run_status = self.run_zinb(
        data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap,
        conditions, covariates, interactions)

    def orderStats(x, y):
        """Compare two group names ("cond" or "cond_interaction")."""
        ic1 = x.split("_")
        ic2 = y.split("_")
        c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
        c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

        # Order by the include list when given, else by samples metadata.
        if len(self.included_conditions) > 0:
            conditionOrder = self.included_conditions
        else:
            conditionOrder = orderingMetadata['condition']
        condDiff = conditionOrder.index(c1) - conditionOrder.index(c2)

        ## Order by interaction, if stat belongs to same condition
        if condDiff == 0 and i1 is not None and i2 is not None:
            return (orderingMetadata['interaction'].index(i1) -
                    orderingMetadata['interaction'].index(i2))
        return condDiff

    orderedStatGroupNames = sorted(statGroupNames,
                                   key=functools.cmp_to_key(orderStats))

    self.transit_message("Adding File: %s" % (self.output))
    head = ("Rv Gene TAs".split() +
            ["Mean_" + v for v in orderedStatGroupNames] +
            ["NZmean_" + v for v in orderedStatGroupNames] +
            ["NZperc_" + v for v in orderedStatGroupNames] +
            "pval padj".split() + ["status"])

    # The context manager closes the file even if a write fails.
    with open(self.output, "w") as outfile:
        outfile.write("#Console: python %s\n" % " ".join(sys.argv))
        outfile.write('\t'.join(head) + EOL)
        for gene in genes:
            Rv = gene["rv"]
            vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                    ["%0.2f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames] +
                    ["%0.2f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames] +
                    ["%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames] +
                    ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]]
            outfile.write('\t'.join(vals) + EOL)

    self.transit_message("Finished Zinb analysis")
    self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
def Run(self):
    """Run the ZINB analysis and write a per-gene results table.

    Exits with status 1 (after reporting through ``transit_error``) when the
    R packages "MASS" or "pscl" are not installed. Otherwise reads the
    combined wig, normalizes it, maps samples to conditions, covariates and
    interactions from the samples metadata, filters samples by condition,
    computes per-gene statistics and runs ``self.run_zinb``.

    The output file (path ``self.output``) contains one row per gene with
    the Rv id, gene name, number of TA sites, ``Mean_``, ``NZmean_`` and
    ``NZperc_`` columns for each statistic group, the p-value, the adjusted
    p-value and the run status. Group names are split on ``SEPARATOR`` for
    ordering, and ``SEPARATOR`` is replaced by "_" in the column headers.
    """
    self.transit_message("Starting ZINB analysis")
    start_time = time.time()
    packnames = ("MASS", "pscl")
    r_packages_needed = [x for x in packnames if not rpackages.isinstalled(x)]
    if (len(r_packages_needed) > 0):
        self.transit_error(
            "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))"
            % ({'0': '"{0}"'.format('", "'.join(r_packages_needed))}))
        sys.exit(1)

    self.transit_message("Getting Data")
    (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

    self.transit_message("Normalizing using: %s" % self.normalization)
    (data, _factors) = norm_tools.normalize_data(data, self.normalization)

    condition_name = self.condition
    conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(
        self.metadata, self.covars, self.interactions,
        condition_name=condition_name)

    ## [Condition] in the order of files in combined wig
    conditions = self.wigs_to_conditions(conditionsByFile, filenamesInCombWig)
    ## [Covariate] in the order of files in combined wig
    covariates = self.wigs_to_covariates(covariatesByFileList, filenamesInCombWig)
    ## [Interaction] in the order of files in combined wig
    interactions = self.wigs_to_interactions(interactionsByFileList, filenamesInCombWig)
    data, conditions, covariates, interactions = self.filter_wigs_by_conditions(
        data,
        conditions,
        covariates=covariates,
        interactions=interactions,
        ignored_conditions=self.ignored_conditions,
        included_conditions=self.included_conditions)

    genes = tnseq_tools.read_genes(self.annotation_path)

    TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
    RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(
        genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
    statsByRv, statGroupNames = self.stats_by_rv(
        data, RvSiteindexesMap, genes, conditions, interactions)
    LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

    self.transit_message("Running ZINB")
    pvals, qvals, run_status = self.run_zinb(
        data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap,
        conditions, covariates, interactions)

    def orderStats(x, y):
        """Compare two group names (condition, optionally + interaction)."""
        ic1 = x.split(SEPARATOR)
        ic2 = y.split(SEPARATOR)
        c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
        c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

        # Order by the include list when given, else by samples metadata.
        if len(self.included_conditions) > 0:
            conditionOrder = self.included_conditions
        else:
            conditionOrder = orderingMetadata['condition']
        condDiff = conditionOrder.index(c1) - conditionOrder.index(c2)

        ## Order by interaction, if stat belongs to same condition
        if condDiff == 0 and i1 is not None and i2 is not None:
            return (orderingMetadata['interaction'].index(i1) -
                    orderingMetadata['interaction'].index(i2))
        return condDiff

    orderedStatGroupNames = sorted(statGroupNames,
                                   key=functools.cmp_to_key(orderStats))
    headersStatGroupNames = [x.replace(SEPARATOR, '_') for x in orderedStatGroupNames]

    self.transit_message("Adding File: %s" % (self.output))
    head = ("Rv Gene TAs".split() +
            ["Mean_" + v for v in headersStatGroupNames] +
            ["NZmean_" + v for v in headersStatGroupNames] +
            ["NZperc_" + v for v in headersStatGroupNames] +
            "pval padj".split() + ["status"])

    # The context manager closes the file even if a write fails, and the
    # handle no longer shadows the `file` builtin name.
    with open(self.output, "w") as outfile:
        outfile.write("#Console: python %s\n" % " ".join(sys.argv))
        outfile.write('\t'.join(head) + EOL)
        for gene in genes:
            Rv = gene["rv"]
            vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                    ["%0.2f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames] +
                    ["%0.2f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames] +
                    ["%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames] +
                    ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]]
            outfile.write('\t'.join(vals) + EOL)

    self.transit_message("Finished Zinb analysis")
    self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
def Run(self):
    """Run the ZINB analysis and write the per-gene results table.

    The method:
      1. exits with status 1 if the R packages ``MASS`` or ``pscl`` are not
         installed;
      2. reads the combined wig and normalizes it with ``self.normalization``;
      3. maps every wig file to its condition, covariates and interactions and
         keeps only the samples belonging to the selected conditions;
      4. prints, per variable, which samples carry which value and warns when
         a combination of values has no samples;
      5. runs ``self.run_zinb`` and writes one tab-separated row per gene to
         ``self.output`` (means, log-fold-changes, non-zero means, non-zero
         percentages, p-value, adjusted p-value and run status).

    With exactly two stat groups a single ``LFC`` column (group 2 vs. group 1)
    is written; otherwise one ``LFC_<group>`` column per group, relative to the
    mean over all groups.  ``self.PC`` is added to numerator and denominator
    as a pseudocount.
    """
    # Function-scope import: the original used ``numpy.math.log``, an alias of
    # the stdlib ``math`` module that was removed in NumPy 2.0.
    import math

    self.transit_message("Starting ZINB analysis")
    start_time = time.time()

    packnames = ("MASS", "pscl")
    r_packages_needed = [
        x for x in packnames if not rpackages.isinstalled(x)
    ]
    if r_packages_needed:
        self.transit_error(
            "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))"
            % ({
                '0': '"{0}"'.format('", "'.join(r_packages_needed))
            }))
        sys.exit(1)

    self.transit_message("Getting Data")
    (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(
        self.combined_wig)

    self.transit_message("Normalizing using: %s" % self.normalization)
    (data, factors) = norm_tools.normalize_data(data, self.normalization)

    condition_name = self.condition
    # if a covar is not found, this crashes; check for it?
    conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(
        self.metadata,
        self.covars,
        self.interactions,
        condition_name=condition_name)

    ## [Condition] in the order of files in combined wig
    conditions = self.wigs_to_conditions(conditionsByFile,
                                         filenamesInCombWig)
    ## [Covariate] in the order of files in combined wig
    covariates = self.wigs_to_covariates(covariatesByFileList,
                                         filenamesInCombWig)
    ## [Interaction] in the order of files in combined wig
    interactions = self.wigs_to_interactions(interactionsByFileList,
                                             filenamesInCombWig)

    conditionsList = self.select_conditions(conditions,
                                            self.included_conditions,
                                            self.ignored_conditions,
                                            orderingMetadata)
    data, conditions, covariates, interactions = self.filter_wigs_by_conditions2(
        data,
        conditions,
        conditionsList,
        covariates=covariates,
        interactions=interactions)

    # show the samples associated with each condition (and covariates or
    # interactions, if defined), and count samples in each cross-product of vars
    filesByCondition = self.invertDict(conditionsByFile)
    samples_used = set()
    for cond in conditionsList:
        samples_used.update(filesByCondition[cond])

    # Named ``var_names`` (not ``vars``) so the builtin is not shadowed.
    var_names = [condition_name] + self.covars + self.interactions
    vars2vals = {}
    vars2vals[condition_name] = list(set(conditions))
    for i, var in enumerate(self.covars):
        vars2vals[var] = list(set(covariates[i]))
    for i, var in enumerate(self.interactions):
        vars2vals[var] = list(set(interactions[i]))

    varsByFileList = ([conditionsByFile] + covariatesByFileList +
                      interactionsByFileList)
    for i, var in enumerate(var_names):
        print("\nCondition/Covariate/Interaction: %s" % var)
        filesByVar = self.invertDict(varsByFileList[i])
        for k, v in filesByVar.items():
            samples = list(samples_used.intersection(set(v)))
            if k in vars2vals.get(var, []):
                print("%s: %s" % (k, ' '.join(samples)))

    print("\nsamples in cross-product:")
    any_empty = self.expandVar([], var_names, varsByFileList, vars2vals,
                               set(samples_used))
    if any_empty:
        print(
            "warning: ZINB requires samples in all combinations of conditions; the fact that one is empty could result in Model Errors"
        )

    genes = tnseq_tools.read_genes(self.annotation_path)

    TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
    RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes,
                                                      TASiteindexMap,
                                                      nterm=self.NTerminus,
                                                      cterm=self.CTerminus)
    statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap,
                                                 genes, conditions,
                                                 interactions)
    LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

    self.transit_message("Running ZINB")
    pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep,
                                             LogZPercByRep, RvSiteindexesMap,
                                             conditions, covariates,
                                             interactions)

    def orderStats(x, y):
        """cmp-style comparator ordering stat-group names for the output columns."""
        ic1 = x.split(SEPARATOR)
        ic2 = y.split(SEPARATOR)
        c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
        c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

        if len(self.included_conditions) > 0:
            condDiff = (self.included_conditions.index(c1) -
                        self.included_conditions.index(c2))
            ## Order by interaction, if stat belongs to same condition
            if condDiff == 0 and i1 is not None and i2 is not None:
                return (orderingMetadata['interaction'].index(i1) -
                        orderingMetadata['interaction'].index(i2))
            return condDiff

        ## Order by samples metadata, if include flag not provided.
        condDiff = (orderingMetadata['condition'].index(c1) -
                    orderingMetadata['condition'].index(c2))
        if condDiff == 0 and i1 is not None and i2 is not None:
            return (orderingMetadata['interaction'].index(i1) -
                    orderingMetadata['interaction'].index(i2))
        return condDiff

    orderedStatGroupNames = sorted(statGroupNames,
                                   key=functools.cmp_to_key(orderStats))
    headersStatGroupNames = [
        x.replace(SEPARATOR, '_') for x in orderedStatGroupNames
    ]

    self.transit_message("Adding File: %s" % (self.output))

    if len(headersStatGroupNames) == 2:
        lfcNames = ["LFC"]
    else:
        lfcNames = ["LFC_" + v for v in headersStatGroupNames]
    head = ("Rv Gene TAs".split() +
            ["Mean_" + v for v in headersStatGroupNames] + lfcNames +
            ["NZmean_" + v for v in headersStatGroupNames] +
            ["NZperc_" + v for v in headersStatGroupNames] +
            "pval padj".split() + ["status"])

    PC = self.PC
    # ``with`` guarantees the output file is closed even if a row fails to
    # format (e.g. math.log raising on a non-positive ratio).
    with open(self.output, "w") as out:
        out.write("#Console: python3 %s\n" % " ".join(sys.argv))
        out.write(
            "#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n"
            % (self.normalization, self.NTerminus, self.CTerminus, self.PC))
        out.write('#' + '\t'.join(head) + EOL)
        for gene in genes:
            Rv = gene["rv"]
            geneStats = statsByRv[Rv]
            means = [
                geneStats['mean'][group] for group in orderedStatGroupNames
            ]
            if len(means) == 2:
                LFCs = [math.log((means[1] + PC) / (means[0] + PC), 2)]
            else:
                m = numpy.mean(means)
                LFCs = [math.log((x + PC) / (m + PC), 2) for x in means]
            vals = ([Rv, gene["gene"],
                     str(len(RvSiteindexesMap[Rv]))] +
                    ["%0.1f" % x for x in means] +
                    ["%0.3f" % x for x in LFCs] + [
                        "%0.1f" % geneStats['nz_mean'][group]
                        for group in orderedStatGroupNames
                    ] + [
                        "%0.2f" % geneStats['nz_perc'][group]
                        for group in orderedStatGroupNames
                    ] + ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [
                        run_status[Rv]
                    ]
            out.write('\t'.join(vals) + EOL)

    self.transit_message("Finished Zinb analysis")
    self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
def test_normalization(self):
    """TTR normalization must yield factors that are not all exactly 1."""
    raw_counts, _position = tnseq_tools.get_data(all_data_list)
    _normalized, factors = norm_tools.normalize_data(raw_counts, "TTR")
    unit_factors = numpy.ones(len(all_data_list))
    all_unit = (factors == unit_factors).all()
    self.assertFalse(all_unit)
def Run(self):
    """Run the Griffin method and write the per-gene results to ``self.output``.

    Reads the control wig data, optionally normalizes it (skipped when
    ``self.normalization`` is empty or ``"nonorm"``), and for every gene
    computes the expected run length and a p-value from
    ``1 - tnseq_tools.GumbelCDF(gene.r, u, B)``.  Genes with ``gene.n == 0``
    get an expected run of 0.0 and a p-value of 1.0.  P-values are adjusted
    with ``stat_tools.BH_fdr_correction`` and the rows are written as a
    tab-separated table after a ``#``-prefixed header.

    Header values are formatted as plain ``str``: under Python 3 the previous
    ``.encode('utf-8')`` calls made ``%s`` emit ``b'...'`` reprs.
    """
    self.transit_message("Starting Griffin Method")
    start_time = time.time()

    # Get orf data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                        wxobj=self.wxobj)

    if self.normalization and self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization,
                                                    self.ctrldata,
                                                    self.annotation_path)

    G = tnseq_tools.Genes(self.ctrldata,
                          self.annotation_path,
                          minread=1,
                          reps=self.replicates,
                          ignoreCodon=self.ignoreCodon,
                          nterm=self.NTerminus,
                          cterm=self.CTerminus,
                          data=data,
                          position=position)

    N = len(G)
    self.progress_range(N)
    pins = G.global_theta()
    pnon = 1.0 - pins

    results = []
    for count, gene in enumerate(G):
        if gene.n == 0:
            results.append([gene, 0.0, 1.000])
        else:
            B = 1.0 / math.log(1.0 / pnon)
            u = math.log(gene.n * pins, 1.0 / pnon)
            exprun = tnseq_tools.ExpectedRuns(gene.n, pnon)
            pval = 1.0 - tnseq_tools.GumbelCDF(gene.r, u, B)
            results.append([gene, exprun, pval])

        text = "Running Griffin Method... %5.1f%%" % (100.0 * (count + 1) /
                                                      (N))
        self.progress_update(text, count)

    # Append the adjusted p-value as a fourth element of every row.
    pvals = [row[-1] for row in results]
    padj = stat_tools.BH_fdr_correction(pvals)
    for row, adjusted in zip(results, padj):
        row.append(adjusted)
    # Rows start with the gene object, so this relies on the gene type's
    # own ordering.
    results.sort()

    self.output.write("#Griffin\n")
    if self.wxobj:
        self.output.write(
            "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
            (",".join(self.ctrldata), self.annotation_path,
             self.output.name))
    else:
        self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

    self.output.write("#Data: %s\n" % (",".join(self.ctrldata)))
    self.output.write("#Annotation path: %s\n" % self.annotation_path)
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    self.output.write("#%s\n" % "\t".join(columns))

    for (gene, exprun, pval, padj) in results:
        self.output.write(
            "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%1.1f\t%1.5f\t%1.5f\n" %
            (gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r, gene.s,
             gene.t, exprun, pval, padj))

    self.output.close()

    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Griffin")
    self.finish()
    self.transit_message("Finished Griffin Method")
def Run(self):
    """Run the Binomial essentiality sampler and write the results table.

    Loads (and optionally normalizes) the control wig data, builds the gene
    collection, then iterates ``self.samples + self.burnin`` times.  Each
    iteration draws per-gene ``theta`` values, updates the four
    hyper-parameters (``rho0``, ``Kp0``, ``rho1``, ``Kp1``) with random-walk
    proposals that are accepted when ``log(uniform) < fc - f``, redraws the
    per-gene indicator ``Z`` and the mixing weight ``w1``.  After the loop the
    post-burn-in means of ``Z`` and ``theta`` are computed, each gene is
    labelled "Essential", "Non-Essential" or "Uncertain" from thresholds
    returned by ``stat_tools.bayesian_ess_thresholds``, and the sorted rows are
    written to ``self.output``.

    NOTE(review): the statement order inside the sampling loop determines the
    sequence of random draws; reordering would change results for a fixed seed.
    NOTE(review): ``print >> sys.stderr`` is Python 2 syntax.
    """
    self.transit_message("Starting Binomial Method")
    start_time = time.time()

    self.progress_range(self.samples+self.burnin)

    #Get orf data
    #self.transit_message("Getting Data")
    #G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus)

    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
    # K and N are rebound further down to per-gene arrays.
    (K,N) = data.shape

    # Normalization is skipped when the setting is empty or "nonorm".
    if self.normalization and self.normalization != "nonorm":
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata, self.annotation_path)

    G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, minread=1, reps=self.replicates, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data, position=position)

    #Parameters
    self.transit_message("Setting Parameters")
    # NOTE(review): this w1 is replaced by a beta draw below before the loop;
    # w0 and mu_c are not read again in this method.
    w1 = 0.15
    w0 = 1.0 - w1
    mu_c = 0

    Ngenes = len(G)
    sample_size = self.samples+self.burnin
    numReps = len(self.ctrldata)

    # One row per gene, one column per iteration.
    theta = numpy.zeros((Ngenes, sample_size))
    theta[:,0] = 0.10

    # Traces of the four hyper-parameters, seeded at index 0.
    rho0 = numpy.zeros(sample_size); rho0[0] = 0.5; Kp0 = numpy.zeros(sample_size); Kp0[0] = 10;
    rho1 = numpy.zeros(sample_size); rho1[0] = 0.10; Kp1 = numpy.zeros(sample_size); Kp1[0] = 3;

    Z = numpy.zeros((Ngenes, sample_size))
    pz1 = numpy.zeros(sample_size);
    n1 = 0

    w1 = scipy.stats.beta.rvs(self.alpha_w, self.beta_w)
    W1 = numpy.zeros(sample_size); W1[0] = w1

    #
    self.transit_message("Setting Initial Values")
    # Per gene: K = number of read entries > 0, N = total number of read entries
    # (both over the flattened reads array).
    K = numpy.array([sum([1 for x in gene.reads.flatten() if x> 0]) for gene in G])
    N = numpy.array([len(gene.reads.flatten()) for gene in G])

    for g,gene in enumerate(G):
        # Initial theta is K/N, except that genes with no entries start at 0.5
        # and the boundary ratios 0 and 1 both start at 0.001.
        if N[g] == 0: theta[g][0] = 0.5
        elif K[g]/float(N[g]) == 0: theta[g][0] = 0.001
        elif K[g]/float(N[g]) == 1: theta[g][0] = 0.001
        else: theta[g][0] = K[g]/float(N[g])

        #print g, ORF[g], K[g], N[g], theta[g][0]
        Z[g][0] = scipy.stats.bernoulli.rvs(1-theta[g][0])

    # Acceptance counters for the four random-walk updates.
    acc_p0 = 0; acc_k0 = 0; acc_p1 = 0; acc_k1 = 0;

    # Standard deviations of the normal proposal steps.
    rho0c_std = 0.010
    kp0c_std = 1.40
    rho1c_std = 0.009
    kp1c_std = 1.1

    # Divide warnings are silenced for the loop and set back to 'warn' after it.
    numpy.seterr(divide='ignore')
    for i in range(1, sample_size):

        # Split genes by the previous iteration's indicator value.
        i0 = Z[:,i-1] == 0; n0 = numpy.sum(i0);
        i1 = Z[:,i-1] == 1; n1 = numpy.sum(i1);

        # Draw theta for each group from a beta whose parameters combine the
        # previous hyper-parameters with the per-gene counts K and N - K.
        theta[i0,i] = scipy.stats.beta.rvs(Kp0[i-1]*rho0[i-1] + K[i0], Kp0[i-1]*(1-rho0[i-1]) + N[i0] - K[i0])
        theta[i1,i] = scipy.stats.beta.rvs(Kp1[i-1]*rho1[i-1] + K[i1], Kp1[i-1]*(1-rho1[i-1]) + N[i1] - K[i1])

        # Propose new group-0 hyper-parameters by a normal step from the
        # previous values.
        rho0_c = rho0[i-1] + scipy.stats.norm.rvs(0, rho0c_std)
        Kp0_c = Kp0[i-1] + scipy.stats.norm.rvs(0, kp0c_std)

        # Non-positive proposals are rejected outright.
        if rho0_c <= 0:
            rho0[i] = rho0[i-1]
        else:
            # fc / f0: log density of the candidate / current value plus the
            # summed log beta density of this iteration's group-0 thetas.
            fc = numpy.log(scipy.stats.beta.pdf(rho0_c, self.M0*self.pi0, self.M0*(1.0-self.pi0)))
            f0 = numpy.log(scipy.stats.beta.pdf(rho0[i-1], self.M0*self.pi0, self.M0*(1.0-self.pi0)))
            fc += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i0,i], Kp0[i-1]*rho0_c, Kp0[i-1]*(1-rho0_c))))
            f0 += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i0,i], Kp0[i-1]*rho0[i-1], Kp0[i-1]*(1-rho0[i-1]))))

            if numpy.log(scipy.stats.uniform.rvs()) < fc - f0:
                rho0[i] = rho0_c
                acc_p0+=1
            else:
                rho0[i] = rho0[i-1]

        if Kp0_c <= 0:
            Kp0[i] = Kp0[i-1]
        else:
            # Same accept/reject scheme for Kp0, using the gamma density and
            # the rho0 value just stored for this iteration.
            fc = numpy.log(scipy.stats.gamma.pdf(Kp0_c, self.a0, self.b0));
            f0 = numpy.log(scipy.stats.gamma.pdf(Kp0[i-1], self.a0, self.b0));
            fc += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i0,i], Kp0_c*rho0[i], Kp0_c*(1-rho0[i]))))
            f0 += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i0,i], Kp0[i-1]*rho0[i], Kp0[i-1]*(1-rho0[i]))))

            if numpy.log(scipy.stats.uniform.rvs()) < fc - f0:
                Kp0[i] = Kp0_c
                acc_k0+=1
            else:
                Kp0[i] = Kp0[i-1]

        # Group-1 hyper-parameters follow the same pattern as group 0.
        rho1_c = rho1[i-1] + scipy.stats.norm.rvs(0, rho1c_std)
        Kp1_c = Kp1[i-1] + scipy.stats.norm.rvs(0, kp1c_std)

        if rho1_c <= 0:
            rho1[i] = rho1[i-1]
        else:
            fc = numpy.log(scipy.stats.beta.pdf(rho1_c, self.M1*self.pi1, self.M1*(1-self.pi1)))
            f1 = numpy.log(scipy.stats.beta.pdf(rho1[i-1], self.M1*self.pi1, self.M1*(1-self.pi1)))
            fc += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i1,i], Kp1[i-1]*rho1_c, Kp1[i-1]*(1-rho1_c))))
            f1 += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i1,i], Kp1[i-1]*rho1[i-1], Kp1[i-1]*(1-rho1[i-1]))))

            if numpy.log(scipy.stats.uniform.rvs()) < fc - f1:
                rho1[i] = rho1_c
                acc_p1+=1
            else:
                rho1[i] = rho1[i-1]

        if Kp1_c <= 0:
            Kp1[i] = Kp1[i-1]
        else:
            fc = numpy.log(scipy.stats.gamma.pdf(Kp1_c, self.a1, self.b1));
            f1 = numpy.log(scipy.stats.gamma.pdf(Kp1[i-1], self.a1, self.b1));
            fc += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i1,i], Kp1_c*rho1[i], Kp1_c*(1-rho1[i]))))
            f1 += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i1,i], Kp1[i-1]*rho1[i], Kp1[i-1]*(1-rho1[i]))))

            if numpy.log(scipy.stats.uniform.rvs()) < fc - f1:
                Kp1[i] = Kp1_c
                acc_k1+=1
            else:
                Kp1[i] = Kp1[i-1]

        # Weighted densities of each gene's theta under the two groups;
        # p1 is the normalized weight of group 1.
        g0 = scipy.stats.beta.pdf(theta[:,i], Kp0[i]*rho0[i], Kp0[i]*(1-rho0[i])) * (1-w1)
        g1 = scipy.stats.beta.pdf(theta[:,i], Kp1[i]*rho1[i], Kp1[i]*(1-rho1[i])) * (w1)
        p1 = g1/(g0+g1)
        p1 = numpy.nan_to_num(p1)

        try:
            Z[:,i] = scipy.stats.bernoulli.rvs(p1)
        except:
            # NOTE(review): p1 has already been passed through nan_to_num, so
            # this NaN mask is presumably always empty - confirm intent.
            # NOTE(review): bare except, and sys.exit() is called with no
            # status argument.
            inan = numpy.isnan(p1)
            print >> sys.stderr, "K=\t", K[inan]
            print >> sys.stderr, "N=\t", N[inan]
            print >> sys.stderr, "theta=", theta[inan,i]
            sys.exit()
        pz1[i] = p1[0]

        # Redraw the mixing weight from the number of genes currently in group 1.
        i1 = Z[:,i] == 1; n1 = numpy.sum(i1);
        #w1 = 0.15
        w1 = scipy.stats.beta.rvs(self.alpha_w + n1, self.beta_w + Ngenes - n1)
        W1[i] = w1

        #Update progress
        text = "Running Binomial Method... %5.1f%%" % (100.0*(i+1)/(sample_size))
        self.progress_update(text, i)

    numpy.seterr(divide='warn')

    # Per-gene means over the post-burn-in iterations.
    z_bar = numpy.apply_along_axis(numpy.mean, 1, Z[:, self.burnin:])
    theta_bar = numpy.apply_along_axis(numpy.mean, 1, theta[:, self.burnin:])
    #(ess_threshold, noness_threshold) = stat_tools.fdr_post_prob(z_bar)
    (ess_threshold, noness_threshold) = stat_tools.bayesian_ess_thresholds(z_bar)

    self.output.write("#Binomial\n")
    #output.write("#Command: %s\n" % " ".join(["%s=%s" %(key,val) for (key,val) in kwargs.items()]))
    if self.wxobj:
        # NOTE(review): memberstr is built but not used afterwards.
        members = sorted([attr for attr in dir(self) if not callable(getattr(self,attr)) and not attr.startswith("__")])
        memberstr = ""
        for m in members:
            memberstr += "%s = %s, " % (m, getattr(self, m))
        self.output.write("#GUI with: ctrldata=%s, annotation=%s, output=%s, samples=%s, burnin=%s\n" % (",".join(self.ctrldata).encode('utf-8'), self.annotation_path.encode('utf-8'), self.output.name.encode('utf-8'), self.samples, self.burnin))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))

    self.output.write("#Thresholds: (%1.5f, %1.5f)\n" % (ess_threshold,noness_threshold))
    self.output.write("#rho0 Acceptance Rate:\t%f%%\n" % ((100.0*acc_p0)/sample_size))
    self.output.write("#Kp0 Acceptance Rate:\t%f%%\n" % ((100.0*acc_k0)/sample_size))
    self.output.write("#rho1 Acceptance Rate:\t%f%%\n" % ((100.0*acc_p1)/sample_size))
    self.output.write("#Kp1 Acceptance Rate:\t%f%%\n" % ((100.0*acc_k1)/sample_size))
    self.output.write("#Hyperparameters rho: \t%1.2f\t%3.1f\t%1.2f\t%3.1f\n" % (self.pi0, self.M0, self.pi1, self.M1))
    self.output.write("#Hyperparameters Kp: \t%3.1f\t%3.1f\t%3.1f\t%3.1f\n" % (self.a0, self.b0, self.a1, self.b1))
    self.output.write("#Hyperparameters W: \t%1.3f\t%1.3f\n" % (self.alpha_w, self.beta_w))

    self.output.write("#%s\n" % "\t".join(columns))

    # From here on ``data`` holds the formatted output rows, not the counts.
    data = []
    for g,gene in enumerate(G):
        # Default label; the second test runs after the first, so it wins if
        # both conditions hold.
        c = "Uncertain"
        if z_bar[g] > ess_threshold:
            c = "Essential"
        if z_bar[g] < noness_threshold:
            c = "Non-Essential"
        data.append("%s\t%s\t%s\t%1.1f\t%d\t%d\t%d\t%f\t%f\t%s" % (gene.orf, gene.name, gene.desc, K[g]/float(numReps), N[g]/numReps, K[g], N[g], theta_bar[g], z_bar[g], c))

    # Rows are strings beginning with gene.orf, so this sorts by ORF id.
    data.sort()
    for row in data:
        self.output.write("%s\n" % row)
    self.output.close()

    self.transit_message("") # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Binomial")
    self.finish()
    self.transit_message("Finished Binomial Method")
def Run(self):
    """Run the Example method: report mean and non-zero mean reads per gene.

    For each gene the mean of ``gene.reads`` (0.0 when ``gene.n == 0``) and
    the sum of reads divided by ``gene.k`` (0.0 when ``gene.k == 0``) are
    formatted into a tab-separated row.  Rows are sorted and written to
    ``self.output`` after a ``#``-prefixed header.
    """
    self.transit_message("Starting Example Method")
    start_time = time.time()

    # Load the read counts for the control datasets.
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
    (num_rows, num_cols) = data.shape

    wants_norm = self.normalization and self.normalization != "nonorm"
    if wants_norm:
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata, self.annotation_path)

    G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, minread=1, reps=self.replicates, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data, position=position)

    rows = []
    total = len(G)
    self.progress_range(total)
    for done, gene in enumerate(G, 1):
        mean = 0.0 if gene.n == 0 else numpy.mean(gene.reads)
        nzmean = 0.0 if gene.k == 0 else numpy.sum(gene.reads)/float(gene.k)
        rows.append("%s\t%s\t%s\t%s\t%s\t%1.2f\t%1.2f\n" % (gene.orf, gene.name, gene.desc, gene.k, gene.n, mean, nzmean))

        # Report progress after every gene.
        self.progress_update("Running Example Method... %5.1f%%" % (100.0*done/total), done)

    out = self.output
    out.write("#Example\n")
    if self.wxobj:
        members = sorted([attr for attr in dir(self) if not callable(getattr(self,attr)) and not attr.startswith("__")])
        memberstr = "".join(["%s = %s, " % (m, getattr(self, m)) for m in members])
        gui_fields = (",".join(self.ctrldata).encode('utf-8'), self.annotation_path.encode('utf-8'), self.output.name.encode('utf-8'))
        out.write("#GUI with: ctrldata=%s, annotation=%s, output=%s\n" % gui_fields)
    else:
        out.write("#Console: python %s\n" % " ".join(sys.argv))

    out.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
    out.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8'))
    out.write("#Time: %s\n" % (time.time() - start_time))
    out.write("#%s\n" % "\t".join(columns))

    # Rows begin with gene.orf, so sorting orders them by ORF id.
    for line in sorted(rows):
        out.write(line)
    out.close()

    self.transit_message("") # Printing empty line to flush stdout
    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="Example")
    self.finish()
    self.transit_message("Finished Example Method")
def _write_samples(path, values):
    """Write one value per line to *path*, closing the file when done."""
    with open(path, "w") as out:
        for x in values:
            out.write("%s\n" % x)


def main(args, kwargs, quite=False, jumble=False):
    """Run the genetic-interaction analysis over four groups of wig files.

    Parameters:
        args: positional arguments; not used by this function.
        kwargs: option dictionary.  Required keys: ``a1``, ``a2``, ``b1``,
            ``b2`` (comma-separated wig paths for strain A/B at time 1/2) and
            ``pt`` (annotation path).  Optional: ``rope`` (default 0.5),
            ``s`` (number of posterior samples, default 100000), ``n``
            (normalization, default "TTR"), ``l`` (label for debug files),
            ``debug`` (comma-separated ORF ids to restrict to and dump),
            ``-nz``, ``-bfdr``, ``-fwer`` (flags).
        quite: when true, the descriptive header is not printed.
        jumble: when true, the normalized counts are shuffled in place.

    Returns:
        ``(data, adjusted_prob, adjusted_label)`` where ``data`` is a sorted
        list of per-gene result tuples, ``adjusted_prob`` the sorted BFDR or
        FWER values, and ``adjusted_label`` is "BFDR" or "FWER".

    If a required key is missing, an error is reported for each one, usage is
    shown and the process exits.

    Genes with ``gene.n == 0`` have no posterior samples; they are reported
    with zero means, zero log-fold-changes, ``probROPE = 1.0`` and a False
    HDI flag.
    """
    missing = [key for key in ("a1", "a2", "b1", "b2", "pt")
               if key not in kwargs]
    for key in missing:
        error("Missing -%s argument" % key)
    if missing:
        usage()
        sys.exit()

    A_1list = kwargs["a1"].split(",")
    A_2list = kwargs["a2"].split(",")
    B_1list = kwargs["b1"].split(",")
    B_2list = kwargs["b2"].split(",")

    annotation = kwargs["pt"]
    rope = float(kwargs.get("rope", 0.5))
    S = int(kwargs.get("s", 100000))
    norm_method = kwargs.get("n", "TTR")
    label = kwargs.get("l", "debug")
    # NOTE(review): these three keys carry a leading dash while every other
    # key does not - confirm against the argument parser.
    onlyNZ = kwargs.get("-nz", False)
    doBFDR = kwargs.get("-bfdr", False)
    doFWER = kwargs.get("-fwer", False)
    DEBUG = []
    if "debug" in kwargs:
        DEBUG = kwargs["debug"].split(",")

    # Rows of ``data`` follow this order, so each group size must be the
    # length of its own list (previously Nb1 and Na2 used the wrong lists).
    wiglist = A_1list + B_1list + A_2list + B_2list
    Na1 = len(A_1list)
    Nb1 = len(B_1list)
    Na2 = len(A_2list)

    (data, position) = tnseq_tools.get_data(wiglist)

    ######### FILTER EMTPY SITES #########
    if onlyNZ:
        ii_good = numpy.sum(data, 0) > 0
        data = data[:, ii_good]
        position = position[ii_good]
    ######################################

    # NOTE(review): the annotation argument passed here is sys.argv[1], not
    # ``annotation`` - confirm this is intended.
    (data, factors) = norm_tools.normalize_data(data, norm_method, wiglist,
                                                sys.argv[1])

    if jumble:
        numpy.random.shuffle(data.flat)
        numpy.random.shuffle(data.flat)

    end_a1 = Na1
    end_b1 = end_a1 + Nb1
    end_a2 = end_b1 + Na2
    G_A1 = tnseq_tools.Genes([], annotation, data=data[:end_a1],
                             position=position)
    G_B1 = tnseq_tools.Genes([], annotation, data=data[end_a1:end_b1],
                             position=position)
    G_A2 = tnseq_tools.Genes([], annotation, data=data[end_b1:end_a2],
                             position=position)
    G_B2 = tnseq_tools.Genes([], annotation, data=data[end_a2:],
                             position=position)

    means_list_a1 = []
    means_list_b1 = []
    means_list_a2 = []
    means_list_b2 = []

    var_list_a1 = []
    var_list_a2 = []
    var_list_b1 = []
    var_list_b2 = []

    # Base priors on empirical observations accross genes.
    for gene in sorted(G_A1):
        if gene.n > 1:
            A1_data = G_A1[gene.orf].reads.flatten()
            B1_data = G_B1[gene.orf].reads.flatten()
            A2_data = G_A2[gene.orf].reads.flatten()
            B2_data = G_B2[gene.orf].reads.flatten()

            means_list_a1.append(numpy.mean(A1_data))
            var_list_a1.append(numpy.var(A1_data))

            means_list_b1.append(numpy.mean(B1_data))
            var_list_b1.append(numpy.var(B1_data))

            means_list_a2.append(numpy.mean(A2_data))
            var_list_a2.append(numpy.var(A2_data))

            means_list_b2.append(numpy.mean(B2_data))
            var_list_b2.append(numpy.var(B2_data))

    # Priors
    mu0_A1 = scipy.stats.trim_mean(means_list_a1, 0.01)
    mu0_B1 = scipy.stats.trim_mean(means_list_b1, 0.01)
    mu0_A2 = scipy.stats.trim_mean(means_list_a2, 0.01)
    mu0_B2 = scipy.stats.trim_mean(means_list_b2, 0.01)

    s20_A1 = scipy.stats.trim_mean(var_list_a1, 0.01)
    s20_B1 = scipy.stats.trim_mean(var_list_b1, 0.01)
    s20_A2 = scipy.stats.trim_mean(var_list_a2, 0.01)
    s20_B2 = scipy.stats.trim_mean(var_list_b2, 0.01)

    k0 = 1.0
    nu0 = 1.0

    data = []
    postprob = []

    if not quite:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("# Created with '%s'. Copyright 2016-2017. Michael A. DeJesus & Thomas R. Ioerger" % (
            sys.argv[0]))
        print("# Version %1.2f; http://saclab.tamu.edu/essentiality/GI" % __version__)
        print("#")
        print("# python %s" % " ".join(sys.argv))
        print("# Samples = %d, k0=%1.1f, nu0=%1.1f" % (S, k0, nu0))
        print("# Mean Prior: Variance Prior:")
        print("# mu0_A1 = %1.2f s20_A1 = %1.1f" % (mu0_A1, s20_A1))
        print("# mu0_B1 = %1.2f s20_B1 = %1.1f" % (mu0_B1, s20_B1))
        print("# mu0_A2 = %1.2f s20_A2 = %1.1f" % (mu0_A2, s20_A2))
        print("# mu0_B2 = %1.2f s20_B2 = %1.1f" % (mu0_B2, s20_B2))
        print("# ROPE: %s" % (rope,))
        print("# TTR Factors: %s" % ", ".join(
            ["%1.4f" % x for x in numpy.array(factors).flatten()]))

    alpha = 0.05
    for gene in sorted(G_A1):

        if len(DEBUG) > 0:
            if gene.orf not in DEBUG:
                continue

        if gene.n > 0:
            A1_data = G_A1[gene.orf].reads.flatten()
            B1_data = G_B1[gene.orf].reads.flatten()
            A2_data = G_A2[gene.orf].reads.flatten()
            B2_data = G_B2[gene.orf].reads.flatten()

            #            Time-1   Time-2
            #
            #  Strain-A     A       C
            #
            #  Strain-B     B       D

            try:
                muA1_post, varA1_post = sample_post(A1_data, S, mu0_A1,
                                                    s20_A1, k0, nu0)
                muB1_post, varB1_post = sample_post(B1_data, S, mu0_B1,
                                                    s20_B1, k0, nu0)
                muA2_post, varA2_post = sample_post(A2_data, S, mu0_A2,
                                                    s20_A2, k0, nu0)
                muB2_post, varB2_post = sample_post(B2_data, S, mu0_B2,
                                                    s20_B2, k0, nu0)
            except Exception:
                # Deliberate best-effort fallback kept from the original:
                # all-ones posteriors give log-fold-changes of zero.
                muA1_post = varA1_post = numpy.ones(S)
                muB1_post = varB1_post = numpy.ones(S)
                muA2_post = varA2_post = numpy.ones(S)
                muB2_post = varB2_post = numpy.ones(S)

            logFC_A_post = numpy.log2(muA2_post / muA1_post)
            logFC_B_post = numpy.log2(muB2_post / muB1_post)
            delta_logFC_post = logFC_B_post - logFC_A_post

            # Get Bounds of the HDI
            l_logFC_A, u_logFC_A = HDI_from_MCMC(logFC_A_post, 1 - alpha)
            l_logFC_B, u_logFC_B = HDI_from_MCMC(logFC_B_post, 1 - alpha)
            l_delta_logFC, u_delta_logFC = HDI_from_MCMC(
                delta_logFC_post, 1 - alpha)

            mean_logFC_A = numpy.mean(logFC_A_post)
            mean_logFC_B = numpy.mean(logFC_B_post)
            mean_delta_logFC = numpy.mean(delta_logFC_post)

            # Is HDI significantly different than ROPE?
            not_HDI_overlap_bit = (l_delta_logFC > rope
                                   or u_delta_logFC < -rope)

            # Probability of posterior overlaping with ROPE
            probROPE = numpy.mean(
                numpy.logical_and(delta_logFC_post >= 0.0 - rope,
                                  delta_logFC_post <= 0.0 + rope))

        else:
            A1_data = [0, 0]
            B1_data = [0, 0]
            A2_data = [0, 0]
            B2_data = [0, 0]

            # No sites: bind every per-gene value explicitly so nothing is
            # left unbound or carried over from the previous gene.
            no_samples = numpy.zeros(1)
            muA1_post = muA2_post = muB1_post = muB2_post = no_samples
            logFC_A_post = logFC_B_post = delta_logFC_post = no_samples

            mean_logFC_A = 0
            mean_logFC_B = 0
            mean_delta_logFC = 0
            l_logFC_A = 0
            u_logFC_A = 0
            l_logFC_B = 0
            u_logFC_B = 0
            l_delta_logFC = 0
            u_delta_logFC = 0
            probROPE = 1.0
            not_HDI_overlap_bit = False

        # NaN bounds are replaced by a wide fixed interval.
        if numpy.isnan(l_logFC_A):
            l_logFC_A = -10
            u_logFC_A = 10
        if numpy.isnan(l_logFC_B):
            l_logFC_B = -10
            u_logFC_B = 10
        if numpy.isnan(l_delta_logFC):
            l_delta_logFC = -10
            u_delta_logFC = 10

        if DEBUG:
            prefix = "%s.%s" % (label, gene.orf)
            _write_samples(prefix + "_muA1_post", muA1_post)
            _write_samples(prefix + "_muA2_post", muA2_post)
            _write_samples(prefix + "_logFC_A_post", logFC_A_post)
            _write_samples(prefix + "_muB1_post", muB1_post)
            _write_samples(prefix + "_muB2_post", muB2_post)
            # Previously this file was filled with logFC_A_post by mistake.
            _write_samples(prefix + "_logFC_B_post", logFC_B_post)
            _write_samples(prefix + "_delta_logFC_post", delta_logFC_post)

        postprob.append(probROPE)
        data.append((gene.orf, gene.name, gene.n, numpy.mean(muA1_post),
                     numpy.mean(muA2_post), numpy.mean(muB1_post),
                     numpy.mean(muB2_post), mean_logFC_A, mean_logFC_B,
                     mean_delta_logFC, l_delta_logFC, u_delta_logFC,
                     probROPE, not_HDI_overlap_bit))

    if doBFDR or not doFWER:
        postprob = numpy.array(postprob)
        postprob.sort()
        bfdr = numpy.cumsum(postprob) / numpy.arange(1, len(postprob) + 1)
        adjusted_prob = bfdr
        adjusted_label = "BFDR"
        if doBFDR:
            # Sort by probROPE so rows line up with the sorted BFDR values.
            data.sort(key=lambda x: x[-2])
        else:
            # Default: rows whose HDI lies outside the ROPE come first.
            data.sort(key=lambda x: x[-1], reverse=True)
    elif doFWER:
        fwer = FWER_Bayes(postprob)
        fwer.sort()
        adjusted_prob = fwer
        adjusted_label = "FWER"
        data.sort(key=lambda x: x[-2])

    return (data, adjusted_prob, adjusted_label)
def Run(self):
    """Run the rank-product method comparing experimental vs. control data.

    Loads control and experimental wigs together, optionally normalizes them,
    and computes per-replicate mean counts for every gene.  The per-replicate
    log2 fold-changes are ranked, the ranks multiplied across replicates and
    raised to ``1/Kctrl`` to give the observed rank product.  ``self.samples``
    random rank permutations provide the null distribution, from which an
    expected count of better permutations (``e_val``) and ``q_paper =
    e_val / rank`` are derived.  One tab-separated row per gene is written to
    ``self.output``.

    NOTE(review): ``meanExp / meanCtrl`` is an element-wise division of a
    (Kexp, genes) by a (Kctrl, genes) array, so this presumably requires the
    same number of control and experimental replicates - confirm.
    """
    self.transit_message("Starting rankproduct Method")
    start_time = time.time()

    Kctrl = len(self.ctrldata)
    Kexp = len(self.expdata)

    # Get orf data
    self.transit_message("Getting Data")
    (data, position) = transit_tools.get_validated_data(
        self.ctrldata + self.expdata, wxobj=self.wxobj)

    # Accept both spellings of "no normalization": this method historically
    # tested "none", its sibling methods test for empty / "nonorm".
    if self.normalization and self.normalization not in ("none", "nonorm"):
        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(
            data, self.normalization, self.ctrldata + self.expdata,
            self.annotation_path)

    Gctrl = tnseq_tools.Genes(self.ctrldata + self.expdata,
                              self.annotation_path,
                              ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus,
                              cterm=self.CTerminus,
                              data=data[:Kctrl, :],
                              position=position)
    Gexp = tnseq_tools.Genes(self.ctrldata + self.expdata,
                             self.annotation_path,
                             ignoreCodon=self.ignoreCodon,
                             nterm=self.NTerminus,
                             cterm=self.CTerminus,
                             data=data[Kctrl:, :],
                             position=position)

    Ngenes = len(Gctrl)

    # Get the average counts for all the genes, in each replicate.
    # Columns stay zero for genes without any non-zero read.
    meanCtrl = numpy.zeros((Kctrl, Ngenes))
    meanExp = numpy.zeros((Kexp, Ngenes))
    for i in range(Ngenes):
        if numpy.any(Gctrl[i].reads):
            meanCtrl[:, i] = numpy.mean(Gctrl[i].reads, 1)
        if numpy.any(Gexp[i].reads):
            meanExp[:, i] = numpy.mean(Gexp[i].reads, 1)

    # Calculate a logFC2 between Experimental and Control
    # Then calculates it's rank, and observed rankProduct
    logFC2 = numpy.log2((meanExp + 0.0001) / (meanCtrl + 0.0001))
    rank = numpy.array([scipy.stats.rankdata(Lvec) for Lvec in logFC2])
    obsRP = numpy.power(numpy.prod(rank, 0), 1.0 / Kctrl)

    # Null distribution: rank products of independently permuted ranks.
    # numpy.array replaces the deprecated ``scipy.array`` alias.
    permutations = numpy.zeros((self.samples, Ngenes))
    tempranks = numpy.array(
        [numpy.arange(1, Ngenes + 1) for rep in range(Kctrl)])
    for s in range(self.samples):
        rankperm = numpy.array(
            [numpy.random.permutation(tr) for tr in tempranks])
        permutations[s] = numpy.power(numpy.prod(rankperm, 0), 1.0 / Kctrl)

    # 1-based rank of each gene's observed rank product.  A single argsort
    # yields the sorting permutation, not the ranks; argsort of argsort
    # converts it to each element's position in sorted order.
    rankRP = numpy.argsort(numpy.argsort(obsRP)) + 1

    data = []
    self.progress_range(Ngenes)
    for i, gene in enumerate(Gctrl):
        count = i + 1
        meanctrl = numpy.mean(Gctrl[i].reads)
        meanexp = numpy.mean(Gexp[i].reads)
        log2fc = numpy.log2((meanexp + 0.0001) / (meanctrl + 0.0001))
        countbetter = numpy.sum(permutations <= obsRP[i])

        pval = countbetter / float(self.samples * Ngenes)
        e_val = countbetter / float(self.samples)
        q_paper = e_val / float(rankRP[i])

        data.append([
            gene.orf, gene.name, gene.desc, gene.n, meanctrl, meanexp,
            log2fc, obsRP[i], e_val, q_paper, pval
        ])

        # Update Progress
        text = "Running rankproduct Method... %5.1f%%" % (100.0 * count /
                                                          Ngenes)
        self.progress_update(text, count)

    #
    self.transit_message("")  # Printing empty line to flush stdout
    self.transit_message("Performing Benjamini-Hochberg Correction")
    data.sort()
    # NOTE(review): q_bh is computed but not written; the row format below
    # has no column for it.
    q_bh = stat_tools.BH_fdr_correction([row[-1] for row in data])

    self.output.write("#RankProduct\n")
    if self.wxobj:
        self.output.write(
            "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
            (",".join(self.ctrldata).encode('utf-8'),
             self.annotation_path.encode('utf-8'),
             self.output.name.encode('utf-8')))
    else:
        self.output.write("#Console: python %s\n" % " ".join(sys.argv))
    self.output.write("#Data: %s\n" %
                      (",".join(self.ctrldata).encode('utf-8')))
    self.output.write("#Annotation path: %s\n" %
                      self.annotation_path.encode('utf-8'))
    self.output.write("#Time: %s\n" % (time.time() - start_time))
    # Tab-join the column names like the sibling methods do; formatting the
    # sequence itself with %s would write its repr.
    header = columns if isinstance(columns, str) else "\t".join(columns)
    self.output.write("#%s\n" % header)

    for row in data:
        (orf, name, desc, n, mean1, mean2, log2FCgene, obsRPgene, e_val,
         q_paper, pval) = row
        self.output.write(
            "%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.8f\t%1.1f\t%1.8f\n" %
            (orf, name, desc, n, mean1, mean2, log2FCgene, obsRPgene, e_val,
             q_paper))
    self.output.close()

    self.transit_message("Adding File: %s" % (self.output.name))
    self.add_file(filetype="RankProduct")
    self.finish()
    self.transit_message("Finished rankproduct Method")