def Run(self): self.transit_message("Starting Normalization") start_time = time.time() infile = self.infile outputPath = self.outfile # output file exists, should I require -overwrite flag? # determine ref genome from first; assume they are all the same; assume wigs have 2 header lines line2 = "variableStep chrom=" # unknown for line in open(infile): if line.startswith("variableStep"): line2 = line.rstrip(); break if self.combined_wig==True: (sites,data,files) = tnseq_tools.read_combined_wig(self.ctrldata[0]) else: (data, sites) = tnseq_tools.get_data(self.ctrldata) (data,factors) = norm_tools.normalize_data(data,self.normalization) print "writing",outputPath file = open(outputPath,"w") file.write("# %s normalization of %s\n" % (self.normalization,infile)) if self.combined_wig==True: for f in files: file.write("#File: %s\n" % f) for i in range(len(sites)): file.write('\t'.join([str(sites[i])]+["%0.1f" % x for x in list(data[...,i])])+"\n") else: file.write(line2+"\n") for j in range(len(sites)): file.write("%s %s\n" % (sites[j],int(data[0,j]))) file.close() self.finish() self.transit_message("Finished Normalization")
def Run(self): self.transit_message("Starting Anova analysis") start_time = time.time() self.transit_message("Getting Data") (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig) self.transit_message("Normalizing using: %s" % self.normalization) (data, factors) = norm_tools.normalize_data(data, self.normalization) conditionsByFile, _, _, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata) conditions = self.wigs_to_conditions( conditionsByFile, filenamesInCombWig) conditionsList = self.select_conditions(conditions,self.included_conditions,self.ignored_conditions,orderingMetadata) data, conditions, _, _ = self.filter_wigs_by_conditions2(data, conditions, conditionsList) genes = tnseq_tools.read_genes(self.annotation_path) TASiteindexMap = {TA: i for i, TA in enumerate(sites)} RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus) MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions) self.transit_message("Running Anova") pvals,qvals,run_status = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap, conditions) self.transit_message("Adding File: %s" % (self.output)) file = open(self.output,"w") heads = ("Rv Gene TAs".split() + ["Mean_%s" % x for x in conditionsList] + ["LFC_%s" % x for x in conditionsList] + "pval padj".split() + ["status"]) file.write("#Console: python3 %s\n" % " ".join(sys.argv)) file.write("#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n" % (self.normalization,self.NTerminus,self.CTerminus,self.PC)) file.write('#'+'\t'.join(heads)+EOL) for gene in genes: Rv = gene["rv"] if Rv in MeansByRv: means = [MeansByRv[Rv][c] for c in conditionsList] LFCs = self.calcLFCs(means,self.PC) vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] + ["%0.2f" % x for x in means] + ["%0.3f" % x for x in LFCs] + ["%f" % x for x in [pvals[Rv], qvals[Rv]]] + [run_status[Rv]]) file.write('\t'.join(vals)+EOL) file.close() self.transit_message("Finished Anova analysis") self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
def Run(self): self.transit_message("Starting TnseqStats") start_time = time.time() datasets = self.wigs if self.combined_wig == None: (data, sites) = tnseq_tools.get_data(self.wigs) else: (sites, data, datasets) = tnseq_tools.read_combined_wig(self.combined_wig) # write table of stats (saturation,NZmean) file = sys.stdout if self.outfile != None: file = open(self.outfile, "w") PTI = True if PTI == True: file.write( "dataset\tdensity\tmean_ct\tNZmean\tNZmedian\tmax_ct\ttotal_cts\tskewness\tkurtosis\tpickands_tail_index\n" ) else: file.write( "dataset\tdensity\tmean_ct\tNZmean\tNZmedian\tmax_ct\ttotal_cts\tskewness\tkurtosis\n" ) for i in range(data.shape[0]): density, meanrd, nzmeanrd, nzmedianrd, maxrd, totalrd, skew, kurtosis = tnseq_tools.get_data_stats( data[i, :]) nzmedianrd = int(nzmedianrd) if numpy.isnan( nzmedianrd) == False else 0 pti = self.pickands_tail_index(data[i, :]) vals = [ datasets[i], "%0.3f" % density, "%0.1f" % meanrd, "%0.1f" % nzmeanrd, "%d" % nzmedianrd, maxrd, int(totalrd), "%0.1f" % skew, "%0.1f" % kurtosis ] if PTI == True: vals.append("%0.3f" % pti) file.write('\t'.join([str(x) for x in vals]) + '\n') if self.outfile != None: file.close() self.finish() self.transit_message("Finished TnseqStats")
def Run(self): self.transit_message("Starting Anova analysis") start_time = time.time() self.transit_message("Getting Data") (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig) self.transit_message("Normalizing using: %s" % self.normalization) (data, factors) = norm_tools.normalize_data(data, self.normalization) conditions = self.wigs_to_conditions( self.read_samples_metadata(self.metadata), filenamesInCombWig) data, conditions = self.filter_by_conditions_blacklist( data, conditions, self.ignored_conditions) genes = tnseq_tools.read_genes(self.annotation_path) TASiteindexMap = {TA: i for i, TA in enumerate(sites)} RvSiteindexesMap = tnseq_tools.rv_siteindexes_map( genes, TASiteindexMap) MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions) self.transit_message("Running Anova") pvals, qvals = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap, conditions) self.transit_message("Adding File: %s" % (self.output)) file = open(self.output, "w") conditionsList = list(set(conditions)) vals = "Rv Gene TAs".split() + conditionsList + "pval padj".split() file.write('\t'.join(vals) + EOL) for gene in genes: Rv = gene["rv"] if Rv in MeansByRv: vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] + ["%0.1f" % MeansByRv[Rv][c] for c in conditionsList] + ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) file.write('\t'.join(vals) + EOL) file.close() self.transit_message("Finished Anova analysis")
def Run(self):
    #if not self.wxobj:
    #    # Force matplotlib to use good backend for png.
    #    import matplotlib.pyplot as plt
    #elif "matplotlib.pyplot" not in sys.modules:
    try:
        import matplotlib.pyplot as plt
    except:
        print("Error: cannot do histograms")
        self.doHistogram = False

    self.transit_message("Starting resampling Method")
    start_time = time.time()

    histPath = ""
    if self.doHistogram:
        histPath = os.path.join(os.path.dirname(self.output.name), transit_tools.fetch_name(self.output.name) + "_histograms")
        if not os.path.isdir(histPath):
            os.makedirs(histPath)

    # Get orf data
    self.transit_message("Getting Data")
    if self.diffStrains:
        self.transit_message("Multiple annotation files found")
        self.transit_message("Mapping ctrl data to {0}, exp data to {1}".format(self.annotation_path, self.annotation_path_exp))

    if self.combinedWigParams:
        (position, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combinedWigParams['combined_wig'])
        conditionsByFile, _, _, _ = tnseq_tools.read_samples_metadata(self.combinedWigParams['samples_metadata'])
        conditions = self.wigs_to_conditions(conditionsByFile, filenamesInCombWig)
        data, conditions = self.filter_wigs_by_conditions(data, conditions, self.combinedWigParams['conditions'])
        data_ctrl = numpy.array([d for i, d in enumerate(data) if conditions[i].lower() == self.combinedWigParams['conditions'][0]])
        data_exp = numpy.array([d for i, d in enumerate(data) if conditions[i].lower() == self.combinedWigParams['conditions'][1]])
        position_ctrl, position_exp = position, position
    else:
        (data_ctrl, position_ctrl) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
        (data_exp, position_exp) = transit_tools.get_validated_data(self.expdata, wxobj=self.wxobj)
    (K_ctrl, N_ctrl) = data_ctrl.shape
    (K_exp, N_exp) = data_exp.shape

    if not self.diffStrains and (N_ctrl != N_exp):
        self.transit_error("Error: Ctrl and Exp wig files don't have the same number of sites.")
        self.transit_error("Make sure all .wig files come from the same strain.")
        return
    # (data, position) = transit_tools.get_validated_data(self.ctrldata+self.expdata, wxobj=self.wxobj)

    self.transit_message("Preprocessing Ctrl data...")
    data_ctrl = self.preprocess_data(position_ctrl, data_ctrl)

    self.transit_message("Preprocessing Exp data...")
    data_exp = self.preprocess_data(position_exp, data_exp)

    G_ctrl = tnseq_tools.Genes(self.ctrldata, self.annotation_path, ignoreCodon=self.ignoreCodon,
                               nterm=self.NTerminus, cterm=self.CTerminus, data=data_ctrl, position=position_ctrl)
    G_exp = tnseq_tools.Genes(self.expdata, self.annotation_path_exp, ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus, cterm=self.CTerminus, data=data_exp, position=position_exp)

    doLibraryResampling = False
    # If library string not empty
    if self.ctrl_lib_str or self.exp_lib_str:
        letters_ctrl = set(self.ctrl_lib_str)
        letters_exp = set(self.exp_lib_str)

        # Check if using exactly 1 letter; i.e. no different libraries
        if len(letters_ctrl) == 1 and len(letters_exp) == 1:
            pass
        # If using more than one letter, then check for differences between the sets
        else:
            lib_diff = letters_ctrl ^ letters_exp
            # Check their differences
            if not lib_diff:
                doLibraryResampling = True
            else:
                transit_tools.transit_error("Error: Library Strings (Ctrl = %s, Exp = %s) do not use the same letters. Make sure every letter / library is represented in both Control and Experimental Conditions. Proceeding with resampling assuming all datasets belong to the same library." % (self.ctrl_lib_str, self.exp_lib_str))
                self.ctrl_lib_str = ""
                self.exp_lib_str = ""

    (data, qval) = self.run_resampling(G_ctrl, G_exp, doLibraryResampling, histPath)
    self.write_output(data, qval, start_time)

    self.finish()
    self.transit_message("Finished resampling Method")
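# Illustrative sketch (not run_resampling itself): conceptually, resampling tests each gene by
# comparing the observed difference in mean insertion counts between control and experimental
# datasets against a null distribution built by permuting the pooled counts between the two
# groups. The helper below is a hypothetical, simplified single-gene version of that idea.
import numpy

def permutation_test_for_gene(ctrl_counts, exp_counts, n_perm=10000, rng=None):
    """ctrl_counts, exp_counts: 1-D arrays of counts at the gene's TA sites (replicates pooled).
    Returns (observed difference of means, two-sided permutation p-value)."""
    rng = rng or numpy.random.default_rng()
    ctrl = numpy.asarray(ctrl_counts, dtype=float)
    exp = numpy.asarray(exp_counts, dtype=float)
    obs = exp.mean() - ctrl.mean()
    pooled = numpy.concatenate([ctrl, exp])
    count = 0
    for _ in range(n_perm):
        rng.shuffle(pooled)                                  # relabel counts at random
        delta = pooled[len(ctrl):].mean() - pooled[:len(ctrl)].mean()
        if abs(delta) >= abs(obs):
            count += 1
    return obs, (count + 1) / (n_perm + 1)                   # add-one to avoid p = 0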
def Run(self): self.transit_message("Starting ZINB analysis") start_time = time.time() packnames = ("MASS", "pscl") r_packages_needed = [ x for x in packnames if not rpackages.isinstalled(x) ] if (len(r_packages_needed) > 0): self.transit_error( "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))" % ({ '0': '"{0}"'.format('", "'.join(r_packages_needed)) })) sys.exit(1) self.transit_message("Getting Data") (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig) self.transit_message("Normalizing using: %s" % self.normalization) (data, factors) = norm_tools.normalize_data(data, self.normalization) condition_name = self.condition # if a covar is not found, this crashes; check for it? conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata( self.metadata, self.covars, self.interactions, condition_name=condition_name) ## [Condition] in the order of files in combined wig conditions = self.wigs_to_conditions(conditionsByFile, filenamesInCombWig) ## [Covariate] in the order of files in combined wig covariates = self.wigs_to_covariates(covariatesByFileList, filenamesInCombWig) ## [Interaction] in the order of files in combined wig interactions = self.wigs_to_interactions(interactionsByFileList, filenamesInCombWig) conditionsList = self.select_conditions(conditions, self.included_conditions, self.ignored_conditions, orderingMetadata) data, conditions, covariates, interactions = self.filter_wigs_by_conditions2( data, conditions, conditionsList, covariates=covariates, interactions=interactions) # show the samples associated with each condition (and covariates or interactions, if defined), and count samples in each cross-product of vars filesByCondition = self.invertDict(conditionsByFile) samples_used = set() for cond in conditionsList: samples_used.update(filesByCondition[cond]) vars = [condition_name] + self.covars + self.interactions vars2vals = {} vars2vals[condition_name] = list(set(conditions)) for i, var in enumerate(self.covars): vars2vals[var] = list(set(covariates[i])) for i, var in enumerate(self.interactions): vars2vals[var] = list(set(interactions[i])) varsByFileList = [conditionsByFile ] + covariatesByFileList + interactionsByFileList for i, var in enumerate(vars): print("\nCondition/Covariate/Interaction: %s" % vars[i]) filesByVar = self.invertDict(varsByFileList[i]) for k, v in filesByVar.items(): samples = list(samples_used.intersection(set(v))) if k in vars2vals.get(var, []): print("%s: %s" % (k, ' '.join(samples))) pairs = [] print("\nsamples in cross-product:") any_empty = self.expandVar([], vars, varsByFileList, vars2vals, set(samples_used)) if any_empty: print( "warning: ZINB requires samples in all combinations of conditions; the fact that one is empty could result in Model Errors" ) genes = tnseq_tools.read_genes(self.annotation_path) TASiteindexMap = {TA: i for i, TA in enumerate(sites)} RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus) statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap, genes, conditions, interactions) LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data) self.transit_message("Running ZINB") pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap, conditions, covariates, interactions) def orderStats(x, y): ic1 = x.split(SEPARATOR) ic2 = y.split(SEPARATOR) c1, i1 = (ic1[0], 
ic1[1]) if len(ic1) > 1 else (ic1[0], None) c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None) if len(self.included_conditions) > 0: condDiff = (self.included_conditions.index(c1) - self.included_conditions.index(c2)) ## Order by interaction, if stat belongs to same condition if condDiff == 0 and i1 is not None and i2 is not None: return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2)) return condDiff ## Order by samples metadata, if include flag not provided. condDiff = (orderingMetadata['condition'].index(c1) - orderingMetadata['condition'].index(c2)) if condDiff == 0 and i1 is not None and i2 is not None: return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2)) return condDiff orderedStatGroupNames = sorted(statGroupNames, key=functools.cmp_to_key(orderStats)) headersStatGroupNames = [ x.replace(SEPARATOR, '_') for x in orderedStatGroupNames ] self.transit_message("Adding File: %s" % (self.output)) file = open(self.output, "w") if len(headersStatGroupNames) == 2: lfcNames = ["LFC"] else: lfcNames = list(map(lambda v: "LFC_" + v, headersStatGroupNames)) head = ("Rv Gene TAs".split() + list(map(lambda v: "Mean_" + v, headersStatGroupNames)) + lfcNames + list(map(lambda v: "NZmean_" + v, headersStatGroupNames)) + list(map(lambda v: "NZperc_" + v, headersStatGroupNames)) + "pval padj".split() + ["status"]) file.write("#Console: python3 %s\n" % " ".join(sys.argv)) file.write( "#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n" % (self.normalization, self.NTerminus, self.CTerminus, self.PC)) file.write('#' + '\t'.join(head) + EOL) for gene in genes: Rv = gene["rv"] means = [ statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames ] PC = self.PC if len(means) == 2: LFCs = [numpy.math.log((means[1] + PC) / (means[0] + PC), 2)] else: m = numpy.mean(means) LFCs = [numpy.math.log((x + PC) / (m + PC), 2) for x in means] vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] + [ "%0.1f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames ] + ["%0.3f" % x for x in LFCs] + [ "%0.1f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames ] + [ "%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames ] + ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]] file.write('\t'.join(vals) + EOL) file.close() self.transit_message("Finished Zinb analysis") self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
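# Illustrative sketch: NZMeanByRep and LogZPercByRep above appear to be per-replicate global
# statistics (the mean of non-zero counts, and the log of the fraction of empty TA sites) that
# run_zinb can use to adjust for differences in saturation between samples. The exact definitions
# in the library may differ; the helper below is only a minimal stand-in showing the idea.
import numpy

def global_stats_per_replicate(data):
    """data: (K replicates x N sites) matrix. Returns (logzperc, nzmean), one value per replicate."""
    nzmean = numpy.array([row[row > 0].mean() if (row > 0).any() else 0.0 for row in data])
    zperc = numpy.array([(row == 0).mean() for row in data])  # fraction of sites with no insertions
    logzperc = numpy.log(numpy.clip(zperc, 1e-6, None))       # clip to avoid log(0) for fully saturated samples
    return logzperc, nzmean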
def Run(self): self.transit_message("Starting Gene Mean Counts Export") start_time = time.time() #Get orf data self.transit_message("Getting Data") if self.combined_wig: (position, fulldata, datasets) = tnseq_tools.read_combined_wig(self.ctrldata[0]) else: (fulldata, position) = tnseq_tools.get_data(self.ctrldata) (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization, self.ctrldata, self.annotation_path) position = position.astype(int) hash = transit_tools.get_pos_hash(self.annotation_path) rv2info = transit_tools.get_gene_info(self.annotation_path) self.transit_message("Normalizing") self.output.write("#Summarized to Mean Gene Counts with TRANSIT.\n") if self.normalization != "nonorm": self.output.write("#Reads normalized using '%s'\n" % self.normalization) if type(factors[0]) == type(0.0): self.output.write( "#Normalization Factors: %s\n" % "\t".join(["%s" % f for f in factors.flatten()])) else: self.output.write("#Normalization Factors: %s\n" % " ".join( [",".join(["%s" % bx for bx in b]) for b in factors])) self.output.write("#Files:\n") names = datasets if self.combined_wig else self.ctrldata for f in names: self.output.write("#%s\n" % f) K, Nsites = fulldata.shape # Get Gene objects if self.combined_wig: G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization, data=fulldata, position=position) else: G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization) N = len(G) self.progress_range(N) if self.combined_wig: dataset_header = '\t'.join(datasets) else: dataset_header = "\t".join( [transit_tools.fetch_name(D) for D in self.ctrldata]) self.output.write("#Orf\tName\tNumber of TA sites\t%s\n" % dataset_header) for i, gene in enumerate(G): if gene.n > 0: data_str = "\t".join( ["%1.2f" % (M) for M in numpy.mean(gene.reads, 1)]) else: data_str = "\t".join(["%1.2f" % (Z) for Z in numpy.zeros(K)]) self.output.write("%s\t%s\t%s\t%s\n" % (gene.orf, gene.name, gene.n, data_str)) # Update progress text = "Running Export Method... %5.1f%%" % (100.0 * i / N) self.progress_update(text, i) self.output.close() self.transit_message("") # Printing empty line to flush stdout self.finish() self.transit_message("Finished Export")
def Run(self): self.transit_message("Starting ZINB analysis") start_time = time.time() packnames = ("MASS", "pscl") r_packages_needed = [x for x in packnames if not rpackages.isinstalled(x)] if (len(r_packages_needed) > 0): self.transit_error( "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))" % ({'0': '"{0}"'.format('", "'.join(r_packages_needed))})) sys.exit(1) self.transit_message("Getting Data") (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig) self.transit_message("Normalizing using: %s" % self.normalization) (data, factors) = norm_tools.normalize_data(data, self.normalization) condition_name = self.condition conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions, condition_name=condition_name) ## [Condition] in the order of files in combined wig conditions = self.wigs_to_conditions( conditionsByFile, filenamesInCombWig) ## [Covariate] in the order of files in combined wig covariates = self.wigs_to_covariates( covariatesByFileList, filenamesInCombWig) ## [Interaction] in the order of files in combined wig interactions = self.wigs_to_interactions( interactionsByFileList, filenamesInCombWig) data, conditions, covariates, interactions = self.filter_wigs_by_conditions( data, conditions, covariates = covariates, interactions = interactions, ignored_conditions = self.ignored_conditions, included_conditions = self.included_conditions) genes = tnseq_tools.read_genes(self.annotation_path) TASiteindexMap = {TA: i for i, TA in enumerate(sites)} RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus) statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap, genes, conditions, interactions) LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data) self.transit_message("Running ZINB") pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap, conditions, covariates, interactions) def orderStats(x, y): ic1 = x.split("_") ic2 = y.split("_") c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None) c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None) if len(self.included_conditions) > 0: condDiff = (self.included_conditions.index(c1) - self.included_conditions.index(c2)) ## Order by interaction, if stat belongs to same condition if condDiff == 0 and i1 is not None and i2 is not None: return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2)) return condDiff ## Order by samples metadata, if include flag not provided. 
condDiff = (orderingMetadata['condition'].index(c1) - orderingMetadata['condition'].index(c2)) if condDiff == 0 and i1 is not None and i2 is not None: return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2)) return condDiff orderedStatGroupNames = sorted(statGroupNames, orderStats) self.transit_message("Adding File: %s" % (self.output)) file = open(self.output,"w") head = ("Rv Gene TAs".split() + map(lambda v: "Mean_" + v, orderedStatGroupNames) + map(lambda v: "NZmean_" + v, orderedStatGroupNames) + map(lambda v: "NZperc_" + v, orderedStatGroupNames) + "pval padj".split() + ["status"]) file.write("#Console: python %s\n" % " ".join(sys.argv)) file.write('\t'.join(head)+EOL) for gene in genes: Rv = gene["rv"] vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] + ["%0.2f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames] + ["%0.2f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames] + ["%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames] + ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]] file.write('\t'.join(vals)+EOL) file.close() self.transit_message("Finished Zinb analysis") self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
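# Note: the sorted(statGroupNames, orderStats) call above uses Python 2's cmp-argument form of
# sorted(). Under Python 3 the same ordering is obtained with functools.cmp_to_key, as the later
# revision of this method (below) already does; a minimal equivalent is:
import functools

def sort_with_comparator(names, compare):
    """Python 3 equivalent of Python 2's sorted(names, compare)."""
    return sorted(names, key=functools.cmp_to_key(compare))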
def Run(self): self.transit_message("Starting ZINB analysis") start_time = time.time() packnames = ("MASS", "pscl") r_packages_needed = [x for x in packnames if not rpackages.isinstalled(x)] if (len(r_packages_needed) > 0): self.transit_error( "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))" % ({'0': '"{0}"'.format('", "'.join(r_packages_needed))})) sys.exit(1) self.transit_message("Getting Data") (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig) self.transit_message("Normalizing using: %s" % self.normalization) (data, factors) = norm_tools.normalize_data(data, self.normalization) condition_name = self.condition conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions, condition_name=condition_name) ## [Condition] in the order of files in combined wig conditions = self.wigs_to_conditions( conditionsByFile, filenamesInCombWig) ## [Covariate] in the order of files in combined wig covariates = self.wigs_to_covariates( covariatesByFileList, filenamesInCombWig) ## [Interaction] in the order of files in combined wig interactions = self.wigs_to_interactions( interactionsByFileList, filenamesInCombWig) data, conditions, covariates, interactions = self.filter_wigs_by_conditions( data, conditions, covariates = covariates, interactions = interactions, ignored_conditions = self.ignored_conditions, included_conditions = self.included_conditions) genes = tnseq_tools.read_genes(self.annotation_path) TASiteindexMap = {TA: i for i, TA in enumerate(sites)} RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus) statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap, genes, conditions, interactions) LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data) self.transit_message("Running ZINB") pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap, conditions, covariates, interactions) def orderStats(x, y): ic1 = x.split(SEPARATOR) ic2 = y.split(SEPARATOR) c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None) c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None) if len(self.included_conditions) > 0: condDiff = (self.included_conditions.index(c1) - self.included_conditions.index(c2)) ## Order by interaction, if stat belongs to same condition if condDiff == 0 and i1 is not None and i2 is not None: return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2)) return condDiff ## Order by samples metadata, if include flag not provided. 
condDiff = (orderingMetadata['condition'].index(c1) - orderingMetadata['condition'].index(c2)) if condDiff == 0 and i1 is not None and i2 is not None: return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2)) return condDiff orderedStatGroupNames = sorted(statGroupNames, key=functools.cmp_to_key(orderStats)) headersStatGroupNames = [x.replace(SEPARATOR,'_') for x in orderedStatGroupNames] self.transit_message("Adding File: %s" % (self.output)) file = open(self.output,"w") head = ("Rv Gene TAs".split() + list(map(lambda v: "Mean_" + v, headersStatGroupNames)) + list(map(lambda v: "NZmean_" + v, headersStatGroupNames)) + list(map(lambda v: "NZperc_" + v, headersStatGroupNames)) + "pval padj".split() + ["status"]) file.write("#Console: python %s\n" % " ".join(sys.argv)) file.write('\t'.join(head)+EOL) for gene in genes: Rv = gene["rv"] vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] + ["%0.2f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames] + ["%0.2f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames] + ["%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames] + ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]] file.write('\t'.join(vals)+EOL) file.close() self.transit_message("Finished Zinb analysis") self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
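# Illustrative sketch: conceptually, the per-gene p-value produced by run_zinb comes from a
# likelihood-ratio test comparing a zero-inflated negative binomial model that includes the
# condition terms against a null model without them (the model fitting itself is done in R via
# the MASS/pscl packages loaded above). Given the two fitted log-likelihoods and the number of
# extra parameters in the full model, the p-value is computed as below; this is only a sketch of
# the test, not the actual run_zinb code.
import scipy.stats

def lrt_pvalue(loglik_full, loglik_null, df_diff):
    """Likelihood-ratio test p-value; df_diff = extra parameters in the full model."""
    statistic = 2.0 * (loglik_full - loglik_null)
    return scipy.stats.chi2.sf(statistic, df_diff)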