Python normalize_data 예제들, pytransit.norm_tools.normalize_data Python 예제들

예제 #1

0

파일 보기

파일: test_norm_methods.py 프로젝트: ywf1215/transit

 def test_nonorm(self):
     """Check that "nonorm" returns unit factors and keeps every dataset mean."""
     raw_counts, _sites = tnseq_tools.get_data(all_data_list)
     normalized, scale = norm_tools.normalize_data(raw_counts, "nonorm")
     unit = numpy.array([1.])
     self.assertTrue((scale == unit).all())
     # One mean per input dataset must match the precomputed raw mean exactly.
     for idx in range(len(all_data_list)):
         observed_mean = numpy.mean(normalized[idx])
         self.assertEqual(observed_mean, raw_means[idx])

예제 #2

0

파일 보기

파일: test_norm_methods.py 프로젝트: mad-lab/transit

 def test_nonorm(self):
     """Verify that "nonorm" yields all-one factors and unchanged dataset means."""
     counts, _ = tnseq_tools.get_data(all_data_list)
     result = norm_tools.normalize_data(counts, "nonorm")
     norm_counts, norm_factors = result
     factors_are_one = (norm_factors == numpy.array([ 1.])).all()
     self.assertTrue(factors_are_one)
     # Compare the mean of each normalized dataset with its raw counterpart.
     for k, _path in enumerate(all_data_list):
         self.assertEqual(numpy.mean(norm_counts[k]), raw_means[k])

예제 #3

0

파일 보기

    def Run(self):
        """Normalize the input read counts and write them to ``self.outfile``.

        Data is read either from a combined-wig file or from individual
        datasets (selected by ``self.combined_wig``), normalized with the
        method named by ``self.normalization`` and written out as text.
        """

        self.transit_message("Starting Normalization")

        infile = self.infile
        outputPath = self.outfile # output file exists, should I require -overwrite flag?

        # determine ref genome from first; assume they are all the same; assume wigs have 2 header lines
        line2 = "variableStep chrom=" # unknown
        # Context manager so the input handle is closed even when we break early.
        with open(infile) as wig:
            for line in wig:
                if line.startswith("variableStep"):
                    line2 = line.rstrip()
                    break

        is_combined = self.combined_wig==True
        if is_combined:
            (sites,data,files) = tnseq_tools.read_combined_wig(self.ctrldata[0])
        else:
            (data, sites) = tnseq_tools.get_data(self.ctrldata)
        (data,factors) = norm_tools.normalize_data(data,self.normalization)

        # Pre-formatted string prints identically under Python 2 and Python 3.
        print("writing %s" % outputPath)
        with open(outputPath,"w") as out:
            out.write("# %s normalization of %s\n" % (self.normalization,infile))
            if is_combined:
                for f in files:
                    out.write("#File: %s\n" % f)
                # One row per site: coordinate followed by every dataset's value.
                for i, site in enumerate(sites):
                    out.write('\t'.join([str(site)]+["%0.1f" % x for x in list(data[...,i])])+"\n")
            else:
                out.write(line2+"\n")
                # Only the first dataset (row 0) is written, truncated to int.
                for j, site in enumerate(sites):
                    out.write("%s %s\n" % (site,int(data[0,j])))

        self.finish()
        self.transit_message("Finished Normalization")

예제 #4

0

파일 보기

    def __init__(self,
                 parent,
                 dataset_list=None,
                 annotation="H37Rv.prot_table",
                 gene="",
                 scale=None,
                 feature_hashes=None,
                 feature_data=None):
        """Build the frame, load the datasets and prepare search lookups.

        Arguments:
            parent: Object handed to ``view_trash.MainFrame.__init__`` and
                later to ``self.updateFunc``.
            dataset_list (list): Dataset paths to load. Defaults to
                ``["H37Rv_Sassetti_glycerol.wig"]``.
            annotation (str): Annotation path used for gene info and the
                position hash.
            gene (str): When non-empty, placed in the search box and searched.
            scale (list): Per-dataset scale; defaults to 150 for each dataset.
            feature_hashes (list): Stored as ``self.feature_hashes``;
                defaults to a new empty list.
            feature_data (list): Stored as ``self.feature_data``; defaults to
                a new empty list.
        """
        # Mutable defaults are created per call: a literal list in the
        # signature would be shared by every instance that relies on it.
        if dataset_list is None:
            dataset_list = ["H37Rv_Sassetti_glycerol.wig"]
        if feature_hashes is None:
            feature_hashes = []
        if feature_data is None:
            feature_data = []

        view_trash.MainFrame.__init__(self, parent)

        self.parent = parent
        self.size = wx.Size(1500, 800)
        self.start = 1
        self.end = 10000

        self.orf2data = transit_tools.get_gene_info(annotation)
        self.hash = transit_tools.get_pos_hash(annotation)

        self.features = []

        #Data to facilitate search
        # Lower-cased gene name -> list of ORF ids carrying that name.
        self.name2id = {}
        for orf, (name, desc, start, end, strand) in self.orf2data.items():
            self.name2id.setdefault(name.lower(), []).append(orf)

        # Lower-cased ORF id -> original ORF id.
        self.lowerid2id = {x.lower(): x for x in self.orf2data.keys()}
        self.labels = [fetch_name(d) for d in dataset_list]
        (self.fulldata, self.position) = tnseq_tools.get_data(dataset_list)

        #Save normalized data
        (self.fulldata_norm,
         self.factors) = norm_tools.normalize_data(self.fulldata,
                                                   method="nzmean")
        self.wasNorm = False

        self.feature_hashes = feature_hashes
        self.feature_data = feature_data

        if not scale:
            scale = [150] * len(dataset_list)
        self.scale = scale
        self.globalScale = False

        self.datasetChoice.SetItems(self.labels)
        self.datasetChoice.SetSelection(0)

        if gene:
            self.searchText.SetValue(gene)
            self.searchFunc(gene)

        self.updateFunc(parent)
        self.Fit()

예제 #5

0

파일 보기

파일: test_norm_methods.py 프로젝트: ywf1215/transit

 def test_TTR(self):
     """Check that "TTR" changes both the factors and every dataset mean."""
     n_datasets = len(all_data_list)
     raw_counts, _sites = tnseq_tools.get_data(all_data_list)
     normalized, scale = norm_tools.normalize_data(raw_counts, "TTR")
     all_ones = (scale == numpy.ones(n_datasets)).all()
     self.assertFalse(all_ones)
     # Every normalized dataset mean must differ from its raw mean.
     for idx in range(n_datasets):
         observed_mean = numpy.mean(normalized[idx])
         self.assertNotEqual(observed_mean, raw_means[idx])

예제 #6

0

파일 보기

파일: transit_tools.py 프로젝트: robertjenquin/transit

def convertToIGV(self, dataset_list, annotationPath, path, normchoice=None):
    """Normalize the datasets and write them to `path` as an IGV-style table.

    Arguments:
        self: Not used by this function.
        dataset_list (list): Paths of the datasets to load.
        annotationPath (str): Annotation path; passed to the normalizer and
            used (via ``fetch_name``) for the chromosome column.
        path (str): Output file path.
        normchoice (str): Normalization method; falsy values mean "nonorm".
    """

    if not normchoice:
        normchoice = "nonorm"

    (fulldata, position) = tnseq_tools.get_data(dataset_list)
    (fulldata, factors) = norm_tools.normalize_data(fulldata, normchoice,
                                                    dataset_list,
                                                    annotationPath)
    position = position.astype(int)

    # Context manager closes the file even if a row fails to format.
    with open(path, "w") as output:
        output.write("#Converted to IGV with TRANSIT.\n")
        if normchoice != "nonorm":
            output.write("#Reads normalized using '%s'\n" % normchoice)

        output.write("#Files:\n#%s\n" % "\n#".join(dataset_list))
        output.write(
            "#Chromosome\tStart\tEnd\tFeature\t%s\tTAs\n" %
            ("\t".join([transit_tools.fetch_name(D) for D in dataset_list])))
        chrom = transit_tools.fetch_name(annotationPath)

        # One row per position with the value of every dataset at that site.
        for i, pos in enumerate(position):
            counts = "\t".join(
                ["%1.1f" % fulldata[j][i] for j in range(len(fulldata))])
            output.write("%s\t%s\t%s\tTA%s\t%s\t1\n" %
                         (chrom, pos, pos + 1, pos, counts))

예제 #7

0

파일 보기

파일: normalize.py 프로젝트: mad-lab/transit

    def Run(self):
        """Normalize the input read counts and write them to ``self.outfile``.

        Data is read either from a combined-wig file or from individual
        datasets (selected by ``self.combined_wig``), normalized with the
        method named by ``self.normalization`` and written out as text.
        """

        self.transit_message("Starting Normalization")

        infile = self.infile
        outputPath = self.outfile # output file exists, should I require -overwrite flag?

        # determine ref genome from first; assume they are all the same; assume wigs have 2 header lines
        line2 = "variableStep chrom=" # unknown
        # Context manager so the input handle is closed even when we break early.
        with open(infile) as wig:
            for line in wig:
                if line.startswith("variableStep"):
                    line2 = line.rstrip()
                    break

        is_combined = self.combined_wig==True
        if is_combined:
            (sites,data,files) = tnseq_tools.read_combined_wig(self.ctrldata[0])
        else:
            (data, sites) = tnseq_tools.get_data(self.ctrldata)
        (data,factors) = norm_tools.normalize_data(data,self.normalization)

        # Pre-formatted string prints identically under Python 2 and Python 3.
        print("writing %s" % outputPath)
        with open(outputPath,"w") as out:
            out.write("# %s normalization of %s\n" % (self.normalization,infile))
            if is_combined:
                for f in files:
                    out.write("#File: %s\n" % f)
                # One row per site: coordinate followed by every dataset's value.
                for i, site in enumerate(sites):
                    out.write('\t'.join([str(site)]+["%0.1f" % x for x in list(data[...,i])])+"\n")
            else:
                out.write(line2+"\n")
                # Only the first dataset (row 0) is written, truncated to int.
                for j, site in enumerate(sites):
                    out.write("%s %s\n" % (site,int(data[0,j])))

        self.finish()
        self.transit_message("Finished Normalization")

예제 #8

0

파일 보기

파일: test_norm_methods.py 프로젝트: mad-lab/transit

 def test_TTR(self):
     """Verify that "TTR" produces non-unit factors and shifts every mean."""
     count = len(all_data_list)
     counts, _ = tnseq_tools.get_data(all_data_list)
     result = norm_tools.normalize_data(counts, "TTR")
     norm_counts, norm_factors = result
     self.assertFalse((norm_factors == numpy.ones(count)).all())
     # The mean of each normalized dataset must no longer equal the raw mean.
     for k in range(count):
         self.assertNotEqual(numpy.mean(norm_counts[k]), raw_means[k])

예제 #9

0

파일 보기

파일: igv.py 프로젝트: robertjenquin/transit

    def Run(self):
        """Export the control datasets as a normalized IGV-style table.

        Loads ``self.ctrldata``, normalizes it with ``self.normalization``,
        writes header comments followed by one row per position to
        ``self.output`` and closes that stream when finished.
        """

        self.transit_message("Starting IGV Export")
        start_time = time.time()

        #Get orf data
        self.transit_message("Getting Data")
        (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
        (fulldata,
         factors) = norm_tools.normalize_data(fulldata, self.normalization,
                                              self.ctrldata,
                                              self.annotation_path)
        position = position.astype(int)

        # NOTE: neither result is used below; the calls are kept as they were.
        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        self.transit_message("Normalizing")
        self.output.write("#Converted to IGV with TRANSIT.\n")
        if self.normalization != "nonorm":
            self.output.write("#Reads normalized using '%s'\n" %
                              self.normalization)
            # isinstance rather than an exact type comparison: numpy.float64
            # subclasses float, so scalar entries of a 1-D factor array are
            # recognised instead of being iterated (which raises TypeError).
            if isinstance(factors[0], float):
                self.output.write(
                    "#Normalization Factors: %s\n" %
                    "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                self.output.write("#Normalization Factors: %s\n" % " ".join(
                    [",".join(["%s" % bx for bx in b]) for b in factors]))

        self.output.write("#Files:\n")
        for f in self.ctrldata:
            self.output.write("#%s\n" % f)

        dataset_str = "\t".join(
            [transit_tools.fetch_name(F) for F in self.ctrldata])
        self.output.write("#Chromosome\tStart\tEnd\tFeature\t%s\tTAs\n" %
                          dataset_str)
        chrom = transit_tools.fetch_name(self.annotation_path)

        (K, N) = fulldata.shape
        self.progress_range(N)
        # One row per position with the value of every dataset at that site.
        for i, pos in enumerate(position):
            counts = "\t".join(
                ["%1.1f" % fulldata[j][i] for j in range(len(fulldata))])
            self.output.write("%s\t%s\t%s\tTA%s\t%s\t1\n" %
                              (chrom, pos, pos + 1, pos, counts))

            # Update progress
            text = "Running Export Method... %5.1f%%" % (100.0 * i / N)
            self.progress_update(text, i)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")

예제 #10

0

파일 보기

파일: mean_counts.py 프로젝트: wmatern/transit

    def Run(self):
        """Export the mean normalized read count of every gene per dataset.

        Writes header comments and one tab-separated row per gene to
        ``self.output`` and closes that stream when finished.
        """

        self.transit_message("Starting Gene Mean Counts Export")
        start_time = time.time()

        #Get orf data
        self.transit_message("Getting Data")
        (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
        (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization,
            self.ctrldata, self.annotation_path)
        position = position.astype(int)

        # NOTE: neither result is used below; the calls are kept as they were.
        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        self.transit_message("Normalizing")
        self.output.write("#Summarized to Mean Gene Counts with TRANSIT.\n")
        if self.normalization != "nonorm":
            self.output.write("#Reads normalized using '%s'\n" % self.normalization)
            # isinstance rather than an exact type comparison: numpy.float64
            # subclasses float, so scalar entries of a 1-D factor array are
            # recognised instead of being iterated (which raises TypeError).
            if isinstance(factors[0], float):
                self.output.write("#Normalization Factors: %s\n" % "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                self.output.write("#Normalization Factors: %s\n" % " ".join([",".join(["%s" % bx for bx in b]) for b in factors]))

        self.output.write("#Files:\n")
        for f in self.ctrldata:
            self.output.write("#%s\n" % f)

        K,Nsites = fulldata.shape
        # Get Gene objects
        G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization)
        N = len(G)
        self.progress_range(N)
        dataset_header = "\t".join([transit_tools.fetch_name(D) for D in self.ctrldata])
        self.output.write("#Orf\tName\tNumber of TA sites\t%s\n" % dataset_header)
        for i,gene in enumerate(G):
            if gene.n > 0:
                data_str = "\t".join(["%1.2f" % (M) for M in numpy.mean(gene.reads, 1)])
            else:
                # Genes with gene.n == 0 get a zero for each of the K datasets.
                data_str = "\t".join(["%1.2f" % (Z) for Z in numpy.zeros(K)])
            self.output.write("%s\t%s\t%s\t%s\n" % (gene.orf, gene.name, gene.n, data_str))

            # Update progress
            text = "Running Export Method... %5.1f%%" % (100.0*i/N)
            self.progress_update(text, i)
        self.output.close()

        self.transit_message("") # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")

예제 #11

0

파일 보기

    def Run(self):
        """Export the control datasets as a single combined-wig table.

        Writes header comments and one row per position (coordinate, one
        value per dataset, overlapping genes) to ``self.output`` and closes
        that stream when finished.
        """

        self.transit_message("Starting Combined Wig Export")
        start_time = time.time()

        #Get orf data
        self.transit_message("Getting Data")
        (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
        (fulldata,
         factors) = norm_tools.normalize_data(fulldata, self.normalization,
                                              self.ctrldata,
                                              self.annotation_path)
        position = position.astype(int)

        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        self.transit_message("Normalizing")
        self.output.write("#Converted to CombinedWig with TRANSIT.\n")
        self.output.write("#normalization method: %s\n" % self.normalization)
        if self.normalization != "nonorm":
            # isinstance rather than an exact type comparison: numpy.float64
            # subclasses float, so scalar entries of a 1-D factor array are
            # recognised instead of being iterated (which raises TypeError).
            if isinstance(factors[0], float):
                self.output.write(
                    "#Normalization Factors: %s\n" %
                    "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                self.output.write("#Normalization Factors: %s\n" % " ".join(
                    [",".join(["%s" % bx for bx in b]) for b in factors]))

        (K, N) = fulldata.shape
        for f in self.ctrldata:
            self.output.write("#File: %s\n" % f)
        self.output.write("#TAcoord\t%s\n" % ('\t'.join(self.ctrldata)))

        # Chosen once, outside the loop: no decimals if raw counts.
        value_format = "%1.1f" if self.normalization != 'nonorm' else "%d"
        for i, pos in enumerate(position):
            vals = "\t".join([value_format % c for c in fulldata[:, i]])
            # Genes overlapping this position, as "orf (name)"; "-" when the
            # ORF has no entry in rv2info.
            genes_str = ",".join([
                "%s (%s)" % (orf, rv2info.get(orf, ["-"])[0])
                for orf in hash.get(pos, [])
            ])
            self.output.write("%d\t%s\t%s\n" % (pos, vals, genes_str))
            # Update progress
            text = "Running Export Method... %5.1f%%" % (100.0 * i / N)
            self.progress_update(text, i)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")

예제 #12

0

파일 보기

파일: resampling.py 프로젝트: mad-lab/transit

    def preprocess_data(self, position, data):
        """Optionally normalize `data` and apply LOESS correction per row.

        Normalization runs unless ``self.normalization`` is "nonorm"; LOESS
        correction runs when ``self.LOESS`` is truthy. Returns the resulting
        array.
        """
        # Row count is taken from the incoming array, before normalization.
        n_rows, _n_cols = data.shape

        method = self.normalization
        if method != "nonorm":
            self.transit_message("Normalizing using: %s" % method)
            all_paths = self.ctrldata+self.expdata
            data, _factors = norm_tools.normalize_data(
                data, method, all_paths, self.annotation_path)

        if not self.LOESS:
            return data

        self.transit_message("Performing LOESS Correction")
        for row in range(n_rows):
            # Rows are overwritten in place with their corrected values.
            data[row] = stat_tools.loess_correction(position, data[row])
        return data

예제 #13

0

파일 보기

파일: resampling.py 프로젝트: mad-lab/transit

    def preprocess_data(self, position, data):
        """Return `data` after the configured normalization and LOESS steps.

        The normalization step is skipped when ``self.normalization`` is
        "nonorm"; the LOESS step is skipped when ``self.LOESS`` is falsy.
        """
        dataset_count, _site_count = data.shape

        needs_norm = self.normalization != "nonorm"
        if needs_norm:
            self.transit_message("Normalizing using: %s" % self.normalization)
            normalized = norm_tools.normalize_data(
                data, self.normalization, self.ctrldata+self.expdata,
                self.annotation_path)
            data, _unused_factors = normalized

        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            # Each row is replaced in place by its corrected version.
            for k in range(dataset_count):
                corrected = stat_tools.loess_correction(position, data[k])
                data[k] = corrected

        return data

예제 #14

0

파일 보기

파일: transit_tools.py 프로젝트: robertjenquin/transit

def convertToCombinedWig(dataset_list,
                         annotationPath,
                         outputPath,
                         normchoice="nonorm"):
    """Normalizes the input datasets and outputs the result in CombinedWig format.

    Arguments:
        dataset_list (list): List of paths to datasets in .wig format
        annotationPath (str): Path to annotation in .prot_table or GFF3 format.
        outputPath (str): Desired output path.
        normchoice (str): Choice for normalization method.

    """

    (fulldata, position) = tnseq_tools.get_data(dataset_list)
    (fulldata, factors) = norm_tools.normalize_data(fulldata, normchoice,
                                                    dataset_list,
                                                    annotationPath)
    position = position.astype(int)

    pos_hash = get_pos_hash(annotationPath)
    rv2info = get_gene_info(annotationPath)

    # Context manager closes the file even if writing a row raises.
    with open(outputPath, "w") as output:
        output.write("#Converted to CombinedWig with TRANSIT.\n")
        if normchoice != "nonorm":
            output.write("#Reads normalized using '%s'\n" % normchoice)
            # isinstance rather than an exact type comparison: numpy.float64
            # subclasses float, so scalar entries of a 1-D factor array are
            # recognised instead of being iterated (which raises TypeError).
            if isinstance(factors[0], float):
                output.write("#Normalization Factors: %s\n" %
                             "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                output.write(
                    "#Normalization Factors: %s\n" % " ".join(
                        [",".join(["%s" % bx for bx in b]) for b in factors]))

        output.write("#Files:\n")
        for f in dataset_list:
            output.write("#%s\n" % f)

        for i, pos in enumerate(position):
            vals = "\t".join(["%1.1f" % c for c in fulldata[:, i]])
            # Genes overlapping this position, as "orf (name)"; "-" when the
            # ORF has no entry in rv2info.
            genes_str = ",".join([
                "%s (%s)" % (orf, rv2info.get(orf, ["-"])[0])
                for orf in pos_hash.get(pos, [])
            ])
            output.write("%d\t%s\t%s\n" % (pos, vals, genes_str))

예제 #15

0

파일 보기

    def Run(self):
        """Run the ANOVA analysis and write a tab-separated per-gene report.

        Reads the combined-wig file and sample metadata, normalizes the
        counts, keeps the selected conditions, runs the test per gene and
        writes means, log-fold-changes, p-values and status to the path in
        ``self.output``.
        """
        self.transit_message("Starting Anova analysis")
        start_time = time.time()

        self.transit_message("Getting Data")
        (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        conditionsByFile, _, _, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata)
        conditions = self.wigs_to_conditions(
            conditionsByFile,
            filenamesInCombWig)

        conditionsList = self.select_conditions(conditions,self.included_conditions,self.ignored_conditions,orderingMetadata)
        data, conditions, _, _ = self.filter_wigs_by_conditions2(data, conditions, conditionsList)

        genes = tnseq_tools.read_genes(self.annotation_path)

        # Map each TA coordinate to its column index in `data`.
        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
        MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions)

        self.transit_message("Running Anova")
        pvals,qvals,run_status = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap, conditions)

        self.transit_message("Adding File: %s" % (self.output))
        # Context manager closes the report even if formatting a row raises.
        with open(self.output,"w") as out:
            heads = ("Rv Gene TAs".split() +
                    ["Mean_%s" % x for x in conditionsList] +
                    ["LFC_%s" % x for x in conditionsList] +
                    "pval padj".split() + ["status"])
            out.write("#Console: python3 %s\n" % " ".join(sys.argv))
            out.write("#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n" % (self.normalization,self.NTerminus,self.CTerminus,self.PC))
            out.write('#'+'\t'.join(heads)+EOL)
            for gene in genes:
                Rv = gene["rv"]
                # Genes without computed means are left out of the report.
                if Rv not in MeansByRv:
                    continue
                means = [MeansByRv[Rv][c] for c in conditionsList]
                LFCs = self.calcLFCs(means,self.PC)
                vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                        ["%0.2f" % x for x in means] +
                        ["%0.3f" % x for x in LFCs] +
                        ["%f" % x for x in [pvals[Rv], qvals[Rv]]] + [run_status[Rv]])
                out.write('\t'.join(vals)+EOL)
        self.transit_message("Finished Anova analysis")
        self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))

예제 #16

0

파일 보기

파일: transit_tools.py 프로젝트: robertjenquin/transit

def convertToGeneCountSummary(dataset_list,
                              annotationPath,
                              outputPath,
                              normchoice="nonorm"):
    """Normalizes the input datasets and outputs the mean read count per gene.

    Arguments:
        dataset_list (list): List of paths to datasets in .wig format
        annotationPath (str): Path to annotation in .prot_table or GFF3 format.
        outputPath (str): Desired output path.
        normchoice (str): Choice for normalization method.

    """

    (fulldata, position) = tnseq_tools.get_data(dataset_list)
    (fulldata, factors) = norm_tools.normalize_data(fulldata, normchoice,
                                                    dataset_list,
                                                    annotationPath)
    # Context manager closes the file even if writing a row raises.
    with open(outputPath, "w") as output:
        output.write("#Summarized to Mean Gene Counts with TRANSIT.\n")
        if normchoice != "nonorm":
            output.write("#Reads normalized using '%s'\n" % normchoice)
            # isinstance rather than an exact type comparison: numpy.float64
            # subclasses float, so scalar entries of a 1-D factor array are
            # recognised instead of being iterated (which raises TypeError).
            if isinstance(factors[0], float):
                output.write("#Normalization Factors: %s\n" %
                             "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                output.write(
                    "#Normalization Factors: %s\n" % " ".join(
                        [",".join(["%s" % bx for bx in b]) for b in factors]))

        (K, N) = fulldata.shape
        output.write("#Files:\n")
        for f in dataset_list:
            output.write("#%s\n" % f)

        # Get Gene objects
        G = tnseq_tools.Genes(dataset_list, annotationPath, norm=normchoice)

        dataset_header = "\t".join([os.path.basename(D) for D in dataset_list])
        output.write("#Orf\tName\tNumber of TA sites\t%s\n" % dataset_header)
        for gene in G:
            if gene.n > 0:
                data_str = "\t".join(
                    ["%1.2f" % (M) for M in numpy.mean(gene.reads, 1)])
            else:
                # Genes with gene.n == 0 get a zero for each of the K datasets.
                data_str = "\t".join(["%1.2f" % (Z) for Z in numpy.zeros(K)])
            output.write("%s\t%s\t%s\t%s\n" %
                         (gene.orf, gene.name, gene.n, data_str))

예제 #17

0

파일 보기

파일: qcDisplay.py 프로젝트: mad-lab/transit

 def refresh(self):
     """Re-normalize ``self.data`` and rebuild the stats list and plots.

     Any exception raised along the way is caught and reported on stdout
     (prefixed with ``self.qc_prefix``) instead of propagating.
     """
     try:
         #(self.data, self.position) = tnseq_tools.get_data(self.wigList)
         self.plots_list = []
         self.statsListCtrl.DeleteAllItems()
         (self.normdata, factors) = norm_tools.normalize_data(self.data, self.norm)
         self.updateFiles()
         self.addPlots()
         self.statsListCtrl.Select(0)
         self.refreshPlots()
     except Exception as e:
         # A single pre-formatted string prints the same text on Python 2
         # and 3; the former print statement is a syntax error on Python 3.
         print("%s Error: %s" % (self.qc_prefix, e))
         exc_type, exc_obj, exc_tb = sys.exc_info()
         fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
         print(exc_type, fname, exc_tb.tb_lineno)

예제 #18

0

파일 보기

파일: qcDisplay.py 프로젝트: ywf1215/transit

 def refresh(self):
     """Recompute the normalized data and redraw the stats list and plots.

     Failures are not propagated: the exception, its type, the source file
     name and the line number are printed instead.
     """
     try:
         self.plots_list = []
         self.statsListCtrl.DeleteAllItems()
         normalized = norm_tools.normalize_data(self.data, self.norm)
         self.normdata, _factors = normalized
         self.updateFiles()
         self.addPlots()
         self.statsListCtrl.Select(0)
         self.refreshPlots()
     except Exception as err:
         print(self.qc_prefix, "Error:", err)
         err_type, _err_value, err_tb = sys.exc_info()
         # Keep only the file name of the frame where the traceback starts.
         source_file = os.path.basename(err_tb.tb_frame.f_code.co_filename)
         print(err_type, source_file, err_tb.tb_lineno)

예제 #19

0

파일 보기

파일: igv.py 프로젝트: mad-lab/transit

    def Run(self):
        """Export the control datasets as a normalized IGV-style table.

        Loads ``self.ctrldata``, normalizes it with ``self.normalization``,
        writes header comments followed by one row per position to
        ``self.output`` and closes that stream when finished.
        """

        self.transit_message("Starting IGV Export")
        start_time = time.time()

        #Get orf data
        self.transit_message("Getting Data")
        (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
        (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization,
            self.ctrldata, self.annotation_path)
        position = position.astype(int)

        # NOTE: neither result is used below; the calls are kept as they were.
        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        self.transit_message("Normalizing")
        self.output.write("#Converted to IGV with TRANSIT.\n")
        if self.normalization != "nonorm":
            self.output.write("#Reads normalized using '%s'\n" % self.normalization)
            # isinstance rather than an exact type comparison: numpy.float64
            # subclasses float, so scalar entries of a 1-D factor array are
            # recognised instead of being iterated (which raises TypeError).
            if isinstance(factors[0], float):
                self.output.write("#Normalization Factors: %s\n" % "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                self.output.write("#Normalization Factors: %s\n" % " ".join([",".join(["%s" % bx for bx in b]) for b in factors]))

        self.output.write("#Files:\n")
        for f in self.ctrldata:
            self.output.write("#%s\n" % f)

        dataset_str = "\t".join([transit_tools.fetch_name(F) for F in self.ctrldata])
        self.output.write("#Chromosome\tStart\tEnd\tFeature\t%s\tTAs\n" % dataset_str)
        chrom = transit_tools.fetch_name(self.annotation_path)

        (K,N) = fulldata.shape
        self.progress_range(N)
        # One row per position with the value of every dataset at that site.
        for i,pos in enumerate(position):
            counts = "\t".join(["%1.1f" % fulldata[j][i] for j in range(len(fulldata))])
            self.output.write("%s\t%s\t%s\tTA%s\t%s\t1\n" % (chrom, pos, pos+1, pos, counts))

            # Update progress
            text = "Running Export Method... %5.1f%%" % (100.0*i/N)
            self.progress_update(text, i)
        self.output.close()

        self.transit_message("") # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")

예제 #20

0

파일 보기

파일: combined_wig.py 프로젝트: mad-lab/transit

    def Run(self):
        """Export the control datasets as a single combined-wig table.

        Writes header comments and one row per position (coordinate, one
        value per dataset, overlapping genes) to ``self.output`` and closes
        that stream when finished.
        """

        self.transit_message("Starting Combined Wig Export")
        start_time = time.time()

        #Get orf data
        self.transit_message("Getting Data")
        (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
        (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization,
            self.ctrldata, self.annotation_path)
        position = position.astype(int)

        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        self.transit_message("Normalizing")
        self.output.write("#Converted to CombinedWig with TRANSIT.\n")
        self.output.write("#normalization method: %s\n" % self.normalization)
        if self.normalization != "nonorm":
            # isinstance rather than an exact type comparison: numpy.float64
            # subclasses float, so scalar entries of a 1-D factor array are
            # recognised instead of being iterated (which raises TypeError).
            if isinstance(factors[0], float):
                self.output.write("#Normalization Factors: %s\n" % "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                self.output.write("#Normalization Factors: %s\n" % " ".join([",".join(["%s" % bx for bx in b]) for b in factors]))

        (K,N) = fulldata.shape
        for f in self.ctrldata:
            self.output.write("#File: %s\n" % f)
        self.output.write("#TAcoord\t%s\n" % ('\t'.join(self.ctrldata)))

        # Chosen once, outside the loop: no decimals if raw counts.
        value_format = "%1.1f" if self.normalization!='nonorm' else "%d"
        for i,pos in enumerate(position):
            vals = "\t".join([value_format % c for c in fulldata[:,i]])
            # Genes overlapping this position, as "orf (name)"; "-" when the
            # ORF has no entry in rv2info.
            genes_str = ",".join(["%s (%s)" % (orf,rv2info.get(orf,["-"])[0]) for orf in hash.get(pos, [])])
            self.output.write("%d\t%s\t%s\n" % (pos,vals,genes_str))
            # Update progress
            text = "Running Export Method... %5.1f%%" % (100.0*i/N)
            self.progress_update(text, i)
        self.output.close()

        self.transit_message("") # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")

예제 #21

0

파일 보기

    def Run(self):
        """Run the ANOVA analysis and write a tab-separated per-gene report.

        Reads the combined-wig file and sample metadata, normalizes the
        counts, drops the ignored conditions, runs the test per gene and
        writes per-condition means plus p-values to the path in
        ``self.output``.
        """
        self.transit_message("Starting Anova analysis")
        start_time = time.time()

        self.transit_message("Getting Data")
        (sites, data,
         filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        conditions = self.wigs_to_conditions(
            self.read_samples_metadata(self.metadata), filenamesInCombWig)
        data, conditions = self.filter_by_conditions_blacklist(
            data, conditions, self.ignored_conditions)

        genes = tnseq_tools.read_genes(self.annotation_path)

        # Map each TA coordinate to its column index in `data`.
        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(
            genes, TASiteindexMap)
        MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions)

        self.transit_message("Running Anova")
        pvals, qvals = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap,
                                      conditions)

        self.transit_message("Adding File: %s" % (self.output))
        # Context manager closes the report even if formatting a row raises.
        with open(self.output, "w") as out:
            # The same list drives the header and every row, so the columns
            # line up; the order itself comes from set iteration.
            conditionsList = list(set(conditions))
            header = "Rv Gene TAs".split() + conditionsList + "pval padj".split()
            out.write('\t'.join(header) + EOL)
            for gene in genes:
                Rv = gene["rv"]
                # Genes without computed means are left out of the report.
                if Rv not in MeansByRv:
                    continue
                vals = ([Rv, gene["gene"],
                         str(len(RvSiteindexesMap[Rv]))] +
                        ["%0.1f" % MeansByRv[Rv][c] for c in conditionsList] +
                        ["%f" % x for x in [pvals[Rv], qvals[Rv]]])
                out.write('\t'.join(vals) + EOL)
        self.transit_message("Finished Anova analysis")

예제 #22

0

파일 보기

파일: hmm.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the HMM analysis and write the site-level and gene-level outputs.

        Reads the control datasets, optionally normalizes and LOESS-corrects
        them, combines the replicates into one observation sequence, decodes
        that sequence with a 4-state HMM (ES, GD, NE, GA) whose emissions are
        geometric distributions, and writes one row per site to
        ``self.output``. ``self.output`` is then closed and
        ``self.post_process_genes`` is called to produce the ``*_genes`` file.

        Attributes read: ``ctrldata``, ``wxobj``, ``normalization``,
        ``annotation_path``, ``LOESS``, ``replicates``, ``maxiterations`` and
        ``output`` (an open, writable file object with a ``name``).
        """
        self.transit_message("Starting HMM Method")

        # Get data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)
        (K, N) = data.shape

        # Normalize data
        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             factors) = norm_tools.normalize_data(data, self.normalization,
                                                  self.ctrldata,
                                                  self.annotation_path)

        # Do LOESS, one dataset (row) at a time
        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        # Named pos_hash (not ``hash``) so the builtin is not shadowed.
        pos_hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        if len(self.ctrldata) > 1:
            self.transit_message("Combining Replicates as '%s'" %
                                 self.replicates)
        O = tnseq_tools.combine_replicates(
            data, method=self.replicates
        ) + 1  # Adding 1 because of the shifted geometric in scipy

        # Parameters
        Nstates = 4
        label = {0: "ES", 1: "GD", 2: "NE", 3: "GA"}

        # State means are derived from the mean of the non-zero reads after
        # dropping the top 5% of values.
        reads = O - 1
        reads_nz = sorted(reads[reads != 0])
        size = len(reads_nz)
        mean_r = numpy.average(reads_nz[:int(0.95 * size)])
        mu = numpy.array([1 / 0.99, 0.01 * mean_r + 2, mean_r, mean_r * 5.0])
        L = 1.0 / mu
        # Emission probability distributions, one geometric pmf per state
        B = [scipy.stats.geom(L[i]).pmf for i in range(Nstates)]

        pins = self.calculate_pins(O - 1)
        pins_obs = sum(1 for rd in O if rd >= 2) / float(len(O))
        pnon = 1.0 - pins

        # Smallest r (capped at 99) for which pnon**r drops below 0.01.
        for r in range(100):
            if pnon**r < 0.01: break

        # Transition matrix in log space: ``a`` on the diagonal, ``b``
        # everywhere else.
        A = numpy.zeros((Nstates, Nstates))
        a = math.log1p(-B[int(Nstates / 2)](1)**r)
        b = r * math.log(B[int(Nstates / 2)](1)) + math.log(
            1.0 / 3)  # change to Nstates-1?
        for i in range(Nstates):
            A[i] = [b] * Nstates
            A[i][i] = a

        PI = numpy.zeros(Nstates)  # Initial state distribution
        PI[0] = 0.7
        PI[1:] = 0.3 / (Nstates - 1)

        self.progress_range(self.maxiterations)

        ###############
        ### VITERBI ###
        (Q_opt, delta, Q) = self.viterbi(A, B, PI, O)
        ###############

        ##################
        ### ALPHA PASS ###
        (log_Prob_Obs, alpha,
         C) = self.forward_procedure(numpy.exp(A), B, PI, O)
        ##################

        #################
        ### BETA PASS ###
        beta = self.backward_procedure(numpy.exp(A), B, PI, O, C)
        #################

        # Count how many sites were assigned to each state.
        T = len(O)
        total = 0
        state2count = dict.fromkeys(range(Nstates), 0)
        for t in range(T):
            state2count[Q_opt[t]] += 1
            total += 1

        self.output.write("#HMM - Sites\n")
        self.output.write("# Tn-HMM\n")

        if self.wxobj:
            # Values are formatted as text directly: under Python 3,
            # "%s" % some_bytes would write a b'...' repr into the header.
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                (",".join(self.ctrldata), self.annotation_path,
                 self.output.name))
        else:
            self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

        self.output.write("# \n")
        self.output.write("# Mean:\t%2.2f\n" % (numpy.average(reads_nz)))
        self.output.write("# Median:\t%2.2f\n" % numpy.median(reads_nz))
        self.output.write("# Normalization:\t%s\n" % self.normalization)
        self.output.write("# LOESS Correction:\t%s\n" % str(self.LOESS))
        self.output.write("# pins (obs):\t%f\n" % pins_obs)
        self.output.write("# pins (est):\t%f\n" % pins)
        self.output.write("# Run length (r):\t%d\n" % r)
        self.output.write("# State means:\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %8.4f" % (label[i], mu[i]) for i in range(Nstates)]))
        self.output.write("# Self-Transition Prob:\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %2.4e" % (label[i], A[i][i]) for i in range(Nstates)]))
        self.output.write("# State Emission Parameters (theta):\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %1.4f" % (label[i], L[i]) for i in range(Nstates)]))
        # Terminated with a newline like the other section headers, so the
        # per-state percentages start on their own comment line.
        self.output.write("# State Distributions:\n")
        self.output.write("#    %s\n" % "   ".join([
            "%s: %2.2f%%" % (label[i], state2count[i] * 100.0 / total)
            for i in range(Nstates)
        ]))

        # One output row per site: position, read count, the per-state
        # posterior (gamma), the decoded state label and overlapping genes.
        states = [int(Q_opt[t]) for t in range(T)]
        for t in range(T):
            s_lab = label.get(states[t], "Unknown State")
            joint = alpha[:, t] * beta[:, t]
            gamma_t = joint / numpy.sum(joint)
            genes_at_site = pos_hash.get(position[t], [""])
            genestr = ""
            if not (len(genes_at_site) == 1 and not genes_at_site[0]):
                genestr = ",".join([
                    "%s_(%s)" % (g, rv2info.get(g, "-")[0])
                    for g in genes_at_site
                ])

            self.output.write("%s\t%s\t%s\t%s\t%s\n" %
                              (int(position[t]), int(O[t]) - 1, "\t".join(
                                  ["%-9.2e" % g
                                   for g in gamma_t]), s_lab, genestr))

        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Finished HMM - Sites Method")
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="HMM - Sites")

        # Gene files: "<name>.<ext>" becomes "<name>_genes.<ext>"
        self.transit_message("Creating HMM Genes Level Output")
        genes_path = ".".join(self.output.name.split(
            ".")[:-1]) + "_genes." + self.output.name.split(".")[-1]

        tempObs = numpy.zeros((1, len(O)))
        tempObs[0, :] = O - 1
        self.post_process_genes(tempObs, position, states, genes_path)

        self.transit_message("Adding File: %s" % (genes_path))
        self.add_file(path=genes_path, filetype="HMM - Genes")
        self.finish()
        self.transit_message("Finished HMM Method")

예제 #23

0

파일 보기

파일: binomial.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the Binomial essentiality analysis and write per-gene results.

        Loads and optionally normalizes the control datasets, builds the gene
        collection, then runs ``self.samples + self.burnin`` sampling
        iterations. Each iteration draws per-gene ``theta`` from a beta
        distribution, updates ``rho0``/``Kp0``/``rho1``/``Kp1`` through
        random-walk proposals accepted with a log-ratio test, and resamples
        the class indicators ``Z`` and the mixture weight ``w1``. Post-burn-in
        means of ``Z`` and ``theta`` are written, one row per gene, to
        ``self.output``, which is then closed.

        The order of the random draws is unchanged from the previous
        implementation.
        """
        self.transit_message("Starting Binomial Method")

        self.progress_range(self.samples + self.burnin)

        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)

        if self.normalization and self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             factors) = norm_tools.normalize_data(data, self.normalization,
                                                  self.ctrldata,
                                                  self.annotation_path)

        G = tnseq_tools.Genes(self.ctrldata,
                              self.annotation_path,
                              minread=1,
                              reps=self.replicates,
                              ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus,
                              cterm=self.CTerminus,
                              data=data,
                              position=position)

        # Parameters
        self.transit_message("Setting Parameters")

        Ngenes = len(G)
        sample_size = self.samples + self.burnin
        numReps = len(self.ctrldata)

        # Per-gene and per-iteration traces; column 0 holds the initial state.
        theta = numpy.zeros((Ngenes, sample_size))
        theta[:, 0] = 0.10

        rho0 = numpy.zeros(sample_size)
        rho0[0] = 0.5
        Kp0 = numpy.zeros(sample_size)
        Kp0[0] = 10
        rho1 = numpy.zeros(sample_size)
        rho1[0] = 0.10
        Kp1 = numpy.zeros(sample_size)
        Kp1[0] = 3

        Z = numpy.zeros((Ngenes, sample_size))
        pz1 = numpy.zeros(sample_size)

        w1 = scipy.stats.beta.rvs(self.alpha_w, self.beta_w)
        W1 = numpy.zeros(sample_size)
        W1[0] = w1

        self.transit_message("Setting Initial Values")
        # K[g]: number of read entries > 0 for gene g; N[g]: total entries.
        K = numpy.array(
            [sum([1 for x in gene.reads.flatten() if x > 0]) for gene in G])
        N = numpy.array([len(gene.reads.flatten()) for gene in G])

        for g, _gene in enumerate(G):
            if N[g] == 0:
                theta[g][0] = 0.5
            else:
                frac = K[g] / float(N[g])
                if frac == 0 or frac == 1:
                    # NOTE(review): a fraction of exactly 1 is also
                    # initialised to 0.001 (not a value near 1); kept as in
                    # the original -- confirm this is intended.
                    theta[g][0] = 0.001
                else:
                    theta[g][0] = frac

            Z[g][0] = scipy.stats.bernoulli.rvs(1 - theta[g][0])

        # Acceptance counters for the four proposal-based updates
        acc_p0 = 0
        acc_k0 = 0
        acc_p1 = 0
        acc_k1 = 0

        # Standard deviations of the random-walk proposals
        rho0c_std = 0.010
        kp0c_std = 1.40
        rho1c_std = 0.009
        kp1c_std = 1.1

        # log(0) inside the acceptance ratios is expected; silence the
        # divide warning for the duration of the sampler.
        numpy.seterr(divide='ignore')
        for i in range(1, sample_size):

            i0 = Z[:, i - 1] == 0
            n0 = numpy.sum(i0)
            i1 = Z[:, i - 1] == 1
            n1 = numpy.sum(i1)

            # Draw theta for each class from its beta conditional
            theta[i0, i] = scipy.stats.beta.rvs(
                Kp0[i - 1] * rho0[i - 1] + K[i0],
                Kp0[i - 1] * (1 - rho0[i - 1]) + N[i0] - K[i0])
            theta[i1, i] = scipy.stats.beta.rvs(
                Kp1[i - 1] * rho1[i - 1] + K[i1],
                Kp1[i - 1] * (1 - rho1[i - 1]) + N[i1] - K[i1])

            rho0_c = rho0[i - 1] + scipy.stats.norm.rvs(0, rho0c_std)
            Kp0_c = Kp0[i - 1] + scipy.stats.norm.rvs(0, kp0c_std)

            # --- rho0 update (non-positive candidates are rejected) ---
            if rho0_c <= 0: rho0[i] = rho0[i - 1]
            else:
                fc = numpy.log(
                    scipy.stats.beta.pdf(rho0_c, self.M0 * self.pi0,
                                         self.M0 * (1.0 - self.pi0)))
                f0 = numpy.log(
                    scipy.stats.beta.pdf(rho0[i - 1], self.M0 * self.pi0,
                                         self.M0 * (1.0 - self.pi0)))
                fc += numpy.sum(
                    numpy.log(
                        scipy.stats.beta.pdf(theta[i0, i], Kp0[i - 1] * rho0_c,
                                             Kp0[i - 1] * (1 - rho0_c))))
                f0 += numpy.sum(
                    numpy.log(
                        scipy.stats.beta.pdf(theta[i0, i],
                                             Kp0[i - 1] * rho0[i - 1],
                                             Kp0[i - 1] * (1 - rho0[i - 1]))))

                if numpy.log(scipy.stats.uniform.rvs()) < fc - f0:
                    rho0[i] = rho0_c
                    acc_p0 += 1
                else:
                    rho0[i] = rho0[i - 1]

            # --- Kp0 update (uses the rho0 value just chosen) ---
            if Kp0_c <= 0: Kp0[i] = Kp0[i - 1]
            else:
                fc = numpy.log(scipy.stats.gamma.pdf(Kp0_c, self.a0, self.b0))
                f0 = numpy.log(
                    scipy.stats.gamma.pdf(Kp0[i - 1], self.a0, self.b0))
                fc += numpy.sum(
                    numpy.log(
                        scipy.stats.beta.pdf(theta[i0, i], Kp0_c * rho0[i],
                                             Kp0_c * (1 - rho0[i]))))
                f0 += numpy.sum(
                    numpy.log(
                        scipy.stats.beta.pdf(theta[i0, i],
                                             Kp0[i - 1] * rho0[i],
                                             Kp0[i - 1] * (1 - rho0[i]))))

                if numpy.log(scipy.stats.uniform.rvs()) < fc - f0:
                    Kp0[i] = Kp0_c
                    acc_k0 += 1
                else:
                    Kp0[i] = Kp0[i - 1]

            rho1_c = rho1[i - 1] + scipy.stats.norm.rvs(0, rho1c_std)
            Kp1_c = Kp1[i - 1] + scipy.stats.norm.rvs(0, kp1c_std)

            # --- rho1 update ---
            if rho1_c <= 0:
                rho1[i] = rho1[i - 1]
            else:
                fc = numpy.log(
                    scipy.stats.beta.pdf(rho1_c, self.M1 * self.pi1,
                                         self.M1 * (1 - self.pi1)))
                f1 = numpy.log(
                    scipy.stats.beta.pdf(rho1[i - 1], self.M1 * self.pi1,
                                         self.M1 * (1 - self.pi1)))
                fc += numpy.sum(
                    numpy.log(
                        scipy.stats.beta.pdf(theta[i1, i], Kp1[i - 1] * rho1_c,
                                             Kp1[i - 1] * (1 - rho1_c))))
                f1 += numpy.sum(
                    numpy.log(
                        scipy.stats.beta.pdf(theta[i1, i],
                                             Kp1[i - 1] * rho1[i - 1],
                                             Kp1[i - 1] * (1 - rho1[i - 1]))))

                if numpy.log(scipy.stats.uniform.rvs()) < fc - f1:
                    rho1[i] = rho1_c
                    acc_p1 += 1
                else:
                    rho1[i] = rho1[i - 1]

            # --- Kp1 update (uses the rho1 value just chosen) ---
            if Kp1_c <= 0: Kp1[i] = Kp1[i - 1]
            else:

                fc = numpy.log(scipy.stats.gamma.pdf(Kp1_c, self.a1, self.b1))
                f1 = numpy.log(
                    scipy.stats.gamma.pdf(Kp1[i - 1], self.a1, self.b1))
                fc += numpy.sum(
                    numpy.log(
                        scipy.stats.beta.pdf(theta[i1, i], Kp1_c * rho1[i],
                                             Kp1_c * (1 - rho1[i]))))
                f1 += numpy.sum(
                    numpy.log(
                        scipy.stats.beta.pdf(theta[i1, i],
                                             Kp1[i - 1] * rho1[i],
                                             Kp1[i - 1] * (1 - rho1[i]))))

                if numpy.log(scipy.stats.uniform.rvs()) < fc - f1:
                    Kp1[i] = Kp1_c
                    acc_k1 += 1
                else:
                    Kp1[i] = Kp1[i - 1]

            # Probability of class 1 per gene, weighted by the current w1
            g0 = scipy.stats.beta.pdf(theta[:, i], Kp0[i] * rho0[i], Kp0[i] *
                                      (1 - rho0[i])) * (1 - w1)
            g1 = scipy.stats.beta.pdf(theta[:, i], Kp1[i] * rho1[i], Kp1[i] *
                                      (1 - rho1[i])) * (w1)
            p1 = g1 / (g0 + g1)
            p1 = numpy.nan_to_num(p1)

            try:
                Z[:, i] = scipy.stats.bernoulli.rvs(p1)
            except Exception:
                # Dump diagnostics and abort. Each write gets ONE formatted
                # string: stream.write() accepts a single argument, so the
                # previous multi-argument calls raised TypeError here.
                inan = numpy.isnan(p1)
                sys.stderr.write("K=\t%s\n" % (K[inan], ))
                sys.stderr.write("N=\t%s\n" % (N[inan], ))
                sys.stderr.write("theta=%s\n" % (theta[inan, i], ))
                sys.exit()
            pz1[i] = p1[0]

            i1 = Z[:, i] == 1
            n1 = numpy.sum(i1)
            w1 = scipy.stats.beta.rvs(self.alpha_w + n1,
                                      self.beta_w + Ngenes - n1)
            W1[i] = w1

            # Update progress
            text = "Running Binomial Method... %5.1f%%" % (100.0 * (i + 1) /
                                                           (sample_size))
            self.progress_update(text, i)

        numpy.seterr(divide='warn')

        # Posterior means over the post-burn-in samples
        z_bar = numpy.apply_along_axis(numpy.mean, 1, Z[:, self.burnin:])
        theta_bar = numpy.apply_along_axis(numpy.mean, 1, theta[:,
                                                                self.burnin:])
        (ess_threshold,
         noness_threshold) = stat_tools.bayesian_ess_thresholds(z_bar)

        self.output.write("#Binomial\n")
        if self.wxobj:
            # Values are formatted as text directly: under Python 3,
            # "%s" % some_bytes would write a b'...' repr into the header.
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s, samples=%s, burnin=%s\n"
                % (",".join(self.ctrldata), self.annotation_path,
                   self.output.name, self.samples, self.burnin))
        else:
            self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

        self.output.write("#Thresholds: (%1.5f, %1.5f)\n" %
                          (ess_threshold, noness_threshold))
        self.output.write("#rho0 Acceptance Rate:\t%f%%\n" %
                          ((100.0 * acc_p0) / sample_size))
        self.output.write("#Kp0  Acceptance Rate:\t%f%%\n" %
                          ((100.0 * acc_k0) / sample_size))
        self.output.write("#rho1 Acceptance Rate:\t%f%%\n" %
                          ((100.0 * acc_p1) / sample_size))
        self.output.write("#Kp1  Acceptance Rate:\t%f%%\n" %
                          ((100.0 * acc_k1) / sample_size))
        self.output.write(
            "#Hyperparameters rho: \t%1.2f\t%3.1f\t%1.2f\t%3.1f\n" %
            (self.pi0, self.M0, self.pi1, self.M1))
        self.output.write(
            "#Hyperparameters Kp: \t%3.1f\t%3.1f\t%3.1f\t%3.1f\n" %
            (self.a0, self.b0, self.a1, self.b1))
        self.output.write("#Hyperparameters W: \t%1.3f\t%1.3f\n" %
                          (self.alpha_w, self.beta_w))

        # ``columns`` is not defined in this method; it is expected to be
        # available from the enclosing module.
        self.output.write("#%s\n" % "\t".join(columns))

        rows = []
        for g, gene in enumerate(G):
            # Two independent checks (not elif): if both hold, the
            # "Non-Essential" call wins, as before.
            c = "Uncertain"
            if z_bar[g] > ess_threshold:
                c = "Essential"
            if z_bar[g] < noness_threshold:
                c = "Non-Essential"
            rows.append(
                "%s\t%s\t%s\t%1.1f\t%d\t%d\t%d\t%f\t%f\t%s" %
                (gene.orf, gene.name, gene.desc, K[g] / float(numReps),
                 N[g] / numReps, K[g], N[g], theta_bar[g], z_bar[g], c))

        # Rows are sorted as strings, i.e. by their leading ORF field.
        rows.sort()
        for row in rows:
            self.output.write("%s\n" % row)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Binomial")
        self.finish()
        self.transit_message("Finished Binomial Method")

예제 #24

0

파일 보기

파일: stat_tools.py 프로젝트: mad-lab/transit

    
    
    if DO_LIB:
        ctrl_lib_str = "ABAB"
        exp_lib_str = "AAABBB"
    else:
        ctrl_lib_str = ""
        exp_lib_str = ""
    
    Kctrl = len(ctrldata)
    Kexp  = len(expdata)

    (data, position) = transit_tools.get_validated_data(ctrldata+expdata)
    (K,N) = data.shape

    (data, factors) = norm_tools.normalize_data(data, "TTR", ctrldata+expdata, annotation)

    G = tnseq_tools.Genes(ctrldata + expdata, annotation, data=data, position=position)


    gene = G[i]

    print "\n\n"
    print "#"*100
    print "#  (%s)  NEW TEST:   %s"  % (DO_LIB, gene)
    print "#"*100
    print ""
       

 
    ii = numpy.ones(gene.n) == 1

예제 #25

0

파일 보기

    def Run(self):
        """Run the Genetic Interactions (GI) analysis and write per-gene results.

        The four groups of datasets (``ctrldataA``, ``expdataA``,
        ``ctrldataB``, ``expdataB``) are loaded as one matrix, optionally
        normalized and LOESS-corrected, and split back into four gene
        collections. For every gene, posterior samples of the mean count in
        each group are drawn, the log2 fold changes within strain A and
        strain B and their difference (delta logFC) are computed, and the
        gene is classified using either an adjusted probability of the delta
        logFC lying inside the ROPE (``self.rope``) or the HDI/ROPE overlap.
        One row per gene is written to ``self.output``.
        """
        self.transit_message("Starting Genetic Interactions Method")
        start_time = time.time()
        self.output.write("#GI\n")

        wiglist = self.ctrldataA + self.expdataA + self.ctrldataB + self.expdataB

        # Group sizes, used to slice the stacked data matrix back apart
        Na1 = len(self.ctrldataA)
        Nb1 = len(self.expdataA)
        Na2 = len(self.ctrldataB)

        # Get data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(wiglist, wxobj=self.wxobj)
        # Number of datasets (rows); previously ``K`` was used below without
        # ever being defined, so enabling LOESS raised NameError.
        K = data.shape[0]

        # Normalize data if specified
        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(data, self.normalization, wiglist, self.annotation_path)

        # Do LOESS correction if specified
        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        # Get Gene objects for each condition
        G_A1 = tnseq_tools.Genes([], self.annotation_path, data=data[:Na1], position=position,nterm=self.NTerminus,cterm=self.CTerminus)
        G_B1 = tnseq_tools.Genes([], self.annotation_path, data=data[Na1:(Na1+Nb1)], position=position,nterm=self.NTerminus,cterm=self.CTerminus)
        G_A2 = tnseq_tools.Genes([], self.annotation_path, data=data[(Na1+Nb1):(Na1+Nb1+Na2)], position=position,nterm=self.NTerminus,cterm=self.CTerminus)
        G_B2 = tnseq_tools.Genes([], self.annotation_path, data=data[(Na1+Nb1+Na2):], position=position,nterm=self.NTerminus,cterm=self.CTerminus)

        means_list_a1 = []
        means_list_b1 = []
        means_list_a2 = []
        means_list_b2 = []

        var_list_a1 = []
        var_list_a2 = []
        var_list_b1 = []
        var_list_b2 = []

        # Base priors on empirical observations across genes.
        for gene in sorted(G_A1):
            if gene.n > 1:
                A1_data = G_A1[gene.orf].reads.flatten()
                B1_data = G_B1[gene.orf].reads.flatten()
                A2_data = G_A2[gene.orf].reads.flatten()
                B2_data = G_B2[gene.orf].reads.flatten()

                means_list_a1.append(numpy.mean(A1_data))
                var_list_a1.append(numpy.var(A1_data))

                means_list_b1.append(numpy.mean(B1_data))
                var_list_b1.append(numpy.var(B1_data))

                means_list_a2.append(numpy.mean(A2_data))
                var_list_a2.append(numpy.var(A2_data))

                means_list_b2.append(numpy.mean(B2_data))
                var_list_b2.append(numpy.var(B2_data))

        # Priors: trimmed means of the per-gene means and variances
        mu0_A1 = scipy.stats.trim_mean(means_list_a1, 0.01)
        mu0_B1 = scipy.stats.trim_mean(means_list_b1, 0.01)
        mu0_A2 = scipy.stats.trim_mean(means_list_a2, 0.01)
        mu0_B2 = scipy.stats.trim_mean(means_list_b2, 0.01)

        s20_A1 = scipy.stats.trim_mean(var_list_a1, 0.01)
        s20_B1 = scipy.stats.trim_mean(var_list_b1, 0.01)
        s20_A2 = scipy.stats.trim_mean(var_list_a2, 0.01)
        s20_B2 = scipy.stats.trim_mean(var_list_b2, 0.01)

        k0=1.0
        nu0=1.0
        alpha = 0.05  # HDIs below cover a (1 - alpha) probability mass
        data = []

        postprob = []
        count = 0
        N = len(G_A1)
        self.progress_range(N)
        # Perform actual analysis
        for gene in G_A1:

            # If there is some data
            if gene.n > 0:
                A1_data = G_A1[gene.orf].reads.flatten()
                B1_data = G_B1[gene.orf].reads.flatten()
                A2_data = G_A2[gene.orf].reads.flatten()
                B2_data = G_B2[gene.orf].reads.flatten()

                try:
                    muA1_post, varA1_post = stat_tools.sample_trunc_norm_post(A1_data, self.samples,
                        mu0_A1, s20_A1, k0, nu0)
                    muB1_post, varB1_post = stat_tools.sample_trunc_norm_post(B1_data, self.samples,
                        mu0_B1, s20_B1, k0, nu0)
                    muA2_post, varA2_post = stat_tools.sample_trunc_norm_post(A2_data, self.samples,
                        mu0_A2, s20_A2, k0, nu0)
                    muB2_post, varB2_post = stat_tools.sample_trunc_norm_post(B2_data, self.samples,
                        mu0_B2, s20_B2, k0, nu0)

                except Exception:
                    # Best effort: if sampling fails for this gene, fall back
                    # to flat posteriors (all ones, i.e. logFC of 0).
                    muA1_post = varA1_post = numpy.ones(self.samples)
                    muB1_post = varB1_post = numpy.ones(self.samples)
                    muA2_post = varA2_post = numpy.ones(self.samples)
                    muB2_post = varB2_post = numpy.ones(self.samples)

                logFC_A_post = numpy.log2(muA2_post/muA1_post)
                logFC_B_post = numpy.log2(muB2_post/muB1_post)
                delta_logFC_post = logFC_B_post - logFC_A_post

                # Get Bounds of the HDI
                l_logFC_A, u_logFC_A = stat_tools.HDI_from_MCMC(logFC_A_post, 1-alpha)

                l_logFC_B, u_logFC_B = stat_tools.HDI_from_MCMC(logFC_B_post, 1-alpha)

                l_delta_logFC, u_delta_logFC = stat_tools.HDI_from_MCMC(delta_logFC_post, 1-alpha)

                mean_logFC_A = numpy.mean(logFC_A_post)
                mean_logFC_B = numpy.mean(logFC_B_post)
                mean_delta_logFC = numpy.mean(delta_logFC_post)

                # Is HDI significantly different than ROPE?
                not_HDI_overlap_bit = l_delta_logFC > self.rope or u_delta_logFC < -self.rope

                # Probability of posterior overlapping with ROPE
                probROPE = numpy.mean(numpy.logical_and(delta_logFC_post>=0.0-self.rope,  delta_logFC_post<=0.0+self.rope))

            # If there is no data, assume empty defaults
            else:
                muA1_post = numpy.ones(self.samples)
                muB1_post = numpy.ones(self.samples)
                muA2_post = numpy.ones(self.samples)
                muB2_post = numpy.ones(self.samples)

                mean_logFC_A = 0
                mean_logFC_B = 0
                mean_delta_logFC = 0
                l_logFC_A = 0
                u_logFC_A = 0
                l_logFC_B = 0
                u_logFC_B = 0
                l_delta_logFC = 0
                u_delta_logFC = 0
                probROPE = 1.0
                # Previously left unset in this branch, which raised
                # NameError (first gene) or silently reused the previous
                # gene's value. A zero-width HDI at 0 is not outside the ROPE.
                not_HDI_overlap_bit = False

            # Replace undefined (NaN) HDI bounds with a wide default interval
            if numpy.isnan(l_logFC_A):
                l_logFC_A = -10
                u_logFC_A = 10
            if numpy.isnan(l_logFC_B):
                l_logFC_B = -10
                u_logFC_B = 10
            if numpy.isnan(l_delta_logFC):
                l_delta_logFC = -10
                u_delta_logFC = 10

            postprob.append(probROPE)
            data.append((gene.orf, gene.name, gene.n, numpy.mean(muA1_post), numpy.mean(muA2_post), numpy.mean(muB1_post), numpy.mean(muB2_post), mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit))

            text = "Running GI Method... %2.0f%%" % (100.0*(count+1)/N)
            self.progress_update(text, count)
            # max(..., 1) avoids ZeroDivisionError when there is one gene.
            self.transit_message_inplace("Running GI Method... %1.1f%%" % (100.0*count/max(N-1, 1)))
            count+=1

        # Sort rows by probROPE so they line up with the sorted, adjusted
        # probabilities computed below.
        data.sort(key=lambda x: x[-2])

        if self.doBFDR or not self.doFWER:
            postprob = numpy.array(postprob)
            postprob.sort()
            bfdr = numpy.cumsum(postprob)/numpy.arange(1, len(postprob)+1)
            adjusted_prob = bfdr
            adjusted_label = "BFDR"
        else:
            # Reached only when doFWER is set and doBFDR is not. The old
            # ``elif doFWER`` referenced an undefined name.
            # NOTE(review): FWER_Bayes is not defined in this method; it must
            # be available at module scope -- confirm.
            fwer = FWER_Bayes(postprob)
            fwer.sort()
            adjusted_prob = fwer
            adjusted_label = "FWER"

        # If not using adjustment for classification, sort correctly
        if not self.doBFDR and not self.doFWER:
            sorted_index = numpy.argsort([d[-1] for d in data])[::-1][:len(data)]
            adjusted_prob = [adjusted_prob[ii] for ii in sorted_index]
            data = [data[ii] for ii in sorted_index]

        # Print output
        # NOTE(review): the .encode('utf-8') calls below are kept as-is; if
        # this file runs under Python 3 they render as b'...' -- confirm the
        # target interpreter.
        if self.wxobj:
            self.output.write("#GUI with: norm=%s, samples=%s, includeZeros=%s, output=%s\n" % (self.normalization, self.samples, self.includeZeros, self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))

        self.output.write("#Control Data-A: %s\n" % (",".join(self.ctrldataA).encode('utf-8')))
        self.output.write("#Control Data-B: %s\n" % (",".join(self.ctrldataB).encode('utf-8')))
        self.output.write("#Experimental Data-A: %s\n" % (",".join(self.expdataA).encode('utf-8')))
        self.output.write("#Experimental Data-B: %s\n" % (",".join(self.expdataB).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" % (self.annotation_path.encode('utf-8')))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        # ``columns`` is not defined in this method; it is expected to be
        # available from the enclosing module.
        self.output.write("#%s\n" % "\t".join(columns))

        use_adjusted = self.doBFDR or self.doFWER
        if use_adjusted:
            self.output.write("# Significant interactions are those whose adjusted probability of the delta-logFC falling within ROPE is < 0.05 (Adjusted using %s)\n" % (adjusted_label))
        else:
            self.output.write("# Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n")
        self.output.write("#\n")

        # Write column names
        self.output.write("#ORF\tName\tNumber of TA Sites\tMean count (Strain A Time 1)\tMean count (Strain A Time 2)\tMean count (Strain B Time 1)\tMean count (Strain B Time 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tProb. of delta-logFC being within ROPE\tAdjusted Probability (%s)\tIs HDI outside ROPE?\tType of Interaction\n" % adjusted_label)

        # Write gene results
        for i,row in enumerate(data):
            orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post, mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit = row

            # Significance comes from the adjusted probability when an
            # adjustment is enabled, otherwise from the HDI/ROPE test.
            if use_adjusted:
                significant = adjusted_prob[i] < 0.05
            else:
                significant = not_HDI_overlap_bit

            type_of_interaction = "No Interaction"
            if significant:
                type_of_interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)

            # Insert the adjusted probability before the HDI flag and append
            # the interaction type.
            new_row = tuple(list(row[:-1])+[adjusted_prob[i], not_HDI_overlap_bit, type_of_interaction])
            self.output.write("%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.8f\t%1.8f\t%s\t%s\n" % new_row)

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="GI")
        self.finish()
        self.transit_message("Finished Genetic Interactions Method")

예제 #26

0

파일 보기

    def Run(self):
        """Run the resampling analysis and write the report to ``self.output``.

        Loads the control and experimental datasets, optionally normalizes
        and LOESS-corrects them, runs a resampling test on the difference of
        means for every gene, applies a Benjamini-Hochberg correction to the
        two-tailed p-values and writes one tab-separated row per gene.

        When ``self.doHistogram`` is set, a PNG histogram of the resampled
        statistic is saved per gene in a ``<output name>_histograms``
        directory next to the output file.
        """
        # matplotlib is optional: if it cannot be loaded only the histograms
        # are skipped. `except Exception` (instead of a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        try:
            import matplotlib.pyplot as plt
        except Exception:
            # Call form of print is valid under both Python 2 and Python 3.
            print("Error: cannot do histograms")
            self.doHistogram = False

        self.transit_message("Starting resampling Method")
        start_time = time.time()

        if self.doHistogram:
            histPath = os.path.join(
                os.path.dirname(self.output.name),
                transit_tools.fetch_name(self.output.name) + "_histograms")
            if not os.path.isdir(histPath):
                os.makedirs(histPath)
        else:
            histPath = ""

        Kctrl = len(self.ctrldata)

        # Get orf data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(
            self.ctrldata + self.expdata, wxobj=self.wxobj)

        (K, N) = data.shape

        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(
                data, self.normalization, self.ctrldata + self.expdata,
                self.annotation_path)

        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        G = tnseq_tools.Genes(self.ctrldata + self.expdata,
                              self.annotation_path,
                              ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus,
                              cterm=self.CTerminus,
                              data=data,
                              position=position)

        # Resampling. `data` is reused from here on as the list of result rows.
        data = []
        N = len(G)
        count = 0
        self.progress_range(N)
        for gene in G:
            count += 1
            if gene.k == 0 or gene.n == 0:
                # Nothing to test for this gene: emit neutral defaults.
                (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,
                 pval_2tail, testlist, data1, data2) = (0, 0, 0, 0, 1.00, 1.00,
                                                        1.00, [], [0], [0])
            else:
                if not self.includeZeros:
                    # Keep only the sites with a non-zero column sum.
                    ii = numpy.sum(gene.reads, 0) > 0
                else:
                    ii = numpy.ones(gene.n) == 1

                # The first Kctrl rows are the control datasets, the
                # remaining rows the experimental ones.
                data1 = gene.reads[:Kctrl, ii].flatten() + self.pseudocount
                data2 = gene.reads[Kctrl:, ii].flatten() + self.pseudocount

                (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,
                 pval_2tail, testlist) = stat_tools.resampling(
                     data1,
                     data2,
                     S=self.samples,
                     testFunc=stat_tools.F_mean_diff_flat,
                     adaptive=self.adaptive)

            if self.doHistogram:
                # Plot a placeholder pair of zeros when there is no
                # resampling distribution for this gene.
                hist_values = testlist if testlist else [0, 0]
                plt.hist(hist_values,
                         density=1,
                         facecolor='c',
                         alpha=0.75,
                         bins=100)
                plt.xlabel('Delta Mean')
                plt.ylabel('Probability')
                plt.title('%s - Histogram of Delta Mean' % gene.orf)
                plt.axvline(test_obs,
                            color='r',
                            linestyle='dashed',
                            linewidth=3)
                plt.grid(True)
                genePath = os.path.join(histPath, gene.orf + ".png")
                # Re-create the directory if it disappeared during the run.
                if not os.path.exists(histPath):
                    os.makedirs(histPath)
                plt.savefig(genePath)
                plt.clf()

            sum1 = numpy.sum(data1)
            sum2 = numpy.sum(data2)
            data.append([
                gene.orf, gene.name, gene.desc, gene.n, mean1, mean2, sum1,
                sum2, test_obs, log2FC, pval_2tail
            ])

            # Update progress
            text = "Running Resampling Method... %5.1f%%" % (100.0 * count / N)
            self.progress_update(text, count)

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Performing Benjamini-Hochberg Correction")
        # Rows are sorted before the correction so qval[i] lines up with
        # data[i] in the write loop below.
        data.sort()
        qval = stat_tools.BH_fdr_correction([row[-1] for row in data])

        # NOTE(review): the .encode('utf-8') calls below are kept as-is; under
        # Python 3 "%s" % bytes renders as b'...'. Confirm the target
        # interpreter before dropping them.
        self.output.write("#Resampling\n")
        if self.wxobj:
            self.output.write(
                "#GUI with: norm=%s, samples=%s, pseudocounts=%1.2f, adaptive=%s, histogram=%s, includeZeros=%s, output=%s\n"
                % (self.normalization, self.samples, self.pseudocount,
                   self.adaptive, self.doHistogram, self.includeZeros,
                   self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))
        self.output.write("#Control Data: %s\n" %
                          (",".join(self.ctrldata).encode('utf-8')))
        self.output.write("#Experimental Data: %s\n" %
                          (",".join(self.expdata).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" %
                          (self.annotation_path.encode('utf-8')))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))

        for i, row in enumerate(data):
            (orf, name, desc, n, mean1, mean2, sum1, sum2, test_obs, log2FC,
             pval_2tail) = row
            self.output.write(
                "%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.1f\t%1.2f\t%1.1f\t%1.5f\t%1.5f\n"
                % (orf, name, desc, n, mean1, mean2, log2FC, sum1, sum2,
                   test_obs, pval_2tail, qval[i]))
        self.output.close()

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Resampling")
        self.finish()
        self.transit_message("Finished resampling Method")

예제 #27

0

파일 보기

파일: gi.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the Genetic Interactions (GI) analysis and write the report.

        Loads four groups of datasets (strain A and strain B, each under
        condition 1 and condition 2), optionally normalizes and
        LOESS-corrects them, samples posterior means per gene and group, and
        derives the posterior of delta-logFC = logFC(B) - logFC(A).
        Significance is decided according to ``self.signif`` ("HDI", "prob",
        "BFDR" or "FWER") against the region of practical equivalence
        ``[-self.rope, +self.rope]``. One tab-separated row per gene is
        written to ``self.output``.
        """
        self.transit_message("Starting Genetic Interactions Method")
        start_time = time.time()
        self.output.write("#GI\n")

        wiglist = self.ctrldataA + self.ctrldataB + self.expdataA + self.expdataB

        Na1 = len(self.ctrldataA)
        Nb1 = len(self.ctrldataB)
        Na2 = len(self.expdataA)

        # Get data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(wiglist,
                                                            wxobj=self.wxobj)

        # Normalize data if specified
        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(
                data, self.normalization, wiglist, self.annotation_path)

        # Do LOESS correction if specified
        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            # Iterate over the rows of `data` (the same axis that is sliced
            # per group below). The previous loop bound was an undefined
            # name and raised NameError whenever LOESS was enabled.
            for j in range(len(data)):
                data[j] = stat_tools.loess_correction(position, data[j])

        # Get Gene objects for each condition; rows of `data` follow the
        # order of `wiglist` (A1, B1, A2, B2).
        G_A1 = tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=data[:Na1],
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)
        G_B1 = tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=data[Na1:(Na1 + Nb1)],
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)
        G_A2 = tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=data[(Na1 + Nb1):(Na1 + Nb1 + Na2)],
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)
        G_B2 = tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=data[(Na1 + Nb1 + Na2):],
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)

        means_list_a1 = []
        means_list_b1 = []
        means_list_a2 = []
        means_list_b2 = []

        var_list_a1 = []
        var_list_a2 = []
        var_list_b1 = []
        var_list_b2 = []

        # Base priors on empirical observations across genes.
        for gene in sorted(G_A1):
            if gene.n > 1:
                A1_data = G_A1[gene.orf].reads.flatten()
                B1_data = G_B1[gene.orf].reads.flatten()
                A2_data = G_A2[gene.orf].reads.flatten()
                B2_data = G_B2[gene.orf].reads.flatten()

                means_list_a1.append(numpy.mean(A1_data))
                var_list_a1.append(numpy.var(A1_data))

                means_list_b1.append(numpy.mean(B1_data))
                var_list_b1.append(numpy.var(B1_data))

                means_list_a2.append(numpy.mean(A2_data))
                var_list_a2.append(numpy.var(A2_data))

                means_list_b2.append(numpy.mean(B2_data))
                var_list_b2.append(numpy.var(B2_data))

        # Priors: 1%-trimmed means of the per-gene means and variances.
        mu0_A1 = scipy.stats.trim_mean(means_list_a1, 0.01)
        mu0_B1 = scipy.stats.trim_mean(means_list_b1, 0.01)
        mu0_A2 = scipy.stats.trim_mean(means_list_a2, 0.01)
        mu0_B2 = scipy.stats.trim_mean(means_list_b2, 0.01)

        s20_A1 = scipy.stats.trim_mean(var_list_a1, 0.01)
        s20_B1 = scipy.stats.trim_mean(var_list_b1, 0.01)
        s20_A2 = scipy.stats.trim_mean(var_list_a2, 0.01)
        s20_B2 = scipy.stats.trim_mean(var_list_b2, 0.01)

        k0 = 1.0
        nu0 = 1.0
        alpha = 0.05  # HDIs below cover a (1 - alpha) posterior mass

        # `data` is reused from here on as the list of per-gene result rows.
        data = []
        postprob = []
        N = len(G_A1)
        self.progress_range(N)

        # Perform actual analysis
        for count, gene in enumerate(G_A1):

            # If there is some data
            if gene.n > 0:
                A1_data = G_A1[gene.orf].reads.flatten()
                B1_data = G_B1[gene.orf].reads.flatten()
                A2_data = G_A2[gene.orf].reads.flatten()
                B2_data = G_B2[gene.orf].reads.flatten()

                #            Time-1   Time-2
                #
                #  Strain-A     A       C
                #
                #  Strain-B     B       D

                try:
                    muA1_post, varA1_post = stat_tools.sample_trunc_norm_post(
                        A1_data, self.samples, mu0_A1, s20_A1, k0, nu0)
                    muB1_post, varB1_post = stat_tools.sample_trunc_norm_post(
                        B1_data, self.samples, mu0_B1, s20_B1, k0, nu0)
                    muA2_post, varA2_post = stat_tools.sample_trunc_norm_post(
                        A2_data, self.samples, mu0_A2, s20_A2, k0, nu0)
                    muB2_post, varB2_post = stat_tools.sample_trunc_norm_post(
                        B2_data, self.samples, mu0_B2, s20_B2, k0, nu0)
                except Exception:
                    # Best effort: if sampling fails, fall back to all-ones
                    # posteriors, which yield logFC = 0 for this gene.
                    muA1_post = varA1_post = numpy.ones(self.samples)
                    muB1_post = varB1_post = numpy.ones(self.samples)
                    muA2_post = varA2_post = numpy.ones(self.samples)
                    muB2_post = varB2_post = numpy.ones(self.samples)

                logFC_A_post = numpy.log2(muA2_post / muA1_post)
                logFC_B_post = numpy.log2(muB2_post / muB1_post)
                delta_logFC_post = logFC_B_post - logFC_A_post

                # Get Bounds of the HDI
                l_logFC_A, u_logFC_A = stat_tools.HDI_from_MCMC(
                    logFC_A_post, 1 - alpha)

                l_logFC_B, u_logFC_B = stat_tools.HDI_from_MCMC(
                    logFC_B_post, 1 - alpha)

                l_delta_logFC, u_delta_logFC = stat_tools.HDI_from_MCMC(
                    delta_logFC_post, 1 - alpha)

                mean_logFC_A = numpy.mean(logFC_A_post)
                mean_logFC_B = numpy.mean(logFC_B_post)
                mean_delta_logFC = numpy.mean(delta_logFC_post)

                # Is HDI significantly different than ROPE? (i.e. no overlap)
                not_HDI_overlap_bit = (l_delta_logFC > self.rope
                                       or u_delta_logFC < -self.rope)

                # Probability of posterior overlapping with ROPE
                probROPE = numpy.mean(
                    numpy.logical_and(delta_logFC_post >= 0.0 - self.rope,
                                      delta_logFC_post <= 0.0 + self.rope))

            # If there is no data, assume empty defaults
            else:
                muA1_post = varA1_post = numpy.ones(self.samples)
                muB1_post = varB1_post = numpy.ones(self.samples)
                muA2_post = varA2_post = numpy.ones(self.samples)
                muB2_post = varB2_post = numpy.ones(self.samples)

                mean_logFC_A = 0
                mean_logFC_B = 0
                mean_delta_logFC = 0
                l_logFC_A = 0
                u_logFC_A = 0
                l_logFC_B = 0
                u_logFC_B = 0
                l_delta_logFC = 0
                u_delta_logFC = 0
                probROPE = 1.0
                # Must be set explicitly: it was previously left unassigned
                # here, so the row either raised NameError (first gene) or
                # reused the flag of the previously processed gene.
                not_HDI_overlap_bit = False

            # HDI bounds can come back as NaN; substitute a wide interval.
            if numpy.isnan(l_logFC_A):
                l_logFC_A = -10
                u_logFC_A = 10
            if numpy.isnan(l_logFC_B):
                l_logFC_B = -10
                u_logFC_B = 10
            if numpy.isnan(l_delta_logFC):
                l_delta_logFC = -10
                u_delta_logFC = 10

            postprob.append(probROPE)
            data.append((gene.orf, gene.name, gene.n, numpy.mean(muA1_post),
                         numpy.mean(muA2_post), numpy.mean(muB1_post),
                         numpy.mean(muB2_post), mean_logFC_A, mean_logFC_B,
                         mean_delta_logFC, l_delta_logFC, u_delta_logFC,
                         probROPE, not_HDI_overlap_bit))

            text = "Running GI Method... %2.0f%%" % (100.0 * (count + 1) / N)
            self.progress_update(text, count)
            # max(..., 1) avoids a ZeroDivisionError when there is exactly
            # one gene; the label previously said "Export" (copy/paste slip).
            self.transit_message_inplace("Running GI Method... %1.1f%%" %
                                         (100.0 * count / max(N - 1, 1)))

        # for HDI, maybe I should sort on abs(mean_delta_logFC); however, need to sort by prob to calculate BFDR
        probcol = -2  # probROPEs
        data.sort(key=lambda x: x[probcol])
        sortedprobs = numpy.array([x[probcol] for x in data])

        # BFDR method: Newton M.A., Noueiry A., Sarkar D., Ahlquist P. (2004). Detecting differential gene expression with a semiparametric hierarchical mixture method. Biostatistics, 5:155–176.

        if self.signif == "BFDR":
            # Running mean of the ascending probabilities; same order as
            # `data`, which was sorted by probROPE above.
            bfdr = numpy.cumsum(sortedprobs) / numpy.arange(
                1,
                len(sortedprobs) + 1)
            adjusted_prob = bfdr
            adjusted_label = "BFDR"

        elif self.signif == "FWER":
            adjusted_prob = stat_tools.FWER_Bayes(sortedprobs)
            adjusted_label = "FWER"

        # If not using adjustment for classification, keep the raw probabilities
        else:
            adjusted_prob = sortedprobs
            adjusted_label = "un"
            # should I stable-sort by overlap_bit?

        # Methods that classify on the (adjusted) probability of being in ROPE.
        # A tuple is used so that only exact method names match; substring
        # tests against a space-separated string also matched fragments.
        prob_methods = ("prob", "BFDR", "FWER")

        # Print(output)
        # Values are written as text; encoding them to bytes first would
        # render as b'...' under Python 3.
        if self.wxobj:
            self.output.write(
                "#GUI with: norm=%s, samples=%s, includeZeros=%s, output=%s\n"
                % (self.normalization, self.samples, self.includeZeros,
                   self.output.name))
        else:
            self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

        now = str(datetime.datetime.now())
        now = now[:now.rfind('.')]  # drop fractional seconds
        self.output.write("#Date: " + now + "\n")
        #self.output.write("#Runtime: %s s\n" % (time.time() - start_time))

        self.output.write("#Control Data-A: %s\n" % ",".join(self.ctrldataA))
        self.output.write("#Control Data-B: %s\n" % ",".join(self.ctrldataB))
        self.output.write("#Experimental Data-A: %s\n" %
                          ",".join(self.expdataA))
        self.output.write("#Experimental Data-B: %s\n" %
                          ",".join(self.expdataB))
        self.output.write("#Annotation path: %s\n" % self.annotation_path)
        self.output.write("#ROPE=%s, method for significance=%s\n" %
                          (self.rope, self.signif))

        if self.signif == "HDI":
            self.output.write(
                "#Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n"
            )
        elif self.signif in prob_methods:
            # The previous check misspelled "BFDR" as "BDFR", so this line
            # was never written for the BFDR method.
            self.output.write(
                "#Significant interactions are those whose %s-adjusted probability of the delta-logFC falling within ROPE is < 0.05.\n"
                % (adjusted_label))

        # Write column names (redundant with self.columns)
        self.output.write(
            "#ORF\tName\tNumber of TA Sites\tMean count (Strain A Condition 1)\tMean count (Strain A Condition 2)\tMean count (Strain B Condition 1)\tMean count (Strain B Condition 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tIs HDI outside ROPE?\tProb. of delta-logFC being within ROPE\t%s-Adjusted Probability\tType of Interaction\n"
            % adjusted_label)

        # Write gene results
        for i, row in enumerate(data):
            (orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post,
             mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC,
             l_delta_logFC, u_delta_logFC, probROPE,
             not_HDI_overlap_bit) = row

            interaction = self.classify_interaction(mean_delta_logFC,
                                                    mean_logFC_B, mean_logFC_A)
            type_of_interaction = "No Interaction"
            if self.signif in prob_methods and adjusted_prob[i] < 0.05:
                type_of_interaction = interaction
            if self.signif == "HDI" and not_HDI_overlap_bit:
                type_of_interaction = interaction

            # Output order swaps the last two stored fields (HDI flag first,
            # then probROPE) and appends the adjusted probability and label.
            new_row = tuple(
                list(row[:-2]) + [
                    not_HDI_overlap_bit, probROPE, adjusted_prob[i],
                    type_of_interaction
                ])
            self.output.write(
                "%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%s\t%1.8f\t%1.8f\t%s\n"
                % new_row)

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="GI")
        self.finish()
        self.transit_message("Finished Genetic Interactions Method")

예제 #28

0

파일 보기

파일: test_pytransit_tools.py 프로젝트: mad-lab/transit

 def test_normalization(self):
     """TTR normalization must yield at least one factor different from 1."""
     data, position = tnseq_tools.get_data(all_data_list)
     norm_data, factors = norm_tools.normalize_data(data, "TTR")
     # One unit factor per input dataset is what "no scaling" would look like.
     unit_factors = numpy.ones(len(all_data_list))
     self.assertFalse((factors == unit_factors).all())

예제 #29

0

파일 보기

    
    
    if DO_LIB:
        ctrl_lib_str = "ABAB"
        exp_lib_str = "AAABBB"
    else:
        ctrl_lib_str = ""
        exp_lib_str = ""
    
    Kctrl = len(ctrldata)
    Kexp  = len(expdata)

    (data, position) = transit_tools.get_validated_data(ctrldata+expdata)
    (K,N) = data.shape

    (data, factors) = norm_tools.normalize_data(data, "TTR", ctrldata+expdata, annotation)

    G = tnseq_tools.Genes(ctrldata + expdata, annotation, data=data, position=position)


    gene = G[i]

    print("\n\n")
    print("#"*100)
    print("#  (%s)  NEW TEST:   %s"  % (DO_LIB, gene))
    print("#"*100)
    print("")
       

 
    ii = numpy.ones(gene.n) == 1

예제 #30

0

파일 보기

파일: example.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the Example method and write per-gene mean read counts.

        Loads the control datasets, optionally normalizes them, and for
        every gene writes its identifiers, ``gene.k``, ``gene.n``, the mean
        of ``gene.reads`` and the sum of ``gene.reads`` divided by
        ``gene.k``. Rows are sorted as text before being written to
        ``self.output``.
        """
        self.transit_message("Starting Example Method")
        start_time = time.time()

        # Get orf data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)
        (K, N) = data.shape

        if self.normalization and self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(
                data, self.normalization, self.ctrldata,
                self.annotation_path)

        G = tnseq_tools.Genes(self.ctrldata,
                              self.annotation_path,
                              minread=1,
                              reps=self.replicates,
                              ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus,
                              cterm=self.CTerminus,
                              data=data,
                              position=position)

        # `data` is reused from here on as the list of formatted output lines.
        data = []
        N = len(G)
        count = 0
        self.progress_range(N)
        for gene in G:
            count += 1
            # Guard the empty cases so no mean of an empty array or division
            # by zero is attempted.
            if gene.n == 0:
                mean = 0.0
            else:
                mean = numpy.mean(gene.reads)

            if gene.k == 0:
                nzmean = 0.0
            else:
                nzmean = numpy.sum(gene.reads) / float(gene.k)

            data.append(
                "%s\t%s\t%s\t%s\t%s\t%1.2f\t%1.2f\n" %
                (gene.orf, gene.name, gene.desc, gene.k, gene.n, mean, nzmean))

            # Update Progress
            text = "Running Example Method... %5.1f%%" % (100.0 * count / N)
            self.progress_update(text, count)

        # Header values are written as text; encoding them to bytes first
        # would render as b'...' under Python 3.
        ctrl_files = ",".join(self.ctrldata)
        self.output.write("#Example\n")
        if self.wxobj:
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                (ctrl_files, self.annotation_path, self.output.name))
        else:
            self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

        self.output.write("#Data: %s\n" % ctrl_files)
        self.output.write("#Annotation path: %s\n" % self.annotation_path)
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))

        data.sort()
        for line in data:
            self.output.write(line)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Example")
        self.finish()
        self.transit_message("Finished Example Method")

예제 #31

0

파일 보기

파일: griffin.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the Griffin method and write one result row per gene.

        Loads the control datasets, optionally normalizes them, and for each
        gene computes an expected run value and a p-value from a Gumbel CDF
        evaluated at ``gene.r``. The p-values are then adjusted with a
        Benjamini-Hochberg correction and everything is written to
        ``self.output``.
        """
        self.transit_message("Starting Griffin Method")
        start_time = time.time()

        # Get orf data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(
            self.ctrldata, wxobj=self.wxobj)
        (K, N) = data.shape

        if self.normalization and self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(
                data, self.normalization, self.ctrldata, self.annotation_path)

        G = tnseq_tools.Genes(
            self.ctrldata,
            self.annotation_path,
            minread=1,
            reps=self.replicates,
            ignoreCodon=self.ignoreCodon,
            nterm=self.NTerminus,
            cterm=self.CTerminus,
            data=data,
            position=position)

        N = len(G)
        self.progress_range(N)
        pins = G.global_theta()
        pnon = 1.0 - pins
        results = []
        for count, gene in enumerate(G):
            if gene.n == 0:
                # No sites in this gene: neutral expected run and p-value.
                entry = [gene, 0.0, 1.000]
            else:
                # Gumbel parameters, using logarithms to base 1/pnon.
                B = 1.0 / math.log(1.0 / pnon)
                u = math.log(gene.n * pins, 1.0 / pnon)
                exprun = tnseq_tools.ExpectedRuns(gene.n, pnon)
                pval = 1.0 - tnseq_tools.GumbelCDF(gene.r, u, B)
                entry = [gene, exprun, pval]
            results.append(entry)

            text = "Running Griffin Method... %5.1f%%" % (
                100.0 * (count + 1) / (N))
            self.progress_update(text, count)

        # Adjust the p-values (third field of every entry) and attach the
        # adjusted value to its entry before sorting.
        pval = [entry[2] for entry in results]
        padj = stat_tools.BH_fdr_correction(pval)
        for idx, entry in enumerate(results):
            entry.append(padj[idx])
        results.sort()

        ctrl_files = ",".join(self.ctrldata).encode('utf-8')
        annotation = self.annotation_path.encode('utf-8')

        self.output.write("#Griffin\n")
        if self.wxobj:
            members = sorted(
                attr for attr in dir(self)
                if not callable(getattr(self, attr))
                and not attr.startswith("__"))
            memberstr = "".join(
                "%s = %s, " % (m, getattr(self, m)) for m in members)
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                (ctrl_files, annotation, self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))

        self.output.write("#Data: %s\n" % (ctrl_files))
        self.output.write("#Annotation path: %s\n" % annotation)
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))

        row_format = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%1.1f\t%1.5f\t%1.5f\n"
        for (gene, exprun, pval, padj) in results:
            fields = (gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r,
                      gene.s, gene.t, exprun, pval, padj)
            self.output.write(row_format % fields)

        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Griffin")
        self.finish()
        self.transit_message("Finished Griffin Method")

예제 #32

0

파일 보기

    def Run(self):
        """Run the rank-product analysis and write the report.

        Loads the control and experimental datasets, optionally normalizes
        them, computes per-replicate mean counts per gene, ranks the
        per-replicate log2 fold changes and combines the ranks into an
        observed rank product per gene. A permutation null distribution of
        rank products (``self.samples`` draws) yields an expected value and
        a q-value estimate (expected value divided by the gene's rank) for
        each gene. One tab-separated row per gene is written to
        ``self.output``.
        """
        self.transit_message("Starting rankproduct Method")
        start_time = time.time()

        Kctrl = len(self.ctrldata)
        Kexp = len(self.expdata)

        # Get orf data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(
            self.ctrldata + self.expdata, wxobj=self.wxobj)
        if self.normalization != "none":
            self.transit_message("Normalizing using: %s" % self.normalization)

            (data, factors) = norm_tools.normalize_data(
                data, self.normalization, self.ctrldata + self.expdata,
                self.annotation_path)

        # The first Kctrl rows of `data` are the control datasets, the
        # remaining rows the experimental ones.
        Gctrl = tnseq_tools.Genes(self.ctrldata + self.expdata,
                                  self.annotation_path,
                                  ignoreCodon=self.ignoreCodon,
                                  nterm=self.NTerminus,
                                  cterm=self.CTerminus,
                                  data=data[:Kctrl, :],
                                  position=position)

        Gexp = tnseq_tools.Genes(self.ctrldata + self.expdata,
                                 self.annotation_path,
                                 ignoreCodon=self.ignoreCodon,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus,
                                 data=data[Kctrl:, :],
                                 position=position)

        Ngenes = len(Gctrl)

        # Get the average counts for all the genes, in each replicate.
        # Columns stay zero for genes without any non-zero read.
        meanCtrl = numpy.zeros((Kctrl, Ngenes))
        meanExp = numpy.zeros((Kexp, Ngenes))

        for i in range(Ngenes):
            if numpy.any(Gctrl[i].reads):
                meanCtrl[:, i] = numpy.mean(Gctrl[i].reads, 1)
            else:
                meanCtrl[:, i] = numpy.zeros(Kctrl)
            #
            if numpy.any(Gexp[i].reads):
                meanExp[:, i] = numpy.mean(Gexp[i].reads, 1)
            else:
                meanExp[:, i] = numpy.zeros(Kexp)

        # Calculate a logFC2 between Experimental and Control,
        # then its rank within each replicate, and the observed rank product.
        # NOTE(review): the element-wise ratio presumably requires Kexp and
        # Kctrl to be broadcast-compatible - confirm against callers.
        logFC2 = numpy.log2((meanExp + 0.0001) / (meanCtrl + 0.0001))
        rank = numpy.array([scipy.stats.rankdata(Lvec) for Lvec in logFC2])
        obsRP = numpy.power(numpy.prod(rank, 0), 1.0 / Kctrl)

        # Null distribution: rank products of independently permuted ranks.
        # numpy.array replaces the deprecated scipy.array alias.
        permutations = numpy.zeros((self.samples, Ngenes))
        tempranks = numpy.array(
            [numpy.arange(1, Ngenes + 1) for rep in range(Kctrl)])
        for s in range(self.samples):
            rankperm = numpy.array(
                [numpy.random.permutation(tr) for tr in tempranks])
            permutations[s] = numpy.power(numpy.prod(rankperm, 0), 1.0 / Kctrl)

        # Rank of each gene's observed rank product (1 = smallest).
        # argsort() returns the permutation that sorts the array, not the
        # rank of each element, so it cannot be used as the divisor below.
        rankRP = scipy.stats.rankdata(obsRP)

        # rankproduct; `data` is reused from here on as the list of rows.
        data = []
        count = 0
        self.progress_range(Ngenes)
        for i, gene in enumerate(Gctrl):
            count += 1

            meanctrl = numpy.mean(Gctrl[i].reads)
            meanexp = numpy.mean(Gexp[i].reads)
            log2fc = numpy.log2((meanexp + 0.0001) / (meanctrl + 0.0001))
            # Number of permuted rank products at least as small as observed.
            countbetter = numpy.sum(permutations <= obsRP[i])

            pval = countbetter / float(self.samples * Ngenes)
            e_val = countbetter / float(self.samples)
            q_paper = e_val / float(rankRP[i])

            data.append([
                gene.orf, gene.name, gene.desc, gene.n, meanctrl, meanexp,
                log2fc, obsRP[i], e_val, q_paper, pval
            ])

            # Update Progress
            text = "Running rankproduct Method... %5.1f%%" % (100.0 * count /
                                                              Ngenes)
            self.progress_update(text, count)

        #
        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Performing Benjamini-Hochberg Correction")
        data.sort()
        # NOTE(review): the BH-corrected values are computed but not written
        # to the output below; kept so the call sequence is unchanged.
        q_bh = stat_tools.BH_fdr_correction([row[-1] for row in data])

        # NOTE(review): the .encode('utf-8') calls below are kept as-is; under
        # Python 3 "%s" % bytes renders as b'...'. Confirm the target
        # interpreter before dropping them.
        self.output.write("#RankProduct\n")
        if self.wxobj:
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                (",".join(self.ctrldata).encode('utf-8'),
                 self.annotation_path.encode('utf-8'),
                 self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))

        self.output.write("#Data: %s\n" %
                          (",".join(self.ctrldata).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" %
                          self.annotation_path.encode('utf-8'))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % (columns))

        for i, row in enumerate(data):
            (orf, name, desc, n, mean1, mean2, log2FCgene, obsRPgene, e_val,
             q_paper, pval) = row
            self.output.write(
                "%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.8f\t%1.1f\t%1.8f\n" %
                (orf, name, desc, n, mean1, mean2, log2FCgene, obsRPgene,
                 e_val, q_paper))
        self.output.close()

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="RankProduct")
        self.finish()
        self.transit_message("Finished rankproduct Method")

예제 #33

0

파일 보기

파일: utest.py 프로젝트: wmatern/transit

    def Run(self):
        """Run the Mann-Whitney U-test between control and experimental data.

        Loads the control and experimental datasets together, optionally
        normalizes and LOESS-corrects them, then runs a two-sided
        Mann-Whitney U-test for every gene on the reads of the control
        replicates versus the experimental replicates. The p-values are
        adjusted with ``stat_tools.BH_fdr_correction`` and one tab-separated
        row per gene is written to ``self.output``, which is closed at the end.
        """
        self.transit_message("Starting Mann-Whitney U-test Method")
        start_time = time.time()

        # The first Kctrl rows of the combined matrix are the control replicates.
        Kctrl = len(self.ctrldata)

        # Get orf data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata+self.expdata, wxobj=self.wxobj)

        (K, N) = data.shape

        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata+self.expdata, self.annotation_path)

        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        G = tnseq_tools.Genes(self.ctrldata + self.expdata, self.annotation_path, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data, position=position)

        # u-test
        # From here on `data` holds the per-gene result rows and N the gene count.
        data = []
        N = len(G)
        count = 0
        self.progress_range(N)
        for gene in G:
            count += 1
            if gene.k == 0 or gene.n == 0:
                # Nothing to test for this gene: report neutral values.
                (mean1, mean2, log2FC, u_stat, pval_2tail) = (0, 0, 0, 0.0, 1.00)
            else:
                if not self.includeZeros:
                    # Keep only sites with at least one read in some replicate.
                    ii = numpy.sum(gene.reads, 0) > 0
                else:
                    ii = numpy.ones(gene.n) == 1

                data1 = gene.reads[:Kctrl, ii].flatten()
                data2 = gene.reads[Kctrl:, ii].flatten()
                try:
                    u_stat, pval_2tail = scipy.stats.mannwhitneyu(data1, data2,
                        alternative="two-sided")
                except ValueError:
                    # The test could not be computed; fall back to "no difference".
                    u_stat, pval_2tail = 0.0, 1.00

                mean1 = numpy.mean(data1) if len(data1) > 0 else 0
                mean2 = numpy.mean(data2) if len(data2) > 0 else 0

                try:
                    # Only adjust log2FC if one of the means is zero
                    if mean1 > 0 and mean2 > 0:
                        log2FC = math.log((mean2)/(mean1), 2)
                    else:
                        log2FC = math.log((mean2+1.0)/(mean1+1.0), 2)
                except (ValueError, ArithmeticError):
                    # Math domain / division problems only; do not swallow
                    # KeyboardInterrupt or SystemExit as a bare except would.
                    log2FC = 0.0

            # ["Orf","Name","Desc","Sites","Mean Ctrl","Mean Exp","log2FC", "U-Statistic","p-value","Adj. p-value"]
            data.append([gene.orf, gene.name, gene.desc, gene.n, mean1, mean2, log2FC, u_stat, pval_2tail])

            # Update Progress
            text = "Running Mann-Whitney U-test Method... %1.1f%%" % (100.0*count/N)
            self.progress_update(text, count)

        #
        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Performing Benjamini-Hochberg Correction")
        # Sort the rows first so that qval[i] lines up with the sorted data[i].
        data.sort()
        qval = stat_tools.BH_fdr_correction([row[-1] for row in data])

        self.output.write("#utest\n")
        if self.wxobj:
            self.output.write("#GUI with: norm=%s, includeZeros=%s, output=%s\n" % (self.normalization, self.includeZeros, self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))
        self.output.write("#Control Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
        self.output.write("#Experimental Data: %s\n" % (",".join(self.expdata).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" % (self.annotation_path.encode('utf-8')))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))

        for i, row in enumerate(data):
            (orf, name, desc, n, mean1, mean2, log2FC, u_stat, pval_2tail) = row
            self.output.write("%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.2f\t%1.5f\t%1.5f\n" % (orf, name, desc, n, mean1, mean2, log2FC, u_stat, pval_2tail, qval[i]))
        self.output.close()

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="utest")
        self.finish()
        self.transit_message("Finished Mann-Whitney U-test Method")

예제 #34

0

파일 보기

파일: gumbel.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the Gumbel analysis and write per-gene calls to ``self.output``.

        Reads the control data, optionally normalizes it and builds the gene
        collection. A sampling loop then alternates between a random-walk
        update of ``phi`` (accepted or rejected using ``self.F_non``), a draw
        of the per-gene class ``Z`` via ``self.sample_Z`` and a Beta draw of
        ``w1``. After ``self.burnin`` iterations every ``self.trim``-th
        iteration is stored, until ``self.samples`` slots are filled. The
        mean of each gene's stored ``Z`` values (zbar) is compared with the
        thresholds from ``stat_tools.bayesian_ess_thresholds`` to label the
        gene "E", "U", "NE" or "S".

        If a ``ValueError`` is raised inside the sampling loop the method
        reports it and returns before anything is written to the output.
        """

        self.status_message("Starting Gumbel Method")

        #Set Default parameter values
        # NOTE(review): w0, ALPHA and BETA are not referenced again in this method.
        w1 = 0.15
        w0 = 1.0 - w1
        ALPHA = 1
        BETA = 1
        # Constants added to the two Beta parameters when w1 is resampled below.
        ALPHA_w = 600
        BETA_w = 3400
        # Mean and standard deviation of the Gaussian step added to phi.
        mu_c = 0
        acctot = 0.0
        phi_start = 0.3
        sigma_c = 0.01 
        
        start_time = time.time()
       
        self.progress_range(self.samples+self.burnin)
        
        #Get orf data
        self.transit_message("Reading Annotation")

        #Validate data has empty sites
        #(status, genome) = transit_tools.validate_wig_format(self.ctrldata, wxobj=self.wxobj)
        #if status <2: tn_used = "himar1"
        #else: tn_used = "tn5"

        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
        (K,N) = data.shape

        # Normalization is skipped when the setting is empty or "nonorm".
        if self.normalization and self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata, self.annotation_path)

        G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, minread=self.minread, reps=self.replicates, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data, position=position)

        ii_good = numpy.array([self.good_orf(g) for g in G]) # Gets index of the genes that can be analyzed

        # K and N are rebound here: from the data matrix shape to per-gene
        # arrays restricted to the genes that pass self.good_orf.
        K = G.local_insertions()[ii_good]
        N = G.local_sites()[ii_good]
        R = G.local_runs()[ii_good]
        S = G.local_gap_span()[ii_good]
        T = G.local_gene_span()[ii_good]

        self.transit_message("Doing Regression")
        mu_s, temp, sigma_s = stat_tools.regress(R, S) # Linear regression to estimate mu_s, sigma_s for span data
        mu_r, temp, sigma_r = stat_tools.regress(S, R) # Linear regression to estimate mu_r, sigma_r for run data

        N_GENES = len(G)
        N_GOOD = sum(ii_good)

        self.transit_message("Setting Initial Class")
        # One row per analyzable gene, one column per stored sample.
        Z_sample = numpy.zeros((N_GOOD, self.samples))
        Z = [self.classify(g.n, g.r, 0.5)   for g in G if self.good_orf(g)]
        Z_sample[:,0] = Z
        N_ESS = numpy.sum(Z_sample[:,0] == 1)
        
        phi_sample = numpy.zeros(self.samples) #[]
        phi_sample[0] = phi_start
        phi_old = phi_start
        phi_new = 0.00
        
        # Per-gene factor computed once up front and passed to every sample_Z call.
        SIG = numpy.array([self.sigmoid(g.s, g.t) * scipy.stats.norm.pdf(g.r, mu_r*g.s, sigma_r) for g in G if self.good_orf(g)])

#        idxG,idxN = -1,0
#        for i in range(len(G)):
#          if G[i].name=="glf": idxG = i
#          if ii_good[i]==True: idxN += 1 # could do sum(ii_good[:idxG])

        # i is the next sample slot to fill (slot 0 holds the initial state);
        # count is the total number of iterations performed.
        i = 1; count = 0;
        while i < self.samples:

            try:
                # PHI
                acc = 1.0
                phi_new  = phi_old + random.gauss(mu_c, sigma_c)
                # Genes whose class in the most recently stored sample is 0.
                i0 = Z_sample[:,i-1] == 0
                # Reject the proposal when it leaves (0, 1] or when the change
                # in F_non is below the log of a uniform draw.
                if phi_new > 1 or phi_new <= 0 or (self.F_non(phi_new, N[i0], R[i0]) - self.F_non(phi_old, N[i0], R[i0])) < math.log(random.uniform(0,1)):
                    phi_new = phi_old
                    acc = 0.0
                    flag = 0  # NOTE(review): flag is never read in this method
            
                # Z
                Z = self.sample_Z(phi_new, w1, N, R, S, T, mu_s, sigma_s, SIG)
            
                # w1
                N_ESS = sum(Z == 1)
                w1 = scipy.stats.beta.rvs(N_ESS + ALPHA_w, N_GOOD - N_ESS + BETA_w)
            
                count +=1
                acctot+=acc
            
                # Store a sample only after burn-in, and only every trim-th iteration.
                if (count > self.burnin) and (count % self.trim == 0):
                    phi_sample[i] = phi_new
                    Z_sample[:,i] = Z
                    i+=1

            except ValueError as e:
                self.transit_message("Error: %s" % e) 
                self.transit_message("This is likely to have been caused by poor data (e.g. too sparse).") 
                self.transit_message("If the density of the dataset is too low, the Gumbel method will not work.") 
                self.transit_message("Quitting.") 
                return

#            print i,phi_new,w1,G[idxG].name,N[idxN],R[idxN],Z[idxN]
            
            phi_old = phi_new
            #Update progress
            # NOTE(review): count advances on every iteration but a sample is
            # stored only every trim-th one, so with trim > 1 this percentage
            # can exceed 100 - confirm whether that is intended.
            text = "Running Gumbel Method... %5.1f%%" % (100.0*(count+1)/(self.samples+self.burnin))
            self.progress_update(text, count)


        # Mean of the stored Z values for each analyzable gene.
        ZBAR = numpy.apply_along_axis(numpy.mean, 1, Z_sample)
        (ess_t, non_t) = stat_tools.bayesian_ess_thresholds(ZBAR)

        #Orf    k   n   r   s   zbar
        self.output.write("#Gumbel\n")
        if self.wxobj:
            # NOTE(review): memberstr is built here but never written out.
            members = sorted([attr for attr in dir(self) if not callable(getattr(self,attr)) and not attr.startswith("__")])
            memberstr = ""
            for m in members:
                memberstr += "%s = %s, " % (m, getattr(self, m))
            self.output.write("#GUI with: ctrldata=%s, annotation=%s, output=%s, samples=%s, minread=%s, trim=%s\n" % (",".join(self.ctrldata).encode('utf-8'), self.annotation_path.encode('utf-8'), self.output.name.encode('utf-8'), self.samples, self.minread, self.trim))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))

        self.output.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8'))) 
        self.output.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8')) 
        self.output.write("#FDR Corrected thresholds: %f, %f\n" % (ess_t, non_t))
        # NOTE(review): count is 0 if the loop above never ran (self.samples <= 1),
        # which would make this division fail - confirm callers prevent that.
        self.output.write("#MH Acceptance-Rate:\t%2.2f%%\n" % (100.0*acctot/count))
        self.output.write("#Total Iterations Performed:\t%d\n" % count)
        self.output.write("#Sample Size:\t%d\n" % i)
        self.output.write("#phi estimate:\t%f\n" % numpy.average(phi_sample))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))
        # i is reused as the index into ZBAR, which only has entries for
        # genes that pass self.good_orf.
        i = 0
        data = []
        for g in G:
            if not self.good_orf(g):
                # Genes that were not analyzed get zbar = -1 and fall through to "S".
                zbar = -1.0
            else:
                zbar = ZBAR[i]
                i+=1
            if zbar > ess_t:
                call = "E"
            elif non_t <= zbar <= ess_t:
                call = "U"
            elif 0 <= zbar < non_t:
                call = "NE"
            else:
                call = "S"
            data.append("%s\t%s\t%s\t%d\t%d\t%d\t%d\t%f\t%s\n" % (g.orf, g.name, g.desc, g.k, g.n, g.r, g.s, zbar, call))
        # Rows are formatted strings, so this orders the output lexicographically.
        data.sort()
        for line in data:
            self.output.write(line)
        self.output.close()

        self.transit_message("") # Printing empty line to flush stdout 
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Gumbel")
        self.finish()
        self.transit_message("Finished Gumbel Method")

예제 #35

0

파일 보기

    def Run(self):
        """Run the Gumbel analysis and write per-gene calls to ``self.output``.

        Reads the control data, optionally normalizes it and builds the gene
        collection. A sampling loop then alternates between a random-walk
        update of ``phi`` (accepted or rejected using ``self.F_non``), a draw
        of the per-gene class ``Z`` via ``self.sample_Z`` and a Beta draw of
        ``w1``. After ``self.burnin`` iterations every ``self.trim``-th
        iteration is stored, until ``self.samples`` slots are filled. The
        mean of each gene's stored ``Z`` values (zbar) is compared with the
        thresholds from ``stat_tools.bayesian_ess_thresholds`` to label the
        gene "E", "U", "NE" or "S".

        If a ``ValueError`` is raised inside the sampling loop the method
        reports it and returns before anything is written to the output.
        """

        self.status_message("Starting Gumbel Method")

        #Set Default parameter values
        # NOTE(review): w0, ALPHA and BETA are not referenced again in this method.
        w1 = 0.15
        w0 = 1.0 - w1
        ALPHA = 1
        BETA = 1
        # Constants added to the two Beta parameters when w1 is resampled below.
        ALPHA_w = 600
        BETA_w = 3400
        # Mean and standard deviation of the Gaussian step added to phi.
        mu_c = 0
        acctot = 0.0
        phi_start = 0.3
        sigma_c = 0.01

        start_time = time.time()

        self.progress_range(self.samples + self.burnin)

        #Get orf data
        self.transit_message("Reading Annotation")

        #Validate data has empty sites
        #(status, genome) = transit_tools.validate_wig_format(self.ctrldata, wxobj=self.wxobj)
        #if status <2: tn_used = "himar1"
        #else: tn_used = "tn5"

        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)
        (K, N) = data.shape

        # Normalization is skipped when the setting is empty or "nonorm".
        if self.normalization and self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             factors) = norm_tools.normalize_data(data, self.normalization,
                                                  self.ctrldata,
                                                  self.annotation_path)

        G = tnseq_tools.Genes(self.ctrldata,
                              self.annotation_path,
                              minread=self.minread,
                              reps=self.replicates,
                              ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus,
                              cterm=self.CTerminus,
                              data=data,
                              position=position)

        ii_good = numpy.array(
            [self.good_orf(g)
             for g in G])  # Gets index of the genes that can be analyzed

        # K and N are rebound here: from the data matrix shape to per-gene
        # arrays restricted to the genes that pass self.good_orf.
        K = G.local_insertions()[ii_good]
        N = G.local_sites()[ii_good]
        R = G.local_runs()[ii_good]
        S = G.local_gap_span()[ii_good]
        T = G.local_gene_span()[ii_good]

        self.transit_message("Doing Regression")
        mu_s, temp, sigma_s = stat_tools.regress(
            R, S)  # Linear regression to estimate mu_s, sigma_s for span data
        mu_r, temp, sigma_r = stat_tools.regress(
            S, R)  # Linear regression to estimate mu_r, sigma_r for run data

        N_GENES = len(G)
        N_GOOD = sum(ii_good)

        self.transit_message("Setting Initial Class")
        # One row per analyzable gene, one column per stored sample.
        Z_sample = numpy.zeros((N_GOOD, self.samples))
        Z = [self.classify(g.n, g.r, 0.5) for g in G if self.good_orf(g)]
        Z_sample[:, 0] = Z
        N_ESS = numpy.sum(Z_sample[:, 0] == 1)

        phi_sample = numpy.zeros(self.samples)  #[]
        phi_sample[0] = phi_start
        phi_old = phi_start
        phi_new = 0.00

        # Per-gene factor computed once up front and passed to every sample_Z call.
        SIG = numpy.array([
            self.sigmoid(g.s, g.t) *
            scipy.stats.norm.pdf(g.r, mu_r * g.s, sigma_r) for g in G
            if self.good_orf(g)
        ])

        #        idxG,idxN = -1,0
        #        for i in range(len(G)):
        #          if G[i].name=="glf": idxG = i
        #          if ii_good[i]==True: idxN += 1 # could do sum(ii_good[:idxG])

        # i is the next sample slot to fill (slot 0 holds the initial state);
        # count is the total number of iterations performed.
        i = 1
        count = 0
        while i < self.samples:

            try:
                # PHI
                acc = 1.0
                phi_new = phi_old + random.gauss(mu_c, sigma_c)
                # Genes whose class in the most recently stored sample is 0.
                i0 = Z_sample[:, i - 1] == 0
                # Reject the proposal when it leaves (0, 1] or when the change
                # in F_non is below the log of a uniform draw.
                if phi_new > 1 or phi_new <= 0 or (
                        self.F_non(phi_new, N[i0], R[i0]) -
                        self.F_non(phi_old, N[i0], R[i0])) < math.log(
                            random.uniform(0, 1)):
                    phi_new = phi_old
                    acc = 0.0
                    flag = 0  # NOTE(review): flag is never read in this method

                # Z
                Z = self.sample_Z(phi_new, w1, N, R, S, T, mu_s, sigma_s, SIG)

                # w1
                N_ESS = sum(Z == 1)
                w1 = scipy.stats.beta.rvs(N_ESS + ALPHA_w,
                                          N_GOOD - N_ESS + BETA_w)

                count += 1
                acctot += acc

                # Store a sample only after burn-in, and only every trim-th iteration.
                if (count > self.burnin) and (count % self.trim == 0):
                    phi_sample[i] = phi_new
                    Z_sample[:, i] = Z
                    i += 1

            except ValueError as e:
                self.transit_message("Error: %s" % e)
                self.transit_message(
                    "This is likely to have been caused by poor data (e.g. too sparse)."
                )
                self.transit_message(
                    "If the density of the dataset is too low, the Gumbel method will not work."
                )
                self.transit_message("Quitting.")
                return


#            print i,phi_new,w1,G[idxG].name,N[idxN],R[idxN],Z[idxN]

            phi_old = phi_new
            #Update progress
            # NOTE(review): count advances on every iteration but a sample is
            # stored only every trim-th one, so with trim > 1 this percentage
            # can exceed 100 - confirm whether that is intended.
            text = "Running Gumbel Method... %5.1f%%" % (
                100.0 * (count + 1) / (self.samples + self.burnin))
            self.progress_update(text, count)

        # Mean of the stored Z values for each analyzable gene.
        ZBAR = numpy.apply_along_axis(numpy.mean, 1, Z_sample)
        (ess_t, non_t) = stat_tools.bayesian_ess_thresholds(ZBAR)

        #Orf    k   n   r   s   zbar
        self.output.write("#Gumbel\n")
        if self.wxobj:
            # NOTE(review): memberstr is built here but never written out.
            members = sorted([
                attr for attr in dir(self) if not callable(getattr(self, attr))
                and not attr.startswith("__")
            ])
            memberstr = ""
            for m in members:
                memberstr += "%s = %s, " % (m, getattr(self, m))
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s, samples=%s, minread=%s, trim=%s\n"
                % (",".join(self.ctrldata).encode('utf-8'),
                   self.annotation_path.encode('utf-8'),
                   self.output.name.encode('utf-8'), self.samples,
                   self.minread, self.trim))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))

        self.output.write("#Data: %s\n" %
                          (",".join(self.ctrldata).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" %
                          self.annotation_path.encode('utf-8'))
        self.output.write("#FDR Corrected thresholds: %f, %f\n" %
                          (ess_t, non_t))
        # NOTE(review): count is 0 if the loop above never ran (self.samples <= 1),
        # which would make this division fail - confirm callers prevent that.
        self.output.write("#MH Acceptance-Rate:\t%2.2f%%\n" %
                          (100.0 * acctot / count))
        self.output.write("#Total Iterations Performed:\t%d\n" % count)
        self.output.write("#Sample Size:\t%d\n" % i)
        self.output.write("#phi estimate:\t%f\n" % numpy.average(phi_sample))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))
        # i is reused as the index into ZBAR, which only has entries for
        # genes that pass self.good_orf.
        i = 0
        data = []
        for g in G:
            if not self.good_orf(g):
                # Genes that were not analyzed get zbar = -1 and fall through to "S".
                zbar = -1.0
            else:
                zbar = ZBAR[i]
                i += 1
            if zbar > ess_t:
                call = "E"
            elif non_t <= zbar <= ess_t:
                call = "U"
            elif 0 <= zbar < non_t:
                call = "NE"
            else:
                call = "S"
            data.append(
                "%s\t%s\t%s\t%d\t%d\t%d\t%d\t%f\t%s\n" %
                (g.orf, g.name, g.desc, g.k, g.n, g.r, g.s, zbar, call))
        # Rows are formatted strings, so this orders the output lexicographically.
        data.sort()
        for line in data:
            self.output.write(line)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Gumbel")
        self.finish()
        self.transit_message("Finished Gumbel Method")

예제 #36

0

파일 보기

파일: zinb.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the ZINB analysis and write the per-gene results table.

        Checks that the required R packages are installed (exiting with
        status 1 otherwise), reads and normalizes the combined wig, maps the
        samples metadata onto the wig columns, filters the samples by
        condition, computes per-gene statistics and runs ``self.run_zinb``.
        The result is written to the path in ``self.output`` as a
        tab-separated table with one row per gene.
        """
        # Imported locally so this method does not rely on a module-level import.
        import functools

        self.transit_message("Starting ZINB analysis")
        start_time = time.time()
        packnames = ("MASS", "pscl")
        r_packages_needed = [x for x in packnames if not rpackages.isinstalled(x)]
        if r_packages_needed:
            self.transit_error(
                    "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))"
                    % ({'0': '"{0}"'.format('", "'.join(r_packages_needed))}))
            sys.exit(1)

        self.transit_message("Getting Data")
        (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        condition_name = self.condition
        conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions, condition_name=condition_name)

        ## [Condition] in the order of files in combined wig
        conditions = self.wigs_to_conditions(
            conditionsByFile,
            filenamesInCombWig)
        ## [Covariate] in the order of files in combined wig
        covariates = self.wigs_to_covariates(
            covariatesByFileList,
            filenamesInCombWig)
        ## [Interaction] in the order of files in combined wig
        interactions = self.wigs_to_interactions(
            interactionsByFileList,
            filenamesInCombWig)
        data, conditions, covariates, interactions = self.filter_wigs_by_conditions(
                data,
                conditions,
                covariates = covariates,
                interactions = interactions,
                ignored_conditions = self.ignored_conditions,
                included_conditions = self.included_conditions)

        genes = tnseq_tools.read_genes(self.annotation_path)

        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
        statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap, genes, conditions, interactions)
        LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

        self.transit_message("Running ZINB")
        pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap, conditions, covariates, interactions)

        def orderStats(x, y):
            """cmp-style comparator for stat group names ("cond" or "cond_interaction")."""
            ic1 = x.split("_")
            ic2 = y.split("_")
            c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
            c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

            # Order by the include list when it was provided, otherwise by
            # the order found in the samples metadata.
            if len(self.included_conditions) > 0:
                conditionOrder = self.included_conditions
            else:
                conditionOrder = orderingMetadata['condition']

            condDiff = conditionOrder.index(c1) - conditionOrder.index(c2)
            ## Order by interaction, if stat belongs to same condition
            if condDiff == 0 and i1 is not None and i2 is not None:
                return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2))
            return condDiff

        # sorted() only takes a positional cmp function on Python 2;
        # cmp_to_key gives the same ordering on both Python 2.7 and Python 3.
        orderedStatGroupNames = sorted(statGroupNames, key=functools.cmp_to_key(orderStats))

        self.transit_message("Adding File: %s" % (self.output))
        # Comprehensions instead of map(): map() returns an iterator on
        # Python 3 and cannot be concatenated to a list.
        head = ("Rv Gene TAs".split() +
                ["Mean_" + v for v in orderedStatGroupNames] +
                ["NZmean_" + v for v in orderedStatGroupNames] +
                ["NZperc_" + v for v in orderedStatGroupNames] +
                "pval padj".split() + ["status"])

        # The context manager closes the file even if writing a row fails.
        with open(self.output, "w") as outfile:
            outfile.write("#Console: python %s\n" % " ".join(sys.argv))
            outfile.write('\t'.join(head)+EOL)
            for gene in genes:
                Rv = gene["rv"]
                vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                        ["%0.2f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames] +
                        ["%0.2f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames] +
                        ["%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames] +
                        ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]]
                outfile.write('\t'.join(vals)+EOL)
        self.transit_message("Finished Zinb analysis")
        self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))

예제 #37

0

파일 보기

파일: zinb.py 프로젝트: ywf1215/transit

    def Run(self):
        """Run the ZINB analysis and write the per-gene results table.

        Checks that the required R packages are installed (exiting with
        status 1 otherwise), reads and normalizes the combined wig, maps the
        samples metadata onto the wig columns, filters the samples by
        condition, computes per-gene statistics and runs ``self.run_zinb``.
        The result is written to the path in ``self.output`` as a
        tab-separated table with one row per gene; in the column headers
        ``SEPARATOR`` inside a stat group name is replaced by ``_``.
        """
        self.transit_message("Starting ZINB analysis")
        start_time = time.time()
        packnames = ("MASS", "pscl")
        r_packages_needed = [x for x in packnames if not rpackages.isinstalled(x)]
        if r_packages_needed:
            self.transit_error(
                    "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))"
                    % ({'0': '"{0}"'.format('", "'.join(r_packages_needed))}))
            sys.exit(1)

        self.transit_message("Getting Data")
        (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        condition_name = self.condition
        conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions, condition_name=condition_name)

        ## [Condition] in the order of files in combined wig
        conditions = self.wigs_to_conditions(
            conditionsByFile,
            filenamesInCombWig)
        ## [Covariate] in the order of files in combined wig
        covariates = self.wigs_to_covariates(
            covariatesByFileList,
            filenamesInCombWig)
        ## [Interaction] in the order of files in combined wig
        interactions = self.wigs_to_interactions(
            interactionsByFileList,
            filenamesInCombWig)
        data, conditions, covariates, interactions = self.filter_wigs_by_conditions(
                data,
                conditions,
                covariates = covariates,
                interactions = interactions,
                ignored_conditions = self.ignored_conditions,
                included_conditions = self.included_conditions)

        genes = tnseq_tools.read_genes(self.annotation_path)

        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
        statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap, genes, conditions, interactions)
        LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

        self.transit_message("Running ZINB")
        pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap, conditions, covariates, interactions)

        def orderStats(x, y):
            """cmp-style comparator for stat group names (condition, optional interaction)."""
            ic1 = x.split(SEPARATOR)
            ic2 = y.split(SEPARATOR)
            c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
            c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

            # Order by the include list when it was provided, otherwise by
            # the order found in the samples metadata.
            if len(self.included_conditions) > 0:
                conditionOrder = self.included_conditions
            else:
                conditionOrder = orderingMetadata['condition']

            condDiff = conditionOrder.index(c1) - conditionOrder.index(c2)
            ## Order by interaction, if stat belongs to same condition
            if condDiff == 0 and i1 is not None and i2 is not None:
                return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2))
            return condDiff

        orderedStatGroupNames = sorted(statGroupNames, key=functools.cmp_to_key(orderStats))
        headersStatGroupNames = [x.replace(SEPARATOR,'_') for x in orderedStatGroupNames]

        self.transit_message("Adding File: %s" % (self.output))
        head = ("Rv Gene TAs".split() +
                ["Mean_" + v for v in headersStatGroupNames] +
                ["NZmean_" + v for v in headersStatGroupNames] +
                ["NZperc_" + v for v in headersStatGroupNames] +
                "pval padj".split() + ["status"])

        # The context manager closes the file even if writing a row fails,
        # and avoids shadowing the name `file`.
        with open(self.output, "w") as outfile:
            outfile.write("#Console: python %s\n" % " ".join(sys.argv))
            outfile.write('\t'.join(head)+EOL)
            for gene in genes:
                Rv = gene["rv"]
                # Lookups use the original group names; only headers are rewritten.
                vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                        ["%0.2f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames] +
                        ["%0.2f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames] +
                        ["%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames] +
                        ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]]
                outfile.write('\t'.join(vals)+EOL)
        self.transit_message("Finished Zinb analysis")
        self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))

예제 #38

0

파일 보기

파일: zinb.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the ZINB analysis and write the per-gene results table.

        The method performs these steps in order:

        1. Exit with status 1 (after reporting through ``transit_error``) when
           the R packages "MASS" or "pscl" are reported as not installed.
        2. Read the combined wig file (``self.combined_wig``) and normalize
           the counts with ``self.normalization``.
        3. Read the samples metadata and map conditions, covariates and
           interactions onto the files of the combined wig, then keep only
           the selected conditions.
        4. Print which samples belong to each condition/covariate/interaction
           value and warn when ``expandVar`` reports an empty combination.
        5. Compute per-gene statistics, run ``run_zinb`` and write one
           tab-separated row per gene to ``self.output`` (a file path).

        Attributes read: ``combined_wig``, ``normalization``, ``condition``,
        ``metadata``, ``covars``, ``interactions``, ``included_conditions``,
        ``ignored_conditions``, ``annotation_path``, ``NTerminus``,
        ``CTerminus``, ``PC`` and ``output``.

        Returns:
            None. Results are written to ``self.output``.
        """
        # Function-scope import: ``numpy.math`` was only an alias for the
        # standard ``math`` module and is no longer available in NumPy 2.x,
        # so the log fold changes below use ``math.log`` directly.
        import math

        self.transit_message("Starting ZINB analysis")
        start_time = time.time()
        packnames = ("MASS", "pscl")
        r_packages_needed = [
            x for x in packnames if not rpackages.isinstalled(x)
        ]
        if r_packages_needed:
            self.transit_error(
                "Error: Following R packages are required: %(0)s. From R console, You can install them using install.packages(c(%(0)s))"
                % ({
                    '0': '"{0}"'.format('", "'.join(r_packages_needed))
                }))
            sys.exit(1)

        self.transit_message("Getting Data")
        (sites, data,
         filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        condition_name = self.condition
        # if a covar is not found, this crashes; check for it?
        conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(
            self.metadata,
            self.covars,
            self.interactions,
            condition_name=condition_name)

        ## [Condition] in the order of files in combined wig
        conditions = self.wigs_to_conditions(conditionsByFile,
                                             filenamesInCombWig)
        ## [Covariate] in the order of files in combined wig
        covariates = self.wigs_to_covariates(covariatesByFileList,
                                             filenamesInCombWig)
        ## [Interaction] in the order of files in combined wig
        interactions = self.wigs_to_interactions(interactionsByFileList,
                                                 filenamesInCombWig)

        conditionsList = self.select_conditions(conditions,
                                                self.included_conditions,
                                                self.ignored_conditions,
                                                orderingMetadata)
        data, conditions, covariates, interactions = self.filter_wigs_by_conditions2(
            data,
            conditions,
            conditionsList,
            covariates=covariates,
            interactions=interactions)

        # Show the samples associated with each condition (and covariates or
        # interactions, if defined), and count samples in each cross-product
        # of variables.
        filesByCondition = self.invertDict(conditionsByFile)
        samples_used = set()
        for cond in conditionsList:
            samples_used.update(filesByCondition[cond])
        # Named ``variables`` so the builtin ``vars`` is not shadowed.
        variables = [condition_name] + self.covars + self.interactions
        vars2vals = {}
        vars2vals[condition_name] = list(set(conditions))
        for i, var in enumerate(self.covars):
            vars2vals[var] = list(set(covariates[i]))
        for i, var in enumerate(self.interactions):
            vars2vals[var] = list(set(interactions[i]))
        varsByFileList = [conditionsByFile
                          ] + covariatesByFileList + interactionsByFileList
        for i, var in enumerate(variables):
            print("\nCondition/Covariate/Interaction: %s" % var)
            filesByVar = self.invertDict(varsByFileList[i])
            for k, v in filesByVar.items():
                samples = list(samples_used.intersection(set(v)))
                # Only report values that survived the condition filtering.
                if k in vars2vals.get(var, []):
                    print("%s: %s" % (k, ' '.join(samples)))
        print("\nsamples in cross-product:")
        any_empty = self.expandVar([], variables, varsByFileList, vars2vals,
                                   set(samples_used))
        if any_empty:
            print(
                "warning: ZINB requires samples in all combinations of conditions; the fact that one is empty could result in Model Errors"
            )

        genes = tnseq_tools.read_genes(self.annotation_path)

        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes,
                                                          TASiteindexMap,
                                                          nterm=self.NTerminus,
                                                          cterm=self.CTerminus)
        statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap,
                                                     genes, conditions,
                                                     interactions)
        LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

        self.transit_message("Running ZINB")
        pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep,
                                                 LogZPercByRep,
                                                 RvSiteindexesMap, conditions,
                                                 covariates, interactions)

        def orderStats(x, y):
            """Old-style comparator for stat group names.

            A name is ``condition`` or ``condition<SEPARATOR>interaction``.
            Conditions are ordered by ``self.included_conditions`` when that
            list is non-empty, otherwise by the samples metadata; ties are
            broken by the interaction order from the samples metadata.
            """
            ic1 = x.split(SEPARATOR)
            ic2 = y.split(SEPARATOR)
            c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
            c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

            if len(self.included_conditions) > 0:
                condOrder = self.included_conditions
            else:
                ## Order by samples metadata, if include flag not provided.
                condOrder = orderingMetadata['condition']

            condDiff = condOrder.index(c1) - condOrder.index(c2)
            ## Order by interaction, if stat belongs to same condition
            if condDiff == 0 and i1 is not None and i2 is not None:
                return (orderingMetadata['interaction'].index(i1) -
                        orderingMetadata['interaction'].index(i2))
            return condDiff

        orderedStatGroupNames = sorted(statGroupNames,
                                       key=functools.cmp_to_key(orderStats))
        headersStatGroupNames = [
            x.replace(SEPARATOR, '_') for x in orderedStatGroupNames
        ]

        self.transit_message("Adding File: %s" % (self.output))
        # With exactly two groups a single LFC column is reported; otherwise
        # there is one LFC column per group.
        if len(headersStatGroupNames) == 2:
            lfcNames = ["LFC"]
        else:
            lfcNames = ["LFC_" + v for v in headersStatGroupNames]
        head = ("Rv Gene TAs".split() +
                ["Mean_" + v for v in headersStatGroupNames] +
                lfcNames +
                ["NZmean_" + v for v in headersStatGroupNames] +
                ["NZperc_" + v for v in headersStatGroupNames] +
                "pval padj".split() + ["status"])

        PC = self.PC
        # The context manager closes the file even if a gene row fails to
        # format, which the manual open()/close() pair did not guarantee.
        with open(self.output, "w") as out:
            out.write("#Console: python3 %s\n" % " ".join(sys.argv))
            out.write(
                "#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n"
                % (self.normalization, self.NTerminus, self.CTerminus, PC))
            out.write('#' + '\t'.join(head) + EOL)
            for gene in genes:
                Rv = gene["rv"]
                stats = statsByRv[Rv]
                means = [stats['mean'][group] for group in orderedStatGroupNames]
                if len(means) == 2:
                    # Two groups: fold change of the second relative to the first.
                    LFCs = [math.log((means[1] + PC) / (means[0] + PC), 2)]
                else:
                    # Otherwise: fold change of each group relative to the
                    # mean across groups.
                    m = numpy.mean(means)
                    LFCs = [math.log((x + PC) / (m + PC), 2) for x in means]
                vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                        ["%0.1f" % x for x in means] +
                        ["%0.3f" % x for x in LFCs] +
                        ["%0.1f" % stats['nz_mean'][group]
                         for group in orderedStatGroupNames] +
                        ["%0.2f" % stats['nz_perc'][group]
                         for group in orderedStatGroupNames] +
                        ["%f" % x for x in [pvals[Rv], qvals[Rv]]] +
                        [run_status[Rv]])
                out.write('\t'.join(vals) + EOL)
        self.transit_message("Finished Zinb analysis")
        self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))

예제 #39

0

파일 보기

파일: test_pytransit_tools.py 프로젝트: ywf1215/transit

 def test_normalization(self):
     # Normalizing with "TTR" must produce factors that are not all exactly 1.
     num_datasets = len(all_data_list)
     counts, _positions = tnseq_tools.get_data(all_data_list)
     _normalized, factors = norm_tools.normalize_data(counts, "TTR")
     factors_all_one = (factors == numpy.ones(num_datasets)).all()
     self.assertFalse(factors_all_one)

예제 #40

0

파일 보기

파일: griffin.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the Griffin method and write one result row per gene.

        Steps:

        1. Load validated counts for ``self.ctrldata`` and, unless the
           normalization is unset or "nonorm", normalize them.
        2. Build the ``tnseq_tools.Genes`` collection from the data.
        3. For every gene compute an expected run length and a p-value.
           Genes with ``n == 0`` get an expected run of 0.0 and a p-value of
           1.0; otherwise the p-value is ``1 - GumbelCDF(gene.r, u, B)`` with
           ``B = 1 / log(1 / pnon)`` and ``u = log(n * pins)`` taken in base
           ``1 / pnon``, where ``pins = G.global_theta()`` and
           ``pnon = 1 - pins``.
        4. Adjust the p-values with ``stat_tools.BH_fdr_correction``, sort
           the rows and write them, preceded by a commented header, to
           ``self.output`` (an already-open file object), which is then
           closed.

        Returns:
            None. Results are written to ``self.output``.
        """
        self.transit_message("Starting Griffin Method")
        start_time = time.time()

        #Get orf data
        self.transit_message("Getting Data")

        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)
        # Kept from the original: the two-name unpacking fails early if the
        # data does not have exactly two dimensions.
        (K, N) = data.shape

        if self.normalization and self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             factors) = norm_tools.normalize_data(data, self.normalization,
                                                  self.ctrldata,
                                                  self.annotation_path)

        G = tnseq_tools.Genes(self.ctrldata,
                              self.annotation_path,
                              minread=1,
                              reps=self.replicates,
                              ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus,
                              cterm=self.CTerminus,
                              data=data,
                              position=position)

        # From here on N is the number of genes, not a data dimension.
        N = len(G)
        self.progress_range(N)
        pins = G.global_theta()
        pnon = 1.0 - pins
        results = []
        for count, gene in enumerate(G):
            if gene.n == 0:
                results.append([gene, 0.0, 1.000])
            else:
                B = 1.0 / math.log(1.0 / pnon)
                u = math.log(gene.n * pins, 1.0 / pnon)
                exprun = tnseq_tools.ExpectedRuns(gene.n, pnon)
                pval = 1.0 - tnseq_tools.GumbelCDF(gene.r, u, B)
                results.append([gene, exprun, pval])

            text = "Running Griffin Method... %5.1f%%" % (100.0 * (count + 1) /
                                                          (N))
            self.progress_update(text, count)

        # Append the adjusted p-value as the last column of every row.
        pvals = [row[-1] for row in results]
        padj = stat_tools.BH_fdr_correction(pvals)
        for row, adjusted in zip(results, padj):
            row.append(adjusted)
        # Rows are lists starting with the gene object, so the sort order is
        # determined by how gene objects compare.
        results.sort()

        # Header. The strings are written as text: formatting the result of
        # ``str.encode`` with %s under Python 3 would emit ``b'...'`` reprs.
        self.output.write("#Griffin\n")
        if self.wxobj:
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                (",".join(self.ctrldata), self.annotation_path,
                 self.output.name))
        else:
            self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

        self.output.write("#Data: %s\n" % (",".join(self.ctrldata)))
        self.output.write("#Annotation path: %s\n" % self.annotation_path)
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        # ``columns`` is defined outside this method.
        self.output.write("#%s\n" % "\t".join(columns))

        for (gene, exprun, gene_pval, gene_padj) in results:
            self.output.write(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%1.1f\t%1.5f\t%1.5f\n" %
                (gene.orf, gene.name, gene.desc, gene.k, gene.n, gene.r,
                 gene.s, gene.t, exprun, gene_pval, gene_padj))

        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Griffin")
        self.finish()
        self.transit_message("Finished Griffin Method")

예제 #41

0

파일 보기

파일: binomial.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the Binomial method sampler and write per-gene results.

        Outline of what the code below does:

        1. Load validated counts for ``self.ctrldata``, optionally normalize
           them, and build the ``tnseq_tools.Genes`` collection ``G``.
        2. Allocate per-iteration arrays for ``self.samples + self.burnin``
           iterations and set initial values for ``theta`` (per gene), the
           indicator ``Z`` (per gene) and the scalars rho0/Kp0, rho1/Kp1, w1.
        3. On every iteration: draw ``theta`` for the genes with Z == 0 and
           Z == 1 from beta distributions, propose new rho0, Kp0, rho1 and
           Kp1 values by adding normal noise and accept or reject each one by
           comparing ``log(uniform)`` with a difference of log densities,
           redraw ``Z`` from Bernoulli(p1) and redraw ``w1`` from a beta
           distribution.
        4. Average ``Z`` and ``theta`` over the iterations after the burn-in,
           obtain thresholds from ``stat_tools.bayesian_ess_thresholds`` and
           label every gene "Essential", "Non-Essential" or "Uncertain".
        5. Write a commented header and the sorted rows to ``self.output``
           (an already-open file object) and close it.

        NOTE(review): the ``print >> sys.stderr, ...`` statements are
        Python 2 syntax; confirm which interpreter this module targets.

        Returns:
            None. Results are written to ``self.output``.
        """

        self.transit_message("Starting Binomial Method")
        start_time = time.time()
        
        self.progress_range(self.samples+self.burnin)

        #Get orf data
        #self.transit_message("Getting Data")
        #G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus)

        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
        # K and N are rebound below to per-gene arrays; here they only unpack the data shape.
        (K,N) = data.shape

        if self.normalization and self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata, self.annotation_path)

        G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, minread=1, reps=self.replicates, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data, position=position)



        #Parameters
        self.transit_message("Setting Parameters")
        # w1 is overwritten by a beta draw further down; w0 and mu_c are not
        # used again in this method.
        w1 = 0.15
        w0 = 1.0 - w1
        mu_c = 0

        Ngenes = len(G)
        sample_size = self.samples+self.burnin
        numReps = len(self.ctrldata)

        # One row per gene, one column per iteration.
        theta = numpy.zeros((Ngenes, sample_size))
        theta[:,0] = 0.10

        # Per-iteration traces of the scalar parameters, with their starting values.
        rho0 = numpy.zeros(sample_size); rho0[0] = 0.5;  Kp0 = numpy.zeros(sample_size); Kp0[0] = 10;
        rho1 = numpy.zeros(sample_size); rho1[0] = 0.10; Kp1 = numpy.zeros(sample_size); Kp1[0] = 3;

        Z = numpy.zeros((Ngenes, sample_size))
        pz1 = numpy.zeros(sample_size);
        n1 = 0

        w1 = scipy.stats.beta.rvs(self.alpha_w, self.beta_w)
        W1 = numpy.zeros(sample_size); W1[0] = w1



        #
        self.transit_message("Setting Initial Values")
        # K[g]: number of entries in gene g's reads that are > 0.
        # N[g]: total number of entries in gene g's reads.
        K = numpy.array([sum([1 for x in gene.reads.flatten() if x> 0]) for gene in G])
        N = numpy.array([len(gene.reads.flatten()) for gene in G])

        for g,gene in enumerate(G):
            # Initial theta is the observed fraction K/N, with fixed values
            # for the degenerate cases.
            # NOTE(review): the K/N == 1 branch uses 0.001, the same value as
            # the K/N == 0 branch -- confirm this is intended.
            if N[g] == 0: theta[g][0] = 0.5
            elif K[g]/float(N[g]) == 0: theta[g][0] = 0.001
            elif K[g]/float(N[g]) == 1: theta[g][0] = 0.001
            else: theta[g][0] = K[g]/float(N[g])

            #print g, ORF[g], K[g], N[g], theta[g][0]
            Z[g][0] = scipy.stats.bernoulli.rvs(1-theta[g][0])


        # Counters of accepted proposals, reported in the output header.
        acc_p0 = 0; acc_k0 = 0;
        acc_p1 = 0; acc_k1 = 0;


        # Standard deviations of the normal noise added to form each proposal.
        rho0c_std = 0.010
        kp0c_std = 1.40
        rho1c_std = 0.009
        kp1c_std = 1.1


        # Silence divide warnings for the duration of the loop; restored to 'warn' afterwards.
        numpy.seterr(divide='ignore')
        for i in range(1, sample_size):

            # Split the genes by their indicator value from the previous iteration.
            i0 = Z[:,i-1] == 0; n0 = numpy.sum(i0);
            i1 = Z[:,i-1] == 1; n1 = numpy.sum(i1);

            # Draw theta for each group from a beta distribution whose parameters
            # combine the previous rho/Kp values with the per-gene counts K and N - K.
            theta[i0,i] = scipy.stats.beta.rvs(Kp0[i-1]*rho0[i-1] + K[i0],  Kp0[i-1]*(1-rho0[i-1]) + N[i0] - K[i0])
            theta[i1,i] = scipy.stats.beta.rvs(Kp1[i-1]*rho1[i-1] + K[i1],  Kp1[i-1]*(1-rho1[i-1]) + N[i1] - K[i1])
            
            # Proposals for the Z == 0 group: previous value plus normal noise.
            rho0_c = rho0[i-1] + scipy.stats.norm.rvs(0, rho0c_std)
            Kp0_c = Kp0[i-1] + scipy.stats.norm.rvs(0, kp0c_std)


            # Non-positive proposals are rejected outright.
            # NOTE(review): proposals >= 1 are not rejected here -- confirm.
            if rho0_c <= 0: rho0[i] = rho0[i-1]
            else:
                # fc / f0: log density of the candidate / current rho0 (beta
                # term from M0, pi0 plus the beta log density of this
                # iteration's theta for the Z == 0 genes).
                fc = numpy.log(scipy.stats.beta.pdf(rho0_c, self.M0*self.pi0, self.M0*(1.0-self.pi0)))
                f0 = numpy.log(scipy.stats.beta.pdf(rho0[i-1], self.M0*self.pi0, self.M0*(1.0-self.pi0)))
                fc += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i0,i], Kp0[i-1]*rho0_c, Kp0[i-1]*(1-rho0_c))))
                f0 += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i0,i], Kp0[i-1]*rho0[i-1], Kp0[i-1]*(1-rho0[i-1]))))
    
                # Accept the candidate when log(uniform) falls below the log-density difference.
                if numpy.log(scipy.stats.uniform.rvs()) < fc - f0:
                    rho0[i] = rho0_c
                    acc_p0+=1
                else: rho0[i] = rho0[i-1]


            if Kp0_c <= 0: Kp0[i] = Kp0[i-1]
            else:
                # NOTE(review): scipy.stats.gamma.pdf(x, a, loc) treats the third
                # positional argument as ``loc``; confirm b0 is meant as a location.
                fc = numpy.log(scipy.stats.gamma.pdf(Kp0_c, self.a0, self.b0));
                f0 = numpy.log(scipy.stats.gamma.pdf(Kp0[i-1], self.a0, self.b0));
                # Uses rho0[i], i.e. the value just decided above.
                fc += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i0,i], Kp0_c*rho0[i], Kp0_c*(1-rho0[i]))))
                f0 += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i0,i], Kp0[i-1]*rho0[i], Kp0[i-1]*(1-rho0[i]))))
    
                if numpy.log(scipy.stats.uniform.rvs()) < fc - f0:
                    Kp0[i] = Kp0_c
                    acc_k0+=1
                else: Kp0[i] = Kp0[i-1]

            # Same two accept/reject steps for the Z == 1 group.
            rho1_c = rho1[i-1] + scipy.stats.norm.rvs(0, rho1c_std)
            Kp1_c = Kp1[i-1] + scipy.stats.norm.rvs(0, kp1c_std)


            if rho1_c <= 0:
                rho1[i] = rho1[i-1]
            else:
                fc = numpy.log(scipy.stats.beta.pdf(rho1_c, self.M1*self.pi1, self.M1*(1-self.pi1)))
                f1 = numpy.log(scipy.stats.beta.pdf(rho1[i-1], self.M1*self.pi1, self.M1*(1-self.pi1)))
                fc += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i1,i], Kp1[i-1]*rho1_c, Kp1[i-1]*(1-rho1_c))))
                f1 += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i1,i], Kp1[i-1]*rho1[i-1], Kp1[i-1]*(1-rho1[i-1]))))
    
                if numpy.log(scipy.stats.uniform.rvs()) < fc - f1:
                    rho1[i] = rho1_c
                    acc_p1+=1
                else: rho1[i] = rho1[i-1]

            if Kp1_c <= 0: Kp1[i] = Kp1[i-1]
            else:
                
                fc = numpy.log(scipy.stats.gamma.pdf(Kp1_c, self.a1, self.b1));
                f1 = numpy.log(scipy.stats.gamma.pdf(Kp1[i-1], self.a1, self.b1));
                fc += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i1,i], Kp1_c*rho1[i], Kp1_c*(1-rho1[i]))))
                f1 += numpy.sum(numpy.log(scipy.stats.beta.pdf(theta[i1,i], Kp1[i-1]*rho1[i], Kp1[i-1]*(1-rho1[i]))))

                if numpy.log(scipy.stats.uniform.rvs()) < fc - f1:
                    Kp1[i] = Kp1_c
                    acc_k1+=1
                else: Kp1[i] = Kp1[i-1]


            # Per-gene probability of Z == 1: each group's beta density of theta,
            # weighted by (1 - w1) and w1 respectively, then normalized.
            g0 = scipy.stats.beta.pdf(theta[:,i], Kp0[i]*rho0[i], Kp0[i]*(1-rho0[i])) * (1-w1)
            g1 = scipy.stats.beta.pdf(theta[:,i], Kp1[i]*rho1[i], Kp1[i]*(1-rho1[i])) * (w1)
            p1 = g1/(g0+g1)
            # Replace NaN (e.g. from 0/0) with 0 before sampling.
            p1 = numpy.nan_to_num(p1)

            
            try:
                Z[:,i] = scipy.stats.bernoulli.rvs(p1)
            except:
                # NOTE(review): bare except; on any failure the values for the
                # NaN entries are dumped to stderr and the process exits.
                inan = numpy.isnan(p1)
                print >> sys.stderr, "K=\t", K[inan]
                print >> sys.stderr, "N=\t", N[inan]
                print >> sys.stderr, "theta=", theta[inan,i]
                sys.exit()
            # Only the first gene's probability is recorded; pz1 is not read
            # again in this method.
            pz1[i] = p1[0]


            i1 = Z[:,i] == 1; n1 = numpy.sum(i1);
            #w1 = 0.15
            # Redraw w1 using the number of genes currently assigned Z == 1.
            w1 = scipy.stats.beta.rvs(self.alpha_w + n1, self.beta_w + Ngenes - n1)
            W1[i] = w1


            #Update progress
            text = "Running Binomial Method... %5.1f%%" % (100.0*(i+1)/(sample_size))
            self.progress_update(text, i)

        numpy.seterr(divide='warn')

        # Per-gene averages over the iterations after the burn-in.
        z_bar = numpy.apply_along_axis(numpy.mean, 1, Z[:, self.burnin:])
        theta_bar = numpy.apply_along_axis(numpy.mean, 1, theta[:, self.burnin:])
        #(ess_threshold, noness_threshold) = stat_tools.fdr_post_prob(z_bar)
        (ess_threshold, noness_threshold) = stat_tools.bayesian_ess_thresholds(z_bar)

        self.output.write("#Binomial\n")
        #output.write("#Command: %s\n" % " ".join(["%s=%s" %(key,val) for (key,val) in kwargs.items()]))
        if self.wxobj:
            # memberstr is assembled here but never written to the output.
            members = sorted([attr for attr in dir(self) if not callable(getattr(self,attr)) and not attr.startswith("__")])
            memberstr = ""
            for m in members:
                memberstr += "%s = %s, " % (m, getattr(self, m))
            self.output.write("#GUI with: ctrldata=%s, annotation=%s, output=%s, samples=%s, burnin=%s\n" % (",".join(self.ctrldata).encode('utf-8'), self.annotation_path.encode('utf-8'), self.output.name.encode('utf-8'), self.samples, self.burnin))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))

        self.output.write("#Thresholds: (%1.5f, %1.5f)\n" % (ess_threshold,noness_threshold))
        self.output.write("#rho0 Acceptance Rate:\t%f%%\n" % ((100.0*acc_p0)/sample_size))
        self.output.write("#Kp0  Acceptance Rate:\t%f%%\n" % ((100.0*acc_k0)/sample_size))
        self.output.write("#rho1 Acceptance Rate:\t%f%%\n" % ((100.0*acc_p1)/sample_size))
        self.output.write("#Kp1  Acceptance Rate:\t%f%%\n" % ((100.0*acc_k1)/sample_size))
        self.output.write("#Hyperparameters rho: \t%1.2f\t%3.1f\t%1.2f\t%3.1f\n" % (self.pi0, self.M0, self.pi1, self.M1))
        self.output.write("#Hyperparameters Kp: \t%3.1f\t%3.1f\t%3.1f\t%3.1f\n" % (self.a0, self.b0, self.a1, self.b1))
        self.output.write("#Hyperparameters W: \t%1.3f\t%1.3f\n" % (self.alpha_w, self.beta_w))


        # ``columns`` is defined outside this method.
        self.output.write("#%s\n" % "\t".join(columns))

        # ``data`` is rebound here to the list of formatted output rows.
        data = []
        for g,gene in enumerate(G):
            # Default label is "Uncertain"; the second test runs after the first,
            # so "Non-Essential" wins if both conditions hold.
            c = "Uncertain"
            if z_bar[g] > ess_threshold:
                c = "Essential"
            if z_bar[g] < noness_threshold:
                c = "Non-Essential"    
            data.append("%s\t%s\t%s\t%1.1f\t%d\t%d\t%d\t%f\t%f\t%s" % (gene.orf, gene.name, gene.desc, K[g]/float(numReps), N[g]/numReps, K[g], N[g], theta_bar[g], z_bar[g], c))

        # Rows are sorted as strings, so the order follows the leading gene.orf field.
        data.sort()
        for row in data:
            self.output.write("%s\n" % row)
        self.output.close()

        self.transit_message("") # Printing empty line to flush stdout 
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Binomial")
        self.finish()
        self.transit_message("Finished Binomial Method")

예제 #42

0

파일 보기

파일: example.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the Example method: write per-gene mean and non-zero mean counts.

        Loads validated counts for ``self.ctrldata``, normalizes them unless
        the normalization is unset or "nonorm", builds the gene collection and
        writes one tab-separated row per gene (orf, name, description, k, n,
        mean of the reads, sum of the reads divided by k) to ``self.output``,
        an already-open file object that is closed at the end. Rows are
        preceded by a commented header and written in sorted order.
        """

        self.transit_message("Starting Example Method")
        start_time = time.time()

        # Load the counts and the matching positions.
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
        # Not used afterwards, but the unpacking requires two-dimensional data.
        dim0, dim1 = data.shape

        if self.normalization and self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            data, _factors = norm_tools.normalize_data(data, self.normalization, self.ctrldata, self.annotation_path)

        genes = tnseq_tools.Genes(
            self.ctrldata,
            self.annotation_path,
            minread=1,
            reps=self.replicates,
            ignoreCodon=self.ignoreCodon,
            nterm=self.NTerminus,
            cterm=self.CTerminus,
            data=data,
            position=position,
        )

        rows = []
        total = len(genes)
        self.progress_range(total)
        for done, gene in enumerate(genes, 1):
            # Genes without sites / without non-zero sites report 0.0.
            mean = 0.0 if gene.n == 0 else numpy.mean(gene.reads)
            nzmean = 0.0 if gene.k == 0 else numpy.sum(gene.reads)/float(gene.k)

            rows.append("%s\t%s\t%s\t%s\t%s\t%1.2f\t%1.2f\n" % (
                gene.orf, gene.name, gene.desc, gene.k, gene.n, mean, nzmean))

            # Update Progress
            text = "Running Example Method... %5.1f%%" % (100.0*done/total)
            self.progress_update(text, done)

        # Commented header describing how the run was started.
        self.output.write("#Example\n")
        if self.wxobj:
            attr_names = sorted(
                name for name in dir(self)
                if not callable(getattr(self, name)) and not name.startswith("__"))
            # Assembled as in the original, although it is never written out.
            memberstr = "".join(
                "%s = %s, " % (name, getattr(self, name)) for name in attr_names)
            gui_fields = (
                ",".join(self.ctrldata).encode('utf-8'),
                self.annotation_path.encode('utf-8'),
                self.output.name.encode('utf-8'),
            )
            self.output.write("#GUI with: ctrldata=%s, annotation=%s, output=%s\n" % gui_fields)
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))

        self.output.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8'))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        # ``columns`` is defined outside this method.
        self.output.write("#%s\n" % "\t".join(columns))

        # Rows are plain strings, so they sort by their leading orf field.
        for line in sorted(rows):
            self.output.write(line)
        self.output.close()

        self.transit_message("") # Printing empty line to flush stdout
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Example")
        self.finish()
        self.transit_message("Finished Example Method")

예제 #43

0

파일 보기

파일: tnseq_GI.py 프로젝트: abelew/tnseq_GI

def main(args, kwargs, quite=False, jumble=False):

    missingArgs = False
    if "a1" not in kwargs:
        missingArgs = True
        error("Missing -a1 argument")
    if "a2" not in kwargs:
        missingArgs = True
        error("Missing -a2 argument")
    if "b1" not in kwargs:
        missingArgs = True
        error("Missing -b1 argument")
    if "b2" not in kwargs:
        missingArgs = True
        error("Missing -b2 argument")
    if "pt" not in kwargs:
        missingArgs = True
        error("Missing -pt argument")

    if missingArgs:
        usage()
        sys.exit()

    A_1list = kwargs["a1"].split(",")
    A_2list = kwargs["a2"].split(",")
    B_1list = kwargs["b1"].split(",")
    B_2list = kwargs["b2"].split(",")

    annotation = kwargs["pt"]
    rope = float(kwargs.get("rope", 0.5))
    S = int(kwargs.get("s", 100000))
    norm_method = kwargs.get("n", "TTR")
    label = kwargs.get("l", "debug")
    onlyNZ = kwargs.get("-nz", False)
    doBFDR = kwargs.get("-bfdr", False)
    doFWER = kwargs.get("-fwer", False)
    DEBUG = []
    if "debug" in kwargs:
        DEBUG = kwargs["debug"].split(",")

    wiglist = A_1list + B_1list + A_2list + B_2list

    Nwig = len(wiglist)
    Na1 = len(A_1list)
    Nb1 = len(A_1list)
    Na2 = len(B_2list)
    Nb2 = len(B_2list)

    (data, position) = tnseq_tools.get_data(wiglist)

    ######### FILTER EMTPY SITES #########
    if onlyNZ:
        ii_good = numpy.sum(data, 0) > 0
        data = data[:, ii_good]
        position = position[ii_good]
    ######################################

    (data, factors) = norm_tools.normalize_data(data, norm_method, wiglist,
                                                sys.argv[1])

    if jumble:
        numpy.random.shuffle(data.flat)
        numpy.random.shuffle(data.flat)

    G_A1 = tnseq_tools.Genes([],
                             annotation,
                             data=data[:Na1],
                             position=position)
    G_B1 = tnseq_tools.Genes([],
                             annotation,
                             data=data[Na1:(Na1 + Nb1)],
                             position=position)
    G_A2 = tnseq_tools.Genes([],
                             annotation,
                             data=data[(Na1 + Nb1):(Na1 + Nb1 + Na2)],
                             position=position)
    G_B2 = tnseq_tools.Genes([],
                             annotation,
                             data=data[(Na1 + Nb1 + Na2):],
                             position=position)

    means_list_a1 = []
    means_list_b1 = []
    means_list_a2 = []
    means_list_b2 = []

    var_list_a1 = []
    var_list_a2 = []
    var_list_b1 = []
    var_list_b2 = []

    # Base priors on empirical observations accross genes.
    for gene in sorted(G_A1):
        if gene.n > 1:
            A1_data = G_A1[gene.orf].reads.flatten()
            B1_data = G_B1[gene.orf].reads.flatten()
            A2_data = G_A2[gene.orf].reads.flatten()
            B2_data = G_B2[gene.orf].reads.flatten()

            means_list_a1.append(numpy.mean(A1_data))
            var_list_a1.append(numpy.var(A1_data))

            means_list_b1.append(numpy.mean(B1_data))
            var_list_b1.append(numpy.var(B1_data))

            means_list_a2.append(numpy.mean(A2_data))
            var_list_a2.append(numpy.var(A2_data))

            means_list_b2.append(numpy.mean(B2_data))
            var_list_b2.append(numpy.var(B2_data))

    # Priors
    mu0_A1 = scipy.stats.trim_mean(means_list_a1, 0.01)
    mu0_B1 = scipy.stats.trim_mean(means_list_b1, 0.01)
    mu0_A2 = scipy.stats.trim_mean(means_list_a2, 0.01)
    mu0_B2 = scipy.stats.trim_mean(means_list_b2, 0.01)

    s20_A1 = scipy.stats.trim_mean(var_list_a1, 0.01)
    s20_B1 = scipy.stats.trim_mean(var_list_b1, 0.01)
    s20_A2 = scipy.stats.trim_mean(var_list_a2, 0.01)
    s20_B2 = scipy.stats.trim_mean(var_list_b2, 0.01)

    k0 = 1.0
    nu0 = 1.0

    data = []
    postprob = []

    if not quite:
        print "# Created with '%s'.  Copyright 2016-2017. Michael A. DeJesus & Thomas R. Ioerger" % (
            sys.argv[0])
        print "# Version %1.2f; http://saclab.tamu.edu/essentiality/GI" % __version__
        print "#"
        print "# python %s" % " ".join(sys.argv)
        print "# Samples = %d, k0=%1.1f, nu0=%1.1f" % (S, k0, nu0)
        print "# Mean Prior:       Variance Prior:"
        print "# mu0_A1 = %1.2f    s20_A1 = %1.1f" % (mu0_A1, s20_A1)
        print "# mu0_B1 = %1.2f    s20_B1 = %1.1f" % (mu0_B1, s20_B1)
        print "# mu0_A2 = %1.2f    s20_A2 = %1.1f" % (mu0_A2, s20_A2)
        print "# mu0_B2 = %1.2f    s20_B2 = %1.1f" % (mu0_B2, s20_B2)
        print "# ROPE:", rope
        print "# TTR Factors:", ", ".join(
            ["%1.4f" % x for x in numpy.array(factors).flatten()])
    for gene in sorted(G_A1):

        if len(DEBUG) > 0:
            if gene.orf not in DEBUG: continue

        if gene.n > 0:
            A1_data = G_A1[gene.orf].reads.flatten()
            B1_data = G_B1[gene.orf].reads.flatten()
            A2_data = G_A2[gene.orf].reads.flatten()
            B2_data = G_B2[gene.orf].reads.flatten()

            #            Time-1   Time-2
            #
            #  Strain-A     A       C
            #
            #  Strain-B     B       D

            try:
                muA1_post, varA1_post = sample_post(A1_data, S, mu0_A1, s20_A1,
                                                    k0, nu0)
                muB1_post, varB1_post = sample_post(B1_data, S, mu0_B1, s20_B1,
                                                    k0, nu0)
                muA2_post, varA2_post = sample_post(A2_data, S, mu0_A2, s20_A2,
                                                    k0, nu0)
                muB2_post, varB2_post = sample_post(B2_data, S, mu0_B2, s20_B2,
                                                    k0, nu0)
            except Exception as e:
                muA1_post = varA1_post = numpy.ones(S)
                muB1_post = varB1_post = numpy.ones(S)
                muA2_post = varA2_post = numpy.ones(S)
                muB2_post = varB2_post = numpy.ones(S)

            logFC_A_post = numpy.log2(muA2_post / muA1_post)
            logFC_B_post = numpy.log2(muB2_post / muB1_post)
            delta_logFC_post = logFC_B_post - logFC_A_post

            alpha = 0.05

            # Get Bounds of the HDI
            l_logFC_A, u_logFC_A = HDI_from_MCMC(logFC_A_post, 1 - alpha)

            l_logFC_B, u_logFC_B = HDI_from_MCMC(logFC_B_post, 1 - alpha)

            l_delta_logFC, u_delta_logFC = HDI_from_MCMC(
                delta_logFC_post, 1 - alpha)

            mean_logFC_A = numpy.mean(logFC_A_post)
            mean_logFC_B = numpy.mean(logFC_B_post)
            mean_delta_logFC = numpy.mean(delta_logFC_post)

            # Is HDI significantly different than ROPE?
            not_HDI_overlap_bit = l_delta_logFC > rope or u_delta_logFC < -rope

            # Probability of posterior overlaping with ROPE
            probROPE = numpy.mean(
                numpy.logical_and(delta_logFC_post >= 0.0 - rope,
                                  delta_logFC_post <= 0.0 + rope))

        else:
            A1_data = [0, 0]
            B1_data = [0, 0]
            A2_data = [0, 0]
            B2_data = [0, 0]

            mean_logFC_A = 0
            mean_logFC_B = 0
            mean_delta_logFC = 0
            l_logFC_A = 0
            u_logFC_A = 0
            l_logFC_B = 0
            u_logFC_B = 0
            l_delta_logFC = 0
            u_delta_logFC = 0
            probROPE = 1.0

        if numpy.isnan(l_logFC_A):
            l_logFC_A = -10
            u_logFC_A = 10
        if numpy.isnan(l_logFC_B):
            l_logFC_B = -10
            u_logFC_B = 10
        if numpy.isnan(l_delta_logFC):
            l_delta_logFC = -10
            u_delta_logFC = 10

        if DEBUG:

            out = open("%s.%s_muA1_post" % (label, gene.orf), "w")
            for x in muA1_post:
                print >> out, x

            out = open("%s.%s_muA2_post" % (label, gene.orf), "w")
            for x in muA2_post:
                print >> out, x

            out = open("%s.%s_logFC_A_post" % (label, gene.orf), "w")
            for x in logFC_A_post:
                print >> out, x

            out = open("%s.%s_muB1_post" % (label, gene.orf), "w")
            for x in muB1_post:
                print >> out, x

            out = open("%s.%s_muB2_post" % (label, gene.orf), "w")
            for x in muB2_post:
                print >> out, x

            out = open("%s.%s_logFC_B_post" % (label, gene.orf), "w")
            for x in logFC_A_post:
                print >> out, x

            out = open("%s.%s_delta_logFC_post" % (label, gene.orf), "w")
            for x in delta_logFC_post:
                print >> out, x

        postprob.append(probROPE)
        data.append((gene.orf, gene.name, gene.n, numpy.mean(muA1_post),
                     numpy.mean(muA2_post), numpy.mean(muB1_post),
                     numpy.mean(muB2_post), mean_logFC_A, mean_logFC_B,
                     mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE,
                     not_HDI_overlap_bit))

    if doBFDR or not doFWER:
        postprob = numpy.array(postprob)
        postprob.sort()
        bfdr = numpy.cumsum(postprob) / numpy.arange(1, len(postprob) + 1)
        adjusted_prob = bfdr
        adjusted_label = "BFDR"
        if doBFDR:
            data.sort(key=lambda x: x[-2])
        else:
            data.sort(key=lambda x: x[-1], reverse=True)
    elif doFWER:
        fwer = FWER_Bayes(postprob)
        fwer.sort()
        adjusted_prob = fwer
        adjusted_label = "FWER"
        data.sort(key=lambda x: x[-2])

    return (data, adjusted_prob, adjusted_label)

예제 #44

0

파일 보기

파일: rankproduct.py 프로젝트: mad-lab/transit

    def Run(self):
        """Run the rank product analysis and write the results to self.output.

        Steps performed:
          1. Load (and optionally normalize) the read counts of the control
             and experimental datasets.
          2. Compute, per replicate and per gene, the log2 fold change of the
             mean experimental count over the mean control count, rank the
             genes within each replicate, and combine the ranks into an
             observed rank product (geometric mean of ranks) per gene.
          3. Build a null distribution by combining ``self.samples`` sets of
             independently permuted ranks the same way.
          4. For each gene, count how many permuted rank products are <= the
             observed one and derive the E-value, q-value and p-value.
          5. Write a tab-separated report to ``self.output`` and close it.

        Reads from self: ctrldata, expdata, normalization, annotation_path,
        ignoreCodon, NTerminus, CTerminus, samples, wxobj, output.
        Returns None.
        """
        self.transit_message("Starting rankproduct Method")
        start_time = time.time()

        Kctrl = len(self.ctrldata)
        Kexp = len(self.expdata)

        # Get orf data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata+self.expdata, wxobj=self.wxobj)
        if self.normalization != "none":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata+self.expdata, self.annotation_path)

        # The first Kctrl rows of `data` belong to the control datasets,
        # the remaining rows to the experimental datasets.
        Gctrl = tnseq_tools.Genes(self.ctrldata + self.expdata, self.annotation_path, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data[:Kctrl,:], position=position)
        Gexp = tnseq_tools.Genes(self.ctrldata + self.expdata, self.annotation_path, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data[Kctrl:,:], position=position)

        Ngenes = len(Gctrl)

        # Get the average counts for all the genes, in each replicate.
        # Columns stay at zero for genes whose reads are empty or all zero
        # (numpy.any is False), which avoids taking the mean of empty arrays.
        meanCtrl = numpy.zeros((Kctrl, Ngenes))
        meanExp = numpy.zeros((Kexp, Ngenes))
        for i in range(Ngenes):
            if numpy.any(Gctrl[i].reads):
                meanCtrl[:, i] = numpy.mean(Gctrl[i].reads, 1)
            if numpy.any(Gexp[i].reads):
                meanExp[:, i] = numpy.mean(Gexp[i].reads, 1)

        # Calculate a logFC2 between Experimental and Control (with a small
        # pseudocount to avoid division by zero), then calculate its rank
        # within each replicate, and the observed rank product per gene.
        logFC2 = numpy.log2((meanExp+0.0001)/(meanCtrl+0.0001))
        rank = numpy.array([scipy.stats.rankdata(Lvec) for Lvec in logFC2])
        obsRP = numpy.power(numpy.prod(rank, 0), 1.0/Kctrl)

        # Null distribution: rank products of independently permuted ranks.
        permutations = numpy.zeros((self.samples, Ngenes))
        base_ranks = numpy.arange(1, Ngenes+1)
        for s in range(self.samples):
            rankperm = numpy.array([numpy.random.permutation(base_ranks) for rep in range(Kctrl)])
            permutations[s] = numpy.power(numpy.prod(rankperm, 0), 1.0/Kctrl)

        # Rank of every gene by its observed rank product (1 = smallest).
        # argsort returns the ordering permutation, so it has to be inverted
        # to obtain the rank of gene i at position i. Ties are broken by the
        # order argsort happens to return.
        rankRP = numpy.empty(Ngenes, dtype=int)
        rankRP[numpy.argsort(obsRP)] = numpy.arange(1, Ngenes+1)

        # Number of permuted rank products <= each observed rank product.
        # Sorting the null values once and bisecting gives the same counts as
        # comparing the full permutation matrix against every gene.
        sorted_null = numpy.sort(permutations, axis=None)
        countbetter_all = numpy.searchsorted(sorted_null, obsRP, side="right")

        # rankproduct
        data = []
        self.progress_range(Ngenes)
        for i, gene in enumerate(Gctrl):
            count = i + 1

            meanctrl = numpy.mean(Gctrl[i].reads)
            meanexp = numpy.mean(Gexp[i].reads)
            log2fc = numpy.log2((meanexp+0.0001)/(meanctrl+0.0001))
            countbetter = countbetter_all[i]

            pval = countbetter/float(self.samples*Ngenes)
            e_val = countbetter/float(self.samples)
            q_paper = e_val/float(rankRP[i])

            data.append([gene.orf, gene.name, gene.desc, gene.n, meanctrl, meanexp, log2fc, obsRP[i], e_val, q_paper, pval])

            # Update Progress
            text = "Running rankproduct Method... %5.1f%%" % (100.0*count/Ngenes)
            self.progress_update(text, count)

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Performing Benjamini-Hochberg Correction")
        data.sort()
        # NOTE(review): q_bh is computed but never written to the output
        # below; the report only contains q_paper. Confirm whether the
        # BH-adjusted value was meant to be an extra column.
        q_bh = stat_tools.BH_fdr_correction([row[-1] for row in data])

        self.output.write("#RankProduct\n")
        if self.wxobj:
            self.output.write("#GUI with: ctrldata=%s, annotation=%s, output=%s\n" % (",".join(self.ctrldata).encode('utf-8'), self.annotation_path.encode('utf-8'), self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))

        self.output.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8'))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        # `columns` is not defined in this method; presumably a module-level
        # list of column names -- verify against the rest of the file.
        self.output.write("#%s\n" % (columns))

        # pval is part of each row (it is the last element, used for the BH
        # correction above) but is not one of the ten written columns.
        for row in data:
            (orf, name, desc, n, mean1, mean2, log2FCgene, obsRPgene, e_val, q_paper, pval) = row
            self.output.write("%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.8f\t%1.1f\t%1.8f\n" % (orf, name, desc, n, mean1, mean2, log2FCgene, obsRPgene, e_val, q_paper))
        self.output.close()

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="RankProduct")
        self.finish()
        self.transit_message("Finished rankproduct Method")