Exemplo n.º 1
0
    def rankSomaticMutations_byTreatmentContingency(inputRNAFilename, outputFilename, treatmentsCSVFilename, additionalHeaders = ("Entrez_Gene_Id","Chrom","Start_Position","End_Position","Strand","Variant_Type","Reference_Allele","Tumor_Seq_Allele1","Tumor_Seq_Allele2"), genesHeaderColumnName = "Hugo_Symbol", delimiter = "\t"):
        '''
        Contingency table approach for measuring significant differences in somatic mutations between treatment groups.

        For every row of the input file the per-sample values of each treatment are collected and every
        ordered pair of distinct treatments is passed to Analysis.calculateContingencyP_expectedWeighted.
        One output row is written per input row: the gene name, the values of the additionalHeaders
        columns, then one p-value per treatment pair (in the same order as the header columns).

        Parameters:
            inputRNAFilename      -- path of the input file; it is parsed with Utility.getColumns.
            outputFilename        -- path of the result file (opened with mode "w").
            treatmentsCSVFilename -- path handed to Analysis.buildTreatmentsLists; the result is used as a
                                     mapping of treatment name -> iterable of sample IDs (input column names).
            additionalHeaders     -- extra input columns copied into every output row, e.g. headers that
                                     occur in mutation signature files; None or () copies none.
            genesHeaderColumnName -- name of the input column holding the gene names.
            delimiter             -- field separator of the output file; also passed to buildTreatmentsLists.

        Returns:
            outputFilename, or None when buildTreatmentsLists yields no treatments (no file is written then).

        Raises:
            AssertionError if one of the three file names is empty/None.
        '''
        from Utility.Utility import Utility
        import numpy as np

        USE_NUMPY_WRAPPING = True

        assert treatmentsCSVFilename
        assert inputRNAFilename
        assert outputFilename

        treatmentsDict = Analysis.buildTreatmentsLists(treatmentsCSVFilename, delimiter)
        if not treatmentsDict:
            return None

        extraHeaders = tuple(additionalHeaders) if additionalHeaders else ()
        treatments = list(treatmentsDict.keys())
        # Ordered pairs (A,B) and (B,A) are both reported; computed once so header and rows always agree.
        treatmentPairs = [(treat1, treat2) for treat1 in treatments for treat2 in treatments if treat1 != treat2]

        headerFields = [genesHeaderColumnName]
        headerFields.extend(extraHeaders)
        headerFields.extend("%s_%s_pValue" % pair for pair in treatmentPairs)

        # 'with' guarantees the output file is flushed and closed even if parsing or a row fails.
        with open(outputFilename, "w") as outputFile:
            # Fields are joined instead of concatenating and stripping the delimiter, so empty
            # leading/trailing fields keep their column position.
            outputFile.write(delimiter.join(headerFields) + "\n")

            with open(inputRNAFilename, "r") as inputRNAFile:
                columns, indexToName = Utility.getColumns(inputRNAFile) # @UnusedVariable

                geneList = columns[genesHeaderColumnName]
                totalMutations = len(geneList)

                for i, gene in enumerate(geneList):
                    counter = i + 1
                    if counter % 1000 == 0:
                        # Parenthesised single argument prints identically on Python 2 and Python 3.
                        print("Progress completed: %s percent.   Working on mutation at gene: %s" % (str((float(counter)/totalMutations)*100), gene))

                    rowFields = [gene]
                    rowFields.extend(columns[addHeader][i] for addHeader in extraHeaders)

                    samplesDict = {}
                    for treatment in treatments:
                        values = []
                        for sampleID in treatmentsDict[treatment]:
                            try:
                                values.append(columns[sampleID][i])
                            except LookupError:
                                # Best effort: samples without a column (or without this row) are skipped.
                                continue
                        if USE_NUMPY_WRAPPING:
                            # Builtin float replaces the np.float alias, which newer NumPy no longer provides.
                            values = np.array(values).astype(float)
                        samplesDict[treatment] = values

                    for treat1, treat2 in treatmentPairs:
                        try:
                            p = Analysis.calculateContingencyP_expectedWeighted(samplesDict[treat1], samplesDict[treat2])
                        except Exception:
                            # A failed test is reported as "no evidence of a difference".
                            p = float(1.0)
                        rowFields.append(str(p))

                    outputFile.write(delimiter.join(rowFields) + "\n")

        return outputFilename
Exemplo n.º 2
0
    def rankRNAExpressionTreatments_byMannWhitney(inputRNAFilename, outputFilename, treatmentsCSVFilename, genesHeaderColumnName = "gene_id|gene_id_code", delimiter = "\t"):
        '''
        Method uses the Mann-Whitney test to determine variations in the distributions of RNA expression data between treatments.

        For every row of the input file the per-sample values of each treatment are collected and
        scipy.stats.mannwhitneyu is run for every ordered pair of distinct treatments. One output row is
        written per input row: the gene id followed by the U statistic and the p-value of each pair
        (in the same order as the header columns).

        Parameters:
            inputRNAFilename      -- path of the input file; it is parsed with Utility.getColumns.
            outputFilename        -- path of the result file (opened with mode "w").
            treatmentsCSVFilename -- path handed to Analysis.buildTreatmentsLists; the result is used as a
                                     mapping of treatment name -> iterable of sample IDs (input column names).
            genesHeaderColumnName -- name of the input column holding the gene ids. If the first id contains
                                     "|", only the part before the first "|" of every id is written.
            delimiter             -- field separator of the output file; also passed to buildTreatmentsLists.

        Returns:
            outputFilename, or None when buildTreatmentsLists yields no treatments (no file is written then).

        Raises:
            AssertionError if one of the three file names is empty/None.
        '''
        from Utility.Utility import Utility

        import numpy as np
        import scipy.stats as scistats

        TEST_TYPE = "mannwhitneyu"
        USE_NUMPY_WRAPPING = True

        assert treatmentsCSVFilename
        assert inputRNAFilename
        assert outputFilename

        treatmentsDict = Analysis.buildTreatmentsLists(treatmentsCSVFilename, delimiter)
        if not treatmentsDict:
            return None

        treatments = list(treatmentsDict.keys())
        # Ordered pairs (A,B) and (B,A) are both reported; computed once so header and rows always agree.
        treatmentPairs = [(treat1, treat2) for treat1 in treatments for treat2 in treatments if treat1 != treat2]

        headerFields = ["gene_id"]
        for treat1, treat2 in treatmentPairs:
            headerFields.append("%s_%s_U_%s" % (treat1, treat2, TEST_TYPE))
            headerFields.append("%s_%s_p_%s" % (treat1, treat2, TEST_TYPE))

        # 'with' guarantees the output file is flushed and closed even if parsing or a row fails.
        with open(outputFilename, "w") as outputFile:
            # Fields are joined instead of concatenating and stripping the delimiter, so an empty
            # gene id cannot shift the remaining columns.
            outputFile.write(delimiter.join(headerFields) + "\n")

            with open(inputRNAFilename, "r") as inputRNAFile:
                columns, indexToName = Utility.getColumns(inputRNAFile) # @UnusedVariable
                geneList = columns[genesHeaderColumnName]

                # Guard against an input without data rows before peeking at the first id.
                if len(geneList) > 0 and geneList[0].find("|") != -1:
                    geneList = [x.split("|")[0] for x in geneList]

                for i, gene in enumerate(geneList):
                    rowFields = [gene]

                    samplesDict = {}
                    for treatment in treatments:
                        values = []
                        for sampleID in treatmentsDict[treatment]:
                            try:
                                values.append(columns[sampleID][i])
                            except LookupError:
                                # Best effort: samples without a column (or without this row) are skipped.
                                continue
                        if USE_NUMPY_WRAPPING:
                            # Builtin float replaces the np.float alias, which newer NumPy no longer provides.
                            values = np.array(values).astype(float)
                        samplesDict[treatment] = values

                    for treat1, treat2 in treatmentPairs:
                        try:
                            U, pValue = scistats.mannwhitneyu(samplesDict[treat1], samplesDict[treat2])
                        except Exception:
                            # A failed test is reported with neutral defaults: U = 0.0, p = 1.0.
                            U, pValue = float(0.0), float(1.0)
                        rowFields.append(str(U))
                        rowFields.append(str(pValue))

                    outputFile.write(delimiter.join(rowFields) + "\n")

        return outputFilename
Exemplo n.º 3
0
def WriteVRTransaction(set, writeHeader):
    '''
    Read the transactions for `set` and write its statement through ValueResearchStatement.

    ReadTransaction(set, writeHeader) must yield exactly four items; only the second one (the
    statement) is used here. It is wrapped with ValueResearchStatement(statement, writeHeader) and
    the result is handed to Utility.WriteStatement together with set.outputVRFormat.
    '''
    # The unpacking still enforces the 4-item shape of ReadTransaction's result.
    _, statement, _, _ = ReadTransaction(set, writeHeader)
    vrStatement = ValueResearchStatement(statement, writeHeader)
    Utility.WriteStatement(vrStatement, set.outputVRFormat)