Example #1
def FilterFile(Guidefile, PSI, turn=0):
    if 'Clustering' in Guidefile:
        count = 1
    else:
        count = 0
    val = []
    head = 0
    for line in open(Guidefile, 'rU').xreadlines():
        if head > count:
            line = line.rstrip('\r\n')
            q = string.split(line, '\t')
            val.append(q[0])
        else:
            head += 1
            continue

    dire = export.findParentDir(export.findParentDir(Guidefile)[:-1])
    output_dir = dire + 'SubtypeAnalyses-Results'
    if os.path.exists(output_dir) == False:
        export.createExportFolder(output_dir)

    #output_file = output_dir+'/round'+str(turn)+'/'+export.findFilename(PSI)+'-filtered.txt'
    output_file = output_dir + '/round' + str(
        turn) + '/' + export.findFilename(PSI)[:-4] + '-filtered.txt'
    try:
        os.mkdir(output_dir + '/round' + str(turn))
    except OSError:
        pass  ### already exists
    if turn == 1:
        ### No need to filter this file
        shutil.copyfile(PSI, output_file)
    else:
        filterRows(PSI, output_file, filterDB=val)

    return output_file
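
Example #1 gathers the first-column identifiers from a guide file and hands the actual filtering off to filterRows, which is defined elsewhere in this module. As a rough illustration of what such a helper does, here is a minimal, self-contained Python 3 sketch; the name and signature mirror the call above, but this is an assumption, not the project's implementation:

def filterRows(input_file, output_file, filterDB=None):
    ### Copy the header plus every row whose first column is in filterDB
    keep = set(filterDB or [])
    with open(input_file) as inp, open(output_file, 'w') as out:
        out.write(inp.readline())  ### always keep the header row
        for line in inp:
            if line.rstrip('\r\n').split('\t')[0] in keep:
                out.write(line)

### Hypothetical usage:
### filterRows('PSI-values.txt', 'PSI-filtered.txt', filterDB=['event1', 'event2'])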
Example #2
def FilterFile(Guidefile,Guidefile_block,PSI,turn):
    if 'Clustering' in Guidefile:
        count=1
        flag=True
        rank_Count=0
        prev=0
    else:
        count=0
        flag=False
        rank_Count=0
    val=[]
    head=0
    
    print Guidefile_block
    for line in open(Guidefile_block,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            if flag:
               
                if int(q[1])==prev:
                    continue
                else:
                    rank_Count+=1
                    prev=int(q[1])
        else:
            head+=1
            continue
    head=0
    print Guidefile
    for line in open(Guidefile,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            val.append(q[0])
        else:
            head+=1
            continue
    dire = export.findParentDir(PSI)
    output_dir = dire+'OncoInputs'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)
    
    output_file = output_dir+'/NMFInput-Round'+str(turn)+'.txt'
    filterRows(PSI,output_file,filterDB=val)
    return output_file,rank_Count
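
The first loop above derives an NMF rank by counting how many distinct block IDs appear in the second column of the BlockIDs file. The same idea in isolation, as a Python 3 sketch (the two-row header skip matches the 'Clustering' case above; the file layout is assumed from the parsing code):

def count_blocks(block_file, skip_rows=2):
    ### Count distinct consecutive block IDs in column 2 of a BlockIDs file
    ranks, prev = 0, None
    with open(block_file) as f:
        for i, line in enumerate(f):
            if i < skip_rows:  ### skip header rows
                continue
            block_id = int(line.rstrip('\r\n').split('\t')[1])
            if block_id != prev:  ### a new block starts here
                ranks += 1
                prev = block_id
    return ranks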
Example #3
def performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy):
    """ Run NMF and determine the number of valid clusters based on the magnitude of detected differential splicing """
    
    use_adjusted_p=True
           
    print "Running NMF analyses for dimension reduction using "+str(k)+" k - Round"+str(AnalysisRound)
    NMFResult,BinarizedOutput,metaData,Annotation=NMF_Analysis.NMFAnalysis(NMFinput,k,AnalysisRound,strategy) ### This is where we get the correct version
    print "Running metaData Analyses for finding differential splicing events"
    rootdir,CovariateQuery=metaDataAnalysis.remoteAnalysis(species,filtered_EventAnnot_dir,metaData,'PSI',0.1,use_adjusted_p,0.05,Annotation)
    counter=1
    dPSI_results_dir=rootdir+CovariateQuery
    global upd_guides
    upd_guides=[]
    name=[]
    group=[]
    grplst=[]
    for filename in os.listdir(dPSI_results_dir):
        if filename.startswith("PSI."):
            dPSI_results_fn=os.path.join(dPSI_results_dir, filename)
            dPSI_comparison_alt_name=string.replace(filename,"PSI.","")
            omitcluster=FindTopUniqueEvents(dPSI_results_fn,dPSI_comparison_alt_name,dPSI_results_dir)
            if omitcluster==0: ### Hence, clustering succeeded and did not fail in this dPSI comparison
                group.append(counter)
                name.append(string.replace(filename,"PSI.",""))
                counter+=1
                
    print counter, 'robust splicing subtypes identified in round',AnalysisRound
    if counter>0: #counter>2 --- changed to 0 to force NMF
        dire = export.findParentDir(full_PSI_InputFile)
        output_dir = dire+'OncoInputs'
        if os.path.exists(output_dir)==False:
            export.createExportFolder(output_dir)

        output_file = output_dir+'/SVMInput-Round'+str(AnalysisRound)+'.txt'
        ExpandSampleClusters.filterRows(full_PSI_InputFile,output_file,filterDB=upd_guides,logData=False)
        header=ExpandSampleClusters.header_file(output_file)
        print "Running SVM prediction for improved subtypes - Round"+str(AnalysisRound)
        train=ExpandSampleClusters.TrainDataGeneration(output_file,BinarizedOutput,name)
        grplst.append(group)
        ExpandSampleClusters.Classify(header,train,output_file,grplst,name,AnalysisRound) ### This is where we write the wrong version
        header=Correlationdepletion.header_file(NMFResult)
        
        output_file=output_dir+'/DepletionInput-Round'+str(AnalysisRound)+".txt"
        sampleIndexSelection.filterFile(full_PSI_InputFile,output_file,header)
        print "Running Correlation Depletion - Round"+str(AnalysisRound)
        commonkeys,count=Correlationdepletion.FindCorrelations(NMFResult,output_file,name)
        Depleted=Correlationdepletion.DepleteSplicingevents(commonkeys,output_file,count,full_PSI_InputFile)
        full_PSI_InputFile=Depleted
    
        flag=True ### Indicates that K-means was not run - hence, another round of splice-ICGS should be performed
    """"
    else:
        try:
            print "Running K-means analyses instead of NMF - Round"+str(AnalysisRound)
            header=[]
            header=Kmeans.header_file(dPSI_results_fn_block)
            Kmeans.KmeansAnalysis(dPSI_results_fn_block,header,full_PSI_InputFile,AnalysisRound)
            flag=True
        except Exception:
            print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
            print traceback.format_exc()
            AnalysisRound = True
    """
    return flag,full_PSI_InputFile
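
NMF_Analysis.NMFAnalysis is project code, but the dimension-reduction step it wraps can be approximated with scikit-learn. A minimal sketch, assuming a non-negative samples-by-events matrix and binarizing each sample to its dominant metagene (the project's actual binarization rules may differ):

import numpy as np
from sklearn.decomposition import NMF

def nmf_cluster(X, k, seed=0):
    ### Factor X (samples x features, non-negative) into W (samples x k)
    ### and H (k x features), then assign each sample to its top metagene
    model = NMF(n_components=k, init='nndsvd', random_state=seed, max_iter=500)
    W = model.fit_transform(X)  ### per-sample metagene weights
    binarized = np.zeros_like(W, dtype=int)
    binarized[np.arange(len(W)), W.argmax(axis=1)] = 1  ### one-hot assignments
    return W, model.components_, binarized

### Hypothetical usage with random non-negative data:
### W, H, B = nmf_cluster(np.random.rand(60, 500), k=3)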
Example #4
    
    print "Subtype discovery stringency:",strategy
    dire = export.findParentDir(EventAnnot)

    if EnrichmentOnly==False:
        
        print 'PSI input files:',EventAnnot
        print 'Using a rho-cutoff of:',rho_cutoff
    
        if filters==True: ### Filter based on a default percentage of samples with detected PSI values
            EventAnnot,SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=True)
        else:
            SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=False)
        output_dir = dire+'ExpressionInput'
    
        export.createExportFolder(output_dir)
        full_PSI_InputFile=output_dir+"/exp.input.txt"
        header=header_list(EventAnnot)
        sampleIndexSelection.filterFile(EventAnnot,full_PSI_InputFile,header,FirstCol=False)
        
        ### Set Splice-ICGS defaults
        gsp = UI.GeneSelectionParameters(species,platform,platform)
        gsp.setNormalize('median')
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        gsp.setJustShowTheseIDs('')
        gsp.setSampleDiscoveryParameters(ExpressionCutoff,CountsCutoff,FoldDiff,SamplesDiffering,removeOutliers,
                        featurestoEvaluate,restrictBy,excludeCellCycle,column_metric,column_method,rho_cutoff)
        
        AnalysisRound=1
Example #5
def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp,
                     forceBroadClusters, turn):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """

    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))

        print 'Number varying samples to identify:', gsp.SamplesDiffering()

        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)

    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting Rank=0'
        #print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        ### ADJUST THE RANKS - MUST UPDATE!!!!
        if turn == 1:
            if force_broad_round1:
                #Rank=2
                Rank = Rank
            else:
                if Rank > 2:
                    Rank = 30
        else:
            if Rank > 2:
                Rank = 30
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False

        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
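
CompleteWorkflow performs one round and returns a flag indicating whether another round of splice-ICGS is warranted, plus the depleted input file to feed into that next round. A hypothetical driver loop (the round limit and argument defaults here are assumptions, not taken from the source):

def runOncoSpliceRounds(InputFile, EventAnnot, gsp, max_rounds=5,
                        rho_cutoff=0.4, strategy='conservative', seq='bulk'):
    ### Iterate CompleteWorkflow until no further subtypes are reported
    ### or the round limit is reached
    turn, flag = 1, True
    while flag and turn <= max_rounds:
        flag, InputFile, EventAnnot = CompleteWorkflow(
            InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp,
            forceBroadClusters=(turn == 1), turn=turn)
        turn += 1
    return InputFile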
Example #6
def Classify(filename, Mutlabels={}, dire="", flag=True):
    count = 0
    start = 1
    orderdict = OrderedDict()
    countdict = OrderedDict()

    countlst = []

    Y = []
    head = 0
    rownames = []
    colnames = []
    q = []
    Z = []
    if dire != "":
        output_dir = dire + 'Results'
        export.createExportFolder(output_dir)
        if flag:
            output_file = output_dir + "/Consolidated-Increasing" + ".txt"
        else:
            output_file = output_dir + "/Consolidated-Decreasing" + ".txt"
    else:
        output_file = filename[:-4] + "-ordered.txt"
    export_object = open(output_file, 'w')
    for line in open(filename, 'rU').xreadlines():
        if head > 0:
            val = []
            counter2 = 0
            val2 = []
            me = 0.0

            line = line.rstrip('\r\n')

            q = string.split(line, '\t')
            # rownames.append(q[0])
            if q[0] == "":
                continue
            orderdict[q[0]] = [
                q[0],
            ]
            for i in range(start, len(q)):
                try:
                    val2.append(float(q[i]))
                    try:
                        orderdict[q[0]].append(float(q[i]))
                    except Exception:
                        orderdict[q[0]] = [
                            float(q[i]),
                        ]
                    try:
                        countdict[i].append(float(q[i]))
                    except Exception:
                        countdict[i] = [
                            float(q[i]),
                        ]
                except Exception:
                    continue

            count += 1
        else:
            #export_object.write(line)
            head = 1
            line = line.rstrip('\r\n')

            q = string.split(line, '\t')
            header = q
            continue

    for i in countdict:

        countlst.append(sum(countdict[i]))
    #print countlst

    B = sorted(range(len(countlst)), key=lambda x: countlst[x], reverse=flag)
    C = sorted(range(len(countlst)), key=lambda x: B[x])

    qu = 0
    for i in orderdict.keys():
        Y.append(orderdict[i])
        qu += 1
        #print Y

    for i in range(0, len(C)):
        jk = C.index(i) + 1
        #print jk
        #print Y[jk]
        Y = sorted(Y, key=itemgetter(jk))

        #orderdict=OrderedDict(sorted(orderdict,key=itemgetter(jk)))
        #colnames.append(header[C.index(i)+1])

    Y = np.array(Y)
    Y = zip(*Y)
    Y = np.array(Y)
    Z.append(Y[0, :])
    for i in range(0, len(C)):
        jk = C.index(i) + 1
        Z.append(Y[jk, :])
    Z = np.array(Z)
    q = Z.shape

    export_object.write("uid")

    for i in range(q[1]):
        export_object.write("\t" + Z[0][i])
    export_object.write("\n")
    for ij in range(1, q[0]):
        jk = C.index(ij - 1) + 1
        if header[jk] in Mutlabels:
            export_object.write(Mutlabels[header[jk]])
        else:
            export_object.write(header[jk])
        for jq in range(0, q[1]):
            export_object.write("\t" + str(Z[ij][jq]))
        export_object.write("\n")
    export_object.close()

    graphic_links = []
    row_method = None
    column_method = None
    column_metric = 'cosine'
    row_metric = 'cosine'
    color_gradient = 'yellow_black_blue'
    transpose = False
    graphic_links = clustering.runHCexplicit(output_file,
                                             graphic_links,
                                             row_method,
                                             row_metric,
                                             column_method,
                                             column_metric,
                                             color_gradient,
                                             transpose,
                                             display=False,
                                             Normalize=False)
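
This Classify variant orders the columns of a score matrix by their column sums and then repeatedly stable-sorts the rows on each column, producing the staircase layout that clustering.runHCexplicit renders. A compact NumPy sketch of that ordering step (an approximation of the logic, not the original code):

import numpy as np

def staircase_order(M, decreasing=True):
    ### Order columns of a numeric matrix by column sum, then stable-sort
    ### the rows lexicographically on those reordered columns
    col_order = np.argsort(M.sum(axis=0))
    if decreasing:
        col_order = col_order[::-1]
    M = M[:, col_order]
    ### np.lexsort treats its LAST key as primary, so pass columns reversed
    row_order = np.lexsort(tuple(M[:, j] for j in reversed(range(M.shape[1]))))
    return M[row_order, :], col_order, row_order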
Example #7
def parseResultfolders(motifdir, GEdir, SFlist):
    sfs = []
    for lin in open(SFlist, 'rU').xreadlines():
        s = lin.rstrip('\r\n')
        s1 = string.split(s, '\t')
        sfs.append(s1[0])

    mappingdict = defaultdict(list)
    allden = []
    for filename in os.listdir(motifdir):
        name = filename
        mapping = []
        dellst = []

        if "._" not in filename and "Events" not in filename:
            fol = os.path.join(motifdir, filename)
            if os.path.isdir(fol):
                #for filename2 in os.listdir(fol):
                #filnam2=os.path.join(fol,filename2)
                #if "._" not in filnam2:
                #   if os.path.isdir(filnam2):
                #       #print filnam2
                #       flag=0
                #       if "._" not in filename2:
                #           name=filename+":"+filename2
                #           flag=1
                #
                #       if flag==1:
                for filename3 in os.listdir(fol):

                    if filename3 == "finalResults.tab":

                        clipres = os.path.join(fol, filename3)
                        for lin in open(clipres, 'rU').xreadlines():

                            q = lin.rstrip('\r\n')
                            q1 = string.split(q, '\t')

                            clipnam = q1[0] + ":" + q1[1] + ":" + q1[2]
                            mappingdict[name, clipnam, "Clipseq"] = q1[11]

                    if filename3 == "output_TF_strand":
                        knownrbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(knownrbp):
                            if filename4 == "knownResults.txt":
                                filenam4 = os.path.join(knownrbp, filename4)
                                try:
                                    head = 0
                                    for line in open(filenam4,
                                                     'rU').xreadlines():
                                        q = line.rstrip('\r\n')
                                        q1 = string.split(q, '\t')
                                        if head == 0:
                                            motif = q1.index('Motif Name')
                                            pval = q1.index('P-value')
                                            head = 1
                                            continue
                                        else:
                                            mappingdict[
                                                name, q1[motif],
                                                "Cisbp_Actual"] = q1[pval]

                                except Exception:
                                    continue

                    if filename3 == "output1":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))

                                        if col[2] == "P-value":
                                            continue
                                        else:

                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]

                    if filename3 == "output2":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))
                                        if col[2] == "P-value":
                                            continue
                                        else:
                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]
                    if filename3 == "output3":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))
                                        if col[2] == "P-value":
                                            continue
                                        else:
                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]
                    if filename3 == "output4":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))
                                        if col[2] == "P-value":
                                            continue
                                        else:
                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]
                    if filename3 == "output5":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))
                                        if col[2] == "P-value":
                                            continue
                                        else:
                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #print name,motname,col[2]
                                            #sys.exit()
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]
        mapping.sort(key=lambda x: x[0])

        mapping.sort(key=lambda x: x[1])
        #prev=""
        #output=os.path.join(motifdir,"test.txt")
        #output_w=open(output,"a")
        for i in range(len(mapping)):
            if mapping[i][0] not in dellst:
                mot = string.split(mapping[i][0], ";")[1]
                genes = []
                genes = string.split(mot, ":")[1:]
                allden.append([filename, mot, genes, mapping[i][1]])
                #output_w.write(mapping[i][0]+"\t"+str(mapping[i][1]))
                #      output_w.write("\n")
                dellst.append(mapping[i][0])
        final = {}
        for i in range(len(allden)):
            de = []
            de = allden[i]

            for q in de[2]:
                if q in final:
                    if de[3] < final[q][1]:
                        final[q] = [de[0], de[3], de[1]]
                else:
                    final[q] = [de[0], de[3], de[1]]
        for genes in final:

            de = []
            de = final[genes]
            mappingdict[de[0], de[2], "Cisbp_denovo"] = str(de[1])

    for filename in os.listdir(GEdir):
        if "GE" in filename and "._GE" not in filename:
            InputFile = os.path.join(GEdir, filename)
            name = string.replace(filename, "GE.", "")
            name = string.replace(name, "_vs_Others.txt", "")
            head = 0
            for line in open(InputFile, 'rU').xreadlines():
                q = line.rstrip('\r\n')
                q1 = string.split(q, '\t')
                if head == 0:
                    symbol = q1.index('Symbol')
                    adjp = q1.index('adjp')
                    head = 1
                    continue
                else:
                    if q1[symbol] in sfs:
                        mappingdict[name, q1[symbol], "GE"] = q1[adjp]
    dire = export.findParentDir(motifdir)
    output_dir = dire + 'MotifResults'
    export.createExportFolder(output_dir)
    output = output_dir + "/Motifresults.txt"

    #output=os.path.join(motifdir,"merged_output_allpvalues_nofold.txt")
    output1 = open(output, "w")
    #output1.write("signature"+"\t"+"gene"+"\t"+"tool"+"\t"+"p-value"+"\n")
    for name, gene, key in mappingdict:
        output1.write(name + "\t" + gene + "\t" + key + "\t" +
                      mappingdict[name, gene, key] + "\n")
    output1.close()
    return output
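
The HOMER result pages (homerResults.html, knownResults.txt) are scraped with urllib2 and BeautifulSoup in the Python 2 code above. A stripped-down Python 3 equivalent of the de novo table parse, assuming the same column layout (p-value in column 2, motif name in column 7):

import re
from bs4 import BeautifulSoup

def parse_homer_results(html):
    ### Yield (motif_name, p_value) pairs from a homerResults.html page
    soup = BeautifulSoup(html, 'html.parser')
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cols = [c.get_text(strip=True)
                    for c in row.find_all(re.compile('t[dh]'))]
            if len(cols) < 8 or cols[2] == 'P-value':  ### skip header/short rows
                continue
            yield cols[7].split('(')[0], float(cols[2])

### Hypothetical usage:
### with open('homerResults.html') as f:
###     for motif, p in parse_homer_results(f.read()):
###         print(motif, p)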
Example #8
def Classify(header,Xobs,output_file,grplst,name,turn):
    count=0
    start=1
    Y=[]
    head=0
    for line in open(output_file,'rU').xreadlines():
        if head >count:
            val=[]
            counter2=0
            val2=[]
            me=0.0
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            for i in range(start,len(q)):
                try:
                    val2.append(float(q[i]))
                except Exception:
                    continue
            me=np.median(val2)
            for i in range(start,len(q)):
                try:
                    val.append(float(q[i]))
                except Exception:
                    val.append(float(me))
            Y.append(val)
        else:
            head+=1
            continue

    Xobs=zip(*Xobs)
    Xobs=np.array(Xobs)
    Xobs=zip(*Xobs)
    Xobs=np.array(Xobs)
    X=grplst
    X=zip(*X)
    X=np.array(X)
    Y=zip(*Y)
    Y=np.array(Y)

    dire = export.findParentDir(export.findParentDir(export.findParentDir(output_file)[:-1])[:-1])
    output_dir = dire+'SVMOutputs'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)

    exportnam1=output_dir+'/round'+str(turn)+'SVC_decision_func.txt'
    export_class1=open(exportnam1,"w")
    exportnam2=output_dir+'/round'+str(turn)+'SVC_Results.txt'
    export_class2=open(exportnam2,"w")
    regr = LinearSVC()
    regr.fit(Xobs,X[:,0])
    q=regr.predict(Y)
    count=1

    if len(X[:,0])>2:
        prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
        export_class1.write("uid")
        export_class2.write("uid")
        for ni in name:
            sub=string.split(ni,"_")[0]
            export_class1.write("\t"+"R"+str(turn)+"-"+sub)
            export_class2.write("\t"+"R"+str(turn)+"-"+sub)
        export_class1.write("\n")
        export_class2.write("\n")

        for iq in range(0,len(header)-1):
            export_class1.write(header[iq+1])
            export_class2.write(header[iq+1])
            for jq in range(0,len(X[:,0])):
                export_class1.write("\t"+str(prob_[iq][jq]))
                if prob_[iq][jq]>0:
                    export_class2.write("\t"+str(1))
                else:
                    export_class2.write("\t"+str(0))
            export_class1.write("\n")
            export_class2.write("\n")
    else:
        prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
        export_class1.write("uid"+"\t")
        export_class2.write("uid"+"\t")
        export_class1.write("group")
        export_class2.write("R"+str(turn)+"-V1"+"\t"+"R"+str(turn)+"-V2")
        export_class1.write("\n")
        export_class2.write("\n")

        for iq in range(0,len(header)-1):
            export_class1.write(header[iq+1])
            export_class2.write(header[iq+1])
            export_class1.write("\t"+str(prob_[iq]))
            if prob_[iq]>0.5:
                export_class2.write("\t"+str(1)+"\t"+str(0))
            else:
                if prob_[iq]<-0.5:  
                    export_class2.write("\t"+str(0)+"\t"+str(1))
                else:
                    export_class2.write("\t"+str(0)+"\t"+str(0))
            export_class1.write("\n")
            export_class2.write("\n")
    export_class1.close()
    export_class2.close()
    Orderedheatmap.Classify(exportnam2)
Example #9
def FilterGuideGeneFile(Guidefile,Guidefile_block,expressionInputFile,iteration,platform,uniqueIDs,symbolIDs):
    """ Filters the original input expression file for Guide3 genes/events. Needed
    Since NMF only can deal with positive values [Guide3 has negative values]"""
    
    root_dir = export.findParentDir(expressionInputFile)[:-1]
    if 'ExpressionInput' in root_dir:
        root_dir = export.findParentDir(root_dir)
    
    if 'Clustering' in Guidefile:
        count=1
        flag=True
        rank_Count=0
        prev=0
    else:
        count=0
        flag=False
        rank_Count=0
    val=[]
    head=0
    for line in open(Guidefile_block,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            #val.append(q[0])
            if flag:
                if int(q[1])==prev:
                    continue
                else:
                    rank_Count+=1
                    prev=int(q[1])
        else:
            head+=1
            continue
    head=0
    for line in open(Guidefile,'rU').xreadlines():
        line=line.rstrip('\r\n')
        q= string.split(line,'\t')
        n=len(q)
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            uid = q[0]
            if uid not in uniqueIDs:
                if uid in symbolIDs:
                    uid = symbolIDs[uid]
                else:
                    continue
            val.append(uid)
            if platform != "PSI" and head==2:
                rank_Count=rank_Count+int(q[1])
                print rank_Count
            head=head+1
        else:
            head+=1
            if platform != "PSI" and q[0]=="column_clusters-flat":
                    rank_Count=int(q[n-1])
            continue

    output_dir = root_dir+'/NMF-SVM'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)
    
    output_file = output_dir+'/NMFInput-Round'+str(iteration)+'.txt'
    filterRows(expressionInputFile,output_file,filterDB=val)
    
    return output_file,rank_Count
Example #10
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq):

    species = "Hs"
    row_method = 'hopach'
    column_method = 'hopach'
    row_metric = 'correlation'
    column_metric = 'euclidean'
    color_gradient = 'yellow_black_blue'
    contrast = 3
    vendor = "RNASeq"
    GeneSelection = ''
    PathwaySelection = ''
    GeneSetSelection = 'None Selected'
    excludeCellCycle = False
    #rho_cutoff = 0.4
    restrictBy = 'protein_coding'
    featurestoEvaluate = 'Genes'
    ExpressionCutoff = 0
    CountsCutoff = 0
    FoldDiff = 1.2
    SamplesDiffering = 4
    JustShowTheseIDs = ''
    removeOutliers = False
    PathwaySelection = []
    array_type = "RNASeq"
    #rho_cutoff=0.4
    gsp = UI.GeneSelectionParameters(species, array_type, vendor)
    gsp.setGeneSet(GeneSetSelection)
    gsp.setPathwaySelect(PathwaySelection)
    gsp.setGeneSelection(GeneSelection)
    gsp.setJustShowTheseIDs(JustShowTheseIDs)
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff,
                                     SamplesDiffering, removeOutliers,
                                     featurestoEvaluate, restrictBy,
                                     excludeCellCycle, column_metric,
                                     column_method, rho_cutoff)
    #Run splice ICGS
    """import UI
        species='Mm'; platform = "3'array"; vendor = 'Ensembl'
        gsp = UI.GeneSelectionParameters(species,platform,vendor)
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect('')
        gsp.setGeneSelection('')
        gsp.setJustShowTheseIDs('')
        gsp.setNormalize('median')
        gsp.setSampleDiscoveryParameters(0,0,1.5,3,
        False,'PSI','protein_coding',False,'cosine','hopach',0.35)"""

    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)
        #except Exception:Rank=0
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)
    except Exception:
        print 'UNKNOWN ERROR!!!!!'
        print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        print 'Current turn:', turn, 'k =',
        if turn == 1:
            Rank = 2
        elif Rank > 2:
            Rank = 30
        else:
            Rank = 2
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False
        print Rank
        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False

    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)

                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
Example #11
def Enrichment(Inputfile,mutdict,mutfile,Expand,header):
    import collections
    import mappfinder
    X=defaultdict(list)
    prev=""
    head=0
    group=defaultdict(list)
    enrichdict=defaultdict(float)
    mut=export.findFilename(mutfile)
    dire=export.findParentDir(Inputfile)
    output_dir = dire+'MutationEnrichment'
    export.createExportFolder(output_dir)

    exportnam=output_dir+'/Enrichment_Results.txt'
    export_enrich=open(exportnam,"w")
    exportnam=output_dir+'/Enrichment_tophits.txt'
    export_hit=open(exportnam,"w")
   
    export_enrich.write("Mutations"+"\t"+"Cluster"+"\t"+"r"+"\t"+"R"+"\t"+"n"+"\t"+"Sensitivity"+"\t"+"Specificity"+"\t"+"z-score"+"\t"+"Fisher exact test"+"\t"+"adjp value"+"\n")
    if Expand=="yes":
        header2=header_file(Inputfile,Expand="yes")
        
        for line in open(Inputfile,'rU').xreadlines():
            if head >0:
                line=line.rstrip('\r\n')
                q= string.split(line,'\t')
                for i in range(1,len(q)):
                    if q[i]==str(1):
                        #group[q[0]].append(header2[i-1])
                        group[header2[i-1]].append(q[0])
           
            else:
                head+=1
                continue
    else:
        for line in open(Inputfile,'rU').xreadlines():
            line=line.rstrip('\r\n')
            line=string.split(line,'\t')
            #for i in range(1,len(line)):
            group[line[2]].append(line[0])
   
    total_Scores={}
    for kiy in mutdict:
        if kiy =="MDP":
            print mutdict[kiy]
        groupdict={}
        remaining=[]
        remaining=list(set(header) - set(mutdict[kiy]))
        groupdict[1]=mutdict[kiy]
        groupdict[2]=remaining
       # export_enrich1.write(kiy)
        for key2 in group:
           
            
            r=float(len(list(set(group[key2])))-len(list(set(group[key2]) - set(mutdict[kiy]))))
            n=float(len(group[key2]))
            R=float(len(set(mutdict[kiy])))
            N=float(len(header))
        
            if r==0 or R==1.0:
                print kiy,key2,r,n,R,N
                pval=float(1)
                z=float(0)
                null_z = 0.000
                zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                zsd.SetP(pval)
            else:
                try: z = Zscore(r,n,N,R)
                except : z = 0.0000
                ### Calculate a Z-score assuming zero matching entries
                try: null_z = Zscore(0,n,N,R)
                except Exception: null_z = 0.000
               
                
                try:
                    pval = mappfinder.FishersExactTest(r,n,R,N)
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                except Exception:
                    pval=1.0
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                    #pass
                
          
            if kiy in total_Scores:
                    signature_db = total_Scores[kiy]
                    signature_db[key2]=zsd ### Necessary format for the permutation function
            else:
                    signature_db={key2:zsd}
                    total_Scores[kiy] = signature_db
    sorted_results=[]
    mutlabels={}
    for kiy in total_Scores:
        
        signature_db = total_Scores[kiy]
        ### Updates the adjusted p-value instances
        mappfinder.adjustPermuteStats(signature_db)
        for signature in signature_db:
            zsd = signature_db[signature]
           
            results = [kiy,signature,zsd.Changed(),zsd.Measured(),zsd.InPathway(),str(float(zsd.PercentChanged())/100.0),str(float(float(zsd.Changed())/float(zsd.InPathway()))), zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()] #string.join(zsd.AssociatedIDs(),'|')
            sorted_results.append([signature,float(zsd.PermuteP()),results])
    sorted_results.sort() ### Sort by p-value
    prev=""
    for (sig,p,values) in sorted_results:
        if sig!=prev:
            flag=True
            export_hit.write(string.join(values,'\t')+'\n')
        if flag:
            if (float(values[5])>=0.5 and float(values[6])>=0.5) or float(values[5])>=0.6 :
                mutlabels[values[1]]=values[0]
                flag=False
                export_hit.write(string.join(values,'\t')+'\n')
        export_enrich.write(string.join(values,'\t')+'\n')
        prev=sig
    if len(sorted_results)==0:
            export_enrich.write(string.join([splicing_factor,'NONE','NONE','NONE','NONE','NONE','NONE'],'\t')+'\n')
    export_enrich.close()
    #print mutlabels
    return mutlabels
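
The statistic above pairs a Fisher exact test (via mappfinder) with a z-score computed from r (mutated samples in the cluster), n (cluster size), R (mutated samples overall) and N (total samples). With SciPy the equivalent 2x2 test looks like this; a sketch, since mappfinder's tail choice may differ:

from scipy.stats import fisher_exact

def cluster_mutation_enrichment(r, n, R, N):
    ### 2x2 Fisher exact test for mutation enrichment in one cluster:
    ### r mutated of n in the cluster, R mutated of N overall
    table = [[r, n - r],                  ### cluster: mutated / not
             [R - r, (N - n) - (R - r)]]  ### rest:    mutated / not
    odds_ratio, p_value = fisher_exact(table, alternative='greater')
    return odds_ratio, p_value

### e.g. 8 of 12 cluster samples mutated vs 10 of 100 overall:
### print(cluster_mutation_enrichment(8, 12, 10, 100))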
Example #12
def KmeansAnalysis(filename, header, InputFile, turn):
    X = defaultdict(list)
    prev = ""
    head = 0
    for line in open(filename, 'rU').xreadlines():
        if head > 1:
            val = []
            line = line.rstrip('\r\n')
            q = string.split(line, '\t')
            for i in range(2, len(q)):
                val.append(float(q[i]))
            if q[1] == prev:
                X[prev].append(val)
            else:
                prev = q[1]
                X[prev].append(val)
        else:
            head += 1
            continue

    for key in X:
        print key
        X[key] = np.array(X[key])
        print X[key].shape
        mat = []
        dire = export.findParentDir(export.findParentDir(InputFile)[:-1])
        output_dir = dire + 'SVMOutputs'
        if os.path.exists(output_dir) == False:
            export.createExportFolder(output_dir)

        exportname = output_dir + '/R' + str(turn) + 'Kmeans_result.txt'
        #exportname=filename[:-4]+key+'.txt'
        export_results = open(exportname, "w")
        mat = zip(*X[key])
        mat = np.array(mat)
        print mat.shape
        kmeans = KMeans(n_clusters=2, random_state=0).fit(mat)

        y = kmeans.labels_
        #cent=kmeans.cluster_centers_
        y = y.tolist()
        total = len(y)
        cent_1 = y.count(0)
        cent_2 = y.count(1)
        print cent_1, cent_2
        group = 'R' + str(turn) + '_Kmeans'
        export_results.write("uid" + "\t" + group + "\n")
        if cent_1 < cent_2:
            count = 2
            for j in y:
                if j == 0:
                    export_results.write(header[count] + "\t" + "1" + "\n")
                else:
                    export_results.write(header[count] + "\t" + "0" + "\n")
                count += 1
        else:
            count = 2
            for j in y:
                if j == 1:
                    export_results.write(header[count] + "\t" + "1" + "\n")
                else:
                    export_results.write(header[count] + "\t" + "0" + "\n")
                count += 1
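
KmeansAnalysis always splits the samples into two groups and writes a 1 for whichever cluster is smaller, treating the minority cluster as the candidate subtype. The core of that relabeling as a Python 3 sketch:

import numpy as np
from sklearn.cluster import KMeans

def binary_kmeans_labels(mat, seed=0):
    ### Cluster samples (rows of mat) into 2 groups and return 0/1 labels,
    ### relabeled so the minority cluster is 1
    labels = KMeans(n_clusters=2, random_state=seed, n_init=10).fit(mat).labels_
    if np.sum(labels == 1) > np.sum(labels == 0):
        labels = 1 - labels  ### flip so the smaller cluster carries the 1
    return labels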
Example #13
def Classify(header,Xobs,output_file,grplst,name,turn,platform,output_dir,root_dir):
    count=0
    start=1
    Y=[]
    head=0
    for line in open(output_file,'rU').xreadlines():
        if head >count:
            val=[]
            counter2=0
            val2=[]
            me=0.0
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            for i in range(start,len(q)):
                try:
                    val2.append(float(q[i]))
                except Exception:
                    continue
            me=np.median(val2)
            for i in range(start,len(q)):
                try:
                    val.append(float(q[i]))
                except Exception:
                    val.append(float(me))
            #if q[1]==prev:
            Y.append(val)
        
        else:
            head+=1
            continue

    Xobs=zip(*Xobs)

    Xobs=np.array(Xobs)
    Xobs=zip(*Xobs)

    Xobs=np.array(Xobs)
    X=grplst
    X=zip(*X)
    X=np.array(X)
    #print X
    Y=zip(*Y)
    Y=np.array(Y)


    output_dir = output_dir+'/SVMOutputs'
    output_dir2 = root_dir+'/ICGS-NMF'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)
    if os.path.exists(output_dir2)==False:
        export.createExportFolder(output_dir2)
    #exportnam=output_dir+'/round'+str(turn)+'SVC_test_50cor.txt'
    #export_class=open(exportnam,"w")
    exportnam1=output_dir+'/round'+str(turn)+'SVC_decision_func.txt'
    export_class1=open(exportnam1,"w")
    
    if platform=="PSI":
        exportnam2=output_dir+'/round'+str(turn)+'SVC_Results.txt'
        export_class2=open(exportnam2,"w")
    else:
        exportnam2=output_dir2+'/FinalGroups.txt'
        export_class2=open(exportnam2,"w")
        
    exportnam3=output_dir+'/round'+str(turn)+'SVC_Results_max.txt'
    export_class3=open(exportnam3,"w")
    #export_class2.write("uid"+"\t"+"group"+"\t"+"class"+"\n")
    regr = LinearSVC()
    regr.fit(Xobs,X[:,0])
    q=regr.predict(Y)
    #print q
    count=1
    ordersamp={}
    order=[]
    for i in q:
        gr=string.split(name[int(i)-1],"_")[0]
        gr=gr.replace("V","")
        
        #export_class2.write(header[count]+"\t"+str(i)+"\t"+name[int(i)-1]+"\n")
       # export_class2.write(header[count]+"\t"+str(i)+"\t"+gr+"\n")
        ordersamp[header[count]]=[name[int(i)-1],str(i)]
        count+=1
    #print len(X[:,0])
    if len(X[:,0])>2:
        prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
        #k=list(prob_)

        export_class1.write("uid")
        #export_class2.write("uid")
        export_class3.write("uid")
        for ni in name:
            export_class1.write("\t"+"R"+str(turn)+"-"+ni)
            #export_class2.write("\t"+"R"+str(turn)+"-"+ni)
            export_class3.write("\t"+"R"+str(turn)+"-"+ni)
        export_class1.write("\n")
        #export_class2.write("\n")
        export_class3.write("\n")
        #print prob_
        for iq in range(0,len(header)-1):
            export_class1.write(header[iq+1])
            #export_class2.write(header[iq+1])
            export_class3.write(header[iq+1])
            for jq in range(0,len(name)):
                export_class1.write("\t"+str(prob_[iq][jq]))
                #print prob_[iq][jq],'\t',max(prob_[iq,:])
                if prob_[iq][jq]==max(prob_[iq,:]):
                    class_assignment = 0 ### default if the SVM winner disagrees with the prior assignment
                    if ordersamp[header[iq+1]][0]==name[jq]:
                        if max(prob_[iq,:])>0: ### Increase this value to increase SVM alignment specificity
                            class_assignment = 1
                            order.append([header[iq+1],name[jq],prob_[iq][jq],ordersamp[header[iq+1]][1]])
                        else:
                            class_assignment = 0 ### The best match is poor, hence, the cell will be excluded from final results!!!
                    export_class3.write("\t"+str(class_assignment))
                else:
                    export_class3.write("\t"+str(0))
                
            export_class1.write("\n")
            #export_class2.write("\n")
            export_class3.write("\n")
        export_class1.close()
        export_class3.close()
    else:
        if platform=="PSI":
            prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
            #k=list(prob_)
            export_class1.write("uid"+"\t")
            export_class2.write("uid"+"\t")
            export_class1.write("group")
            export_class2.write("round"+str(turn)+"-V1"+"\t"+"round"+str(turn)+"-V2"+"\n")
            #for ni in name:
            #   export_class1.write("\t"+ni)
            #   export_class2.write("\t"+ni)
            export_class1.write("\n")
            export_class2.write("\n")
            #print prob_
            #export_class1.write(header[1])
            #export_class2.write(header[1])
            for iq in range(0,len(header)-1):
                export_class1.write(header[iq+1])
                export_class2.write(header[iq+1])
                #for jq in range(0,len(X[:,0])):
                export_class1.write("\t"+str(prob_[iq]))
                if prob_[iq]>0.5:                    
                    export_class2.write("\t"+str(1)+"\t"+str(0))            
                else:
                    if prob_[iq]<-0.5:  
                        export_class2.write("\t"+str(0)+"\t"+str(1))
                    else:
                        export_class2.write("\t"+str(0)+"\t"+str(0))
                export_class1.write("\n")
                export_class2.write("\n")
        else:
            prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
            #k=list(prob_)

            export_class1.write("uid")
            #export_class2.write("uid")
            export_class3.write("uid")
            for ni in name:
                export_class1.write("\t"+"R"+str(turn)+"-"+ni)
                #export_class2.write("\t"+"R"+str(turn)+"-"+ni)
                export_class3.write("\t"+"R"+str(turn)+"-"+ni)
            export_class1.write("\n")
            #export_class2.write("\n")
            export_class3.write("\n")
            #print prob_
            for iq in range(0,len(header)-1):
                export_class1.write(header[iq+1])
                #export_class2.write(header[iq+1])
                export_class3.write(header[iq+1])
               # for jq in range(0,len(name)):
                export_class1.write("\t"+str(prob_[iq]))
                if prob_[iq]>0.0:
                        #print ordersamp[header[iq+1]],name[jq]
                    if ordersamp[header[iq+1]][0]==name[jq]: 
                        order.append([header[iq+1],name[jq],prob_[iq],ordersamp[header[iq+1]][1]])
                        export_class3.write("\t"+str(1))
                    else:
                        export_class3.write("\t"+str(0))
                    
                export_class1.write("\n")
                #export_class2.write("\n")
                export_class3.write("\n")
        export_class1.close()
        export_class3.close()
    order = sorted(order, key = operator.itemgetter(2),reverse=True)
    order = sorted(order, key = operator.itemgetter(1))
    for i in range(len(order)):
        #export_class2.write(order[i][0]+"\t"+order[i][3]+"\t"+order[i][1]+"\n")
        gr=string.split(order[i][1],"_")[0]
        gr=gr.replace("V","")
        
        #export_class2.write(header[count]+"\t"+str(i)+"\t"+name[int(i)-1]+"\n")
        export_class2.write(order[i][0]+"\t"+order[i][3]+"\t"+gr+"\n")
    
    export_class2.close()
    if platform=="PSI":
        Orderedheatmap.Classify(exportnam2)
    else:
        Orderedheatmap.Classify(exportnam3)
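
Both Classify variants train a LinearSVC on NMF-derived training samples and use decision_function scores to align every sample with its best-matching subtype. A compact sketch of the multi-class prediction step (Python 3; the data shapes and label names are assumptions):

import numpy as np
from sklearn.svm import LinearSVC

def svm_assign(X_train, y_train, X_all, names):
    ### Fit a linear SVM, then assign each row of X_all to the class with
    ### the highest one-vs-rest decision score (3+ classes assumed, so
    ### decision_function returns an (n_samples, n_classes) array)
    clf = LinearSVC().fit(X_train, y_train)
    scores = clf.decision_function(X_all)
    best = scores.argmax(axis=1)  ### index of the winning class per sample
    return [names[i] for i in best], scores

### Hypothetical usage with three subtypes:
### assignments, scores = svm_assign(train, [1, 2, 3], full_matrix,
###                                  ['subtype-A', 'subtype-B', 'subtype-C'])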
Example #14
def Enrichment(Inputfile,mutdict,mutfile,metaDataMatrixFormat,header):
    import collections
    import mappfinder
    X=defaultdict(list)
    prev=""
    head=0
    group=defaultdict(list)
    enrichdict=defaultdict(float)
    mut=export.findFilename(mutfile)
    dire=export.findParentDir(Inputfile)
    output_dir = dire+'MutationEnrichment'
    print output_dir
    export.createExportFolder(output_dir)
    number_of_samples = 0
    
    ### All enrichment results
    exportnam=output_dir+'/Enrichment_Results.txt'
    export_enrich=open(exportnam,"w")
    
    ### Selected Enrichment results based on p-value, sensitivity and specificity for association with cluster names
    exportnam=output_dir+'/Enrichment_tophits.txt'
    export_hit=open(exportnam,"w")
   
    header = "Mutations"+"\t"+"Cluster"+"\t"+"r"+"\t"+"R"+"\t"+"n"+"\t"+"Sensitivity"+"\t"+"Specificity"+"\t"+"z-score"+"\t"+"Fisher exact test"+"\t"+"adjp value"+"\n"
    export_enrich.write(header)
    export_hit.write(header)
    header2=returnSamplesInMetaData(Inputfile,metaDataMatrixFormat=True)
    print header2
    for line in open(Inputfile,'rU').xreadlines():
        if head > 0:
            number_of_samples+=1
            line=line.rstrip('\r\n')
            q = string.split(line,'\t')
            for i in range(1,len(q)):
                if q[i]==str(1):
                    #group[q[0]].append(header2[i-1])
                    group[header2[i-1]].append(q[0]) ### [Cluster] = [full_sample_ID]
        else:
            head+=1
            continue
   
    print 'Number of patient samples in dataset =',number_of_samples
    total_Scores={}
    for kiy in mutdict:
        if kiy =="MDP":
            print mutdict[kiy]
        groupdict={}
        remaining=[]
        remaining=list(set(header) - set(mutdict[kiy]))
        groupdict[1]=mutdict[kiy]
        groupdict[2]=remaining
        #export_enrich1.write(kiy)
        for key2 in group:
            r=float(len(list(set(group[key2])))-len(list(set(group[key2]) - set(mutdict[kiy]))))
            n=float(len(group[key2]))
            R=float(len(set(mutdict[kiy])))
            N=float(number_of_samples)
            if r==0 or key2=="1" or R==1.0:
                #print kiy,key2,r,n,R,N
                pval=float(1)
                z=float(0)
                null_z = 0.000
                zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                zsd.SetP(pval)
            else:
                try: z = Zscore(r,n,N,R)
                except: z=0
                ### Calculate a Z-score assuming zero matching entries
                try: null_z = Zscore(0,n,N,R)
                except Exception: null_z = 0.000
               
                try:
                    pval = mappfinder.FishersExactTest(r,n,R,N)
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                except Exception:
                    pval=1.0
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                    #pass
                
            if kiy in total_Scores:
                    signature_db = total_Scores[kiy]
                    signature_db[key2]=zsd ### Necessary format for the permutation function
            else:
                    signature_db={key2:zsd}
                    total_Scores[kiy] = signature_db
    sorted_results=[]
    mutlabels={}
    for kiy in total_Scores:
        signature_db = total_Scores[kiy]
        ### Updates the adjusted p-value instances
        mappfinder.adjustPermuteStats(signature_db)
        for signature in signature_db:
            zsd = signature_db[signature]
            results = [kiy,signature,zsd.Changed(),zsd.Measured(),zsd.InPathway(),str(float(zsd.PercentChanged())/100.0),str(float(float(zsd.Changed())/float(zsd.InPathway()))), zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()] #string.join(zsd.AssociatedIDs(),'|')
            sorted_results.append([signature,-1*float(zsd.ZScore()),results])
    sorted_results.sort() ### Sort by z-score (descending, via the -1 multiplier)
    
    prev=""
    for (sig,p,values) in sorted_results:
        if sig!=prev:
            flag=True
            export_hit.write(string.join(values,'\t')+'\n')
        if flag:
            ### Update the cluster label to include the top enriched term meeting, sensitivity and specificity cutoffs
            #print values[5],values[6],values[6],values[2]; sys.exit()
            if (float(values[5])>=0.2 and float(values[6])>=0.2 and float(values[7])>=1.95 and float(values[2])>=2):
                clusterID = values[1]
                topEnrichedTerm=values[0]
                mutlabels[clusterID]=clusterID+' ('+topEnrichedTerm+')'
                flag=False
                export_hit.write(string.join(values,'\t')+'\n')
        export_enrich.write(string.join(values,'\t')+'\n')
        prev=sig
    if len(sorted_results)==0:
            export_enrich.write(string.join([splicing_factor,'NONE','NONE','NONE','NONE','NONE','NONE'],'\t')+'\n')
    export_enrich.close()

    return mutlabels
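
The Zscore(r, n, N, R) helper referenced in both Enrichment variants is not shown here. MAPPFinder-style enrichment normally uses a normal approximation to the hypergeometric distribution, so a sketch under that assumption (the standard MAPPFinder formula, not necessarily this module's exact code) is:

import math

def Zscore(r, n, N, R):
    ### Normal approximation to the hypergeometric distribution:
    ### r hits in a cluster of n, given R hits among N samples overall
    expected = n * R / N  ### expected hits in the cluster
    variance = n * (R / N) * (1 - R / N) * (1 - (n - 1) / (N - 1))
    return (r - expected) / math.sqrt(variance)

### e.g. Zscore(8, 12, 100, 10) is strongly positive (enriched)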