Example #1
0
def probesetSummarize(exp_file_location_db, analyze_metaprobesets,
                      probeset_type, species, root):
    """Summarize probeset intensities from CEL files using Affymetrix
    Power Tools (APT) apt-probeset-summarize.

    Parameters (Python 2 code; no annotations possible):
        exp_file_location_db -- dict mapping a dataset name to an
            ExpressionFileLocationData instance describing all input/output
            file locations for that dataset.
        analyze_metaprobesets -- 'yes' to summarize at the metaprobeset
            (gene) level via an AltDatabase .mps file.
        probeset_type -- string used to select the .mps file name.
        species -- species code used to build AltDatabase paths.
        root -- not referenced in the visible portion of this function
            (presumably a Tk root window passed through -- TODO confirm).

    Side effects: launches the bundled APT binaries via subprocess, copies
    the resulting summary files to the expression/stats file locations
    recorded on each ExpressionFileLocationData, and removes the
    intermediate APT output files.

    NOTE(review): only the CDF ('cdf'/'CDF' in the library file name)
    workflow is visible here; no else-branch for PGF-based arrays appears
    in this span.
    """
    for dataset in exp_file_location_db:  ### Instance of the Class ExpressionFileLocationData
        fl = exp_file_location_db[dataset]
        # Pull all file locations and options for this dataset from the
        # ExpressionFileLocationData accessors.
        apt_dir = fl.APTLocation()
        array_type = fl.ArrayType()
        pgf_file = fl.InputCDFFile()  # despite the name, may hold a CDF path (checked below)
        clf_file = fl.CLFFile()
        bgp_file = fl.BGPFile()
        xhyb_remove = fl.XHybRemoval()
        cel_dir = fl.CELFileDir() + '/cel_files.txt'
        expression_file = fl.ExpFile()
        stats_file = fl.StatsFile()
        output_dir = fl.OutputDir() + '/APT-output'
        cache_dir = output_dir + '/apt-probeset-summarize-cache'
        architecture = fl.Architecture(
        )  ### May over-ride the real architecture if a failure occurs
        get_probe_level_results = 'yes'

        if get_probe_level_results == 'yes': export_features = 'yes'
        # NOTE(review): export_features is reassigned to 'true' below when
        # analyze_metaprobesets == 'yes' -- the 'yes'/'true' mix looks
        # inconsistent; confirm which value APT expects.
        if xhyb_remove == 'yes' and (array_type == 'gene'
                                     or array_type == 'junction'):
            xhyb_remove = 'no'  ### This is set when the user mistakenly selects exon array, initially
        if analyze_metaprobesets == 'yes':
            export_features = 'true'
            metaprobeset_file = filepath('AltDatabase/' + species + '/' +
                                         array_type + '/' + species + '_' +
                                         array_type + '_' + probeset_type +
                                         '.mps')
            count = verifyFileLength(metaprobeset_file)
            if count < 2:
                # The .mps file is missing or effectively empty -- rebuild it.
                from build_scripts import ExonArray
                ExonArray.exportMetaProbesets(
                    array_type,
                    species)  ### Export metaprobesets for this build
        import subprocess
        import platform
        print 'Processor architecture set =', architecture, platform.machine()
        # Select the bundled APT binary that matches the OS/architecture.
        # NOTE(review): apt_file remains undefined if no branch matches
        # (e.g. Windows with an unrecognized architecture string), which
        # would raise NameError at the filepath() call below.
        if '/bin' in apt_dir:
            apt_file = apt_dir + '/apt-probeset-summarize'  ### if the user selects an APT directory
        elif os.name == 'nt':
            if '32bit' in architecture:
                apt_file = apt_dir + '/PC/32bit/apt-probeset-summarize'
                plat = 'Windows'
            elif '64bit' in architecture:
                apt_file = apt_dir + '/PC/64bit/apt-probeset-summarize'
                plat = 'Windows'
        elif 'darwin' in sys.platform:
            apt_file = apt_dir + '/Mac/apt-probeset-summarize'
            plat = 'MacOSX'
        elif 'linux' in sys.platform:
            if '32bit' in platform.architecture():
                apt_file = apt_dir + '/Linux/32bit/apt-probeset-summarize'
                plat = 'linux32bit'
            elif '64bit' in platform.architecture():
                apt_file = apt_dir + '/Linux/64bit/apt-probeset-summarize'
                plat = 'linux64bit'
        apt_file = filepath(apt_file)
        # The probe-level extractor binary sits next to the summarizer.
        apt_extract_file = string.replace(apt_file, 'probeset-summarize',
                                          'cel-extract')

        #print 'AltAnalyze has choosen APT for',plat
        print "Beginning probeset summarization of input CEL files with Affymetrix Power Tools (APT)..."
        if 'cdf' in pgf_file or 'CDF' in pgf_file:
            # 3' array (CDF library file) workflow.
            if xhyb_remove == 'yes' and array_type == 'AltMouse':
                kill_list_dir = osfilepath('AltDatabase/' + species +
                                           '/AltMouse/' + species +
                                           '_probes_to_remove.txt')
            else:
                kill_list_dir = osfilepath(
                    'AltDatabase/affymetrix/APT/probes_to_remove.txt')

            try:
                ### Below code attempts to calculate probe-level summarys and absent/present p-values
                ### for 3'arrays (may fail for arrays with missing missmatch probes - AltMouse)
                cdf_file = pgf_file
                algorithm = 'rma'
                # RMA summarization plus MAS5 absent/present detection calls.
                retcode = subprocess.call([
                    apt_file, "-d", cdf_file, "--kill-list", kill_list_dir,
                    "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir,
                    "-a", "pm-mm,mas5-detect.calls=1.pairs=1"
                ])
                try:
                    # Probe-level extraction (PM probes that have an MM pair).
                    extract_retcode = subprocess.call(
                        [
                            apt_extract_file, "-d", cdf_file,
                            "--pm-with-mm-only", "-o",
                            output_dir + '/probe.summary.txt', "--cel-files",
                            cel_dir, "-a"
                        ]
                    )  ### "quant-norm,pm-gcbg", "--report-background" -requires a BGP file
                except Exception, e:
                    #print traceback.format_exc()
                    retcode = False  ### On some system there is a no file found error, even when the analysis completes correctly
                # APT returns 0 on success, so a truthy retcode means failure.
                if retcode: status = 'failed'
                else:
                    status = 'run'
                    summary_exp_file = output_dir + '/' + algorithm + '.summary.txt'
                    export.customFileCopy(
                        summary_exp_file,
                        expression_file)  ### Removes the # containing lines
                    #shutil.copyfile(summary_exp_file, expression_file)
                    os.remove(summary_exp_file)

                    summary_stats_file = output_dir + '/pm-mm.mas5-detect.summary.txt'
                    try:
                        shutil.copyfile(summary_stats_file, stats_file)
                    except Exception:
                        None  ### Occurs if dabg export failed
                    os.remove(summary_stats_file)
            except Exception:
                #print traceback.format_exc()
                # Fallback: re-run RMA without the MAS5 detection step (some
                # arrays, e.g. AltMouse, lack the mismatch probes it needs).
                try:
                    cdf_file = pgf_file
                    algorithm = 'rma'
                    pval = 'dabg'
                    retcode = subprocess.call([
                        apt_file, "-d", cdf_file, "--kill-list", kill_list_dir,
                        "-a", algorithm, "-o", output_dir, "--cel-files",
                        cel_dir
                    ])  # "-a", pval,
                    if retcode: status = 'failed'
                    else:
                        status = 'run'
                        summary_exp_file = output_dir + '/' + algorithm + '.summary.txt'
                        export.customFileCopy(
                            summary_exp_file, expression_file
                        )  ### Removes the # containing lines
                        #shutil.copyfile(summary_exp_file, expression_file)
                        os.remove(summary_exp_file)
                except NameError:
                    status = 'failed'
Example #2
0
                                    "-b", bgp_file, "--kill-list",
                                    kill_list_dir, "-m", metaprobeset_file,
                                    "-a", algorithm, "-o", output_dir,
                                    "--cel-files", cel_dir, "--feat-details",
                                    export_features
                                ]
                            )  ### Exclude DABG p-value - known issue for Glue junction array
                        else:
                            bad_exit
                if retcode: status = 'failed'
                else:
                    status = 'run'
                    summary_exp_file = output_dir + '/' + algorithm + '.summary.txt'
                    #if analyze_metaprobesets == 'yes': annotateMetaProbesetGenes(summary_exp_file, expression_file, metaprobeset_file, species)
                    export.customFileCopy(
                        summary_exp_file,
                        expression_file)  ### Removes the # containing lines
                    #shutil.copyfile(summary_exp_file, expression_file)
                    os.remove(summary_exp_file)

                    summary_exp_file = output_dir + '/' + pval + '.summary.txt'
                    #if analyze_metaprobesets == 'yes': annotateMetaProbesetGenes(summary_exp_file, stats_file, metaprobeset_file, species)
                    try:
                        shutil.copyfile(summary_exp_file, stats_file)
                        os.remove(summary_exp_file)
                    except Exception:
                        print traceback.format_exc()
                        null = []  ### Occurs if dabg export failed

                    if analyze_metaprobesets == 'yes':
                        residual_destination_file = string.replace(
Example #3
0
def probesetSummarize(exp_file_location_db,analyze_metaprobesets,probeset_type,species,root):
    """Summarize probeset intensities from CEL files with Affymetrix Power
    Tools (APT) apt-probeset-summarize.

    Variant of the same function shown in Example #1 (here `import
    ExonArray` is a top-level module rather than `build_scripts.ExonArray`).

    Parameters (Python 2 code; no annotations possible):
        exp_file_location_db -- dict of dataset name ->
            ExpressionFileLocationData instance with all file locations.
        analyze_metaprobesets -- 'yes' to summarize at the metaprobeset
            (gene) level via an AltDatabase .mps file.
        probeset_type -- string used to select the .mps file name.
        species -- species code used to build AltDatabase paths.
        root -- not referenced in the visible portion of this function.

    Side effects: runs the bundled APT binaries via subprocess, copies the
    resulting summaries to the configured expression/stats files, and
    deletes the intermediate APT output.
    """
    for dataset in exp_file_location_db: ### Instance of the Class ExpressionFileLocationData
        fl = exp_file_location_db[dataset]
        # Gather per-dataset file locations and options.
        apt_dir =fl.APTLocation()
        array_type=fl.ArrayType()
        pgf_file=fl.InputCDFFile()  # despite the name, may hold a CDF path (checked below)
        clf_file=fl.CLFFile()
        bgp_file=fl.BGPFile()
        xhyb_remove = fl.XHybRemoval()
        cel_dir=fl.CELFileDir() + '/cel_files.txt'
        expression_file = fl.ExpFile()
        stats_file = fl.StatsFile()
        output_dir = fl.OutputDir() + '/APT-output'
        cache_dir = output_dir + '/apt-probeset-summarize-cache'
        architecture = fl.Architecture() ### May over-ride the real architecture if a failure occurs
        get_probe_level_results = 'yes'
        
        if get_probe_level_results == 'yes': export_features = 'yes'
        # NOTE(review): export_features becomes 'true' below -- 'yes'/'true'
        # mix looks inconsistent; confirm which value APT expects.
        if xhyb_remove == 'yes' and (array_type == 'gene' or array_type == 'junction'): xhyb_remove = 'no' ### This is set when the user mistakenly selects exon array, initially
        if analyze_metaprobesets == 'yes':
            export_features = 'true'
            metaprobeset_file = filepath('AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'_'+probeset_type+'.mps')
            count = verifyFileLength(metaprobeset_file)
            if count<2:
                # Missing/empty .mps file -- regenerate it for this build.
                import ExonArray
                ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        import subprocess; import platform
        print 'Processor architecture set =',architecture,platform.machine()
        # Select the bundled APT binary matching OS/architecture.
        # NOTE(review): apt_file stays undefined if no branch matches,
        # causing a NameError at the filepath() call below.
        if '/bin' in apt_dir: apt_file = apt_dir +'/apt-probeset-summarize' ### if the user selects an APT directory
        elif os.name == 'nt':
            if '32bit' in architecture: apt_file = apt_dir + '/PC/32bit/apt-probeset-summarize'; plat = 'Windows'
            elif '64bit' in architecture: apt_file = apt_dir + '/PC/64bit/apt-probeset-summarize'; plat = 'Windows'
        elif 'darwin' in sys.platform: apt_file = apt_dir + '/Mac/apt-probeset-summarize'; plat = 'MacOSX'
        elif 'linux' in sys.platform:
            if '32bit' in platform.architecture(): apt_file = apt_dir + '/Linux/32bit/apt-probeset-summarize'; plat = 'linux32bit'
            elif '64bit' in platform.architecture(): apt_file = apt_dir + '/Linux/64bit/apt-probeset-summarize'; plat = 'linux64bit'
        apt_file = filepath(apt_file)
        # The probe-level extractor binary sits next to the summarizer.
        apt_extract_file = string.replace(apt_file,'probeset-summarize','cel-extract')
        
        #print 'AltAnalyze has choosen APT for',plat
        print "Beginning probeset summarization of input CEL files with Affymetrix Power Tools (APT)..."
        if 'cdf' in pgf_file or 'CDF' in pgf_file:
            # 3' array (CDF library file) workflow.
            if xhyb_remove == 'yes' and array_type == 'AltMouse':
                kill_list_dir = osfilepath('AltDatabase/'+species+'/AltMouse/'+species+'_probes_to_remove.txt')
            else: kill_list_dir = osfilepath('AltDatabase/affymetrix/APT/probes_to_remove.txt')
            
            try:
                ### Below code attempts to calculate probe-level summarys and absent/present p-values
                ### for 3'arrays (may fail for arrays with missing missmatch probes - AltMouse)
                cdf_file = pgf_file; algorithm = 'rma'
                # RMA summarization plus MAS5 absent/present detection calls.
                retcode = subprocess.call([
                    apt_file, "-d", cdf_file, "--kill-list", kill_list_dir, "-a", algorithm, "-o", output_dir,
                    "--cel-files", cel_dir, "-a", "pm-mm,mas5-detect.calls=1.pairs=1"])
                try:
                    # Probe-level extraction (PM probes that have an MM pair).
                    extract_retcode = subprocess.call([
                        apt_extract_file, "-d", cdf_file, "--pm-with-mm-only", "-o", output_dir+'/probe.summary.txt',
                        "--cel-files", cel_dir, "-a"]) ### "quant-norm,pm-gcbg", "--report-background" -requires a BGP file
                except Exception,e:
                    #print traceback.format_exc()
                    retcode = False ### On some system there is a no file found error, even when the analysis completes correctly
                # APT returns 0 on success; truthy retcode means failure.
                if retcode: status = 'failed'
                else:
                    status = 'run'
                    summary_exp_file = output_dir+'/'+algorithm+'.summary.txt'
                    export.customFileCopy(summary_exp_file, expression_file) ### Removes the # containing lines
                    #shutil.copyfile(summary_exp_file, expression_file)
                    os.remove(summary_exp_file)
                    
                    summary_stats_file = output_dir+'/pm-mm.mas5-detect.summary.txt'
                    try: shutil.copyfile(summary_stats_file, stats_file)
                    except Exception: None ### Occurs if dabg export failed
                    os.remove(summary_stats_file)
            except Exception:
                #print traceback.format_exc()
                # Fallback: re-run RMA without MAS5 detection (some arrays,
                # e.g. AltMouse, lack the mismatch probes it needs).
                try:
                    cdf_file = pgf_file; algorithm = 'rma'; pval = 'dabg'
                    retcode = subprocess.call([
                        apt_file, "-d", cdf_file, "--kill-list", kill_list_dir, "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir]) # "-a", pval,
                    if retcode: status = 'failed'
                    else:
                        status = 'run'
                        summary_exp_file = output_dir+'/'+algorithm+'.summary.txt'
                        export.customFileCopy(summary_exp_file, expression_file) ### Removes the # containing lines
                        #shutil.copyfile(summary_exp_file, expression_file)
                        os.remove(summary_exp_file)
                except NameError:
                    status = 'failed'
Example #4
0
def NMFAnalysis(expressionInputFile,NMFinputDir,Rank,platform,iteration=0,strategy="conservative"):
    """Run sparse NMF (nimfa.Snmf) on the matrix stored in NMFinputDir and
    export the basis matrix plus binarized sample-to-component assignments
    for downstream subtype/SVM analyses.

    Parameters (Python 2 code; no annotations possible):
        expressionInputFile -- expression file copied to the NMF-SVM
            ExpressionInput folder as exp.NMF-MarkerFinder.txt.
        NMFinputDir -- path to a tab-delimited matrix (header row, first
            column = row IDs, remaining columns = samples).
        Rank -- NMF rank (number of components); str() / int() coerced, so
            int or numeric string.
        platform -- 'PSI' selects the thresholded/deduplicated branch;
            anything else selects the hard max-assignment branch.
        iteration -- round number embedded in output file names.
        strategy -- "conservative" writes the raw binary assignments to the
            metadata files; otherwise a masked variant is written.

    Returns a 4-tuple of output file paths:
        (exportnam, exportnam_bin, exportnam2, exportnam3).

    Side effects: creates several files under <root_dir>/NMF-SVM/ and calls
    Orderedheatmap.Classify on the transposed binary file.

    NOTE(review): export_res3, export_res4 and export_res7 are never
    closed in either branch (export_res7 is opened but never written);
    in the non-PSI branch export_res4 is written but also left unclosed.
    """

    # Walk up from the NMF input file to the project root directory.
    root_dir = export.findParentDir(NMFinputDir)[:-1]
    if 'ExpressionInput' in root_dir:
        root_dir = export.findParentDir(root_dir)
    if 'NMF-SVM' in root_dir:
        root_dir = export.findParentDir(root_dir)

    # NOTE(review): the return value is discarded -- this call appears to
    # have no effect; confirm findFilename() is side-effect free.
    export.findFilename(NMFinputDir)

    X=[]
    header=[]
    head=0
    # Output writers: W matrix, binary assignments, transposed binary
    # assignments, metadata and annotation files for subtype analyses,
    # plus groups./comps. companions for the MarkerFinder input copy.
    exportnam=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_versionr'+str(Rank)+'.txt'
    export_res=export.ExportFile(exportnam)
    exportnam_bin=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary'+str(Rank)+'.txt'
    export_res1=export.ExportFile(exportnam_bin)
    exportnam_bint=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary_t_'+str(Rank)+'.txt'
    export_res5=export.ExportFile(exportnam_bint)
    MF_input = root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt'
    export.customFileCopy(expressionInputFile,root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt')
    export_res4=open(string.replace(MF_input,'exp.','groups.'),"w")
    export_res7=open(string.replace(MF_input,'exp.','comps.'),"w")
    exportnam2=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Metadata'+str(Rank)+'.txt'
    export_res2=export.ExportFile(exportnam2)
    exportnam3=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Annotation'+str(Rank)+'.txt'
    export_res3=export.ExportFile(exportnam3)
    #if 'Clustering' in NMFinputDir:
     #   count=1
      #  start=2
    #else:
    count=0
    start=1
    #print Rank
    # Parse the input matrix: the first row is the header (echoed to the
    # W/binary outputs); data rows are parsed as floats, with unparseable
    # cells imputed with the row median.
    for line in open(NMFinputDir,'rU').xreadlines():
        line=line.rstrip('\r\n')
        q= string.split(line,'\t')
        if head >count:
            val=[]
            val2=[]
            me=0.0

            # First pass: collect the parseable values to get the median.
            for i in range(start,len(q)):
                try:
                    val2.append(float(q[i]))
                except Exception:
                    continue
            me=np.median(val2)
            # Second pass: median-impute anything that fails float().
            for i in range(start,len(q)):
                try:
                    val.append(float(q[i]))
                except Exception:
                    val.append(float(me))
            #if q[1]==prev:
            X.append(val)

        else:
            # Header row: echo to the W and binary outputs and remember it
            # for per-sample labels later.
            export_res1.write(line)
            export_res.write(line)
            export_res1.write("\n")
            #export_res4.write(line)
            #export_res4.write("\n")
            export_res.write("\n")
            header=q
            head+=1
            continue
    group=defaultdict(list)

    sh=[]
    X=np.array(X)
    #print X.shape
    mat=[]
    #mat=X
    # Transpose so rows correspond to samples (header columns) before NMF.
    mat=zip(*X)
    mat=np.array(mat)
    #print mat.shape
    #model = NMF(n_components=15, init='random', random_state=0)
    #W = model.fit_transform(mat)
    # Sparse NMF from nimfa; W (basis) has one row per sample and one
    # column per component, H is the coefficient matrix.
    nmf = nimfa.Snmf(mat,seed="nndsvd", rank=int(Rank), max_iter=20,n_run=1,track_factor=False,theta=0.95)
    nmf_fit = nmf()
    W = nmf_fit.basis()
    W=np.array(W)
    #np.savetxt("basismatrix2.txt",W,delimiter="\t")
    H=nmf_fit.coef()
    H=np.array(H)
   # np.savetxt("coefficientmatrix2.txt",H,delimiter="\t")
    #print W.shape
    sh=W.shape
    export_res3.write("uid\tUID\tUID\n")
    # par scales the mean+std threshold used in the PSI branch below.
    if int(Rank)==2:
        par=1
    else:
        par=2
    #for i in range(sh[1]):
    #    val=W[:,i]
    #    me=np.mean(val)
    #    st=np.std(val)
    #    export_res2.write(header[i+1])
    #    for j in range(sh[0]):
    #        if float(W[i][j])>=float(me+(par*st)):
    #
    #            export_res2.write("\t"+str(1))
    #        else:
    #            export_res2.write("\t"+str(0))
    #
    #    export_res2.write("\n")
    if platform != 'PSI':
        # Hard assignment branch: each sample is assigned to exactly one
        # component -- the first one achieving its row maximum in W.
        sh=W.shape
        Z=[]
        export_res5.write("uid")
        export_res2.write("uid")
        for i in range(sh[1]):

            export_res5.write("\t"+'V'+str(i))
            export_res2.write("\t"+'V'+str(i))
            export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")

        export_res5.write("\n")
        export_res2.write("\n")
        export_res3.write("\n")
        for i in range(sh[0]):
            new_val=[]
            val=W[i,:]
            export_res2.write(header[i+1])
            export_res5.write(header[i+1])
            export_res4.write(header[i+1])
            # flag ensures only the first maximal component gets the 1.
            flag=True
            for j in range(sh[1]):
                if W[i][j]==max(val) and flag:
                    export_res5.write("\t"+str(1))
                    export_res2.write("\t"+str(1))
                    new_val.append(1)
                    export_res4.write("\t"+str(j+1)+"\t"+'V'+str(j))
                    flag=False
                else:
                    export_res5.write("\t"+str(0))
                    export_res2.write("\t"+str(0))
                    new_val.append(0)

            Z.append(new_val)
            export_res5.write("\n")
            export_res2.write("\n")
            export_res4.write("\n")
        # Transpose W and Z to component-per-row orientation for the
        # NMFsnmf_versionr / NMFsnmf_binary output files.
        W=zip(*W)
        W=np.array(W)
        sh=W.shape
        Z=zip(*Z)
        Z=np.array(Z)
        for i in range(sh[0]):
            export_res.write('V'+str(i))
            export_res1.write('V'+str(i))
            for j in range(sh[1]):
                export_res.write("\t"+str(W[i][j]))
                export_res1.write("\t"+str(Z[i][j]))
            export_res.write("\n")
            export_res1.write("\n")

        export_res.close()
        export_res1.close()
        export_res2.close()
        export_res5.close()
        Orderedheatmap.Classify(exportnam_bint)

        return exportnam,exportnam_bin,exportnam2,exportnam3

    else:
        # PSI branch: threshold-based (soft) binarization followed by
        # removal of components redundant with a larger overlapping one.
        W=zip(*W)
        W=np.array(W)
        sh=W.shape
        Z=[]
        for i in range(sh[0]):
            new_val=[]
            val=W[i,:]
            # NOTE(review): the generator variable i shadows the outer loop
            # index here (works, but confusing).
            num=sum(i > 0.10 for i in val)
            # Components with very many (>40) or very few (<3) loadings
            # above 0.10 fall back to the mean+par*std threshold.
            if num >40 or num <3:
                compstd=True
            else:
                compstd=False
            me=np.mean(val)
            st=np.std(val)
            #print 'V'+str(i)
            export_res.write('V'+str(i))
            export_res1.write('V'+str(i))

            for j in range(sh[1]):

                if compstd:
                    if float(W[i][j])>=float(me+(par*st)):

                        export_res1.write("\t"+str(1))
                        new_val.append(1)
                    else:
                        export_res1.write("\t"+str(0))
                        new_val.append(0)
                else:
                    if float(W[i][j])>0.1:

                        export_res1.write("\t"+str(1))
                        new_val.append(1)
                    else:
                        export_res1.write("\t"+str(0))
                        new_val.append(0)
                export_res.write("\t"+str(W[i][j]))

            Z.append(new_val)
            export_res.write("\n")
            export_res1.write("\n")
       # Z=zip(*Z)
        Z=np.array(Z)
        sh=Z.shape
        Z_new=[]
        val1=[]
        Z1=[]
        dellst=[]
        export_res2.write("uid")
        export_res5.write("uid")
        # Drop a component when >50% of its members are shared with a
        # strictly larger component (keep only the non-redundant ones).
        for i in range(sh[0]):
            indices=[]
            val1=Z[i,:]
            sum1=sum(val1)
            flag=False
            indices=[index for index, value in enumerate(val1) if value == 1]
            for j in range(sh[0]):
                val2=[]

                if i!=j:
                    val2=Z[j,:]

                    sum2=sum([val2[x] for x in indices])
                    summ2=sum(val2)
                    try:
                        if float(sum2)/float(sum1)>0.5:
                            if summ2>sum1:
                                flag=True
                                #print str(i)
                    except Exception:
                        continue
            if flag==False:

                Z1.append(val1)
                export_res2.write("\t"+'V'+str(i))
                export_res5.write("\t"+'V'+str(i))
                export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")

        export_res2.write("\n")
        export_res5.write("\n")
        Z1=np.array(Z1)
        Z=Z1
        # Back to sample-per-row orientation for the metadata output.
        Z=zip(*Z)
        Z=np.array(Z)
        sh=Z.shape

        for i in range(sh[0]):
            val1=Z[i,:]
            #print sum(val1)
            #if sum(val)>2:
            # Samples assigned to more than two components get their 1s
            # masked to 0 in the "non-conservative" output variant.
            if sum(val1)>2:
                val=[0 if x==1 else x for x in val1]
            else:
                val=val1
            me=np.mean(val)
            st=np.std(val)
            export_res2.write(header[i+1])
            export_res5.write(header[i+1])
            for j in range(sh[1]):
                if strategy=="conservative":
                    export_res2.write("\t"+str(val1[j]))
                    export_res5.write("\t"+str(val1[j]))
                else:
                   export_res2.write("\t"+str(val[j]))
                   export_res5.write("\t"+str(val[j]))
            export_res2.write("\n")
            export_res5.write("\n")
            Z_new.append(val)
        Z_new=zip(*Z_new)
        Z_new=np.array(Z_new)

        sh=Z_new.shape

        # NOTE(review): export_res, export_res1, export_res2 are not closed
        # in this branch (only export_res5 is), unlike the non-PSI branch.
        export_res5.close()
        Orderedheatmap.Classify(exportnam_bint)
        # Both strategies currently return the same tuple; the branch is
        # kept (presumably) for future divergence.
        if strategy=="conservative":
            return exportnam,exportnam_bin,exportnam2,exportnam3
        else:
            return exportnam,exportnam_bin,exportnam2,exportnam3
Example #5
0
                        try: os.remove(summary_exp_file)
                        except Exception: null=[] ### Occurs if dabg export failed
                        fatal_error = APTDebugger(output_dir)
                        if len(fatal_error)>0:
                            print fatal_error
                            print 'Skipping DABG p-value calculation to resolve (Bad library files -> contact Affymetrix support)'
                            retcode = subprocess.call([
                            apt_file, "-p", pgf_file, "-c", clf_file, "-b", bgp_file, "--kill-list", kill_list_dir, "-m", metaprobeset_file,
                            "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir, "--feat-details", export_features]) ### Exclude DABG p-value - known issue for Glue junction array
                        else: bad_exit
                if retcode: status = 'failed'
                else:
                    status = 'run'
                    summary_exp_file = output_dir+'/'+algorithm+'.summary.txt'
                    #if analyze_metaprobesets == 'yes': annotateMetaProbesetGenes(summary_exp_file, expression_file, metaprobeset_file, species)
                    export.customFileCopy(summary_exp_file, expression_file) ### Removes the # containing lines
                    #shutil.copyfile(summary_exp_file, expression_file)
                    os.remove(summary_exp_file)

                    summary_exp_file = output_dir+'/'+pval+'.summary.txt'
                    #if analyze_metaprobesets == 'yes': annotateMetaProbesetGenes(summary_exp_file, stats_file, metaprobeset_file, species)
                    try:
                        shutil.copyfile(summary_exp_file, stats_file)
                        os.remove(summary_exp_file)
                    except Exception:
                        print traceback.format_exc()
                        null=[] ### Occurs if dabg export failed
                    
                    if analyze_metaprobesets == 'yes':
                        residual_destination_file = string.replace(expression_file,'exp.','residuals.')
                        residual_exp_file = output_dir+'/'+algorithm+'.residuals.txt'