Example #1
def main():
    dbConn = db.connect_to_server('pfacts003_test', 'webuser')
    dbCur = db.get_cursor(dbConn)
    dbConn.set_isolation_level(0)

    try:
        dbCur.execute("DROP TABLE snp_rost_intrepid_header;")
        print "\nPrevious version of 'snp_rost_intrepid_header' deleted."
    except:
        print "\nTable 'snp_rost_intrepid_header' does not exist and will be created."
    try:
        dbCur.execute("DROP TABLE snp_rost_intrepid_detail;")
        print "Previous version of 'snp_rost_intrepid_detail' deleted."
    except:
        print "Table 'snp_rost_intrepid_detail' does not exist and will be created."

    dbCur.execute("CREATE TABLE snp_rost_intrepid_header \
          (id serial PRIMARY KEY, protaccid varchar, familyid integer, \
          familytype char(1), msaok boolean, treeok boolean, idmapok boolean, scoresok boolean, familyscore integer, \
          numresscored integer, starttime varchar);")
    print "Created new or replacement 'snp_rost_intrepid_header' table in database."

    dbCur.execute("CREATE TABLE snp_rost_intrepid_detail \
      (id serial PRIMARY KEY, protaccid varchar, familyid integer, position integer,resseq integer,\
      chain varchar,residue char(1),icode char(1),scoreimpmap float,scorejsmap float,scoreremap float,\
      scoreglobalmap float,scoreglobalksmap float,scoreglobalremap float);")
    print "Created new or replacement 'snp_rost_intrepid_detail' table in database.\n"

    dbConn.commit()
    dbConn.close()
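
A side note on the drop-then-create pattern in these examples: the bare try/except blocks exist only to tolerate a missing table, and PostgreSQL's DROP TABLE IF EXISTS achieves the same thing without swallowing unrelated errors. A minimal sketch, assuming the same db helper module and connection settings used above:

# Sketch only: IF EXISTS makes the drops idempotent, so no try/except is needed.
dbConn = db.connect_to_server('pfacts003_test', 'webuser')
dbCur = db.get_cursor(dbConn)
dbConn.set_isolation_level(0)
dbCur.execute("DROP TABLE IF EXISTS snp_rost_intrepid_detail;")
dbCur.execute("DROP TABLE IF EXISTS snp_rost_intrepid_header;")
dbConn.commit()
dbConn.close()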
Example #2
def main():
    askForConfirm()
    
    dbConn=db.connect_to_server('pfacts003_test','webuser')
    dbCur=db.get_cursor(dbConn)
    dbConn.set_isolation_level(0)

    try:
        dbCur.execute("DROP VIEW snp_results_detail_avg;")
        print "\nPrevious version of 'snp_results_detail_avg' deleted."
    except:
        print "\nView 'snp_results_detail_avg' does not exist and will be created."
    try:
        dbCur.execute("DROP TABLE snp_results_detail;")
        print "Previous version of 'snp_results_detail' deleted."
    except:
        print "Table 'snp_results_detail' does not exist and will be created."
    try:
        dbCur.execute("DROP TABLE snp_results_header;")
        print "Previous version of 'snp_results_header' deleted."
    except:
        print "Table 'snp_results_header' does not exist and will be created."

    dbCur.execute("CREATE TABLE snp_results_header ( \
          id serial PRIMARY KEY, \
          time_stamp timestamp UNIQUE, \
          dataset_used varchar \
          );")
    print "Created new or replacement 'snp_results_header' table in database."

    dbCur.execute("CREATE TABLE snp_results_detail ( \
      id serial PRIMARY KEY, \
      time_stamp timestamp REFERENCES snp_results_header(time_stamp), \
      model_used varchar, \
      fold_no integer, \
      seq_no integer, \
      val_recall decimal(4,3), \
      val_precision decimal(4,3) \
      );")
    print "Created new or replacement 'snp_results_detail' table in database."

    dbCur.execute("CREATE VIEW snp_results_detail_avg AS \
      SELECT time_stamp, model_used, seq_no, AVG(val_recall) as recall, AVG(val_precision) as precision \
      FROM snp_results_detail \
      GROUP BY time_stamp, model_used, seq_no \
      ORDER BY time_stamp ASC, model_used ASC, seq_no ASC;")
    print "Created new or replacement 'snp_results_detail_avg' view in database.\n"

    dbConn.commit()
    dbConn.close()
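
For reference, the averaged view defined above can be read back per run and model. A small query sketch using the same db helpers (the time stamp and model name below are placeholders, not real data):

# Sketch: fetch averaged recall/precision for one run and model from snp_results_detail_avg.
dbConn = db.connect_to_server('pfacts003_test', 'webuser')
dbCur = db.get_cursor(dbConn)
rows = db.get_query_list(dbCur,
    "SELECT seq_no, recall, precision FROM snp_results_detail_avg \
     WHERE time_stamp = %s AND model_used = %s ORDER BY seq_no ASC;",
    ['2012-01-01 00:00:00', 'log reg'])
for seq_no, recall, precision in rows:
    print seq_no, recall, precision
dbConn.close()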
Example #3
def main():
    askForConfirm()
    
    dbConn=db.connect_to_server('pfacts003','webuser')
    dbCur=db.get_cursor(dbConn)
    dbConn.set_isolation_level(0)

    try:
        dbCur.execute("DROP TABLE snp_SIFT_pred_header;")
        print "\nPrevious version of 'snp_SIFT_pred_header' deleted."
    except:
        print "\nTable 'snp_SIFT_pred_header' does not exist and will be created."
    try:
        dbCur.execute("DROP TABLE snp_SIFT_pred_detail;")
        print "Previous version of 'snp_SIFT_pred_detail' deleted."
    except:
        print "Table 'snp_SIFT_pred_detail' does not exist and will be created."

    dbCur.execute("CREATE TABLE snp_SIFT_pred_header ( \
                      id serial PRIMARY KEY, \
                      time_stamp timestamp UNIQUE, \
                      dataset_used varchar \
                      );")
    print "Created new or replacement 'snp_SIFT_pred_header' table in database."

    dbCur.execute("CREATE TABLE snp_SIFT_pred_detail ( \
                      id serial PRIMARY KEY, \
                      time_stamp timestamp REFERENCES snp_SIFT_pred_header(time_stamp), \
                      accession varchar, \
                      aa_ref varchar, \
                      aa_pos integer, \
                      aa_mut varchar, \
                      prediction integer, \
                      score1 float, \
                      score2 float, \
                      score3 float, \
                      score4 float \
                      );")
    print "Created new or replacement 'snp_SIFT_pred_detail' table in database."

    dbConn.commit()
    dbConn.close()
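
The header/detail pair above is linked through time_stamp (UNIQUE in the header, a foreign key in the detail table), so a run is recorded by writing one header row and then any number of detail rows against the same time stamp. A hedged sketch with placeholder values, reusing the cursor and commit pattern from the scripts above:

# Sketch only: the accession, mutation, and scores are placeholders.
ts = '2012-01-01 00:00:00'
dbConn = db.connect_to_server('pfacts003', 'webuser')
dbCur = db.get_cursor(dbConn)
dbCur.execute("INSERT INTO snp_SIFT_pred_header (time_stamp, dataset_used) \
               VALUES (timestamp %s, %s);", [ts, 'rost'])
dbCur.execute("INSERT INTO snp_SIFT_pred_detail \
               (time_stamp, accession, aa_ref, aa_pos, aa_mut, prediction, score1, score2, score3, score4) \
               VALUES (timestamp %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);",
              [ts, 'P00000', 'A', 42, 'V', 1, 0.0, 0.0, 0.0, 0.0])
dbConn.commit()
dbConn.close()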
Example #4
def main():
    if len(sys.argv) != 4:
        print "Usage: %s <PBS_JOBID> <NUMTHREADS> <NUMPROTSPROCESSED>" % sys.argv[
            0]
        sys.exit(1)

    jobID = int(sys.argv[1])
    numThreads = int(sys.argv[2])
    numProtsProcessed = int(sys.argv[3])
    jobidalpha = "00" + str(jobID)
    #    sys.stdout=NullDevice() #Turn off print messages.
    startTime = datetime.now()
    pgmName = 'compute_intrepid_snp_scores'
    fout = open(
        pgmName + ".out." + startTime.strftime("%Y%m%d.%Hh%M") + "." +
        jobidalpha[-3:], "w")
    fout.write("\nStarting program...\n")
    fout.write("Turned off deprecation warnings.\n")

    dbConn = db.connect_to_server('pfacts003_test', 'webuser')
    dbCur = db.get_cursor(dbConn)
    dbConn.set_isolation_level(0)

    proteinsList = db.get_query_list(
        dbCur,
        "SELECT DISTINCT u.accession FROM uniprot as u INNER JOIN snp_rost_master as s \
    ON s.accession=u.accession WHERE s.status='M' ORDER BY u.accession ASC LIMIT %s;",
        [numProtsProcessed])

    totCounter = 0
    totCounterThread = 0
    totResCounter = 0

    l1 = 65
    l2 = 15
    for prot in proteinsList:
        if totCounter % numThreads != jobID:  #Check if this thread should be computing this protein in list.
            totCounter += 1
            continue  #Skip this protein.

        protAccID = prot[0]
        totCounterThread += 1

        famsList = db.get_query_list(
            dbCur, "SELECT DISTINCT f.id, f.family_type_id, f.score \
        FROM family_sequence_taxa_2012_02_17 as fs \
        LEFT JOIN family as f ON fs.family_id=f.id WHERE fs.uniprot_accession=%s \
        ORDER BY f.family_type_id DESC, f.score DESC;", [protAccID])
        if len(famsList) == 0:  # No families found for protein.
            famid = 0
            famtype = 'N'
            famscore = 0
            msaok = False
            treeok = False
            idmapok = False
            scoresok = False
            dbCur.execute("INSERT INTO intrepid_header \
              (accession,fam_id,fam_type,fam_score,msaok,treeok,idmapok,scoresok,qty_scored,start_pos,start_time) \
               VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"                                                          ,\
                   [protAccID,famid,famtype,famscore,msaok,treeok,idmapok,scoresok,0,0,startTime])
        else:
            flgGHGFound = False  # Initialize to false.
            for fam in famsList:  # Loop over all families for protein.
                famid = fam[0]
                famtype = fam[1]
                famscore = fam[2]
                if famtype == 'C' or (famtype == 'G' and flgGHGFound == False):
                    if famtype == 'G':
                        flgGHGFound = True
                    results = i.run_intrepid(protAccID, famid)  # Run INTREPID and get results.
                    if results[0] == False:  # INTREPID run was not OK, results[1] will indicate why.
                        dbCur.execute("INSERT INTO intrepid_header \
                          (accession,fam_id,fam_type,fam_score,msaok,treeok,idmapok,scoresok,qty_scored,\
                           start_pos,start_time) \
                           VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"                                                                      ,\
                                 [protAccID,famid,famtype,famscore,results[1][0],results[1][1],results[1][2],\
                                  results[1][3],None,None,startTime])
                    else:  # INTREPID run was OK, results[1] will be list of scores and results[2] will be start pos in seq.
                        for line in results[1]:
                            dbCur.execute("INSERT INTO intrepid_detail \
                              (accession, fam_id, position, resseq, chain, residue, icode, scoreimpmap, \
                               scorejsmap, scoreremap, scoreglobalmap, scoreglobalksmap, scoreglobalremap) \
                               VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"                                                                                , \
                                 [protAccID,famid,line[0],line[1],line[2],line[3],\
                                  line[4],line[5],line[6],line[7],line[8],line[9],line[10]])
                        startpos = i.get_start_pos(dbCur, protAccID,
                                                   results[2])
                        dbCur.execute("INSERT INTO intrepid_header \
                          (accession,fam_id,fam_type,fam_score,msaok,treeok,idmapok,scoresok,qty_scored,\
                           start_pos,start_time) \
                           VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"                                                                      , \
                             [protAccID,famid,famtype,famscore,True,True,True,True,len(results[1]),startpos,startTime])
                        totResCounter += len(results[1])  # Increment counter of total number of detail lines added.
        totCounter += 1

    fout.write("\nResults\n")
    fout.write("-" * (l1 + l2) + "\n")
    fout.write("Number of proteins scored:".ljust(l1) +
               str(totCounterThread).rjust(l2) + "\n")
    fout.write("Number of protein positions scored:".ljust(l1) +
               str(totResCounter).rjust(l2) + "\n")
    endTime = datetime.now()
    diff = endTime - startTime
    fout.write("Elapsed time (HH:MM:SS):".ljust(l1) + str(diff).rjust(l2) +
               "\n\n")
    fout.close()
    dbConn.commit()
    dbConn.close()
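
The totCounter % numThreads != jobID test in the loop above is what splits the protein list round-robin across parallel PBS jobs: thread jobID keeps every numThreads-th protein and skips the rest. The same idea in isolation, with a hypothetical helper name, purely for illustration:

def round_robin_share(items, num_threads, job_id):
    # Thread job_id handles the items at positions job_id, job_id + num_threads, ...
    return [item for idx, item in enumerate(items) if idx % num_threads == job_id]

# Example: with 3 threads, job 1 gets positions 1, 4, 7, ...
# round_robin_share(range(9), 3, 1) -> [1, 4, 7]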
Example #5
def main():
    if len(sys.argv) != 3:
        print "Usage: %s <DATASET> <NUMFOLDS>\nProgram terminated!" % sys.argv[0]
        sys.exit(1)
    dataset = sys.argv[1]
    if dataset != 'rost':
        print "Dataset '%s' not supported.\nProgram terminated!" % dataset
        sys.exit(1)
    numFolds = int(sys.argv[2])
    askWhichModels('log reg')
    askWhichModels('SVM')

    startTime = datetime.now()
    formattedStartTime = startTime.strftime("%Y%m%dd%Hh%M")
    formattedDBTimeStamp = startTime.strftime("%Y-%m-%d %H:%M:%S")
    pgmName = 'run_model'
    fout = open(pgmName+".out."+formattedStartTime,"w")
    fout.write("\nStarting program...\n")

    dbConn = db.connect_to_server('pfacts003_test','webuser')
    dbCur = db.get_cursor(dbConn)
    dbConn.set_isolation_level(0)

    #Get entire dataset with all required features.
    completeSNPList = db.get_query_list(dbCur,"SELECT id, label FROM snp_rost_detail_valid ORDER BY id ASC;")
    numSNPsPre = len(completeSNPList)
    idsWithFeatures = n.array(completeSNPList)[:,0]; y = n.array(completeSNPList)[:,1]
    idsWithFeatures.shape = (numSNPsPre,1); y.shape = (numSNPsPre,1) # Force as col vectors.
    
    X = n.ones((numSNPsPre,1)) #Prepend a col of ones for the intercept.
    featuresUsed.append('Intercept')

    validIndices,newFeatures = f.extract_intrepid_features(dbCur,idsWithFeatures)
    idsWithFeatures = idsWithFeatures[validIndices]
    y = y[validIndices]; X = n.hstack([X[validIndices,:],newFeatures])
    featuresUsed.append('INTREPID scores')
    numSNPsPost = len(y)

    validIndices,newFeatures = f.extract_blossum62_scores(dbCur,idsWithFeatures)
    idsWithFeatures = idsWithFeatures[validIndices]
    y = y[validIndices]; X = n.hstack([X[validIndices,:],newFeatures])
    featuresUsed.append('BLOSSUM62 scores')
    numSNPsPost = len(y)

    #Determine partition of dataset into folds.
    foldAssignments = m.get_shuffled_discrete_uniform_indices(0,numFolds-1,numSNPsPost)

    #Create result header record to document run.
    dbCur.execute("INSERT INTO snp_results_header (time_stamp, dataset_used) VALUES (timestamp %s, %s);",[formattedDBTimeStamp,dataset])

    #Perform k-fold cross-validation and produce results (per fold and overall).
    num_cutoff_points = 101
    results_all_folds = n.zeros((2*numFolds,num_cutoff_points))
    average_performance_results = n.zeros((2,num_cutoff_points)) #This will hold the overall results, first as a total and then as an average.
    average_performance_num_contributors = n.zeros((1,num_cutoff_points)) #This keeps track of how many rounds contributed to the total for a cutoff point.
    for round in range(0,numFolds):
        trainX = X[n.nonzero(foldAssignments != round)[0],:]
        trainY = y[n.nonzero(foldAssignments != round)[0],:]
        testX = X[n.nonzero(foldAssignments == round)[0],:]
        testY = y[n.nonzero(foldAssignments == round)[0],:]
        if 'log reg' in modelsUsed:
            models.run_logreg_model(dbCur,trainY,trainX,testY,testX,formattedDBTimeStamp,round,num_cutoff_points)
        if 'SVM' in modelsUsed:
            models.run_svm_model(dbCur,trainY,trainX,testY,testX,formattedDBTimeStamp,round,num_cutoff_points)

    l1=65
    l2=15
    fout.write("\nResults\n")
    fout.write("-"*(l1+l2)+"\n")
    fout.write("Dataset used:".ljust(l1)+str(dataset).rjust(l2)+"\n")
    fout.write("Number of folds in cross validation:".ljust(l1)+str(numFolds).rjust(l2)+"\n")
    fout.write("Number of SNPs included in dataset before feature extraction:".ljust(l1)+str(numSNPsPre).rjust(l2)+"\n")
    fout.write("Number of SNPs included in dataset post feature extraction:".ljust(l1)+str(numSNPsPost).rjust(l2)+"\n")
    fout.write("Models used:"+"\n")
    for modelDesc in modelsUsed:
        fout.write("\t"+modelDesc+"\n")
    fout.write("Features used:"+"\n")
    for featureDesc in featuresUsed:
        fout.write("\t"+featureDesc+"\n")
    fout.write("See database for storage of results.\n")
    endTime=datetime.now()
    diff=endTime-startTime
    fout.write("Elapsed time (HH:MM:SS):".ljust(l1)+str(diff).rjust(l2)+"\n\n")
    fout.close()
    dbConn.commit()
    dbConn.close()
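
The fold handling above relies on m.get_shuffled_discrete_uniform_indices, which appears to give every SNP a random fold label; each round's train/test split is then just a boolean mask over those labels. A rough numpy equivalent of that bookkeeping (an assumption about what the helper does, not its actual code):

import numpy as n

def assign_folds(num_examples, num_folds):
    # Label each example with a fold id drawn uniformly from 0 .. num_folds-1.
    return n.random.randint(0, num_folds, num_examples)

foldAssignments = assign_folds(1000, 5)
for fold in range(5):
    trainIdx = n.nonzero(foldAssignments != fold)[0]  # everything outside the held-out fold
    testIdx = n.nonzero(foldAssignments == fold)[0]   # the held-out fold itself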
Example #6
def main():
    #Check input.
    if len(sys.argv) != 2:
        print "\nUsage: %s <filename w/o pdf extension>\nProgram terminated!\n" % os.path.basename(sys.argv[0])
        mlab.close()
        sys.exit(1)
    else:
        filename = sys.argv[1]
        print "\nProgram will create a plot/figure file named "+filename+".pdf\n" 

    #Set up empty request list.
    plotsRequested = []

    #Prompt for type of report.
    options = ['None/Quit','PR Curve(s)']
    chosenReport = getChoice('Choose the number of the desired report',options,True)
    if chosenReport == 0: mlab.close(); sys.exit('Program terminated at user request!')

    dbConn = db.connect_to_server('pfacts003_test','webuser')
    dbCur = db.get_cursor(dbConn)
    dbConn.set_isolation_level(0)

    if chosenReport == 1:
        runsAvailable = db.get_query_list(dbCur,"SELECT time_stamp, dataset_used FROM snp_results_header ORDER BY time_stamp DESC;")
        if len(runsAvailable) == 0:
            print "No results available for chosen report type."
            mlab.close(); sys.exit(1)
        else:
            idxFirst = 1; runsAvailable.insert(0,('None/Quit','NA'))
            print "Available runs include the following:"
            for time_stamp, dataset in runsAvailable[1:]:
                print "\t"+str(idxFirst)+"\t"+str(time_stamp)+"\t"+dataset
                idxFirst += 1
            chosenRun = getChoice("Choose the number of the desired run",range(len(runsAvailable)),False)
            if chosenRun == 0: mlab.close(); sys.exit('Program terminated at user request!')
            modelsAvailable = db.get_query_list(dbCur,"SELECT DISTINCT model_used FROM snp_results_detail WHERE time_stamp = %s ORDER BY model_used ASC;", \
                                                [runsAvailable[chosenRun][0]])
            modelsAvailable = [i[0] for i in modelsAvailable] # Flatten list of lists.
            modelsAvailable.insert(0,'None/Quit') # Add this option at start of list for user.
            print "For the run chosen, the following models have results:"
            idxSec = 1
            for each in modelsAvailable[1:]:
                print "\t"+str(idxSec)+"\t"+str(each)
                idxSec += 1
            while True:
                chosenModel = getChoice("Choose the number of a model to add/include or 'None' to continue",modelsAvailable,True)
                if chosenModel == 0:
                    break
                else:
                    chosenType = getChoice("Choose the number corresponding to level of detail or 'None' to continue",\
                                           ['None','Individual Folds','Average over Folds'],True)
                    if chosenType == 0:
                        continue
                    else:
                        plotsRequested.append((runsAvailable[chosenRun][0],modelsAvailable[chosenModel],chosenType))
                        
            # Collect data requested.
            dataToPlot = n.zeros((2,0)) # Create an empty 2-row, 0-col matrix that will be used to horizontally append data.
            startPoints = n.array([]) # Initialize empty array that stores where each series starts.
            counter = 1 # Need to use base 1 because that's what Matlab uses.
            legend_string = ''
            for time_stamp,model,type in plotsRequested:
                if type == 1:
                    currFold = -1
                    data = db.get_query_list(dbCur,"SELECT fold_no, seq_no, val_recall, val_precision FROM snp_results_detail \
                           WHERE time_stamp = %s AND model_used = %s ORDER BY fold_no ASC, seq_no ASC;", [time_stamp,model])
                    for fold,seq,valrecall,valprec in data:
                        if fold != currFold:
                            startPoints = n.hstack([startPoints,n.array([counter])])
                            currFold = fold
                            legend_string = legend_string + 'Fold ' + str(fold) + ','
                        if valrecall != None and valprec != None:
                            dataToPlot = n.hstack([dataToPlot,n.array([[float(valrecall)],[float(valprec)]])])
                            counter += 1
                if type == 2:
                    legend_string = legend_string + model + ','
                    data = db.get_query_list(dbCur,"SELECT seq_no, recall, precision FROM snp_results_detail_avg \
                           WHERE time_stamp = %s AND model_used = %s ORDER BY seq_no ASC;", [time_stamp,model])
                    startPoints = n.hstack([startPoints,n.array([counter])])
                    for seq,valrecall,valprec in data:
                        if valrecall != None and valprec != None:
                            dataToPlot = n.hstack([dataToPlot,n.array([[float(valrecall)],[float(valprec)]])])
                            counter += 1
                        
    mlab.close()
    if startPoints.shape[0] <= 1: # Only one series (or none) is present.
        legend_string = 'NA'
    else:
        legend_string = legend_string[0:-1] # Strip off trailing comma.
    plot_pr_curve(dataToPlot,startPoints,filename,legend_string)
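
For context on the plotting structures above: every requested curve is packed column-wise into a single 2xN array (recall in row 0, precision in row 1), and startPoints records the 1-based column at which each curve begins, since the data is handed to Matlab. A small, purely illustrative sketch of how such a packed array splits back into separate series:

import numpy as n

def split_packed_series(data, start_points):
    # start_points holds 1-based column indices marking where each curve begins.
    starts = [int(s) - 1 for s in start_points]   # back to 0-based
    ends = starts[1:] + [data.shape[1]]
    return [data[:, a:b] for a, b in zip(starts, ends)]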
Example #7
def run_query(query):

    dbConn = db.connect_to_server('pfacts003_test', 'webuser')
    dbCur = db.get_cursor(dbConn)
    dbConn.set_isolation_level(0)
    return db.get_query_list(dbCur, query)
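
A usage note on run_query: it returns whatever db.get_query_list produces and never commits or closes the connection, so it is only suited to read-only statements. An illustrative call (the query text is an example, not taken from the scripts above):

# Illustrative only: counts the header rows in the table created in Example #2.
rows = run_query("SELECT COUNT(*) FROM snp_results_header;")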