import os
import re
import shutil
import sys

from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Project-local modules referenced throughout this file.
import culling.adjlistcreation
import culling.Leafcull
import mysql
import utilities.file2list
import utilities.list2file


def main(tableToUse, outputDirectory, predictionDirection, seqsPerFile, maxSeqLength, schemaProteins,
         databasePassword, columnWithPredictions):
    if predictionDirection != 'IN':
        # Clear the output directory, and then remake it.
        if os.path.exists(outputDirectory):
            shutil.rmtree(outputDirectory)
        os.mkdir(outputDirectory)

    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    if predictionDirection == 'OUTA':
        # Output all proteins, whether or not they have a prediction made for them.
        cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableToUse)
        results = cursor.fetchall()
        results = ['>' + '\n'.join([j[0], j[1]]) for j in results if len(j[1]) <= maxSeqLength]
        length = len(results)
        numberOfSplits = length / seqsPerFile
        if length % seqsPerFile != 0:
            numberOfSplits += 1
        fileOutput = [results[i * seqsPerFile : (i + 1) * seqsPerFile] for i in range(numberOfSplits)]
        for i in range(len(fileOutput)):
            utilities.list2file.main(fileOutput[i], outputDirectory + '/' + str(i) + '.fasta')
    elif predictionDirection == 'OUTS':
        # Output only those proteins that do not already have a prediction made for them.
        cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableToUse, columnWithPredictions + '="NA"')
        results = cursor.fetchall()
        results = ['>' + '\n'.join([j[0], j[1]]) for j in results if len(j[1]) <= maxSeqLength]
        length = len(results)
        numberOfSplits = length / seqsPerFile
        if length % seqsPerFile != 0:
            numberOfSplits += 1
        fileOutput = [results[i * seqsPerFile : (i + 1) * seqsPerFile] for i in range(numberOfSplits)]
        for i in range(len(fileOutput)):
            utilities.list2file.main(fileOutput[i], outputDirectory + '/' + str(i) + '.fasta')
    elif predictionDirection == 'IN':
        # Reading in predictions from the user.
        predFolderFiles = os.listdir(outputDirectory)
        fastaFiles = [i for i in predFolderFiles if i.split('.')[1] == 'fasta']
        fastaFilesNoExtension = [i.split('.')[0] for i in fastaFiles]
        predFiles = [(i, i.split('.')[0] + '.fasta') for i in predFolderFiles
                     if i.split('.')[0] in fastaFilesNoExtension and i not in fastaFiles]
        if len(predFiles) > len(fastaFiles):
            # There are too many prediction files.
            print 'There are too many prediction files, or too many files with the same name as the fasta files.'
            sys.exit()

        updatesToPerform = []
        for i in predFiles:
            predictionFile = outputDirectory + '/' + i[0]
            fastaFile = outputDirectory + '/' + i[1]
            # Gather the accessions of the proteins that were supposed to have been predicted.
            outputProteins = utilities.file2list.main(fastaFile)[::2]
            outputProteins = [j[1:] for j in outputProteins]
            # Gather the input data.
            inputProteins = utilities.file2list.main(predictionFile)
            updateTuples = wolf_prediction_entry(outputProteins, inputProteins)
            updatesToPerform.extend(updateTuples)

        if updatesToPerform != []:
            # Record all the predictions in one query, using a CASE statement keyed on the accession
            # (see the example after this function).
            updateQuery = 'UPDATE ' + tableToUse + '\n\tSET ' + columnWithPredictions + ' = CASE UPAccession\n'
            for i in updatesToPerform:
                updateQuery += '\t\tWHEN "' + i[0] + '" THEN "' + i[1] + '"\n'
            updateQuery += '\tEND\nWHERE UPAccession IN ("' + '","'.join([i[0] for i in updatesToPerform]) + '")'
            cursor.execute(updateQuery)
    mysql.closeConnection(conn, cursor)
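
# For reference, the single bulk UPDATE built above takes the following shape (hypothetical accessions
# 'P12345' and 'Q67890', with a prediction column named 'Prediction'); using one CASE statement keyed on
# UPAccession avoids issuing one UPDATE per protein:
#
#     UPDATE ProteinTable
#         SET Prediction = CASE UPAccession
#             WHEN "P12345" THEN "extracellular"
#             WHEN "Q67890" THEN "nuclear"
#         END
#     WHERE UPAccession IN ("P12345","Q67890")
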
def main(goSchema, parsedGO):
    """Returns a file containing the parsed GO data.

    parsedGO - A file of 6-tuples, one on each line.
        The first element is the numerical identifier of the GO term.
        The second element is the name of the GO term.
        The third element is the category (biological_process, cellular_component or molecular_function) that
            the term belongs to.
        The fourth element is all the paths from the term to the category it belongs to. The paths are
            separated from one another using semi-colons, and the elements of each path are separated from one
            another using '#'.
        The fifth element is all the level one terms along the paths. These are all the terms that are in a
            path in element four and are direct descendants of the category in element three.
        The sixth element is all the level two terms along the paths. These are all the terms that are in a
            path in element four and are direct descendants of the terms in element five.
    """
    # Connect to the GO schema.
    connGO, cursorGO = mysql.openConnection('root', goSchema)

    # Extract the information from the GO schema that is necessary to build up the paths.
    cursorGO = mysql.tableSELECT(cursorGO, 'id, name, term_type, acc', 'term')
    result = cursorGO.fetchall()

    # Turn the information extracted from the GO schema into a dictionary indexed by the identifiers of the GO terms.
    resultsDict = dict([(x[0], x[1:]) for x in result])

    # recordsToInsert will contain the information to insert into the GO path schema.
    recordsToInsert = []

    # For every term work out the name of the term, which of the three types of GO term it is, all the paths from the
    # term to the root terms (which are the three types), and all unique level 1 and level 2 terms along the paths.
    # A level 1 term is a child term of the root. A level 2 term is a child of a level 1 term.
    for r in resultsDict.keys():
        if re.match('GO:[0-9]+$', resultsDict[r][2]):
            # Only process accessions that are proper GO terms, rather than relationship describers
            # (i.e. not something like part_of, goslim_plant or regulates).
            pathDict = {}
            entry = resultsDict[r]
            startID = r
            startName = entry[0]
            startType = entry[1]
            pathDict[startID] = ['#' + startName]
            toCheck = [startID]
            while toCheck != []:
                # In order to determine all of the paths from a term to the root nodes, the set of GO terms is
                # treated as a directed graph, with the edges directed from parent to child. The traversal only
                # moves against the direction of the edges (i.e. up the tree). Starting with a single term B, the
                # set of its parent terms, P, is found, and B is appended to each element of P. The paths for B
                # then look like #P1#B, #P2#B, ..., #Pn#B. This is repeated until all paths are capped at the
                # front with a root term (the root term will be the same for every path of B). Not all paths for
                # B will be of the same length. A toy walkthrough of this enumeration is sketched after this
                # function.
                current = toCheck.pop()
                currentPath = pathDict[current]

                # Get the terms that current is a subterm of.
                # Every entry in the term2term table that is returned represents one superterm of the current term.
                cursor = mysql.tableSELECT(cursorGO, 'term1_id', 'term2term', 'term2_id = \'' + str(current) + '\'')
                crossrefResults = cursor.fetchall()
                if crossrefResults == ():
                    # If the current term is not a subterm of any other term, then the current term is a root term.
                    # This means that the current path is finished.
                    rootNode = current
                    continue
                for i in crossrefResults:
                    # Get the id and name of the superterm being examined.
                    uniqueID = i[0]
                    name = resultsDict[uniqueID][0]
                    path = []
                    for j in currentPath:
                        # For every entry in the current path stick the superterm at the front of it, as it is an
                        # ancestor of whatever the head of the current path is.
                        path.append('#' + name + j)
                    if uniqueID not in pathDict:
                        # If the current superterm has not already had its descendants enumerated, then simply
                        # record the paths just enumerated in the pathDict.
                        pathDict[uniqueID] = path
                    else:
                        # If the current superterm has already had its descendants enumerated, then add all the
                        # paths just enumerated to its record. Following this remove all duplicate paths.
                        pathDict[uniqueID].extend(path)
                        pathDict[uniqueID] = list(set(pathDict[uniqueID]))
                    # Add the superterm to the list of terms that need their superterms evaluating.
                    toCheck.append(uniqueID)

            # The final paths of interest are those recorded in the root node entry in the dictionary.
            final = pathDict[rootNode]
            levelOne = []
            levelTwo = []
            path = []
            for f in final:
                # For every path from the initial term to the root term, find the level 1 and level 2 terms.
                chunks = f.split('#')
                # Rather than taking the level 1 and level 2 terms to be the 2nd and 3rd terms in each path, they
                # are taken to be the 3rd and 4th. This is because the GO database defines a term 'all' which is a
                # dummy root node.
                if len(chunks) > 3:
                    levelOne.append(chunks[3])
                if len(chunks) > 4:
                    levelTwo.append(chunks[4])
                # Create the path that will be recorded in the database. Ignore the 'all' dummy root.
                path.append('#'.join(chunks[2:]))

            # Ensure only unique level 1 and level 2 terms are recorded.
            levelOne = list(set(levelOne))
            level1 = 'NA' if levelOne == [] else ';'.join(levelOne)
            levelTwo = list(set(levelTwo))
            level2 = 'NA' if levelTwo == [] else ';'.join(levelTwo)
            path = ';'.join(path)

            # Create the tuple to enter into the database.
            recordsToInsert.append((resultsDict[r][2][3:], startName, startType, path, level1, level2))

    mysql.closeConnection(connGO, cursorGO)
    utilities.list2file.main(recordsToInsert, parsedGO)
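
# A small self-contained sketch of the upward path enumeration performed in main() above, using a toy
# parent map in place of the term2term table (all names here are hypothetical). Paths are built child-first
# and prefixed with each parent, mirroring the '#parent#...#child' concatenation used for the GO terms.
def _demo_path_enumeration():
    parents = {'leaf' : ['mid1', 'mid2'], 'mid1' : ['root'], 'mid2' : ['root'], 'root' : []}
    pathDict = {'leaf' : ['#leaf']}
    toCheck = ['leaf']
    while toCheck != []:
        current = toCheck.pop()
        for parent in parents[current]:
            # Prefix every known path of the current term with its parent.
            extended = ['#' + parent + p for p in pathDict[current]]
            pathDict[parent] = list(set(pathDict.get(parent, []) + extended))
            toCheck.append(parent)
    # Returns both complete paths: '#root#mid1#leaf' and '#root#mid2#leaf' (order may vary).
    return pathDict['root']
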
def main(UPPPIData, schemaProteins, tablePPI, databasePassword):
    #===========================================================================
    # Extract and format the parsed PPI data.
    #===========================================================================
    ppiData = utilities.file2list.main(UPPPIData)
    ppiData = [eval(i) for i in ppiData]
    ppiDict = dict([(tuple([i[0], i[1]]), i) for i in ppiData])

    #===========================================================================
    # Extract the PPI information recorded in the database.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, '*', tablePPI)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    #===========================================================================
    # Compare the parsed data with the data recorded in the table.
    #===========================================================================
    columnIndices = range(1, len(ppiData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('SHOW COLUMNS FROM ' + tablePPI)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]
    toRemove = []
    toUpdate = {}
    toAdd = ppiDict.keys()
    for i in results:
        proteinOne = i[0]
        proteinTwo = i[1]
        dictKey = tuple([proteinOne, proteinTwo])
        if ppiDict.has_key(dictKey):
            # If the key is in both the parsed file and the table, then it does not need to be added.
            toAdd.remove(dictKey)
            # Compare the row from the table with the parsed file, to determine if the table needs updating.
            for j in columnIndices:
                if i[j] != ppiDict[dictKey][j]:
                    if not toUpdate.has_key(dictKey):
                        toUpdate[dictKey] = []
                    toUpdate[dictKey].append(j)
        else:
            # If the key is in the table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(dictKey)
    values = '(' + ('%s,' * len(ppiData[0]))
    values = values[:-1] + ')'

    #===========================================================================
    # Remove rows from the table that are not in the parsed file.
    #===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tablePPI, 'PPIProteinOne="' + i[0] + '" AND PPIProteinTwo="' + i[1] + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries removed from the PPI table: ', len(toRemove)

    #===========================================================================
    # Update rows that have different values in the parsed file and the table.
    #===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + str(ppiDict[i][j]) + '"'
            toSet.append(updateString)
        toSet = ', '.join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tablePPI, toSet, 'PPIProteinOne="' + i[0] + '" AND PPIProteinTwo="' + i[1] + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries updated in the PPI table: ', len(toUpdate)

    #===========================================================================
    # Add rows which are not in the table, but are in the parsed file.
    #===========================================================================
    rowsToAdd = [ppiDict[i] for i in toAdd]
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableINSERT(cursor, tablePPI, values, rowsToAdd)
    mysql.closeConnection(conn, cursor)
    print '\tEntries added to the PPI table: ', len(toAdd)
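
# The bookkeeping above is a generic three-way synchronisation: rows keyed identically in the parsed file
# and the table are diffed column by column, and everything else is queued for insertion or deletion. A
# minimal sketch of that pattern, assuming both inputs are dictionaries mapping a key to a row tuple:
def _demo_sync(parsedRows, tableRows):
    toAdd = [k for k in parsedRows if k not in tableRows]
    toRemove = [k for k in tableRows if k not in parsedRows]
    # Map each shared key to the indices of the columns whose values differ.
    toUpdate = {}
    for k in parsedRows:
        if k in tableRows and parsedRows[k] != tableRows[k]:
            toUpdate[k] = [j for j in range(len(parsedRows[k])) if parsedRows[k][j] != tableRows[k][j]]
    return toAdd, toRemove, toUpdate
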
def main(parsedGOOutput, schemaProteins, tableGOInfo, databasePassword):
    # ===========================================================================
    # Extract and format the parsed GO data.
    # ===========================================================================
    GOData = utilities.file2list.main(parsedGOOutput)
    GOData = [eval(i) for i in GOData]
    GODict = dict([(i[0], i) for i in GOData])

    # ===========================================================================
    # Extract the GO information recorded in the database.
    # ===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, "*", tableGOInfo)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    # ===========================================================================
    # Compare the parsed data with the data recorded in the table.
    # ===========================================================================
    columnIndices = range(1, len(GOData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute("SHOW COLUMNS FROM " + tableGOInfo)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]
    toRemove = []
    toUpdate = {}
    toAdd = GODict.keys()
    for i in results:
        geneID = i[0]
        if GODict.has_key(geneID):
            # If the key is in both the parsed file and the table, then it does not need to be added.
            toAdd.remove(i[0])
            # Compare the row from the table with the parsed file, to determine if the table needs updating.
            for j in columnIndices:
                if i[j] != GODict[geneID][j]:
                    if not toUpdate.has_key(geneID):
                        toUpdate[geneID] = []
                    toUpdate[geneID].append(j)
        else:
            # If the key is in the table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(i[0])
    values = "(" + ("%s," * len(GOData[0]))
    values = values[:-1] + ")"

    # ===========================================================================
    # Remove rows from the table that are not in the parsed file.
    # ===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tableGOInfo, 'GOTermID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print "\tEntries removed from the GO table: ", len(toRemove)

    # ===========================================================================
    # Update rows that have different values in the parsed file and the table.
    # ===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + GODict[i][j] + '"'
            toSet.append(updateString)
        toSet = ", ".join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tableGOInfo, toSet, 'GOTermID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print "\tEntries updated in the GO table: ", len(toUpdate)

    # ===========================================================================
    # Add rows which are not in the table, but are in the parsed file.
    # ===========================================================================
    # Split the records to be inserted into smaller chunks so the database connection is not lost. This is
    # necessary because the size of some of the paths is very large. In order to get this to work you need to
    # increase the size of max_allowed_packet for the MySQL server. I did this by altering the default my.ini
    # file to contain the line:
    #     max_allowed_packet=32M
    # under the [mysqld] section (a runtime alternative is noted after this function).
    rowsToAdd = [GODict[i] for i in toAdd]
    length = len(rowsToAdd)
    itemsInSplit = 5
    numberOfSplits = length / itemsInSplit
    if length % itemsInSplit != 0:
        numberOfSplits += 1
    recordsToInsert = [rowsToAdd[i * itemsInSplit : (i + 1) * itemsInSplit] for i in range(numberOfSplits)]
    for i in recordsToInsert:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableINSERT(cursor, tableGOInfo, values, i)
        mysql.closeConnection(conn, cursor)
    print "\tEntries added to the GO table: ", len(toAdd)
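
# The packet limit can also be raised on a running server (until the next restart) with a single statement,
# assuming the account has the necessary administrative privilege; new connections then pick up the larger value:
#
#     SET GLOBAL max_allowed_packet = 33554432;  -- 32M
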
def main(DBDrugIDs, DBTargetIDs, TTDUPAccessions, ChEMBLUPAccessions, UPHumanAccessionMap, UPDrugIDs,
         folderCulling, schemaProteins, tableProteinInfo, tableNonRedundant, tableBLASTResults, databasePassword,
         viewsDict):
    allTargetsDB = xref_drugbank_uniprot(DBTargetIDs, UPHumanAccessionMap)
    allTargetsTTD = xref_TTD_uniprot(TTDUPAccessions, UPHumanAccessionMap)
    # ASSUMPTION: the ChEMBL and UniProt target lists are presumed to be derived from ChEMBLUPAccessions and
    # UPDrugIDs in the same manner as the two cross-references above; the helper names below are placeholders
    # for those missing derivations.
    allTargetsChEMBL = xref_chembl_uniprot(ChEMBLUPAccessions, UPHumanAccessionMap)
    allTargetsUP = xref_uniprot_drugs(UPDrugIDs)
    allTargets = list(set(allTargetsChEMBL) | set(allTargetsDB) | set(allTargetsTTD) | set(allTargetsUP))
    print '\tTotal number of unique targets found: ', len(allTargets)

    # Extract the modes of action, and clear the target information.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor = mysql.tableSELECT(cursor, 'UPAccession, ModeOfAction', tableProteinInfo)
    resultsModeOfAction = cursor.fetchall()
    for i in resultsModeOfAction:
        upid = i[0]
        mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="N"', 'UPAccession="' + upid + '"')
    mysql.closeConnection(conn, cursor)

    # Generate the sets of proteins that are GPCRs, kinases, ion channels and proteases.
    gpcr = []
    kinases = []
    ionChannels = []
    proteases = []
    for i in resultsModeOfAction:
        if i[1] == 'G-protein coupled receptor':
            gpcr.append(i[0])
        elif i[1] == 'Kinase':
            kinases.append(i[0])
        elif i[1] == 'Ion Channel':
            ionChannels.append(i[0])
        elif i[1] == 'Protease':
            proteases.append(i[0])

    # Update the table to indicate which proteins are targets.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    for i in allTargets:
        mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="Y"', 'UPAccession="' + i + '"')
    mysql.closeConnection(conn, cursor)

    # Perform redundancy removal using Leaf.
    print '\tPerforming redundancy removal.'
    # Proteins have had their target status changed, so the redundancy needs to be recalculated.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    # Determine the number of columns in the nonredundant table.
    cursor.execute('SHOW COLUMNS FROM ' + tableNonRedundant)
    numberColumns = len(cursor.fetchall())
    # Select all the proteins recorded in the database. The number of columns has one subtracted from it as the
    # UP accession column does not take the default value.
    cursor = mysql.tableSELECT(cursor, 'UPAccession', tableProteinInfo)
    allProteins = [tuple([i[0]] + (['N'] * (numberColumns - 1))) for i in cursor.fetchall()]
    # Wipe and refill the nonredundant table.
    cursor.execute('TRUNCATE TABLE ' + tableNonRedundant)
    values = '(' + ('%s,' * numberColumns)
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableNonRedundant, values, allProteins)

    for column in sorted(viewsDict.keys()):
        print '\t\tRunning redundancy removal on ', column
        # For each set of proteins, run the Leaf program.
        inputLocation = folderCulling + '/' + column + '.txt'
        cursor = mysql.tableSELECT(cursor, '*', viewsDict[column])
        results = cursor.fetchall()
        # Determine the accessions of the proteins in the current set.
        proteinSet = [i[0] for i in results]
        print '\t\t\tSize of Redundant Dataset: ', len(proteinSet)
        proteinSetString = '\',\''.join(proteinSet)
        # Select all the BLAST results where both the hit and query protein are in the set to cull.
        cursor = mysql.tableSELECT(cursor, '*', tableBLASTResults,
                                   'ProteinA IN (\'' + proteinSetString + '\') AND ProteinB IN (\'' + proteinSetString + '\')')
        protResults = cursor.fetchall()
        # Generate the file that is going to be used to perform the culling.
        writeTo = open(inputLocation, 'w')
        for i in protResults:
            writeTo.write('\t'.join([str(j) for j in i]) + '\n')
        writeTo.close()
        # Perform the culling (an illustrative sketch of the graph construction follows this function).
        adjMatrix, proteinNames = culling.adjlistcreation.main(inputLocation, cutoffPercent=20, maxEValue=1,
                                                               minAlignLength=20)
        print '\t\t\tNumber of Proteins in Similarity Graph: ', len(proteinNames)
        proteinsToCull = culling.Leafcull.main(adjMatrix, proteinNames)
        print '\t\t\tNumber of Proteins to Cull: ', len(proteinsToCull)
        for i in proteinsToCull:
            mysql.tableUPDATE(cursor, tableNonRedundant, column + '="N"', 'UPAccession="' + str(i) + '"')
        proteinsToKeep = [i for i in proteinSet if i not in proteinsToCull]
        print '\t\t\tNumber of Proteins to Keep: ', len(proteinsToKeep)
        for i in proteinsToKeep:
            mysql.tableUPDATE(cursor, tableNonRedundant, column + '="Y"', 'UPAccession="' + str(i) + '"')
    mysql.closeConnection(conn, cursor)
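
# For illustration only, this is roughly the kind of similarity graph that culling.adjlistcreation is
# expected to build from the BLAST file written above (the column layout assumed here - query, hit,
# percent identity, alignment length, E-value - is a guess, not the module's actual format): two proteins
# are adjacent when their alignment passes all three thresholds.
def _demo_similarity_graph(blastRows, cutoffPercent=20, maxEValue=1, minAlignLength=20):
    adjacency = {}
    for query, hit, identity, alignLength, eValue in blastRows:
        if query != hit and identity > cutoffPercent and alignLength >= minAlignLength and eValue <= maxEValue:
            adjacency.setdefault(query, set()).add(hit)
            adjacency.setdefault(hit, set()).add(query)
    return adjacency
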
def main(UPExternalLinks, tableUniProt2Ensembl, tableUniProt2GO, tableUniProt2UniGene, tableUniProt2HGNC,
         tableDict, schemaProteins, databasePassword):
    # Determine all the external database links for each representative UniProt accession.
    proteinLinks = {}
    readIn = open(UPExternalLinks, 'r')
    for line in readIn:
        line = line.strip()
        chunks = line.split(',')
        entrezLinks = [i for i in chunks[1].split(';')] if chunks[1] != '' else []
        unigeneLinks = [int(i.replace('Hs.', '')) for i in chunks[2].split(';')] if chunks[2] != '' else []
        goLinks = [int(i.replace('GO:', '')) for i in chunks[3].split(';')] if chunks[3] != '' else []
        hgncLinks = [i for i in chunks[4].split(';')] if chunks[4] != '' else []
        ensemblLinks = [i.split('-') for i in chunks[5].split(';')] if chunks[5] != '' else []
        proteinLinks[chunks[0]] = {'Entrez' : entrezLinks, 'UniGene' : unigeneLinks, 'GO' : goLinks,
                                   'HGNC' : hgncLinks, 'Ensembl' : ensemblLinks}
    readIn.close()

    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    validXrefIDs = {'EnsemblGeneID' : set([]), 'GeneID' : set([]), 'GOTermID' : set([]), 'UPAccession' : set([]),
                    'UniGeneID' : set([]), 'EnsemblTranscriptID' : set([])}
    for i in tableDict.keys():
        for j in tableDict[i]:
            # For each of the external databases with its own table in the database (e.g. Ensembl genes, UniGene,
            # GO terms), determine which of the cross-references recorded are actually referencing a valid ID in
            # the respective database. For example, if the file of cross-references says that UniProt accession U
            # is linked to GO term 123, then make sure that 123 is in fact a valid GO term ID (i.e. that it is in
            # the table that contains all the GO term IDs).
            cursor = mysql.tableSELECT(cursor, i, j)
            results = [k[0] for k in cursor.fetchall()]
            results = set(results)
            validXrefIDs[i] = validXrefIDs[i].union(results)

    # Determine all the UniProt accession cross-references that are referencing a valid external database identifier.
    entrezInsert = []
    unigeneInsert = []
    goInsert = []
    hgncInsert = []
    ensemblInsert = []
    for i in proteinLinks.keys():
        if not i in validXrefIDs['UPAccession']:
            continue
        for j in proteinLinks[i]['Entrez']:
            if j in validXrefIDs['GeneID']:
                entrezInsert.append(tuple([i, str(j)]))
        for j in proteinLinks[i]['UniGene']:
            if j in validXrefIDs['UniGeneID']:
                unigeneInsert.append(tuple([i, j]))
        for j in proteinLinks[i]['GO']:
            if j in validXrefIDs['GOTermID']:
                goInsert.append(tuple([i, j]))
        for j in proteinLinks[i]['HGNC']:
            hgncInsert.append(tuple([i, j]))
        for j in proteinLinks[i]['Ensembl']:
            if j[0] in validXrefIDs['EnsemblGeneID']:  # and j[1] in validXrefIDs['EnsemblTranscriptID']:
                ensemblInsert.append(tuple([i] + j))

    print '\tNow Recording UniGene Crossreferences.'
    cursor.execute('TRUNCATE TABLE ' + tableUniProt2UniGene)
    values = '(' + ('%s,' * len(unigeneInsert[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableUniProt2UniGene, values, unigeneInsert)

    print '\tNow Recording GO Crossreferences.'
    cursor.execute('TRUNCATE TABLE ' + tableUniProt2GO)
    values = '(' + ('%s,' * len(goInsert[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableUniProt2GO, values, goInsert)

    print '\tNow Recording HGNC Crossreferences.'
    cursor.execute('TRUNCATE TABLE ' + tableUniProt2HGNC)
    values = '(' + ('%s,' * len(hgncInsert[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableUniProt2HGNC, values, hgncInsert)

    print '\tNow Recording Ensembl Crossreferences.'
    cursor.execute('TRUNCATE TABLE ' + tableUniProt2Ensembl)
    values = '(' + ('%s,' * len(ensemblInsert[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableUniProt2Ensembl, values, ensemblInsert)

    mysql.closeConnection(conn, cursor)
def main(unigeneParsedOutput, unigeneParsedTotals, schemaProteins, tableUniGene, tableUniGeneTotals,
         databasePassword):
    #===========================================================================
    # Extract and format the parsed UniGene data, and the UniGene expression totals.
    #===========================================================================
    UGData = utilities.file2list.main(unigeneParsedOutput)
    UGData = [tuple([int(j) for j in eval(i)]) for i in UGData]
    UGDict = dict([(i[0], i) for i in UGData])
    UGTotalsData = utilities.file2list.main(unigeneParsedTotals)
    UGTotalsData = [tuple([j[0], eval(j[1])]) for j in [eval(i) for i in UGTotalsData]]

    #===========================================================================
    # Extract the UniGene information recorded in the database.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, '*', tableUniGene)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    #===========================================================================
    # Compare the parsed data with the data recorded in the expression table.
    #===========================================================================
    columnIndices = range(1, len(UGData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('SHOW COLUMNS FROM ' + tableUniGene)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]
    toRemove = []
    toUpdate = {}
    toAdd = UGDict.keys()
    for i in results:
        UniGeneID = i[0]
        if UGDict.has_key(UniGeneID):
            # If the key is in both the parsed file and the expression table, then it does not need to be added.
            toAdd.remove(i[0])
            # Compare the row from the expression table with the parsed file, to determine if the expression
            # table needs updating.
            for j in columnIndices:
                if i[j] != UGDict[UniGeneID][j]:
                    if not toUpdate.has_key(UniGeneID):
                        toUpdate[UniGeneID] = []
                    toUpdate[UniGeneID].append(j)
        else:
            # If the key is in the expression table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(i[0])
    values = '(' + ('%s,' * len(UGData[0]))
    values = values[:-1] + ')'

    #===========================================================================
    # Remove rows from the expression table that are not in the parsed file.
    #===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tableUniGene, 'UniGeneID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries removed from the UniGene table: ', len(toRemove)

    #===========================================================================
    # Update rows that have different values in the parsed file and the expression table.
    #===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + str(UGDict[i][j]) + '"'
            toSet.append(updateString)
        toSet = ', '.join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tableUniGene, toSet, 'UniGeneID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries updated in the UniGene table: ', len(toUpdate)

    #===========================================================================
    # Add rows which are not in the expression table, but are in the parsed file.
    #===========================================================================
    rowsToAdd = [UGDict[i] for i in toAdd]
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableINSERT(cursor, tableUniGene, values, rowsToAdd)
    mysql.closeConnection(conn, cursor)
    print '\tEntries added to the UniGene table: ', len(toAdd)

    #===========================================================================
    # Enter the expression totals in the totals table.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('TRUNCATE TABLE ' + tableUniGeneTotals)
    values = '(' + ('%s,' * len(UGTotalsData[0]))
    values = values[:-1] + ')'
    cursor = mysql.tableINSERT(cursor, tableUniGeneTotals, values, UGTotalsData)
    mysql.closeConnection(conn, cursor)
    print '\tUniGene totals table updated.'
def main(databasePassword, schemaProteins, tableProteinInfo, tableStability):
    # Define the N-terminus half-life values (explained at http://en.wikipedia.org/wiki/N-end_rule and in the
    # ProtParam tool).
    halfLife = {'A' : 4.4, 'C' : 1.2, 'D' : 1.1, 'E' : 1.0, 'F' : 1.1, 'G' : 30.0, 'H' : 3.5, 'I' : 20.0,
                'K' : 1.3, 'L' : 5.5, 'M' : 30.0, 'N' : 1.4, 'P' : 20.0, 'Q' : 0.8, 'R' : 1.0, 'S' : 1.9,
                'T' : 7.2, 'V' : 100.0, 'W' : 2.8, 'Y' : 2.8}

    # Extract all the sequences stored in the database.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableProteinInfo)
    results = cursor.fetchall()

    # Calculate the half-life and instability index for each protein.
    stabilityTuples = []
    for i in results:
        sequence = i[1]
        if halfLife.has_key(sequence[0]):
            protHalfLife = halfLife[sequence[0]]
        else:
            # This occurs when the N-terminal residue is not an amino acid with an associated half-life value
            # (e.g. X, B, etc.).
            protHalfLife = -1
        analysedSeq = ProteinAnalysis(sequence)
        try:
            instabilityIndex = analysedSeq.instability_index()
        except:
            instabilityIndex = -1
            print '\tContains invalid aa code: ', i[0]
        stabilityTuples.append(tuple([i[0], protHalfLife, instabilityIndex]))

    cursor.execute('TRUNCATE TABLE ' + tableStability)
    values = '(' + ('%s,' * len(stabilityTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableStability, values, stabilityTuples)
    mysql.closeConnection(conn, cursor)

#def instability_index(prot, sequence):
#
#    # A two-dimensional dictionary for calculating the instability index.
#    # Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
#    # It is based on dipeptide values, therefore the value for the dipeptide DG is DIWV['D']['G'].
#    DIWV = {'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60,
#                  'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0,
#                  'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0},
#            'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26,
#                  'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0,
#                  'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
#            'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54,
#                  'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601},
#            'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34,
#                  'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0,
#                  'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0,
#                  'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49,
#                  'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49},
#            'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0,
#                  'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68,
#                  'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94},
#            'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0,
#                  'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0,
#                  'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60,
#                  'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28,
#                  'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0,
#                  'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54,
#                  'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68},
#            'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26,
#                  'T': 1.0, 'W': 24.68, 'V': 1.0, 'Y': 1.0},
#            'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0,
#                  'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0,
#                  'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0},
#            'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26,
#                  'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54},
#            'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54,
#                  'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54,
#                  'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0},
#            'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34,
#                  'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28,
#                  'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54},
#            'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0,
#                  'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03,
#                  'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
#            'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68,
#                  'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34,
#                  'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03,
#                  'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54},
#            'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68,
#                  'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34,
#                  'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91,
#                  'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34},
#            }
#
#    score = 0.0
#    for i in range(len(sequence) - 1):
#        if DIWV.has_key(sequence[i]):
#            if DIWV[sequence[i]].has_key(sequence[i+1]):
#                score += DIWV[sequence[i]][sequence[i+1]]
#    return (10.0 / len(sequence)) * score
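
# For comparison, the live code path in main() above uses Biopython's ProtParam implementation of the same
# Guruprasad et al. index. A minimal usage sketch (the sequence is an arbitrary example; an index above 40
# is conventionally read as predicting an unstable protein):
#
#     from Bio.SeqUtils.ProtParam import ProteinAnalysis
#     analysed = ProteinAnalysis('MKWVTFISLLFLFSSAYS')
#     print analysed.instability_index()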