class LoadIfileManager:
    def __init__(self, connectionStr):
        self.connMgr = ConnectionManager()
        self.conn = self.connMgr.connectToDatabase(connectionStr)
        self.cur = self.conn.cursor()
        self.fdm = ForeignDataManager()
        #print("Load IFile Manager Initialized.")

    def dropForeignTable(self, fdwTableName):
        self.cur.execute("drop foreign table if exists "+fdwTableName+";")
        #print("drop foreign table if exists "+fdwTableName+";")

    def createForeignTable(self, iFile, fTableName):
        header, fdwScript = self.fdm.generateFDWScript(iFile, fTableName)
        self.cur.execute(fdwScript)
        return header

    def createFileWithDerivedIds(self, outputFilePath, derivedIdSql):
        copyStmt = "copy ("+derivedIdSql+") to '"+outputFilePath+"' with delimiter E'\\t'"+" csv header;"
        #print("copyStmt = "+copyStmt)
        self.cur.execute(copyStmt)

    def createFileWithoutDuplicates(self, outputFilePath, noDupsSql):
        copyStmt = "copy ("+noDupsSql+") to '"+outputFilePath+"' with delimiter E'\\t'"+" csv header;"
        #print("copyStmt = "+copyStmt)
        self.cur.execute(copyStmt)

    def loadData(self, tableName, header, fileToLoad, primaryKeyColumnName):
        loadSql = "copy "+tableName+" ("+(",".join(header))+")"+" from '"+fileToLoad+"' with delimiter E'\\t' csv header;"
        #print("loadSql = "+loadSql)
        self.cur.execute(loadSql)
        # Update the serial sequence after the load so it reflects the newly loaded rows.
        self.updateSerialSequence(tableName, primaryKeyColumnName)

    def updateSerialSequence(self, tableName, primaryKeyColumnName):
        updateSeqSql = "SELECT pg_catalog.setval(pg_get_serial_sequence('"+tableName+"', '"+primaryKeyColumnName+"'), MAX("+primaryKeyColumnName+")) FROM "+tableName+";"
        #print("updateSeqSql = "+updateSeqSql)
        self.cur.execute(updateSeqSql)

    def commitTransaction(self):
        self.conn.commit()

    def rollbackTransaction(self):
        self.conn.rollback()

    def closeConnection(self):
        self.connMgr.disconnectFromDatabase()
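# Hedged usage sketch (not part of the original module): illustrates the intended
# call order for LoadIfileManager. The foreign-table name, file paths, target table,
# and SQL passed in are illustrative assumptions only.
def _exampleLoad(connectionStr, iFile, derivedFile, derivedIdSql, tableName, pkColumn):
    mgr = LoadIfileManager(connectionStr)
    try:
        header = mgr.createForeignTable(iFile, "fdw_example")      # hypothetical FDW table name
        mgr.createFileWithDerivedIds(derivedFile, derivedIdSql)    # write resolved ids to a tab-delimited file
        mgr.loadData(tableName, header, derivedFile, pkColumn)     # COPY the file into the target table
        mgr.dropForeignTable("fdw_example")
        mgr.commitTransaction()
    except Exception:
        mgr.rollbackTransaction()
        raise
    finally:
        mgr.closeConnection()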
class PreprocessIfileManager:
    WORK_MEM = 10240

    def __init__(self, connectionStr):
        self.connMgr = ConnectionManager()
        self.conn = self.connMgr.connectToDatabase(connectionStr)
        self.cur = self.conn.cursor()
        self.fdm = ForeignDataManager()
        self.cur.execute("set work_mem to %s", (self.WORK_MEM, ))
        #print("Preprocess IFile Manager Initialized.")

    def getCvIdOfTerm(self, term):
        self.cur.execute("select cv_id from cv where lower(term)=%s", (term.lower(), ))
        cv_id = self.cur.fetchone()
        if cv_id is not None:
            return cv_id[0]
        else:
            return cv_id

    def getCvIdOfGroupAndTerm(self, group, term):
        self.cur.execute(
            "select cv_id from cv where lower(\"group\")=%s and lower(term)=%s",
            (group.lower(), term.lower()))
        cv_id = self.cur.fetchone()
        if cv_id is not None:
            return cv_id[0]
        else:
            return cv_id

    #select data_type from information_schema.columns where table_name='marker' and column_name='platform_id';
    def getTypeOfColumn(self, table, column):
        self.cur.execute(
            "select data_type from information_schema.columns where table_name=%s and column_name=%s",
            (table, column))
        res = self.cur.fetchone()
        if res is not None:
            return res[0]
        else:
            return res

    def getColumnListOfTable(self, table):
        self.cur.execute(
            "select column_name from information_schema.columns where table_name = %s",
            (table, ))
        res = self.cur.fetchall()
        return res

    def dropForeignTable(self, fdwTableName):
        self.cur.execute("drop foreign table if exists " + fdwTableName)

    def createForeignTable(self, iFile, fTableName):
        header, fdwScript = self.fdm.generateFDWScript(iFile, fTableName)
        #print("fdwScript: %s" % fdwScript)
        self.cur.execute(fdwScript)
        return header

    def createFileWithDerivedIdsV1(self, outputFilePath, derivedIdSql):
        copyStmt = "copy (" + derivedIdSql + ") to '" + outputFilePath + "' with delimiter E'\\t'" + " csv header;"
        #print("copyStmt = "+copyStmt)
        self.cur.execute(copyStmt)

    def createFileWithDerivedIds(self, outputFilePath, derivedIdSql):
        copyStmt = "copy (" + derivedIdSql + ") to STDOUT with delimiter E'\\t'" + " csv header;"
        with open(outputFilePath, 'w') as outputFile:
            #let's try 20MB buffer size for a start, default was 8MB
            self.cur.copy_expert(copyStmt, outputFile, 20480)
            outputFile.close()

    def commitTransaction(self):
        self.conn.commit()

    def rollbackTransaction(self):
        self.conn.rollback()

    def closeConnection(self):
        self.connMgr.disconnectFromDatabase()
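# Hedged usage sketch (not part of the original module): shows how the lookup helpers
# and the streaming createFileWithDerivedIds are meant to be combined. The CV group/term,
# table name, FDW table name, and SQL below are illustrative assumptions only.
def _examplePreprocess(connectionStr, iFile, outputFilePath, derivedIdSql):
    pre = PreprocessIfileManager(connectionStr)
    try:
        statusCvId = pre.getCvIdOfGroupAndTerm("status", "new")    # hypothetical CV group/term
        markerColumns = pre.getColumnListOfTable("marker")         # e.g. to validate the ifile header
        header = pre.createForeignTable(iFile, "fdw_pre_example")  # hypothetical FDW table name
        pre.createFileWithDerivedIds(outputFilePath, derivedIdSql)
        pre.dropForeignTable("fdw_pre_example")
        pre.commitTransaction()
        return statusCvId, markerColumns, header
    except Exception:
        pre.rollbackTransaction()
        raise
    finally:
        pre.closeConnection()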
class LoadIfileManager:
    WORK_MEM = 10240

    def __init__(self, connectionStr):
        self.connMgr = ConnectionManager()
        self.conn = self.connMgr.connectToDatabase(connectionStr)
        self.cur = self.conn.cursor()
        self.fdm = ForeignDataManager()
        self.cur.execute("set work_mem to %s", (self.WORK_MEM, ))
        #print("Load IFile Manager Initialized.")

    def dropForeignTable(self, fdwTableName):
        self.cur.execute("drop foreign table if exists " + fdwTableName + ";")
        #print("drop foreign table if exists "+fdwTableName+";")

    def createForeignTable(self, iFile, fTableName):
        header, fdwScript = self.fdm.generateFDWScript(iFile, fTableName)
        self.cur.execute(fdwScript)
        return header

    def createFileWithoutDuplicatesV1(self, outputFilePath, noDupsSql):
        copyStmt = "copy (" + noDupsSql + ") to '" + outputFilePath + "' with delimiter E'\\t'" + " csv header;"
        #print("copyStmt = "+copyStmt)
        self.cur.execute(copyStmt)

    def createFileWithoutDuplicates(self, outputFilePath, noDupsSql):
        copyStmt = "copy (" + noDupsSql + ") to STDOUT with delimiter E'\\t'" + " csv header;"
        with open(outputFilePath, 'w') as outputFile:
            #let's try 20MB buffer size for a start, default was 8MB
            self.cur.copy_expert(copyStmt, outputFile, 20480)
            outputFile.close()

    def loadDataV1(self, tableName, header, fileToLoad, primaryKeyColumnName):
        loadSql = "copy " + tableName + " (" + (",".join(header)) + ")" + " from '" + fileToLoad + "' with delimiter E'\\t' csv header;"
        #print("loadSql = "+loadSql)
        self.cur.execute(loadSql)
        self.updateSerialSequence(tableName, primaryKeyColumnName)

    def loadData(self, tableName, header, fileToLoad, primaryKeyColumnName):
        loadSql = "copy " + tableName + " (" + (",".join(header)) + ")" + " from STDIN with delimiter E'\\t' csv header;"
        rowsLoaded = 0
        with open(fileToLoad, 'r') as f:
            self.cur.copy_expert(loadSql, f, 20480)
            rowsLoaded = self.cur.rowcount
            #print("Rows loaded = %s" % self.cur.rowcount)
            f.close()
        self.updateSerialSequence(tableName, primaryKeyColumnName)
        return rowsLoaded

    def upsertKVPFromForeignTable(self, fTableName, sourceKey, sourceValue, targetTable, targetId, targetJsonb):
        kvpSql = "select * from upsertKVPFromForeignTable('" + fTableName.lower() + "', '" + sourceKey + "', '" + sourceValue + "', '" + targetTable + "', '" + targetId + "', '" + targetJsonb + "');"
        #print ("kvpSQL: %s" % kvpSql)
        self.cur.execute(kvpSql)
        rowsLoaded = self.cur.fetchone()
        if rowsLoaded is not None:
            return rowsLoaded[0]
        else:
            return rowsLoaded

    def updateSerialSequence(self, tableName, primaryKeyColumnName):
        updateSeqSql = "SELECT pg_catalog.setval(pg_get_serial_sequence('" + tableName + "', '" + primaryKeyColumnName + "'), MAX(" + primaryKeyColumnName + ")) FROM " + tableName + ";"
        #print("updateSeqSql = "+updateSeqSql)
        self.cur.execute(updateSeqSql)

    def commitTransaction(self):
        self.conn.commit()

    def rollbackTransaction(self):
        self.conn.rollback()

    def closeConnection(self):
        self.connMgr.disconnectFromDatabase()
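# Design note (hedged): the COPY and setval statements above are assembled by plain string
# concatenation, so table and column names must come from trusted code. If psycopg2 is the
# driver in use (an assumption; copy_expert suggests it), its sql module can compose
# identifiers safely. A minimal sketch of the loadData COPY statement built that way:
from psycopg2 import sql

def _composeLoadCopy(tableName, header):
    # Equivalent to: copy <table> (<col1>,<col2>,...) from STDIN with delimiter E'\t' csv header
    stmt = sql.SQL("copy {} ({}) from STDIN with delimiter E'\\t' csv header").format(
        sql.Identifier(tableName),
        sql.SQL(",").join(sql.Identifier(col) for col in header))
    return stmt  # pass to copy_expert directly, or render with stmt.as_string(connection)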
class ExtractMetadataManager:
    def __init__(self, connectionStr):
        self.connMgr = ConnectionManager()
        self.conn = self.connMgr.connectToDatabase(connectionStr)
        self.cur = self.conn.cursor()
        self.fdm = ForeignDataManager()

    def dropForeignTable(self, fdwTableName):
        self.cur.execute("drop foreign table if exists "+fdwTableName)

    def createForeignTable(self, iFile, fTableName):
        header, fdwScript = self.fdm.generateFDWScript(iFile, fTableName)
        self.cur.execute(fdwScript)
        return header

    def createFileWithoutDuplicates(self, outputFilePath, noDupsSql):
        copyStmt = "copy ("+noDupsSql+") to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(copyStmt, outputFile, 20480)
            outputFile.close()

    def createAllMarkerMetadataFile(self, outputFilePath, datasetId, mapId):
        sql = ""
        if mapId == -1:
            sql = "copy (select * from getAllMarkerMetadataByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        else:
            sql = "copy (select * from getAllMarkerMetadataByDatasetAndMap("+datasetId+","+mapId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        #print (sql)
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createMarkerNamesFile(self, outputFilePath, datasetId, mapId):
        sql = ""
        if mapId == -1:
            sql = "copy (select * from getMarkerNamesByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        else:
            sql = "copy (select * from getMarkerNamesByDatasetAndMap("+datasetId+","+mapId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createMinimalMarkerMetadataFile(self, outputFilePath, datasetId, mapId):
        sql = ""
        if mapId == -1:
            sql = "copy (select * from getMinimalMarkerMetadataByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        else:
            sql = "copy (select * from getMinimalMarkerMetadataByDatasetAndMap("+datasetId+","+mapId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createQCMarkerMetadataFile(self, outputFilePath, datasetId, mapId):
        sql = "copy (select * from getMarkerQCMetadataByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        #TODO: if we offer the feature to filter by mapId, we'll need a version of getMarkerQCMetadataByDataset
        # that filters by mapId as well, plus the conditional expressions as in createAllMarkerMetadataFile.
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createQCMarkerMetadataByMarkerList(self, outputFilePath, markerList):
        sql = "copy (select * from getMarkerQCMetadataByMarkerList('{"+(','.join(markerList))+"}')) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createChrLenFile(self, outputFilePath, datasetId, mapId, markerList, sampleList):
        sql = ""
        outputFilePath = outputFilePath+".chr"
        if markerList:
            sql = "copy (select * from getAllChrLenByMarkerList('{"+(','.join(markerList))+"}')) to STDOUT with delimiter E'\\t'"+" csv header;"
        elif sampleList:
            print("Not yet implemented")
            #With the decision to run an extraction by marker list in the background of an extraction-by-samples
            # request, this is now an unreachable condition. Code cleanup pending.
            return
        else:
            if mapId == -1:
                sql = "copy (select * from getAllChrLenByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
            else:
                sql = "copy (select * from getAllChrLenByDatasetAndMap("+datasetId+","+mapId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createAllSampleMetadataFile(self, outputFilePath, datasetId):
        sql = "copy (select * from getAllSampleMetadataByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createDnarunNamesFile(self, outputFilePath, datasetId):
        sql = "copy (select * from getDnarunNamesByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createMinimalSampleMetadataFile(self, outputFilePath, datasetId):
        sql = "copy (select * from getMinimalSampleMetadataByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createAllProjectMetadataFile(self, outputFilePath, datasetId):
        sql = "copy (select * from getAllProjectMetadataByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def getMarkerMapsetInfoByDataset(self, outputFilePath, datasetId, mapId):
        #rename this function
        '''
        For the given datasetId and mapId, this function outputs all markers in the dataset
        and only the given mapset's info.
        '''
        outputFilePath = outputFilePath+".mapset"
        sql = "copy (select * from getMarkerMapsetInfoByDataset("+datasetId+","+mapId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createSampleQCMetadataFile(self, outputFilePath, datasetId):
        sql = "copy (select * from getSampleQCMetadataByDataset("+datasetId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createSampleQCMetadataByMarkerList(self, outputFilePath, markerList, datasetType):
        #print(self.cur.mogrify("copy (select * from getSampleQCMetadataByMarkerList('{"+(','.join(markerList))+"}',"+datasetType+")) to STDOUT with delimiter E'\\t'"+" csv header;"))
        sql = "copy (select * from getSampleQCMetadataByMarkerList('{"+(','.join(markerList))+"}',"+datasetType+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createSampleQCMetadataBySampleList(self, outputFilePath, sampleList, datasetType):
        sql = "copy (select * from getSampleQCMetadataBySampleList('{"+(','.join(sampleList))+"}',"+datasetType+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def getMarkerAllMapsetInfoByDataset(self, outputFilePath, datasetId, mapId):
        '''
        For the given datasetId and mapId, this function outputs all markers in the dataset
        and the mapsets associated with them. Markers with multiple mapsets are repeated.
        '''
        outputFilePath = outputFilePath+".mapset"
        sql = "copy (select * from getMarkerAllMapsetInfoByDataset("+datasetId+","+mapId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createMapsetFile(self, outputFilePath, datasetId, mapId, markerList, sampleList, extractionType):
        #outputFilePath = outputFilePath+".mapset"
        sql = ""
        if extractionType == 2 or extractionType == 3:
            sql = "copy (select * from getMarkerMapsetInfoByMarkerList('{"+(','.join(markerList))+"}')) to STDOUT with delimiter E'\\t'"+" csv header;"
        elif extractionType == 1:
            sql = "copy (select * from getMarkerAllMapsetInfoByDataset("+datasetId+","+mapId+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createMarkerPositionsFile(self, outputFilePath, markerList, datasetType):
        outputFilePath = outputFilePath+".pos"
        sql = "copy (select * from getMatrixPosOfMarkers('{"+(','.join(markerList))+"}',"+datasetType+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def createSamplePositionsFile(self, outputFilePath, sampleList, datasetType):
        outputFilePath = outputFilePath+".pos"
        sql = "copy (select * from getMatrixPosOfSamples('{"+(','.join(sampleList))+"}',"+datasetType+")) to STDOUT with delimiter E'\\t'"+" csv header;"
        with open(outputFilePath, 'w') as outputFile:
            self.cur.copy_expert(sql, outputFile, 20480)
            outputFile.close()

    def getMarkerIds(self, markerNames, platformList):
        #print("Generating marker ids...")
        if markerNames and platformList:
            #print(self.cur.mogrify("select marker_id from getMarkerIdsByMarkerNamesAndPlatformList(%s, %s)", ("{"+(','.join(markerNames))+"}", "{"+(','.join(platformList))+"}")))
            self.cur.execute("select marker_id from getMarkerIdsByMarkerNamesAndPlatformList(%s, %s)", ("{"+(','.join(markerNames))+"}", "{"+(','.join(platformList))+"}"))
        elif markerNames and not platformList:
            #print(self.cur.mogrify("select marker_id from getMarkerIdsByMarkerNames(%s)", ("{"+(','.join(markerNames))+"}",)))
            self.cur.execute("select marker_id from getMarkerIdsByMarkerNames(%s)", ("{"+(','.join(markerNames))+"}",))
        elif platformList and not markerNames:
            #print(self.cur.mogrify("select marker_id from getMarkerIdsByPlatformList(%s)", ("{"+(','.join(platformList))+"}",)))
            self.cur.execute("select marker_id from getMarkerIdsByPlatformList(%s)", ("{"+(','.join(platformList))+"}",))
        else:
            # both params are null
            return None
        res = self.cur.fetchall()
        return res

    def getDnarunIds(self, piId, projectId, sampleType, sampleNames):
        print("Deriving Dnarun IDs...")
        if sampleNames and sampleType > 0:
            #1 = Germplasm Names, 2 = External Codes, 3 = DnaSample Names
            if sampleType == 1:
                if projectId > 0:
                    print("...based on Germplasm Names AND projectID")
                    self.cur.execute("select dnarun_id from getDnarunIdsByGermplasmNamesAndProject(%s, %s)", ("{"+(','.join(sampleNames))+"}", projectId))
                elif piId > 0:
                    print("...based on Germplasm Names AND piId")
                    self.cur.execute("select dnarun_id from getDnarunIdsByGermplasmNamesAndPI(%s, %s)", ("{"+(','.join(sampleNames))+"}", piId))
                else:
                    print("...based on Germplasm Names")
                    self.cur.execute("select dnarun_id from getDnarunIdsByGermplasmNames(%s)", ("{"+(','.join(sampleNames))+"}",))
            elif sampleType == 2:
                if projectId > 0:
                    print("...based on External Codes AND projectID")
                    self.cur.execute("select dnarun_id from getDnarunIdsByExternalCodesAndProject(%s, %s)", ("{"+(','.join(sampleNames))+"}", projectId))
                elif piId > 0:
                    print("...based on External Codes AND piId")
                    self.cur.execute("select dnarun_id from getDnarunIdsByExternalCodesAndPI(%s, %s)", ("{"+(','.join(sampleNames))+"}", piId))
                else:
                    print("...based on External Codes")
                    self.cur.execute("select dnarun_id from getDnarunIdsByExternalCodes(%s)", ("{"+(','.join(sampleNames))+"}",))
            elif sampleType == 3:
                if projectId > 0:
                    print("...based on Dnasample Names AND projectID")
                    self.cur.execute("select dnarun_id from getDnarunIdsByDnasampleNamesAndProject(%s, %s)", ("{"+(','.join(sampleNames))+"}", projectId))
                elif piId > 0:
                    print("...based on Dnasample Names AND piId")
                    self.cur.execute("select dnarun_id from getDnarunIdsByDnasampleNamesAndPI(%s, %s)", ("{"+(','.join(sampleNames))+"}", piId))
                else:
                    print("...based on Dnasample Names")
                    #print(self.cur.mogrify("select dnarun_id from getDnarunIdsByDnasampleNames(%s)", ("{"+(','.join(sampleNames))+"}",)))
                    self.cur.execute("select dnarun_id from getDnarunIdsByDnasampleNames(%s)", ("{"+(','.join(sampleNames))+"}",))
            else:
                print("Invalid usage.")
                return None
        elif projectId > 0:
            print("...based on projectID")
            self.cur.execute("select dnarun_id from getDnarunIdsByProject(%s)", (projectId,))
        elif piId > 0:
            print("...based on PI")
            self.cur.execute("select dnarun_id from getDnarunIdsByPI(%s)", (piId,))
        else:
            print("Invalid usage.")
            return None
        res = self.cur.fetchall()
        return res

    #datasetType = required, sampleList = required, platformList = optional
    def getMarkerIdsFromSamples(self, sampleList, datasetType, platformList):
        if sampleList and platformList:
            self.cur.execute("select marker_id from getMarkerIdsBySamplesPlatformsAndDatasetType(%s, %s, %s)", ("{"+(','.join(sampleList))+"}", "{"+(','.join(platformList))+"}", datasetType))
        elif sampleList and not platformList:
            self.cur.execute("select marker_id from getMarkerIdsBySamplesAndDatasetType(%s, %s)", ("{"+(','.join(sampleList))+"}", datasetType))
        else:
            # invalid usage
            return None
        res = self.cur.fetchall()
        return res

    def commitTransaction(self):
        self.conn.commit()

    def rollbackTransaction(self):
        self.conn.rollback()

    def closeConnection(self):
        self.connMgr.disconnectFromDatabase()