def convertSourceItems(self, convOptions):
    """Primary run function to process the contents of the order_med
    table and convert them into equivalent patient_item, clinical_item,
    and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose order_time_jittered is on or after that date.
    endDate - If provided, only return items whose order_time_jittered is before that date.
    """
    log.info("Conversion for items dated {} to {}".format(convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()

    conn = self.connFactory.connection()
    try:
        # Load up the medication mapping table to facilitate subsequent conversions
        rxcuiDataByMedId = self.loadRXCUIData()

        # Next round for medications directly from order_med table not addressed in medmix
        for sourceItem in self.querySourceItems(rxcuiDataByMedId, convOptions, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn)
            progress.Update()
    finally:
        conn.close()
    progress.PrintStatus()
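# Usage sketch (illustrative, not from the source): drive convertSourceItems over a
# date window. "OrderMedConversion" and "ConversionOptions" are hypothetical names;
# only the startDate/endDate attributes are documented in the docstring above.
from datetime import datetime

convOptions = ConversionOptions()            # hypothetical holder for startDate/endDate
convOptions.startDate = datetime(2013, 1, 1)
convOptions.endDate = datetime(2014, 1, 1)

converter = OrderMedConversion()             # hypothetical class defining convertSourceItems above
converter.convertSourceItems(convOptions)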
def convertSourceItems(self, convOptions, conn=None):
    """Primary run function to process the contents of the raw source
    table and convert them into equivalent patient_item, clinical_item,
    and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose ordering_date is on or after that date.
    endDate - If provided, only return items whose ordering_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()

    extConn = conn is not None
    if not extConn:
        conn = self.connFactory.connection()
    try:
        # Next round for medications directly from order_med table not addressed in medmix
        # TODO (nodir) seems like an unrelated comment?
        category = self.categoryFromSourceItem(conn)
        for sourceItem in self.querySourceItems(convOptions):
            log.debug('sourceItem: {}'.format(sourceItem))
            self.convertSourceItem(category, sourceItem, conn=conn)
            progress.Update()
    finally:
        # Only close locally created connections; leave external connections to the caller
        if not extConn:
            conn.close()
    progress.PrintStatus()
def generatePatientItemsForCompositeId(self, clinicalItemIds, compositeId, conn=None):
    """Create patient_item records for the composite to match the given
    clinical item ID patient items.
    """
    extConn = True
    if conn is None:
        conn = self.connFactory.connection()
        extConn = False
    try:
        # Record linking information
        for componentId in clinicalItemIds:
            linkModel = RowItemModel()
            linkModel["clinical_item_id"] = compositeId
            linkModel["linked_item_id"] = componentId

            insertQuery = DBUtil.buildInsertQuery("clinical_item_link", linkModel.keys())
            insertParams = linkModel.values()
            DBUtil.execute(insertQuery, insertParams, conn=conn)

        # Extract back link information, which will also flatten out any potential inherited links
        linkedItemIdsByBaseId = self.loadLinkedItemIdsByBaseId(conn=conn)
        linkedItemIds = linkedItemIdsByBaseId[compositeId]

        # Create patient_item records for the composite clinical item to overlap existing component ones
        # First query for the existing component records
        query = SQLQuery()
        query.addSelect("*")
        query.addFrom("patient_item")
        query.addWhereIn("clinical_item_id", linkedItemIds)
        results = DBUtil.execute(query, includeColumnNames=True, conn=conn)
        patientItems = modelListFromTable(results)

        # Patch component records to instead become composite item records, then insert back into the database
        progress = ProgressDots(total=len(patientItems))
        for patientItem in patientItems:
            del patientItem["patient_item_id"]
            patientItem["clinical_item_id"] = compositeId
            patientItem["analyze_date"] = None

            insertQuery = DBUtil.buildInsertQuery("patient_item", patientItem.keys())
            insertParams = patientItem.values()
            try:
                # Optimistic insert of a new unique item
                DBUtil.execute(insertQuery, insertParams, conn=conn)
            except conn.IntegrityError as err:
                # If it turns out to be a duplicate, okay, just note it and continue to insert whatever else is possible
                log.info(err)
            progress.Update()
        # progress.PrintStatus();
    finally:
        if not extConn:
            conn.close()
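# Usage sketch (illustrative only): after creating a composite clinical item, link it
# to its component items so it inherits their patient_item history. The IDs and the
# "aggregator" instance (of the class defining the method above) are hypothetical.
componentItemIds = [101, 102, 103]   # existing clinical_item_ids to fold into the composite
compositeItemId = 999                # clinical_item_id of the previously created composite
aggregator.generatePatientItemsForCompositeId(componentItemIds, compositeItemId)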
def convertSourceItems(self, convOptions):
    """Primary run function to process the contents of the stride_order_med
    table and convert them into equivalent patient_item, clinical_item,
    and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose ordering_date is on or after that date.
    endDate - If provided, only return items whose ordering_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()

    conn = self.connFactory.connection()
    try:
        # Load up the medication mapping table to facilitate subsequent conversions
        rxcuiDataByMedId = self.loadRXCUIData(conn=conn)

        # Keep track of which order meds have already been converted based on mixture components
        #   (don't repeat for the aggregate order then).
        # Can be a lot to store in local memory for large conversions, so may need to batch smaller sub-processes.
        convertedOrderMedIds = set()

        # First round for medication combinations that must be extracted from order_medmixinfo table
        for sourceItem in self.queryMixSourceItems(rxcuiDataByMedId, convOptions, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn)
            convertedOrderMedIds.add(sourceItem["order_med_id"])
            progress.Update()

        # Next round for medications directly from order_med table not addressed in medmix
        for sourceItem in self.querySourceItems(rxcuiDataByMedId, convOptions, progress=progress, conn=conn):
            if sourceItem["order_med_id"] not in convertedOrderMedIds:
                # Don't repeat conversion if mixture components already addressed
                self.convertSourceItem(sourceItem, conn=conn)
            progress.Update()
    finally:
        conn.close()
    progress.PrintStatus()
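# Minimal batching sketch (an assumption, not the source's implementation): the
# comment above notes that convertedOrderMedIds can grow large for big conversions,
# so one mitigation is to run the conversion over smaller date windows, letting the
# set be rebuilt and discarded per window. Assumes a ConversionOptions-style holder
# with the startDate/endDate attributes documented above.
from datetime import timedelta

def convertInDateBatches(converter, firstDate, lastDate, windowDays=30):
    windowStart = firstDate
    while windowStart < lastDate:
        windowEnd = min(windowStart + timedelta(days=windowDays), lastDate)
        convOptions = ConversionOptions()  # hypothetical options holder
        convOptions.startDate = windowStart
        convOptions.endDate = windowEnd
        converter.convertSourceItems(convOptions)  # convertedOrderMedIds lives only for this window
        windowStart = windowEnd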
def convertSourceItems(self, userSIDs=None, limit=None, offset=None):
    """Primary run function to process the contents of the source table
    and convert them into normalized data table entries.
    """
    log.info("Conversion for patients: %s" % userSIDs)
    progress = ProgressDots()

    conn = self.connFactory.connection()
    try:
        for i, sourceItem in enumerate(self.querySourceItems(userSIDs, limit, offset, progress=progress, conn=conn)):
            self.convertSourceItem(sourceItem, conn=conn)
            progress.Update()

        # Collapse accumulated metric description lines into single entries for the metric table
        self.updateMetricDescriptionLines()
    finally:
        conn.close()
def convertSourceItems(self, startDate=None, endDate=None):
    """Primary run function to process the contents of the stride_order_proc
    table and convert them into equivalent patient_item, clinical_item,
    and clinical_item_category entries.
    Should look for redundancies to avoid repeating conversion.

    startDate - If provided, only return items whose ordering_date is on or after that date.
    endDate - If provided, only return items whose ordering_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (startDate, endDate))
    progress = ProgressDots()

    conn = self.connFactory.connection()
    try:
        for sourceItem in self.querySourceItems(startDate, endDate, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn)
            progress.Update()
    finally:
        conn.close()
    progress.PrintStatus()
def convertSourceItems(self, convOptions):
    """Primary run function to process the contents of the raw source
    table and convert them into equivalent patient_item, clinical_item,
    and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose ordering_date is on or after that date.
    endDate - If provided, only return items whose ordering_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()

    conn = self.connFactory.connection()
    try:
        # Next round for medications directly from order_med table not addressed in medmix
        for sourceItem in self.querySourceItems(convOptions, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn)
            progress.Update()
    finally:
        conn.close()
    progress.PrintStatus()
def updateFromFile(sourceFile, tableName, columnNames=None, nIdCols=1, delim=None, skipErrors=False, connFactory=None):
    """Update the database with the contents of a whitespace-delimited text file.

    Updates the contents of the <tableName> with the data from the <sourceFile>.
    One line is expected in the <sourceFile> per row in the database, with each item
    delimited by the <delim> character (specify None for any whitespace).
    These items will be inserted under the respective order of the given list of <columnNames>.
    If the columnNames parameter is not provided, assume the first line of the <sourceFile>
    contains the column names.

    To know which rows to update, assume the FIRST column listed in <columnNames> is the ID column
    used to identify rows. In that case, the data value there from the <sourceFile> will not be used
    to update the row, but will instead identify the row whose remaining data is to be updated.
    If more than one column is necessary to identify a row (composite key), indicate how many of the
    first columns in <columnNames> should be used with <nIdCols>. Note that these key ID values must
    not be None / null. The query looks for rows where columnname = value, and the = operator always
    returns false when the value is null.

    Returns the total number of rows successfully updated.
    """
    if columnNames is None or len(columnNames) < 1:
        headerLine = sourceFile.readline()
        columnNames = headerLine.split(delim)

    conn = None
    if connFactory is not None:
        conn = connFactory.connection()
    else:
        conn = connection()
    cur = conn.cursor()

    nCols = len(columnNames)

    try:
        # Prepare the SQL statement
        sql = []
        sql.append("update")
        sql.append(tableName)
        sql.append("set")

        # Data columns
        for i in range(nIdCols, nCols):
            sql.append(columnNames[i])
            sql.append("=")
            sql.append(Env.SQL_PLACEHOLDER)
            sql.append(",")
        sql.pop()  # Remove extra comma at end

        # ID columns
        sql.append("where")
        for i in range(nIdCols):
            sql.append(columnNames[i])
            sql.append("=")
            sql.append(Env.SQL_PLACEHOLDER)
            sql.append("and")
        sql.pop()  # Remove extra "and" at end

        sql = str.join(" ", sql)
        log.debug(sql)

        # Loop through the file and execute the update statement for every line
        progress = ProgressDots()
        for iLine, line in enumerate(sourceFile):
            if not line.startswith(COMMENT_TAG):
                try:
                    line = line[:-1]  # Strip the newline character
                    params = line.split(delim)

                    # Special handling for null / None strings
                    for iParam in range(len(params)):
                        if params[iParam] == "" or params[iParam] == NULL_STRING:
                            # Treat blank strings as NULL
                            params[iParam] = None

                    # Reposition ID columns to end of parameter list
                    idParams = params[:nIdCols]
                    dataParams = params[nIdCols:]
                    paramTuple = dataParams
                    paramTuple.extend(idParams)
                    paramTuple = tuple(paramTuple)

                    cur.execute(sql, paramTuple)

                    # Need to "auto-commit" after each command,
                    # otherwise a skipped error will rollback
                    # any previous commands as well
                    if skipErrors:
                        conn.commit()

                    progress.Update()
                except Exception as err:
                    conn.rollback()  # Reset changes and connection state
                    log.critical(sql)
                    log.critical(paramTuple)
                    log.warning("Error Executing in Script: %s", parameterizeQueryString(sql, paramTuple))
                    if skipErrors:
                        log.warning(err)
                    else:
                        raise err
        conn.commit()
        return progress.GetCounts()
    finally:
        conn.close()  # Close the DB connection created above
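# Usage sketch (illustrative only): update the name/status columns of a hypothetical
# "account" table, identifying rows by the first listed column (account_id). Since
# columnNames is supplied, the file should contain data lines only (no header row),
# whitespace-delimited, e.g.:
#   42  jdoe  active
with open("accountUpdates.txt") as sourceFile:
    nUpdated = updateFromFile(sourceFile, "account",
                              columnNames=["account_id", "user_name", "status"],
                              nIdCols=1)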
def insertFile(sourceFile, tableName, columnNames=None, delim=None, idFile=None, skipErrors=False, dateColFormats=None, escapeStrings=False, estInput=None, connFactory=None):
    """Insert the contents of a delimited text file into the database.

    For PostgreSQL specifically, consider the alternative direct COPY command, which can run ~10x faster, e.g.:

        gzip -d -c TTeam_2014.tsv.gz | psql -U jonc101 -c "COPY tteamx ( pat_deid, enc_deid, relation, prov_id, prov_name, start_date, end_date ) FROM STDIN WITH (FORMAT csv, DELIMITER E'\t', HEADER, NULL 'None');" resident-access-log-2017

    Inserts the contents of the <sourceFile> into the database under the <tableName>.
    One line is expected in the <sourceFile> per row in the database, with each item delimited
    by the <delim> character. These items will be inserted under the respective order of the
    given list of <columnNames>. Uses the built-in csv module for parsing out lines and
    managing quotes, etc. If the delimiter is not specified (None), then default to tab-delimited.

    If idFile is provided, will try to run SQL from the identityQuery method after each insert,
    and write out the contents, one per line, to the idFile.
    Will bypass the above step if an insert column with the expected default ID column name
    ("tableName_id") can be found.

    If dateColFormats is provided, expect a dictionary keyed by the names of columns that should be
    interpreted as date strings, with values equal to the Python date format string to parse them by.
    If a format string is not provided, a series of standard date format strings will be attempted
    (but this is inefficient for repeated date text parsing and error handling).

    Returns the total number of rows successfully inserted.
    """
    if columnNames is not None and len(columnNames) < 1:
        columnNames = None  # If empty columnNames list, then reset to null and look for it in first line of data

    reader = TabDictReader(sourceFile, fieldnames=columnNames, delimiter=delim)
    columnNames = reader.fieldnames

    idCol = defaultIDColumn(tableName)
    iIdCol = None  # Index of manually specified ID column. May be null
    for iCol, colName in enumerate(columnNames):
        if colName == idCol:
            iIdCol = iCol

    if dateColFormats is not None:
        # Ensure column keys are normalized (copy the key list so the dict can be extended while iterating)
        dateCols = list(dateColFormats.keys())
        for dateCol in dateCols:
            normalCol = normalizeColName(dateCol)
            dateColFormats[normalCol] = dateColFormats[dateCol]

    conn = None
    if connFactory is not None:
        conn = connFactory.connection()
    else:
        conn = connection()
    cur = conn.cursor()

    try:
        # Prepare the SQL statement
        sqlParts = []
        sqlParts.append("insert into")
        sqlParts.append(tableName)
        sqlParts.append("(")
        sqlParts.append(str.join(",", columnNames))
        sqlParts.append(")")
        sqlParts.append("values")
        sqlParts.append("(")
        for i in range(len(columnNames)):
            sqlParts.append(Env.SQL_PLACEHOLDER)  # Parameter placeholder, depends on DB-API
            sqlParts.append(",")
        sqlParts.pop()  # Remove extra end comma
        sqlParts.append(")")
        sql = str.join(" ", sqlParts)
        log.debug(sql)

        # Loop through the file and execute the insert statement every time enough delimited parameters are found
        nInserts = 0
        nCols = len(columnNames)
        params = list()
        progress = ProgressDots(total=estInput)
        for iLine, rowModel in enumerate(reader):
            # Parse out data values from strings
            for iCol, colName in enumerate(columnNames):
                value = parseValue(rowModel[colName], colName, dateColFormats, escapeStrings)
                params.append(value)
            log.debug(params)

            try:
                cur.execute(sql, tuple(params))
                nInserts += cur.rowcount

                if idFile is not None:
                    rowId = None
                    if iIdCol is not None:  # Look for manually assigned ID value first
                        rowId = params[iIdCol]
                    else:
                        cur.execute(identityQuery(tableName))
                        rowId = cur.fetchone()[0]
                    print(rowId, file=idFile)

                # Need to "auto-commit" after each command,
                # otherwise a skipped error will rollback
                # any previous commands as well
                if skipErrors:
                    conn.commit()

                progress.Update()
            except Exception as err:
                log.info(sql)
                log.info(tuple(params))
                conn.rollback()  # Reset any changes since the last commit
                if skipErrors:
                    log.warning("Error Executing in Script: " + sql)
                    log.warning(err)
                else:
                    raise
            params = list()
        conn.commit()
        return nInserts
    finally:
        conn.close()  # Close the DB connection created above
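# Usage sketch (illustrative only): bulk-load a tab-delimited extract whose first
# line holds the column names, parsing one column as a datetime. Table name, file
# names, and the date column are assumptions for demonstration.
with open("labResults.tsv") as sourceFile, open("insertedIds.txt", "w") as idFile:
    nInserted = insertFile(
        sourceFile, "lab_result",
        delim="\t",                                       # tab-delimited input
        idFile=idFile,                                    # capture generated row IDs, one per line
        dateColFormats={"result_time": "%Y-%m-%d %H:%M"}  # parse this column as a datetime
    )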
def execute(query, parameters=None, includeColumnNames=False, incTypeCodes=False, formatter=None, conn=None, connFactory=None, autoCommit=True):
    """Execute a single SQL query / command against the database.

    If the cursor description attribute is not None, this implies the query was a select statement
    that produced a result set, which will be returned by the fetchall() method.
    If the description is null, then at least return the rowcount affected by the query.
    This may be -1 or None still if it is a non-row-affecting command (e.g. create / drop).

    If includeColumnNames is true and the query yields a result set, then one row (list) will be
    added to the beginning which contains the names of each column as extracted from the
    cursor.description.

    If incTypeCodes is true and the query yields a result set, a row (list) will be added to the
    beginning (but after column names if those are included as well), which contains the numerical
    type codes of each column as extracted from the cursor.description.

    This method is probably not terribly efficient and should only be used for prototype testing
    and short command-line functions. For retrieving data to send to stdout or some other stream,
    add the formatter parameter as an instance of a ResultFormatter object to pipe the data through
    one fetch at a time. In that case, the full results (which are presumably large) will NOT be
    returned by the method.

    If the query object is actually a SQLQuery object, then will use the SQLQuery.getParams()
    as the params, and str(SQLQuery) as the query string.

    If autoCommit is True, will autoCommit. The function will also autoCommit if an external
    connection is NOT supplied.
    """
    # Look for an explicitly specified external connection
    extConn = conn is not None
    if conn is None:
        # If no specific connection object provided, look for a connection factory to produce one
        if connFactory is not None:
            conn = connFactory.connection()
        else:
            # No connection or factory specified, just fall back on default connection then
            conn = connection()

    cur = conn.cursor()

    if isinstance(query, SQLQuery):
        if parameters is None:
            parameters = tuple(query.getParams())
        else:
            parameters = tuple(parameters)
        query = str(query)
    elif parameters is None:
        parameters = ()

    #log.debug(parameterizeQueryString(query,parameters));

    returnValue = None
    try:
        timer = time.time()
        try:
            cur.execute(query, parameters)
        except Exception as err:
            log.error(err)
            #log.error(parameterizeQueryString(query,parameters));
            if (not extConn) or autoCommit:
                conn.rollback()
            raise
        timer = time.time() - timer
        log.debug("Query Time: (%1.3f sec)" % timer)

        if cur.description is not None:
            returnValue = []

            colNames = None
            if includeColumnNames:
                colNames = columnNamesFromCursor(cur)
                returnValue.append(colNames)

            if incTypeCodes:
                typeCodes = typeCodesFromCursor(cur)
                returnValue.append(typeCodes)

            if formatter is not None:
                # An output formatter was specified, pipe the data out one row at a time
                if includeColumnNames:
                    formatter.formatTuple(colNames)

                progress = ProgressDots()
                row = cur.fetchone()
                while row is not None:
                    formatter.formatTuple(row)
                    row = cur.fetchone()
                    progress.Update()
                log.info("%d Rows Completed", progress.GetCounts())

                returnValue = cur.rowcount
            else:
                # No formatter specified, just return the entire result set
                dataTable = list(cur.fetchall())
                for i, row in enumerate(dataTable):
                    dataTable[i] = list(row)
                returnValue.extend(dataTable)
        else:
            returnValue = cur.rowcount
            if (not extConn) or autoCommit:
                conn.commit()
    finally:
        if not extConn:
            conn.close()  # Close the locally created connection; leave external connections to the caller
    return returnValue
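# Usage sketch (illustrative only): a parameterized select through execute(), using
# the SQLQuery builder methods seen in generatePatientItemsForCompositeId above.
# Table/column names and ID values are assumptions for demonstration.
query = SQLQuery()
query.addSelect("patient_id")
query.addSelect("item_date")
query.addFrom("patient_item")
query.addWhereIn("clinical_item_id", [101, 102])

results = execute(query, includeColumnNames=True)
colNames = results[0]   # first row holds column names when includeColumnNames=True
dataRows = results[1:]  # remaining rows are the result set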
def __call__(self, analysisQuery, conn=None): """Go through the validation file to assess order set usage amongst test cases """ extConn = True if conn is None: conn = self.connFactory.connection() extConn = False try: conn = DBUtil.connection() preparer = PreparePatientItems() progress = ProgressDots(50, 1, "Patients") for patientItemData in preparer.loadPatientItemData(analysisQuery): patientId = patientItemData["patient_id"] analysisResults = \ self.analyzePatientItems \ ( patientItemData, analysisQuery, analysisQuery.baseRecQuery, patientId, analysisQuery.recommender, conn=conn ) if analysisResults is not None: (queryItemCountById, verifyItemCountById, recommendedItemIds, recommendedData, orderSetItemData) = analysisResults # Unpack results # Start aggregating and calculating result stats resultsStatData = self.calculateResultStats( patientItemData, queryItemCountById, verifyItemCountById, recommendedItemIds, self.supportRecommender.patientCountByItemId, analysisQuery.baseRecQuery, recommendedData) resultsStatData["usedOrderSetIds"] = orderSetItemData[ "allUsedOrderSetIds"] resultsStatData["numUsedOrderSets"] = len( orderSetItemData["allUsedOrderSetIds"]) resultsStatData["numUsedOrderSetItems"] = len( orderSetItemData["allUsedOrderSetItemIds"]) resultsStatData["numAvailableOrderSetItems"] = len( orderSetItemData["allAvailableOrderSetItemIds"]) resultsStatData["numRecommendableUsedOrderSetItems"] = len( orderSetItemData["recommendableUsedOrderSetItemIds"]) resultsStatData[ "numRecommendableAvailableOrderSetItems"] = len( orderSetItemData[ "recommendableAvailableOrderSetItemIds"]) resultsStatData["numRecommendableQueryItems"] = len( orderSetItemData["recommendableQueryItemIds"]) resultsStatData["numRecommendableVerifyItems"] = len( orderSetItemData["recommendableVerifyItemIds"]) resultsStatData["numRecommendableQueryVerifyItems"] = len( orderSetItemData["recommendableQueryItemIds"] | orderSetItemData["recommendableVerifyItemIds"]) # Union of two sets resultsStatData["orderSetItemUsageRate"] = 0.0 if resultsStatData["numAvailableOrderSetItems"] > 0: resultsStatData["orderSetItemUsageRate"] = float( resultsStatData["numUsedOrderSetItems"] ) / resultsStatData["numAvailableOrderSetItems"] resultsStatData[ "recommendableQueryVerifyItemFromOrderSetRate"] = 0.0 if resultsStatData["numRecommendableQueryVerifyItems"] > 0: resultsStatData[ "recommendableQueryVerifyItemFromOrderSetRate"] = float( resultsStatData[ "numRecommendableUsedOrderSetItems"] ) / resultsStatData[ "numRecommendableQueryVerifyItems"] yield resultsStatData progress.Update() # progress.PrintStatus(); finally: if not extConn: conn.close()
def __call__(self, analysisQuery, conn=None):
    extConn = True
    if conn is None:
        conn = self.connFactory.connection()
        extConn = False
    try:
        # Preload some lookup data to facilitate subsequent checks
        baseCountByItemId = self.dataManager.loadClinicalItemBaseCountByItemId(conn=conn)

        # Recommender to test with
        recommender = analysisQuery.recommender

        # Start building basic recommendation query to use for testing
        recQuery = analysisQuery.baseRecQuery

        # Start building results data
        resultsStatDataList = list()
        progress = ProgressDots(50, 1, "Patients")

        # Query for all of the order / item data for the test patients. Load one patient's data at a time
        preparer = PreparePatientItems()
        for patientItemData in preparer.loadPatientItemData(analysisQuery, conn=conn):
            patientId = patientItemData["patient_id"]
            analysisResults = \
                self.analyzePatientItems(
                    patientItemData,
                    analysisQuery,
                    recQuery,
                    patientId,
                    recommender,
                    preparer,
                    conn=conn)

            if analysisResults is not None:
                # Unpack results
                (queryItemCountById, verifyItemCountById, recommendedItemIds, recommendedData) = analysisResults

                # Start aggregating and calculating result stats
                resultsStatData = self.calculateResultStats(patientItemData, queryItemCountById, verifyItemCountById, recommendedItemIds, baseCountByItemId, recQuery, recommendedData)
                if "baseItemId" in patientItemData:
                    # Record something here, so know to report back in result headers
                    analysisQuery.baseItemId = patientItemData["baseItemId"]
                resultsStatDataList.append(resultsStatData)

            progress.Update()
        # progress.PrintStatus();
        return resultsStatDataList
    finally:
        if not extConn:
            conn.close()
def syncTable(sourceConn, targetConn, syncTableName, rowIDStrSet=None, formatter=None):
    if formatter is None:
        idCol = DBUtil.defaultIDColumn(syncTableName)
        idQuery = "select %s from %s" % (idCol, syncTableName)

        # Collect all of the IDs known in the target database and store in memory for rapid lookup
        print("Querying for IDs from Target Database", file=sys.stderr)
        targetIdTable = DBUtil.execute(idQuery, conn=targetConn)
        targetIdSet = set()
        for row in targetIdTable:
            targetId = row[0]
            targetIdSet.add(targetId)

        # Query data out of the source table, but do it with a cursor so we can stream through large data tables
        print("Querying for Source Data", file=sys.stderr)
        dataQuery = "select * from %s" % (syncTableName)
        sourceCursor = sourceConn.cursor()
        sourceCursor.execute(dataQuery)
        colNames = DBUtil.columnNamesFromCursor(sourceCursor)

        targetCursor = targetConn.cursor()
        insertQuery = None
        updateQuery = None

        progress = ProgressDots()
        row = sourceCursor.fetchone()
        while row is not None:
            dataModel = RowItemModel(row, colNames)
            if rowIDStrSet is None or str(dataModel[idCol]) in rowIDStrSet:
                if rowIDStrSet is not None:
                    print("Syncing record: %s" % dataModel[idCol], file=sys.stderr)
                if dataModel[idCol] not in targetIdSet:
                    # Row does not yet exist in target database, need to insert it
                    if insertQuery is None:
                        insertQuery = DBUtil.buildInsertQuery(syncTableName, list(dataModel.keys()))
                    insertParams = list(dataModel.values())
                    targetCursor.execute(insertQuery, insertParams)
                else:
                    # Row already exists in target database, just update values
                    if updateQuery is None:
                        updateQuery = DBUtil.buildUpdateQuery(syncTableName, list(dataModel.keys()))
                    updateParams = []
                    updateParams.extend(list(dataModel.values()))
                    updateParams.append(dataModel[idCol])
                    targetCursor.execute(updateQuery, updateParams)
            if progress.GetCounts() % progress.big == 0:
                targetConn.commit()
            row = sourceCursor.fetchone()
            progress.Update()
        progress.PrintStatus()
        targetConn.commit()
    else:
        # Use the formatter instead of row-by-row insert/update
        # Set up the formatter
        theFormatter = formatter(syncTableName, targetConn, includeColumnNames=True, autoCommit=True)
        # Call DBUtil.execute, piping the source rows through the formatter
        res = DBUtil.execute("select * from %s" % syncTableName, includeColumnNames=True, conn=sourceConn, formatter=theFormatter)
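# Usage sketch (illustrative only): mirror a table from a source database into a
# target database, inserting missing rows and updating existing ones. The database
# names and driver are assumptions; syncTable only relies on DB-API connections.
import psycopg2  # assumed driver for this sketch

sourceConn = psycopg2.connect(dbname="stride_source", user="jonc101")
targetConn = psycopg2.connect(dbname="stride_mirror", user="jonc101")
try:
    # Sync all rows; pass rowIDStrSet={"123", "456"} to restrict to specific records
    syncTable(sourceConn, targetConn, "clinical_item")
finally:
    sourceConn.close()
    targetConn.close()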