def reportable_region(self, value):
    if 'reportable_region' in self._lock_attrs:
        raise AttributeError("can't set attribute")
    # Confirm the requested region is in the db.
    if value:
        connection = DirectAccess(database=self.database,
                                  user=self.user,
                                  password=self.password)
        cursor = connection.raw_query("SELECT count(*) FROM "
                                      "internal_reportable_region "
                                      "WHERE region_name = '%s'" % value)
        if cursor.next()[0] < 1:
            self.error_callback("%s region not found in "
                                "internal_reportable_region table" % value)
        connection.close()
    self._crit['reportable_region'] = value
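
# A minimal sketch of how the setter above is presumably wired up as a
# property on the criteria object.  ReportCriteria is named in
# GenerateReport's docstring below; the __init__ attributes and the getter
# shown here are assumptions for illustration only.
class ReportCriteria(object):

    def __init__(self):
        self._crit = {}           # assumed backing store for criteria values
        self._lock_attrs = set()  # criteria frozen against further writes

    @property
    def reportable_region(self):
        return self._crit.get('reportable_region')

    @reportable_region.setter
    def reportable_region(self, value):
        # validate against internal_reportable_region as in the full
        # setter shown above, then record the criterion
        if 'reportable_region' in self._lock_attrs:
            raise AttributeError("can't set attribute")
        self._crit['reportable_region'] = value
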
class GenerateReport(object):
    """ Process options and generate the requested report.

    Optionally persists the file to the filesystem and uploads it to
    the DOH sftp server.

    """
    __version__ = '0.2'
    config = Config()
    IGNORE_SITE = config.get('longitudinal', 'ignore_site', default='')

    # Order matters, create a tuple of paired values (reportColumn,
    # essenceColumn) - NB, the Diagnosis column is being bastardized.
    # Previously there was an SQL function to do the subselect, but it
    # ran way too slow.  Now contains the foreign key to join w/ the
    # diagnosis for the respective visit.
    diagnosis_column_index = 7
    patient_class_column_index = 11
    columns = (('Hosp', 'hospital'),
               ('Reg Date', 'visit_date'),
               ('Time', 'visit_time'),
               ('Sex', 'gender'),
               ('Age', 'age'),
               ('Reason For Visit', 'chief_complaint'),
               ('Zip Code', 'zip'),
               ('Diagnosis', 'visit_pk'),
               ('Admit Status', 'gipse_disposition'),
               ('Medical Record No.', 'patient_id'),
               ('Visit Record No.', 'visit_id'),
               ('Service Area', 'patient_class'),)
    assert (columns[diagnosis_column_index][1] == 'visit_pk')
    assert (columns[patient_class_column_index][1] == 'patient_class')

    def __init__(self, user=None, password=None,
                 report_criteria=None, datefile=None):
        """Initialize report generation.

        :param user: database user
        :param password: database password
        :param report_criteria: ReportCriteria defining specifics
        :param datefile: useful for persistent walks through time

        """
        self.user = user
        self.password = password
        self.criteria = report_criteria
        self.database = self.criteria.database
        if datefile:
            assert ((self.criteria.start_date, self.criteria.end_date)
                    == datefile.get_date_range())
            self.datePersistence = datefile
        self._diags = {}
        self._prepare_output_file()
        self._prepare_columns()
        self._set_transport()

    def _prepare_columns(self):
        # Don't include the patient_class column if splitting out by
        # patient_class
        if self.criteria.patient_class:
            len_b4 = len(self.columns)
            self.columns = \
                self.columns[:self.patient_class_column_index] \
                + self.columns[self.patient_class_column_index + 1:]
            assert (len(self.columns) + 1 == len_b4)

    def _set_transport(self):
        """Plug in the appropriate transport mechanism"""
        # Transport strategies differ for the different reports
        if self.criteria.reportable_region:
            self._transport = Distribute_client(zip_first=True)
        else:
            self._transport = PHINMS_client(zip_first=True)

    def _generate_output_filename(self, start_date=None, end_date=None):
        start_date = self.criteria.start_date if start_date is None \
            else start_date
        end_date = self.criteria.end_date if end_date is None else end_date
        datestr = end_date.strftime('%Y%m%d')
        if start_date != end_date:
            datestr = '-'.join((start_date.strftime('%Y%m%d'),
                                end_date.strftime('%Y%m%d')))
        filename = self.criteria.report_method + '-' + datestr + '.txt'
        config = Config()
        tmp_dir = config.get('general', 'tmp_dir', default='/tmp')
        filepath = os.path.join(tmp_dir, filename)
        return filepath

    def _prepare_output_file(self):
        """Open the local filesystem file for output"""
        filepath = self._generate_output_filename(
            start_date=self.criteria.start_date,
            end_date=self.criteria.end_date)
        # watch for oversight errors; notify if a like report exists -
        # unless it's size zero (from a previous failed run)
        if os.path.exists(filepath) and os.path.getsize(filepath):
            logging.warning("Found requested report file already "
                            "exists - overwriting: '%s'" % filepath)
        self.output = open(filepath, 'w')
        self._output_filename = self.output.name

    @property
    def output_filename(self):
        if not hasattr(self, '_output_filename'):
            raise RuntimeError("prerequisite call to "
                               "_prepare_output_file() "
                               "didn't happen!")
        return self._output_filename

    def _header(self):
        if self.criteria.include_vitals:
            columns = [c[0] for c in self.columns]
            columns += ('Measured Temperature', 'O2 Saturation',
                        'Self-Reported Influenza Vaccine',
                        'Self-Reported H1N1 Vaccine')
            return '|'.join(columns)
        else:
            return '|'.join([c[0] for c in self.columns])

    def _build_join_tables(self):
        """ Scope continues to grow; build all join tables necessary
        for the query.  Some are only necessary with certain features
        on.

        """
        # Always need the list of reportable visits
        self._build_visit_join_table()
        if self.criteria.include_vitals:
            self._build_vitals_join_table()

    def _build_visit_join_table(self):
        """ Helper in selection of visits for the report - this method
        builds a temporary table and populates it with the visit_pks
        that belong in the report.  This should include all visit_pks
        with the matching admit_datetime as well as any that have
        received updates since the last like report was produced.

        """
        # If include_vitals is on, we also need the visit_id to keep
        # the joins manageable.  vitals don't have a patient class, so
        # you can't join on the same values.
        sql = "CREATE TEMPORARY TABLE reportable_pks (pk "\
            "integer not null unique)"
        selectCols = "fact_visit.pk"
        self._getConn()
        self.access.raw_query(sql)

        # If we're only selecting those facilities in a region, the SQL
        # is more complicated - build up the respective clauses.
        joinClause = regionClause = ""
        if self.criteria.reportable_region:
            joinClause = "JOIN internal_reportable_region ON "\
                "internal_reportable_region.dim_facility_pk = "\
                "fact_visit.dim_facility_pk"
            regionClause = "AND region_name = '%s'" %\
                self.criteria.reportable_region
        # Another HACK!  One site is not even wanted by the state DOH,
        # as it's being duplicated from another source, and ESSENCE
        # can't help but count them twice.  Remove this one site
        # regardless
        else:
            joinClause = "JOIN internal_reportable_region ON "\
                "internal_reportable_region.dim_facility_pk = "\
                "fact_visit.dim_facility_pk"
            regionClause = "AND region_name = '%s'" % self.IGNORE_SITE

        # Limit by patient_class if requested.  Note we may still end
        # up with visit ids that have changed patient classes, so more
        # pruning later is necessary.
        pc_limit = ""
        if self.criteria.patient_class:
            pc_limit = "AND patient_class = '%c'" %\
                self.criteria.patient_class

        # Start with all visits for the requested date range
        sql = "INSERT INTO reportable_pks SELECT %s FROM "\
            "fact_visit %s WHERE admit_datetime BETWEEN '%s' AND "\
            "'%s' %s %s" %\
            (selectCols, joinClause, self.criteria.start_date,
             self.criteria.end_date + timedelta(days=1), pc_limit,
             regionClause)
        self.access.raw_query(sql)

        if self.criteria.include_updates:
            # In this case, add all visits with updates since the
            # last run, but no newer than the requested date (in case
            # we're building reports forward from historical data)
            sql = "SELECT max(processed_datetime) FROM internal_report "\
                "WHERE report_method = '%s'" % self.criteria.report_method
            cursor = self.access.raw_query(sql)
            last_report_generated = cursor.fetchall()[0][0]
            if last_report_generated is None:
                last_report_generated = '2009-01-01'  # our epoch
            logging.debug("including updates, last_report_generated: "
                          "%s", last_report_generated)
            sql = "INSERT INTO reportable_pks SELECT %(sel_cols)s FROM "\
                "fact_visit %(join_clause)s LEFT JOIN reportable_pks ON "\
                "reportable_pks.pk = fact_visit.pk WHERE "\
                "last_updated > '%(last_report)s' AND admit_datetime "\
                "< '%(date)s' AND reportable_pks.pk IS NULL "\
                "%(pc_limit)s %(region_clause)s" %\
                {'sel_cols': selectCols,
                 'last_report': last_report_generated,
                 'date': self.criteria.end_date + timedelta(days=1),
                 'pc_limit': pc_limit,
                 'join_clause': joinClause,
                 'region_clause': regionClause}
            self.access.raw_query(sql)

        cursor = self.access.raw_query("SELECT COUNT(*) FROM "
                                       "reportable_pks")
        logging.debug("%d visits to report on", cursor.fetchall()[0][0])

    def _build_vitals_join_table(self):
        """When the report is to include vitals - we use an additional
        temporary table (visit_loinc_data) to hold the data for more
        timely queries.

        Like the rest of the report, the list of interesting visits is
        limited to the rows in reportable_pks - see
        _build_join_table() for details.

        """
        raise ValueError('not ported yet')
        sql = """
        CREATE TEMPORARY TABLE visit_loinc_data (
          visit_id VARCHAR(255) not null,
          patient_class CHAR(1) default null,
          observation_id VARCHAR(255) not null,
          observation_result VARCHAR(255) not null)
        """
        self._getConn()
        self.access.raw_query(sql)

        sql = """
        INSERT INTO visit_loinc_data (visit_id, patient_class,
          observation_id, observation_result)
        SELECT visit.visit_id, visit.patient_class, observation_id,
          observation_result
        FROM visit JOIN hl7_visit ON visit.visit_id = hl7_visit.visit_id
        JOIN hl7_obx ON hl7_visit.hl7_msh_id = hl7_obx.hl7_msh_id
        JOIN reportable_pks ON reportable_pks.visit_id = visit.visit_id
          AND reportable_pks.patient_class = visit.patient_class
        WHERE observation_id in ('8310-5', '20564-1', '46077-4', '29544-4')
        """
        self.access.raw_query(sql)

    def _select_from_essence_view(self):
        """Build up the SQL select statement to be used in gathering
        the data for this report.

        """
        stmt = """SELECT %s FROM essence e JOIN reportable_pks ri
        ON e.visit_pk = ri.pk""" %\
            (','.join(['e.' + c[1] for c in self.columns]))
        return stmt

    def _select_diagnosis(self):
        """ Need to pull in all the diagnosis data for this report.
        This is saved in an instance dictionary for use in
        self._diagnosis to generate the list of diagnoses for each
        respective visit.  A list of unique diagnoses ordered by rank
        is required.

        """
        # We order descending on dx_datetime as the most recent should
        # be best.  Add any others as the persistence mechanism only
        # saves a unique icd9 dx that has changed status.
        stmt = "SELECT fact_visit_pk, rank, icd9 "\
            "FROM assoc_visit_dx JOIN "\
            "dim_dx ON dim_dx_pk = dim_dx.pk JOIN "\
            "reportable_pks ON "\
            "assoc_visit_dx.fact_visit_pk = reportable_pks.pk "\
            "ORDER BY dx_datetime DESC"
        cursor = self.access.raw_query(stmt)
        for row in cursor.fetchall():
            visit_pk = row[0]
            if visit_pk in self._diags:
                self._diags[visit_pk].add(row[0], row[1], row[2])
            else:
                self._diags[visit_pk] = \
                    SortedDiagnosis(row[0], row[1], row[2])

    def _diagnosis(self, visit_pk):
        if visit_pk in self._diags:
            return [self._diags[visit_pk].__repr__(), ]
        else:
            return ['', ]

    def _select_vitals(self):
        """ Need to pull in all the vitals data for this report.
        This is saved in an instance dictionary for use in
        self._vitals_for_visit to generate the list of vitals for each
        respective visit.

        This is an effective NOP when self.criteria.include_vitals =
        False

        """
        if not self.criteria.include_vitals:
            return None
        self._vitals = {}
        stmt = """SELECT reportable_pks.visit_pk, observation_id,
        observation_result FROM visit_loinc_data JOIN reportable_pks
        ON reportable_pks.visit_id = visit_loinc_data.visit_id"""
        cursor = self.access.raw_query(stmt)
        for row in cursor.fetchall():
            visit_pk = row[0]
            if visit_pk in self._vitals:
                self._vitals[visit_pk].add(row[1], row[2])
            else:
                self._vitals[visit_pk] = \
                    Vitals(row[1], row[2])

    def _vitals_for_visit(self, visit_pk):
        """Returns the list of vitals for the visit in question.

        This is an effective NOP when self.criteria.include_vitals =
        False

        """
        if not self.criteria.include_vitals:
            return []
        if visit_pk in self._vitals:
            return self._vitals[visit_pk].__repr__()
        else:
            return Vitals().__repr__()

    def _write_report(self, save_report=False):
        """ Write out and potentially store the results.

        Generate results via database queries and write the results to
        self.output.

        :param save_report: If set, persist the document and related
          metadata to the mbds archive.

        returns the document ID, the mbds archive key, if saved

        """
        out = self.output
        print >> out, self._header()

        self._build_join_tables()
        self._select_diagnosis()
        self._select_vitals()
        cursor = self.access.raw_query(self._select_from_essence_view())
        for row in cursor.fetchall():
            # Each row is the columns up to the diagnosis + the
            # comma separated diagnosis + the rest of the columns
            # and finally with vitals if configured for such
            visit_pk = row[self.diagnosis_column_index]  # yuck, but true
            print >> out,\
                '|'.join([strSansNone(column) for column in
                          row[:self.diagnosis_column_index]] +
                         self._diagnosis(visit_pk) +
                         [strSansNone(column) for column in
                          row[self.diagnosis_column_index + 1:]] +
                         self._vitals_for_visit(visit_pk))

        # Close the file and persist to the document archive if
        # requested
        self.output.close()
        if save_report:
            metadata = {k: v for k, v in self.criteria._crit.items()
                        if v is not None}
            # At this point, all documents are of 'essence' type
            return document_store(document=self.output.name,
                                  allow_duplicate_filename=True,
                                  document_type='essence',
                                  **metadata)

    def _record_report(self, report_oid):
        """Record the details from this report generation in the db"""
        if not report_oid:
            return
        report = Report(processed_datetime=datetime.now(),
                        file_path=report_oid,
                        report_method=self.criteria.report_method)
        alchemy = AlchemyAccess(database=self.database)
        alchemy.session.add(report)
        alchemy.session.commit()
        alchemy.disconnect()

    def _transmit_report(self, report):
        """Transmit report using self._transport()"""
        logging.info("initiate upload of %s", report)
        self._transport.transfer_file(report)

    def _transmit_differences(self, report):
        """Compute differences from yesterday's like report; transport"""
        # This option really only makes sense on date range reports,
        # as updates hit older data than just 'yesterday'.
        if self.criteria.start_date == self.criteria.end_date:
            raise ValueError("difference calculation not supported on "
                             "single day reports")
        # See if we can find a similar report in the archive from
        # yesterday
        search_criteria = {'report_method': self.criteria.report_method,
                           'start_date': self.criteria.start_date -
                           timedelta(days=1),
                           'end_date': self.criteria.end_date -
                           timedelta(days=1)}
        old_doc = document_find(search_criteria, limit=1)
        if old_doc is None:
            logging.info("No comparable report found for difference "
                         "generation")
            self._transmit_report(report)
        else:
            target_filename = self._generate_output_filename(
                start_date=self.criteria.start_date,
                end_date=self.criteria.end_date)
            # RemoveDuplicates not yet ported!!
            raise ValueError("RemoveDuplicates not ported")
            #from pheme.essence.remove_duplicates import RemoveDuplicates
            #rd = RemoveDuplicates(new_report=report,
            #                      old_report=old_doc,
            #                      out=target_filename)
            #rd.generate_report()
            #logging.info("initiate upload of difference %s",
            #             target_filename)
            #self._transport.transfer_file(target_filename)

    def _getConn(self):
        """ Local wrapper to get database connection """
        if hasattr(self, 'access'):
            return
        self.access = DirectAccess(database=self.database,
                                   user=self.user,
                                   password=self.password)

    def _closeConn(self):
        """ Local wrapper to close database connection """
        if hasattr(self, 'access'):
            self.access.close()

    def tearDown(self):
        "Public interface to clean up internals"
        self._closeConn()

    def execute(self, save_report=False, transmit_report=False,
                transmit_differences=False):
        """Execute the report generation"""
        logging.info("Initiate ESSENCE report generation [%s-%s] for %s",
                     self.criteria.start_date, self.criteria.end_date,
                     self.criteria.report_method)
        self._getConn()
        report_oid = self._write_report(save_report)
        self._record_report(report_oid)
        if transmit_report:
            self._transmit_report(report_oid)
        if transmit_differences:
            self._transmit_differences(report_oid)
        self._closeConn()
        if hasattr(self, 'datePersistence'):
            self.datePersistence.bump_date()
        logging.info("Completed ESSENCE report generation [%s-%s] for %s",
                     self.criteria.start_date, self.criteria.end_date,
                     self.criteria.report_method)
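
# A minimal driver sketch for GenerateReport.  The criteria setup below is
# an assumption: a real ReportCriteria must carry everything the class
# reads (database, report_method, start_date, end_date, patient_class,
# reportable_region, include_vitals, include_updates, ...), and the
# credentials here are placeholders.
from datetime import date

criteria = ReportCriteria()                  # hypothetical, sketched above
criteria.database = 'warehouse'              # assumed attribute values...
criteria.report_method = 'essence_daily'
criteria.start_date = criteria.end_date = date.today()

report = GenerateReport(user='reporter', password='secret',
                        report_criteria=criteria)
report.execute(save_report=True,       # archive via document_store()
               transmit_report=True)   # upload via the configured client
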
class LongitudinalManager(object):
    """ Abstraction to handle which db, user, etc. the deduplication
    process should be run on.  Handles runtime arguments and
    execution.

    Manages the process by farming out the individual visit
    deduplication to a number of worker processes (necessary to take
    advantage of multi-core processors, with the database as the
    limiting factor).

    """
    # The gating issue is the number of postgres connections that are
    # allowed to run concurrently.  Setting this to N-1 (where N is
    # the number of cores) has proven the fastest and most reliable.
    NUM_PROCS = 5

    def __init__(self, data_warehouse=None, data_mart=None,
                 reportDate=None, database_user=None,
                 database_password=None, verbosity=0):
        self.data_warehouse = data_warehouse
        self.warehouse_port = 5432  # postgres default
        self.data_mart = data_mart
        self.mart_port = 5432  # postgres default
        self.reportDate = reportDate and parseDate(reportDate) or None
        self.database_user = database_user
        self.database_password = database_password
        self.dir, thisFile = os.path.split(__file__)
        self.verbosity = verbosity
        self.queue = JoinableQueue()
        self.datefile = "/tmp/longitudinal_datefile"
        self.datePersistence = Datefile(initial_date=self.reportDate)
        self.lock = FileLock(LOCKFILE)
        self.skip_prep = False

    def __call__(self):
        return self.execute()

    def processArgs(self):
        """ Process any optional arguments and positional parameters """
        parser = OptionParser(usage=usage)
        parser.add_option(
            "-c", "--countdown", dest="countdown", default=None,
            help="count {down,up} date using date string in "
                 "%s - set to 'forwards' or 'backwards' "
                 "if desired" % self.datefile)
        parser.add_option(
            "-d", "--date", dest="date", default=None,
            help="single admission date to dedup "
                 "(by default, checks the entire database)")
        parser.add_option(
            "-s", "--skip-prep", dest="skip_prep", default=False,
            action="store_true",
            help="skip the expense of looking for new messages")
        parser.add_option(
            "-v", "--verbose", dest="verbosity", action="count",
            default=self.verbosity, help="increase output verbosity")
        parser.add_option(
            "-m", "--mart-port", dest="mart_port",
            default=self.mart_port, type="int",
            help="alternate port number for data mart")
        parser.add_option(
            "-w", "--warehouse-port", dest="warehouse_port",
            default=self.warehouse_port, type="int",
            help="alternate port number for data warehouse")

        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.error("incorrect number of arguments")

        self.data_warehouse = args[0]
        self.data_mart = args[1]
        self.warehouse_port = parser.values.warehouse_port
        self.mart_port = parser.values.mart_port
        self.verbosity = parser.values.verbosity
        self.skip_prep = parser.values.skip_prep
        initial_date = parser.values.date and \
            parseDate(parser.values.date) or None
        self.datePersistence = Datefile(
            initial_date=initial_date,
            persistence_file=self.datefile,
            direction=parser.values.countdown)
        self.reportDate = self.datePersistence.get_date()

    def _prepDeduplicateTables(self):
        """ Add any missing rows to the MessageProcessed table

        This is the bridge between the data warehouse and the data
        mart.  In an effort to make the data mart independent of the
        warehouse, the processed message data is kept in the mart.

        As we're dealing with two distinct databases, there's no
        referential integrity available at the database level, so care
        should be taken.

        """
        startTime = time.time()
        logging.info("Starting INSERT INTO internal_message_processed "
                     "at %s", startTime)

        # We can take advantage of an "add only" data_warehouse,
        # knowing the hl7_msh_id is a sequence moving in the positive
        # direction.  Simply add any values greater than the previous
        # max.
        stmt = "SELECT max(hl7_msh_id) from internal_message_processed"
        max_id = self.data_mart_access.engine.execute(stmt).first()[0]
        if not max_id:
            max_id = 0

        new_msgs = list()
        stmt = """SELECT hl7_msh_id, message_datetime, visit_id
        FROM hl7_msh JOIN hl7_visit USING (hl7_msh_id)
        WHERE hl7_msh_id > %d """ % max_id
        rs = self.data_warehouse_access.engine.execute(stmt)
        many = 500
        while True:
            results = rs.fetchmany(many)
            if not results:
                break
            for r in results:
                new_msgs.append(MessageProcessed(hl7_msh_id=r[0],
                                                 message_datetime=r[1],
                                                 visit_id=r[2]))
            self.data_mart_access.session.add_all(new_msgs)
            self.data_mart_access.session.commit()
            logging.debug("added %d new messages" % len(new_msgs))
            new_msgs = list()

        logging.info("Added new rows to internal_message_processed in %s",
                     time.time() - startTime)

    def _visitsToProcess(self):
        """ Look up all distinct visit ids needing attention

        Obtain a unique list of visit_ids that have messages that
        haven't previously been processed.  If the user requested just
        one day's worth (i.e. -d), only that day's visits will be
        returned.

        """
        visit_ids = list()
        if not self.reportDate:
            logging.info("Launch deduplication for entire database")
            # Do the whole batch, that is, all that haven't been
            # processed before.
            stmt = """SELECT DISTINCT(visit_id) FROM
            internal_message_processed WHERE processed_datetime IS
            NULL"""
            rs = self.data_mart_access.engine.execute(stmt)
            many = 10000
            while True:
                results = rs.fetchmany(many)
                if not results:
                    break
                for r in results:
                    visit_ids.append(r[0])
        else:
            logging.info("Launch deduplication for %s", self.reportDate)
            # Process the requested day only - as we can't join across
            # db boundaries - first acquire the full list of visits
            # for the requested day from the data_warehouse to use in
            # a massive 'in' clause
            stmt = """SELECT DISTINCT(visit_id) FROM hl7_visit WHERE
            admit_datetime BETWEEN '%s' AND '%s';""" %\
                (self.reportDate, self.reportDate + timedelta(days=1))
            self.access.raw_query(stmt)
            rs = self.data_warehouse_access.engine.execute(stmt)
            many = 1000
            potential_visit_ids = list()
            while True:
                results = rs.fetchmany(many)
                if not results:
                    break
                for r in results:
                    #tmp_table.insert(r[0])
                    potential_visit_ids.append(r[0])

            if potential_visit_ids:
                query = self.data_mart_access.session.query(
                    MessageProcessed.visit_id).distinct().filter(and_(
                        MessageProcessed.processed_datetime == None,
                        MessageProcessed.visit_id.in_(potential_visit_ids)))
                for r in query:
                    visit_ids.append(r[0])

        logging.info("Found %d visits needing attention", len(visit_ids))
        return visit_ids

    def tearDown(self):
        """ Clean up any open handles/connections """
        # now done in execute when we're done with the connections

    def execute(self):
        """ Start the process """
        # Initialize logging now (verbosity is now set regardless of
        # invocation method)
        configure_logging(verbosity=self.verbosity,
                          logfile="longitudinal-manager.log")
        logging.info("Initiate deduplication for %s",
                     (self.reportDate and self.reportDate or
                      "whole database"))

        # Only allow one instance of the manager to run at a time.
        if self.lock.is_locked():
            logging.warn("Can't continue, %s is locked ", LOCKFILE)
            return

        if systemUnderLoad():
            logging.warn("system under load - continue anyhow")

        try:
            self.lock.acquire()
            self.access = DirectAccess(database=self.data_warehouse,
                                       port=self.warehouse_port,
                                       user=self.database_user,
                                       password=self.database_password)
            self.data_warehouse_access = AlchemyAccess(
                database=self.data_warehouse, port=self.warehouse_port,
                user=self.database_user, password=self.database_password)
            self.data_mart_access = AlchemyAccess(
                database=self.data_mart, port=self.mart_port,
                user=self.database_user, password=self.database_password)

            startTime = time.time()
            if not self.skip_prep:
                self._prepDeduplicateTables()
            visits_to_process = self._visitsToProcess()

            # Now done with db access needs at the manager level -
            # free up resources:
            self.data_mart_access.disconnect()
            self.data_warehouse_access.disconnect()
            self.access.close()

            # Set of locks used, one for each table needing protection
            # from asynchronous inserts.  Names should match the table
            # minus the 'dim_' prefix, plus a '_lock' suffix,
            # i.e. dim_location -> 'location_lock'
            table_locks = {"admission_source_lock": Lock(),
                           "admission_o2sat_lock": Lock(),
                           "admission_temp_lock": Lock(),
                           "assigned_location_lock": Lock(),
                           "admit_reason_lock": Lock(),
                           "chief_complaint_lock": Lock(),
                           "diagnosis_lock": Lock(),
                           "disposition_lock": Lock(),
                           "flu_vaccine_lock": Lock(),
                           "h1n1_vaccine_lock": Lock(),
                           "lab_flag_lock": Lock(),
                           "lab_result_lock": Lock(),
                           "location_lock": Lock(),
                           "note_lock": Lock(),
                           "order_number_lock": Lock(),
                           "performing_lab_lock": Lock(),
                           "pregnancy_lock": Lock(),
                           "race_lock": Lock(),
                           "reference_range_lock": Lock(),
                           "service_area_lock": Lock(),
                           "specimen_source_lock": Lock(),
                           }

            # If we have visits to process, fire up the workers...
            if len(visits_to_process) > 1:
                for i in range(self.NUM_PROCS):
                    dw = Process(
                        target=LongitudinalWorker,
                        kwargs={"queue": self.queue,
                                "procNumber": i,
                                "data_warehouse": self.data_warehouse,
                                "warehouse_port": self.warehouse_port,
                                "data_mart": self.data_mart,
                                "mart_port": self.mart_port,
                                "dbUser": self.database_user,
                                "dbPass": self.database_password,
                                "table_locks": table_locks,
                                "verbosity": self.verbosity})
                    dw.daemon = True
                    dw.start()

                # Populate the queue
                for v in visits_to_process:
                    self.queue.put(v)

                # Wait on the queue until empty
                self.queue.join()

            # Common cleanup
            self.tearDown()
            self.datePersistence.bump_date()
            logging.info("Queue is empty - done in %s",
                         time.time() - startTime)
        finally:
            self.lock.release()
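
# LongitudinalWorker itself is not part of this listing.  For execute()'s
# self.queue.join() call to return, each worker must call task_done() once
# per visit it pulls off the JoinableQueue.  A minimal sketch of that
# consume loop follows; the constructor arguments and the deduplication
# step are assumptions.
class LongitudinalWorker(object):

    def __init__(self, queue=None, procNumber=None, **kwargs):
        self.queue = queue
        self.procNumber = procNumber
        self.run()

    def run(self):
        while True:                      # daemon process; exits with parent
            visit_id = self.queue.get()  # blocks until a visit is queued
            try:
                self.deduplicate(visit_id)
            finally:
                self.queue.task_done()   # lets JoinableQueue.join() return

    def deduplicate(self, visit_id):
        """Placeholder for the per-visit deduplication work."""
        pass
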
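
# How the manager is presumably driven from the command line; the actual
# script entry point is not part of this listing, so treat this as a
# sketch only.
if __name__ == "__main__":
    manager = LongitudinalManager()
    manager.processArgs()   # parses the options plus the two positional
                            # arguments: data_warehouse and data_mart
    manager()               # __call__ delegates to execute()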