示例#1
0
 def _getConn(self):
     """ Local wrapper to get database connection
     """
     if hasattr(self, 'access'):
         return
     self.access = DirectAccess(database=self.database,
                                user=self.user,
                                password=self.password)
 def _getConn(self):
     """ Local wrapper to get database connection
     """
     if hasattr(self, 'access'):
         return
     self.access = DirectAccess(database=self.database,
                                user=self.user,
                                password=self.password)
示例#3
0
    def reportable_region(self, value):
        """Set the ``reportable_region`` criterion, validating it first.

        A truthy ``value`` is looked up in the
        ``internal_reportable_region`` table over a short-lived
        connection, and ``self.error_callback`` is invoked when no row
        carries that region name.  A falsy ``value`` is stored without
        any lookup.

        :param value: region name to report on, or a falsy value for
          no region restriction

        :raises AttributeError: if ``reportable_region`` is listed in
          ``self._lock_attrs``

        """
        if 'reportable_region' in self._lock_attrs:
            raise AttributeError("can't set attribute")
        # Confirm the requested region is in the db.
        if value:
            connection = DirectAccess(database=self.database,
                                      user=self.user,
                                      password=self.password)
            # Release the connection even when the query fails or
            # error_callback raises.
            try:
                # NOTE(review): value is interpolated straight into the
                # SQL text; switch to bound parameters if raw_query
                # supports them.
                cursor = connection.raw_query("SELECT count(*) FROM "
                                              "internal_reportable_region "
                                              "WHERE region_name = '%s'" %
                                              value)
                if cursor.next()[0] < 1:
                    self.error_callback("%s region not found in "
                                        "internal_reportable_region table" %
                                        value)
            finally:
                connection.close()
        self._crit['reportable_region'] = value
    def reportable_region(self, value):
        """Set the ``reportable_region`` criterion, validating it first.

        A truthy ``value`` is looked up in the
        ``internal_reportable_region`` table over a short-lived
        connection, and ``self.error_callback`` is invoked when no row
        carries that region name.  A falsy ``value`` is stored without
        any lookup.

        :param value: region name to report on, or a falsy value for
          no region restriction

        :raises AttributeError: if ``reportable_region`` is listed in
          ``self._lock_attrs``

        """
        if 'reportable_region' in self._lock_attrs:
            raise AttributeError("can't set attribute")
        # Confirm the requested region is in the db.
        if value:
            connection = DirectAccess(database=self.database,
                                      user=self.user,
                                      password=self.password)
            # Release the connection even when the query fails or
            # error_callback raises.
            try:
                # NOTE(review): value is interpolated straight into the
                # SQL text; switch to bound parameters if raw_query
                # supports them.
                cursor = connection.raw_query("SELECT count(*) FROM "
                                              "internal_reportable_region "
                                              "WHERE region_name = '%s'" %
                                              value)
                if cursor.next()[0] < 1:
                    self.error_callback("%s region not found in "
                                        "internal_reportable_region table" %
                                        value)
            finally:
                connection.close()
        self._crit['reportable_region'] = value
示例#5
0
class GenerateReport(object):
    """Process options and generate the requested report.

    The report is written to a local file.  Optionally it is also
    persisted to the document archive and handed to the selected
    transport client for upload.

    """
    __version__ = '0.2'
    config = Config()
    # Region name read from [longitudinal] ignore_site ('' when not
    # configured); used by _build_visit_join_table() when no
    # reportable_region is requested.
    IGNORE_SITE = config.get('longitudinal', 'ignore_site', default='')

    # Order matters: each entry pairs (report column header, essence
    # column name).  NB, the Diagnosis column is repurposed.
    # Previously there was an SQL function to do the subselect, but it
    # ran way too slow.  The column now selects visit_pk, the key used
    # to look up the diagnoses for the respective visit.

    diagnosis_column_index = 7
    patient_class_column_index = 11
    columns = (
        ('Hosp', 'hospital'),
        ('Reg Date', 'visit_date'),
        ('Time', 'visit_time'),
        ('Sex', 'gender'),
        ('Age', 'age'),
        ('Reason For Visit', 'chief_complaint'),
        ('Zip Code', 'zip'),
        ('Diagnosis', 'visit_pk'),
        ('Admit Status', 'gipse_disposition'),
        ('Medical Record No.', 'patient_id'),
        ('Visit Record No.', 'visit_id'),
        ('Service Area', 'patient_class'),
    )
    # Guard against the index constants drifting from the tuple above.
    assert (columns[diagnosis_column_index][1] == 'visit_pk')
    assert (columns[patient_class_column_index][1] == 'patient_class')

    def __init__(self, user=None, password=None, report_criteria=None,
                 datefile=None):
        """Set up state for one report run.

        Stores the credentials and criteria, then opens the output
        file, trims the column list and selects the transport.

        :param user: database user
        :param password: database password
        :param report_criteria: ReportCriteria defining specifics; its
          ``database`` attribute names the database to use
        :param datefile: useful for persistent walks through time;
          when given, its date range must equal the criteria's
          start/end dates

        """
        self.user, self.password = user, password
        self.criteria = report_criteria
        self.database = report_criteria.database
        if datefile:
            requested_range = (report_criteria.start_date,
                               report_criteria.end_date)
            assert requested_range == datefile.get_date_range()
            self.datePersistence = datefile

        # Filled per visit by _select_diagnosis()
        self._diags = {}
        self._prepare_output_file()
        self._prepare_columns()
        self._set_transport()

    def _prepare_columns(self):
        """Drop the patient_class column when splitting by patient_class.

        When ``self.criteria.patient_class`` is set, ``self.columns``
        is rebound on the instance to a copy that omits the entry at
        ``patient_class_column_index``.
        """
        if not self.criteria.patient_class:
            return

        drop_at = self.patient_class_column_index
        count_before = len(self.columns)
        self.columns = self.columns[:drop_at] + self.columns[drop_at + 1:]
        # Exactly one column must have been removed.
        assert len(self.columns) + 1 == count_before

    def _set_transport(self):
        """Pick the transport client used to deliver this report.

        Reports limited to a reportable region use Distribute_client;
        all others use PHINMS_client.  Either is created with
        ``zip_first=True`` and stored in ``self._transport``.
        """
        transport_class = (Distribute_client
                           if self.criteria.reportable_region
                           else PHINMS_client)
        self._transport = transport_class(zip_first=True)

    def _generate_output_filename(self, start_date=None, end_date=None):
        """Build the path of the report file for a date range.

        The file name is ``<report_method>-<dates>.txt`` where
        ``<dates>`` is the end date as YYYYMMDD, or ``start-end`` when
        the two dates differ.  The directory is the [general] tmp_dir
        config value, '/tmp' when not configured.

        :param start_date: first day covered; defaults to the criteria
          start_date
        :param end_date: last day covered; defaults to the criteria
          end_date

        returns the path of the report file

        """
        if start_date is None:
            start_date = self.criteria.start_date
        if end_date is None:
            end_date = self.criteria.end_date

        day_format = '%Y%m%d'
        datestr = end_date.strftime(day_format)
        if start_date != end_date:
            datestr = start_date.strftime(day_format) + '-' + datestr

        filename = self.criteria.report_method + '-' + datestr + '.txt'
        tmp_dir = Config().get('general', 'tmp_dir', default='/tmp')
        return os.path.join(tmp_dir, filename)

    def _prepare_output_file(self):
        """Open the local report file for writing.

        Sets ``self.output`` to the open file and records its name in
        ``self._output_filename``.  A warning is logged when a
        non-empty file of the same name is about to be overwritten.
        """
        criteria = self.criteria
        filepath = self._generate_output_filename(
            start_date=criteria.start_date, end_date=criteria.end_date)

        # watch for oversight errors; notify if like report exists -
        # unless it's size zero (from a previous failed run)
        already_populated = (os.path.exists(filepath)
                             and os.path.getsize(filepath))
        if already_populated:
            logging.warning("Found requested report file already "
                            "exists - overwriting: '%s'" % filepath)

        self.output = open(filepath, 'w')
        self._output_filename = self.output.name

    @property
    def output_filename(self):
        """Name of the report file opened by _prepare_output_file().

        :raises RuntimeError: if the output file was never prepared

        """
        if hasattr(self, '_output_filename'):
            return self._output_filename
        raise RuntimeError("prerequisite call to "
                           "_prepare_output_file() "
                           "didn't happen!")

    def _header(self):
        """Return the pipe-delimited header line for the report.

        The four vitals column titles are appended when the criteria
        request vitals.
        """
        with_vitals = self.criteria.include_vitals
        titles = [pair[0] for pair in self.columns]
        if with_vitals:
            titles.extend(('Measured Temperature', 'O2 Saturation',
                           'Self-Reported Influenza Vaccine',
                           'Self-Reported H1N1 Vaccine'))
        return '|'.join(titles)

    def _build_join_tables(self):
        """Build every join table the report queries need.

        The visit table is always built; the vitals table is built
        only when the criteria request vitals.
        """
        # Always need the list of reportable visits
        self._build_visit_join_table()

        # The vitals table is built after the visit table.
        if self.criteria.include_vitals:
            self._build_vitals_join_table()

    def _build_visit_join_table(self):
        """Build and populate the reportable_pks temporary table.

        The table holds the fact_visit pks that belong in the report:
        every visit whose admit_datetime falls between the criteria
        start_date and the day after end_date and, when
        criteria.include_updates is set, any visit updated since the
        last recorded report of the same report_method.  Both
        selections are narrowed by region and, when requested, by
        patient_class.

        """
        # NOTE(review): an earlier revision presumably also selected
        # visit_id here to keep the vitals joins manageable (vitals
        # don't have a patient class, so they can't join on the same
        # values); only the pk is selected now - confirm before
        # porting vitals.

        sql = "CREATE TEMPORARY TABLE reportable_pks (pk "\
            "integer not null unique)"
        selectCols = "fact_visit.pk"

        self._getConn()
        self.access.raw_query(sql)

        # If we're only selecting those facilities in a region, the SQL
        # is more complicated - build up the respective clauses.
        # NOTE(review): criteria values are interpolated straight into
        # the SQL text throughout this method; verify they are trusted
        # or switch to bound parameters if raw_query supports them.
        joinClause = regionClause = ""
        if self.criteria.reportable_region:
            joinClause = "JOIN internal_reportable_region ON "\
                "internal_reportable_region.dim_facility_pk = "\
                "fact_visit.dim_facility_pk"
            regionClause = "AND region_name = '%s'" %\
                self.criteria.reportable_region

        # Another HACK!  One site is not even wanted by the state DOH,
        # as it's being duplicated from another source, and ESSENCE
        # can't help but count them twice.  Remove this one site
        # regardless
        # NOTE(review): the clause below keeps only rows whose
        # region_name equals IGNORE_SITE rather than excluding them;
        # presumably IGNORE_SITE names a region that already omits the
        # unwanted site - confirm, including the '' default.
        else:
            joinClause = "JOIN internal_reportable_region ON "\
                "internal_reportable_region.dim_facility_pk = "\
                "fact_visit.dim_facility_pk"
            regionClause = "AND region_name = '%s'" % self.IGNORE_SITE

        # Limit by patient_class if requested.  Note we may still end
        # up with visit ids that have changed patient classes, so more
        # pruning later is necessary.
        pc_limit = ""
        if self.criteria.patient_class:
            pc_limit = "AND patient_class = '%c'" %\
                self.criteria.patient_class

        # Start with all visits for the requested date range; the
        # upper bound is pushed one day past end_date.
        sql = "INSERT INTO reportable_pks SELECT %s FROM "\
              "fact_visit %s WHERE admit_datetime BETWEEN '%s' AND "\
              "'%s' %s %s" %\
              (selectCols, joinClause, self.criteria.start_date,
               self.criteria.end_date + timedelta(days=1),
               pc_limit, regionClause)

        self.access.raw_query(sql)

        if self.criteria.include_updates:
            # In this case, add all visits with updates since the
            # last run, but no newer than the requested date (in case
            # we're building reports forward from historical data)
            sql = "SELECT max(processed_datetime) FROM internal_report "\
                  "WHERE report_method = '%s'" % self.criteria.report_method

            cursor = self.access.raw_query(sql)
            last_report_generated = cursor.fetchall()[0][0]
            if last_report_generated is None:
                last_report_generated = '2009-01-01'  # our epoch
            logging.debug("including updates, last_report_generated: "\
                              "%s", last_report_generated)
            # The LEFT JOIN ... IS NULL pair skips pks that the first
            # INSERT already added.
            sql = "INSERT INTO reportable_pks SELECT %(sel_cols)s FROM "\
                  "fact_visit %(join_clause)s LEFT JOIN reportable_pks ON "\
                  "reportable_pks.pk = fact_visit.pk WHERE "\
                  "last_updated > '%(last_report)s' AND admit_datetime "\
                  "< '%(date)s' AND reportable_pks.pk IS NULL "\
                  "%(pc_limit)s %(region_clause)s" %\
                  {'sel_cols': selectCols,
                   'last_report': last_report_generated,
                   'date': self.criteria.end_date + timedelta(days=1),
                   'pc_limit': pc_limit,
                   'join_clause': joinClause,
                   'region_clause': regionClause}
            self.access.raw_query(sql)

        cursor = self.access.raw_query("SELECT COUNT(*) FROM "\
                                           "reportable_pks")
        logging.debug("%d visits to report on", cursor.fetchall()[0][0])

    def _build_vitals_join_table(self):
        """Build the visit_loinc_data temporary table for vitals.

        Not ported yet: the method raises ValueError on entry, so none
        of the SQL below currently runs.  It is kept as the reference
        for the port.

        As written, the statements would create visit_loinc_data and
        fill it with the observations (four fixed observation ids) for
        the visits listed in reportable_pks - see
        _build_visit_join_table() for how that table is built.

        :raises ValueError: always

        """
        raise ValueError('not ported yet')
        # Everything below is unreachable until the port is done.
        sql = """
          CREATE TEMPORARY TABLE visit_loinc_data (
            visit_id VARCHAR(255) not null,
            patient_class CHAR(1) default null,
            observation_id VARCHAR(255) not null,
            observation_result VARCHAR(255) not null)
          """
        self._getConn()
        self.access.raw_query(sql)

        # NOTE(review): this joins on reportable_pks.visit_id and
        # reportable_pks.patient_class, but _build_visit_join_table()
        # creates reportable_pks with a single pk column.
        sql = """
          INSERT INTO visit_loinc_data (visit_id, patient_class,
          observation_id, observation_result) SELECT visit.visit_id,
          visit.patient_class, observation_id,
          observation_result FROM visit JOIN hl7_visit ON
          visit.visit_id = hl7_visit.visit_id JOIN hl7_obx ON
          hl7_visit.hl7_msh_id = hl7_obx.hl7_msh_id JOIN
          reportable_pks ON reportable_pks.visit_id = visit.visit_id
          AND reportable_pks.patient_class = visit.patient_class
          WHERE
          observation_id in ('8310-5', '20564-1', '46077-4',
          '29544-4')
          """
        self.access.raw_query(sql)

    def _select_from_essence_view(self):
        """Return the SELECT statement that gathers the report rows.

        One ``e.<column>`` is selected for each entry in
        ``self.columns``, from essence joined to reportable_pks on
        the visit pk.

        """
        select_list = ','.join('e.' + pair[1] for pair in self.columns)
        return """SELECT %s FROM essence e JOIN reportable_pks ri
        ON e.visit_pk = ri.pk""" % select_list

    def _select_diagnosis(self):
        """Load the diagnoses for every reportable visit.

        Rows are gathered per visit into ``self._diags`` (keyed by
        visit pk), one SortedDiagnosis per visit, for later use by
        self._diagnosis.

        NOTE(review): the report wants unique diagnoses ordered by
        rank; the query itself orders by dx_datetime, so the rank
        ordering is presumably done by SortedDiagnosis - confirm.
        """
        # Most recent dx_datetime first, as the most recent should be
        # best.  Add any others as the persistence mechanism only
        # saves a unique icd9 dx that has changed status.
        stmt = ("SELECT fact_visit_pk, rank, icd9 "
                "FROM assoc_visit_dx "
                "JOIN dim_dx ON dim_dx_pk = dim_dx.pk "
                "JOIN reportable_pks "
                "ON assoc_visit_dx.fact_visit_pk = reportable_pks.pk "
                "ORDER BY dx_datetime DESC")
        diags = self._diags
        for row in self.access.raw_query(stmt).fetchall():
            visit_pk, rank, icd9 = row[0], row[1], row[2]
            if visit_pk not in diags:
                diags[visit_pk] = SortedDiagnosis(visit_pk, rank, icd9)
            else:
                diags[visit_pk].add(visit_pk, rank, icd9)

    def _diagnosis(self, visit_pk):
        """Return the diagnosis field for a visit as a one-item list.

        :param visit_pk: key of the visit in ``self._diags``

        returns a list holding the ``__repr__()`` of the visit's
        gathered diagnoses, or an empty string when there are none

        """
        known = visit_pk in self._diags
        field = self._diags[visit_pk].__repr__() if known else ''
        return [field]

    def _select_vitals(self):
        """Load the vitals for every reportable visit.

        Rows are gathered per visit into ``self._vitals`` (keyed by
        the first selected column), one Vitals per visit, for later
        use by self._vitals_for_visit.

        This is an effective NOP when self.criteria.include_vitals = False

        """
        if not self.criteria.include_vitals:
            return None

        vitals = self._vitals = {}
        stmt = """SELECT reportable_pks.visit_pk,
          observation_id, observation_result
          FROM visit_loinc_data JOIN reportable_pks ON
          reportable_pks.visit_id = visit_loinc_data.visit_id"""

        for row in self.access.raw_query(stmt).fetchall():
            visit_pk, observation_id, result = row[0], row[1], row[2]
            if visit_pk not in vitals:
                vitals[visit_pk] = Vitals(observation_id, result)
            else:
                vitals[visit_pk].add(observation_id, result)

    def _vitals_for_visit(self, visit_pk):
        """Return the vitals fields for the visit in question.

        The result is the ``__repr__()`` of the visit's gathered
        Vitals, or of an empty Vitals when none were gathered.

        This is an effective NOP when self.criteria.include_vitals = False
        (an empty list is returned).

        :param visit_pk: key of the visit in ``self._vitals``

        """
        if not self.criteria.include_vitals:
            return []

        known = visit_pk in self._vitals
        record = self._vitals[visit_pk] if known else Vitals()
        return record.__repr__()

    def _write_report(self, save_report=False):
        """ Write out and potentially store the results.

        Generate results via database queries and write the results to
        self.output: the header line, then one pipe-delimited line per
        row of the essence query.  self.output is closed before
        returning, and also when generation fails part way.

        :param save_report: If set, persist the document and related
          metadata to the mbds archive.

        returns the document ID, the mbds archive key, if saved;
        otherwise None

        """
        out = self.output
        dx_at = self.diagnosis_column_index
        # Close the file even when a query or write raises, so a
        # failed run does not leave the handle open.
        try:
            print >> out, self._header()
            self._build_join_tables()
            self._select_diagnosis()
            self._select_vitals()
            cursor = self.access.raw_query(self._select_from_essence_view())
            for row in cursor.fetchall():
                # Each row is the columns up to the diagnosis + the
                # diagnosis field + the rest of the columns and
                # finally the vitals if configured for such.  The
                # diagnosis slot of the row carries the visit pk.
                visit_pk = row[dx_at]
                leading = [strSansNone(column) for column in row[:dx_at]]
                diagnosis = self._diagnosis(visit_pk)
                trailing = [strSansNone(column)
                            for column in row[dx_at + 1:]]
                vitals = self._vitals_for_visit(visit_pk)
                print >> out, \
                    '|'.join(leading + diagnosis + trailing + vitals)
        finally:
            out.close()

        # Persist to the document archive if requested
        if save_report:
            # Only criteria that were actually set become metadata.
            metadata = {
                k: v
                for k, v in self.criteria._crit.items() if v is not None
            }

            # At this point, all documents are of 'essence' type
            return document_store(document=out.name,
                                  allow_duplicate_filename=True,
                                  document_type='essence',
                                  **metadata)

    def _record_report(self, report_oid):
        """Record the details from this report generation in the db.

        Adds a Report row carrying the current time, the document id
        and the report_method.  Nothing is recorded when
        ``report_oid`` is falsy.

        :param report_oid: document id returned when the report was
          saved; stored as the row's file_path

        """
        if not report_oid:
            return
        report = Report(processed_datetime=datetime.now(),
                        file_path=report_oid,
                        report_method=self.criteria.report_method)

        alchemy = AlchemyAccess(database=self.database)
        # Disconnect even when the add or commit fails.
        try:
            alchemy.session.add(report)
            alchemy.session.commit()
        finally:
            alchemy.disconnect()

    def _transmit_report(self, report):
        """Hand the report to the transport chosen by _set_transport().

        :param report: value passed unchanged to the transport's
          transfer_file()

        """
        logging.info("initiate upload of %s", report)
        self._transport.transfer_file(report)

    def _transmit_differences(self, report):
        """Send only what changed since yesterday's like report.

        Looks in the archive for a report with the same report_method
        whose date range is shifted back one day.  When none is found
        the full report is transmitted instead.  When one is found the
        difference step is needed, which has not been ported, so a
        ValueError is raised.

        :param report: the report to transmit

        :raises ValueError: for single day reports, or when a
          comparable report exists (RemoveDuplicates not ported)

        """
        criteria = self.criteria
        # This option really only makes sense on date range reports,
        # as updates hit older data than just 'yesterday'.
        if criteria.start_date == criteria.end_date:
            raise ValueError("difference calculation not supported on "
                             "single day reports")

        # See if we can find a similar report in the archive from
        # yesterday
        one_day = timedelta(days=1)
        old_doc = document_find(
            {'report_method': criteria.report_method,
             'start_date': criteria.start_date - one_day,
             'end_date': criteria.end_date - one_day},
            limit=1)
        if old_doc is None:
            logging.info("No comparable report found for difference "
                         "generation")
            self._transmit_report(report)
            return

        target_filename = self._generate_output_filename(
            start_date=criteria.start_date, end_date=criteria.end_date)
        # RemoveDuplicates not yet ported!!
        raise ValueError("RemoveDuplicates not ported")
        #from pheme.essence.remove_duplicates import RemoveDuplicates
        #rd = RemoveDuplicates(new_report=report,
        #                      old_report=old_doc,
        #                      out=target_filename)
        #rd.generate_report()
        #logging.info("initiate upload of difference %s", target_filename)
        #self._transport.transfer_file(target_filename)

    def _getConn(self):
        """Make sure ``self.access`` exists, connecting on first use.

        A DirectAccess is created from the instance's database, user
        and password only when the instance has no ``access``
        attribute yet; an existing one is left untouched.
        """
        if not hasattr(self, 'access'):
            self.access = DirectAccess(database=self.database,
                                       user=self.user,
                                       password=self.password)

    def _closeConn(self):
        """Close the database connection, if one is open.

        The ``access`` attribute is removed once closed, so that
        _getConn() (which only connects when the attribute is missing)
        opens a fresh connection instead of handing back a closed
        one.  Calling this with no connection is a no-op.
        """
        if hasattr(self, 'access'):
            try:
                self.access.close()
            finally:
                # Forget the handle even if close() fails.
                del self.access

    def tearDown(self):
        """Public interface to clean up internals.

        Closes the database connection via _closeConn().
        """
        self._closeConn()

    def execute(self,
                save_report=False,
                transmit_report=False,
                transmit_differences=False):
        """Execute the report generation.

        Writes the report, records it, optionally transmits it, then
        advances the date persistence object when one was supplied.
        The database connection is closed on the way out whether or
        not a step fails; the date is only advanced on success.

        :param save_report: persist the report to the archive
        :param transmit_report: upload the report
        :param transmit_differences: upload via the difference path,
          see _transmit_differences()

        """
        logging.info("Initiate ESSENCE report generation [%s-%s] for %s",
                     self.criteria.start_date, self.criteria.end_date,
                     self.criteria.report_method)

        self._getConn()
        try:
            # NOTE(review): report_oid is only set when save_report is
            # true (see _write_report), so transmitting without saving
            # hands None to the transport - confirm callers always
            # pair the flags.
            report_oid = self._write_report(save_report)
            self._record_report(report_oid)
            if transmit_report:
                self._transmit_report(report_oid)
            if transmit_differences:
                self._transmit_differences(report_oid)
        finally:
            # Don't leak the connection when a step above raises.
            self._closeConn()
        if hasattr(self, 'datePersistence'):
            self.datePersistence.bump_date()

        logging.info("Completed ESSENCE report generation [%s-%s] for %s",
                     self.criteria.start_date, self.criteria.end_date,
                     self.criteria.report_method)
class GenerateReport(object):
    """Process options and generate the requested report.

    The report is written to a local file and handed to the selected
    transport client for upload.

    """
    __version__ = '0.2'
    config = Config()
    # Region name read from [longitudinal] ignore_site ('' when not
    # configured); used by _build_visit_join_table() when no
    # reportable_region is requested.
    IGNORE_SITE = config.get('longitudinal', 'ignore_site', default='')

    # Order matters: each entry pairs (report column header, essence
    # column name).  NB, the Diagnosis column is repurposed.
    # Previously there was an SQL function to do the subselect, but it
    # ran way too slow.  The column now selects visit_pk, the key used
    # to look up the diagnoses for the respective visit.

    diagnosis_column_index = 7
    patient_class_column_index = 11
    columns = (('Hosp', 'hospital'),
               ('Reg Date', 'visit_date'),
               ('Time', 'visit_time'),
               ('Sex', 'gender'),
               ('Age', 'age'),
               ('Reason For Visit', 'chief_complaint'),
               ('Zip Code', 'zip'),
               ('Diagnosis', 'visit_pk'),
               ('Admit Status', 'gipse_disposition'),
               ('Medical Record No.', 'patient_id'),
               ('Visit Record No.', 'visit_id'),
               ('Service Area', 'patient_class'),)
    # Guard against the index constants drifting from the tuple above.
    assert(columns[diagnosis_column_index][1] == 'visit_pk')
    assert(columns[patient_class_column_index][1] == 'patient_class')

    def __init__(self,
                 user=None,
                 password=None,
                 report_criteria=None,
                 datefile=None):
        """Set up state for one report run.

        Stores the credentials and criteria, then opens the output
        file, trims the column list and selects the transport.

        :param user: database user
        :param password: database password
        :param report_criteria: ReportCriteria defining specifics; its
          ``database`` attribute names the database to use
        :param datefile: useful for persistent walks through time;
          when given, its date range must equal the criteria's
          start/end dates

        """
        self.user, self.password = user, password
        self.criteria = report_criteria
        self.database = report_criteria.database
        if datefile:
            requested_range = (report_criteria.start_date,
                               report_criteria.end_date)
            assert requested_range == datefile.get_date_range()
            self.datePersistence = datefile

        # Filled per visit by _select_diagnosis()
        self._diags = {}
        self._prepare_output_file()
        self._prepare_columns()
        self._set_transport()

    def _prepare_columns(self):
        """Drop the patient_class column when splitting by patient_class.

        When ``self.criteria.patient_class`` is set, ``self.columns``
        is rebound on the instance to a copy that omits the entry at
        ``patient_class_column_index``.
        """
        if not self.criteria.patient_class:
            return

        drop_at = self.patient_class_column_index
        count_before = len(self.columns)
        self.columns = self.columns[:drop_at] + self.columns[drop_at + 1:]
        # Exactly one column must have been removed.
        assert len(self.columns) + 1 == count_before

    def _set_transport(self):
        """Pick the transport client used to deliver this report.

        Reports limited to a reportable region use Distribute_client;
        all others use PHINMS_client.  Either is created with
        ``zip_first=True`` and stored in ``self._transport``.
        """
        transport_class = (Distribute_client
                           if self.criteria.reportable_region
                           else PHINMS_client)
        self._transport = transport_class(zip_first=True)

    def _generate_output_filename(self, start_date=None, end_date=None):
        """Build the path of the report file for a date range.

        The file name is ``<report_method>-<dates>.txt`` where
        ``<dates>`` is the end date as YYYYMMDD, or ``start-end`` when
        the two dates differ.  The directory is the [general] tmp_dir
        config value, '/tmp' when not configured.

        :param start_date: first day covered; defaults to the criteria
          start_date
        :param end_date: last day covered; defaults to the criteria
          end_date

        returns the path of the report file

        """
        if start_date is None:
            start_date = self.criteria.start_date
        if end_date is None:
            end_date = self.criteria.end_date

        day_format = '%Y%m%d'
        datestr = end_date.strftime(day_format)
        if start_date != end_date:
            datestr = start_date.strftime(day_format) + '-' + datestr

        filename = self.criteria.report_method + '-' + datestr + '.txt'
        tmp_dir = Config().get('general', 'tmp_dir', default='/tmp')
        return os.path.join(tmp_dir, filename)

    def _prepare_output_file(self):
        """Open the local report file for writing.

        Sets ``self.output`` to the open file and records its name in
        ``self._output_filename``.  A warning is logged when a
        non-empty file of the same name is about to be overwritten.
        """
        criteria = self.criteria
        filepath = self._generate_output_filename(
            start_date=criteria.start_date, end_date=criteria.end_date)

        # watch for oversight errors; notify if like report exists -
        # unless it's size zero (from a previous failed run)
        already_populated = (os.path.exists(filepath)
                             and os.path.getsize(filepath))
        if already_populated:
            logging.warning("Found requested report file already "
                            "exists - overwriting: '%s'" % filepath)

        self.output = open(filepath, 'w')
        self._output_filename = self.output.name

    @property
    def output_filename(self):
        """Name of the report file opened by _prepare_output_file().

        :raises RuntimeError: if the output file was never prepared

        """
        if hasattr(self, '_output_filename'):
            return self._output_filename
        raise RuntimeError("prerequisite call to "
                           "_prepare_output_file() "
                           "didn't happen!")

    def _header(self):
        """Return the pipe-delimited header line for the report.

        The four vitals column titles are appended when the criteria
        request vitals.
        """
        with_vitals = self.criteria.include_vitals
        titles = [pair[0] for pair in self.columns]
        if with_vitals:
            titles.extend(('Measured Temperature', 'O2 Saturation',
                           'Self-Reported Influenza Vaccine',
                           'Self-Reported H1N1 Vaccine'))
        return '|'.join(titles)

    def _build_join_tables(self):
        """Build every join table the report queries need.

        The visit table is always built; the vitals table is built
        only when the criteria request vitals.
        """
        # Always need the list of reportable visits
        self._build_visit_join_table()

        # The vitals table is built after the visit table.
        if self.criteria.include_vitals:
            self._build_vitals_join_table()

    def _build_visit_join_table(self):
        """Build and populate the reportable_pks temporary table.

        The table holds the fact_visit pks that belong in the report:
        every visit whose admit_datetime falls between the criteria
        start_date and the day after end_date and, when
        criteria.include_updates is set, any visit updated since the
        last recorded report of the same report_method.  Both
        selections are narrowed by region and, when requested, by
        patient_class.

        """
        # NOTE(review): an earlier revision presumably also selected
        # visit_id here to keep the vitals joins manageable (vitals
        # don't have a patient class, so they can't join on the same
        # values); only the pk is selected now - confirm before
        # porting vitals.

        sql = "CREATE TEMPORARY TABLE reportable_pks (pk "\
            "integer not null unique)"
        selectCols = "fact_visit.pk"

        self._getConn()
        self.access.raw_query(sql)

        # If we're only selecting those facilities in a region, the SQL
        # is more complicated - build up the respective clauses.
        # NOTE(review): criteria values are interpolated straight into
        # the SQL text throughout this method; verify they are trusted
        # or switch to bound parameters if raw_query supports them.
        joinClause = regionClause = ""
        if self.criteria.reportable_region:
            joinClause = "JOIN internal_reportable_region ON "\
                "internal_reportable_region.dim_facility_pk = "\
                "fact_visit.dim_facility_pk"
            regionClause = "AND region_name = '%s'" %\
                self.criteria.reportable_region

        # Another HACK!  One site is not even wanted by the state DOH,
        # as it's being duplicated from another source, and ESSENCE
        # can't help but count them twice.  Remove this one site
        # regardless
        # NOTE(review): the clause below keeps only rows whose
        # region_name equals IGNORE_SITE rather than excluding them;
        # presumably IGNORE_SITE names a region that already omits the
        # unwanted site - confirm, including the '' default.
        else:
            joinClause = "JOIN internal_reportable_region ON "\
                "internal_reportable_region.dim_facility_pk = "\
                "fact_visit.dim_facility_pk"
            regionClause = "AND region_name = '%s'" % self.IGNORE_SITE

        # Limit by patient_class if requested.  Note we may still end
        # up with visit ids that have changed patient classes, so more
        # pruning later is necessary.
        pc_limit = ""
        if self.criteria.patient_class:
            pc_limit = "AND patient_class = '%c'" %\
                self.criteria.patient_class

        # Start with all visits for the requested date range; the
        # upper bound is pushed one day past end_date.
        sql = "INSERT INTO reportable_pks SELECT %s FROM "\
              "fact_visit %s WHERE admit_datetime BETWEEN '%s' AND "\
              "'%s' %s %s" %\
              (selectCols, joinClause, self.criteria.start_date,
               self.criteria.end_date + timedelta(days=1),
               pc_limit, regionClause)

        self.access.raw_query(sql)

        if self.criteria.include_updates:
            # In this case, add all visits with updates since the
            # last run, but no newer than the requested date (in case
            # we're building reports forward from historical data)
            sql = "SELECT max(processed_datetime) FROM internal_report "\
                  "WHERE report_method = '%s'" % self.criteria.report_method

            cursor = self.access.raw_query(sql)
            last_report_generated = cursor.fetchall()[0][0]
            if last_report_generated is None:
                last_report_generated = '2009-01-01'  # our epoch
            logging.debug("including updates, last_report_generated: "\
                              "%s", last_report_generated)
            # The LEFT JOIN ... IS NULL pair skips pks that the first
            # INSERT already added.
            sql = "INSERT INTO reportable_pks SELECT %(sel_cols)s FROM "\
                  "fact_visit %(join_clause)s LEFT JOIN reportable_pks ON "\
                  "reportable_pks.pk = fact_visit.pk WHERE "\
                  "last_updated > '%(last_report)s' AND admit_datetime "\
                  "< '%(date)s' AND reportable_pks.pk IS NULL "\
                  "%(pc_limit)s %(region_clause)s" %\
                  {'sel_cols': selectCols,
                   'last_report': last_report_generated,
                   'date': self.criteria.end_date + timedelta(days=1),
                   'pc_limit': pc_limit,
                   'join_clause': joinClause,
                   'region_clause': regionClause}
            self.access.raw_query(sql)

        cursor = self.access.raw_query("SELECT COUNT(*) FROM "\
                                           "reportable_pks")
        logging.debug("%d visits to report on", cursor.fetchall()[0][0])

    def _build_vitals_join_table(self):
        """When report is to include vitals - we use an additional
        temporary table (visit_loinc_data) to hold the data for more
        timely queries.

        Like the rest of the report, the list of interesting visits is
        limited to the rows in the reportable_pks - see
        _build_join_table() for details.

        """
        raise ValueError('not ported yet')
        sql = """
          CREATE TEMPORARY TABLE visit_loinc_data (
            visit_id VARCHAR(255) not null,
            patient_class CHAR(1) default null,
            observation_id VARCHAR(255) not null,
            observation_result VARCHAR(255) not null)
          """
        self._getConn()
        self.access.raw_query(sql)

        sql = """
          INSERT INTO visit_loinc_data (visit_id, patient_class,
          observation_id, observation_result) SELECT visit.visit_id,
          visit.patient_class, observation_id,
          observation_result FROM visit JOIN hl7_visit ON
          visit.visit_id = hl7_visit.visit_id JOIN hl7_obx ON
          hl7_visit.hl7_msh_id = hl7_obx.hl7_msh_id JOIN
          reportable_pks ON reportable_pks.visit_id = visit.visit_id
          AND reportable_pks.patient_class = visit.patient_class
          WHERE
          observation_id in ('8310-5', '20564-1', '46077-4',
          '29544-4')
          """
        self.access.raw_query(sql)

    def _select_from_essence_view(self):
        """Build up the SQL select statement to be used in gathering
        the data for this report.

        """
        stmt = """SELECT %s FROM essence e JOIN reportable_pks ri
        ON e.visit_pk = ri.pk""" %\
            (','.join(['e.' + c[1] for c in self.columns]))
        return stmt

    def _select_diagnosis(self):
        """ Need to pull in all the diagnosis data for this report.
        This is saved in an instance dictionary for use in
        self._diagnosis to generate the list of diagnoses for each
        respective visit.

        A list of unique diagnoses ordered by rank is required.
        """
        # We order descending on dx_datetime as the most recent should
        # be best.  Add any others as the persistence mechanism only
        # saves a unique icd9 dx that has changed status.
        stmt = "SELECT fact_visit_pk, rank, icd9 "\
               "FROM assoc_visit_dx JOIN "\
               "dim_dx ON dim_dx_pk = dim_dx.pk JOIN "\
               "reportable_pks ON "\
               "assoc_visit_dx.fact_visit_pk = reportable_pks.pk "\
               "ORDER BY dx_datetime DESC"
        cursor = self.access.raw_query(stmt)
        for row in cursor.fetchall():
            visit_pk = row[0]
            if visit_pk in self._diags:
                self._diags[visit_pk].add(row[0], row[1], row[2])
            else:
                self._diags[visit_pk] = \
                    SortedDiagnosis(row[0], row[1], row[2])

    def _diagnosis(self, visit_pk):
        if visit_pk in self._diags:
            return [self._diags[visit_pk].__repr__(), ]
        else:
            return ['', ]

    def _select_vitals(self):
        """ Need to pull in all the vitals data for this report.
        This is saved in an instance dictionary for use in
        self._vitals_for_visit to generate the list of vitals for each
        respective visit.

        This is an effective NOP when self.criteria.include_vitals = False

        """
        if not self.criteria.include_vitals:
            return None

        self._vitals = {}
        stmt = """SELECT reportable_pks.visit_pk,
          observation_id, observation_result
          FROM visit_loinc_data JOIN reportable_pks ON
          reportable_pks.visit_id = visit_loinc_data.visit_id"""

        cursor = self.access.raw_query(stmt)
        for row in cursor.fetchall():
            visit_pk = row[0]
            if visit_pk in self._vitals:
                self._vitals[visit_pk].add(row[1], row[2])
            else:
                self._vitals[visit_pk] = \
                    Vitals(row[1], row[2])

    def _vitals_for_visit(self, visit_pk):
        """Returns the list of vitals for the visit in question.

        This is an effective NOP when self.criteria.include_vitals = False

        """
        if not self.criteria.include_vitals:
            return []

        if visit_pk in self._vitals:
            return self._vitals[visit_pk].__repr__()
        else:
            return Vitals().__repr__()

    def _write_report(self, save_report=False):
        """ Write out and potentially store the results.

        Generate results via database queries and write the results to
        self.output.

        :param save_report: If set, persist the document and related
          metadata to the mbds archive.

        returns the document ID, the mbds archive key, if saved

        """
        out = self.output
        print >> out, self._header()
        self._build_join_tables()
        self._select_diagnosis()
        self._select_vitals()
        cursor = self.access.raw_query(self._select_from_essence_view())
        for row in cursor.fetchall():
            # Each row is the colums up to the diagnosis + the
            # comma separated diagnosis + the rest of the columns
            # and finally with vitals if configured for such
            visit_pk = row[self.diagnosis_column_index]  # yuck, but true
            print >> out,\
                '|'.join([strSansNone(column) for column in
                          row[:self.diagnosis_column_index]] +
                         self._diagnosis(visit_pk) +
                         [strSansNone(column) for column in
                          row[self.diagnosis_column_index + 1:]] +
                         self._vitals_for_visit(visit_pk))

        # Close the file and persist to the document archive if
        # requested
        self.output.close()
        if save_report:
            metadata = {k: v for k, v in self.criteria._crit.items() if v
                        is not None}

            # At this point, all documents are of 'essence' type
            return document_store(document=self.output.name,
                                  allow_duplicate_filename=True,
                                  document_type='essence', **metadata)

    def _record_report(self, report_oid):
        """Record the details from this report generation in the db"""
        if not report_oid:
            return
        report = Report(processed_datetime=datetime.now(),
                        file_path=report_oid,
                        report_method=self.criteria.report_method)

        alchemy = AlchemyAccess(database=self.database)
        alchemy.session.add(report)
        alchemy.session.commit()
        alchemy.disconnect()

    def _transmit_report(self, report):
        """Transmit report using self._transport()"""
        logging.info("initiate upload of %s", report)
        self._transport.transfer_file(report)

    def _transmit_differences(self, report):
        """Compute differences from yesterday's like report; transport"""

        # This option really only makes sense on date range reports,
        # as updates hit older data than just 'yesterday'.
        if self.criteria.start_date == self.criteria.end_date:
            raise ValueError("difference calculation not supported on "\
                             "single day reports")
        # See if we can find a similar report in the archive from
        # yesterday
        search_criteria = {'report_method':
                           self.criteria.report_method,
                           'start_date': self.criteria.start_date -
                           timedelta(days=1), 'end_date':
                           self.criteria.end_date - timedelta(days=1)}
        old_doc = document_find(search_criteria, limit=1)
        if old_doc is None:
            logging.info("No comparable report found for difference "\
                         "generation")
            self._transmit_report(report)
        else:
            target_filename = self.\
                _generate_output_filename(start_date=self.criteria.start_date,
                                          end_date=self.criteria.end_date)
            # RemoveDuplicates not yet ported!!
            raise ValueError("RemoveDuplicates not ported")
            #from pheme.essence.remove_duplicates import RemoveDuplicates
            #rd = RemoveDuplicates(new_report=report,
            #                      old_report=old_doc,
            #                      out=target_filename)
            #rd.generate_report()
            #logging.info("initiate upload of difference %s", target_filename)
            #self._transport.transfer_file(target_filename)

    def _getConn(self):
        """ Local wrapper to get database connection
        """
        if hasattr(self, 'access'):
            return
        self.access = DirectAccess(database=self.database,
                                   user=self.user,
                                   password=self.password)

    def _closeConn(self):
        """ Local wrapper to close database connection
        """
        if hasattr(self, 'access'):
            self.access.close()

    def tearDown(self):
        """Public interface to clean up internals.

        Delegates to _closeConn().
        """
        self._closeConn()

    def execute(self, save_report=False, transmit_report=False,
                 transmit_differences=False):
        """Execute the report generation

        Opens the database connection, writes the report, records it
        and optionally transmits it.  The connection is closed again
        even when one of those steps raises.  On success the
        persisted date is bumped, if this instance has a
        datePersistence attribute.

        :param save_report: passed on to _write_report()
        :param transmit_report: if set, call _transmit_report() with
          the value returned by _write_report()
        :param transmit_differences: if set, call
          _transmit_differences() with that same value

        """
        logging.info("Initiate ESSENCE report generation [%s-%s] for %s",
                     self.criteria.start_date,
                     self.criteria.end_date,
                     self.criteria.report_method)

        self._getConn()
        try:
            report_oid = self._write_report(save_report)
            self._record_report(report_oid)
            if transmit_report:
                self._transmit_report(report_oid)
            if transmit_differences:
                self._transmit_differences(report_oid)
        finally:
            # Never leave the connection open behind a failed run
            self._closeConn()

        # Only reached on success, so a failed run is retried for the
        # same date.
        if hasattr(self, 'datePersistence'):
            self.datePersistence.bump_date()

        logging.info("Completed ESSENCE report generation [%s-%s] for %s",
                     self.criteria.start_date,
                     self.criteria.end_date,
                     self.criteria.report_method)
class LongitudinalManager(object):
    """ Abstraction to handle which db, user, etc. the deduplication
    process should be run on.  Handles runtime arguments and
    execution.

    Manages the process by farming out the individual visit
    deduplication to a number of worker processes (necessary to take
    advantage of multi-core processor and database as the limiting
    factor).

    """

    # The gating issue is the number of postgres connections that are
    # allowed to run concurrently.  Setting this to N-1 (where N is
    # the number of cores) has proven the fastest and most reliable.
    NUM_PROCS = 5

    def __init__(self, data_warehouse=None, data_mart=None,
                 reportDate=None, database_user=None,
                 database_password=None, verbosity=0):
        """Set up the manager's configuration and runtime helpers.

        :param data_warehouse: stored as self.data_warehouse
        :param data_mart: stored as self.data_mart
        :param reportDate: when truthy, run through parseDate() and
          stored as self.reportDate; otherwise self.reportDate is None
        :param database_user: stored as self.database_user
        :param database_password: stored as self.database_password
        :param verbosity: stored as self.verbosity

        Both database ports start out at the postgres default and
        skip_prep starts out False; processArgs() may override them.

        """
        postgres_default_port = 5432

        # Connection details
        self.data_warehouse = data_warehouse
        self.data_mart = data_mart
        self.warehouse_port = postgres_default_port
        self.mart_port = postgres_default_port
        self.database_user = database_user
        self.database_password = database_password

        # Run configuration
        parsed = parseDate(reportDate) if reportDate else None
        self.reportDate = parsed or None
        self.verbosity = verbosity
        self.skip_prep = False
        self.dir = os.path.split(__file__)[0]
        self.datefile = "/tmp/longitudinal_datefile"

        # Runtime helpers
        self.queue = JoinableQueue()
        self.datePersistence = Datefile(initial_date=self.reportDate)
        self.lock = FileLock(LOCKFILE)

    def __call__(self):
        return self.execute()

    def processArgs(self):
        """Process any optional arguments and positional parameters

        parse_args() is called without arguments, so the command line
        is what gets parsed.  Exactly two positional parameters are
        required: the data warehouse followed by the data mart.  The
        parsed values overwrite the matching attributes, a new
        Datefile is created from the date/countdown options and
        self.reportDate is set from it.

        """
        parser = OptionParser(usage=usage)

        # (flags, settings) pairs, registered in this order
        option_table = (
            (("-c", "--countdown"),
             dict(dest="countdown", default=None,
                  help="count {down,up} date using date string in "
                       "%s - set to 'forwards' or 'backwards' "
                       "if desired" % self.datefile)),
            (("-d", "--date"),
             dict(dest="date", default=None,
                  help="single admission date to dedup "
                       "(by default, checks the entire database)")),
            (("-s", "--skip-prep"),
             dict(dest="skip_prep", default=False, action="store_true",
                  help="skip the expense of looking for new messages")),
            (("-v", "--verbose"),
             dict(dest="verbosity", action="count",
                  default=self.verbosity,
                  help="increase output verbosity")),
            (("-m", "--mart-port"),
             dict(dest="mart_port", default=self.mart_port, type="int",
                  help="alternate port number for data mart")),
            (("-w", "--warehouse-port"),
             dict(dest="warehouse_port", default=self.warehouse_port,
                  type="int",
                  help="alternate port number for data warehouse")),
        )
        for flags, settings in option_table:
            parser.add_option(*flags, **settings)

        positional = parser.parse_args()[1]
        if len(positional) != 2:
            parser.error("incorrect number of arguments")

        chosen = parser.values
        self.data_warehouse = positional[0]
        self.data_mart = positional[1]
        self.warehouse_port = chosen.warehouse_port
        self.mart_port = chosen.mart_port
        self.verbosity = chosen.verbosity
        self.skip_prep = chosen.skip_prep

        requested_date = chosen.date
        initial_date = (parseDate(requested_date)
                        if requested_date else None) or None
        self.datePersistence = Datefile(initial_date=initial_date,
                                        persistence_file=self.datefile,
                                        direction=chosen.countdown)

        self.reportDate = self.datePersistence.get_date()

    def _prepDeduplicateTables(self):
        """ Add any missing rows to the MessageProcessed table

        This is the bridge between the data warehouse and the data
        mart.  In an effort to make the data mart independent of the
        warehouse, the processed message data is kept in the mart.  As
        we're dealing with two distinct databases, there's no
        referential integrity available at the database level, so care
        should be taken.

        """
        startTime = time.time()
        logging.info("Starting INSERT INTO internal_message_processed " "at %s", startTime)

        # We can take advantage of an "add only" data_warehouse,
        # knowing the hl7_msh_id is a sequence moving in the positive
        # direction.  Simply add any values greater than the previous
        # max.

        stmt = "SELECT max(hl7_msh_id) from internal_message_processed"
        max_id = self.data_mart_access.engine.execute(stmt).first()[0]
        if not max_id:
            max_id = 0

        new_msgs = list()
        stmt = (
            """SELECT hl7_msh_id, message_datetime, visit_id
        FROM hl7_msh JOIN hl7_visit USING (hl7_msh_id) WHERE
        hl7_msh_id > %d """
            % max_id
        )
        rs = self.data_warehouse_access.engine.execute(stmt)
        many = 500
        while True:
            results = rs.fetchmany(many)
            if not results:
                break
            for r in results:
                new_msgs.append(MessageProcessed(hl7_msh_id=r[0], message_datetime=r[1], visit_id=r[2]))

            self.data_mart_access.session.add_all(new_msgs)
            self.data_mart_access.session.commit()
            logging.debug("added %d new messages" % len(new_msgs))
            new_msgs = list()

        logging.info("Added new rows to internal_message_processed in %s", time.time() - startTime)

    def _visitsToProcess(self):
        """ Look up all distinct visit ids needing attention

        Obtain unique list of visit_ids that have messages that
        haven't previously been processed.  If the user requested just
        one days worth (i.e. -d) only that days visits will be
        returned.

        """
        visit_ids = list()
        if not self.reportDate:
            logging.info("Launch deduplication for entire database")
            # Do the whole batch, that is, all that haven't been
            # processed before.
            stmt = """SELECT DISTINCT(visit_id) FROM
            internal_message_processed
            WHERE processed_datetime IS NULL"""
            rs = self.data_mart_access.engine.execute(stmt)
            many = 10000
            while True:
                results = rs.fetchmany(many)
                if not results:
                    break
                for r in results:
                    visit_ids.append(r[0])

        else:
            logging.info("Launch deduplication for %s", self.reportDate)
            # Process the requested day only - as we can't join across
            # db boundaries - first acquire the full list of visits
            # for the requested day from the data_warehouse to use in
            # a massive 'in' clause

            stmt = """SELECT DISTINCT(visit_id) FROM hl7_visit WHERE
            admit_datetime BETWEEN '%s' AND '%s';""" % (
                self.reportDate,
                self.reportDate + timedelta(days=1),
            )
            self.access.raw_query(stmt)
            rs = self.data_warehouse_access.engine.execute(stmt)
            many = 1000
            potential_visit_ids = list()
            while True:
                results = rs.fetchmany(many)
                if not results:
                    break
                for r in results:
                    # tmp_table.insert(r[0])
                    potential_visit_ids.append(r[0])

            if potential_visit_ids:
                query = (
                    self.data_mart_access.session.query(MessageProcessed.visit_id)
                    .distinct()
                    .filter(
                        and_(
                            MessageProcessed.processed_datetime == None,
                            MessageProcessed.visit_id.in_(potential_visit_ids),
                        )
                    )
                )

                for r in query:
                    visit_ids.append(r[0])

        logging.info("Found %d visits needing attention", len(visit_ids))
        return visit_ids

    def tearDown(self):
        """ Clean up any open handles/connections """
        # now done in execute when we're done with teh connections

    def execute(self):
        """ Start the process

        Runs one deduplication pass: configure logging, take the
        manager lock, bring the mart's processed-message table up to
        date (unless skip_prep is set), collect the visits needing
        attention, then farm those visits out to NUM_PROCS worker
        processes through self.queue and wait for the queue to drain.
        On success the persisted date is bumped.

        Returns None.  Returns early, without doing any work, when
        the lock is already held.

        """
        # Initialize logging now (verbosity is now set regardless of
        # invocation method)
        configure_logging(verbosity=self.verbosity,
                          logfile="longitudinal-manager.log")

        logging.info("Initiate deduplication for %s",
                     self.reportDate or "whole database")
        # Only allow one instance of the manager to run at a time.
        if self.lock.is_locked():
            logging.warning("Can't continue, %s is locked ", LOCKFILE)
            return

        if systemUnderLoad():
            logging.warning("system under load - continue anyhow")

        # Acquire outside the try block, so a failed acquire never
        # reaches the release() in the finally clause.
        self.lock.acquire()
        try:
            self.access = DirectAccess(
                database=self.data_warehouse,
                port=self.warehouse_port,
                user=self.database_user,
                password=self.database_password,
            )
            self.data_warehouse_access = AlchemyAccess(
                database=self.data_warehouse,
                port=self.warehouse_port,
                user=self.database_user,
                password=self.database_password,
            )
            self.data_mart_access = AlchemyAccess(
                database=self.data_mart,
                port=self.mart_port,
                user=self.database_user,
                password=self.database_password,
            )

            startTime = time.time()
            try:
                if not self.skip_prep:
                    self._prepDeduplicateTables()
                visits_to_process = self._visitsToProcess()
            finally:
                # Now done with db access needs at the manager level
                # free up resources (also when the lookups failed):
                self.data_mart_access.disconnect()
                self.data_warehouse_access.disconnect()
                self.access.close()

            # Set of locks used, one for each table needing protection
            # from asynchronous inserts.  Names should match table
            # minus 'dim_' prefix, plus '_lock' suffix
            # i.e. dim_location -> 'location_lock'
            lock_names = (
                "admission_source_lock",
                "admission_o2sat_lock",
                "admission_temp_lock",
                "assigned_location_lock",
                "admit_reason_lock",
                "chief_complaint_lock",
                "diagnosis_lock",
                "disposition_lock",
                "flu_vaccine_lock",
                "h1n1_vaccine_lock",
                "lab_flag_lock",
                "lab_result_lock",
                "location_lock",
                "note_lock",
                "order_number_lock",
                "performing_lab_lock",
                "pregnancy_lock",
                "race_lock",
                "reference_range_lock",
                "service_area_lock",
                "specimen_source_lock",
            )
            table_locks = {name: Lock() for name in lock_names}

            # If we have visits to process, fire up the workers...
            # (any visit counts; a lone pending visit used to be
            # skipped by a `> 1` test)
            if visits_to_process:
                for i in range(self.NUM_PROCS):
                    dw = Process(
                        target=LongitudinalWorker,
                        kwargs={
                            "queue": self.queue,
                            "procNumber": i,
                            "data_warehouse": self.data_warehouse,
                            "warehouse_port": self.warehouse_port,
                            "data_mart": self.data_mart,
                            "mart_port": self.mart_port,
                            "dbUser": self.database_user,
                            "dbPass": self.database_password,
                            "table_locks": table_locks,
                            "verbosity": self.verbosity,
                        },
                    )
                    dw.daemon = True
                    dw.start()

                # Populate the queue
                for v in visits_to_process:
                    self.queue.put(v)

                # Wait on the queue until empty
                self.queue.join()

            # Common cleanup
            self.tearDown()
            self.datePersistence.bump_date()
            logging.info("Queue is empty - done in %s",
                         time.time() - startTime)
        finally:
            self.lock.release()
示例#9
0
class LongitudinalManager(object):
    """ Abstraction to handle which db, user, etc. the deduplication
    process should be run on.  Handles runtime arguments and
    execution.

    Manages the process by farming out the individual visit
    deduplication to a number of worker processes (necessary to take
    advantage of multi-core processor and database as the limiting
    factor).

    An instance is callable; calling it is the same as calling
    `execute`.

    """
    # The gating issue is the number of postgres connections that are
    # allowed to run concurrently.  Setting this to N-1 (where N is
    # the number of cores) has proven the fastest and most reliable.
    NUM_PROCS = 5

    # Set of locks used, one for each table needing protection
    # from asynchronous inserts.  Names should match table
    # minus 'dim_' prefix, plus '_lock' suffix
    # i.e. dim_location -> 'location_lock'
    _TABLE_LOCK_NAMES = (
        'admission_source_lock',
        'admission_o2sat_lock',
        'admission_temp_lock',
        'assigned_location_lock',
        'admit_reason_lock',
        'chief_complaint_lock',
        'diagnosis_lock',
        'disposition_lock',
        'flu_vaccine_lock',
        'h1n1_vaccine_lock',
        'lab_flag_lock',
        'lab_result_lock',
        'location_lock',
        'note_lock',
        'order_number_lock',
        'performing_lab_lock',
        'pregnancy_lock',
        'race_lock',
        'reference_range_lock',
        'service_area_lock',
        'specimen_source_lock',
    )

    def __init__(self, data_warehouse=None, data_mart=None,
                 reportDate=None, database_user=None,
                 database_password=None, verbosity=0):
        """ Record the run configuration

        :param data_warehouse: name of the data warehouse database
        :param data_mart: name of the data mart database
        :param reportDate: optional date string; when given it is
          run through `parseDate`, otherwise no date is set
        :param database_user: user name for both databases
        :param database_password: password for both databases
        :param verbosity: passed to `configure_logging` and on to
          every worker

        Both ports start at the postgres default (5432); `processArgs`
        may replace them.

        """
        self.data_warehouse = data_warehouse
        self.warehouse_port = 5432  # postgres default
        self.data_mart = data_mart
        self.mart_port = 5432  # postgres default
        self.reportDate = parseDate(reportDate) if reportDate else None
        self.database_user = database_user
        self.database_password = database_password
        self.dir = os.path.dirname(__file__)
        self.verbosity = verbosity
        self.queue = JoinableQueue()
        self.datefile = "/tmp/longitudinal_datefile"
        self.datePersistence = Datefile(initial_date=self.reportDate)
        self.lock = FileLock(LOCKFILE)
        self.skip_prep = False

    def __call__(self):
        """ Calling the instance runs `execute` """
        return self.execute()

    def processArgs(self):
        """ Process any optional arguments and positional parameters

        Exactly two positional arguments are required: the data
        warehouse name followed by the data mart name; anything else
        is reported through `parser.error`.  The parsed values
        replace the matching attributes set in `__init__`, and
        `datePersistence` is rebuilt around `self.datefile` so that
        `reportDate` comes from `datePersistence.get_date()`.

        """
        parser = OptionParser(usage=usage)
        parser.add_option("-c", "--countdown", dest="countdown",
                          default=None,
                          help="count {down,up} date using date string in "\
                              "%s - set to 'forwards' or 'backwards' "\
                              "if desired" % self.datefile)
        parser.add_option("-d", "--date", dest="date", default=None,
                          help="single admission date to dedup "\
                          "(by default, checks the entire database)")
        parser.add_option("-s", "--skip-prep", dest="skip_prep",
                          default=False, action="store_true",
                          help="skip the expense of looking for new "\
                          "messages")
        parser.add_option("-v", "--verbose", dest="verbosity",
                          action="count", default=self.verbosity,
                          help="increase output verbosity")
        parser.add_option("-m", "--mart-port", dest="mart_port",
                          default=self.mart_port, type="int",
                          help="alternate port number for data mart")
        parser.add_option("-w", "--warehouse-port", dest="warehouse_port",
                          default=self.warehouse_port, type="int",
                          help="alternate port number for data warehouse")

        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.error("incorrect number of arguments")

        self.data_warehouse, self.data_mart = args
        self.warehouse_port = options.warehouse_port
        self.mart_port = options.mart_port
        self.verbosity = options.verbosity
        self.skip_prep = options.skip_prep
        initial_date = parseDate(options.date) if options.date else None
        self.datePersistence = Datefile(initial_date=initial_date,
                                        persistence_file=self.datefile,
                                        direction=options.countdown)

        self.reportDate = self.datePersistence.get_date()

    @staticmethod
    def _iterFirstColumn(rs, many):
        """ Yield the first column of every row in `rs`

        Rows are pulled `many` at a time through `rs.fetchmany` until
        it returns an empty batch.

        """
        while True:
            results = rs.fetchmany(many)
            if not results:
                break
            for r in results:
                yield r[0]

    def _prepDeduplicateTables(self):
        """ Add any missing rows to the MessageProcessed table

        This is the bridge between the data warehouse and the data
        mart.  In an effort to make the data mart independent of the
        warehouse, the processed message data is kept in the mart.  As
        we're dealing with two distinct databases, there's no
        referential integrity available at the database level, so care
        should be taken.

        Expects `data_mart_access` and `data_warehouse_access` to be
        connected (see `execute`).

        """
        startTime = time.time()
        logging.info("Starting INSERT INTO internal_message_processed "
                     "at %s", startTime)

        # We can take advantage of an "add only" data_warehouse,
        # knowing the hl7_msh_id is a sequence moving in the positive
        # direction.  Simply add any values greater than the previous
        # max.

        stmt = "SELECT max(hl7_msh_id) from internal_message_processed"
        # Nothing stored yet (max() comes back empty) - start from zero.
        max_id = self.data_mart_access.engine.execute(stmt).first()[0] or 0

        stmt = """SELECT hl7_msh_id, message_datetime, visit_id
        FROM hl7_msh JOIN hl7_visit USING (hl7_msh_id) WHERE
        hl7_msh_id > %d """ % max_id
        rs = self.data_warehouse_access.engine.execute(stmt)
        many = 500
        while True:
            results = rs.fetchmany(many)
            if not results:
                break
            # Commit one fetched batch at a time rather than holding
            # every new row in memory.
            new_msgs = [MessageProcessed(hl7_msh_id=r[0],
                                         message_datetime=r[1],
                                         visit_id=r[2])
                        for r in results]
            self.data_mart_access.session.add_all(new_msgs)
            self.data_mart_access.session.commit()
            logging.debug("added %d new messages", len(new_msgs))

        logging.info("Added new rows to internal_message_processed in %s",
                     time.time() - startTime)

    def _visitsToProcess(self):
        """ Look up all distinct visit ids needing attention

        Obtain unique list of visit_ids that have messages that
        haven't previously been processed.  If the user requested just
        one days worth (i.e. -d) only that days visits will be
        returned.

        :returns: list of visit ids (possibly empty)

        """
        visit_ids = list()
        if not self.reportDate:
            logging.info("Launch deduplication for entire database")
            # Do the whole batch, that is, all that haven't been
            # processed before.
            stmt = """SELECT DISTINCT(visit_id) FROM
            internal_message_processed
            WHERE processed_datetime IS NULL"""
            rs = self.data_mart_access.engine.execute(stmt)
            visit_ids.extend(self._iterFirstColumn(rs, 10000))

        else:
            logging.info("Launch deduplication for %s",
                         self.reportDate)
            # Process the requested day only - as we can't join across
            # db boundaries - first acquire the full list of visits
            # for the requested day from the data_warehouse to use in
            # a massive 'in' clause

            stmt = """SELECT DISTINCT(visit_id) FROM hl7_visit WHERE
            admit_datetime BETWEEN '%s' AND '%s';""" %\
            (self.reportDate, self.reportDate + timedelta(days=1))
            # The statement is executed once only; an earlier revision
            # also ran it through self.access and threw the result
            # away.
            rs = self.data_warehouse_access.engine.execute(stmt)
            potential_visit_ids = list(self._iterFirstColumn(rs, 1000))

            if potential_visit_ids:
                # '== None' is deliberate: it is the comparison the
                # query layer turns into the filter expression.
                unprocessed = and_(
                    MessageProcessed.processed_datetime == None,
                    MessageProcessed.visit_id.in_(potential_visit_ids))
                query = self.data_mart_access.session.query(
                    MessageProcessed.visit_id).distinct().\
                    filter(unprocessed)
                visit_ids.extend(r[0] for r in query)

        logging.info("Found %d visits needing attention",
                     len(visit_ids))
        return visit_ids

    def tearDown(self):
        """ Clean up any open handles/connections """
        # now done in execute when we're done with the connections

    def _collectVisits(self):
        """ Open the manager level connections, gather the visit ids
        to process and release the connections again

        The connections are released in the reverse order of their
        creation even when the prep or lookup step raises, so a
        failed run does not leave them open.

        :returns: list of visit ids from `_visitsToProcess`

        """
        # self.access stays a public attribute for backward
        # compatibility; no method of this class queries through it.
        self.access = DirectAccess(database=self.data_warehouse,
                                   port=self.warehouse_port,
                                   user=self.database_user,
                                   password=self.database_password)
        try:
            self.data_warehouse_access = AlchemyAccess(
                database=self.data_warehouse,
                port=self.warehouse_port,
                user=self.database_user, password=self.database_password)
            try:
                self.data_mart_access = AlchemyAccess(
                    database=self.data_mart, port=self.mart_port,
                    user=self.database_user,
                    password=self.database_password)
                try:
                    if not self.skip_prep:
                        self._prepDeduplicateTables()
                    return self._visitsToProcess()
                finally:
                    self.data_mart_access.disconnect()
            finally:
                self.data_warehouse_access.disconnect()
        finally:
            self.access.close()

    def _runWorkers(self, visits_to_process):
        """ Start the worker processes, feed them every visit id and
        block until the queue reports all of them done
        """
        table_locks = dict((name, Lock())
                           for name in self._TABLE_LOCK_NAMES)

        for i in range(self.NUM_PROCS):
            dw = Process(target=LongitudinalWorker,
                         kwargs={'queue': self.queue,
                                 'procNumber': i,
                                 'data_warehouse': self.data_warehouse,
                                 'warehouse_port': self.warehouse_port,
                                 'data_mart': self.data_mart,
                                 'mart_port': self.mart_port,
                                 'dbUser': self.database_user,
                                 'dbPass': self.database_password,
                                 'table_locks': table_locks,
                                 'verbosity': self.verbosity})
            dw.daemon = True
            dw.start()

        # Populate the queue
        for v in visits_to_process:
            self.queue.put(v)

        # Wait on the queue until empty
        self.queue.join()

    def execute(self):
        """ Start the process

        Returns without doing anything (beyond a warning in the log)
        when the lock file is already held.  Otherwise gathers the
        visits needing attention, hands them to the workers, bumps the
        persisted date and releases the lock.

        """
        # Initialize logging now (verbosity is now set regardless of
        # invocation method)
        configure_logging(verbosity=self.verbosity,
                          logfile="longitudinal-manager.log")

        logging.info("Initiate deduplication for %s",
                     self.reportDate or "whole database")
        # Only allow one instance of the manager to run at a time.
        if self.lock.is_locked():
            logging.warning("Can't continue, %s is locked ", LOCKFILE)
            return

        if systemUnderLoad():
            logging.warning("system under load - continue anyhow")

        # Acquire before entering the try block: if the acquire itself
        # fails there is nothing of ours to release.
        self.lock.acquire()
        try:
            startTime = time.time()
            visits_to_process = self._collectVisits()

            # If we have visits to process, fire up the workers...
            # (a single pending visit counts too)
            if visits_to_process:
                self._runWorkers(visits_to_process)

            # Common cleanup
            self.tearDown()
            self.datePersistence.bump_date()
            logging.info("Queue is empty - done in %s",
                         time.time() - startTime)
        finally:
            self.lock.release()
示例#10
0
    def execute(self):
        """ Start the process

        Returns without doing anything (beyond a warning in the log)
        when the lock file is already held.  Otherwise gathers the
        visits needing attention, hands them to the workers, bumps the
        persisted date and releases the lock.

        """
        # Initialize logging now (verbosity is now set regardless of
        # invocation method)
        configure_logging(verbosity=self.verbosity,
                          logfile="longitudinal-manager.log")

        logging.info("Initiate deduplication for %s",
                     self.reportDate or "whole database")
        # Only allow one instance of the manager to run at a time.
        if self.lock.is_locked():
            logging.warning("Can't continue, %s is locked ", LOCKFILE)
            return

        if systemUnderLoad():
            logging.warning("system under load - continue anyhow")

        # Acquire before entering the try block: if the acquire itself
        # fails there is nothing of ours to release.
        self.lock.acquire()
        try:
            startTime = time.time()
            visits_to_process = self._collectVisits()

            # If we have visits to process, fire up the workers...
            # (a single pending visit counts too)
            if visits_to_process:
                self._runWorkers(visits_to_process)

            # Common cleanup
            self.tearDown()
            self.datePersistence.bump_date()
            logging.info("Queue is empty - done in %s",
                         time.time() - startTime)
        finally:
            self.lock.release()

    def _collectVisits(self):
        """ Open the manager level connections, gather the visit ids
        to process and release the connections again

        The connections are released in the reverse order of their
        creation even when the prep or lookup step raises, so a
        failed run does not leave them open.

        :returns: whatever `_visitsToProcess` returns

        """
        self.access = DirectAccess(database=self.data_warehouse,
                                   port=self.warehouse_port,
                                   user=self.database_user,
                                   password=self.database_password)
        try:
            self.data_warehouse_access = AlchemyAccess(
                database=self.data_warehouse,
                port=self.warehouse_port,
                user=self.database_user, password=self.database_password)
            try:
                self.data_mart_access = AlchemyAccess(
                    database=self.data_mart, port=self.mart_port,
                    user=self.database_user,
                    password=self.database_password)
                try:
                    if not self.skip_prep:
                        self._prepDeduplicateTables()
                    return self._visitsToProcess()
                finally:
                    self.data_mart_access.disconnect()
            finally:
                self.data_warehouse_access.disconnect()
        finally:
            self.access.close()

    def _runWorkers(self, visits_to_process):
        """ Start the worker processes, feed them every visit id and
        block until the queue reports all of them done
        """
        # Set of locks used, one for each table needing protection
        # from asynchronous inserts.  Names should match table
        # minus 'dim_' prefix, plus '_lock' suffix
        # i.e. dim_location -> 'location_lock'
        table_locks = {'admission_source_lock': Lock(),
                       'admission_o2sat_lock': Lock(),
                       'admission_temp_lock': Lock(),
                       'assigned_location_lock': Lock(),
                       'admit_reason_lock': Lock(),
                       'chief_complaint_lock': Lock(),
                       'diagnosis_lock': Lock(),
                       'disposition_lock': Lock(),
                       'flu_vaccine_lock': Lock(),
                       'h1n1_vaccine_lock': Lock(),
                       'lab_flag_lock': Lock(),
                       'lab_result_lock': Lock(),
                       'location_lock': Lock(),
                       'note_lock': Lock(),
                       'order_number_lock': Lock(),
                       'performing_lab_lock': Lock(),
                       'pregnancy_lock': Lock(),
                       'race_lock': Lock(),
                       'reference_range_lock': Lock(),
                       'service_area_lock': Lock(),
                       'specimen_source_lock': Lock(),
                       }

        for i in range(self.NUM_PROCS):
            dw = Process(target=LongitudinalWorker,
                         kwargs={'queue': self.queue,
                                 'procNumber': i,
                                 'data_warehouse': self.data_warehouse,
                                 'warehouse_port': self.warehouse_port,
                                 'data_mart': self.data_mart,
                                 'mart_port': self.mart_port,
                                 'dbUser': self.database_user,
                                 'dbPass': self.database_password,
                                 'table_locks': table_locks,
                                 'verbosity': self.verbosity})
            dw.daemon = True
            dw.start()

        # Populate the queue
        for v in visits_to_process:
            self.queue.put(v)

        # Wait on the queue until empty
        self.queue.join()