예제 #1
0
 def testRangeStepSansDirection(self):
     """A step given without a direction still produces a stepped range.

     With step=30 and an initial date of 2009-10-31 the range is
     expected to be the 30 days ending on (and including) that date.
     """
     initial = datetime.strptime('2009-10-31', '%Y-%m-%d')
     df = Datefile(initial,
                   persistence_file=self.persistence_file,
                   step=30)
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-10-02')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-10-31')
예제 #2
0
 def testRangeSansFile(self):
     """Range requested without a persistence file (and without a step).

     Both ends of the range are expected to equal the initial date.
     """
     initial = datetime.strptime('2009-10-31', '%Y-%m-%d')
     df = Datefile(initial)
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-10-31')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-10-31')
예제 #3
0
 def testRangeSansStep(self):
     """Range requested with a direction but no step.

     The range is expected to collapse to the single initial date.
     """
     initial = datetime.strptime('2009-10-31', '%Y-%m-%d')
     df = Datefile(initial,
                   persistence_file=self.persistence_file,
                   direction='forwards')
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-10-31')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-10-31')
예제 #4
0
 def testStep(self):
     """Bumping a backwards Datefile with step=10 moves it ten days back."""
     initial = datetime.strptime('2009-01-01', '%Y-%m-%d')
     df = Datefile(initial,
                   persistence_file=self.persistence_file,
                   direction='backwards', step=10)
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-01-01')
     df.bump_date()
     self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2008-12-22')
예제 #5
0
 def testStepDatesFwd(self):
     """Stepping forward, the end date should be step-1 days after start."""
     initial = datetime.strptime('2009-02-01', '%Y-%m-%d')
     df = Datefile(initial,
                   persistence_file=self.persistence_file,
                   direction='forwards',
                   step=10)
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-02-01')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-02-10')
예제 #6
0
 def testForwards(self):
     """Bumping a forwards Datefile (no step) advances it by one day."""
     initial = datetime.strptime('2009-01-01', '%Y-%m-%d')
     df = Datefile(initial,
                   persistence_file=self.persistence_file,
                   direction='forwards')
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-01-01')
     df.bump_date()
     self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-01-02')
예제 #7
0
 def testStepDatesBack(self):
     """Stepping backward, the start date should be step-1 days before end.

     The initial date (2009-01-31) is expected to be the end of the
     range, with the start nine days earlier.
     """
     initial = datetime.strptime('2009-01-31', '%Y-%m-%d')
     df = Datefile(initial,
                   persistence_file=self.persistence_file,
                   direction='backwards',
                   step=10)
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-01-22')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-01-31')
예제 #8
0
 def testRangeSansStep(self):
     """Range requested with a direction but no step.

     Start and end are both expected to be the initial date.
     """
     expected = '2009-10-31'
     df = Datefile(datetime.strptime(expected, '%Y-%m-%d'),
                   persistence_file=self.persistence_file,
                   direction='forwards')
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), expected)
     self.assertEqual(end.strftime('%Y-%m-%d'), expected)
예제 #9
0
 def testStepDatesFwd(self):
     """Stepping forward, the end date should be step-1 days after start."""
     df = Datefile(datetime.strptime('2009-02-01', '%Y-%m-%d'),
                   persistence_file=self.persistence_file,
                   direction='forwards', step=10)
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-02-01')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-02-10')
예제 #10
0
 def testRangeStepSansDirection(self):
     """Give a step and no direction.

     The expected range is the 30 days up to and including the
     initial date of 2009-10-31.
     """
     df = Datefile(datetime.strptime('2009-10-31', '%Y-%m-%d'),
                   persistence_file=self.persistence_file,
                   step=30)
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-10-02')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-10-31')
예제 #11
0
 def testStepDatesBack(self):
     """Stepping backward, the start date should be step-1 days before end."""
     df = Datefile(datetime.strptime('2009-01-31', '%Y-%m-%d'),
                   persistence_file=self.persistence_file,
                   direction='backwards', step=10)
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-01-22')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-01-31')
예제 #12
0
    def testInitialDate(self):
        """A date already in the persistence file wins over initial_date."""
        # Write a date to the file, then construct with a different
        # initial date.  Should get the date from the file.
        with open(self.persistence_file, 'w') as p_file:
            p_file.write('2009-06-06')

        df = Datefile(datetime.strptime('2009-01-01', '%Y-%m-%d'),
                      persistence_file=self.persistence_file,
                      direction='backwards')
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-06-06')
        df.bump_date()
        self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-06-05')
예제 #13
0
 def __init__(self, data_warehouse=None, data_mart=None,
              reportDate=None, database_user=None,
              database_password=None, verbosity=0):
     """Record connection settings and create the run-time helpers.

     :param data_warehouse: name of the data warehouse database
     :param data_mart: name of the data mart database
     :param reportDate: optional date, run through parseDate when
       given; stored as None otherwise
     :param database_user: database user name
     :param database_password: database password
     :param verbosity: logging verbosity level

     """
     # Connection targets - both ports start at the postgres default.
     self.data_warehouse, self.warehouse_port = data_warehouse, 5432
     self.data_mart, self.mart_port = data_mart, 5432

     # A falsy reportDate (or a falsy parse result) is stored as None.
     if reportDate:
         parsed_date = parseDate(reportDate) or None
     else:
         parsed_date = None
     self.reportDate = parsed_date

     self.database_user = database_user
     self.database_password = database_password
     # Directory holding this source file.
     self.dir = os.path.split(__file__)[0]
     self.verbosity = verbosity
     self.queue = JoinableQueue()
     self.datefile = "/tmp/longitudinal_datefile"
     self.datePersistence = Datefile(initial_date=self.reportDate)
     self.lock = FileLock(LOCKFILE)
     self.skip_prep = False
예제 #14
0
 def testNoDirection(self):
     """Without a direction, bump_date leaves the date unchanged."""
     df = Datefile(datetime.now())
     # assertTrue / assertEqual: the assert_ and assertEquals aliases
     # were removed in Python 3.12.
     self.assertTrue(df.get_date(), "Even w/o direction, should get "
                     "initial date back")
     datebefore = df.get_date()
     df.bump_date()
     dateafter = df.get_date()
     self.assertEqual(datebefore, dateafter)
예제 #15
0
    def processArgs(self):
        """Parse the command line and configure this instance from it.

        Expects exactly two positional arguments (data warehouse and
        data mart names); anything else ends in parser.error.  The
        optional flags set the ports, verbosity, skip_prep and the
        Datefile used to persist the report date.

        """
        parser = OptionParser(usage=usage)

        # (flags, keyword settings) in the order they appear in --help.
        option_table = (
            (("-c", "--countdown"),
             dict(dest="countdown", default=None,
                  help=("count {down,up} date using date string in "
                        "%s - set to 'forwards' or 'backwards' "
                        "if desired" % self.datefile))),
            (("-d", "--date"),
             dict(dest="date", default=None,
                  help=("single admission date to dedup "
                        "(by default, checks the entire database)"))),
            (("-s", "--skip-prep"),
             dict(dest="skip_prep", default=False, action="store_true",
                  help=("skip the expense of looking for new "
                        "messages"))),
            (("-v", "--verbose"),
             dict(dest="verbosity", action="count",
                  default=self.verbosity,
                  help="increase output verbosity")),
            (("-m", "--mart-port"),
             dict(dest="mart_port", default=self.mart_port, type="int",
                  help="alternate port number for data mart")),
            (("-w", "--warehouse-port"),
             dict(dest="warehouse_port", default=self.warehouse_port,
                  type="int",
                  help="alternate port number for data warehouse")),
        )
        for flags, settings in option_table:
            parser.add_option(*flags, **settings)

        # parse_args hands back the same values object it stores on
        # the parser, so `options` stands in for `parser.values`.
        options, args = parser.parse_args()
        if len(args) != 2:
            parser.error("incorrect number of arguments")

        self.data_warehouse, self.data_mart = args[0], args[1]
        self.warehouse_port = options.warehouse_port
        self.mart_port = options.mart_port
        self.verbosity = options.verbosity
        self.skip_prep = options.skip_prep

        # No (or empty) --date means no initial date.
        if options.date:
            initial_date = parseDate(options.date) or None
        else:
            initial_date = None

        self.datePersistence = Datefile(initial_date=initial_date,
                                        persistence_file=self.datefile,
                                        direction=options.countdown)

        self.reportDate = self.datePersistence.get_date()
예제 #16
0
 def testStepDatesBackThrice(self):
     """Stepping backward three times must not duplicate any day.

     Each bump is expected to yield the ten days immediately before
     the previous range.
     """
     df = Datefile(datetime.strptime('2011-04-30', '%Y-%m-%d'),
                   persistence_file=self.persistence_file,
                   direction='backwards',
                   step=10)
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     start, end = df.get_date_range()
     self.assertEqual(start.strftime('%Y-%m-%d'), '2011-04-21')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2011-04-30')
     df.bump_date()
     start, end = df.get_date_range()
     self.assertEqual(start.strftime('%Y-%m-%d'), '2011-04-11')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2011-04-20')
     df.bump_date()
     start, end = df.get_date_range()
     self.assertEqual(start.strftime('%Y-%m-%d'), '2011-04-01')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2011-04-10')
예제 #17
0
 def testBackwards(self):
     """Bumping a backwards Datefile (no step) moves it back one day."""
     df = Datefile(datetime.strptime('2009-01-01', '%Y-%m-%d'),
                   persistence_file=self.persistence_file,
                   direction='backwards')
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-01-01')
     df.bump_date()
     self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2008-12-31')
예제 #18
0
 def testNoDirection(self):
     """A Datefile built without a direction does not move when bumped."""
     df = Datefile(datetime.now())
     # assertTrue / assertEqual: the assert_ and assertEquals aliases
     # were removed in Python 3.12.
     self.assertTrue(df.get_date(), "Even w/o direction, should get "
                     "initial date back")
     datebefore = df.get_date()
     df.bump_date()
     dateafter = df.get_date()
     self.assertEqual(datebefore, dateafter)
예제 #19
0
    def testInitialDate(self):
        """The persisted date is used in preference to initial_date."""
        # Write a date to the file, then construct with a different
        # initial date.  Should get the date from the file.
        with open(self.persistence_file, 'w') as p_file:
            p_file.write('2009-06-06')

        df = Datefile(datetime.strptime('2009-01-01', '%Y-%m-%d'),
                      persistence_file=self.persistence_file,
                      direction='backwards')
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-06-06')
        df.bump_date()
        self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-06-05')
예제 #20
0
 def testStepDatesBackThrice(self):
     """Stepping backward three times; ranges must not overlap."""
     df = Datefile(datetime.strptime('2011-04-30', '%Y-%m-%d'),
                   persistence_file=self.persistence_file,
                   direction='backwards', step=10)
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     start, end = df.get_date_range()
     self.assertEqual(start.strftime('%Y-%m-%d'), '2011-04-21')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2011-04-30')

     df.bump_date()
     start, end = df.get_date_range()
     self.assertEqual(start.strftime('%Y-%m-%d'), '2011-04-11')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2011-04-20')

     df.bump_date()
     start, end = df.get_date_range()
     self.assertEqual(start.strftime('%Y-%m-%d'), '2011-04-01')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2011-04-10')
 def __init__(
     self,
     data_warehouse=None,
     data_mart=None,
     reportDate=None,
     database_user=None,
     database_password=None,
     verbosity=0,
 ):
     """Store the connection settings and set up the run-time helpers.

     reportDate, when truthy, is passed through parseDate; otherwise
     None is stored.  Both database ports start at the postgres
     default and skip_prep starts out False.

     """
     postgres_default_port = 5432
     self.data_warehouse = data_warehouse
     self.warehouse_port = postgres_default_port
     self.data_mart = data_mart
     self.mart_port = postgres_default_port

     # Falsy input (or a falsy parse result) is stored as None.
     parsed_date = None
     if reportDate:
         parsed_date = parseDate(reportDate) or None
     self.reportDate = parsed_date

     self.database_user = database_user
     self.database_password = database_password
     # Directory holding this source file.
     self.dir = os.path.split(__file__)[0]
     self.verbosity = verbosity
     self.queue = JoinableQueue()
     self.datefile = "/tmp/longitudinal_datefile"
     self.datePersistence = Datefile(initial_date=self.reportDate)
     self.lock = FileLock(LOCKFILE)
     self.skip_prep = False
class LongitudinalManager(object):
    """ Abstraction to handle which db, user, etc. the deduplication
    process should be run on.  Handles runtime arguments and
    execution.

    Manages the process by farming out the individual visit
    deduplication to a number of worker processes (necessary to take
    advantage of multi-core processors, with the database as the
    limiting factor).

    """

    # The gating issue is the number of postgres connections that are
    # allowed to run concurrently.  Setting this to N-1 (where N is
    # the number of cores) has proven the fastest and most reliable.
    NUM_PROCS = 5

    # Set of locks used, one for each table needing protection from
    # asynchronous inserts.  Names should match table minus 'dim_'
    # prefix, plus '_lock' suffix, i.e. dim_location -> 'location_lock'
    TABLE_LOCK_NAMES = (
        "admission_source_lock",
        "admission_o2sat_lock",
        "admission_temp_lock",
        "assigned_location_lock",
        "admit_reason_lock",
        "chief_complaint_lock",
        "diagnosis_lock",
        "disposition_lock",
        "flu_vaccine_lock",
        "h1n1_vaccine_lock",
        "lab_flag_lock",
        "lab_result_lock",
        "location_lock",
        "note_lock",
        "order_number_lock",
        "performing_lab_lock",
        "pregnancy_lock",
        "race_lock",
        "reference_range_lock",
        "service_area_lock",
        "specimen_source_lock",
    )

    def __init__(
        self,
        data_warehouse=None,
        data_mart=None,
        reportDate=None,
        database_user=None,
        database_password=None,
        verbosity=0,
    ):
        """ Store connection settings and create the run-time helpers

        :param data_warehouse: name of the data warehouse database
        :param data_mart: name of the data mart database
        :param reportDate: optional date restricting the run; passed
          through parseDate when given
        :param database_user: database user name
        :param database_password: database password
        :param verbosity: logging verbosity level

        """
        self.data_warehouse = data_warehouse
        self.warehouse_port = 5432  # postgres default
        self.data_mart = data_mart
        self.mart_port = 5432  # postgres default
        self.reportDate = parseDate(reportDate) if reportDate else None
        self.database_user = database_user
        self.database_password = database_password
        self.dir = os.path.dirname(__file__)
        self.verbosity = verbosity
        self.queue = JoinableQueue()
        self.datefile = "/tmp/longitudinal_datefile"
        self.datePersistence = Datefile(initial_date=self.reportDate)
        self.lock = FileLock(LOCKFILE)
        self.skip_prep = False

    def __call__(self):
        """ Calling the instance is shorthand for execute() """
        return self.execute()

    @staticmethod
    def _batches(result_set, size):
        """ Yield non-empty lists of up to `size` rows from result_set

        Repeatedly calls result_set.fetchmany(size) and stops at the
        first empty batch.

        """
        while True:
            rows = result_set.fetchmany(size)
            if not rows:
                return
            yield rows

    def processArgs(self):
        """ Process any optional arguments and positional parameters

        Exactly two positional arguments are required: the data
        warehouse and data mart names.  Any other count ends in
        parser.error.

        """
        parser = OptionParser(usage=usage)
        parser.add_option(
            "-c",
            "--countdown",
            dest="countdown",
            default=None,
            help="count {down,up} date using date string in "
            "%s - set to 'forwards' or 'backwards' "
            "if desired" % self.datefile,
        )
        parser.add_option(
            "-d",
            "--date",
            dest="date",
            default=None,
            help="single admission date to dedup (by default, checks the entire database)",
        )
        parser.add_option(
            "-s",
            "--skip-prep",
            dest="skip_prep",
            default=False,
            action="store_true",
            help="skip the expense of looking for new messages",
        )
        parser.add_option(
            "-v",
            "--verbose",
            dest="verbosity",
            action="count",
            default=self.verbosity,
            help="increase output verbosity",
        )
        parser.add_option(
            "-m",
            "--mart-port",
            dest="mart_port",
            default=self.mart_port,
            type="int",
            help="alternate port number for data mart",
        )
        parser.add_option(
            "-w",
            "--warehouse-port",
            dest="warehouse_port",
            default=self.warehouse_port,
            type="int",
            help="alternate port number for data warehouse",
        )

        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.error("incorrect number of arguments")

        self.data_warehouse = args[0]
        self.data_mart = args[1]
        self.warehouse_port = options.warehouse_port
        self.mart_port = options.mart_port
        self.verbosity = options.verbosity
        self.skip_prep = options.skip_prep
        initial_date = parseDate(options.date) if options.date else None
        self.datePersistence = Datefile(
            initial_date=initial_date, persistence_file=self.datefile, direction=options.countdown
        )

        self.reportDate = self.datePersistence.get_date()

    def _prepDeduplicateTables(self):
        """ Add any missing rows to the MessageProcessed table

        This is the bridge between the data warehouse and the data
        mart.  In an effort to make the data mart independent of the
        warehouse, the processed message data is kept in the mart.  As
        we're dealing with two distinct databases, there's no
        referential integrity available at the database level, so care
        should be taken.

        """
        startTime = time.time()
        logging.info("Starting INSERT INTO internal_message_processed at %s", startTime)

        # We can take advantage of an "add only" data_warehouse,
        # knowing the hl7_msh_id is a sequence moving in the positive
        # direction.  Simply add any values greater than the previous
        # max.

        stmt = "SELECT max(hl7_msh_id) from internal_message_processed"
        # An empty table yields NULL/None - start from zero then.
        max_id = self.data_mart_access.engine.execute(stmt).first()[0] or 0

        stmt = (
            """SELECT hl7_msh_id, message_datetime, visit_id
        FROM hl7_msh JOIN hl7_visit USING (hl7_msh_id) WHERE
        hl7_msh_id > %d """
            % max_id
        )
        rs = self.data_warehouse_access.engine.execute(stmt)
        # Add and commit one fetched batch at a time.
        for rows in self._batches(rs, 500):
            new_msgs = [
                MessageProcessed(hl7_msh_id=r[0], message_datetime=r[1], visit_id=r[2])
                for r in rows
            ]
            self.data_mart_access.session.add_all(new_msgs)
            self.data_mart_access.session.commit()
            logging.debug("added %d new messages", len(new_msgs))

        logging.info("Added new rows to internal_message_processed in %s", time.time() - startTime)

    def _visitsToProcess(self):
        """ Look up all distinct visit ids needing attention

        Obtain unique list of visit_ids that have messages that
        haven't previously been processed.  If the user requested just
        one day's worth (i.e. -d) only that day's visits will be
        returned.

        :returns: list of visit ids (possibly empty)

        """
        visit_ids = list()
        if not self.reportDate:
            logging.info("Launch deduplication for entire database")
            # Do the whole batch, that is, all that haven't been
            # processed before.
            stmt = """SELECT DISTINCT(visit_id) FROM
            internal_message_processed
            WHERE processed_datetime IS NULL"""
            rs = self.data_mart_access.engine.execute(stmt)
            for rows in self._batches(rs, 10000):
                visit_ids.extend(r[0] for r in rows)

        else:
            logging.info("Launch deduplication for %s", self.reportDate)
            # Process the requested day only - as we can't join across
            # db boundaries - first acquire the full list of visits
            # for the requested day from the data_warehouse to use in
            # a massive 'in' clause

            stmt = """SELECT DISTINCT(visit_id) FROM hl7_visit WHERE
            admit_datetime BETWEEN '%s' AND '%s';""" % (
                self.reportDate,
                self.reportDate + timedelta(days=1),
            )
            # NOTE(review): the same statement is executed again just
            # below and this call's result is unused - looks like a
            # leftover; confirm before removing.
            self.access.raw_query(stmt)
            rs = self.data_warehouse_access.engine.execute(stmt)
            potential_visit_ids = list()
            for rows in self._batches(rs, 1000):
                potential_visit_ids.extend(r[0] for r in rows)

            if potential_visit_ids:
                # `== None` is deliberate: this is a column expression
                # handed to the query filter, not a Python identity
                # test (presumably rendered as IS NULL - do not
                # "fix" to `is None`).
                query = (
                    self.data_mart_access.session.query(MessageProcessed.visit_id)
                    .distinct()
                    .filter(
                        and_(
                            MessageProcessed.processed_datetime == None,  # noqa: E711
                            MessageProcessed.visit_id.in_(potential_visit_ids),
                        )
                    )
                )
                visit_ids.extend(r[0] for r in query)

        logging.info("Found %d visits needing attention", len(visit_ids))
        return visit_ids

    def tearDown(self):
        """ Clean up any open handles/connections """
        # now done in execute when we're done with the connections

    def execute(self):
        """ Start the process

        Refuses to run (returning None) when the lock file is already
        held.  Otherwise gathers the visits needing attention, hands
        them to NUM_PROCS worker processes via the queue, waits for the
        queue to drain and bumps the persisted date.

        """
        # Initialize logging now (verbosity is now set regardless of
        # invocation method)
        configure_logging(verbosity=self.verbosity, logfile="longitudinal-manager.log")

        logging.info("Initiate deduplication for %s", self.reportDate or "whole database")
        # Only allow one instance of the manager to run at a time.
        if self.lock.is_locked():
            logging.warning("Can't continue, %s is locked ", LOCKFILE)
            return

        if systemUnderLoad():
            logging.warning("system under load - continue anyhow")

        # Acquire outside the try block: if acquisition itself fails we
        # must not attempt to release a lock we never held.
        self.lock.acquire()
        try:
            self.access = DirectAccess(
                database=self.data_warehouse,
                port=self.warehouse_port,
                user=self.database_user,
                password=self.database_password,
            )
            self.data_warehouse_access = AlchemyAccess(
                database=self.data_warehouse,
                port=self.warehouse_port,
                user=self.database_user,
                password=self.database_password,
            )
            self.data_mart_access = AlchemyAccess(
                database=self.data_mart, port=self.mart_port, user=self.database_user, password=self.database_password
            )

            startTime = time.time()
            if not self.skip_prep:
                self._prepDeduplicateTables()
            visits_to_process = self._visitsToProcess()

            # Now done with db access needs at the manager level
            # free up resources:
            self.data_mart_access.disconnect()
            self.data_warehouse_access.disconnect()
            self.access.close()

            table_locks = dict((name, Lock()) for name in self.TABLE_LOCK_NAMES)

            # If we have visits to process, fire up the workers...
            # (any non-empty list counts - a single pending visit must
            # be processed too)
            if visits_to_process:
                for i in range(self.NUM_PROCS):
                    dw = Process(
                        target=LongitudinalWorker,
                        kwargs={
                            "queue": self.queue,
                            "procNumber": i,
                            "data_warehouse": self.data_warehouse,
                            "warehouse_port": self.warehouse_port,
                            "data_mart": self.data_mart,
                            "mart_port": self.mart_port,
                            "dbUser": self.database_user,
                            "dbPass": self.database_password,
                            "table_locks": table_locks,
                            "verbosity": self.verbosity,
                        },
                    )
                    dw.daemon = True
                    dw.start()

                # Populate the queue
                for v in visits_to_process:
                    self.queue.put(v)

                # Wait on the queue until empty
                self.queue.join()

            # Common cleanup
            self.tearDown()
            self.datePersistence.bump_date()
            logging.info("Queue is empty - done in %s", time.time() - startTime)
        finally:
            self.lock.release()
예제 #23
0
 def testRangeSansFile(self):
     """Range requested without a persistence file.

     With neither file nor step, start and end are both expected to
     be the initial date.
     """
     df = Datefile(datetime.strptime('2009-10-31', '%Y-%m-%d'))
     start, end = df.get_date_range()
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(start.strftime('%Y-%m-%d'), '2009-10-31')
     self.assertEqual(end.strftime('%Y-%m-%d'), '2009-10-31')
예제 #24
0
 def testDefaultAccess(self):
     """get_date returns the initial date when nothing else is given."""
     df = Datefile(initial_date=datetime.strptime('2009-01-01', '%Y-%m-%d'))
     # assertEqual: the assertEquals alias was removed in Python 3.12.
     self.assertEqual(df.get_date().strftime('%Y-%m-%d'), '2009-01-01')
    def process_args(self):
        """Process any optional arguments and positional parameters

        Using the values provided, assemble ReportCriteria and
        Datefile instances to control report generation.

        Exactly two positional arguments are required: the database
        to query and the report date.  Invalid combinations of the
        save/upload options end in parser.error.

        """
        parser = OptionParser(usage=usage)
        # Provide the ReportCriteria instance an error callback so any
        # command line errors provoke the standard graceful exit with
        # warning text.
        self.criteria.error_callback = parser.error

        parser.add_option("-u", "--user", dest="user",
                          default=self.user, help="database user")
        parser.add_option("-p", "--password", dest="password",
                          default=self.password,
                          help="database password, or file containing "
                               "just the password")
        parser.add_option("-c", "--countdown", dest="countdown",
                          default=None,
                          help="count {down,up} the start and end dates "
                               "set to 'forwards' or 'backwards' "
                               "if desired")
        parser.add_option("-i", "--include-updates",
                          action='store_true', dest="includeUpdates",
                          default=False,
                          help="include "
                               "visits updated since last similar report")
        parser.add_option("--include-vitals",
                          action='store_true', dest="includeVitals",
                          default=False,
                          help="include "
                               "vitals (measured temperature, O2 "
                               "saturation, influenza and H1N1 vaccine "
                               "data) as additional columns in the "
                               "report")
        parser.add_option("-k", "--patient-class",
                          dest="patient_class",
                          default=None,
                          help="use "
                               "to filter report on a specific patient "
                               "class [E,I,O]")
        parser.add_option("-r", "--region", dest="region",
                          default=None,
                          help="reportable region defining limited set "
                               "of facilities to include, by default "
                               "all  facilities are included")
        parser.add_option("-s", "--save-and-upload",
                          action='store_true', dest="save_upload",
                          default=False,
                          help="save file and upload to "
                               "DOH")
        parser.add_option("-x", "--save-without-upload",
                          action='store_true', dest="save_only",
                          default=False, help="save file but don't upload")
        parser.add_option("-d", "--upload-diff",
                          action='store_true', dest="upload_diff",
                          default=False,
                          help="upload differences only "
                               "(from yesterdays like report) to DOH")
        parser.add_option("-t", "--thirty-days",
                          action='store_true', dest="thirty_days",
                          default=False,
                          help="include 30 days up to "
                               "requested date ")
        parser.add_option("-v", "--verbose", dest="verbosity",
                          action="count", default=self.verbosity,
                          help="increase output verbosity")

        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.error("incorrect number of arguments")

        # Database to query
        self.criteria.database = args[0]
        self.user = options.user
        self.password = options.password
        self.criteria.credentials(user=self.user,
                                  password=self.password)

        # Potential region restriction
        self.criteria.reportable_region = options.region

        # Potential patient class restriction
        self.criteria.patient_class = options.patient_class

        # Potential to include vitals (not tied to gipse format)
        self.criteria.include_vitals = options.includeVitals

        # Potential inclusion of updates
        self.criteria.include_updates = options.includeUpdates

        # Report date(s) and potential step direction.
        # NB - several options affect report_method and must be set
        # first!

        initial_date = parseDate(args[1])
        config = Config()
        # The persistence file is named after the report method and
        # lives in the configured tmp_dir.
        ps_file = os.path.join(config.get('general', 'tmp_dir',
                                          default='/tmp'),
                               self.criteria.report_method)
        step = 30 if options.thirty_days else None
        direction = options.countdown
        self.datefile = Datefile(initial_date=initial_date,
                                 persistence_file=ps_file,
                                 direction=direction,
                                 step=step)
        self.criteria.start_date, self.criteria.end_date = (
            self.datefile.get_date_range())

        # What to do once report is completed.  Complicated, protect
        # user from themselves!
        self.save_report = (options.save_upload or
                            options.save_only or options.upload_diff)
        self.transmit_report = options.save_upload
        self.transmit_differences = options.upload_diff

        if options.save_only and options.save_upload:
            parser.error("save-without-upload and save-and-upload "
                         "are mutually exclusive")
        if options.save_only and options.upload_diff:
            parser.error("save-without-upload and upload-diff "
                         "are mutually exclusive")
        if options.upload_diff and options.save_upload:
            # The separating space was missing here, producing
            # "save-and-uploadare mutually exclusive".
            parser.error("upload-diff and save-and-upload "
                         "are mutually exclusive")

        # Can't transmit w/o saving
        if options.save_upload or options.upload_diff:
            assert(self.save_report)
        # Sanity check
        if options.save_only:
            assert(self.save_report and not self.transmit_report and
                   not self.transmit_differences)

        # How verbosely to log
        self.verbosity = options.verbosity
    def processArgs(self):
        """Parse the command line and configure this instance from it.

        Exactly two positional arguments are required: the data
        warehouse followed by the data mart.  The optional flags set
        the database ports, the verbosity, whether the prep step is
        skipped, and the date handling (a single date and/or a
        countdown direction handed to the Datefile).

        """
        parser = OptionParser(usage=usage)

        # (flags, settings) pairs, listed in the order they should show
        # up in the generated help text.
        option_table = [
            (
                ("-c", "--countdown"),
                {
                    "dest": "countdown",
                    "default": None,
                    "help": (
                        "count {down,up} date using date string in "
                        "%s - set to 'forwards' or 'backwards' "
                        "if desired" % self.datefile
                    ),
                },
            ),
            (
                ("-d", "--date"),
                {
                    "dest": "date",
                    "default": None,
                    "help": (
                        "single admission date to dedup "
                        "(by default, checks the entire database)"
                    ),
                },
            ),
            (
                ("-s", "--skip-prep"),
                {
                    "dest": "skip_prep",
                    "default": False,
                    "action": "store_true",
                    "help": "skip the expense of looking for new messages",
                },
            ),
            (
                ("-v", "--verbose"),
                {
                    "dest": "verbosity",
                    "action": "count",
                    "default": self.verbosity,
                    "help": "increase output verbosity",
                },
            ),
            (
                ("-m", "--mart-port"),
                {
                    "dest": "mart_port",
                    "default": self.mart_port,
                    "type": "int",
                    "help": "alternate port number for data mart",
                },
            ),
            (
                ("-w", "--warehouse-port"),
                {
                    "dest": "warehouse_port",
                    "default": self.warehouse_port,
                    "type": "int",
                    "help": "alternate port number for data warehouse",
                },
            ),
        ]
        for flags, settings in option_table:
            parser.add_option(*flags, **settings)

        options, positional = parser.parse_args()
        if len(positional) != 2:
            parser.error("incorrect number of arguments")

        self.data_warehouse, self.data_mart = positional
        self.warehouse_port = options.warehouse_port
        self.mart_port = options.mart_port
        self.verbosity = options.verbosity
        self.skip_prep = options.skip_prep

        # Only parse a date when one was actually supplied.
        initial_date = None
        if options.date:
            initial_date = parseDate(options.date) or None

        self.datePersistence = Datefile(
            initial_date=initial_date,
            persistence_file=self.datefile,
            direction=options.countdown,
        )
        self.reportDate = self.datePersistence.get_date()
예제 #27
0
 def testDefaultAccess(self):
     """With only an initial date given, get_date returns that date."""
     initial = datetime.strptime('2009-01-01', '%Y-%m-%d')
     df = Datefile(initial_date=initial)
     # assertEqual rather than the deprecated assertEquals alias, which
     # no longer exists in recent unittest releases.
     self.assertEqual(df.get_date().strftime('%Y-%m-%d'),
                      '2009-01-01')
예제 #28
0
    def process_args(self):
        """Process optional arguments and positional parameters

        Using the values provided, assemble ReportCriteria and
        Datefile instances to control report generation.

        Exactly two positional arguments are expected: the database
        to query and the initial report date.  Command line problems
        (wrong argument count, mutually exclusive save/upload flags)
        are reported through ``parser.error``.

        """
        parser = OptionParser(usage=usage)
        # Provide the ReportCriteria instance an error callback so any
        # command line errors provoke the standard graceful exit with
        # warning text.
        self.criteria.error_callback = parser.error

        parser.add_option("-u", "--user", dest="user",
                          default=self.user,
                          help="database user")
        parser.add_option("-p", "--password", dest="password",
                          default=self.password,
                          help="database password, or file containing "
                               "just the password")
        parser.add_option("-c", "--countdown", dest="countdown",
                          default=None,
                          help="count {down,up} the start and end dates "
                               "set to 'forwards' or 'backwards' "
                               "if desired")
        parser.add_option("-i", "--include-updates",
                          action='store_true', dest="includeUpdates",
                          default=False,
                          help="include "
                               "visits updated since last similar report")
        parser.add_option("--include-vitals",
                          action='store_true', dest="includeVitals",
                          default=False,
                          help="include "
                               "vitals (measured temperature, O2 "
                               "saturation, influenza and H1N1 vaccine "
                               "data) as additional columns in the "
                               "report")
        parser.add_option("-k", "--patient-class",
                          dest="patient_class",
                          default=None,
                          help="use "
                               "to filter report on a specific patient "
                               "class [E,I,O]")
        parser.add_option("-r", "--region", dest="region",
                          default=None,
                          help="reportable region defining limited set "
                               "of facilities to include, by default "
                               "all  facilities are included")
        parser.add_option("-s", "--save-and-upload",
                          action='store_true', dest="save_upload",
                          default=False,
                          help="save file and upload to "
                               "DOH")
        parser.add_option("-x", "--save-without-upload",
                          action='store_true', dest="save_only",
                          default=False,
                          help="save file but don't upload")
        parser.add_option("-d", "--upload-diff",
                          action='store_true', dest="upload_diff",
                          default=False,
                          help="upload differences only "
                               "(from yesterdays like report) to DOH")
        parser.add_option("-t", "--thirty-days",
                          action='store_true', dest="thirty_days",
                          default=False,
                          help="include 30 days up to "
                               "requested date ")
        parser.add_option("-v", "--verbose", dest="verbosity",
                          action="count", default=self.verbosity,
                          help="increase output verbosity")

        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.error("incorrect number of arguments")

        # Database to query
        self.criteria.database = args[0]
        self.user = options.user
        self.password = options.password
        self.criteria.credentials(user=self.user, password=self.password)

        # Potential region restriction
        self.criteria.reportable_region = options.region

        # Potential patient class restriction
        self.criteria.patient_class = options.patient_class

        # Potential to include vitals (not tied to gipse format)
        self.criteria.include_vitals = options.includeVitals

        # Potential inclusion of updates
        self.criteria.include_updates = options.includeUpdates

        # Report date(s) and potential step direction.
        # NB - several options affect report_method and must be set
        # first!
        initial_date = parseDate(args[1])
        config = Config()
        ps_file = os.path.join(
            config.get('general', 'tmp_dir', default='/tmp'),
            self.criteria.report_method)
        step = 30 if options.thirty_days else None
        self.datefile = Datefile(initial_date=initial_date,
                                 persistence_file=ps_file,
                                 direction=options.countdown,
                                 step=step)
        self.criteria.start_date, self.criteria.end_date = \
            self.datefile.get_date_range()

        # What to do once report is completed.  Complicated, protect
        # user from themselves!
        self.save_report = (options.save_upload or options.save_only
                            or options.upload_diff)
        self.transmit_report = options.save_upload
        self.transmit_differences = options.upload_diff

        if options.save_only and options.save_upload:
            parser.error("save-without-upload and save-and-upload "
                         "are mutually exclusive")
        if options.save_only and options.upload_diff:
            parser.error("save-without-upload and upload-diff "
                         "are mutually exclusive")
        if options.upload_diff and options.save_upload:
            # Trailing space matters: the two literals are concatenated.
            parser.error("upload-diff and save-and-upload "
                         "are mutually exclusive")

        # Can't transmit w/o saving
        if options.save_upload or options.upload_diff:
            assert self.save_report
        # Sanity check
        if options.save_only:
            assert (self.save_report and not self.transmit_report
                    and not self.transmit_differences)

        # How verbosely to log
        self.verbosity = options.verbosity
예제 #29
0
class LongitudinalManager(object):
    """ Abstraction to handle which db, user, etc. the deduplication
    process should be run on.  Handles runtime arguments and
    execution.

    Manages the process by farming out the individual visit
    deduplication to a number of worker processes (necessary to take
    advantage of multi-core processors, with the database as the
    limiting factor).

    """
    # The gating issue is the number of postgres connections that are
    # allowed to run concurrently.  Setting this to N-1 (where N is
    # the number of cores) has proven the fastest and most reliable.
    NUM_PROCS = 5

    def __init__(self, data_warehouse=None, data_mart=None,
                 reportDate=None, database_user=None,
                 database_password=None, verbosity=0):
        """Record connection details and set up queue, lock and dates.

        Both database ports start out at 5432 (the postgres default).
        A ``reportDate``, when given, is run through ``parseDate``;
        otherwise the report date is left as None.

        """
        postgres_default_port = 5432

        # Where to read from and write to
        self.data_warehouse = data_warehouse
        self.warehouse_port = postgres_default_port
        self.data_mart = data_mart
        self.mart_port = postgres_default_port

        if reportDate:
            self.reportDate = parseDate(reportDate) or None
        else:
            self.reportDate = None

        # Credentials shared by both databases
        self.database_user = database_user
        self.database_password = database_password

        # Directory portion of this source file's path
        self.dir = os.path.split(__file__)[0]
        self.verbosity = verbosity

        self.queue = JoinableQueue()
        self.datefile = "/tmp/longitudinal_datefile"
        self.datePersistence = Datefile(initial_date=self.reportDate)
        self.lock = FileLock(LOCKFILE)
        self.skip_prep = False

    def __call__(self):
        return self.execute()

    def processArgs(self):
        """Read options and positional parameters from the command line.

        Expects two positional arguments, the data warehouse and the
        data mart (in that order).  The parsed values are stored on
        the instance, and a Datefile is built from the optional date
        and countdown direction to determine ``reportDate``.

        """
        parser = OptionParser(usage=usage)
        parser.add_option(
            "-c", "--countdown",
            dest="countdown",
            default=None,
            help=("count {down,up} date using date string in "
                  "%s - set to 'forwards' or 'backwards' "
                  "if desired" % self.datefile))
        parser.add_option(
            "-d", "--date",
            dest="date",
            default=None,
            help=("single admission date to dedup "
                  "(by default, checks the entire database)"))
        parser.add_option(
            "-s", "--skip-prep",
            dest="skip_prep",
            default=False,
            action="store_true",
            help="skip the expense of looking for new messages")
        parser.add_option(
            "-v", "--verbose",
            dest="verbosity",
            action="count",
            default=self.verbosity,
            help="increase output verbosity")
        parser.add_option(
            "-m", "--mart-port",
            dest="mart_port",
            default=self.mart_port,
            type="int",
            help="alternate port number for data mart")
        parser.add_option(
            "-w", "--warehouse-port",
            dest="warehouse_port",
            default=self.warehouse_port,
            type="int",
            help="alternate port number for data warehouse")

        options, positional = parser.parse_args()
        if len(positional) != 2:
            parser.error("incorrect number of arguments")

        self.data_warehouse = positional[0]
        self.data_mart = positional[1]
        self.warehouse_port = options.warehouse_port
        self.mart_port = options.mart_port
        self.verbosity = options.verbosity
        self.skip_prep = options.skip_prep

        # No date on the command line means no initial date at all.
        if options.date:
            initial_date = parseDate(options.date) or None
        else:
            initial_date = None

        self.datePersistence = Datefile(initial_date=initial_date,
                                        persistence_file=self.datefile,
                                        direction=options.countdown)
        self.reportDate = self.datePersistence.get_date()

    def _prepDeduplicateTables(self):
        """ Bring the MessageProcessed table up to date

        Bridges the data warehouse and the data mart: processed
        message bookkeeping lives in the mart so the mart stays
        independent of the warehouse.  The two are distinct databases,
        so the database cannot enforce referential integrity between
        them - take care.

        """
        began = time.time()
        logging.info("Starting INSERT INTO internal_message_processed "
                     "at %s", began)

        # The data_warehouse is "add only" and hl7_msh_id is a sequence
        # moving in the positive direction, so everything above the
        # highest id already recorded in the mart is new.
        highest_known = self.data_mart_access.engine.execute(
            "SELECT max(hl7_msh_id) from internal_message_processed"
        ).first()[0] or 0

        stmt = """SELECT hl7_msh_id, message_datetime, visit_id
        FROM hl7_msh JOIN hl7_visit USING (hl7_msh_id) WHERE
        hl7_msh_id > %d """ % highest_known
        pending = self.data_warehouse_access.engine.execute(stmt)

        batch_size = 500
        while True:
            rows = pending.fetchmany(batch_size)
            if not rows:
                break

            # One commit per fetched batch.
            batch = [MessageProcessed(hl7_msh_id=row[0],
                                      message_datetime=row[1],
                                      visit_id=row[2])
                     for row in rows]
            self.data_mart_access.session.add_all(batch)
            self.data_mart_access.session.commit()
            logging.debug("added %d new messages" % len(batch))

        logging.info("Added new rows to internal_message_processed in %s",
                     time.time() - began)

    def _visitsToProcess(self):
        """ Look up all distinct visit ids needing attention

        Obtain unique list of visit_ids that have messages that
        haven't previously been processed.  If the user requested just
        one days worth (i.e. -d) only that days visits will be
        returned.

        """
        visit_ids = list()
        if not self.reportDate:
            logging.info("Launch deduplication for entire database")
            # Do the whole batch, that is, all that haven't been
            # processed before.
            stmt = """SELECT DISTINCT(visit_id) FROM
            internal_message_processed
            WHERE processed_datetime IS NULL"""
            rs = self.data_mart_access.engine.execute(stmt)
            many = 10000
            while True:
                results = rs.fetchmany(many)
                if not results:
                    break
                for r in results:
                    visit_ids.append(r[0])

        else:
            logging.info("Launch deduplication for %s",
                         self.reportDate)
            # Process the requested day only - as we can't join across
            # db boundaries - first acquire the full list of visits
            # for the requested day from the data_warehouse to use in
            # a massive 'in' clause

            stmt = """SELECT DISTINCT(visit_id) FROM hl7_visit WHERE
            admit_datetime BETWEEN '%s' AND '%s';""" %\
            (self.reportDate, self.reportDate + timedelta(days=1))
            self.access.raw_query(stmt)
            rs = self.data_warehouse_access.engine.execute(stmt)
            many = 1000
            potential_visit_ids = list()
            while True:
                results = rs.fetchmany(many)
                if not results:
                    break
                for r in results:
                    #tmp_table.insert(r[0])
                    potential_visit_ids.append(r[0])

            if potential_visit_ids:
                query = self.data_mart_access.session.query(\
                    MessageProcessed.visit_id).distinct().\
                    filter(and_(MessageProcessed.processed_datetime ==
                                None,
                                MessageProcessed.visit_id.\
                                in_(potential_visit_ids)))

                for r in query:
                    visit_ids.append(r[0])

        logging.info("Found %d visits needing attention",
                     len(visit_ids))
        return visit_ids

    def tearDown(self):
        """ Clean up any open handles/connections

        Currently a no-op: this method body contains no statements
        beyond this docstring.

        """
        # now done in execute when we're done with the connections

    def execute(self):
        """ Start the process

        Configures logging, then - unless the lock file is already
        held - acquires the lock, opens the database connections,
        optionally runs the prep step, collects the visits needing
        attention and farms them out to NUM_PROCS worker processes
        via the queue.  Once the queue has drained the persisted date
        is bumped.  The lock is released on the way out, even on
        error.

        Returns None; returns early (after logging a warning) when
        the lock is already held.

        """
        # Initialize logging now (verbosity is now set regardless of
        # invocation method)
        configure_logging(verbosity=self.verbosity,
                          logfile="longitudinal-manager.log")

        logging.info("Initiate deduplication for %s",
                     self.reportDate or "whole database")
        # Only allow one instance of the manager to run at a time.
        if self.lock.is_locked():
            logging.warning("Can't continue, %s is locked ", LOCKFILE)
            return

        if systemUnderLoad():
            logging.warning("system under load - continue anyhow")

        # Acquire outside the try block so a failed acquire doesn't
        # lead to releasing a lock that was never obtained.
        self.lock.acquire()
        try:
            self.access = DirectAccess(database=self.data_warehouse,
                                       port=self.warehouse_port,
                                       user=self.database_user,
                                       password=self.database_password)
            self.data_warehouse_access = AlchemyAccess(
                database=self.data_warehouse,
                port=self.warehouse_port,
                user=self.database_user, password=self.database_password)
            self.data_mart_access = AlchemyAccess(
                database=self.data_mart, port=self.mart_port,
                user=self.database_user, password=self.database_password)

            startTime = time.time()
            if not self.skip_prep:
                self._prepDeduplicateTables()
            visits_to_process = self._visitsToProcess()

            # Now done with db access needs at the manager level
            # free up resources:
            self.data_mart_access.disconnect()
            self.data_warehouse_access.disconnect()
            self.access.close()

            # Set of locks used, one for each table needing protection
            # from asynchronous inserts.  Names should match table
            # minus 'dim_' prefix, plus '_lock' suffix
            # i.e. dim_location -> 'location_lock'
            table_locks = {'admission_source_lock': Lock(),
                           'admission_o2sat_lock': Lock(),
                           'admission_temp_lock': Lock(),
                           'assigned_location_lock': Lock(),
                           'admit_reason_lock': Lock(),
                           'chief_complaint_lock': Lock(),
                           'diagnosis_lock': Lock(),
                           'disposition_lock': Lock(),
                           'flu_vaccine_lock': Lock(),
                           'h1n1_vaccine_lock': Lock(),
                           'lab_flag_lock': Lock(),
                           'lab_result_lock': Lock(),
                           'location_lock': Lock(),
                           'note_lock': Lock(),
                           'order_number_lock': Lock(),
                           'performing_lab_lock': Lock(),
                           'pregnancy_lock': Lock(),
                           'race_lock': Lock(),
                           'reference_range_lock': Lock(),
                           'service_area_lock': Lock(),
                           'specimen_source_lock': Lock(),
                           }

            # If we have visits to process, fire up the workers...
            # (a single pending visit counts too)
            if visits_to_process:
                worker_kwargs = {'queue': self.queue,
                                 'data_warehouse': self.data_warehouse,
                                 'warehouse_port': self.warehouse_port,
                                 'data_mart': self.data_mart,
                                 'mart_port': self.mart_port,
                                 'dbUser': self.database_user,
                                 'dbPass': self.database_password,
                                 'table_locks': table_locks,
                                 'verbosity': self.verbosity}
                for i in range(self.NUM_PROCS):
                    dw = Process(target=LongitudinalWorker,
                                 kwargs=dict(worker_kwargs, procNumber=i))
                    dw.daemon = True
                    dw.start()

                # Populate the queue
                for v in visits_to_process:
                    self.queue.put(v)

                # Wait on the queue until empty
                self.queue.join()

            # Common cleanup
            self.tearDown()
            self.datePersistence.bump_date()
            logging.info("Queue is empty - done in %s", time.time() -
                         startTime)
        finally:
            self.lock.release()