Example #1
    def run(self):
        start_time = datetime.now()

        user_dim = UserDimension()
        user_dim.sync_all()

        project_dim = ProjectDimension()
        project_dim.sync_all()

        discussion_dim = DiscussionDimension()
        discussion_dim.sync_all()

        end_time = datetime.now()

        conf.log.info("SCD2 done, took %s" % str(end_time - start_time))
Example #2
    def __init__(self):
        self.user_dim = UserDimension()
        self.event_dim = EventDimension()
        self.project_dim = ProjectDimension()
        self.discussion_dim = DiscussionDimension()
        self.date_dim = DateDimension()
        self.context_dim = ContextDimension()

        # Extracted data transformed to sql
        self.event_insert_clause = (
            "INSERT IGNORE INTO event_fact VALUES(%(date_sk)d, %(project_sk)d, "
            "%(user_sk)d, %(event_sk)d, %(discussion_sk)d, '%(timestamp)s', %(micros)d);"
        )
        self.request_insert_clause = (
            "INSERT IGNORE INTO request_fact VALUES(%(project_sk)d, %(date_sk)d, "
            "%(context_sk)d, %(user_sk)d, '%(timestamp)s', %(micros)d);")

        # Extract will write into these
        self.sql_rows = []

        # Progress info
        self.trans_fail_count = 0
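The insert clauses above are later filled in with Python %-formatting (see transform in the next example), which breaks on values containing quotes and leaves the statements open to SQL injection. A parameterized variant, sketched below under the assumption that analytical_transaction yields a DB-API 2.0 cursor, lets the driver escape each value instead; insert_event and EVENT_INSERT are hypothetical names.

# Hedged sketch: parameterized form of the event insert above.
# Placeholder order mirrors the original VALUES(...) list.
EVENT_INSERT = "INSERT IGNORE INTO event_fact VALUES (%s, %s, %s, %s, %s, %s, %s)"

def insert_event(cursor, event):
    # The driver escapes each value, so quotes in timestamps or names are harmless.
    cursor.execute(EVENT_INSERT, (
        event['date_sk'], event['project_sk'], event['user_sk'],
        event['event_sk'], event['discussion_sk'],
        event['timestamp'], event['micros'],
    ))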
Example #3
class EventLogETL(ETL):
    """
    Run from cron scripts.

    .. NOTE:: Role needs to be documented better!
    """
    def __init__(self):
        self.user_dim = UserDimension()
        self.event_dim = EventDimension()
        self.project_dim = ProjectDimension()
        self.discussion_dim = DiscussionDimension()
        self.date_dim = DateDimension()
        self.context_dim = ContextDimension()

        # Extracted data transformed to sql
        self.event_insert_clause = ("INSERT IGNORE INTO event_fact VALUES(%(date_sk)d, %(project_sk)d, "
                                    "%(user_sk)d, %(event_sk)d, %(discussion_sk)d, '%(timestamp)s', %(micros)d);")
        self.request_insert_clause = ("INSERT IGNORE INTO request_fact VALUES(%(project_sk)d, %(date_sk)d, "
                                      "%(context_sk)d, %(user_sk)d, '%(timestamp)s', %(micros)d);")

        # Extract will write into these
        self.sql_rows = []

        # Progress info
        self.trans_fail_count = 0

    def run(self):
        start_time = datetime.now()
        conf.log.info("Running event ETL")

        self.transform()
        if not self.sql_rows:
            return

        if not self.load():
            conf.log.error("ETL Done. Load failed.")
            return

        end_time = datetime.now()
        conf.log.info("Event ETL done, took %s, %d rows of data inserted, %d events failed." %
                           (str(end_time - start_time), len(self.sql_rows), self.trans_fail_count))

    def consume_queue(self):
        """
        Iterates over queued unprocessed json data.
        Deletes all iterated rows.
        """
        with analytical_transaction() as cursor:
            cursor.execute('SELECT `id`, `data` FROM `data_queue`')
            rows = cursor.fetchall()
            for row in rows:
                # Pass parameters as a tuple: portable across DB-API drivers,
                # some of which reject bare scalars
                cursor.execute('DELETE FROM data_queue WHERE id = %s', (row[0],))
                yield json.loads(row[1])

    def transform(self):
        """
        Convert database queue to dimensional form SQL inserts in :attr:`self.sql_rows`.
        """
        for item in self.consume_queue():
            try:
                event = self.to_event(item)
                if item['event'] == 'page_request':
                    self.sql_rows.append(self.request_insert_clause % event)
                else:
                    self.sql_rows.append(self.event_insert_clause % event)
            except Exception:
                conf.log.exception("transform failed: %s" % item)
                self.trans_fail_count += 1

    def to_event(self, log):
        """
        Transforms log row data to surrogate keys for dimensional model
        """
        # Make sure username exists; covers both a missing key and an empty value
        log['username'] = log.get('username') or 'anonymous'

        data = {}
        data['timestamp'] = log['timestamp']
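        # Fractional part of the timestamp, e.g. '...12:00:00.123456' -> 123456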
        data['micros'] = int(log['timestamp'].split('.')[1])
        data['date_sk'] = self.date_dim.date_sk(log['timestamp'])
        data['event_sk'] = self.event_dim.event_sk(log['event'])
        data['project_sk'] = self.project_dim.project_sk(log['project'])
        data['user_sk'] = self.user_dim.user_sk(log['username'])

        # Use inapplicable sk if discussion not applicable for event
        data['discussion_sk'] = self.discussion_dim.inapplicable_sk
        if 'forum_id' in log:
            data['discussion_sk'] = self.discussion_dim.discussion_sk(log['project'], log['forum_id'])

        if log['event'] == 'page_request':
            data['context_sk'] = self.context_dim.context_sk(log['project'], log['path_info'])

        return data

    def load(self):
        """
        Load data into dimension and fact tables
        """
        try:
            return self.load_sql(self.sql_rows)
        except Exception:
            conf.log.exception("Failed to load sql")
        return False

    def dump_sql(self, sql):
        """
        Writes the failed SQL into the fail_dump directory under the
        analytics log path.
        """
        dt = datetime.now()
        name = dt.strftime("%Y-%m-%d_%H%M%S_%f") + ".sql"
        filename = os.path.join(conf.analytics_log_file_path, "fail_dump", name)

        conf.log.error("Loading data into the database failed. See %s for possible errors." % filename)
        try:
            # 'with' closes the file even if write() raises; a manual close()
            # in a finally block would hit None when open() itself fails
            with open(filename, "w") as sql_file:
                sql_file.write(sql)
        except Exception:
            conf.log.exception("Failed to open or write file %s", filename)

    def load_sql(self, sql_rows):
        """
        Helper method to be used for loading
        sql into analytical database.

        If sql fails we write sqldump
        """
        with analytical_transaction() as cursor:
            try:
                for row in sql_rows:
                    cursor.execute(row)
            except Exception:
                conf.log.exception("Executing SQL failed, dumping statements to file.")
                self.dump_sql("\n".join(sql_rows))
                # Used to return false here prior to rollback, but now we need to raise
                # the exception for the rollback to be called.
                raise

        return True
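The class docstring says it is run from cron, but no entry point appears among these examples. A wrapper along the following lines would fit; everything in it except EventLogETL itself is a hypothetical sketch.

#!/usr/bin/env python
# Hypothetical cron entry point, not part of the original examples.
# Runs the event ETL once per invocation; cron provides the scheduling.

def main():
    etl = EventLogETL()
    etl.run()

if __name__ == '__main__':
    main()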