Example #1
 def createDB(self):
     """
     Creates new DB to load SQL dump files if required
     """
     db_create = MySQLDB(host=self.host,
                         port=self.port,
                         user=self.db_user,
                         passwd=self.db_passw)
     db_create.connect()
     db_create.create_database(self.db_name)
     db_create.close()
Example #2
 def createDB(self):
     """
     Creates new DB to load SQL dump files if required
     """
     db_create = MySQLDB(host=self.host, port=self.port,
                         user=self.db_user, passwd=self.db_passw)
     db_create.connect()
     db_create.create_database(self.db_name)
     db_create.close()
Example #3
 def DB_exists(self):
     db_check = MySQLDB(host=self.host,
                        port=self.port,
                        user=self.db_user,
                        passwd=self.db_passw)
     db_check.connect()
     db_exists = db_check.db_exists(self.db_name)
     db_check.close()
     return db_exists
Example #4
 def DB_exists(self):
     db_check = MySQLDB(host=self.host, port=self.port, user=self.db_user,
                        passwd=self.db_passw)
     db_check.connect()
     db_exists = db_check.db_exists(self.db_name)
     db_check.close()
     return db_exists
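
For context, the db_exists() call above belongs to the MySQLDB wrapper, whose body is not shown in these examples. A minimal sketch of how such a check could be implemented against MySQL's information_schema (a hypothetical stand-in built on the MySQLdb driver, not WikiDAT's actual wrapper code):

# Hypothetical sketch of a database-existence check; assumes the MySQLdb
# (mysqlclient) driver rather than the MySQLDB wrapper class shown above.
import MySQLdb

def db_exists(host, port, user, passwd, db_name):
    """Return True if db_name already exists on the MySQL/MariaDB server."""
    con = MySQLdb.connect(host=host, port=port, user=user, passwd=passwd)
    try:
        cursor = con.cursor()
        cursor.execute("SELECT SCHEMA_NAME FROM information_schema.SCHEMATA "
                       "WHERE SCHEMA_NAME = %s", (db_name,))
        return cursor.fetchone() is not None
    finally:
        con.close()
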
Example #5
    def execute(
        self,
        page_fan,
        rev_fan,
        page_cache_size,
        rev_cache_size,
        host,
        port,
        db_name,
        db_user,
        db_passw,
        db_engine,
        mirror,
        download_files,
        base_ports,
        control_ports,
        dumps_dir=None,
    ):
        """
        Run data retrieval and loading actions.
        Arguments:
            - page_fan = Number of workers to fan out page elements parsing
            - rev_fan = Number of workers to fan out rev elements parsing
            - db_user = User name to connect to local database
            - db_passw = Password for database user
            - mirror = Base URL of site hosting XML dumps
        """
        if download_files:
            # TODO: Use proper logging module to track execution progress
            # Choose corresponding file downloader and etl wrapper
            print "Downloading new dump files from %s, for language %s" % (mirror, self.lang)
            self.down = RevHistDownloader(mirror, self.lang)
            # Download latest set of dump files
            self.paths, self.date = self.down.download(self.date)
            print "Got files for lang %s, date: %s" % (self.lang, self.date)

            # db_name = self.lang + '_' + self.date.strip('/')

        else:
            # Case of dumps folder provided explicitly
            if dumps_dir:
                # Allow specifying relative paths, as well
                dumps_path = os.path.expanduser(dumps_dir)
                # Retrieve path to all available files to feed ETL lines
                if not os.path.exists(dumps_path):
                    print "No dump files will be downloaded and local folder "
                    print "with dump files not found. Please, specify a "
                    print "valid path to local folder containing dump files."
                    print "Program will exit now."
                    sys.exit()

                else:
                    # Attempt to find list of .7z or .xml files to be processed
                    self.paths = glob.glob(os.path.join(dumps_path, "*.7z"))
                    if not self.paths:
                        self.paths = glob.glob(os.path.join(dumps_path, "*.xml"))
                        if not self.paths:
                            print "Directory %s" % dumps_dir
                            print "does not contain any valid dump file."
                            print "Program will exit now."
                            sys.exit()
            # If not provided explicitly, look for default location of
            # dumps directory
            else:
                dumps_dir = os.path.join(self.lang + "_dumps", self.date)
                # Look up dump files in default directory name
                if not os.path.exists(dumps_dir):
                    print "Default directory %s" % dumps_dir
                    print " containing dump files not found."
                    print "Program will exit now."
                    sys.exit()

                else:
                    self.paths = glob.glob(dumps_dir + "/*.7z")

        print "paths: " + unicode(self.paths)

        # DB SCHEMA PREPARATION
        db_create = MySQLDB(host=host, port=port, user=db_user, passwd=db_passw)
        db_create.connect()
        db_create.create_database(db_name)
        db_create.close()
        db_schema = MySQLDB(host=host, port=port, user=db_user, passwd=db_passw, db=db_name)
        db_schema.connect()
        db_schema.create_schema(engine=db_engine)
        db_schema.close()

        # Complete the queue of paths to be processed and STOP flags for
        # each ETL subprocess
        paths_queue = mp.JoinableQueue()
        for path in self.paths:
            paths_queue.put(path)

        for x in range(self.etl_lines):
            paths_queue.put("STOP")

        for x in range(self.etl_lines):
            new_etl = PageRevisionETL(
                name="ETL-process-%s" % x,
                paths_queue=paths_queue,
                lang=self.lang,
                page_fan=page_fan,
                rev_fan=rev_fan,
                page_cache_size=page_cache_size,
                rev_cache_size=rev_cache_size,
                db_name=db_name,
                db_user=db_user,
                db_passw=db_passw,
                base_port=base_ports[x] + (20 * x),
                control_port=control_ports[x] + (20 * x),
            )
            self.etl_list.append(new_etl)

        print "ETL process for page and revision history defined OK."
        print "Proceeding with ETL workflows. This may take time..."
        # Extract, process and load information in local DB
        for etl in self.etl_list:
            etl.start()

        # Wait for ETL lines to finish
        for etl in self.etl_list:
            etl.join()

        # TODO: logger; ETL step completed, proceeding with data
        # analysis and visualization
        print "ETL process finished for language %s and date %s" % (self.lang, self.date)

        # Create primary keys for all tables
        # TODO: This must also be tracked by official logging module
        print "Now creating primary key indexes in database tables."
        print "This may take a while..."
        db_pks = MySQLDB(host="localhost", port=3306, user=db_user, passwd=db_passw, db=db_name)
        db_pks.connect()
        db_pks.create_pks()
        db_pks.close()
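
A call to this execute() method might look as follows. Every value here is an illustrative placeholder rather than a project default, and task stands for an instance of the enclosing ETL task class:

# Illustrative invocation only; all parameter values are hypothetical.
# base_ports/control_ports carry one entry per ETL line; a single-element
# list assumes self.etl_lines == 1.
task.execute(page_fan=2, rev_fan=4,
             page_cache_size=200, rev_cache_size=2000,
             host='localhost', port=3306,
             db_name='enwiki_20140601', db_user='wikidat', db_passw='secret',
             db_engine='MyISAM',
             mirror='http://dumps.wikimedia.org/',
             download_files=True,
             base_ports=[10000], control_ports=[11000])
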
Example #6
    def execute(self, page_fan, rev_fan, page_cache_size, rev_cache_size,
                mirror, download_files, base_ports, control_ports,
                dumps_dir=None, debug=False):
        """
        Run data retrieval and loading actions.
        Arguments:
            - page_fan = Number of workers to fan out page elements parsing
            - rev_fan = Number of workers to fan out rev elements parsing
            - mirror = Base URL of site hosting XML dumps
        """
        print "----------------------------------------------------------"
        print ("""Executing ETL:RevHistory on lang: {0} date: {1}"""
               .format(self.lang, self.date))
        print ("ETL lines = {0} page_fan = {1} rev_fan = {2}"
               .format(self.etl_lines, page_fan, rev_fan))
        print "Download files =", download_files
        print "Start time is {0}".format(time.strftime("%Y-%m-%d %H:%M:%S %Z",
                                                       time.localtime()))
        print "----------------------------------------------------------"
        print
        if download_files:
            # TODO: Use proper logging module to track execution progress
            # Choose corresponding file downloader and etl wrapper
            print "Downloading new dump files from %s, for language %s" % (
                  mirror, self.lang)
            self.down = download.RevHistDownloader(mirror,
                                                   self.lang, dumps_dir)
            # Download latest set of dump files
            self.paths, self.date = self.down.download(self.date)
            if not self.paths:
                print "Error: dump files with pages-logging info not found."
                print "Program will exit now."
                sys.exit()

            print "Retrieved dump files for lang %s, date: %s" % (self.lang,
                                                                  self.date)
            print

        else:
            print "Looking for revision-history dump file(s) in data dir"
            # Case of dumps folder provided explicitly
            if dumps_dir:
                # Allow specifying relative paths, as well
                abs_dumps_path = os.path.expanduser(dumps_dir)
                dumps_path = os.path.join(abs_dumps_path,
                                          self.lang + '_dumps', self.date)
                # Retrieve path to all available files to feed ETL lines
                if not os.path.exists(dumps_path):
                    print "No dump files will be downloaded and local folder with dump files not found."
                    print "Please, specify a valid path to local folder containing dump files."
                    print "Program will exit now."
                    sys.exit()

                else:
                    # Attempt to find list of .7z or .xml files to be processed
                    self.paths = glob.glob(os.path.join(dumps_path,
                                                        '*pages-meta-history*.7z'))
                    if not self.paths:
                        self.paths = glob.glob(os.path.join(dumps_path,
                                                            '*pages-meta-history*.xml'))
                        if not self.paths:
                            print "Directory %s does not contain any valid dump file." % dumps_path
                            print "Program will exit now."
                            sys.exit()
            # If not provided explicitly, look for default location of
            # dumps directory
            else:
                dumps_dir = os.path.join("data", self.lang + '_dumps',
                                         self.date)
                # Look up dump files in default directory name
                if not os.path.exists(dumps_dir):
                    print "Default directory %s containing dump files not found." % dumps_dir
                    print "Program will exit now."
                    sys.exit()

                else:
                    self.paths = glob.glob(os.path.join(dumps_dir, '*pages-meta-history*.7z'))
                    if not self.paths:
                        self.paths = glob.glob(os.path.join(dumps_dir,
                                                            '*pages-meta-history*.xml'))
                        if not self.paths:
                            print "Directory %s does not contain any valid dump file." % dumps_dir
                            print "Program will exit now."
                            sys.exit()
            print "Found revision-history dump file(s) to process."
            print
        # Print list of file paths in debug mode
        if debug:
            print "paths: " + unicode(self.paths)
            print

        # Create database
        # TODO: Empty corresponding tables if DB already exists
        # or let the user select behaviour with config argument
        if self.DB_exists():
            self.create_DB(complete=False)
        else:
            self.create_DB(complete=True)

        # First insert namespace info in DB
        dump = DumpFile(self.paths[0])
        db_schema = MySQLDB(host=self.host, port=self.port, user=self.db_user,
                            passwd=self.db_passw, db=self.db_name)
        db_schema.connect()
        db_schema.insert_namespaces(nsdict=dump.get_namespaces())
        db_schema.close()

        # Complete the queue of paths to be processed and STOP flags for
        # each ETL subprocess
        paths_queue = mp.JoinableQueue()
        for path in self.paths:
            paths_queue.put(path)

        for x in range(self.etl_lines):
            paths_queue.put('STOP')

        for x in range(self.etl_lines):
            new_etl = RevisionHistoryETL(
                name="[ETL:RevHistory-%s]" % x,
                paths_queue=paths_queue, lang=self.lang,
                page_fan=page_fan, rev_fan=rev_fan,
                page_cache_size=page_cache_size,
                rev_cache_size=rev_cache_size,
                db_name=self.db_name,
                db_user=self.db_user, db_passw=self.db_passw,
                base_port=base_ports[x]+(20*x),
                control_port=control_ports[x]+(20*x)
                )
            self.etl_list.append(new_etl)

        print "ETL:RevHistory task defined OK."
        print "Proceeding with ETL workflows. This may take time..."
        print
        # Extract, process and load information in local DB
        for etl in self.etl_list:
            etl.start()
            # Wait a second for new ETL process to start all subprocesses
            time.sleep(1)

        # Wait for ETL lines to finish
        for etl in self.etl_list:
            etl.join()

        # Insert user info after all ETL lines have finished
        # to ensure that all metadata are stored in Redis cache
        # regardless of the execution order
        data_dir = os.path.join(os.getcwd(), os.path.split(self.paths[0])[0])
        db_users = MySQLDB(host=self.host, port=self.port, user=self.db_user,
                           passwd=self.db_passw, db=self.db_name)
        db_users.connect()
        users_file_to_db(con=db_users, lang=self.lang,
                         log_file=os.path.join(data_dir, 'logs', 'users.log'),
                         tmp_dir=os.path.join(data_dir, 'tmp')
                         )
        db_users.close()
        # TODO: logger; ETL step completed, proceeding with data
        # analysis and visualization
        print "ETL:RevHistory task finished for language %s and date %s" % (
              self.lang, self.date)
        print
        # Create primary keys for all tables
        # TODO: This must also be tracked by main logging module
        print "Now creating primary key indexes in database tables."
        print "This may take a while..."
        print
        db_pks = MySQLDB(host='localhost', port=3306, user=self.db_user,
                         passwd=self.db_passw, db=self.db_name)
        db_pks.connect()
        db_pks.create_pks_revhist()
        db_pks.close()
Example #7
 def create_DB(self, complete=False):
     if complete:
         db_create = MySQLDB(host=self.host, port=self.port,
                             user=self.db_user, passwd=self.db_passw)
         db_create.connect()
         db_create.create_database(self.db_name)
         db_create.close()
     db_schema = MySQLDB(host=self.host, port=self.port, user=self.db_user,
                         passwd=self.db_passw, db=self.db_name)
     db_schema.connect()
     db_schema.create_schema_revhist(engine=self.db_engine)
     db_schema.close()
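
The create_database() step in the complete=True branch presumably reduces to one DDL statement. A hedged sketch of the kind of call the wrapper could issue internally (an assumption made to keep the branch concrete, not WikiDAT's actual code):

# Hypothetical internals of a create_database() helper.
def create_database(cursor, db_name):
    # Identifiers cannot be bound as query parameters, so the name is
    # interpolated directly; IF NOT EXISTS keeps the call idempotent.
    cursor.execute("CREATE DATABASE IF NOT EXISTS `%s` CHARACTER SET utf8"
                   % db_name)
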
Example #8
    def execute(self, log_fan, log_cache_size,
                mirror, download_files, base_ports, control_ports,
                dumps_dir=None, debug=False):
        """
        Run data retrieval and loading actions.
        Arguments:
            - log_fan = Number of workers to fan out logitem elements parsing
            - mirror = Base URL of site hosting XML dumps
        """
        print "----------------------------------------------------------"
        print("Executing ETL:PagesLogging on lang: {0} date: {1}"
              .format(self.lang, self.date))
        print "log_fan =", log_fan
        print "Download files =", download_files
        print "Start time is {0}".format(time.strftime("%Y-%m-%d %H:%M:%S %Z",
                                                       time.localtime()))
        print "----------------------------------------------------------"
        print
        if download_files:
            # TODO: Use proper logging module to track execution progress
            # Choose corresponding file downloader and etl wrapper
            print """Downloading new logging dump files from %s,
                     for language %s""" % (mirror, self.lang)
            self.down = download.LoggingDownloader(mirror,
                                                   self.lang, dumps_dir)
            # Download latest set of dump files
            self.paths, self.date = self.down.download(self.date)
            if not self.paths:
                print "Error: dump files with pages-logging info not found."
                print "Program will exit now."
                sys.exit()

            print "Got files for lang %s, date: %s" % (self.lang, self.date)

        else:
            print "Looking for pages-logging dump file in data dir"
            # Case of dumps folder provided explicitly
            if dumps_dir:
                # Allow specifying relative paths, as well
                abs_dumps_path = os.path.expanduser(dumps_dir)
                dumps_path = os.path.join(abs_dumps_path,
                                          self.lang + '_dumps', self.date)
                # Retrieve path to all available files to feed ETL lines
                if not os.path.exists(dumps_path):
                    print "No dump files will be downloaded and local folder with dump files not found."
                    print "Please, specify a valid path to local folder containing dump files."
                    print "Program will exit now."
                    sys.exit()

                else:
                    # Attempt to find list of *pages-logging*.gz or
                    # *pages-logging*.xml files to be processed
                    self.paths = glob.glob(os.path.join(dumps_path, '*pages-logging*.gz'))
                    if not self.paths:
                        self.paths = glob.glob(os.path.join(dumps_path,
                                                            '*pages-logging*.xml'))
                        if not self.paths:
                            print "Directory %s does not contain any valid dump file." % dumps_path
                            print "Program will exit now."
                            sys.exit()
            # If not provided explicitly, look for default location of
            # dumps directory
            else:
                dumps_dir = os.path.join("data", self.lang + '_dumps',
                                         self.date)
                # Look up dump files in default directory name
                if not os.path.exists(dumps_dir):
                    print "Default directory %s containing dump files not found." % dumps_dir
                    print "Program will exit now."
                    sys.exit()

                else:
                    self.paths = glob.glob(os.path.join(dumps_dir, '*pages-logging*.gz'))
                    if not self.paths:
                        self.paths = glob.glob(os.path.join(dumps_dir,
                                                            '*pages-logging*.xml'))
                        if not self.paths:
                            print "Directory %s does not contain any valid dump file." % dumps_dir
                            print "Program will exit now."
                            sys.exit()

            print "Found pages-logging dump file to process."
            print
        if debug:
            print "paths: " + unicode(self.paths)
            print

        # Create database if it does not exist
        # empty logging table otherwise
        if self.DB_exists():
            self.create_DB(complete=False)
        else:
            self.create_DB(complete=True)

        new_etl = LoggingETL(name="[ETL:PagesLogging-0]",
                             path=self.paths, lang=self.lang,
                             log_fan=log_fan,
                             log_cache_size=log_cache_size,
                             db_name=self.db_name,
                             db_user=self.db_user, db_passw=self.db_passw,
                             base_port=base_ports[0]+(30),
                             control_port=control_ports[0]+(30)
                             )
        print "ETL:Logging task for administrative records defined OK."
        print "Proceeding with ETL workflow. This may take time..."
        print
        # Extract, process and load information in local DB
        new_etl.start()
        # Wait for ETL line to finish
        new_etl.join()
        # TODO: logger; ETL step completed, proceeding with data
        # analysis and visualization
        print "ETL:Logging task finished for lang %s and date %s" % (
              self.lang, self.date)
        print
        # Create primary keys for all tables
        # TODO: This must also be tracked by official logging module
        print "Now creating primary key indexes in database tables."
        print "This may take a while..."
        print
        db_pks = MySQLDB(host='localhost', port=3306, user=self.db_user,
                         passwd=self.db_passw, db=self.db_name)
        db_pks.connect()
        db_pks.create_pks_logitem()
        db_pks.close()
Example #9
    def run(self):
        """
        Execute workflow to import revision history data from dump files

        The data loading workflow is composed of a number of processor
        elements, which can be:

            - Producer (P): raw input data --> input element queue
            - ConsumerProducer (CP): input element queue --> insert db queue
            - Consumer (C): insert db queue --> database (MySQL/MariaDB)

        In this case, the logical combination is usually N:N:1 (P, CP, C)
        """
        start = time.time()
        print self.name, "Starting PageRevisionETL workflow at %s" % (
            time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))

        db_ns = MySQLDB(host='localhost',
                        port=3306,
                        user=self.db_user,
                        passwd=self.db_passw,
                        db=self.db_name)
        db_ns.connect()

        db_pages = MySQLDB(host='localhost',
                           port=3306,
                           user=self.db_user,
                           passwd=self.db_passw,
                           db=self.db_name)
        db_pages.connect()

        db_revs = MySQLDB(host='localhost',
                          port=3306,
                          user=self.db_user,
                          passwd=self.db_passw,
                          db=self.db_name)
        db_revs.connect()

        # DATA EXTRACTION
        # Use consistent naming for all child processes
        xml_reader_name = '-'.join([self.name, 'xml_reader'])
        page_proc_name = '-'.join([self.name, 'process_page'])
        rev_proc_name = '-'.join([self.name, 'process_revision'])
        page_insert_name = '-'.join([self.name, 'insert_page'])
        rev_insert_name = '-'.join([self.name, 'insert_revision'])

        for path in iter(self.paths_queue.get, 'STOP'):
            # Start subprocess to extract elements from revision dump file
            dump_file = DumpFile(path)
            xml_reader = Producer(name=xml_reader_name,
                                  target=process_xml,
                                  kwargs=dict(dump_file=dump_file),
                                  consumers=self.page_fan + self.rev_fan,
                                  push_pages_port=self.base_port,
                                  push_revs_port=self.base_port + 1,
                                  control_port=self.control_port)
            xml_reader.start()
            print xml_reader_name, "started"
            print self.name, "Extracting data from XML revision history file:"
            print path

            # List to keep track of page and revision workers
            workers = []
            db_workers_revs = []
            # Create and start page processes
            for worker in range(self.page_fan):
                page_worker_name = '-'.join([page_proc_name, unicode(worker)])
                process_page = Processor(name=page_worker_name,
                                         target=pages_to_file,
                                         producers=1,
                                         consumers=1,
                                         pull_port=self.base_port,
                                         push_port=self.base_port + 2,
                                         control_port=self.control_port)
                process_page.start()
                workers.append(process_page)
                print page_worker_name, "started"

            # Create and start revision processes
            for worker in range(self.rev_fan):
                rev_worker_name = '-'.join([rev_proc_name, unicode(worker)])

                db_wrev = MySQLDB(host='localhost',
                                  port=3306,
                                  user=self.db_user,
                                  passwd=self.db_passw,
                                  db=self.db_name)
                db_wrev.connect()

                process_revision = Processor(name=rev_worker_name,
                                             target=revs_to_file,
                                             kwargs=dict(lang=self.lang),
                                             producers=1,
                                             consumers=1,
                                             pull_port=self.base_port + 1,
                                             push_port=self.base_port + 3,
                                             control_port=self.control_port)
                process_revision.start()
                workers.append(process_revision)
                db_workers_revs.append(db_wrev)
                print rev_worker_name, "started"

            # Create directory for logging files if it does not exist
            log_dir = os.path.join(os.path.split(path)[0], 'logs')
            tmp_dir = os.path.join(os.getcwd(), os.path.split(path)[0], 'tmp')
            file_name = os.path.split(path)[1]

            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            log_file = os.path.join(log_dir, file_name + '.log')

            page_insert_db = Consumer(name=page_insert_name,
                                      target=pages_file_to_db,
                                      kwargs=dict(
                                          con=db_pages,
                                          log_file=log_file,
                                          tmp_dir=tmp_dir,
                                          file_rows=self.page_cache_size,
                                          etl_prefix=self.name),
                                      producers=self.page_fan,
                                      pull_port=self.base_port + 2)

            rev_insert_db = Consumer(name=rev_insert_name,
                                     target=revs_file_to_db,
                                     kwargs=dict(con=db_revs,
                                                 log_file=log_file,
                                                 tmp_dir=tmp_dir,
                                                 file_rows=self.rev_cache_size,
                                                 etl_prefix=self.name),
                                     producers=self.rev_fan,
                                     pull_port=self.base_port + 3)

            page_insert_db.start()
            print page_insert_name, "started"
            rev_insert_db.start()
            print rev_insert_name, "started"

            print self.name, "Waiting for all processes to finish..."
            print
            xml_reader.join()
            for w in workers:
                w.join()
            page_insert_db.join()
            rev_insert_db.join()

            # Mark this path as done
            self.paths_queue.task_done()

        # Mark STOP message as processed and finish
        self.paths_queue.task_done()

        end = time.time()
        print self.name, ": All tasks done in %.4f sec." % ((end - start) / 1.)
        print
        db_ns.close()
        db_pages.close()
        db_revs.close()
        for dbcon in db_workers_revs:
            dbcon.close()
Example #10
    def run(self):
        """
        Execute workflow to import logging records of actions on pages and
        users from dump file

        The data loading workflow is composed of a number of processor
        elements, which can be:

            - Producer (P): raw input data --> input element queue
            - ConsumerProducer (CP): input element queue --> insert db queue
            - Consumer (C): insert db queue --> database (MySQL/MariaDB)

        In this case, the usual combination is 1:N:1 (P, CP, C)
        """
        start = time.time()
        print "Starting LoggingETL workflow at %s" % (time.strftime(
            "%Y-%m-%d %H:%M:%S %Z", time.localtime()))

        # DATA EXTRACTION
        xml_reader_name = '-'.join([self.name, 'xml_reader'])
        logitem_proc_name = '-'.join([self.name, 'process_logitem'])
        logitem_insert_name = '-'.join([self.name, 'insert_logitem'])
        # Start subprocess to extract elements from logging dump file
        file_path = self.path[0]
        dump_file = DumpFile(file_path)
        xml_reader = Producer(name=xml_reader_name,
                              target=process_xml,
                              kwargs=dict(dump_file=dump_file),
                              consumers=self.log_fan,
                              push_logs_port=self.base_port,
                              control_port=self.control_port)
        xml_reader.start()
        print xml_reader_name, "started"
        print self.name, "Extracting data from XML revision history file:"
        print unicode(self.path[0])

        # List to keep track of logitem workers
        workers = []
        # Create and start logitem processes
        for worker in range(self.log_fan):
            worker_name = '-'.join([logitem_proc_name, unicode(worker)])
            process_logitems = Processor(name=worker_name,
                                         target=logitem_to_file,
                                         producers=1,
                                         consumers=1,
                                         pull_port=self.base_port,
                                         push_port=self.base_port + 2,
                                         control_port=self.control_port)
            process_logitems.start()
            workers.append(process_logitems)
            print worker_name, "started"

        # Create directory for logging files if it does not exist
        log_dir = os.path.join(os.path.split(file_path)[0], 'logs')
        tmp_dir = os.path.join(os.getcwd(), os.path.split(file_path)[0], 'tmp')
        file_name = os.path.split(file_path)[1]

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        log_file = os.path.join(log_dir, file_name + '.log')

        db_log = MySQLDB(host='localhost',
                         port=3306,
                         user=self.db_user,
                         passwd=self.db_passw,
                         db=self.db_name)
        db_log.connect()
        logitem_insert_db = Consumer(name=logitem_insert_name,
                                     target=logitem_file_to_db,
                                     kwargs=dict(con=db_log,
                                                 log_file=log_file,
                                                 tmp_dir=tmp_dir,
                                                 file_rows=self.log_cache_size,
                                                 etl_prefix=self.name),
                                     producers=self.log_fan,
                                     pull_port=self.base_port + 2)

        logitem_insert_db.start()
        print logitem_insert_name, "started"

        print "Waiting for all processes to finish..."
        print
        xml_reader.join()
        for w in workers:
            w.join()
        logitem_insert_db.join()

        # All operations finished
        end = time.time()
        print "All tasks done in %.4f sec." % ((end - start) / 1.)
        print
        db_log.close()
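
The 1:N:1 topology described in the docstring (one Producer, log_fan Processors, one Consumer) can be illustrated with plain multiprocessing queues. This stripped-down sketch only demonstrates the message flow and the 'STOP' sentinel shutdown used by run(); the real Producer/Processor/Consumer classes wire workers together over push/pull and control ports instead:

# Minimal 1:N:1 pipeline sketch using multiprocessing.Queue.
import multiprocessing as mp

def producer(out_q, n_consumers):
    for item in range(10):              # stands in for parsed XML elements
        out_q.put(item)
    for _ in range(n_consumers):        # one 'STOP' sentinel per worker
        out_q.put('STOP')

def processor(in_q, out_q):
    # Same iter(queue.get, 'STOP') idiom as the run() methods above
    for item in iter(in_q.get, 'STOP'):
        out_q.put(item * 2)             # stands in for element processing
    out_q.put('STOP')                   # propagate shutdown downstream

def consumer(in_q, n_workers):
    stops = 0
    while stops < n_workers:            # drain until every worker has finished
        item = in_q.get()
        if item == 'STOP':
            stops += 1
        else:
            print(item)                 # stands in for the DB insert

if __name__ == '__main__':
    fan = 3                             # analogous to log_fan
    q1, q2 = mp.Queue(), mp.Queue()
    procs = ([mp.Process(target=producer, args=(q1, fan))] +
             [mp.Process(target=processor, args=(q1, q2)) for _ in range(fan)] +
             [mp.Process(target=consumer, args=(q2, fan))])
    for p in procs:
        p.start()
    for p in procs:
        p.join()
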
Example #11
    def run(self):
        """
        Execute workflow to import revision history data from dump files

        The data loading workflow is composed of a number of processor
        elements, which can be:

            - Producer (P): raw input data --> input element queue
            - ConsumerProducer (CP): input element queue --> insert db queue
            - Consumer (C): insert db queue --> database (MySQL/MariaDB)

        In this case, the logical combination is usually N:N:1 (P, CP, C)
        """
        start = time.time()
        print self.name, "Starting PageRevisionETL workflow at %s" % (
                         time.strftime("%Y-%m-%d %H:%M:%S %Z",
                                       time.localtime()))

        db_ns = MySQLDB(host='localhost', port=3306, user=self.db_user,
                        passwd=self.db_passw, db=self.db_name)
        db_ns.connect()

        db_pages = MySQLDB(host='localhost', port=3306,
                           user=self.db_user, passwd=self.db_passw,
                           db=self.db_name)
        db_pages.connect()

        db_revs = MySQLDB(host='localhost', port=3306, user=self.db_user,
                          passwd=self.db_passw, db=self.db_name)
        db_revs.connect()

        # DATA EXTRACTION
        # Use consistent naming for all child processes
        xml_reader_name = '-'.join([self.name, 'xml_reader'])
        page_proc_name = '-'.join([self.name, 'process_page'])
        rev_proc_name = '-'.join([self.name, 'process_revision'])
        page_insert_name = '-'.join([self.name, 'insert_page'])
        rev_insert_name = '-'.join([self.name, 'insert_revision'])

        for path in iter(self.paths_queue.get, 'STOP'):
            # Start subprocess to extract elements from revision dump file
            dump_file = DumpFile(path)
            xml_reader = Producer(name=xml_reader_name,
                                  target=process_xml,
                                  kwargs=dict(
                                      dump_file=dump_file),
                                  consumers=self.page_fan + self.rev_fan,
                                  push_pages_port=self.base_port,
                                  push_revs_port=self.base_port+1,
                                  control_port=self.control_port)
            xml_reader.start()
            print xml_reader_name, "started"
            print self.name, "Extracting data from XML revision history file:"
            print path

            # List to keep track of page and revision workers
            workers = []
            db_workers_revs = []
            # Create and start page processes
            for worker in range(self.page_fan):
                page_worker_name = '-'.join([page_proc_name, unicode(worker)])
                process_page = Processor(name=page_worker_name,
                                         target=pages_to_file,
                                         producers=1, consumers=1,
                                         pull_port=self.base_port,
                                         push_port=self.base_port+2,
                                         control_port=self.control_port)
                process_page.start()
                workers.append(process_page)
                print page_worker_name, "started"

            # Create and start revision processes
            for worker in range(self.rev_fan):
                rev_worker_name = '-'.join([rev_proc_name, unicode(worker)])

                db_wrev = MySQLDB(host='localhost', port=3306,
                                  user=self.db_user,
                                  passwd=self.db_passw, db=self.db_name)
                db_wrev.connect()

                process_revision = Processor(name=rev_worker_name,
                                             target=revs_to_file,
                                             kwargs=dict(
                                                 lang=self.lang),
                                             producers=1, consumers=1,
                                             pull_port=self.base_port+1,
                                             push_port=self.base_port+3,
                                             control_port=self.control_port)
                process_revision.start()
                workers.append(process_revision)
                db_workers_revs.append(db_wrev)
                print rev_worker_name, "started"

            # Create directory for logging files if it does not exist
            log_dir = os.path.join(os.path.split(path)[0], 'logs')
            tmp_dir = os.path.join(os.getcwd(), os.path.split(path)[0], 'tmp')
            file_name = os.path.split(path)[1]

            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            log_file = os.path.join(log_dir, file_name + '.log')

            page_insert_db = Consumer(name=page_insert_name,
                                      target=pages_file_to_db,
                                      kwargs=dict(con=db_pages,
                                                  log_file=log_file,
                                                  tmp_dir=tmp_dir,
                                                  file_rows=self.page_cache_size,
                                                  etl_prefix=self.name),
                                      producers=self.page_fan,
                                      pull_port=self.base_port+2)

            rev_insert_db = Consumer(name=rev_insert_name,
                                     target=revs_file_to_db,
                                     kwargs=dict(con=db_revs,
                                                 log_file=log_file,
                                                 tmp_dir=tmp_dir,
                                                 file_rows=self.rev_cache_size,
                                                 etl_prefix=self.name),
                                     producers=self.rev_fan,
                                     pull_port=self.base_port+3)

            page_insert_db.start()
            print page_insert_name, "started"
            rev_insert_db.start()
            print rev_insert_name, "started"

            print self.name, "Waiting for all processes to finish..."
            print
            xml_reader.join()
            for w in workers:
                w.join()
            page_insert_db.join()
            rev_insert_db.join()

            # Mark this path as done
            self.paths_queue.task_done()

        # Mark STOP message as processed and finish
        self.paths_queue.task_done()

        end = time.time()
        print self.name, ": All tasks done in %.4f sec." % ((end-start)/1.)
        print
        db_ns.close()
        db_pages.close()
        db_revs.close()
        for dbcon in db_workers_revs:
            dbcon.close()
Example #12
    def run(self):
        """
        Execute workflow to import logging records of actions on pages and
        users from dump file

        The data loading workflow is composed of a number of processor
        elements, which can be:

            - Producer (P): raw input data --> input element queue
            - ConsumerProducer (CP): input element queue --> insert db queue
            - Consumer (C): insert db queue --> database (MySQL/MariaDB)

        In this case, the usual combination is 1:N:1 (P, CP, C)
        """
        start = time.time()
        print "Starting LoggingETL workflow at %s" % (
              time.strftime("%Y-%m-%d %H:%M:%S %Z",
                            time.localtime()))

        # DATA EXTRACTION
        xml_reader_name = '-'.join([self.name, 'xml_reader'])
        logitem_proc_name = '-'.join([self.name, 'process_logitem'])
        logitem_insert_name = '-'.join([self.name, 'insert_logitem'])
        # Start subprocess to extract elements from logging dump file
        file_path = self.path[0]
        dump_file = DumpFile(file_path)
        xml_reader = Producer(name=xml_reader_name,
                              target=process_xml,
                              kwargs=dict(
                                  dump_file=dump_file),
                              consumers=self.log_fan,
                              push_logs_port=self.base_port,
                              control_port=self.control_port)
        xml_reader.start()
        print xml_reader_name, "started"
        print self.name, "Extracting data from XML revision history file:"
        print unicode(self.path[0])

        # List to keep track of logitem workers
        workers = []
        # Create and start logitem processes
        for worker in range(self.log_fan):
            worker_name = '-'.join([logitem_proc_name, unicode(worker)])
            process_logitems = Processor(name=worker_name,
                                         target=logitem_to_file,
                                         producers=1, consumers=1,
                                         pull_port=self.base_port,
                                         push_port=self.base_port+2,
                                         control_port=self.control_port)
            process_logitems.start()
            workers.append(process_logitems)
            print worker_name, "started"

        # Create directory for logging files if it does not exist
        log_dir = os.path.join(os.path.split(file_path)[0], 'logs')
        tmp_dir = os.path.join(os.getcwd(), os.path.split(file_path)[0], 'tmp')
        file_name = os.path.split(file_path)[1]

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        log_file = os.path.join(log_dir, file_name + '.log')

        db_log = MySQLDB(host='localhost', port=3306, user=self.db_user,
                         passwd=self.db_passw, db=self.db_name)
        db_log.connect()
        logitem_insert_db = Consumer(name=logitem_insert_name,
                                     target=logitem_file_to_db,
                                     kwargs=dict(con=db_log,
                                                 log_file=log_file,
                                                 tmp_dir=tmp_dir,
                                                 file_rows=self.log_cache_size,
                                                 etl_prefix=self.name),
                                     producers=self.log_fan,
                                     pull_port=self.base_port+2)

        logitem_insert_db.start()
        print logitem_insert_name, "started"

        print "Waiting for all processes to finish..."
        print
        xml_reader.join()
        for w in workers:
            w.join()
        logitem_insert_db.join()

        # All operations finished
        end = time.time()
        print "All tasks done in %.4f sec." % ((end-start)/1.)
        print
        db_log.close()
Example #13
    def execute(self,
                log_fan,
                log_cache_size,
                mirror,
                download_files,
                base_ports,
                control_ports,
                dumps_dir=None,
                debug=False):
        """
        Run data retrieval and loading actions.
        Arguments:
            - log_fan = Number of workers to fan out logitem elements parsing
            - mirror = Base URL of site hosting XML dumps
        """
        print("----------------------------------------------------------")
        print("Executing ETL:PagesLogging on lang: {0} date: {1}".format(
            self.lang, self.date))
        print("log_fan =", log_fan)
        print("Download files =", download_files)
        print("Start time is {0}".format(
            time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())))
        print("----------------------------------------------------------")
        print()
        if download_files:
            # TODO: Use proper logging module to track execution progress
            # Choose corresponding file downloader and etl wrapper
            print("""Downloading new logging dump files from %s,
                     for language %s""" % (mirror, self.lang))
            self.down = LoggingDownloader(mirror, self.lang, dumps_dir)
            # Download latest set of dump files
            self.paths, self.date = self.down.download(self.date)
            if not self.paths:
                print("Error: dump files with pages-logging info not found.")
                print("Program will exit now.")
                sys.exit()

            print("Got files for lang %s, date: %s" % (self.lang, self.date))

        else:
            print("Looking for pages-logging dump file in data dir")
            # Case of dumps folder provided explicitly
            if dumps_dir:
                # Allow specifying relative paths, as well
                abs_dumps_path = os.path.expanduser(dumps_dir)
                dumps_path = os.path.join(abs_dumps_path, self.lang + '_dumps',
                                          self.date)
                # Retrieve path to all available files to feed ETL lines
                if not os.path.exists(dumps_path):
                    print(
                        "No dump files will be downloaded and local folder with dump files not found."
                    )
                    print(
                        "Please, specify a valid path to local folder containing dump files."
                    )
                    print("Program will exit now.")
                    sys.exit()

                else:
                    # Attempt to find list of *pages-logging*.gz or
                    # *pages-logging*.xml files to be processed
                    self.paths = glob.glob(
                        os.path.join(dumps_path, '*pages-logging*.gz'))
                    if not self.paths:
                        self.paths = glob.glob(
                            os.path.join(dumps_path, '*pages-logging*.xml'))
                        if not self.paths:
                            print(
                                "Directory %s does not contain any valid dump file."
                                % dumps_path)
                            print("Program will exit now.")
                            sys.exit()
            # If not provided explicitly, look for default location of
            # dumps directory
            else:
                dumps_dir = os.path.join("data", self.lang + '_dumps',
                                         self.date)
                # Look up dump files in default directory name
                if not os.path.exists(dumps_dir):
                    print(
                        "Default directory %s containing dump files not found."
                        % dumps_dir)
                    print("Program will exit now.")
                    sys.exit()

                else:
                    self.paths = glob.glob(
                        os.path.join(dumps_dir, '*pages-logging*.gz'))
                    if not self.paths:
                        self.paths = glob.glob(
                            os.path.join(dumps_dir, '*pages-logging*.xml'))
                        if not self.paths:
                            print(
                                "Directory %s does not contain any valid dump file."
                                % dumps_dir)
                            print("Program will exit now.")
                            sys.exit()

            print("Found pages-logging dump file to process.")
            print()
        if debug:
            print("paths: ", str(self.paths))
            print()

        # Create database if it does not exist
        # empty logging table otherwise
        if self.DB_exists():
            self.create_DB(complete=False)
        else:
            self.create_DB(complete=True)

        new_etl = LoggingETL(name="[ETL:PagesLogging-0]",
                             path=self.paths,
                             lang=self.lang,
                             log_fan=log_fan,
                             log_cache_size=log_cache_size,
                             db_name=self.db_name,
                             db_user=self.db_user,
                             db_passw=self.db_passw,
                             base_port=base_ports[0] + (30),
                             control_port=control_ports[0] + (30))
        print("ETL:Logging task for administrative records defined OK.")
        print("Proceeding with ETL workflow. This may take time...")
        print()
        # Extract, process and load information in local DB
        new_etl.start()
        # Wait for ETL line to finish
        new_etl.join()
        # TODO: logger; ETL step completed, proceeding with data
        # analysis and visualization
        print("ETL:Logging task finished for lang %s and date %s" %
              (self.lang, self.date))
        print()
        # Create primary keys for all tables
        # TODO: This must also be tracked by official logging module
        print("Now creating primary key indexes in database tables.")
        print("This may take a while...")
        print()
        db_pks = MySQLDB(host='localhost',
                         port=3306,
                         user=self.db_user,
                         passwd=self.db_passw,
                         db=self.db_name)
        db_pks.connect()
        db_pks.create_pks_logitem()
        db_pks.close()
Example #14
 def create_DB(self, complete=False):
     if complete:
         db_create = MySQLDB(host=self.host,
                             port=self.port,
                             user=self.db_user,
                             passwd=self.db_passw)
         db_create.connect()
         db_create.create_database(self.db_name)
         db_create.close()
     db_schema = MySQLDB(host=self.host,
                         port=self.port,
                         user=self.db_user,
                         passwd=self.db_passw,
                         db=self.db_name)
     db_schema.connect()
     db_schema.create_schema_logitem(engine=self.db_engine)
     db_schema.close()
Example #15
    def execute(self,
                page_fan,
                rev_fan,
                page_cache_size,
                rev_cache_size,
                mirror,
                download_files,
                base_ports,
                control_ports,
                dumps_dir=None,
                debug=False):
        """
        Run data retrieval and loading actions.
        Arguments:
            - page_fan = Number of workers to fan out page elements parsing
            - rev_fan = Number of workers to fan out rev elements parsing
            - mirror = Base URL of site hosting XML dumps
        """
        print("----------------------------------------------------------")
        print(("""Executing ETL:RevHistory on lang: {0} date: {1}""".format(
            self.lang, self.date)))
        print(("ETL lines = {0} page_fan = {1} rev_fan = {2}".format(
            self.etl_lines, page_fan, rev_fan)))
        print("Download files =", download_files)
        print("Start time is {0}".format(
            time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())))
        print("----------------------------------------------------------")
        print()
        if download_files:
            # TODO: Use proper logging module to track execution progress
            # Choose corresponding file downloader and etl wrapper
            print("Downloading new dump files from %s, for language %s" %
                  (mirror, self.lang))
            self.down = RevHistDownloader(mirror, self.lang, dumps_dir)
            # Download latest set of dump files
            self.paths, self.date = self.down.download(self.date)
            if not self.paths:
                print("Error: dump files with pages-logging info not found.")
                print("Program will exit now.")
                sys.exit()

            print("Retrieved dump files for lang %s, date: %s" %
                  (self.lang, self.date))
            print()

        else:
            print("Looking for revision-history dump file(s) in data dir")
            # Case of dumps folder provided explicitly
            if dumps_dir:
                # Allow specifying relative paths, as well
                abs_dumps_path = os.path.expanduser(dumps_dir)
                dumps_path = os.path.join(abs_dumps_path, self.lang + '_dumps',
                                          self.date)
                # Retrieve path to all available files to feed ETL lines
                if not os.path.exists(dumps_path):
                    print(
                        "No dump files will be downloaded and local folder with dump files not found."
                    )
                    print(
                        "Please, specify a valid path to local folder containing dump files."
                    )
                    print("Program will exit now.")
                    sys.exit()

                else:
                    # Attempt to find list of .7z or .xml files to be processed
                    self.paths = glob.glob(
                        os.path.join(dumps_path, '*pages-meta-history*.7z'))
                    if not self.paths:
                        self.paths = glob.glob(
                            os.path.join(dumps_path,
                                         '*pages-meta-history*.xml'))
                        if not self.paths:
                            print(
                                "Directory %s does not contain any valid dump file."
                                % dumps_path)
                            print("Program will exit now.")
                            sys.exit()
            # If not provided explicitly, look for default location of
            # dumps directory
            else:
                dumps_dir = os.path.join("data", self.lang + '_dumps',
                                         self.date)
                # Look up dump files in default directory name
                if not os.path.exists(dumps_dir):
                    print(
                        "Default directory %s containing dump files not found."
                        % dumps_dir)
                    print("Program will exit now.")
                    sys.exit()

                else:
                    self.paths = glob.glob(
                        os.path.join(dumps_dir, '*pages-meta-history*.7z'))
                    if not self.paths:
                        self.paths = glob.glob(
                            os.path.join(dumps_dir,
                                         '*pages-meta-history*.xml'))
                        if not self.paths:
                            print(
                                "Directory %s does not contain any valid dump file."
                                % dumps_dir)
                            print("Program will exit now.")
                            sys.exit()
            print("Found revision-history dump file(s) to process.")
            print()
        # Print list of file paths in debug mode
        if debug:
            print("paths: ", str(self.paths))
            print()

        # Create database
        # TODO: Empty corresponding tables if DB already exists
        # or let the user select behaviour with config argument
        if self.DB_exists():
            self.create_DB(complete=False)
        else:
            self.create_DB(complete=True)

        # First insert namespace info in DB
        dump = DumpFile(self.paths[0])
        db_schema = MySQLDB(host=self.host,
                            port=self.port,
                            user=self.db_user,
                            passwd=self.db_passw,
                            db=self.db_name)
        db_schema.connect()
        db_schema.insert_namespaces(nsdict=dump.get_namespaces())
        db_schema.close()

        # Complete the queue of paths to be processed and STOP flags for
        # each ETL subprocess
        paths_queue = mp.JoinableQueue()
        for path in self.paths:
            paths_queue.put(path)

        for x in range(self.etl_lines):
            paths_queue.put('STOP')

        for x in range(self.etl_lines):
            new_etl = RevisionHistoryETL(name="[ETL:RevHistory-%s]" % x,
                                         paths_queue=paths_queue,
                                         lang=self.lang,
                                         page_fan=page_fan,
                                         rev_fan=rev_fan,
                                         page_cache_size=page_cache_size,
                                         rev_cache_size=rev_cache_size,
                                         db_name=self.db_name,
                                         db_user=self.db_user,
                                         db_passw=self.db_passw,
                                         base_port=base_ports[x] + (20 * x),
                                         control_port=control_ports[x] +
                                         (20 * x))
            self.etl_list.append(new_etl)

        print("ETL:RevHistory task defined OK.")
        print("Proceeding with ETL workflows. This may take time...")
        print()
        # Extract, process and load information in local DB
        for etl in self.etl_list:
            etl.start()
            # Wait a second for new ETL process to start all subprocesses
            time.sleep(1)

        # Wait for ETL lines to finish
        for etl in self.etl_list:
            etl.join()

        # Insert user info after all ETL lines have finished
        # to ensure that all metadata are stored in Redis cache
        # regardless of the execution order
        data_dir = os.path.join(os.getcwd(), os.path.split(self.paths[0])[0])
        db_users = MySQLDB(host=self.host,
                           port=self.port,
                           user=self.db_user,
                           passwd=self.db_passw,
                           db=self.db_name)
        db_users.connect()
        users_file_to_db(con=db_users,
                         lang=self.lang,
                         log_file=os.path.join(data_dir, 'logs', 'users.log'),
                         tmp_dir=os.path.join(data_dir, 'tmp'))
        db_users.close()
        # TODO: logger; ETL step completed, proceeding with data
        # analysis and visualization
        print("ETL:RevHistory task finished for language %s and date %s" %
              (self.lang, self.date))
        print()
        # Create primary keys for all tables
        # TODO: This must also be tracked by main logging module
        print("Now creating primary key indexes in database tables.")
        print("This may take a while...")
        print()
        db_pks = MySQLDB(host='localhost',
                         port=3306,
                         user=self.db_user,
                         passwd=self.db_passw,
                         db=self.db_name)
        db_pks.connect()
        db_pks.create_pks_revhist()
        db_pks.close()