class FacadeWorker:
    def __init__(self, config, task=None):
        self.config = config
        logging.basicConfig(
            filename='worker_{}.log'.format(self.config['id'].split('.')[-1]),
            filemode='w', level=logging.INFO)
        logging.info('Worker (PID: {}) initializing...'.format(os.getpid()))

        self._task = task
        self._child = None
        self._queue = Queue()
        self._maintain_queue = Queue()
        self.cfg = Config()

        ### The real program starts here ###

        # Set up the database
        db_user = self.config['user']
        db_pass = self.config['password']
        db_name = self.config['database']
        db_host = self.config['host']
        db_port = self.config['port']

        db_user_people = self.config['user']
        db_pass_people = self.config['password']
        db_name_people = self.config['database']
        db_host_people = self.config['host']
        db_port_people = self.config['port']

        # Open a general-purpose connection (Config keeps its own references,
        # which commit_model() closes at the end of a run)
        db, cursor = self.cfg.database_connection(
            db_host, db_user, db_pass, db_name, db_port, False, False)

        # Open a connection for the people database
        db_people, cursor_people = self.cfg.database_connection(
            db_host_people, db_user_people, db_pass_people,
            db_name_people, db_port_people, True, False)

        # Check if the database is current and update it if necessary
        try:
            current_db = int(self.cfg.get_setting('database_version'))
        except Exception:
            # Catch databases which existed before database versioning
            current_db = -1

        # WHAT IS THE UPSTREAM_DB???
        # if current_db < upstream_db:
        #     print(("Current database version: %s\nUpstream database version %s\n" %
        #         (current_db, upstream_db)))
        #     self.cfg.update_db(current_db);

        self.commit_model()

    @property
    def task(self):
        """ Property that is returned when the worker's current task is referenced """
        return self._task

    @task.setter
    def task(self, value):
        """ Entry point for the broker to add a task to the queue.
        Adds this task to the queue and calls the method to process the queue.
        """
        rg_id = value['given']['repo_group_id']

        # Query all repos
        # repoUrlSQL = s.sql.text("""
        #     SELECT repo_id, repo_group_id, repo_git FROM repo WHERE repo_group_id = '{}'
        # """.format(rg_id))
        # rs = pd.read_sql(repoUrlSQL, self.db, params={})

        try:
            if value['job_type'] == "UPDATE":
                self._queue.put(CollectorTask(message_type='TASK', entry_info=value))
            elif value['job_type'] == "MAINTAIN":
                self._maintain_queue.put(CollectorTask(message_type='TASK', entry_info=value))
        except Exception as e:
            logging.error("error: {}".format(e))

        # NOTE: the original referenced an undefined `repo_id` here; the repo
        # group id from the task payload is the closest in-scope value.
        self._task = CollectorTask(message_type='TASK',
                                   entry_info={"task": value, "repo_id": rg_id})
        self.run()

    def cancel(self):
        """ Delete/cancel current task """
        self._task = None

    def run(self):
        """ Kicks off the processing of the queue if it is not already being processed.
        Gets run whenever a new task is added.
        """
        logging.info("Running...")
        if self._child is None:
            self._child = Process(target=self.collect, args=())
            self._child.start()

    def collect(self):
        """ Function to process each entry in the worker's task queue.
        Determines what action to take based off the message type.
        """
        while True:
            time.sleep(0.5)
            # UPDATE tasks take priority over MAINTAIN tasks
            if not self._queue.empty():
                message = self._queue.get()
                logging.info("Popped off message: {}".format(message.entry_info))
                self.working_on = "UPDATE"
            elif not self._maintain_queue.empty():
                message = self._maintain_queue.get()
                logging.info("Popped off message: {}".format(message.entry_info))
                self.working_on = "MAINTAIN"
            else:
                break

            if message.type == 'EXIT':
                break
            if message.type != 'TASK':
                raise ValueError(f'{message.type} is not a recognized task type')

            try:
                git_url = message.entry_info['task']['given']['git_url']
                self.query_issues({'git_url': git_url,
                                   'repo_id': message.entry_info['repo_id']})
            except Exception as e:
                logging.info("Worker ran into an error for task: {}\n".format(
                    message.entry_info['task']))
                logging.info("Error encountered: " + repr(e) + "\n")
                logging.info("Notifying broker and logging task failure in database...\n")

                message.entry_info['task']['worker_id'] = self.config['id']
                requests.post("http://{}:{}/api/unstable/task_error".format(
                    self.config['broker_host'], self.config['broker_port']),
                    json=message.entry_info['task'])

                # Add to history table
                task_history = {
                    "repo_id": message.entry_info['repo_id'],
                    "worker": self.config['id'],
                    "job_model": message.entry_info['task']['models'][0],
                    "oauth_id": self.config['zombie_id'],
                    "timestamp": datetime.datetime.now(),
                    "status": "Error",
                    "total_results": self.results_counter
                }
                self.helper_db.execute(self.history_table.update().where(
                    self.history_table.c.history_id == self.history_id).values(task_history))

                logging.info("Recorded job error for: " + str(message.entry_info['task']) + "\n")

                # Update job process table
                updated_job = {
                    "since_id_str": message.entry_info['repo_id'],
                    "last_count": self.results_counter,
                    "last_run": datetime.datetime.now(),
                    "analysis_state": 0
                }
                self.helper_db.execute(self.job_table.update().where(
                    self.job_table.c.job_model == message.entry_info['task']['models'][0]).values(updated_job))

                logging.info("Updated job process for model: " +
                             message.entry_info['task']['models'][0] + "\n")

                # Reset results counter for next task
                self.results_counter = 0

    def commit_model(self):
        # Figure out what we need to do
        limited_run = read_config("Facade", name="limited_run", default=0)
        delete_marked_repos = read_config("Facade", name="delete_marked_repos", default=0)
        pull_repos = read_config("Facade", name="pull_repos", default=0)
        clone_repos = read_config("Facade", name="clone_repos", default=1)
        check_updates = read_config("Facade", name="check_updates", default=0)
        force_updates = read_config("Facade", name="force_updates", default=0)
        run_analysis = read_config("Facade", name="run_analysis", default=0)
        force_analysis = read_config("Facade", name="force_analysis", default=0)
        nuke_stored_affiliations = read_config("Facade", name="nuke_stored_affiliations", default=0)
        fix_affiliations = read_config("Facade", name="fix_affiliations", default=1)
        force_invalidate_caches = read_config("Facade", name="force_invalidate_caches", default=0)
        rebuild_caches = read_config("Facade", name="rebuild_caches", default=1)
        # if abs((datetime.datetime.strptime(self.cfg.get_setting('aliases_processed')[:-3],
        #     '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(self.cfg.get_setting(
        #     'update_frequency')) else 0
        create_xlsx_summary_files = read_config("Facade", name="create_xlsx_summary_files", default=0)
        multithreaded = read_config("Facade", name="multithreaded", default=1)

        opts, args = getopt.getopt(sys.argv[1:], 'hdpcuUaAmnfIrx')

        for opt in opts:
            if opt[0] == '-h':
                print("\nfacade-worker.py does everything by default except invalidating caches\n"
                      "and forcing updates, unless invoked with one of the following options.\n"
                      "In those cases, it will only do what you have selected.\n\n"
                      "Options:\n"
                      "   -d  Delete marked repos\n"
                      "   -c  Run 'git clone' on new repos\n"
                      "   -u  Check if any repos should be marked for updating\n"
                      "   -U  Force all repos to be marked for updating\n"
                      "   -p  Run 'git pull' on repos\n"
                      "   -a  Analyze git repos\n"
                      "   -A  Force all repos to be analyzed\n"
                      "   -m  Disable multithreaded mode (but why?)\n"
                      "   -n  Nuke stored affiliations (if mappings modified by hand)\n"
                      "   -f  Fill empty affiliations\n"
                      "   -I  Invalidate caches\n"
                      "   -r  Rebuild unknown affiliation and web caches\n"
                      "   -x  Create Excel summary files\n\n")
                sys.exit(0)

            elif opt[0] == '-d':
                delete_marked_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: delete marked repos.')

            elif opt[0] == '-c':
                clone_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: clone new repos.')

            elif opt[0] == '-u':
                check_updates = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: checking for repo updates')

            elif opt[0] == '-U':
                force_updates = 1
                self.cfg.log_activity('Info', 'Option set: forcing repo updates')

            elif opt[0] == '-p':
                pull_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: update repos.')

            elif opt[0] == '-a':
                run_analysis = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: running analysis.')

            elif opt[0] == '-A':
                force_analysis = 1
                run_analysis = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: forcing analysis.')

            elif opt[0] == '-m':
                multithreaded = 0
                self.cfg.log_activity('Info', 'Option set: disabling multithreading.')

            elif opt[0] == '-n':
                nuke_stored_affiliations = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: nuking all affiliations')

            elif opt[0] == '-f':
                fix_affiliations = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: fixing affiliations.')

            elif opt[0] == '-I':
                force_invalidate_caches = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: Invalidate caches.')

            elif opt[0] == '-r':
                rebuild_caches = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: rebuilding caches.')

            elif opt[0] == '-x':
                create_xlsx_summary_files = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: creating Excel summary files.')

        # Get the location of the directory where git repos are stored
        repo_base_directory = self.cfg.repo_base_directory

        # Determine if it's safe to start the script
        current_status = self.cfg.get_setting('utility_status')

        if current_status != 'Idle':
            self.cfg.log_activity('Error', 'Something is already running, aborting maintenance '
                                  'and analysis.\nIt is unsafe to continue.')
            # sys.exit(1)

        if len(repo_base_directory) == 0:
            self.cfg.log_activity('Error', 'No base directory. It is unsafe to continue.')
            self.cfg.update_status('Failed: No base directory')
            sys.exit(1)

        # Begin working
        start_time = time.time()
        self.cfg.log_activity('Quiet', 'Running facade-worker')

        if not limited_run or (limited_run and delete_marked_repos):
            git_repo_cleanup(self.cfg)

        if not limited_run or (limited_run and clone_repos):
            git_repo_initialize(self.cfg)

        if not limited_run or (limited_run and check_updates):
            check_for_repo_updates(self.cfg)

        if force_updates:
            force_repo_updates(self.cfg)

        if not limited_run or (limited_run and pull_repos):
            git_repo_updates(self.cfg)

        if force_analysis:
            force_repo_analysis(self.cfg)

        if not limited_run or (limited_run and run_analysis):
            analysis(self.cfg, multithreaded)

        if nuke_stored_affiliations:
            nuke_affiliations(self.cfg)

        if not limited_run or (limited_run and fix_affiliations):
            fill_empty_affiliations(self.cfg)

        if force_invalidate_caches:
            invalidate_caches(self.cfg)

        if not limited_run or (limited_run and rebuild_caches):
            rebuild_unknown_affiliation_and_web_caches(self.cfg)

        if not limited_run or (limited_run and create_xlsx_summary_files):
            self.cfg.log_activity('Info', 'Creating summary Excel files')
            # from excel_generators import *
            self.cfg.log_activity('Info', 'Creating summary Excel files (complete)')

        # All done
        self.cfg.update_status('Idle')
        self.cfg.log_activity('Quiet', 'facade-worker.py completed')

        elapsed_time = time.time() - start_time

        print('\nCompleted in %s\n' % datetime.timedelta(seconds=int(elapsed_time)))

        self.cfg.cursor.close()
        self.cfg.cursor_people.close()
        self.cfg.db.close()
        self.cfg.db_people.close()
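
# --- Illustrative sketch; CollectorTask is not defined in this section ---
# The class above builds queue messages with CollectorTask(message_type=...,
# entry_info=...) and reads them back through message.type and
# message.entry_info. A minimal message class consistent with both usages
# might look like the following; the exact definition used upstream is an
# assumption here.
class CollectorTask:
    """Hypothetical queue message for the worker above."""

    def __init__(self, message_type, entry_info=None):
        # Stored as .type because the consumer reads message.type,
        # even though construction passes message_type=.
        self.type = message_type      # e.g. 'TASK' or 'EXIT'
        self.entry_info = entry_info  # task payload dict
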
class FacadeWorker(Worker):
    def __init__(self, config={}, task=None):
        worker_type = "facade_worker"

        # Define what this worker can be given and know how to interpret
        given = [['repo_group']]
        models = ['commits']

        # Define the tables needed to insert, update, or delete on
        data_tables = []
        operations_tables = ['worker_history', 'worker_job']

        # Run the general worker initialization
        super().__init__(worker_type, config, given, models, data_tables, operations_tables)

        # Facade-specific config
        self.cfg = Config(self.logger)

        # Define data collection info
        # self.tool_source = 'Facade Worker'
        # self.tool_version = '1.0.0'
        # self.data_source = 'Git Log'
        self.tool_source = '\'Facade Worker\''
        self.tool_version = '\'1.0.1\''
        self.data_source = '\'Git Log\''

    def initialize_database_connections(self):
        # Set up the database
        db_user = self.config['user_database']
        db_pass = self.config['password_database']
        db_name = self.config['name_database']
        db_host = self.config['host_database']
        db_port = self.config['port_database']

        # Open a general-purpose connection
        self.db, self.cursor = self.cfg.database_connection(
            db_host, db_user, db_pass, db_name, db_port, False, False)

        # Open a connection for the people database
        self.db_people, self.cursor_people = self.cfg.database_connection(
            db_host, db_user, db_pass, db_name, db_port, True, False)

        # Check if the database is current and update it if necessary
        try:
            self.current_db = int(self.cfg.get_setting('database_version'))
        except Exception:
            # Catch databases which existed before database versioning
            self.current_db = -1

    def collect(self):
        """ Function to process each entry in the worker's task queue.
        Determines what action to take based off the message type.
        """
        # Need to initialize logging again in the child process because of multiprocessing
        self.initialize_logging()
        self.logger.info("Starting data collection process\n")
        self.initialize_database_connections()

        while True:
            if not self._queue.empty():
                message = self._queue.get()  # Get the task off our MP queue
            else:
                break
            self.logger.info("Popped off message: {}\n".format(str(message)))

            if message['job_type'] == 'STOP':
                break

            # If task is not a valid job type
            if message['job_type'] != 'MAINTAIN' and message['job_type'] != 'UPDATE':
                raise ValueError('{} is not a recognized task type'.format(message['job_type']))

            try:
                self.commits_model(message)
            except Exception as e:
                self.logger.error(e)
                raise e
            # Process a single task per collect() run
            break

    def commits_model(self, message):
        # Figure out what we need to do
        limited_run = self.augur_config.get_value("Facade", "limited_run")
        delete_marked_repos = self.augur_config.get_value("Facade", "delete_marked_repos")
        pull_repos = self.augur_config.get_value("Facade", "pull_repos")
        clone_repos = self.augur_config.get_value("Facade", "clone_repos")
        check_updates = self.augur_config.get_value("Facade", "check_updates")
        force_updates = self.augur_config.get_value("Facade", "force_updates")
        run_analysis = self.augur_config.get_value("Facade", "run_analysis")
        force_analysis = self.augur_config.get_value("Facade", "force_analysis")
        nuke_stored_affiliations = self.augur_config.get_value("Facade", "nuke_stored_affiliations")
        fix_affiliations = self.augur_config.get_value("Facade", "fix_affiliations")
        force_invalidate_caches = self.augur_config.get_value("Facade", "force_invalidate_caches")
        rebuild_caches = self.augur_config.get_value("Facade", "rebuild_caches")
        # if abs((datetime.datetime.strptime(self.cfg.get_setting('aliases_processed')[:-3],
        #     '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(self.cfg.get_setting(
        #     'update_frequency')) else 0
        create_xlsx_summary_files = self.augur_config.get_value("Facade", "create_xlsx_summary_files")
        multithreaded = self.augur_config.get_value("Facade", "multithreaded")

        opts, args = getopt.getopt(sys.argv[1:], 'hdpcuUaAmnfIrx')

        for opt in opts:
            if opt[0] == '-h':
                print("\nfacade-worker.py does everything by default except invalidating caches\n"
                      "and forcing updates, unless invoked with one of the following options.\n"
                      "In those cases, it will only do what you have selected.\n\n"
                      "Options:\n"
                      "   -d  Delete marked repos\n"
                      "   -c  Run 'git clone' on new repos\n"
                      "   -u  Check if any repos should be marked for updating\n"
                      "   -U  Force all repos to be marked for updating\n"
                      "   -p  Run 'git pull' on repos\n"
                      "   -a  Analyze git repos\n"
                      "   -A  Force all repos to be analyzed\n"
                      "   -m  Disable multithreaded mode (but why?)\n"
                      "   -n  Nuke stored affiliations (if mappings modified by hand)\n"
                      "   -f  Fill empty affiliations\n"
                      "   -I  Invalidate caches\n"
                      "   -r  Rebuild unknown affiliation and web caches\n"
                      "   -x  Create Excel summary files\n\n")
                sys.exit(0)

            elif opt[0] == '-d':
                delete_marked_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: delete marked repos.')

            elif opt[0] == '-c':
                clone_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: clone new repos.')

            elif opt[0] == '-u':
                check_updates = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: checking for repo updates')

            elif opt[0] == '-U':
                force_updates = 1
                self.cfg.log_activity('Info', 'Option set: forcing repo updates')

            elif opt[0] == '-p':
                pull_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: update repos.')

            elif opt[0] == '-a':
                run_analysis = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: running analysis.')

            elif opt[0] == '-A':
                force_analysis = 1
                run_analysis = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: forcing analysis.')

            elif opt[0] == '-m':
                multithreaded = 0
                self.cfg.log_activity('Info', 'Option set: disabling multithreading.')

            elif opt[0] == '-n':
                nuke_stored_affiliations = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: nuking all affiliations')

            elif opt[0] == '-f':
                fix_affiliations = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: fixing affiliations.')

            elif opt[0] == '-I':
                force_invalidate_caches = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: Invalidate caches.')

            elif opt[0] == '-r':
                rebuild_caches = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: rebuilding caches.')

            elif opt[0] == '-x':
                create_xlsx_summary_files = 1
                limited_run = 1
                self.cfg.log_activity('Info', 'Option set: creating Excel summary files.')

        # Get the location of the directory where git repos are stored
        repo_base_directory = self.cfg.repo_base_directory

        # Determine if it's safe to start the script
        current_status = self.cfg.get_setting('utility_status')

        if current_status != 'Idle':
            self.cfg.log_activity('Error', 'Something is already running, aborting maintenance '
                                  'and analysis.\nIt is unsafe to continue.')
            # sys.exit(1)

        if len(repo_base_directory) == 0:
            self.cfg.log_activity('Error', 'No base directory. It is unsafe to continue.')
            self.cfg.update_status('Failed: No base directory')
            sys.exit(1)

        # Begin working
        start_time = time.time()
        self.cfg.log_activity('Quiet', 'Running facade-worker')

        if not limited_run or (limited_run and delete_marked_repos):
            git_repo_cleanup(self.cfg)

        if not limited_run or (limited_run and clone_repos):
            git_repo_initialize(self.cfg)

        if not limited_run or (limited_run and check_updates):
            check_for_repo_updates(self.cfg)

        if force_updates:
            force_repo_updates(self.cfg)

        if not limited_run or (limited_run and pull_repos):
            git_repo_updates(self.cfg)

        if force_analysis:
            force_repo_analysis(self.cfg)

        if not limited_run or (limited_run and run_analysis):
            analysis(self.cfg, multithreaded)

        if nuke_stored_affiliations:
            nuke_affiliations(self.cfg)

        if not limited_run or (limited_run and fix_affiliations):
            fill_empty_affiliations(self.cfg)

        if force_invalidate_caches:
            invalidate_caches(self.cfg)

        if not limited_run or (limited_run and rebuild_caches):
            rebuild_unknown_affiliation_and_web_caches(self.cfg)

        if not limited_run or (limited_run and create_xlsx_summary_files):
            self.cfg.log_activity('Info', 'Creating summary Excel files')
            # from excel_generators import *
            self.cfg.log_activity('Info', 'Creating summary Excel files (complete)')

        # All done
        self.cfg.update_status('Idle')
        self.cfg.log_activity('Quiet', 'facade-worker.py completed')

        elapsed_time = time.time() - start_time

        print('\nCompleted in %s\n' % datetime.timedelta(seconds=int(elapsed_time)))

        self.cfg.cursor.close()
        self.cfg.cursor_people.close()
        self.cfg.db.close()
        self.cfg.db_people.close()
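
# --- Illustrative "Facade" configuration section (assumed shape) ---
# commits_model() above reads every flag below via
# self.augur_config.get_value("Facade", <name>). The key names come straight
# from those calls; the values mirror the defaults the older class passed to
# read_config() and are only a plausible starting point, not a canonical
# schema.
FACADE_CONFIG_EXAMPLE = {
    "Facade": {
        "limited_run": 0,
        "delete_marked_repos": 0,
        "pull_repos": 0,
        "clone_repos": 1,
        "check_updates": 0,
        "force_updates": 0,
        "run_analysis": 0,
        "force_analysis": 0,
        "nuke_stored_affiliations": 0,
        "fix_affiliations": 1,
        "force_invalidate_caches": 0,
        "rebuild_caches": 1,
        "create_xlsx_summary_files": 0,
        "multithreaded": 1,
    }
}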