def __init__(self, logger): self.repos_processed = 0 self.upstream_db = 7 self.cursor = None self.cursor_people = None self.logger = logger self.db = None self.db_people = None worker_options = read_config("Workers", "facade_worker", None, None) if 'repo_directory' in worker_options: self.repo_base_directory = worker_options['repo_directory'] else: self.log_activity( 'Error', "Please specify a \'repo_directory\' parameter" " in your \'Workers\' -> \'facade_worker\' object in your config " "to the directory in which you want to clone repos. Exiting..." ) sys.exit(1) self.tool_source = '\'Facade Worker\'' self.tool_version = '\'1.0.1\'' self.data_source = '\'Git Log\'' # Figure out how much we're going to log logging.basicConfig(filename='worker_{}.log'.format( worker_options['port']), filemode='w', level=logging.INFO) self.log_level = None #self.get_setting('log_level')
def main(augur_url, host, port): """ Declares singular worker and creates the server and flask app that it will be running on """ app = Flask(__name__) #load credentials broker_host = read_config("Server", "host", "AUGUR_HOST", "0.0.0.0") broker_port = read_config("Server", "port", "AUGUR_PORT", 5000) database_host = read_config('Database', 'host', 'AUGUR_DB_HOST', 'host') worker_info = read_config('Workers', 'repo_info_worker', None, None) worker_port = worker_info['port'] if 'port' in worker_info else port while True: try: r = requests.get("http://{}:{}/AUGWOP/heartbeat".format(host, worker_port)).json() if 'status' in r: if r['status'] == 'alive': worker_port += 1 except: break logging.basicConfig(filename='worker_{}.log'.format(worker_port), filemode='w', level=logging.INFO) config = { 'id': 'com.augurlabs.core.template_worker.{}'.format(worker_port), 'location': 'http://{}:{}'.format(read_config('Server', 'host', 'AUGUR_HOST', 'localhost'),worker_port) } #create instance of the worker app.template_worker = TemplateWorker(config) # declares the worker that will be running on this server with specified config create_server(app, None) logging.info("Starting Flask App with pid: " + str(os.getpid()) + "...") app.run(debug=app.debug, host=host, port=worker_port) if app.template_worker._child is not None: app.template_worker._child.terminate() try: requests.post('http://{}:{}/api/unstable/workers/remove'.format(server['host'],server['port']), json={"id": config['id']}) except: pass logging.info("Killing Flask App: " + str(os.getpid())) os.kill(os.getpid(), 9)
def test_read_config_no_exception(): test_config = default_config base_dir = os.path.dirname(os.path.dirname(__file__)) print(base_dir) with open(config_path, "w") as f: json.dump(test_config, f) db_name = read_config('Database', 'user', 'AUGUR_DB_USER', 'augur', config_file_path=config_path) assert db_name == "augur"
def log_activity(self, level, status): # Log an activity based upon urgency and user's preference. If the log level is # "Debug", then just print it and don't save it in the database. log_options = ('Error', 'Quiet', 'Info', 'Verbose', 'Debug') self.logger.info("* %s\n" % status) if self.log_level == 'Debug' and level == 'Debug': return #if log_options.index(level) <= log_options.index(self.log_level): query = ("INSERT INTO utility_log (level,status) VALUES (%s,%s)") try: self.cursor.execute(query, (level, status)) self.db.commit() except Exception as e: self.logger.info('Error encountered: {}\n'.format(e)) # Set up the database db_user = read_config('Database', 'user', 'AUGUR_DB_USER', 'augur') db_pass = read_config('Database', 'password', 'AUGUR_DB_PASSWORD', 'augur') db_name = read_config('Database', 'name', 'AUGUR_DB_NAME', 'augur') db_host = read_config('Database', 'host', 'AUGUR_DB_HOST', 'localhost') db_port = read_config('Database', 'port', 'AUGUR_DB_PORT', 5432) db_user_people = db_user db_pass_people = db_pass db_name_people = db_name db_host_people = db_host db_port_people = db_port # Open a general-purpose connection db, cursor = self.database_connection(db_host, db_user, db_pass, db_name, db_port, False, False) self.cursor.execute(query, (level, status)) self.db.commit()
def __init__(self, config={}, given=[], models=[], data_tables=[], operations_tables=[]): self._task = None # task currently being worked on (dict) self._child = None # process of currently running task (multiprocessing process) self._queue = Queue( ) # tasks stored here 1 at a time (in a mp queue so it can translate across multiple processes) # count of tuples inserted in the database (to store stats for each task in op tables) self.results_counter = 0 # if we are finishing a previous task, certain operations work differenty self.finishing_task = False # Update config with options that are general and not specific to any worker self.config = config self.config.update({ 'port_broker': read_config('Server', 'port', 'AUGUR_PORT', 5000), 'host_broker': read_config('Server', 'host', 'AUGUR_HOST', '0.0.0.0'), 'host_database': read_config('Database', 'host', 'AUGUR_DB_HOST', 'host'), 'port_database': read_config('Database', 'port', 'AUGUR_DB_PORT', 'port'), 'user_database': read_config('Database', 'user', 'AUGUR_DB_USER', 'user'), 'name_database': read_config('Database', 'name', 'AUGUR_DB_NAME', 'database'), 'password_database': read_config('Database', 'password', 'AUGUR_DB_PASSWORD', 'password') }) # Format the port the worker is running on to the name of the # log file so we can tell multiple instances apart logging.basicConfig(filename='worker_{}.log'.format( self.config['id'].split('.')[len(self.config['id'].split('.')) - 1]), filemode='w', level=logging.INFO) logging.info('Worker (PID: {}) initializing...'.format(str( os.getpid()))) self.given = given self.models = models self.specs = { 'id': self.config['id'], # what the broker knows this worker as 'location': self.config[ 'location'], # host + port worker is running on (so broker can send tasks here) 'qualifications': [{ 'given': self.given, # type of repo this worker can be given as a task 'models': self.models # models this worker can fill for a repo as a task }], 'config': self.config } DB_STR = 'postgresql://{}:{}@{}:{}/{}'.format( self.config['user_database'], self.config['password_database'], self.config['host_database'], self.config['port_database'], self.config['name_database']) # Create an sqlalchemy engine for both database schemas logging.info("Making database connections... {}".format(DB_STR)) db_schema = 'augur_data' self.db = s.create_engine( DB_STR, poolclass=s.pool.NullPool, connect_args={'options': '-csearch_path={}'.format(db_schema)}) helper_schema = 'augur_operations' self.helper_db = s.create_engine( DB_STR, poolclass=s.pool.NullPool, connect_args={'options': '-csearch_path={}'.format(helper_schema)}) metadata = MetaData() helper_metadata = MetaData() # Reflect only the tables we will use for each schema's metadata object metadata.reflect(self.db, only=data_tables) helper_metadata.reflect(self.helper_db, only=operations_tables) Base = automap_base(metadata=metadata) HelperBase = automap_base(metadata=helper_metadata) Base.prepare() HelperBase.prepare() # So we can access all our tables when inserting, updating, etc for table in data_tables: setattr(self, '{}_table'.format(table), Base.classes[table].__table__) try: logging.info(HelperBase.classes.keys()) except: pass for table in operations_tables: try: setattr(self, '{}_table'.format(table), HelperBase.classes[table].__table__) except Exception as e: logging.info( "Error setting attribute for table: {} : {}".format( table, e)) # Increment so we are ready to insert the 'next one' of each of these most recent ids self.history_id = self.get_max_id( 'worker_history', 'history_id', operations_table=True) + 1 # Organize different api keys/oauths available if 'gh_api_key' in self.config: self.init_oauths() # Send broker hello message self.connect_to_broker()
def analyze_commit(cfg, repo_id, repo_loc, commit, multithreaded): # This function analyzes a given commit, counting the additions, removals, and # whitespace changes. It collects all of the metadata about the commit, and # stashes it in the database. A new database connection is opened each time in # case we are running in multithreaded mode, since MySQL cursors are not # currently threadsafe. ### Local helper functions ### def check_swapped_emails(name, email): # Sometimes people mix up their name and email in their git settings if name.find('@') >= 0 and email.find('@') == -1: cfg.log_activity('Debug', 'Found swapped email/name: %s/%s' % (email, name)) return email, name else: return name, email def strip_extra_amp(email): # Some repos have multiple ampersands, which really messes up domain pattern # matching. This extra info is not used, so we discard it. if email.count('@') > 1: cfg.log_activity('Debug', 'Found extra @: %s' % email) return email[:email.find('@', email.find('@') + 1)] else: return email def discover_alias(email): # Match aliases with their canonical email fetch_canonical = ("SELECT canonical_email " "FROM contributors_aliases " "WHERE alias_email=%s " "AND cntrb_active = 1") cursor_people_local.execute(fetch_canonical, (email, )) db_people_local.commit() canonical = list(cursor_people_local) if canonical: for email in canonical: return email[0] else: return email def update_contributors(author_em, committer_em, auth_nm, cmtr_nm): #Check if an email already exists in the database for either the committer or the author #There is a committer and an author on each commit, but only one record in the contributor table (ideally) # For each email address. So, for each email address, we need to check if it exists in the contributor # Table. ## Refactor Facade for Contributors here: Note that we need to map to some kind of alias as defined by Gabe. ## Sean Goggins, February 5, 2021 ## %TODO def contributor_exists(some_email): #SQL String to insert values into the contributors table some_email = some_email.replace("'", "") email_check = ( """SELECT cntrb_email, tool_source, tool_version, data_source FROM contributors WHERE cntrb_email = '{}'""" .format(some_email)) cursor_local.execute(email_check) if cursor_local.fetchone() is not None: db_local.commit() emails_to_add = some_email return True else: return False #SQL to update the contributors table cntrb = ( "INSERT INTO contributors " "(cntrb_email,cntrb_canonical,cntrb_full_name,tool_source, tool_version, data_source) " "VALUES (%s,%s,%s,'FacadeAugur','0.0.1','git_repository')") ## Logic block for updating contributors. if contributor_exists(author_em): cfg.log_activity( 'Info', 'Author contributor record already exists: {}'.format( author_em)) else: # add a contributor record for the author cursor_local.execute( cntrb, (author_em, discover_alias(author_em), str(auth_nm))) db_local.commit() cfg.log_activity( 'Info', 'Stored author contributor with email: {}'.format(author_em)) if contributor_exists(committer_em): cfg.log_activity( 'Info', 'Author contributor record already exists: {}'.format( committer_em)) else: #add a contributor record for the committer cursor_local.execute( cntrb, (committer_em, discover_alias(committer_em), str(cmtr_nm))) db_local.commit() cfg.log_activity( 'Info', 'Stored committer contributor with email: {}'.format( committer_em)) def store_commit(repos_id, commit, filename, author_name, author_email, author_date, author_timestamp, committer_name, committer_email, committer_date, committer_timestamp, added, removed, whitespace): # Fix some common issues in git commit logs and store data. # Sometimes git is misconfigured and name/email get swapped author_name, author_email = check_swapped_emails( author_name, author_email) committer_name, committer_email = check_swapped_emails( committer_name, committer_email) # Some systems append extra info after a second @ author_email = strip_extra_amp(author_email) committer_email = strip_extra_amp(committer_email) store = ("""INSERT INTO commits (repo_id,cmt_commit_hash,cmt_filename, cmt_author_name,cmt_author_raw_email,cmt_author_email,cmt_author_date,cmt_author_timestamp, cmt_committer_name,cmt_committer_raw_email,cmt_committer_email,cmt_committer_date,cmt_committer_timestamp, cmt_added,cmt_removed,cmt_whitespace, cmt_date_attempted, tool_source, tool_version, data_source) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""") try: cursor_local.execute(store, ( repos_id, str(commit), filename, str(author_name), author_email, discover_alias(author_email), author_date, author_timestamp, committer_name, committer_email, discover_alias(committer_email), committer_date, committer_timestamp, added, removed, whitespace, committer_date, cfg.tool_source, cfg.tool_version, cfg.data_source, )) db_local.commit() except: try: cfg.log_activity( 'Info', """Timezone error caught, inspect values: INSERT INTO commits (repo_id,cmt_commit_hash,cmt_filename, cmt_author_name,cmt_author_raw_email,cmt_author_email,cmt_author_date,cmt_author_timestamp, cmt_committer_name,cmt_committer_raw_email,cmt_committer_email,cmt_committer_date,cmt_committer_timestamp, cmt_added,cmt_removed,cmt_whitespace, cmt_date_attempted, tool_source, tool_version, data_source) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""". format(repos_id, str(commit), filename, str(author_name), author_email, discover_alias(author_email), author_date, author_timestamp, committer_name, committer_email, discover_alias(committer_email), committer_date, committer_timestamp, added, removed, whitespace, committer_date, cfg.tool_source, cfg.tool_version, cfg.data_source)) except: cfg.log_activity( 'Info', 'Something wrong in error log for timezone error') cfg.log_activity('Debug', 'Stored commit: %s' % commit) # Check if email already exists in db # email_check = ("""SELECT cntrb_email, tool_source, tool_version, data_source # FROM contributors WHERE cntrb_email = {augur_email} OR cntrb_email = {committer_email}}""") ## Commented out so as to not update contributors ## sean: 11/6/2019 ## Goal: Address with the contributors model worker # try: # update_contributors(author_email, committer_email, author_name, committer_name) # except Exception: #print(e) # cfg.log_activity('Info', str(traceback.print_exc())) ### The real function starts here ### header = True filename = '' filename = '' added = 0 removed = 0 whitespace = 0 db_user = read_config('Database', 'user', 'AUGUR_DB_USER', 'augur') db_pass = read_config('Database', 'password', 'AUGUR_DB_PASSWORD', 'augur') db_name = read_config('Database', 'name', 'AUGUR_DB_NAME', 'augur') db_host = read_config('Database', 'host', 'AUGUR_DB_HOST', 'localhost') db_port = read_config('Database', 'port', 'AUGUR_DB_PORT', 5432) db_user_people = db_user db_pass_people = db_pass db_name_people = db_name db_host_people = db_host db_port_people = db_port # Set up new threadsafe database connections if multithreading. Otherwise # use the gloabl database connections so we don't incur a performance # penalty. if multithreaded: db_local, cursor_local = cfg.database_connection( db_host, db_user, db_pass, db_name, db_port, False, True) db_people_local, cursor_people_local = cfg.database_connection( db_host_people, db_user_people, db_pass_people, db_name_people, db_port_people, True, True) else: db_local = cfg.db cursor_local = cfg.cursor db_people_local = cfg.db_people cursor_people_local = cfg.cursor_people # Read the git log git_log = subprocess.Popen([ "git --git-dir %s log -p -M %s -n1 " "--pretty=format:'" "author_name: %%an%%nauthor_email: %%ae%%nauthor_date:%%ai%%n" "committer_name: %%cn%%ncommitter_email: %%ce%%ncommitter_date: %%ci%%n" "parents: %%p%%nEndPatch' " % (repo_loc, commit) ], stdout=subprocess.PIPE, shell=True) ## # Stash the commit we're going to analyze so we can back it out if something # goes wrong later. store_working_commit = ("INSERT INTO working_commits " "(repos_id,working_commit) VALUES (%s,%s)") cursor_local.execute(store_working_commit, (repo_id, commit)) db_local.commit() cfg.log_activity('Debug', 'Stored working commit and analyzing : %s' % commit) for line in git_log.stdout.read().decode("utf-8", errors="ignore").split( os.linesep): if len(line) > 0: if line.find('author_name:') == 0: author_name = line[13:] continue if line.find('author_email:') == 0: author_email = line[14:] continue if line.find('author_date:') == 0: author_date = line[12:22] author_timestamp = line[12:] continue if line.find('committer_name:') == 0: committer_name = line[16:] continue if line.find('committer_email:') == 0: committer_email = line[17:] continue if line.find('committer_date:') == 0: committer_date = line[16:26] committer_timestamp = line[16:] continue if line.find('parents:') == 0: if len(line[9:].split(' ')) == 2: # We found a merge commit, which won't have a filename filename = '(Merge commit)' added = 0 removed = 0 whitespace = 0 continue if line.find('--- a/') == 0: if filename == '(Deleted) ': filename = filename + line[6:] continue if line.find('+++ b/') == 0: if not filename.find('(Deleted) ') == 0: filename = line[6:] continue if line.find('rename to ') == 0: filename = line[10:] continue if line.find('deleted file ') == 0: filename = '(Deleted) ' continue if line.find('diff --git') == 0: # Git only displays the beginning of a file in a patch, not # the end. We need some kludgery to discern where one starts # and one ends. This is the last line always separating # files in commits. But we only want to do it for the second # time onward, since the first time we hit this line it'll be # right after parsing the header and there won't be any useful # information contained in it. if not header: store_commit(repo_id, commit, filename, author_name, author_email, author_date, author_timestamp, committer_name, committer_email, committer_date, committer_timestamp, added, removed, whitespace) header = False # Reset stats and prepare for the next section whitespaceCheck = [] resetRemovals = True filename = '' added = 0 removed = 0 whitespace = 0 continue # Count additions and removals and look for whitespace changes if not header: if line[0] == '+': # First check if this is a whitespace change if len(line.strip()) == 1: # Line with zero length whitespace += 1 else: # Compare against removals, detect whitespace changes whitespaceChange = False for check in whitespaceCheck: # Mark matches of non-trivial length if line[1:].strip() == check and len( line[1:].strip()) > 8: whitespaceChange = True if whitespaceChange: # One removal was whitespace, back it out removed -= 1 whitespace += 1 # Remove the matched line whitespaceCheck.remove(check) else: # Did not trigger whitespace criteria added += 1 # Once we hit an addition, next removal line will be new. # At that point, start a new collection for checking. resetRemovals = True if line[0] == '-': removed += 1 if resetRemovals: whitespaceCheck = [] resetRemovals = False # Store the line to check next add lines for a match whitespaceCheck.append(line[1:].strip()) # Store the last stats from the git log store_commit(repo_id, commit, filename, author_name, author_email, author_date, author_timestamp, committer_name, committer_email, committer_date, committer_timestamp, added, removed, whitespace) # Remove the working commit. try: remove_commit = ("DELETE FROM working_commits " "WHERE repos_id = %s AND working_commit = %s") cursor_local.execute(remove_commit, (repo_id, commit)) db_local.commit() cfg.log_activity('Debug', 'Completed and removed working commit: %s' % commit) except: cfg.log_activity('Info', 'Working Commit: %s' % commit) # If multithreading, clean up the local database if multithreaded: cursor_local.close() cursor_people_local.close() db_local.close() db_people_local.close()
def test_read_config_exception(): with pytest.raises(AttributeError): db_name = read_config('Server', 'username')