def timestamp_to_datetime(timestamp):
    """
    Convert a string timestamp to a datetime object.

    :param str timestamp: a generic or ISO-8601 timestamp
    :return: datetime object of the timestamp
    :rtype: datetime.datetime
    :raises ValueError: if the timestamp is an unsupported or invalid format
    """
    log.debug('Trying to parse the timestamp "{0}"'.format(timestamp))
    error_msg = 'The timestamp "{0}" is an invalid format'.format(timestamp)
    combinations = (
        (r'^(?P<datetime>\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})(?:\.\d+)?$',
         '%Y-%m-%d %H:%M:%S'),
        (r'^(?P<datetime>\d{4}-\d{1,2}-\d{1,2})$', '%Y-%m-%d'),
        # ISO 8601 format
        (r'^(?P<datetime>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.\d+)?(?:Z|[-+]00(?::00)?)?$',
         '%Y-%m-%dT%H:%M:%S'))
    for combination in combinations:
        regex_match = re.match(combination[0], timestamp)
        if regex_match:
            try:
                return datetime.strptime(regex_match.group('datetime'), combination[1])
            except ValueError:
                # In case the user asked for an unrealistic date like "2020-99-99"
                raise ValueError(error_msg)

    raise ValueError(error_msg)

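
# A minimal usage sketch (hypothetical input values) of the formats the regexes above accept;
# any other shape, e.g. '04/03/2019', raises ValueError with the message built above:
#
#   timestamp_to_datetime('2019-04-03 13:08:02')      # -> datetime(2019, 4, 3, 13, 8, 2)
#   timestamp_to_datetime('2019-04-03')               # -> datetime(2019, 4, 3, 0, 0)
#   timestamp_to_datetime('2019-04-03T13:08:02.123Z') # -> datetime(2019, 4, 3, 13, 8, 2)
#
# Note that the optional fractional seconds and UTC offset are matched but discarded.
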
def get_connection(self, db_name, force_new=False, retry=None):
    """
    Return an existing psycopg2 connection and establish it if needed.

    :param str db_name: the database name to get a connection to
    :kwarg bool force_new: forces a new database connection even if one already exists
    :kwarg int retry: the number of times to retry a failed connection. If this is not set,
        then the Teiid connection attempt will be repeated until it is successful.
    :return: a connection to Teiid
    :rtype: psycopg2 connection
    """
    if not force_new and db_name in self._connections:
        return self._connections[db_name]

    if retry is not None and retry < 1:
        raise ValueError('The retry keyword must contain a value greater than 0')

    log.debug('Connecting to Teiid host {0}:{1}'.format(self.host, self.port))
    attempts = 0
    while True:
        attempts += 1
        try:
            conn = psycopg2.connect(
                database=db_name,
                host=self.host,
                port=str(self.port),
                user=self.username,
                password=self.password,
                connect_timeout=300
            )
            break
        except psycopg2.OperationalError as e:
            if retry and attempts > retry:
                raise
            else:
                log.exception(e)
                log.warning(
                    'The Teiid connection failed on attempt {0}. Sleeping for 60 '
                    'seconds.'.format(attempts))
                sleep(60)

    # Teiid does not support setting this value at all, and unless we specify
    # ISOLATION_LEVEL_AUTOCOMMIT (zero), psycopg2 will send a SET command that the
    # Teiid server doesn't understand.
    conn.set_isolation_level(0)
    self._connections[db_name] = conn
    return conn

def run(self, since=None, until=None):
    """
    Run the dist-git scraper.

    :param str since: a datetime to start scraping data from
    :param str until: a datetime to scrape data until
    """
    log.info('Starting initial load of dist-git commits')
    if since is None:
        start_date = self.default_since
    else:
        start_date = timestamp_to_date(since)
    if until is None:
        end_date = self.default_until
    else:
        end_date = timestamp_to_date(until)
    results = self.get_distgit_data(start_date, end_date)
    total_results = len(results)
    log.info('Successfully fetched {0} results from Teiid'.format(total_results))
    # Overwrite results with the formatted results so we don't have to store both in RAM
    results = list(self._get_result_chunks(results))

    # Upload the results to Neo4j using multiprocessing to process chunks of results. We
    # don't use a pool so that each process is not reused and its RAM is returned to the OS
    # when it exits. This works around a memory leak in one of the libraries used that
    # couldn't be tracked down.
    procs = []
    concurrent_procs = 2
    for i, result in enumerate(results):
        # Only check if we've reached the process limit after it's technically possible
        if i >= concurrent_procs:
            active_procs = [_proc for _proc in procs if _proc.is_alive()]
            if len(active_procs) >= concurrent_procs:
                log.debug(
                    'There are already {0} processes running. Will wait until one of '
                    'them completes.'.format(len(active_procs)))
                active_procs[0].join()

        proc = Process(target=self._update_neo4j,
                       args=(neomodel_config.DATABASE_URL, total_results, result))
        proc.start()
        procs.append(proc)

    for proc in procs:
        # Wait for all the processes to finish
        proc.join()

    log.info('Initial load of dist-git commits complete!')

def is_user_authorized(username, employee_type):
    """
    Verify the user is authorized to access the application.

    :param str username: the username from the user's token
    :param str employee_type: the employee type from the user's token
    :return: a boolean that determines if the user is authorized
    :rtype: bool
    """
    employee_types = current_app.config.get('EMPLOYEE_TYPES', [])
    if employee_type in employee_types:
        log.debug('The user %s is an employee', username)
        return True

    ldap_group_dn = current_app.config.get('LDAP_EXCEPTIONS_GROUP_DN')
    if ldap_group_dn and username in _get_exception_users():
        log.debug('The user %s is not considered an employee but is an exception', username)
        return True

    return False

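
# A hypothetical Flask configuration sketch showing what is_user_authorized relies on. The
# key names are the ones read above; the values are made up for illustration:
#
#   app.config['EMPLOYEE_TYPES'] = ['Employee', 'Contractor']
#   app.config['LDAP_EXCEPTIONS_GROUP_DN'] = 'cn=estuary-exceptions,ou=groups,dc=example,dc=com'
#
# With that in place, is_user_authorized('jdoe', 'Employee') returns True immediately, while
# a user with a non-employee type is only authorized if _get_exception_users() contains the
# username.
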
def query_api_and_update_neo4j(self):
    """
    Scrape the Freshmaker API and upload the data to Neo4j.
    """
    # Initialize the session and URL
    session = retry_session()
    fm_url = self.freshmaker_url
    while True:
        log.debug('Querying {0}'.format(fm_url))
        try:
            rv_json = session.get(fm_url, timeout=60).json()
        except ConnectionError:
            # TODO: Remove this once FACTORY-3955 is resolved
            log.error(
                'The connection to Freshmaker at %s failed. Skipping the rest of the scraper.',
                fm_url,
            )
            break

        for fm_event in rv_json['items']:
            try:
                int(fm_event['search_key'])
            except ValueError:
                # Skip Freshmaker Events that don't have the search_key as the Advisory ID
                continue

            log.debug('Creating FreshmakerEvent {0}'.format(fm_event['id']))
            event_params = dict(
                id_=fm_event['id'],
                event_type_id=fm_event['event_type_id'],
                message_id=fm_event['message_id'],
                state=fm_event['state'],
                state_name=fm_event['state_name'],
                state_reason=fm_event['state_reason'],
                url=fm_event['url']
            )
            if fm_event.get('time_created'):
                event_params['time_created'] = timestamp_to_datetime(fm_event['time_created'])
            if fm_event.get('time_done'):
                event_params['time_done'] = timestamp_to_datetime(fm_event['time_done'])
            event = FreshmakerEvent.create_or_update(event_params)[0]

            log.debug('Creating Advisory {0}'.format(fm_event['search_key']))
            advisory = Advisory.get_or_create(dict(
                id_=fm_event['search_key']
            ))[0]
            event.conditional_connect(event.triggered_by_advisory, advisory)

            for build_dict in fm_event['builds']:
                # To handle a faulty container build in Freshmaker
                if build_dict['build_id'] and int(build_dict['build_id']) < 0:
                    continue

                log.debug('Creating FreshmakerBuild {0}'.format(build_dict['build_id']))
                fb_params = dict(
                    id_=build_dict['id'],
                    dep_on=build_dict['dep_on'],
                    name=build_dict['name'],
                    original_nvr=build_dict['original_nvr'],
                    rebuilt_nvr=build_dict['rebuilt_nvr'],
                    state=build_dict['state'],
                    state_name=build_dict['state_name'],
                    state_reason=build_dict['state_reason'],
                    time_submitted=timestamp_to_datetime(build_dict['time_submitted']),
                    type_=build_dict['type'],
                    type_name=build_dict['type_name'],
                    url=build_dict['url']
                )
                if build_dict['time_completed']:
                    fb_params['time_completed'] = timestamp_to_datetime(
                        build_dict['time_completed'])
                if build_dict['build_id']:
                    fb_params['build_id'] = build_dict['build_id']
                fb = FreshmakerBuild.create_or_update(fb_params)[0]
                event.requested_builds.connect(fb)

                # The build ID obtained from the Freshmaker API is actually a Koji task ID
                task_result = None
                if build_dict['build_id']:
                    task_result = self.get_koji_task_result(build_dict['build_id'])
                if not task_result:
                    continue

                # Extract the build ID from a task result
                xml_root = ET.fromstring(task_result)
                # TODO: Change this if a task can trigger multiple builds
                try:
                    build_id = xml_root.find(".//*[name='koji_builds'].//string").text
                except AttributeError:
                    build_id = None
                if not build_id:
                    continue

                log.debug('Creating ContainerKojiBuild {0}'.format(build_id))
                build_params = {
                    'id_': build_id,
                    'original_nvr': build_dict['original_nvr']
                }
                try:
                    build = ContainerKojiBuild.create_or_update(build_params)[0]
                except neomodel.exceptions.ConstraintValidationFailed:
                    # This must have errantly been created as a KojiBuild instead of a
                    # ContainerKojiBuild, so let's fix that.
                    build = KojiBuild.nodes.get_or_none(id_=build_id)
                    if not build:
                        # If there was a constraint validation failure and the build isn't
                        # just the wrong label, then we can't recover.
                        raise
                    build.add_label(ContainerKojiBuild.__label__)
                    build = ContainerKojiBuild.create_or_update(build_params)[0]
                event.successful_koji_builds.connect(build)

        if rv_json['meta'].get('next'):
            fm_url = rv_json['meta']['next']
        else:
            break

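
# A sketch of the Freshmaker API payload shape this scraper assumes (values are illustrative,
# not real data); only the keys referenced above are shown:
#
#   {
#       'items': [
#           {
#               'id': 123,
#               'search_key': '45678',          # must parse as an int (the Advisory ID)
#               'event_type_id': 8,
#               'message_id': '...',
#               'state': 2,
#               'state_name': 'COMPLETE',
#               'state_reason': '...',
#               'url': '/api/1/events/123',
#               'time_created': '2019-04-03T13:08:02Z',
#               'time_done': '2019-04-03T14:08:02Z',
#               'builds': [
#                   {'id': 1, 'build_id': 987654, 'dep_on': None, 'name': '...',
#                    'original_nvr': '...', 'rebuilt_nvr': '...', 'state': 1,
#                    'state_name': 'DONE', 'state_reason': '...',
#                    'time_submitted': '2019-04-03T13:10:00Z', 'time_completed': None,
#                    'type': 1, 'type_name': 'IMAGE', 'url': '/api/1/builds/1'}
#               ]
#           }
#       ],
#       'meta': {'next': None}
#   }
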
def query(self, sql, db='public', retry=None):
    """
    Send the SQL query to Teiid and return the rows as a list.

    :param str sql: the SQL query to send to the database
    :kwarg str db: the database name to query on
    :kwarg int retry: the number of times to retry a failed query. If this is not set, then
        the Teiid query will be repeated until it is successful.
    :return: a list of rows from Teiid. Each row is a dictionary with the column headers as
        the keys.
    :rtype: list
    """
    con = self.get_connection(db)
    cursor = con.cursor()

    if retry is not None and retry < 1:
        raise ValueError('The retry keyword must contain a value greater than 0')

    if self._last_query_dt:
        now = datetime.utcnow()
        now_and_last_diff = now - self._last_query_dt
        if now_and_last_diff < timedelta(seconds=0.5):
            sleep(now_and_last_diff.total_seconds())

    log.debug('Querying Teiid DB "{0}" with SQL:\n{1}'.format(db, sql))

    fifteen_mins = 15 * 60
    backoff = 30
    attempts = 0
    while True:
        attempts += 1
        try:
            if attempts > 1:
                # Restart the database connection after failed queries
                con = self.get_connection(db, force_new=True)
                cursor = con.cursor()
            cursor.execute(sql)
            self._last_query_dt = datetime.utcnow()
            break
        except psycopg2.OperationalError as e:
            if retry and attempts > retry:
                raise
            else:
                log.exception(e)
                if backoff < fifteen_mins:
                    # Double the backoff time
                    backoff = backoff * 2
                elif backoff > fifteen_mins:
                    # Max out the backoff time at 15 minutes
                    backoff = fifteen_mins
                log.warning(
                    'The Teiid query failed on attempt {0}. Sleeping for {1} seconds.'
                    .format(attempts, backoff))
                sleep(backoff)

    data = cursor.fetchall()
    # Column header names
    cols = [t[0] for t in cursor.description or []]
    log.debug('Found the following columns: {}'.format(cols))
    log.debug('Received {} rows from Teiid'.format(len(data)))
    # Build a return array with all columns
    return [dict(zip(cols, row)) for row in data]

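
# A hypothetical usage sketch of the method above. The class name, constructor signature,
# SQL statement, and table name are all assumptions made for illustration only:
#
#   teiid = Teiid('teiid.example.com', 5432, 'user', 'password')
#   rows = teiid.query('SELECT id, name FROM gitbz.packages LIMIT 5', db='public', retry=3)
#   # rows is a list of dicts keyed by column name, e.g.
#   # [{'id': 1, 'name': 'bash'}, {'id': 2, 'name': 'vim'}, ...]
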
def _update_neo4j(neo4j_url, total_results, counter_and_results):
    """
    Update Neo4j results via mapping with multiprocessing.

    :param str neo4j_url: database URL for Neo4j
    :param int total_results: the total number of results that will be processed. This is
        used for a logging statement about progress.
    :param tuple counter_and_results: a tuple where the first index is the current counter
        and the second index is a list of dictionaries representing results from Teiid
    """
    try:
        previous_total = counter_and_results[0]
        results = counter_and_results[1]
        # Since _update_neo4j will be run in a separate process, we must configure the
        # database URL every time the method is run.
        neomodel_config.DATABASE_URL = neo4j_url
        # Create a thread pool with 4 threads to speed up queries to cgit
        pool = ThreadPool(4)
        counter = 0
        for result in results:
            if counter % 200 == 0:
                until = counter + 200
                if until > len(results):
                    until = len(results)
                # Because of the joins in the SQL query, we end up with several rows with the
                # same commit hash and we only want to query cgit once per commit
                unique_commits = set([(c['module'], c['sha']) for c in results[counter:until]])
                log.debug(
                    'Getting the author email addresses from cgit in parallel '
                    'for results {0} to {1}'.format(counter, until))
                repos_info = {
                    r['commit']: r
                    for r in pool.map(DistGitScraper._get_repo_info, unique_commits)
                }
                # This is no longer needed so it can be cleared to save RAM
                del unique_commits
            counter += 1
            log.info('Processing commit entry {0}/{1}'.format(
                previous_total + counter, total_results))
            repo_info = repos_info[result['sha']]
            if not repo_info.get('namespace'):
                log.info('Skipping nodes creation with commit ID {0}'.format(
                    result['commit_id']))
                continue

            log.debug('Creating nodes associated with commit ID {0}'.format(
                result['commit_id']))
            repo = DistGitRepo.get_or_create({
                'namespace': repo_info['namespace'],
                'name': result['module']
            })[0]
            commit = DistGitCommit.create_or_update({
                'author_date': result['author_date'],
                'commit_date': result['commit_date'],
                'hash_': result['sha'],
                # In case we get unicode characters in Python 2
                'log_message': bytes(result['log_message'], 'utf-8').decode()
            })[0]
            bug = BugzillaBug.get_or_create({'id_': result['bugzilla_id']})[0]

            log.debug('Creating the user nodes associated with commit ID {0}'.format(
                result['commit_id']))
            author = User.create_or_update({
                'username': repo_info['author_username'],
                'email': repo_info['author_email']
            })[0]

            log.debug('Creating the relationships associated with commit ID {0}'.format(
                result['commit_id']))
            repo.commits.connect(commit)
            commit.conditional_connect(commit.author, author)

            if result['bugzilla_type'] == 'related':
                commit.related_bugs.connect(bug)
            elif result['bugzilla_type'] == 'resolves':
                commit.resolved_bugs.connect(bug)
            elif result['bugzilla_type'] == 'reverted':
                commit.reverted_bugs.connect(bug)
            # This is no longer needed so it can be cleared to save RAM
            del repo_info
    finally:
        # Close the DB connection after this is done processing
        db.driver.close()

def _get_repo_info(repo_and_commit):
    """
    Query cgit for the namespace, username and email of the author.

    :param tuple repo_and_commit: contains the repo and commit to query for
    :return: a dictionary with the keys namespace, author_username, author_email, and the
        commit
    :rtype: dict
    """
    repo, commit = repo_and_commit
    log.debug('Attempting to find the cgit URL for the commit "{0}" in repo "{1}"'.format(
        commit, repo))
    session = retry_session()
    rv = {'commit': commit}
    cgit_result = None
    # The tuple of namespaces to try when determining which namespace this git module
    # belongs to since this information isn't stored in GitBZ yet
    namespaces = ('rpms', 'containers', 'modules', 'tests')
    cgit_url = getenv('ESTUARY_CGIT_URL', 'http://pkgs.devel.redhat.com/cgit/')
    for namespace in namespaces:
        url = '{0}{1}/{2}/commit/?id={3}&dt=2'.format(cgit_url, namespace, repo, commit)
        log.debug('Trying the URL "{0}"'.format(url))
        try:
            cgit_result = session.get(url, timeout=15)
        except ConnectionError:
            log.error('The connection to "{0}" failed'.format(url))
            continue

        if cgit_result.status_code == 200:
            # If the repo is empty, cgit oddly returns a 200 status code, so let's correct
            # the status code so that the remainder of the code knows it's a bad request
            if 'Repository seems to be empty' in cgit_result.text:
                cgit_result.status_code = 404
            else:
                # If the repo is populated and a 200 status code is returned, then we can
                # assume we found the correct repo
                break

    if not cgit_result or cgit_result.status_code != 200:
        log.error(
            'Couldn\'t find the commit "{0}" for the repo "{1}" in the namespaces: {2}'
            .format(commit, repo, ', '.join(namespaces)))
        return rv

    log.debug('Found the cgit URL "{0}" for the commit "{1}" in repo "{2}"'.format(
        url, commit, repo))
    rv['namespace'] = namespace

    # Start parsing the cgit content
    soup = BeautifulSoup(cgit_result.text, 'html.parser')
    # Workaround for BS4 in EL7 since `soup.find('th', string=person)` doesn't work in
    # that environment
    th_tags = soup.find_all('th')
    data_found = {'author': False}
    for th_tag in th_tags:
        if th_tag.string in ('author',):
            data_found[th_tag.string] = True
            username_key = '{0}_username'.format(th_tag.string)
            email_key = '{0}_email'.format(th_tag.string)
            rv[username_key], rv[email_key] = \
                DistGitScraper._parse_username_email_from_cgit(
                    th_tag, commit, namespace, repo)
        # If all the "th" elements we're interested in were parsed, then break from the
        # loop early
        if all(data_found.values()):
            break
    soup.decompose()
    return rv

def query_api_and_update_neo4j(self):
    """
    Scrape the Freshmaker API and upload the data to Neo4j.
    """
    # Initialize the session and URL
    session = retry_session()
    fm_url = self.freshmaker_url
    while True:
        log.debug('Querying {0}'.format(fm_url))
        rv_json = session.get(fm_url, timeout=15).json()

        for fm_event in rv_json['items']:
            try:
                int(fm_event['search_key'])
            except ValueError:
                # Skip Freshmaker Events that don't have the search_key as the Advisory ID
                continue

            event = FreshmakerEvent.create_or_update(dict(
                id_=fm_event['id'],
                event_type_id=fm_event['event_type_id'],
                message_id=fm_event['message_id'],
                state=fm_event['state'],
                state_name=fm_event['state_name'],
                state_reason=fm_event['state_reason'],
                url=fm_event['url']
            ))[0]

            advisory = Advisory.get_or_create(dict(
                id_=fm_event['search_key']
            ))[0]
            event.conditional_connect(event.triggered_by_advisory, advisory)

            for build_dict in fm_event['builds']:
                # To handle a faulty container build in Freshmaker
                if not build_dict['build_id'] or int(build_dict['build_id']) < 0:
                    continue
                # The build ID obtained from the Freshmaker API is actually a Koji task ID
                task_result = self.get_koji_task_result(build_dict['build_id'])
                if not task_result:
                    continue

                # Extract the build ID from a task result
                xml_root = ET.fromstring(task_result)
                # TODO: Change this if a task can trigger multiple builds
                try:
                    build_id = xml_root.find(".//*[name='koji_builds'].//string").text
                except AttributeError:
                    build_id = None

                if build_id:
                    build = ContainerKojiBuild.get_or_create(dict(
                        id_=build_id,
                        original_nvr=build_dict['original_nvr']
                    ))[0]
                    event.triggered_container_builds.connect(build)

        if rv_json['meta'].get('next'):
            fm_url = rv_json['meta']['next']
        else:
            break

def update_neo4j(self, results):
    """
    Update Neo4j with the dist-git commit and push information from Teiid.

    :param list results: a list of dictionaries
    """
    pool = Pool(processes=8)
    counter = 0
    for result in results:
        if counter % 200 == 0:
            until = counter + 200
            if until > len(results):
                until = len(results)
            # Because of the joins in the SQL query, we end up with several rows with the
            # same commit hash and we only want to query cgit once per commit
            unique_commits = set([(c['module'], c['sha']) for c in results[counter:until]])
            log.debug(
                'Getting the author and committer email addresses from cgit in parallel '
                'for results {0} to {1}'.format(counter, until))
            repos_info = {}
            for _r in pool.map(DistGitScraper._get_repo_info, unique_commits):
                r = json.loads(_r)
                repos_info[r['commit']] = r
            # This is no longer needed so it can be cleared to save RAM
            del unique_commits
            # A lot of RAM was allocated or used up, so let's call gc.collect() to ensure
            # it is removed
            gc.collect()
        counter += 1
        log.info('Processing commit and push entry {0}/{1}'.format(
            str(counter), str(len(results))))
        repo_info = repos_info[result['sha']]
        if not repo_info.get('namespace'):
            log.info('Skipping nodes creation with commit ID {0} and push ID {1}'.format(
                result['commit_id'], result['push_id']))
            continue

        log.debug('Creating nodes associated with commit ID {0} and push ID {1}'.format(
            result['commit_id'], result['push_id']))
        repo = DistGitRepo.get_or_create({
            'namespace': repo_info['namespace'],
            'name': result['module']
        })[0]
        branch_name = result['ref'].rsplit('/', 1)[1]
        branch = DistGitBranch.get_or_create({
            'name': branch_name,
            'repo_namespace': repo_info['namespace'],
            'repo_name': result['module']
        })[0]
        commit = DistGitCommit.create_or_update({
            'author_date': result['author_date'],
            'commit_date': result['commit_date'],
            'hash_': result['sha'],
            # In case we get unicode characters in Python 2
            'log_message': bytes(result['log_message'], 'utf-8').decode()
        })[0]
        push = DistGitPush.get_or_create({
            'id_': result['push_id'],
            'push_date': result['push_date'],
            'push_ip': result['push_ip']
        })[0]
        bug = BugzillaBug.get_or_create({'id_': result['bugzilla_id']})[0]

        log.debug('Creating the user nodes associated with commit ID {0} and push ID {1}'
                  .format(result['commit_id'], result['push_id']))
        author = User.create_or_update({
            'username': repo_info['author_username'],
            'email': repo_info['author_email']
        })[0]
        committer = User.create_or_update({
            'username': repo_info['committer_username'],
            'email': repo_info['committer_email']
        })[0]
        pusher = User.get_or_create({'username': result['pusher']})[0]

        log.debug('Creating the relationships associated with commit ID {0} and push ID {1}'
                  .format(result['commit_id'], result['push_id']))
        repo.contributors.connect(author)
        repo.contributors.connect(committer)
        repo.contributors.connect(pusher)
        repo.commits.connect(commit)
        repo.pushes.connect(push)
        repo.branches.connect(branch)

        branch.contributors.connect(author)
        branch.contributors.connect(committer)
        branch.contributors.connect(pusher)
        branch.commits.connect(commit)
        branch.pushes.connect(push)

        push.conditional_connect(push.pusher, pusher)
        push.commits.connect(commit)

        commit.conditional_connect(commit.author, author)
        commit.conditional_connect(commit.committer, committer)

        if repo_info['parent']:
            parent_commit = DistGitCommit.get_or_create({'hash_': repo_info['parent']})[0]
            commit.conditional_connect(commit.parent, parent_commit)

        if result['bugzilla_type'] == 'related':
            commit.related_bugs.connect(bug)
        elif result['bugzilla_type'] == 'resolves':
            commit.resolved_bugs.connect(bug)
        elif result['bugzilla_type'] == 'reverted':
            commit.reverted_bugs.connect(bug)
        # This is no longer needed so it can be cleared to save RAM
        del repo_info

def _get_exception_users():
    """
    Get the list of users that are explicitly whitelisted.

    If the LDAP search fails, an empty set is returned.

    :return: a set of usernames
    :rtype: set
    :raise InternalServerError: if a required configuration value is not set or the
        connection to the LDAP server fails
    """
    # Import this here so it's not required for deployments with auth disabled
    import ldap3

    base_error = '%s is not set in the server configuration'
    ldap_uri = current_app.config.get('LDAP_URI')
    if not ldap_uri:
        log.error(base_error, 'LDAP_URI')
        raise InternalServerError()

    ldap_group_dn = current_app.config.get('LDAP_EXCEPTIONS_GROUP_DN')
    if not ldap_group_dn:
        log.error(base_error, 'LDAP_EXCEPTIONS_GROUP_DN')
        raise InternalServerError()

    if ldap_uri.startswith('ldaps://'):
        ca = current_app.config['LDAP_CA_CERTIFICATE']
        log.debug('Connecting to %s using SSL and the CA %s', ldap_uri, ca)
        tls = ldap3.Tls(ca_certs_file=ca, validate=ssl.CERT_REQUIRED)
        server = ldap3.Server(ldap_uri, use_ssl=True, tls=tls)
    else:
        log.debug('Connecting to %s without SSL', ldap_uri)
        server = ldap3.Server(ldap_uri)

    connection = ldap3.Connection(server)
    try:
        connection.open()
    except ldap3.core.exceptions.LDAPSocketOpenError:
        log.exception('The connection to %s failed', ldap_uri)
        raise InternalServerError()

    membership_attr = current_app.config['LDAP_GROUP_MEMBERSHIP_ATTRIBUTE']
    log.debug('Searching for the attribute %s on %s', membership_attr, ldap_group_dn)
    # Set the scope to base so only the group from LDAP_EXCEPTIONS_GROUP_DN is returned
    success = connection.search(ldap_group_dn, '(cn=*)', search_scope=ldap3.BASE,
                                attributes=[membership_attr])
    if not success:
        log.error(
            'The user exceptions list could not be determined because the search for the '
            'attribute %s on %s failed with %r',
            membership_attr,
            ldap_group_dn,
            connection.response,
        )
        return set()

    return set([
        dn.split('=')[1].split(',')[0]
        for dn in connection.response[0]['attributes'][membership_attr]
    ])

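
# A hypothetical configuration sketch (values are examples only) covering the settings that
# _get_exception_users reads from current_app.config:
#
#   LDAP_URI = 'ldaps://ldap.example.com'
#   LDAP_CA_CERTIFICATE = '/etc/pki/tls/certs/ca-bundle.crt'
#   LDAP_EXCEPTIONS_GROUP_DN = 'cn=estuary-exceptions,ou=groups,dc=example,dc=com'
#   LDAP_GROUP_MEMBERSHIP_ATTRIBUTE = 'uniqueMember'
#
# The member DNs returned for LDAP_GROUP_MEMBERSHIP_ATTRIBUTE are reduced to bare usernames
# by splitting on '=' and ',', e.g. 'uid=jdoe,ou=users,dc=example,dc=com' -> 'jdoe'.
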