def commit_handler(self, msg):
    """
    Process a dist-git commit message and record it in Neo4j.

    The repo, author and commit nodes are created or updated, the Bugzilla bugs parsed
    from the commit message are linked to the commit, and finally the commit is linked
    to its author and its repo.

    :param dict msg: a message to be processed
    """
    headers = msg['headers']
    repo = DistGitRepo.get_or_create({
        'namespace': headers['namespace'],
        'name': headers['repo']
    })[0]

    # Red Hat email addresses are reduced to their local part to form the username;
    # any other address is used as the username unchanged
    redhat_suffix = '@redhat.com'
    email = headers['email'].lower()
    username = email.partition(redhat_suffix)[0] if email.endswith(redhat_suffix) else email
    author = User.create_or_update({
        'username': username,
        'email': email
    })[0]

    payload = msg['body']['msg']
    commit_message = payload['message']
    commit = DistGitCommit.create_or_update({
        'hash_': headers['rev'],
        'log_message': commit_message,
        'author_date': timestamp_to_datetime(payload['author_date']),
        'commit_date': timestamp_to_datetime(payload['commit_date'])
    })[0]

    bug_rel_mapping = self.parse_bugzilla_bugs(commit_message)
    # Each parsed bug category maps to the commit relationship it is stored under
    bug_relationships = (
        ('resolves', 'resolved_bugs'),
        ('related', 'related_bugs'),
        ('reverted', 'reverted_bugs'),
    )
    for category, relationship_name in bug_relationships:
        for bug_id in bug_rel_mapping[category]:
            bug = BugzillaBug.get_or_create({'id_': bug_id})[0]
            getattr(commit, relationship_name).connect(bug)

    commit.conditional_connect(commit.author, author)
    repo.commits.connect(commit)
def _update_neo4j(neo4j_url, total_results, counter_and_results):
    """
    Update Neo4j results via mapping with multiprocessing.

    The thread pool used for the cgit lookups is closed and joined, and the Neo4j driver
    connection is closed, before this returns, even when processing fails part way.

    :param str neo4j_url: database url for Neo4j
    :param int total_results: the total number of results that will be processed. This is used
        for a logging statement about progress.
    :param tuple counter_and_results: a tuple where the first index is the current counter and
        the second index is a list of dictionaries representing results from Teiid
    """
    # Number of results whose repo information is fetched from cgit in one parallel batch
    batch_size = 200
    # Stays None if something fails before the pool is created, so cleanup can skip it
    pool = None
    try:
        previous_total = counter_and_results[0]
        results = counter_and_results[1]
        # Since _update_neo4j will be run in a separate process, we must configure the database
        # URL every time the method is run.
        neomodel_config.DATABASE_URL = neo4j_url
        # Create a thread pool with 4 threads to speed up queries to cgit
        pool = ThreadPool(4)
        for index, result in enumerate(results):
            if index % batch_size == 0:
                until = min(index + batch_size, len(results))
                # Because of the joins in the SQL query, we end up with several rows with the
                # same commit hash and we only want to query cgit once per commit
                unique_commits = {(c['module'], c['sha']) for c in results[index:until]}
                log.debug(
                    'Getting the author email addresses from cgit in parallel '
                    'for results {0} to {1}'.format(index, until))
                repos_info = {
                    r['commit']: r
                    for r in pool.map(DistGitScraper._get_repo_info, unique_commits)
                }
                # This is no longer needed so it can be cleared to save RAM
                del unique_commits

            log.info('Processing commit entry {0}/{1}'.format(
                previous_total + index + 1, total_results))
            repo_info = repos_info[result['sha']]
            if not repo_info.get('namespace'):
                log.info(
                    'Skipping nodes creation with commit ID {0}'.format(
                        result['commit_id']))
                continue

            log.debug(
                'Creating nodes associated with commit ID {0}'.format(
                    result['commit_id']))
            repo = DistGitRepo.get_or_create({
                'namespace': repo_info['namespace'],
                'name': result['module']
            })[0]
            commit = DistGitCommit.create_or_update({
                'author_date': result['author_date'],
                'commit_date': result['commit_date'],
                'hash_': result['sha'],
                # In case we get unicode characters in Python 2
                'log_message': bytes(result['log_message'], 'utf-8').decode()
            })[0]
            bug = BugzillaBug.get_or_create({'id_': result['bugzilla_id']})[0]

            log.debug(
                'Creating the user nodes associated with commit ID {0}'.
                format(result['commit_id']))
            author = User.create_or_update({
                'username': repo_info['author_username'],
                'email': repo_info['author_email']
            })[0]

            log.debug(
                'Creating the relationships associated with commit ID {0}'.
                format(result['commit_id']))
            repo.commits.connect(commit)
            commit.conditional_connect(commit.author, author)

            # Rows with any other bugzilla_type create the bug node but no relationship
            if result['bugzilla_type'] == 'related':
                commit.related_bugs.connect(bug)
            elif result['bugzilla_type'] == 'resolves':
                commit.resolved_bugs.connect(bug)
            elif result['bugzilla_type'] == 'reverted':
                commit.reverted_bugs.connect(bug)
    finally:
        try:
            # Release the worker threads; without this every call leaves its pool behind
            if pool is not None:
                pool.close()
                pool.join()
        finally:
            # Close the DB connection after this is done processing
            db.driver.close()
def update_neo4j(self, results):
    """
    Update Neo4j with the dist-git commit and push information from Teiid.

    Repo information is fetched from cgit in parallel batches using a process pool. The
    pool is closed and joined before this returns, even when processing fails part way.

    :param list results: a list of dictionaries
    """
    # Number of results whose repo information is fetched from cgit in one parallel batch
    batch_size = 200
    pool = Pool(processes=8)
    try:
        for index, result in enumerate(results):
            if index % batch_size == 0:
                until = min(index + batch_size, len(results))
                # Because of the joins in the SQL query, we end up with several rows with the
                # same commit hash and we only want to query cgit once per commit
                unique_commits = {(c['module'], c['sha']) for c in results[index:until]}
                log.debug(
                    'Getting the author and committer email addresses from cgit in parallel '
                    'for results {0} to {1}'.format(index, until))
                repos_info = {}
                # Each mapped result is a JSON string that is decoded before use
                for _r in pool.map(DistGitScraper._get_repo_info, unique_commits):
                    r = json.loads(_r)
                    repos_info[r['commit']] = r
                # This is no longer needed so it can be cleared to save RAM
                del unique_commits
                # A lot of RAM was allocated or used up, so let's call gc.collect() to ensure
                # it is removed
                gc.collect()

            log.info('Processing commit and push entry {0}/{1}'.format(
                index + 1, len(results)))
            repo_info = repos_info[result['sha']]
            if not repo_info.get('namespace'):
                log.info(
                    'Skipping nodes creation with commit ID {0} and push ID {1}'
                    .format(result['commit_id'], result['push_id']))
                continue

            log.debug(
                'Creating nodes associated with commit ID {0} and push ID {1}'.
                format(result['commit_id'], result['push_id']))
            repo = DistGitRepo.get_or_create({
                'namespace': repo_info['namespace'],
                'name': result['module']
            })[0]
            # The branch name is the part of the ref after its last slash
            branch_name = result['ref'].rsplit('/', 1)[1]
            branch = DistGitBranch.get_or_create({
                'name': branch_name,
                'repo_namespace': repo_info['namespace'],
                'repo_name': result['module']
            })[0]
            commit = DistGitCommit.create_or_update({
                'author_date': result['author_date'],
                'commit_date': result['commit_date'],
                'hash_': result['sha'],
                # In case we get unicode characters in Python 2
                'log_message': bytes(result['log_message'], 'utf-8').decode()
            })[0]
            push = DistGitPush.get_or_create({
                'id_': result['push_id'],
                'push_date': result['push_date'],
                'push_ip': result['push_ip']
            })[0]
            bug = BugzillaBug.get_or_create({'id_': result['bugzilla_id']})[0]

            log.debug(
                'Creating the user nodes associated with commit ID {0} and push ID {1}'
                .format(result['commit_id'], result['push_id']))
            author = User.create_or_update({
                'username': repo_info['author_username'],
                'email': repo_info['author_email']
            })[0]
            committer = User.create_or_update({
                'username': repo_info['committer_username'],
                'email': repo_info['committer_email']
            })[0]
            pusher = User.get_or_create({'username': result['pusher']})[0]

            log.debug(
                'Creating the relationships associated with commit ID {0} and push ID {1}'
                .format(result['commit_id'], result['push_id']))
            repo.contributors.connect(author)
            repo.contributors.connect(committer)
            repo.contributors.connect(pusher)
            repo.commits.connect(commit)
            repo.pushes.connect(push)
            repo.branches.connect(branch)

            branch.contributors.connect(author)
            branch.contributors.connect(committer)
            branch.contributors.connect(pusher)
            branch.commits.connect(commit)
            branch.pushes.connect(push)

            push.conditional_connect(push.pusher, pusher)
            push.commits.connect(commit)

            commit.conditional_connect(commit.author, author)
            commit.conditional_connect(commit.committer, committer)

            if repo_info['parent']:
                parent_commit = DistGitCommit.get_or_create(
                    {'hash_': repo_info['parent']})[0]
                commit.conditional_connect(commit.parent, parent_commit)

            # Rows with any other bugzilla_type create the bug node but no relationship
            if result['bugzilla_type'] == 'related':
                commit.related_bugs.connect(bug)
            elif result['bugzilla_type'] == 'resolves':
                commit.resolved_bugs.connect(bug)
            elif result['bugzilla_type'] == 'reverted':
                commit.reverted_bugs.connect(bug)
    finally:
        # Shut the worker processes down; without this the pool is left running
        pool.close()
        pool.join()