def main():
    """Mine commits for every repository that has GitHub issues on record.

    Expects the path to the folder holding the local repository clones as
    the first command-line argument; exits with a usage message instead of
    crashing with IndexError when it is missing.
    """
    import sys
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <repos_folder>" % sys.argv[0])
    # create a new session and init db tables
    session = SessionWrapper.new(init=True)
    # one row per distinct repo slug present in the GitHub issue table
    repos = session.query(GhIssue.slug).distinct()
    for r in repos:
        get_commits(r.slug, sys.argv[1])
def main():
    """Run the blame analysis over every repo that has valid issue links."""
    db_session = SessionWrapper.new(init=True)
    # only repos for which there are valid issue links: the linking commit
    # came after the issue was opened (delta_open > 0), at or before close
    # (delta_closed <= 0), and the link is to an issue, not a PR
    linked_repos = db_session.query(IssueLink.repo_id, Repo.slug).filter(
        and_(IssueLink.repo_id == Repo.id,
             IssueLink.is_pr == 0,
             IssueLink.delta_open > 0,
             IssueLink.delta_closed <= 0)).distinct().all()
    # load every available API token into a shared queue
    tokens = Tokens()
    tokens_queue = Queue()
    for token in tokens.iterator():
        tokens_queue.put(token)
    tokens_map = {}
    for row in linked_repos:
        # a fresh Blamer per repo, all sharing the same token pool
        blamer = Blamer(tokens, tokens_queue, tokens_map)
        blamer.get_blamed_commits(row.slug, row.repo_id)
from orm.initdb import SessionWrapper from orm.tables import Project, Commit, Blame, Repo, Bug_Commit_Timeline, Bug_Issue_Timeline, Issue_Timeline from orm.ghissue import GhIssue, GHIssueClassification session = SessionWrapper.new(init=True) #session.execute("Truncate table bug_commit_timeline") def create_dict(): out = dict() for year in range(2012, 2019): innedDict = dict() for quart in range(0, 4): innedDict[quart] = 0 out[year] = innedDict return out def extract_blame(): for project in session.query(Project).filter(Project.num_commits >= 0): # if not (project.language =="C++" or project.language=="Java"): # continue repo = session.query(Repo).filter_by(slug=project.name).first() blame_dict = create_dict() blamed_entries = session.query(Blame).filter_by(repo_id=repo.id) num_blamed_entries = blamed_entries.count() for entry in blamed_entries: blamed_commit_sha = entry.blamed_sha try: blamed_commit = session.query(Commit).filter_by(
def get_commits(slug, repos_folder):
    """Mine all commits of one repository into the database.

    Walks the local git clone of ``slug`` found under ``repos_folder`` and
    stores, per commit: metadata (Commit), per-file change stats
    (CommitFiles), and commit-to-issue links (IssueLink); finally stores the
    per-repo contributor list (User) and updates the Repo summary row.

    :param slug: repository identifier; presumably a GitHub 'owner/name'
        slug — it is mapped to a folder via ``slugToFolderName`` (TODO confirm).
    :param repos_folder: path to the folder containing the local clones.
    :return: always ``slug`` — see NOTE(review) on the ``finally`` below.
    """
    contributors = {}
    # contributor ids are assigned in first-seen order, starting at 1
    counter = itertools.count(start=1)
    basic_classifier = BasicFileTypeClassifier()
    session = SessionWrapper.new()
    try:
        folder_name = slugToFolderName(slug)
        folder_path = os.path.join(repos_folder, folder_name)
        # min/max initialized to "now" and "100 years ago" so any real
        # commit date narrows them on first comparison
        min_commit = datetime.now(timezone.utc)
        max_commit = min_commit - timedelta(days=100 * 365)
        total_commits = 0
        # no local clone available: nothing to mine
        if not os.path.exists(folder_path):
            return slug
        try:
            db_repo = session.query(Repo).filter_by(slug=slug).one()
            # the reason why we return here is to skip analyzing
            # again a repo in case of crashing exception that forces
            # the script to be run again
            logger.info(
                msg="Skipping analysis of commits from %s, already in the db"
                % slug)
            # NOTE(review): the early return is commented out, so a repo
            # already in the db is re-walked anyway (per-row existence
            # checks below make this mostly idempotent)
            #return slug
        except exc.NoResultFound:
            # first time we see this repo: create its row now so commits
            # below can reference db_repo.id
            db_repo = Repo(slug, min_commit, max_commit, total_commits)
            session.add(db_repo)
            session.commit()
        except exc.MultipleResultsFound:
            # NOTE(review): db_repo stays unbound on this path; the walk
            # below would then raise NameError, caught by the outer except
            logger.warning(msg="Found multiple results querying for repo %s." % slug)
            pass
        git_repo = pygit2.Repository(folder_path)
        last = git_repo[git_repo.head.target]
        # Fetch all commits as an iterator, and iterate it
        for c in git_repo.walk(last.id, pygit2.GIT_SORT_TIME):
            commit = CommitWrapper(c)
            total_commits += 1
            sha = commit.getSha()
            authored_datetime = commit.getAuthoredDate()
            # NOTE(review): committed_datetime is computed but never used
            committed_datetime = commit.getCommittedDate()
            # track the repo's commit-date range using authored dates
            if authored_datetime < min_commit:
                min_commit = authored_datetime
            if authored_datetime > max_commit:
                max_commit = authored_datetime
            (author_name, author_email) = commit.getAuthor()
            # identities are matched case-insensitively
            (author_name_l, author_email_l) = (author_name.lower(),
                                               author_email.lower())
            (committer_name, committer_email) = commit.getCommitter()
            (committer_name_l, committer_email_l) = (committer_name.lower(),
                                                     committer_email.lower())
            if (author_name_l, author_email_l) not in contributors:
                contributors[(author_name_l, author_email_l)] = next(counter)
            author_id = contributors[(author_name_l, author_email_l)]
            if (committer_name_l, committer_email_l) not in contributors:
                contributors[(committer_name_l, committer_email_l)] = next(counter)
            committer_id = contributors[(committer_name_l, committer_email_l)]
            parents = commit.getParents()
            num_parents = len(parents)
            # skip parentless (root) commits
            if not num_parents:
                continue
            message = commit.getMessage().strip()
            try:
                db_commit = session.query(Commit).filter_by(repo_id=db_repo.id,
                                                            sha=sha).one()
                continue  # if already present, stop and go on analyzing the next one
            except exc.NoResultFound:
                diff = commit.getDiff(git_repo)
                loc_added = diff.stats.insertions
                loc_deleted = diff.stats.deletions
                num_files_touched = diff.stats.files_changed
                # get info about changes to src files in the new commit
                all_files, src_files, num_src_files_touched, src_loc_added, src_loc_deleted = \
                    CommitWrapper.get_src_changes(basic_classifier, diff)
                try:
                    db_commit = Commit(db_repo.id, sha, authored_datetime,
                                       author_id, committer_id, message,
                                       num_parents, loc_added, loc_deleted,
                                       num_files_touched, all_files,
                                       src_loc_added, src_loc_deleted,
                                       num_src_files_touched, src_files)
                    session.add(db_commit)
                    # required to flush the pending data before adding to the CommitFiles table below
                    session.commit()
                # NOTE(review): bare except — presumably guarding against
                # non-insertable text (encoding?) in message/file lists; it
                # retries the insert with those fields blanked. Confirm the
                # intended exception type and narrow it.
                except:
                    all_files = ""
                    src_files = ""
                    message = ""
                    db_commit = Commit(db_repo.id, sha, authored_datetime,
                                       author_id, committer_id, message,
                                       num_parents, loc_added, loc_deleted,
                                       num_files_touched, all_files,
                                       src_loc_added, src_loc_deleted,
                                       num_src_files_touched, src_files)
                    session.add(db_commit)
                    # required to flush the pending data before adding to the CommitFiles table below
                    session.commit()
                # parse changed files per diff
                for patch in diff:
                    commit_file = os.path.basename(patch.delta.new_file.path)
                    try:
                        commit_file = session.query(CommitFiles).filter_by(
                            commit_sha=sha, repo_slug=slug,
                            file=commit_file).one()
                        continue  # if already present, stop and go on analyzing the next one
                    except exc.NoResultFound:
                        lang = basic_classifier.labelFile(commit_file)
                        loc_ins = 0
                        loc_del = 0
                        # count added/deleted lines for this file's hunks
                        for hunk in patch.hunks:
                            for hl in hunk.lines:
                                if hl.origin == '-':
                                    # NOTE(review): deletions are counted
                                    # negatively (-=) while insertions are
                                    # positive — confirm += wasn't intended
                                    loc_del -= 1
                                elif hl.origin == '+':
                                    loc_ins += 1
                        # reuses the commit_file name for the ORM row
                        commit_file = CommitFiles(db_repo.id, db_repo.slug,
                                                  sha, commit_file, loc_ins,
                                                  loc_del, lang)
                        session.add(commit_file)
                        session.commit()
                if message is not None:
                    # getIssueIds() yields (line_num, issue_ids) pairs here:
                    # message lines paired with the issue numbers they mention
                    issue_id_results = commit.getIssueIds()
                    if len(issue_id_results) >= 1:
                        # NOTE(review): num_valid_issues is tallied but never
                        # read in this version of the function
                        num_valid_issues = 0
                        for (line_num, issue_ids) in issue_id_results:
                            for issue_id in issue_ids:
                                logger.info(msg="Analyzing {0} issue {1}.".format(
                                    slug, issue_id))
                                try:
                                    gh_issue = session.query(GhIssue).filter(
                                        and_(GhIssue.slug == slug,
                                             GhIssue.issue_number == issue_id)).one()
                                except exc.MultipleResultsFound:
                                    logger.warning(
                                        msg="{0}: Issue {1} has multiple entries.".
                                        format(slug, issue_id))
                                    continue
                                except exc.NoResultFound:
                                    logger.warning(
                                        msg=
                                        "{0}: Issue {1} no entry found in the issue table."
                                        .format(slug, issue_id))
                                    continue
                                try:
                                    # skip links that are already stored
                                    db_link = session.query(IssueLink).filter(
                                        and_(IssueLink.repo_id == db_repo.id,
                                             IssueLink.sha == sha,
                                             IssueLink.issue_number == issue_id)).one()
                                    print(db_repo.id, "Touch")
                                    continue
                                except exc.NoResultFound:
                                    # seconds between the commit and the
                                    # issue's open time (positive = after)
                                    delta_open = (
                                        authored_datetime -
                                        gh_issue.created_at.replace(
                                            tzinfo=pytz.utc)).total_seconds()
                                    if gh_issue.closed_at is not None:
                                        delta_closed = (
                                            authored_datetime -
                                            gh_issue.closed_at.replace(
                                                tzinfo=pytz.utc)).total_seconds()
                                        # valid: commit after open, at/before
                                        # close, and the link is not to a PR
                                        if delta_open > 0 and delta_closed <= 0 and gh_issue.pr_num is None:
                                            num_valid_issues += 1
                                    else:
                                        delta_closed = None
                                    db_link = IssueLink(
                                        db_repo.id, sha, line_num, issue_id,
                                        gh_issue.pr_num is not None,
                                        delta_open, delta_closed)
                                    session.add(db_link)
        # persist contributors in id order so db rows mirror first-seen order
        for (name, email), user_id in sorted(contributors.items(),
                                             key=lambda e: e[1]):
            try:
                # func.binary forces a case-sensitive match on name/email
                db_user = session.query(User).filter(
                    and_(User.name == func.binary(name),
                         User.email == func.binary(email),
                         User.repo_id == db_repo.id)).one()
            except exc.NoResultFound:
                db_user = User(db_repo.id, user_id, name, email)
                session.add(db_user)
            except exc.MultipleResultsFound:
                # Would this happen because we allow name aliases during mining?
                # Should we deal with it? And how?
                logger.warning(
                    msg="Multiple entries for user \'{0}\' <{1}> in repo {2}".
                    format(name, email, db_repo.slug))
        # update the repo summary row with the mined totals
        db_repo.min_commit = min_commit
        db_repo.max_commit = max_commit
        db_repo.total_commits = total_commits
        session.add(db_repo)
        session.commit()
        return slug
    except Exception as e:
        logger.error(msg="{0}: unknown error:\t{1}".format(slug, e))
        traceback.print_exc()
    finally:
        # NOTE(review): returning from finally suppresses any in-flight
        # exception, so this function can never raise — deliberate
        # best-effort behavior, but worth confirming
        return slug
def get_commits(slug, repos_folder):
    """Mine commits that close valid issues and blame their deleted lines.

    Walks the local git clone of ``slug`` under ``repos_folder``. For each
    commit it records commit-to-issue links (IssueLink); for commits linked
    to at least one valid issue it stores the commit row, then runs
    ``git blame`` on each deleted hunk to find and store the inducing
    ("blamed") commits (Commit) and per-file blame counts (Blame). Finally
    it persists the contributor list (User) and the Repo summary row.

    :param slug: repository identifier; presumably a GitHub 'owner/name'
        slug mapped to a clone folder via ``slugToFolderName`` (TODO confirm).
    :param repos_folder: path to the folder containing the local clones.
    :return: always ``slug`` — see NOTE(review) on the ``finally`` below.
    """
    contributors = {}
    # contributor ids are assigned in first-seen order, starting at 1
    counter = itertools.count(start=1)
    basic_classifier = BasicFileTypeClassifier()
    session = SessionWrapper.new()
    try:
        folder_name = slugToFolderName(slug)
        folder_path = os.path.join(repos_folder, folder_name)
        # min/max initialized to "now" and "100 years ago" so any real
        # commit date narrows them on first comparison
        min_commit = datetime.now(timezone.utc)
        max_commit = min_commit - timedelta(days=100 * 365)
        total_commits = 0
        # no local clone available: nothing to mine
        if not os.path.exists(folder_path):
            return slug
        try:
            db_repo = session.query(Repo).filter_by(slug=slug).one()
        except exc.NoResultFound:
            # first time we see this repo: create its row now so commits
            # below can reference db_repo.id
            db_repo = Repo(slug, min_commit, max_commit, total_commits)
            session.add(db_repo)
            session.commit()
        except exc.MultipleResultsFound:
            # NOTE(review): db_repo stays unbound on this path; the walk
            # below would then raise NameError, caught by the outer except
            logger.warning(msg="Found multiple results querying for repo %s." % slug)
            pass
        git_repo = pygit2.Repository(folder_path)
        last = git_repo[git_repo.head.target]
        # Fetch all commits as an iterator, and iterate it
        for c in git_repo.walk(last.id, pygit2.GIT_SORT_TIME):
            commit = CommitWrapper(c)
            total_commits += 1
            sha = commit.getSha()
            authored_datetime = commit.getAuthoredDate()
            # NOTE(review): committed_datetime is computed but never used
            committed_datetime = commit.getCommittedDate()
            # track the repo's commit-date range using authored dates
            if authored_datetime < min_commit:
                min_commit = authored_datetime
            if authored_datetime > max_commit:
                max_commit = authored_datetime
            (author_name, author_email) = commit.getAuthor()
            # identities are matched case-insensitively
            (author_name_l, author_email_l) = (author_name.lower(),
                                               author_email.lower())
            (committer_name, committer_email) = commit.getCommitter()
            (committer_name_l, committer_email_l) = (committer_name.lower(),
                                                     committer_email.lower())
            if (author_name_l, author_email_l) not in contributors:
                contributors[(author_name_l, author_email_l)] = next(counter)
            author_id = contributors[(author_name_l, author_email_l)]
            if (committer_name_l, committer_email_l) not in contributors:
                contributors[(committer_name_l, committer_email_l)] = next(counter)
            committer_id = contributors[(committer_name_l, committer_email_l)]
            message = commit.getMessage()
            if message is not None:
                # getIssueIds() yields issue numbers referenced by the
                # commit message (flat list in this version)
                issue_ids = commit.getIssueIds()
                if len(issue_ids) >= 1:
                    num_valid_issues = 0
                    for issue_id in issue_ids:
                        try:
                            # was session_travis
                            gh_issue = session.query(GhIssue).filter(
                                and_(GhIssue.slug == slug,
                                     GhIssue.issue_number == issue_id)).one()
                        except exc.MultipleResultsFound:
                            logger.warning(
                                msg="{0}: Issue {1} has multiple entries.".
                                format(slug, issue_id))
                            continue
                        except exc.NoResultFound:
                            logger.warning(
                                msg=
                                "{0}: Issue {1} no entry found in the issue table."
                                .format(slug, issue_id))
                            continue
                        try:
                            # already linked: nothing to add for this issue
                            db_link = session.query(IssueLink).filter(
                                and_(IssueLink.repo_id == db_repo.id,
                                     IssueLink.sha == sha,
                                     IssueLink.issue_number == issue_id)).one()
                        except exc.NoResultFound:
                            # why authored_datetime and not commited_datetime ???????? ## TODO
                            delta_open = (
                                authored_datetime -
                                gh_issue.created_at.replace(tzinfo=pytz.utc)
                            ).total_seconds()
                            ### closed at is important!!!!!!!! ## TODO
                            if gh_issue.closed_at is not None:
                                delta_closed = (
                                    authored_datetime -
                                    gh_issue.closed_at.replace(tzinfo=pytz.utc)
                                ).total_seconds()
                                # valid: commit after open, at/before close,
                                # and the link is to an issue, not a PR
                                if delta_open > 0 and delta_closed <= 0 and gh_issue.pr_num is None:
                                    num_valid_issues += 1
                            else:
                                delta_closed = None
                            db_link = IssueLink(db_repo.id, sha, issue_id,
                                                gh_issue.pr_num is not None,
                                                delta_open, delta_closed)
                            session.add(db_link)
                    # only commits closing at least one valid issue are
                    # stored and blamed
                    if not num_valid_issues:
                        continue
                    first_msg_line = message.split('\n')[0]
                    parents = commit.getParents()
                    num_parents = len(parents)
                    # skip parentless (root) commits: nothing to blame against
                    if not num_parents:
                        continue
                    # blame is run against the first parent's tree
                    sha_parent = parents[0].hex
                    diff = commit.getDiff(git_repo)
                    try:
                        db_commit = session.query(Commit).filter_by(
                            sha=sha).one()
                    except exc.NoResultFound:
                        db_commit = Commit(db_repo.id, sha, authored_datetime,
                                           author_id, committer_id,
                                           first_msg_line, num_parents,
                                           diff.stats.insertions,
                                           diff.stats.deletions,
                                           diff.stats.files_changed)
                        session.add(db_commit)
                        session.commit()
                    # TODO: extract this part into a separate script
                    # cache of already-inspected blamed commits for this repo
                    blamed_commits = {}
                    for patch in diff:
                        old_file = patch.delta.old_file.path
                        label = basic_classifier.labelFile(old_file)
                        # Ignore changes to documentation files
                        if label == basic_classifier.DOC:
                            continue
                        # old_lineno -> line-content label, for deleted lines
                        line_labels = {}
                        # blamed sha -> number of blamed code lines
                        blame_counter = {}
                        for hunk in patch.hunks:
                            if hunk.old_lines:
                                for hl in hunk.lines:
                                    # only deleted lines can be traced back
                                    # to the commit that introduced them
                                    if hl.origin == '-':
                                        line_labels[
                                            hl.
                                            old_lineno] = basic_classifier.labelDiffLine(
                                                hl.content.replace('\r', '').replace(
                                                    '\n', ''))
                                try:
                                    # blame the deleted range in the parent
                                    for bh in git_repo.blame(
                                            old_file,
                                            newest_commit=sha_parent,
                                            min_line=hunk.old_start,
                                            max_line=hunk.old_start +
                                            hunk.old_lines - 1):
                                        blamed_sha = str(bh.final_commit_id)
                                        if blamed_sha in blamed_commits:
                                            blamed_commit = blamed_commits[
                                                blamed_sha]
                                        else:
                                            try:
                                                blamed_commit = CommitWrapper(
                                                    git_repo.revparse_single(
                                                        blamed_sha))
                                                blamed_commits[
                                                    blamed_sha] = blamed_commit
                                                blamed_parents = blamed_commit.getParents()
                                                blamed_num_parents = len(
                                                    blamed_parents)
                                                if not blamed_num_parents:
                                                    # root commit: no diff stats
                                                    ins = None
                                                    dels = None
                                                    files = None
                                                else:
                                                    blamed_diff = blamed_commit.getDiff(
                                                        git_repo)
                                                    ins = blamed_diff.stats.insertions
                                                    dels = blamed_diff.stats.deletions
                                                    files = blamed_diff.stats.files_changed
                                                # Ignore commits that changed more than 100 files
                                                # NOTE(review): files may be
                                                # None here (root commit);
                                                # None >= 100 raises TypeError,
                                                # caught by the except below —
                                                # confirm this is acceptable
                                                if files >= 100:
                                                    continue
                                                try:
                                                    blamed_db_commit = session.query(
                                                        Commit).filter_by(
                                                            sha=blamed_sha).one()
                                                except exc.MultipleResultsFound:
                                                    logger.warning(
                                                        msg=
                                                        "{0}: Multiple rows for blamed sha {1}."
                                                        .format(slug, blamed_sha))
                                                    traceback.print_exc()
                                                except exc.NoResultFound:
                                                    # blamed commit not yet in
                                                    # db: mine and store it
                                                    blamed_authored_datetime = blamed_commit.getAuthoredDate()
                                                    (blamed_author_name,
                                                     blamed_author_email
                                                     ) = blamed_commit.getAuthor()
                                                    (blamed_author_name_l,
                                                     blamed_author_email_l) = (
                                                         blamed_author_name.lower(),
                                                         blamed_author_email.lower())
                                                    (blamed_committer_name,
                                                     blamed_committer_email
                                                     ) = blamed_commit.getCommitter()
                                                    (blamed_committer_name_l,
                                                     blamed_committer_email_l) = (
                                                         blamed_committer_name.lower(),
                                                         blamed_committer_email.lower())
                                                    if (blamed_author_name_l,
                                                            blamed_author_email_l
                                                        ) not in contributors:
                                                        contributors[(
                                                            blamed_author_name_l,
                                                            blamed_author_email_l
                                                        )] = next(counter)
                                                    blamed_author_id = contributors[
                                                        (blamed_author_name_l,
                                                         blamed_author_email_l)]
                                                    if (blamed_committer_name_l,
                                                            blamed_committer_email_l
                                                        ) not in contributors:
                                                        contributors[(
                                                            blamed_committer_name_l,
                                                            blamed_committer_email_l
                                                        )] = next(counter)
                                                    blamed_committer_id = contributors[
                                                        (blamed_committer_name_l,
                                                         blamed_committer_email_l)]
                                                    blamed_message = blamed_commit.getMessage()
                                                    blamed_first_msg_line = blamed_message.split(
                                                        '\n')[0]
                                                    blamed_db_commit = Commit(
                                                        db_repo.id, blamed_sha,
                                                        blamed_authored_datetime,
                                                        blamed_author_id,
                                                        blamed_committer_id,
                                                        blamed_first_msg_line,
                                                        blamed_num_parents,
                                                        ins, dels, files)
                                                    session.add(blamed_db_commit)
                                                    session.commit()
                                            except Exception as e:
                                                logger.error(
                                                    msg=
                                                    "{0}: revparse error {1}:\t{2}"
                                                    .format(slug, blamed_sha, e))
                                                traceback.print_exc()
                                        # count blamed lines classified as code
                                        for line_num in range(
                                                bh.final_start_line_number,
                                                bh.final_start_line_number +
                                                bh.lines_in_hunk):
                                            if line_labels[
                                                    line_num] == basic_classifier.CG_CODE:
                                                blame_counter.setdefault(
                                                    blamed_sha, 0)
                                                blame_counter[blamed_sha] += 1
                                except Exception as e:
                                    logger.error(
                                        msg="{0} blame error {1}:\t{2}".format(
                                            slug, sha, e))
                        # one Blame row per (commit, file, blamed commit)
                        for blamed_sha, num_lines in blame_counter.items():
                            b = Blame(db_repo.id, sha, old_file, label,
                                      blamed_sha, num_lines)
                            session.add(b)
                            session.commit()
        # persist contributors in id order so db rows mirror first-seen order
        for (name, email), user_id in sorted(contributors.items(),
                                             key=lambda e: e[1]):
            try:
                # func.binary forces a case-sensitive match on name/email
                db_user = session.query(User).filter(
                    and_(User.name == func.binary(name),
                         User.email == func.binary(email),
                         User.repo_id == db_repo.id)).one()
            except exc.NoResultFound:
                db_user = User(db_repo.id, user_id, name, email)
                session.add(db_user)
            except exc.MultipleResultsFound:
                # FIXME this shouldn't be happening
                # is it because we allow name aliases during mining?
                # How do we deal with it now?
                logger.warning(
                    msg="Multiple entries for user \'{0}\' <{1}> in repo {2}".
                    format(name, email, db_repo.slug))
        # update the repo summary row with the mined totals
        db_repo.min_commit = min_commit
        db_repo.max_commit = max_commit
        db_repo.total_commits = total_commits
        session.add(db_repo)
        session.commit()
        return slug
    except Exception as e:
        logger.error(msg="{0}: unknown error:\t{1}".format(slug, e))
        traceback.print_exc()
    finally:
        # NOTE(review): returning from finally suppresses any in-flight
        # exception, so this function can never raise — deliberate
        # best-effort behavior, but worth confirming
        return slug
def get_blamed_commits(self, slug, db_repo_id, repos_folder='./repos'):
    """Find and store the commits blamed for bug-fixing changes in a repo.

    For each commit in the local clone that is linked (IssueLink) to a valid,
    non-PR issue whose GitHub labels do not mark it as a feature request,
    this blames every deleted hunk against the commit's first parent, stores
    previously-unseen blamed commits (Commit) and per-file blame counts
    (Blame). Uses the GitHub API (via ``self.get_gh_repo`` /
    ``self.wait_if_depleted``) to fetch issue labels, waiting out rate limits.

    :param slug: repository identifier; presumably a GitHub 'owner/name'
        slug mapped to a clone folder via ``slugToFolderName`` (TODO confirm).
    :param db_repo_id: primary key of the repo's row in the Repo table.
    :param repos_folder: folder containing the local clones (default './repos').
    :return: None; results are written to the database.
    """
    session = SessionWrapper.new()
    basic_classifier = BasicFileTypeClassifier()
    folder_path = os.path.join(repos_folder, slugToFolderName(slug))
    try:
        git_repo = pygit2.Repository(folder_path)
        last = git_repo[git_repo.head.target]
    except Exception:
        # clone missing or unreadable: nothing to do for this repo
        logger.error("Git error opening repo %s" % slug)
        return
    try:
        # known contributors for this repo, presumably mined earlier by
        # get_commits — TODO confirm
        contributors = self.get_contributors(session, db_repo_id)
        # TODO check start number
        # NOTE(review): ids restart at len(contributors), not max(id) + 1 —
        # confirm this cannot collide with existing contributor ids
        counter = itertools.count(start=len(contributors))
    except exc.NoResultFound:
        # NOTE(review): contributors/counter stay unbound on this path;
        # later references would raise NameError
        logger.error(
            msg="No contributors found for repo {0}.".format(slug))
        traceback.print_exc()
        pass
    repo, pid, gh = self.get_gh_repo(slug)
    # cache of already-inspected blamed commits for this repo
    blamed_commits = {}
    # Fetch all commits as an iterator, and iterate it
    for c in git_repo.walk(last.id, pygit2.GIT_SORT_TIME):
        commit = CommitWrapper(c)
        sha = commit.getSha()
        closes_valid_issue = False
        issue_links = session.query(IssueLink).filter(
            and_(IssueLink.repo_id == db_repo_id,
                 IssueLink.sha == sha,
                 IssueLink.is_pr == 0,
                 IssueLink.delta_open > 0,
                 IssueLink.delta_closed <= 0))
        """
        Valid issues are those
        1) for which the associated commit was registered *after* the issue was open (delta open > 0)
        2) for which the associated commit was registered *before or exactly when* the associated issue was closed (delta closed <= 0)
        3) are not pull requests (is_pr == 1), just issues (is_pr == 0)
        """
        for issue_link in issue_links:
            # check for possible labels: if 'feature' or 'enhancement', ignore
            # if no labels or labels are 'fix', 'bug-fix', retain
            self.wait_if_depleted(pid, gh)
            issue = repo.get_issue(issue_link.issue_number)
            if issue:
                if not issue.labels:
                    # no labels is fine
                    closes_valid_issue = True
                    break
                else:
                    for label in issue.labels:
                        if label.name in self.invalid_labels:
                            break
                        elif label.name in self.valid_labels:
                            closes_valid_issue = True
                            break
        if not closes_valid_issue:
            continue
        logger.info("Blaming commit %s from repo %s" % (sha, slug))
        try:
            # blame is run against the first parent's tree
            sha_parent = commit.getParents()[0].hex
        except IndexError:
            # root commit: nothing to blame against
            continue
        diff = commit.getDiff(git_repo)
        for patch in diff:
            # skip changes to binary files
            if patch.delta.is_binary:
                continue
            old_file = patch.delta.old_file.path
            label = basic_classifier.labelFile(old_file)
            # Ignore changes to documentation files
            if label == basic_classifier.DOC:
                continue
            # old_lineno -> line-content label, for deleted lines
            line_labels = {}
            # blamed sha -> number of blamed code lines
            blame_counter = {}
            for hunk in patch.hunks:
                if hunk.old_lines:
                    for hl in hunk.lines:
                        """
                        only changes to deleted lines can be tracked back to when they
                        were first introduced there is no parent commit that introduced
                        a new line that it's being added in the current commit for the
                        first time (ie, lines marked with a '+' in the diffs) this is
                        not a basic SZZ implementation, as we classify changes at line
                        level (e.g., skip changes to line of comments)
                        """
                        if hl.origin == '-':
                            line_labels[
                                hl.
                                old_lineno] = basic_classifier.labelDiffLine(
                                    hl.content.replace('\r', '').replace(
                                        '\n', ''))
                    try:
                        # blame the deleted range in the parent
                        for bh in git_repo.blame(old_file,
                                                 newest_commit=sha_parent,
                                                 min_line=hunk.old_start,
                                                 max_line=hunk.old_start +
                                                 hunk.old_lines - 1):
                            blamed_sha = str(bh.final_commit_id)
                            # if sha of commit is not already in the list of blamed commit
                            if blamed_sha not in blamed_commits:
                                try:
                                    blamed_commit = CommitWrapper(
                                        git_repo.revparse_single(blamed_sha))
                                    blamed_commits[blamed_sha] = blamed_commit
                                    blamed_parents = blamed_commit.getParents()
                                    blamed_num_parents = len(blamed_parents)
                                    if not blamed_num_parents:
                                        # root commit: no diff stats
                                        ins = None
                                        dels = None
                                        num_files = None
                                    else:
                                        blamed_diff = blamed_commit.getDiff(
                                            git_repo)
                                        ins = blamed_diff.stats.insertions
                                        dels = blamed_diff.stats.deletions
                                        num_files = blamed_diff.stats.files_changed
                                    # TODO fine-tune: Ignore commits that changed more than 100 files
                                    if num_files is None or num_files >= 50:
                                        continue
                                    # TODO fine-tune: filter number of new lines (ins)
                                    if ins and ins >= 200:
                                        continue
                                    try:
                                        blamed_db_commit = session.query(
                                            Commit).filter_by(
                                                sha=blamed_sha).one()
                                    except exc.MultipleResultsFound:
                                        logger.warning(
                                            msg=
                                            "{0}: Multiple rows for blamed sha {1}."
                                            .format(slug, blamed_sha))
                                        traceback.print_exc()
                                    except exc.NoResultFound:
                                        # TODO does it ever happen?
                                        logger.warning(
                                            "exc.NoResultFound at line 141 of blame"
                                        )
                                        # blamed commit not yet in db: mine
                                        # and store it
                                        blamed_authored_datetime = blamed_commit.getAuthoredDate()
                                        (blamed_author_name,
                                         blamed_author_email
                                         ) = blamed_commit.getAuthor()
                                        (blamed_author_name_l,
                                         blamed_author_email_l) = (
                                             blamed_author_name.lower(),
                                             blamed_author_email.lower())
                                        (blamed_committer_name,
                                         blamed_committer_email
                                         ) = blamed_commit.getCommitter()
                                        (blamed_committer_name_l,
                                         blamed_committer_email_l) = (
                                             blamed_committer_name.lower(),
                                             blamed_committer_email.lower())
                                        if (blamed_author_name_l,
                                                blamed_author_email_l
                                            ) not in contributors:
                                            logger.debug(
                                                msg=
                                                "Found a blamed author {0} - {1} not in the contributors list for repo {2}."
                                                .format(
                                                    blamed_author_name_l,
                                                    blamed_author_email_l,
                                                    slug))
                                            # TODO what to do with newly added contributors here? save to db???
                                            contributors[(
                                                blamed_author_name_l,
                                                blamed_author_email_l
                                            )] = next(counter)
                                        blamed_author_id = contributors[(
                                            blamed_author_name_l,
                                            blamed_author_email_l)]
                                        if (blamed_committer_name_l,
                                                blamed_committer_email_l
                                            ) not in contributors:
                                            logger.debug(
                                                msg=
                                                "Found a blamed author {0} - {1} not in the contributors list for repo {2}."
                                                .format(
                                                    blamed_committer_name_l,
                                                    blamed_committer_email_l,
                                                    slug))
                                            # TODO what to do with newly added contributors here? save to db???
                                            contributors[(
                                                blamed_committer_name_l,
                                                blamed_committer_email_l
                                            )] = next(counter)
                                        blamed_committer_id = contributors[
                                            (blamed_committer_name_l,
                                             blamed_committer_email_l)]
                                        blamed_message = blamed_commit.getMessage()
                                        blamed_first_msg_line = blamed_message.split(
                                            '\n')[0]
                                        # get info about changes to src files in the new blamed commit
                                        all_files, src_files, num_src_files_touched, src_loc_added, src_loc_deleted = \
                                            CommitWrapper.get_src_changes(basic_classifier, blamed_commit.getDiff(git_repo))
                                        blamed_db_commit = Commit(
                                            db_repo_id, blamed_sha,
                                            blamed_authored_datetime,
                                            blamed_author_id,
                                            blamed_committer_id,
                                            blamed_first_msg_line,
                                            blamed_num_parents, ins, dels,
                                            num_files, all_files,
                                            src_loc_added, src_loc_deleted,
                                            num_src_files_touched, src_files)
                                        session.add(blamed_db_commit)
                                        session.commit()
                                except Exception as e:
                                    logger.error(
                                        msg="{0}: revparse error {1}:\t{2}"
                                        .format(slug, blamed_sha, e))
                                    traceback.print_exc()
                            # count blamed lines classified as code
                            for line_num in range(
                                    bh.final_start_line_number,
                                    bh.final_start_line_number +
                                    bh.lines_in_hunk):
                                if line_labels[
                                        line_num] == basic_classifier.CG_CODE:
                                    blame_counter.setdefault(blamed_sha, 0)
                                    blame_counter[blamed_sha] += 1
                    except ValueError as ve:
                        # pygit2 raises ValueError when the path is not
                        # blameable in the parent (e.g. submodule changes)
                        logger.error(
                            msg=
                            "{0} blame error on commit {1} probably due to changes coming from a submodule: {2}"
                            .format(slug, sha, ve))
                    except Exception as e:
                        logger.error(
                            msg="{0} Unknown blame error on commit {1}: {2}"
                            .format(slug, sha, e))
                        traceback.print_exc()
            # one Blame row per (commit, file, blamed commit)
            for blamed_sha, num_lines in blame_counter.items():
                b = Blame(db_repo_id, sha, old_file, label, blamed_sha,
                          num_lines)
                session.add(b)
                session.commit()
# Public API of the orm package: submodules plus the SessionWrapper factory.
__all__ = [
    'base', 'tables', 'ghissue', 'initdb', 'issue_comments', 'commit_files',
    'cross_reference', 'SessionWrapper'
]

# Re-export the ORM model classes and session factories at package level so
# callers can do `from orm import ...` without knowing the submodule layout.
from orm.base import Base
from orm.commit_files import CommitFiles
from orm.cross_reference import CrossReference
from orm.ghissue import GhIssue, BGhIssue, GHIssueClassification
from orm.initdb import SessionWrapper, SessionWrapper_GHT
from orm.issue_comments import IssueComment
from orm.tables import User, Repo, Commit, IssueLink, Blame, Project, Control_Repo
from orm.lindholmen import Repo_Lindholmen, UMLFile_Lindholmen, Commit_Lindholmen, Lindholmen_Issues
from orm.ght import Repo_GHT, PR_GHT, Issue_GHT, User_GHT

# Import-time side effect: load the database configuration once so every
# SessionWrapper.new() call afterwards can connect.
SessionWrapper.load_config()