def __init__(self, path_to_repo: str, clone_repo_to: str = None, at: str = 'release'):
    """
    The class constructor.

    Parameters
    ----------
    path_to_repo : str
        The path to a local or remote repository.

    clone_repo_to : str
        Path to clone the repository to.
        If path_to_repo links to a local repository, this parameter is not used. Otherwise it is mandatory.

    at : str
        When to extract metrics: at each release or each commit.

    Attributes
    ----------
    path_to_repo : str
        ``path_to_repo`` as given for a local repository; for a remote one,
        ``clone_repo_to`` joined with the repository name taken from the URL.

    commits_at : List[str]
        Hashes of the commits returned by the repository traversal
        (only releases when ``at == 'release'``).

    dataset : pandas.DataFrame
        The metrics dataset. It is created empty here.

    Raises
    ------
    ValueError
        If `at` is not release or commit, if `clone_repo_to` is missing for a remote repository, if the path to
        the remote repository does not link to a github or gitlab repository, or if no repository name can be
        extracted from the remote path.
    """

    if at not in ('release', 'commit'):
        raise ValueError(f'{at} is not valid! Use \'release\' or \'commit\'.')

    self.path_to_repo = path_to_repo

    if is_remote(path_to_repo):
        if not clone_repo_to:
            raise ValueError('clone_repo_to is mandatory when linking to a remote repository.')

        full_name_pattern = re.compile(r'git(hub|lab)\.com/([\w\W]+)$')
        match = full_name_pattern.search(path_to_repo)

        if not match:
            raise ValueError('The remote repository must be hosted on github or gitlab.')

        # The captured path is expected to look like "<owner>/<repository>[...]".
        name_parts = match.group(2).split('/')
        repo_name = name_parts[1] if len(name_parts) > 1 else ''

        # Drop '.git' only when it is a suffix of the repository name. Removing
        # every '.git' substring from the URL would corrupt hosts such as
        # 'www.github.com' and repository names such as 'user.github.io'.
        if repo_name.endswith('.git'):
            repo_name = repo_name[:-len('.git')]

        if not repo_name:
            raise ValueError('The remote repository path must have the form <owner>/<repository>.')

        self.path_to_repo = os.path.join(clone_repo_to, repo_name)

        # The target folder already exists: do not ask for a clone into it.
        if os.path.isdir(self.path_to_repo):
            clone_repo_to = None

    repo_miner = Repository(path_to_repo=path_to_repo,
                            clone_repo_to=clone_repo_to,
                            only_releases=(at == 'release'),
                            order='date-order',
                            num_workers=1)

    self.commits_at = [commit.hash for commit in repo_miner.traverse_commits()]
    self.dataset = pd.DataFrame()
def test_only_authors():
    """Check the number of commits returned for each ``only_authors`` filter."""
    repo_path = 'test-repos/multiple_authors'
    expected_counts = (
        ("Maurício Aniche", 4),
        ("ishepard", 1),
    )

    for author, expected in expected_counts:
        miner = Repository(repo_path, only_authors=[author])
        found = list(miner.traverse_commits())
        assert len(found) == expected
def test_single_commit_head():
    """``single="HEAD"`` is expected to yield the same one commit as the explicit hash."""
    repo_path = 'test-repos/complex_repo'

    miner_by_hash = Repository(
        repo_path, single="e7d13b0511f8a176284ce4f92ed8c6e8d09c77f2")
    by_hash = list(miner_by_hash.traverse_commits())
    assert len(by_hash) == 1

    miner_by_head = Repository(repo_path, single="HEAD")
    by_head = list(miner_by_head.traverse_commits())
    assert len(by_head) == 1

    assert by_hash[0].hash == by_head[0].hash
def test_single_commit():
    """Each ``single=<hash>`` request must return exactly that one commit."""
    requested_hashes = (
        "866e997a9e44cb4ddd9e00efe49361420aff2559",
        "ffccf1e7497eb8136fd66ed5e42bef29677c4b71",
    )

    for wanted in requested_hashes:
        miner = Repository('test-repos/complex_repo', single=wanted)
        found = list(miner.traverse_commits())
        assert len(found) == 1
        assert found[0].hash == wanted
def test_only_in_branches():
    """Check commit counts with no branch filter and with ``only_in_branch`` b2 / b1."""
    repo_path = 'test-repos/branches_not_merged'
    scenarios = (
        # by default, only analyze master
        ({}, 3),
        # only analyze b2
        ({'only_in_branch': 'b2'}, 4),
        # only analyze b1
        ({'only_in_branch': 'b1'}, 5),
    )

    for options, expected in scenarios:
        found = list(Repository(repo_path, **options).traverse_commits())
        assert len(found) == expected
def discard_undesired_fixing_commits(self, commits: List[str]):
    """
    Given a list of commits, discard commits that do not modify at least one Tosca file.

    Note, the update occurs in-place. That is, the original list is updated.
    An empty list is left untouched.

    Parameters
    ----------
    commits : List[str]
        List of commit hashes
    """
    # Nothing to filter; this also avoids an IndexError on commits[0] below.
    if not commits:
        return

    # get a sorted list of commits in ascending order of date
    self.sort_commits(commits)

    for commit in Repository(self.path_to_repo,
                             from_commit=commits[0],  # first commit in commits
                             to_commit=commits[-1],  # last commit in commits
                             only_in_branch=self.branch).traverse_commits():

        # Commits in the traversed range that are not candidates cannot be
        # removed, so there is no need to look at their modified files.
        if commit.hash not in commits:
            continue

        modifies_tosca_file = any(
            modified_file.change_type == ModificationType.MODIFY
            and filters.is_tosca_file(modified_file.new_path, modified_file.source_code)
            for modified_file in commit.modified_files)

        # if none of the modified files is a TOSCA file, then discard the commit
        if not modifies_tosca_file:
            commits.remove(commit.hash)
def discard_undesired_fixing_commits(self, commits: List[str]):
    """
    Given a list of commits, discard commits that do not modify at least one Ansible file.

    Note, the update occurs in-place. That is, the original list is updated.
    An empty list is left untouched.

    Parameters
    ----------
    commits : List[str]
        List of commit hashes
    """
    # Nothing to filter; this also avoids an IndexError on commits[0] below.
    if not commits:
        return

    self.sort_commits(commits)

    for commit in Repository(self.path_to_repo,
                             from_commit=commits[0],  # first commit in commits
                             to_commit=commits[-1],  # last commit in commits
                             only_in_branch=self.branch).traverse_commits():

        # Commits in the traversed range that are not candidates cannot be
        # removed, so there is no need to look at their modified files.
        if commit.hash not in commits:
            continue

        modifies_ansible_file = any(
            modified_file.change_type == ModificationType.MODIFY
            and filters.is_ansible_file(modified_file.new_path)
            for modified_file in commit.modified_files)

        # if none of the modified files is an Ansible file then discard the commit
        if not modifies_ansible_file:
            commits.remove(commit.hash)
def test_only_in_main_branch():
    """Without a branch filter, exactly these three commits are expected, in this order."""
    expected_hashes = [
        '04b0af7b53c2a0095e98951571aa41c2e0e0dbec',
        'e51421e0beae6a3c20bdcdfc21066e05db675e03',
        'b197ef4f0b4bc5b7d55c8949ecb1c861731f0b9d',
    ]

    miner = Repository('test-repos/branches_not_merged')
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert actual_hashes == expected_hashes
def test_filepath_with_to():
    """Expect 5 commits for ``filepath='myfolder/H.java'`` combined with ``to=2018-06-06``."""
    upper_bound = datetime(2018, 6, 6)
    miner = Repository(path_to_repo='test-repos/szz',
                       filepath='myfolder/H.java',
                       to=upper_bound)
    found = list(miner.traverse_commits())
    assert len(found) == 5
def test_no_filters():
    """With no filters, exactly these three commits are expected, in this order."""
    expected_hashes = [
        'a1b6136f978644ff1d89816bc0f2bd86f6d9d7f5',
        '375de7a8275ecdc0b28dc8de2568f47241f443e9',
        'b8c2be250786975f1c6f47e96922096f1bb25e39',
    ]

    miner = Repository('test-repos/different_files')
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert actual_hashes == expected_hashes
def test_filepath_with_since():
    """Expect 11 commits for ``filepath='myfolder/H.java'`` combined with ``since=2018-06-06``."""
    lower_bound = datetime(2018, 6, 6)
    miner = Repository(path_to_repo='test-repos/szz',
                       filepath='myfolder/H.java',
                       since=lower_bound)
    found = list(miner.traverse_commits())
    assert len(found) == 11
def test_mod_with_file_types_no_extension():
    """Filtering on ``'.py'`` modifications is expected to return no commits here."""
    miner = Repository('test-repos/different_files',
                       only_modifications_with_file_types=['.py'])
    found = list(miner.traverse_commits())
    assert len(found) == 0
def test_only_releases():
    """With ``only_releases=True``, exactly these three commits are expected, in this order."""
    expected_hashes = [
        '6bb9e2c6a8080e6b5b34e6e316c894b2ddbf7fcd',
        '4638730126d40716e230c2040751a13153fb1556',
        '627e1ad917a188a861c9fedf6e5858b79edbe439',
    ]

    miner = Repository('test-repos/tags', only_releases=True)
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert expected_hashes == actual_hashes
def test_between_dates():
    """
    Expect exactly two commits between ``dt1`` and ``dt2``.

    ``dt1`` and ``dt2`` are not defined in this function; they are taken
    from the enclosing (module) scope.
    """
    expected_hashes = [
        'a1b6136f978644ff1d89816bc0f2bd86f6d9d7f5',
        '375de7a8275ecdc0b28dc8de2568f47241f443e9',
    ]

    miner = Repository('test-repos/different_files', since=dt1, to=dt2)
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert actual_hashes == expected_hashes
def test_multiple_repos_with_tags():
    """The same repository listed three times, bounded by tag2..tag3, should give 9 commits."""
    repo_paths = ['test-repos/tags'] * 3
    miner = Repository(path_to_repo=repo_paths, from_tag='tag2', to_tag='tag3')
    found = list(miner.traverse_commits())
    assert len(found) == 9
def test_should_visit_ascendent_order():
    """With default options, exactly these five commits are expected, in this order."""
    expected_hashes = [
        'a88c84ddf42066611e76e6cb690144e5357d132c',
        '6411e3096dd2070438a17b225f44475136e54e3a',
        '09f6182cef737db02a085e1d018963c7a29bde5a',
        '1f99848edadfffa903b8ba1286a935f1b92b2845',
        'da39b1326dbc2edfe518b90672734a08f3c13458',
    ]

    miner = Repository('test-repos/small_repo')
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert actual_hashes == expected_hashes
def test_only_no_merge():
    """With ``only_no_merge=True``, exactly these three commits are expected, in this order."""
    expected_hashes = [
        '168b3aab057ed61a769acf336a4ef5e64f76c9fd',
        '8169f76a3d7add54b4fc7bca7160d1f1eede6eda',
        '8986af2a679759e5a15794f6d56e6d46c3f302f1',
    ]

    miner = Repository('test-repos/branches_merged', only_no_merge=True)
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert actual_hashes == expected_hashes
def test_mod_with_file_types():
    """Filtering on ``'.java'`` modifications should return the listed commits for each repository."""
    expectations = (
        ('test-repos/different_files', [
            'a1b6136f978644ff1d89816bc0f2bd86f6d9d7f5',
            'b8c2be250786975f1c6f47e96922096f1bb25e39',
        ]),
        ('test-repos/different_files1', [
            '5adbb71167e79ab6b974827e74c9da4d81977655',
            '0577bec2387ee131e1ccf336adcc172224d3f6f9',
        ]),
    )

    for repo_path, expected_hashes in expectations:
        miner = Repository(repo_path,
                           only_modifications_with_file_types=['.java'])
        actual_hashes = [commit.hash for commit in miner.traverse_commits()]
        assert actual_hashes == expected_hashes
def test_should_visit_descendent_order_with_filters_reversed():
    """With ``order='reverse'`` and a commit range, the three commits are expected newest-bound first."""
    range_start = '6411e3096dd2070438a17b225f44475136e54e3a'
    range_end = '1f99848edadfffa903b8ba1286a935f1b92b2845'
    expected_hashes = [
        '1f99848edadfffa903b8ba1286a935f1b92b2845',
        '09f6182cef737db02a085e1d018963c7a29bde5a',
        '6411e3096dd2070438a17b225f44475136e54e3a',
    ]

    miner = Repository('test-repos/small_repo',
                       from_commit=range_start,
                       to_commit=range_end,
                       order='reverse')
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert actual_hashes == expected_hashes
def test_one_timezone():
    """The selected commit's ``author_date`` should equal 2016-04-04 13:21:25 at UTC+02:00."""
    miner = Repository('test-repos/branches_merged',
                       single='29e929fbc5dc6a2e9c620069b24e2a143af4285f')
    found = list(miner.traverse_commits())

    plus_two_hours = timezone(timedelta(hours=2))
    expected_date = datetime(2016, 4, 4, 13, 21, 25, tzinfo=plus_two_hours)

    assert found[0].author_date == expected_date
def test_between_dates_reversed():
    """
    The selected commit's ``author_date`` should equal 2016-10-08 17:57:49 at UTC-04:00.

    NOTE(review): despite its name, this test does not filter by dates or
    use a reversed order; it only checks one commit's author date.
    """
    miner = Repository('test-repos/different_files',
                       single='375de7a8275ecdc0b28dc8de2568f47241f443e9')
    found = list(miner.traverse_commits())

    minus_four_hours = timezone(timedelta(hours=-4))
    expected_date = datetime(2016, 10, 8, 17, 57, 49, tzinfo=minus_four_hours)

    assert found[0].author_date == expected_date
def test_between_dates_without_timezone():
    """Naive ``since``/``to`` datetimes should select exactly these two commits, in this order."""
    window_start = datetime(2016, 10, 8, 21, 0, 0)
    window_end = datetime(2016, 10, 8, 21, 59, 0)
    expected_hashes = [
        'a1b6136f978644ff1d89816bc0f2bd86f6d9d7f5',
        '375de7a8275ecdc0b28dc8de2568f47241f443e9',
    ]

    miner = Repository('test-repos/different_files',
                       since=window_start,
                       to=window_end)
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert actual_hashes == expected_hashes
def test_only_in_branch():
    """With ``only_in_branch='b1'``, exactly these five commits are expected, in this order."""
    expected_hashes = [
        '04b0af7b53c2a0095e98951571aa41c2e0e0dbec',
        'e51421e0beae6a3c20bdcdfc21066e05db675e03',
        'b197ef4f0b4bc5b7d55c8949ecb1c861731f0b9d',
        '87a31153090808f1e6f679a14ea28729a0b74f4d',
        '702d469710d2087e662c210fd0e4f9418ec813fd',
    ]

    miner = Repository('test-repos/branches_not_merged', only_in_branch='b1')
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert actual_hashes == expected_hashes
def test_filepath_with_rename():
    """Expect two specific commits for ``filepath='file4.java'`` with ``to=2018-06-06``."""
    upper_bound = datetime(2018, 6, 6)
    miner = Repository(path_to_repo='test-repos/small_repo',
                       filepath='file4.java',
                       to=upper_bound)
    found = list(miner.traverse_commits())
    assert len(found) == 2

    found_hashes = [commit.hash for commit in found]
    for expected in ('da39b1326dbc2edfe518b90672734a08f3c13458',
                     'a88c84ddf42066611e76e6cb690144e5357d132c'):
        assert expected in found_hashes
def test_topo_order():
    """With ``order='topo-order'``, the first eight commits must match these hashes in order."""
    expected_prefix = (
        '5e3cfa27b3fe6dd4d12fd89664fea9397141b843',
        '19732de9e2b58ba7285f272810a9d8ddf18e7c89',
        '9cc3af5f242a1eba297f270acbdb8b6628556413',
        'd23d7f6d37fd1163022a5dd46985acd34e6818d7',
        '78a94953a3e140f2d0027fb57963345fbf6d59fe',
        '6564f9e0bfb38725ebcfb4547e98e7f545c7de12',
        '5c95c1c6ba95a1bdb12772d1a63c7d331e664819',
        'a45c8649b00d8b48cee04a822bd1d82acd667db2',
    )

    miner = Repository('test-repos/order', order='topo-order')
    visited = list(miner.traverse_commits())

    # Index explicitly (no zip) so a too-short result still fails the test.
    for position, expected_hash in enumerate(expected_prefix):
        assert expected_hash == visited[position].hash
def test_mod_with_file_types_and_date():
    """
    Combine the ``'.java'`` file-type filter with a timezone-aware ``since``.

    Exactly one commit is expected for a lower bound of
    2016-10-08 23:57:49 at UTC+02:00.
    """
    to_zone = timezone(timedelta(hours=2))
    dt1 = datetime(2016, 10, 8, 23, 57, 49, tzinfo=to_zone)

    lc = list(
        Repository('test-repos/different_files',
                   only_modifications_with_file_types=['.java'],
                   since=dt1).traverse_commits())

    assert len(lc) == 1
    assert lc[0].hash == 'b8c2be250786975f1c6f47e96922096f1bb25e39'
def test_only_commits():
    """
    Check the ``only_commits`` filter against several hash lists.

    The expected lists below give the order in which commits must be
    returned, which is not always the order in which they were requested.
    """
    repo_path = 'test-repos/complex_repo'

    def hashes_for(requested):
        miner = Repository(repo_path, only_commits=requested)
        return [commit.hash for commit in miner.traverse_commits()]

    assert hashes_for(["9e71dd5726d775fb4a5f08506a539216e878adbb"]) == [
        "9e71dd5726d775fb4a5f08506a539216e878adbb",
    ]

    assert hashes_for([
        "953737b199de233896f00b4d87a0bc2794317253",
        "ffccf1e7497eb8136fd66ed5e42bef29677c4b71",
    ]) == [
        "ffccf1e7497eb8136fd66ed5e42bef29677c4b71",
        "953737b199de233896f00b4d87a0bc2794317253",
    ]

    assert hashes_for([
        "866e997a9e44cb4ddd9e00efe49361420aff2559",
        "57dbd017d1a744b949e7ca0b1c1a3b3dd4c1cbc1",
        "e7d13b0511f8a176284ce4f92ed8c6e8d09c77f2",
    ]) == [
        "866e997a9e44cb4ddd9e00efe49361420aff2559",
        "57dbd017d1a744b949e7ca0b1c1a3b3dd4c1cbc1",
        "e7d13b0511f8a176284ce4f92ed8c6e8d09c77f2",
    ]

    # A hash that matches nothing yields no commits.
    assert hashes_for(["fake hash"]) == []

    # Without the filter the whole history is returned.
    everything = list(Repository(repo_path).traverse_commits())
    assert len(everything) == 13
def test_filepath_with_rename_complex():
    """Expect six specific commits for ``filepath='Matricula.javax'``."""
    expected_hashes = (
        'f0dd1308bd904a9b108a6a40865166ee962af3d4',
        '953737b199de233896f00b4d87a0bc2794317253',
        'a3290ac2f555eabca9e31180cf38e91f9e7e2761',
        '71535a31f0b598a5d5fcebda7146ebc01def783a',
        '57dbd017d1a744b949e7ca0b1c1a3b3dd4c1cbc1',
        '866e997a9e44cb4ddd9e00efe49361420aff2559',
    )

    miner = Repository(path_to_repo='test-repos/complex_repo',
                       filepath='Matricula.javax')
    found = list(miner.traverse_commits())
    assert len(found) == 6

    found_hashes = [commit.hash for commit in found]
    for expected in expected_hashes:
        assert expected in found_hashes
def test_between_revisions():
    """Bounding the traversal by tag1..tag3 should give exactly these five commits, in this order."""
    expected_hashes = [
        '6bb9e2c6a8080e6b5b34e6e316c894b2ddbf7fcd',
        'f1a90b8d7b151ceefd3e3dfc0dc1d0e12b5f48d0',
        '4638730126d40716e230c2040751a13153fb1556',
        'a26f1438bd85d6b22497c0e5dae003812becd0bc',
        '627e1ad917a188a861c9fedf6e5858b79edbe439',
    ]

    miner = Repository('test-repos/tags', from_tag='tag1', to_tag='tag3')
    actual_hashes = [commit.hash for commit in miner.traverse_commits()]

    assert expected_hashes == actual_hashes
def test_include_refs():
    """``include_refs=True`` should add three commits on top of the three found without it."""
    repo_path = 'test-repos/branches_not_merged/'

    without_refs = list(
        Repository(repo_path, include_refs=False).traverse_commits())
    assert len(without_refs) == 3
    hashes_without_refs = [commit.hash for commit in without_refs]

    with_refs = list(
        Repository(repo_path, include_refs=True).traverse_commits())
    assert len(with_refs) == 6
    hashes_with_refs = [commit.hash for commit in with_refs]

    only_with_refs = list(set(hashes_with_refs) - set(hashes_without_refs))
    assert len(only_with_refs) == 3

    extra_commits = (
        # First commit on branch b1
        '87a31153090808f1e6f679a14ea28729a0b74f4d',
        # Commit that branch b1 points to
        '702d469710d2087e662c210fd0e4f9418ec813fd',
        # Commit that branch b2 points to
        '7203c0b8220dcc7a59614bc7549799cd203ac072',
    )
    for expected in extra_commits:
        assert expected in only_with_refs