def test_extract_edits_1(git_repo_dir):
    """Line-based extraction resolves the originating commit of each edit.

    Uses the settings-dict API of ``_extract_edits``. The third edit has no
    blame match (line-ending difference), so its 'original_commit_addition'
    is ``None``.
    """
    commit_hash = 'b17c2c321ce8d299de3d063ca0a1b0b363477505'
    filename = 'first_lines.txt'
    extraction_settings = {
        'use_blocks': False,
        'blame_C': 'CCC4',
        'extract_complexity': True,
        'extract_text': True
    }
    git_repo = pydriller.GitRepository(git_repo_dir)
    commit = git_repo.get_commit(commit_hash)
    for mod in commit.modifications:
        if mod.filename == filename:
            df = git2net.extraction._extract_edits(git_repo, commit, mod,
                                                   extraction_settings)
    assert len(df) == 3
    assert df.at[
        0, 'original_commit_addition'] == 'e4448e87541d19d139b9d033b2578941a53d1f97'
    assert df.at[
        1, 'original_commit_addition'] == '6b531fcb57d5b9d98dd983cb65357d82ccca647b'
    # Compare to None with `is`, not `==` (PEP 8; also avoids surprises from
    # pandas/NumPy overloaded equality).
    assert df.at[
        2, 'original_commit_addition'] is None  # as there is no match due to line ending
def test_extract_edits_2(repo_string):
    """With use_blocks=True a single edit is found and blame attribution
    of the original commit is unavailable (keyword-argument API)."""
    target = 'first_lines.txt'
    git_repo = pydriller.GitRepository(repo_string)
    commit = git_repo.get_commit('b17c2c321ce8d299de3d063ca0a1b0b363477505')
    df = None
    for mod in commit.modifications:
        if mod.filename != target:
            continue
        df = git2net.extraction._extract_edits(git_repo, commit, mod,
                                               use_blocks=True,
                                               blame_C='CCC4')
    assert len(df) == 1
    assert df.at[0, 'original_commit_addition'] == 'not available with use_blocks'
def test_extract_edits_2(git_repo_dir):
    """With use_blocks=True a single edit is found and blame attribution
    of the original commit is unavailable (settings-dict API)."""
    settings = {
        'use_blocks': True,
        'blame_C': 'CCC4',
        'extract_complexity': True,
        'extract_text': True
    }
    git_repo = pydriller.GitRepository(git_repo_dir)
    commit = git_repo.get_commit('b17c2c321ce8d299de3d063ca0a1b0b363477505')
    df = None
    for mod in commit.modifications:
        if mod.filename != 'first_lines.txt':
            continue
        df = git2net.extraction._extract_edits(git_repo, commit, mod, settings)
    assert len(df) == 1
    assert df.at[0, 'original_commit_addition'] == 'not available with use_blocks'
def test_identify_edits(repo_string):
    """_identify_edits classifies the diff of text_file.txt into the
    expected sequence of edit types."""
    git_repo = pydriller.GitRepository(repo_string)
    commit = git_repo.get_commit('f343ed53ee64717f85135c4b8d3f6bd018be80ad')
    for candidate in commit.modifications:
        if candidate.filename == 'text_file.txt':
            mod = candidate
    diff = git_repo.parse_diff(mod.diff)
    deleted_lines = {entry[0]: entry[1] for entry in diff['deleted']}
    added_lines = {entry[0]: entry[1] for entry in diff['added']}
    _, edits = git2net.extraction._identify_edits(deleted_lines, added_lines,
                                                  use_blocks=False)
    expected = ['deletion', 'replacement', 'deletion', 'replacement',
                'addition', 'addition', 'addition']
    assert list(edits.type) == expected
def test_extract_edits_1(repo_string):
    """Line-based extraction attributes each of the three edits to its
    originating commit (keyword-argument API)."""
    git_repo = pydriller.GitRepository(repo_string)
    commit = git_repo.get_commit('b17c2c321ce8d299de3d063ca0a1b0b363477505')
    for mod in commit.modifications:
        if mod.filename != 'first_lines.txt':
            continue
        df = git2net.extraction._extract_edits(git_repo, commit, mod,
                                               use_blocks=False,
                                               blame_C='CCC4')
    assert len(df) == 3
    expected_origins = [
        'e4448e87541d19d139b9d033b2578941a53d1f97',
        '6b531fcb57d5b9d98dd983cb65357d82ccca647b',
        'e4448e87541d19d139b9d033b2578941a53d1f97',
    ]
    for row, origin in enumerate(expected_origins):
        assert df.at[row, 'original_commit_addition'] == origin
def compileAuthors(self):
    """Mine all repos in the repo list for commits by those in authors. None for get all.

    For every repo in ``self.repos`` not yet in ``self.minedRepos``:
    resolve its GitHub remote, traverse its commits with pydriller, map
    local git author names to GitHub logins, and accumulate commit hashes,
    touched files, added source lines, and reconstructed functions on the
    per-author objects in ``self.authors``.  Progress is tracked in
    ``self.minedRepos``; on completion ``self.repos`` is replaced by it.
    """
    print("mining " + self.name)
    # Only visit repos that were not processed in a previous run.
    reposToMine = []
    for repo in self.repos:
        if repo not in self.minedRepos:
            reposToMine.append(repo)
    for repo in reposToMine:
        try:
            if not os.path.exists(
                    repo + "/.git"):  # in case the repo is one level down
                repo = repo + "/" + os.listdir(repo)[0]
                # print("moved to "+repo)
            if repo in self.minedRepos:
                continue
            self.minedRepos.add(repo)
            # if not os.path.exists(repo+".git"):
            #     repo = os.listdir(repo)[0]
            # GitHub API handle for this repo (used for author resolution).
            remote = self.get_remote(repo)
            created = remote.created_at
            if self.authorsToMine:
                # Restrict mining to not-yet-seen commits by the requested
                # authors; skip the repo entirely if there are none.
                commitsToMine = []
                for authorName in self.authorsToMine:
                    for commit in remote.get_commits(author=authorName):
                        if commit.sha not in self.commits:
                            commitsToMine.append(commit.sha)
                if not commitsToMine:
                    print("No important commits here, skipping " + repo)
                    continue
                miner = pydriller.repository_mining.RepositoryMining(
                    repo,
                    only_modifications_with_file_types=gitProfileSet.langList,
                    only_no_merge=True,
                    only_commits=commitsToMine,
                    since=created)
            else:
                miner = pydriller.repository_mining.RepositoryMining(
                    repo,
                    only_modifications_with_file_types=gitProfileSet.langList,
                    only_no_merge=True,
                    since=created)
            repository = pydriller.GitRepository(repo)
            print("Scanning repo: " + miner._path_to_repo)
            for commit in tqdm(miner.traverse_commits()):
                try:
                    author = commit.author
                    # Resolve the local git author name to a GitHub login,
                    # creating a gitAuthor entry on first sight.  Commits
                    # whose GitHub author cannot be resolved are skipped.
                    if author.name not in self.aliases:
                        ghCommit = remote.get_commit(commit.hash)
                        namedUser = ghCommit.author
                        if not namedUser:
                            continue
                        if namedUser.login not in self.authors:
                            self.authors[namedUser.login] = gitAuthor(
                                namedUser)
                        self.aliases[author.name] = namedUser.login
                    author = self.authors[self.aliases[author.name]]
                    if self.authorsToMine and author.name not in self.authorsToMine:
                        continue
                    if commit.hash in author.commits or commit.hash in self.commits:
                        continue  # don't reprocess seen hashes
                    self.commits.add(commit.hash)
                    author.commits.add(commit.hash)
                    if repo not in author.repos:
                        author.repos.add(repo)
                    for mod in commit.modifications:
                        mod._calculate_metrics()
                        # Skip files whose extension is not a tracked language.
                        if mod.new_path is None or not mod.new_path.split(
                                ".")[-1] in gitProfileSet.langList:
                            continue
                        author.files.add(mod.new_path)
                        # parse diff and add lines to list
                        newSC = list()
                        leDiff = repository.parse_diff(mod.diff)
                        for num, line in leDiff["added"]:
                            newSC.append(line)
                        # Run lizard over only the ADDED lines to locate the
                        # functions this commit contributed.
                        from lizard import analyze_file as liz
                        fileInfo = liz.analyze_source_code(
                            mod.new_path, "\n".join(newSC))
                        # maintain list of dicts containing the source code of
                        # specific functions. Same format as for lines
                        lineIndex = 0
                        for fun in fileInfo.function_list:
                            # Heuristic: the function body is considered
                            # "started" once its argument list is terminated
                            # by ")" followed by "{" (possibly split across
                            # two adjacent added lines).
                            arg_list_termination = r"\)\s*{"
                            started = False
                            newFun = dict()
                            lineStr = ""
                            try:
                                # Advance to the first added line belonging to
                                # this function.
                                while (leDiff["added"][lineIndex][0] <
                                       fun.start_line):
                                    lineIndex += 1
                                # Collect the function's added lines, keyed by
                                # (commit hash, file path, line number).
                                while (leDiff["added"][lineIndex][0] <
                                       fun.end_line + 1):
                                    last_lineStr = lineStr
                                    lineStr = leDiff["added"][lineIndex][1]
                                    if not started and re.search(
                                            arg_list_termination,
                                            "".join([lineStr, last_lineStr])):
                                        started = True
                                    newFun.update({
                                        (commit.hash, mod.new_path,
                                         leDiff["added"][lineIndex][0]):
                                        lineStr
                                    })
                                    lineIndex += 1
                            except IndexError:
                                # if end of input reached before end of
                                # functions. This is probable when
                                # non-complete functions are submitted.
                                pass
                            # Keep only functions that both opened and
                            # (apparently) closed within the diff.
                            # NOTE(review): if the IndexError above fires
                            # before the second while-loop runs once,
                            # `last_lineStr` is unbound here; the resulting
                            # NameError is absorbed by the per-commit
                            # `except Exception` below — verify intended.
                            if started and len(
                                    newFun) > 1 and '}' in lineStr + last_lineStr:
                                author.lines.update(newFun)
                                author.functions.append(
                                    self.functionToString(newFun))
                except Exception as e:
                    # NOTE(review): silently drops any per-commit failure —
                    # consider logging `e` before continuing.
                    continue
        except Exception as e:
            # Best-effort: a broken repo must not abort the whole run.
            print("problem processing " + repo)
            continue
        except KeyboardInterrupt:
            # KeyboardInterrupt is a BaseException, so it is NOT caught by
            # the Exception clause above: Ctrl-C skips to the next repo.
            print("continuing")
            continue
        self.minedRepos.add(repo)
        print(str("finished" + str(miner._path_to_repo)))
    print(self)
    self.repos = self.minedRepos
def get_repo(repo_path):
    """Return a pydriller ``GitRepository`` wrapping the repo at *repo_path*."""
    repository = pydriller.GitRepository(repo_path)
    return repository
help='Only include vulnerabilities of given severity level or higher ' '(Default: NONE, include all)') parser.add_argument('--confidence', type=Level.from_string, choices=list(Level), default=Level.NONE, help='Only include vulnerabilities of given confidence level or higher ' '(Default: NONE, include all)') parser.add_argument('--no-merge', action='store_true', help='Do not include merge commits') args = parser.parse_args() try: # Clone the repo if URL is provided self = pd.RepositoryMining(args.repo) if self._isremote(args.repo): tmp_folder = tempfile.TemporaryDirectory() args.repo = self._clone_remote_repos(tmp_folder.name, args.repo) repo = pd.GitRepository(args.repo) args.single = repo.get_commit(args.single).hash if args.single else None args.since = datetime.strptime(args.since, '%Y/%m/%d') if args.since else None args.to = datetime.strptime(args.to, '%Y/%m/%d') if args.to else None args.from_commit = repo.get_commit(args.from_commit).hash if args.from_commit else None args.to_commit = repo.get_commit(args.to_commit).hash if args.to_commit else None repo_mining = pd.RepositoryMining(args.repo, single=args.single, since=args.since, to=args.to, from_commit=args.from_commit, to_commit=args.to_commit, from_tag=args.from_tag, to_tag=args.to_tag,