def map_advisory_record_onto_candidate_commits(advisory_record):
    '''
    Map the advisory record onto candidate commits.

    The versions mentioned in the advisory description are translated into
    intervals of repository tags, and the commits that lie within those
    intervals are returned as candidates.

    Input:
        advisory_record (dict): must contain the keys 'repo_url' and 'description'

    Returns:
        list: a list of IDs of candidate commits

    Raises:
        ValueError: when 'repo_url' or 'description' is missing from the record
    '''
    if 'repo_url' not in advisory_record or 'description' not in advisory_record:
        raise ValueError('advisory record should contain variables repo_url and description.')

    # clone repository
    # NOTE(review): skip_existing=False presumably forces a fresh clone - confirm against Git.clone
    git_repo = Git(advisory_record['repo_url'], cache_path=GIT_CACHE)
    git_repo.clone(skip_existing=False)

    tags = git_repo.get_tags()
    versions_in_description = retreive_all_versions_from_description(advisory_record['description'])

    tag_intervals = []
    for version in versions_in_description:
        # map the mentioned version onto an interval of tags and add them to the list
        for version_interval_tags in version_to_wide_interval_tags(tags, version, git_repo):
            # identity check first: cheap, and it avoids scanning the list for None entries
            if version_interval_tags is None:
                continue
            if version_interval_tags not in tag_intervals:
                tag_intervals.append(version_interval_tags)

    # map the list of intervals onto candidate commits
    return get_commits_between_interval_tags(tag_intervals, git_repo=git_repo)
def select_commit_ids_based_on_vulnerability_publish_date(vulnerability_published_timestamp, git_repo=None, repo_url=None, days_before=730, days_after=100, commits_before_cap=5215, commits_after_cap=100):
    '''
    To select commit IDs based on the vulnerability publish date.
    This can be used as a starting position for the search for fix commits.

    Input:
        vulnerability_published_timestamp (int): the timestamp at which the vulnerability has been published i.e. in the NVD
        git_repo (git_explorer.core.Git): to use for extracting the content
        repo_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        days_before (int): the maximum number of days before the release timestamp (edge)
        days_after (int): the maximum number of days after the release timestamp (edge)
        commits_before_cap (int): the maximum number of commits before the release timestamp (edge)
        commits_after_cap (int): the maximum number of commits after the release timestamp (edge)

    Returns:
        list: a list of commit IDs within the interval; the (capped) commits
            before the publish date come first, followed by the (capped)
            commits after it

    Raises:
        TypeError: when no git_repo is given and it could not be created
            and cloned from repo_url
    '''
    if git_repo is None:
        try:
            git_repo = Git(repo_url, cache_path=GIT_CACHE)
            git_repo.clone(skip_existing=True)
        except Exception as error:
            # keep the exception type callers rely on, but preserve the real cause
            raise TypeError('git-repo should be of type git_explorer.core.Git, not {}, or repo_url should be a valid github repository url.'.format(type(git_repo))) from error

    published_timestamp = int(vulnerability_published_timestamp)

    ### Add commits before NVD release
    since, until = database.timestamp_to_timestamp_interval(published_timestamp, days_before=days_before, days_after=0)
    # the lookup may yield None when the git command fails; treat that as "no commits"
    commit_ids_to_add_before = database.get_commit_ids_between_timestamp_interval(str(since), str(until), git_repo=git_repo, repository_url=repo_url) or []

    # maximum to add: the lookup is documented as newest-first, so the head
    # of the list holds the commits closest before the NVD release date
    if len(commit_ids_to_add_before) > commits_before_cap:
        commit_ids_to_add_before = commit_ids_to_add_before[:commits_before_cap]

    ### Add commits after NVD release
    since, until = database.timestamp_to_timestamp_interval(published_timestamp, days_before=0, days_after=days_after)
    commit_ids_to_add_after = database.get_commit_ids_between_timestamp_interval(str(since), str(until), git_repo=git_repo, repository_url=repo_url) or []

    # maximum to add: the tail of the newest-first list holds the commits
    # closest after the NVD release date. Dropping the surplus from the front
    # (rather than slicing with [-cap:]) also works for a cap of 0, where
    # [-0:] would keep the entire list.
    surplus = len(commit_ids_to_add_after) - commits_after_cap
    if surplus > 0:
        commit_ids_to_add_after = commit_ids_to_add_after[surplus:]

    return commit_ids_to_add_before + commit_ids_to_add_after
def get_commit_ids_between_timestamp_interval(since, until, git_repo=None, repository_url=None):
    '''
    Function to get all commit IDs that have been committed in the timestamp interval
    Based on git_explorer.core.get_commits()

    The order is from newest to oldest: the result[0] is the most recent one (larger timestamp),
    the result[-1] is the oldest (smallest timestamp)

    Input:
        since (str/int/float): timestamp in format i.e. '123456789'
        until (str/int/float): timestamp in format i.e. '123456789'
        git_repo (git_explorer.core.Git): to use for extracting the content
        repository_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo

    Returns:
        list: the commit IDs that have been committed in the timestamp interval;
            an empty list when the git command fails

    Raises:
        ValueError: when neither git_repo nor repository_url is provided, or
            when since >= until
    '''
    if git_repo is None and repository_url is None:
        raise ValueError('Provide a git_repo or a repository_url')

    if int(since) >= int(until):
        raise ValueError('The timestamps provided result in an interval without commit IDs, as since >= until.')

    if git_repo is None:
        git_repo = Git(repository_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=True)

    # create git command
    cmd = ["git", "rev-list", "--all", "--since=" + str(since), "--until=" + str(until)]

    try:
        out = git_repo._exec.run(cmd)
    except Exception:
        # best effort: report the failure and hand back the documented
        # return type instead of an implicit None
        print("Git command failed. Could not obtain commit ids.")
        return []

    return [line.strip() for line in out]
def add_tags_to_database(connection, tags=None, git_repo=None, repo_url=None, verbose=True):
    '''
    Add tags to the database

    Tags that are already stored for the repository are skipped. For every
    remaining tag a timestamp is looked up and a row is inserted into the
    tags table; a tag whose lookup or insert fails is reported and skipped.

    Input:
        connection (sqlite3.connection): the connection to the database
        tags (list): a list of tags (a single tag may be given as a str);
            when omitted, all tags of the repository are used
        git_repo (git_explorer.core.Git): to use for extracting the content
        repo_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"

    Raises:
        ValueError: when neither git_repo nor repo_url is provided
    '''
    if git_repo is None and repo_url is None:
        raise ValueError('Provide a git_repo or a repo_url')

    if git_repo is None:
        git_repo = Git(repo_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=False)

    if repo_url is None:
        repo_url = git_repo.get_url()
    # normalise the url: drop a trailing '.git' or a trailing '/'
    repo_url = re.sub(r'\.git$|/$', '', repo_url)

    if tags is None:
        tags = git_repo.get_tags()
    elif isinstance(tags, str):
        tags = [tags]
    if len(tags) == 0:
        return

    # to not add duplicates
    tags = list(dict.fromkeys(tags))  # to get only unique tags

    cursor = connection.cursor()
    try:
        # Fetch the tags stored for this repository with a bound parameter and
        # filter in Python. Formatting the tag tuple into the statement broke
        # for a single tag ("IN ('x',)") and for values containing quotes.
        cursor.execute("SELECT tag FROM tags WHERE repo_url = :repo_url", {'repo_url': repo_url})
        stored_tags = {row[0] for row in cursor.fetchall()}

        tags_already_in_the_db = [tag for tag in tags if tag in stored_tags]
        tags_to_add = [tag for tag in tags if tag not in stored_tags]
        if len(tags_to_add) == 0:
            return

        print(' Adding new tags to the database')
        for tag in tqdm(tags_to_add):
            try:
                tag_timestamp = filter.get_timestamp_for_tag(tag, git_repo)

                # add to database
                cursor.execute("INSERT INTO tags VALUES (:tag, :repo_url, :tag_timestamp)", {'tag': tag, 'repo_url': repo_url, 'tag_timestamp': str(tag_timestamp)})
            except Exception:
                # best effort: one bad tag should not stop the others
                print(' Failed to add tag {}'.format(tag))
        connection.commit()

        if verbose:
            print(' {} / {} tags were already in the database and added the rest.'.format(len(tags_already_in_the_db), len(tags)))
    finally:
        cursor.close()
    return
def get_commits_between_interval_tags(intervals_tags, git_repo=None, repo_url=None):
    '''
    Finds the commits between intervals tags

    Input:
        intervals_tags (list): tags for version intervals; every entry is
            unpacked into a pair (t1, t2)
        git_repo (git_explorer.core.Git): to use for extracting the content
        repo_url (str): the URL of the repository to draw the commits from,
            used when git_repo is not provided

    Returns:
        list: a list with IDs for commits in the intervals, without
            duplicates and in order of first appearance

    Raises:
        TypeError: when no git_repo is given and it could not be created
            and cloned from repo_url
    '''
    # obtain candidate commits with git-explorer
    if git_repo is None:
        try:
            git_repo = Git(repo_url, cache_path=GIT_CACHE)
            git_repo.clone(skip_existing=True)
        except Exception as error:
            # keep the exception type callers rely on, but preserve the real cause
            raise TypeError('git-repo should be of type git_explorer.core.Git, not {}, or repo_url should be a valid github repository url.'.format(type(git_repo))) from error

    candidate_commits = []
    for t1, t2 in intervals_tags:
        #@TODO: one tag before this one
        cid_1 = git_repo.get_commit_id_for_tag(t1)
        time_1 = Commit(git_repo, cid_1).get_timestamp()
        cid_2 = git_repo.get_commit_id_for_tag(t2)

        # ask for ancestors of t2's commit that are not ancestors of t1's
        # commit, limited to commits since t1's timestamp
        candidate_commits += git_repo.get_commits(since=time_1, ancestors_of=cid_2, exclude_ancestors_of=cid_1, filter_files='*')

    return list(dict.fromkeys(candidate_commits))  # only unique ids
def git_repo(example_advisory_record):
    '''
    Build and clone the repository referenced by the example advisory record.

    Input:
        example_advisory_record (dict): record whose 'repo_url' entry points
            at the repository

    Returns:
        Git: the repository object after clone() has been called on it
    '''
    # NOTE(review): skip_existing=True presumably reuses a clone that is already cached - confirm
    repository = Git(example_advisory_record['repo_url'], cache_path=GIT_CACHE)
    repository.clone(skip_existing=True)
    return repository
def add_commits_to_database(connection, commit_ids, git_repo=None, repository_url=None, driver=None, with_message_references_content=False, verbose=True):
    '''
    Add commits to the database

    Commits that are already stored for the repository are skipped. For every
    remaining commit the message, timestamp and diff are read through git,
    preprocessed and inserted into the commits table; a commit for which any
    of these steps fails is reported and skipped.

    Input:
        connection (sqlite3.connection): the connection to the database
        commit_ids (list): a list of commit_ids (a single ID may be given as a str)
        git_repo (git_explorer.core.Git): to use for extracting the content
        repository_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        driver: a webdriver can be provided to avoid javascript required pages
        with_message_references_content (bool): to add commits references (requires additional time)
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"

    Raises:
        ValueError: when neither git_repo nor repository_url is provided
    '''
    if git_repo is None and repository_url is None:
        raise ValueError('Provide a git_repo or a repository_url')

    if git_repo is None:
        git_repo = Git(repository_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=True)

    if repository_url is None:
        repository_url = git_repo.get_url()
    # normalise the url: drop a trailing '.git' or a trailing '/'
    repository_url = re.sub(r'\.git$|/$', '', repository_url)

    if isinstance(commit_ids, str):
        commit_ids = [commit_ids]
    if len(commit_ids) == 0:
        print('No commit IDs were provided')
        return

    # to not add duplicates
    commit_ids = list(dict.fromkeys(commit_ids))  # to get only unique ids

    cursor = connection.cursor()
    try:
        # Fetch the stored IDs for this repository with a bound parameter and
        # filter in Python, instead of formatting the url and the ID tuple
        # into the statement (which broke on quotes and needed a padded tuple).
        cursor.execute("SELECT id FROM commits WHERE repository_url = :repository_url", {'repository_url': repository_url})
        stored_ids = {row[0] for row in cursor.fetchall()}

        commits_already_in_the_db = [commit_id for commit_id in commit_ids if commit_id in stored_ids]
        commits_to_add = [commit_id for commit_id in commit_ids if commit_id not in stored_ids]
        if len(commits_to_add) == 0:
            return

        if verbose:
            print(' {} / {} are already in the database, now adding the rest.'.format(len(commits_already_in_the_db), len(commit_ids)))

        for commit_id in tqdm(commits_to_add):
            try:
                # initialize commit object
                commit = Commit(git_repo, commit_id)

                # message execution is combined with timestamp execution to speed up the process:
                # the last output line is the commit timestamp (%ct), the rest is the message (%B)
                message = commit._exec.run(['git', 'log', '--format=%B%n%ct', '-n1', commit._id])
                timestamp = message.pop(-1)

                diff = commit._exec.run(['git', 'diff', '--unified=1', commit._id + "^.." + commit._id])
                changed_files = get_changed_files_from_diff(diff)
                hunks = get_hunks_from_diff(diff)

                preprocessed_message = rank.simpler_filter_text(message)
                relevant_diff_lines = ' '.join(rank.extract_relevant_lines_from_commit_diff(diff))
                preprocessed_diff = rank.simpler_filter_text(re.sub('[^A-Za-z0-9]+', ' ', relevant_diff_lines))
                preprocessed_changed_files = rank.simpler_filter_text(changed_files)

                if with_message_references_content:
                    commit_message_reference_content = extract_commit_message_reference_content(message, repository_url, driver)
                    preprocessed_commit_message_reference_content = rank.extract_n_most_occurring_words(commit_message_reference_content, n=20)
                else:
                    commit_message_reference_content, preprocessed_commit_message_reference_content = None, None

                # add to database; the connection context manager commits the
                # insert, or rolls it back when it raises
                with connection:
                    cursor.execute(
                        "INSERT INTO commits VALUES (:repository_url, :id, :timestamp, :message, :changed_files, :diff, :hunks, :commit_message_reference_content, :preprocessed_message, :preprocessed_diff, :preprocessed_changed_files, :preprocessed_commit_message_reference_content)",
                        {
                            'repository_url': repository_url,
                            'id': commit_id,
                            'timestamp': str(timestamp),
                            'message': str(message),
                            'changed_files': str(changed_files),
                            'diff': str(diff),
                            'hunks': str(hunks),
                            'commit_message_reference_content': commit_message_reference_content,
                            'preprocessed_message': preprocessed_message,
                            'preprocessed_diff': preprocessed_diff,
                            'preprocessed_changed_files': preprocessed_changed_files,
                            'preprocessed_commit_message_reference_content': preprocessed_commit_message_reference_content,
                        })
            except Exception:
                # best effort: one bad commit should not stop the others
                print(' Failed to add commit {}'.format(commit_id))

        if verbose:
            print(' All commits have been added to the database.')
    finally:
        cursor.close()
    return
def example_vulnerability_git_repo(example_vulnerability):
    '''
    Build and clone the repository referenced by the example vulnerability.

    Input:
        example_vulnerability (dict): record whose 'repo_url' entry points
            at the repository

    Returns:
        Git: the repository object after clone() has been called on it
    '''
    # NOTE(review): skip_existing=True presumably reuses a clone that is already cached - confirm
    repository = Git(example_vulnerability['repo_url'], cache_path=GIT_CACHE)
    repository.clone(skip_existing=True)
    return repository