Exemplo n.º 1
0
def map_advisory_record_onto_candidate_commits(advisory_record):
    '''
    Map the advisory record onto candidate commits.

    The repository named in the record is cloned, every version found in the
    description is mapped onto intervals of tags, and the commits between
    those interval tags are returned.

    Input:
        advisory_record (dict): must contain the keys 'repo_url' and 'description'

    Returns:
        list: a list of IDs of candidate commits

    Raises:
        ValueError: if 'repo_url' or 'description' is missing from the record
    '''
    if 'repo_url' not in advisory_record or 'description' not in advisory_record:
        raise ValueError('advisory record should contain variables repo_url and description.')

    # clone repository
    git_repo = Git(advisory_record['repo_url'], cache_path=GIT_CACHE)
    git_repo.clone(skip_existing=False)
    tags = git_repo.get_tags()

    versions_in_description = retreive_all_versions_from_description(advisory_record['description'])

    tag_intervals = []
    for version in versions_in_description:
        # map the mentioned version onto intervals of tags, keeping every interval only once
        for version_interval_tags in version_to_wide_interval_tags(tags, version, git_repo):
            if version_interval_tags is None:
                # no usable interval for this version
                continue
            if version_interval_tags not in tag_intervals:
                tag_intervals.append(version_interval_tags)

    # map the list of intervals onto candidate commits
    return get_commits_between_interval_tags(tag_intervals, git_repo=git_repo)
Exemplo n.º 2
0
def select_commit_ids_based_on_vulnerability_publish_date(vulnerability_published_timestamp, git_repo=None, repo_url=None, days_before=730, days_after=100, commits_before_cap=5215, commits_after_cap=100):
    '''
    Select commit IDs around the vulnerability publish date.
    This can be used as a starting position for the search for fix commits.

    Input:
        vulnerability_published_timestamp (int): the timestamp at which the vulnerability was published, i.e. in the NVD
        git_repo (git_explorer.core.Git): to use for extracting the content
        repo_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        days_before (int): the maximum number of days before the release timestamp (edge)
        days_after (int): the maximum number of days after the release timestamp (edge)
        commits_before_cap (int): the maximum number of commits before the release timestamp (edge)
        commits_after_cap (int): the maximum number of commits after the release timestamp (edge)

    Returns:
        list: the (capped) commit IDs from before the publish date, followed by
            the (capped) commit IDs from after it

    Raises:
        TypeError: if git_repo is not provided and no repository could be initialized from repo_url
    '''

    if git_repo is None:
        try:
            git_repo = Git(repo_url, cache_path=GIT_CACHE)
            git_repo.clone(skip_existing=True)
        except Exception as error:
            # keep the original exception as the cause so the real failure stays visible
            raise TypeError('git-repo should be of type git_explorer.core.Git, not {}, or repo_url should be a valid github repository url.'.format(type(git_repo))) from error

    published_timestamp = int(vulnerability_published_timestamp)

    ### Add commits before NVD release
    since, until = database.timestamp_to_timestamp_interval(published_timestamp, days_before=days_before, days_after=0)
    # a failed lookup may yield None; treat that as "no commits" instead of crashing on len()
    commit_ids_to_add_before = database.get_commit_ids_between_timestamp_interval(str(since), str(until), git_repo=git_repo, repository_url=repo_url) or []

    # maximum to add
    # NOTE(review): assumes the lookup returns newest first, so the head of the
    # list holds the commits closest before the publish date - confirm
    if len(commit_ids_to_add_before) > commits_before_cap:
        commit_ids_to_add_before = commit_ids_to_add_before[:commits_before_cap]

    ### Add commits after NVD release
    since, until = database.timestamp_to_timestamp_interval(published_timestamp, days_before=0, days_after=days_after)
    commit_ids_to_add_after = database.get_commit_ids_between_timestamp_interval(str(since), str(until), git_repo=git_repo, repository_url=repo_url) or []

    # maximum to add: with the same ordering assumption, the tail of the list
    # holds the commits closest after the publish date.
    # The start index is computed explicitly because [-0:] would keep the whole list.
    if len(commit_ids_to_add_after) > commits_after_cap:
        first_kept = len(commit_ids_to_add_after) - commits_after_cap
        commit_ids_to_add_after = commit_ids_to_add_after[first_kept:]

    return commit_ids_to_add_before + commit_ids_to_add_after
Exemplo n.º 3
0
def get_commit_ids_between_timestamp_interval(since,
                                              until,
                                              git_repo=None,
                                              repository_url=None):
    '''
    Function to get all commit IDs that have been committed in the timestamp interval
        Based on git_explorer.core.get_commits()
        The order is from newest to oldest: the result[0] is the most recent one (larger timestamp), the result[-1] is the oldest (smallest timestamp)

    Input:
        since (str/int/float): timestamp in format i.e. '123456789'
        until (str/int/float): timestamp in format i.e. '123456789'
        git_repo (git_explorer.core.Git): to use for extracting the content
        repository_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo

    Returns:
        list: the commit IDs that have been committed in the timestamp interval,
            or None when the git command fails (a message is printed in that case)

    Raises:
        ValueError: if neither git_repo nor repository_url is provided,
            or if since is not strictly smaller than until
    '''
    if git_repo is None and repository_url is None:
        raise ValueError('Provide a git_repo or a repository_url')

    if int(since) >= int(until):
        raise ValueError(
            'The timestamps provided result in an interval without commit IDs, as since >= until.'
        )

    if git_repo is None:
        git_repo = Git(repository_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=True)

    # create git command
    cmd = [
        "git", "rev-list", "--all",
        "--since=" + str(since),
        "--until=" + str(until),
    ]

    try:
        out = git_repo._exec.run(cmd)
    except Exception:
        # best effort: report and signal failure with None, but let
        # KeyboardInterrupt / SystemExit propagate
        print("Git command failed. Could not obtain commit ids.")
        return None

    return [line.strip() for line in out]
Exemplo n.º 4
0
def add_tags_to_database(connection, tags=None, git_repo=None, repo_url=None, verbose=True):
    '''
    Add tags to the database

    Tags that are already stored for the repository are skipped. For every
    remaining tag its timestamp is looked up and a row is inserted into the
    tags table. A tag whose lookup or insert fails is reported and skipped.

    Input:
        connection (sqlite3.connection): the connection to the database
        tags (list/str): a list of tags (or a single tag); when omitted, all tags of the repository are used
        git_repo (git_explorer.core.Git): to use for extracting the content
        repo_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"

    Raises:
        ValueError: if neither git_repo nor repo_url is provided
    '''
    if git_repo is None and repo_url is None:
        raise ValueError('Provide a git_repo or a repo_url')

    if git_repo is None:
        git_repo = Git(repo_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=False)

    if repo_url is None:
        repo_url = git_repo.get_url()

    # strip a trailing '.git' or '/' from the url
    repo_url = re.sub(r'\.git$|/$', '', repo_url)

    if tags is None:
        tags = git_repo.get_tags()
    elif isinstance(tags, str):
        tags = [tags]
    if len(tags) == 0:
        return

    # to not add duplicates
    tags = list(dict.fromkeys(tags))  # to get only unique tags

    cursor = connection.cursor()
    try:
        # Fetch the tags stored for this repository and filter in Python. This
        # avoids building an IN (...) clause from a Python tuple, which produced
        # invalid SQL for a single tag and broke on tags containing quotes.
        cursor.execute("SELECT tag FROM tags WHERE repo_url = :repo_url", {'repo_url': repo_url})
        stored_tags = {row[0] for row in cursor.fetchall()}
        tags_already_in_the_db = [tag for tag in tags if tag in stored_tags]
        tags_to_add = [tag for tag in tags if tag not in stored_tags]

        if len(tags_to_add) == 0:
            return

        print('    Adding new tags to the database')
        for tag in tqdm(tags_to_add):
            try:
                tag_timestamp = filter.get_timestamp_for_tag(tag, git_repo)

                # add to database
                cursor.execute("INSERT INTO tags VALUES (:tag, :repo_url, :tag_timestamp)",
                    {'tag':tag, 'repo_url':repo_url, 'tag_timestamp':str(tag_timestamp)})
            except Exception:
                # best effort: one failing tag must not stop the others
                print('    Failed to add tag {}'.format(tag))

        connection.commit()
        if verbose: print('    {} / {} tags were already in the database and added the rest.'.format(len(tags_already_in_the_db), len(tags)))
    finally:
        cursor.close()
    return
Exemplo n.º 5
0
def get_commits_between_interval_tags(intervals_tags, git_repo=None, repo_url=None):
    '''
    Finds the commits between intervals tags

    For every (t1, t2) pair the commits are requested from the repository
    with: since the timestamp of t1's commit, ancestors of t2's commit,
    excluding ancestors of t1's commit.

    Input:
        intervals_tags (list): tags for version intervals, each item being a pair of tags
        git_repo (git_explorer.core.Git): the repository to draw the commits from
        repo_url (str): if git_repo is not provided, the URL of the repository to draw the commits from

    Returns:
        list: a list with IDs for commits in the intervals (unique, in order of first appearance)

    Raises:
        TypeError: if git_repo is not provided and no repository could be initialized from repo_url
    '''
    # obtain candidate commits with git-explorer
    if git_repo is None:
        try:
            git_repo = Git(repo_url, cache_path=GIT_CACHE)
            git_repo.clone(skip_existing=True)
        except Exception as error:
            # keep the original exception as the cause so the real failure stays visible
            raise TypeError('git-repo should be of type git_explorer.core.Git, not {}, or repo_url should be a valid github repository url.'.format(type(git_repo))) from error

    candidate_commits = []
    for t1, t2 in intervals_tags:
        #@TODO: one tag before this one
        cid_1 = git_repo.get_commit_id_for_tag(t1)
        time_1 = Commit(git_repo, cid_1).get_timestamp()

        cid_2 = git_repo.get_commit_id_for_tag(t2)

        candidate_commits += git_repo.get_commits(since=time_1, ancestors_of=cid_2, exclude_ancestors_of=cid_1, filter_files='*')

    return list(dict.fromkeys(candidate_commits))  # only unique ids
def git_repo(example_advisory_record):
    '''
    Create a Git object for the repository of the example advisory record and clone it.

    Input:
        example_advisory_record (dict): a record with the key 'repo_url'

    Returns:
        the Git object on which clone() has been called
    '''
    repository = Git(example_advisory_record['repo_url'], cache_path=GIT_CACHE)
    # presumably skip_existing=True reuses a clone already in the cache - TODO confirm
    repository.clone(skip_existing=True)
    return repository
Exemplo n.º 7
0
def add_commits_to_database(connection,
                            commit_ids,
                            git_repo=None,
                            repository_url=None,
                            driver=None,
                            with_message_references_content=False,
                            verbose=True):
    '''
    Add commits to the database

    Commits already stored for the repository are skipped. For every remaining
    commit the message, timestamp, diff, changed files and hunks are extracted,
    preprocessed and inserted into the commits table. A commit for which any of
    these steps fails is reported and skipped.

    Input:
        connection (sqlite3.connection): the connection to the database
        commit_ids (list/str): a list of commit_ids (or a single commit id)
        git_repo (git_explorer.core.Git): to use for extracting the content
        repository_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        driver: a webdriver can be provided to avoid javascript required pages
        with_message_references_content (bool): to add commits references (requires additional time)
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"

    Raises:
        ValueError: if neither git_repo nor repository_url is provided
    '''
    if git_repo is None and repository_url is None:
        raise ValueError('Provide a git_repo or a repository_url')

    if git_repo is None:
        git_repo = Git(repository_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=True)

    if repository_url is None:
        repository_url = git_repo.get_url()
    # strip a trailing '.git' or '/' from the url
    repository_url = re.sub(r'\.git$|/$', '', repository_url)

    if isinstance(commit_ids, str):
        commit_ids = [commit_ids]
    if len(commit_ids) == 0:
        print('No commit IDs were provided')
        return

    # to not add duplicates
    commit_ids = list(dict.fromkeys(commit_ids))  # to get only unique ids

    cursor = connection.cursor()
    try:
        # Look up the stored ids with bound parameters instead of formatting a
        # Python tuple into the SQL text. The lookup runs in batches to stay
        # below SQLite's limit on the number of bound parameters per statement.
        batch_size = 500
        commits_already_in_the_db = set()
        for start in range(0, len(commit_ids), batch_size):
            batch = list(commit_ids[start:start + batch_size])
            placeholders = ', '.join('?' * len(batch))
            cursor.execute(
                'SELECT id FROM commits WHERE repository_url = ? AND id IN ({})'.format(placeholders),
                [repository_url] + batch)
            commits_already_in_the_db.update(row[0] for row in cursor.fetchall())

        commits_to_add = [
            commit_id for commit_id in commit_ids
            if commit_id not in commits_already_in_the_db
        ]

        if len(commits_to_add) == 0:
            return

        if verbose:
            print('    {} / {} are already in the database, now adding the rest.'.
                  format(len(commits_already_in_the_db), len(commit_ids)))

        for commit_id in tqdm(commits_to_add):
            try:
                # initialize commit object
                commit = Commit(git_repo, commit_id)

                # message execution is combined with timestamp execution to speed up to process:
                # %B prints the message body and %ct the committer timestamp on the last line
                message = commit._exec.run(
                    ['git', 'log', '--format=%B%n%ct', '-n1', commit._id])
                timestamp = message.pop(-1)

                # diff of the commit against its first parent, with one line of context
                diff = commit._exec.run([
                    'git', 'diff', '--unified=1', commit._id + "^.." + commit._id
                ])
                changed_files = get_changed_files_from_diff(diff)
                hunks = get_hunks_from_diff(diff)

                preprocessed_message = rank.simpler_filter_text(message)
                preprocessed_diff = rank.simpler_filter_text(
                    re.sub(
                        '[^A-Za-z0-9]+', ' ', ' '.join(
                            rank.extract_relevant_lines_from_commit_diff(diff))))
                preprocessed_changed_files = rank.simpler_filter_text(
                    changed_files)

                if with_message_references_content:
                    commit_message_reference_content = extract_commit_message_reference_content(
                        message, repository_url, driver)
                    preprocessed_commit_message_reference_content = rank.extract_n_most_occurring_words(
                        commit_message_reference_content, n=20)
                else:
                    commit_message_reference_content, preprocessed_commit_message_reference_content = None, None

                # add to database; the connection context manager wraps each insert
                # in its own transaction
                with connection:
                    cursor.execute(
                        "INSERT INTO commits VALUES (:repository_url, :id, :timestamp, :message, :changed_files, :diff, :hunks, :commit_message_reference_content, :preprocessed_message, :preprocessed_diff, :preprocessed_changed_files, :preprocessed_commit_message_reference_content)",
                        {
                            'repository_url': repository_url,
                            'id': commit_id,
                            'timestamp': str(timestamp),
                            'message': str(message),
                            'changed_files': str(changed_files),
                            'diff': str(diff),
                            'hunks': str(hunks),
                            'commit_message_reference_content': commit_message_reference_content,
                            'preprocessed_message': preprocessed_message,
                            'preprocessed_diff': preprocessed_diff,
                            'preprocessed_changed_files': preprocessed_changed_files,
                            'preprocessed_commit_message_reference_content': preprocessed_commit_message_reference_content,
                        })
            except Exception:
                # best effort: one failing commit must not stop the others
                print('    Failed to add commit {}'.format(commit_id))
        if verbose: print('    All commits have been added to the database.')
    finally:
        cursor.close()
    return
Exemplo n.º 8
0
def example_vulnerability_git_repo(example_vulnerability):
    '''
    Create a Git object for the repository of the example vulnerability and clone it.

    Input:
        example_vulnerability (dict): a record with the key 'repo_url'

    Returns:
        the Git object on which clone() has been called
    '''
    repository = Git(example_vulnerability['repo_url'], cache_path=GIT_CACHE)
    # presumably skip_existing=True reuses a clone already in the cache - TODO confirm
    repository.clone(skip_existing=True)
    return repository