Пример #1
0
def extract_commit_message_reference_content(commit_message, repo_url, driver=None):
    '''
    Find the references in a commit message and extract the content these references point to

    References without 'http' in them are looked up as issue (or pull request) numbers
    of the repository; all other references are requested as URLs.

    Input:
        commit_message (list/str): the commit message (a list is joined with spaces)
        repo_url (str): the repository URL (when commits refer to a Git issue)
        driver: a webdriver can be provided to avoid javascript required pages;
            it is also used as a fallback when the plain request fails

    Returns:
        list: a list containing the preprocessed content of the references that have been found
    '''
    if isinstance(commit_message, list):
        commit_message = ' '.join(commit_message)

    repo_url = re.sub(r'\.git$|/$', '', repo_url)
    references_content = []

    def visible_text(markup):
        # all visible strings of the page, joined with single spaces
        return ' '.join(BeautifulSoup(markup, "html.parser").stripped_strings)

    def text_via_driver(url):
        # load the page in the webdriver and give it a moment before reading the source
        driver.get(url)
        time.sleep(0.5)
        return visible_text(driver.page_source)

    for reference in rank.find_references(commit_message):
        time.sleep(0.5)  # pause between consecutive requests
        try:
            if 'http' not in reference:
                issue_number = reference.lstrip('#')
                r = requests.get(repo_url + '/issues/' + issue_number)

                # check if reference is found and whether it is an issue or pull page
                if issue_number in r.url and ('/issues/' in r.url or '/pull/' in r.url):
                    soup = BeautifulSoup(r.content, "html.parser")
                    page_text = ' '.join(
                        string for string in soup.stripped_strings
                        if string not in strings_on_every_GitHub_page)
                    references_content.append(rank.simpler_filter_text(page_text))
            else:
                # securityfocus.com requires a selection in a menu
                if 'securityfocus.com' in reference.strip('/.'):
                    reference = reference.strip('/.') + '/discuss'

                url = reference.strip('.')  # the reference can be the end of a sentence
                try:
                    r = requests.get(url)
                    reference_content = visible_text(r.content)

                    # Apache pony mail requires the webdriver to see the content
                    if 'requires JavaScript enabled' in reference_content and driver is not None:
                        reference_content = text_via_driver(url)
                except Exception:
                    if driver is None:
                        # no fallback available: let the outer handler report the failure
                        raise
                    reference_content = text_via_driver(url)

                references_content.append(rank.simpler_filter_text(reference_content))
        except Exception:
            # best effort: a reference that cannot be retrieved is reported and skipped
            print('Failed in obtaining content for reference {}'.format(reference))

    return references_content
Пример #2
0
def test_extract_n_most_occurring_words(example_commit_content):
    '''
    With n=1 the single most occurring word of the filtered text is expected, both for
    a plain sentence and for the commit message of the fixture (as stored and joined)
    '''
    sentence = 'Messages contain fix indicating words like fixing, fix or fixes, can also contain a lot of different words. And we do not want a lot of stopwords! From this description, fix should be the returned word and and and not not not a stopword.'
    filtered_sentence = rank.simpler_filter_text(sentence)
    assert rank.extract_n_most_occurring_words(filtered_sentence, n=1) == 'fix'

    # the commit message as stored in the fixture
    filtered_message = rank.simpler_filter_text(
        example_commit_content['message'])
    assert rank.extract_n_most_occurring_words(filtered_message, n=1) == 'add'

    # the same commit message joined into a single string
    filtered_joined_message = rank.simpler_filter_text(
        ' '.join(example_commit_content['message']))
    assert rank.extract_n_most_occurring_words(filtered_joined_message,
                                               n=1) == 'add'
Пример #3
0
def test_simpler_filter_text(example_commit_content):
    '''
    simpler_filter_text must cope with real commit content (where the message is
    provided as a list) as well as with plain strings
    '''
    expected_commit_text = 'add changelog merge'

    filtered_from_fixture = rank.simpler_filter_text(
        text=example_commit_content['message'])
    assert filtered_from_fixture == expected_commit_text

    filtered_from_string = rank.simpler_filter_text(
        text=' '.join(example_commit_content['message']))
    assert filtered_from_string == expected_commit_text

    sentence = 'This is an example sentence to test the functionalities of filtered_text'
    filtered_sentence = rank.simpler_filter_text(text=sentence)
    assert filtered_sentence == 'example sentence test functionality filtered_text filter text'
def map_description_to_repository_url(vulnerability_id, description,
                                      vulnerabilities_df, repository_url_df):
    '''
    Suggest the repository URL for a vulnerability

    Input:
        vulnerability_id (str): the identifier of the vulnerability
        description (str): the text that is compared with the project names
        vulnerabilities_df (pd.DataFrame/None): the known vulnerabilities, indexed by
            vulnerability ID and containing a 'repo_url' column
        repository_url_df (pd.DataFrame): the candidate repositories, containing the
            columns 'repo_url' and 'project_name'

    Returns:
        None when vulnerabilities_df is None; the stored repo_url when the
        vulnerability is known; otherwise the repo_url whose project name has the
        highest TF-IDF cosine similarity with the description (on a tie, the
        first one in repository_url_df)

    Raises:
        ValueError: when the similarity search runs on a repository_url_df
            without rows (nothing to choose from)
    '''
    # if the vulnerabilities df is empty
    if vulnerabilities_df is None:
        return None

    if vulnerability_id in vulnerabilities_df.index:
        return vulnerabilities_df.at[vulnerability_id, 'repo_url']

    # else return url with highest lexical similarity
    repo_urls = list(repository_url_df.repo_url)
    project_names = list(repository_url_df.project_name)

    preprocessed_description = rank.simpler_filter_text(
        [re.sub(r'[^\w]', ' ', token.text)
         for token in nlp(description)]).lower()

    # row 0 is the description, row i + 1 belongs to the i-th repository
    tfidf_vectorized_strings = TfidfVectorizer().fit_transform(
        [preprocessed_description] + project_names)

    scores = {
        repo_url: cosine_similarity(tfidf_vectorized_strings[0],
                                    tfidf_vectorized_strings[i + 1])[0][0]
        for i, repo_url in enumerate(repo_urls)
    }

    # max() returns the first maximal key in insertion order, which is what
    # a stable descending sort followed by taking the first element yields
    return max(scores, key=scores.get)
Пример #5
0
def add_vulnerability_to_database(connection, vulnerability_id, repo_url, description=None, published_timestamp=None, references=None, driver=None, verbose=True):
    '''
    Add a vulnerability (and its references) to the database, unless it is already in there

    Input:
        connection (sqlite3.connection): the connection with the database
        vulnerability_id (str): the identifier of the vulnerability
        repo_url (str): the repository url (a trailing '.git' or '/' is removed)
        description (str): the description of the vulnerability can be provided manually, or will be extracted from the NVD
        published_timestamp (str): vulnerability published timestamp can be provided manually, or will be extracted from the NVD
        references (list): vulnerability references can be provided manually, or will be extracted from the NVD
        driver: i.e. a chromedriver can be provided to scrape with when requests does not succeed
        verbose (bool): print a message when the vulnerability is already in the database;
            also passed on to add_vulnerability_references_to_database

    Raises:
        ValueError: when a value has not been provided and cannot be extracted from the NVD
    '''
    if isinstance(published_timestamp, int):
        published_timestamp = str(published_timestamp)

    # preprocess repo_url entry
    repo_url = re.sub(r'\.git$|/$', '', repo_url)

    cursor = connection.cursor()
    try:
        if not if_new_vulnerability(cursor, vulnerability_id):
            if verbose:
                print("    There is already a vulnerability with ID {} in the database".format(vulnerability_id))
            return

        # gather information for the new vulnerability if needed
        if description is None or published_timestamp is None or references is None:
            try:
                nvd_description, nvd_published_timestamp, nvd_references = extract_nvd_content(vulnerability_id)
            except Exception:  # if the vulnerability is not in the NVD
                nvd_description, nvd_published_timestamp, nvd_references = None, None, None

            if description is None:
                if nvd_description is None:
                    raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually.".format(vulnerability_id))
                description = nvd_description
            if published_timestamp is None:
                if nvd_published_timestamp is None:
                    raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability published timestamp manually.".format(vulnerability_id))
                published_timestamp = nvd_published_timestamp
            if references is None:
                if nvd_references is None:
                    raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a advisory references manually.".format(vulnerability_id))
                references = nvd_references

        # add to the database; the connection context manager wraps the insert in a transaction
        preprocessed_description = rank.simpler_filter_text(description)
        with connection:
            cursor.execute(
                "INSERT INTO vulnerabilities VALUES (:vulnerability_id, :repo_url, :description, :published_timestamp, :preprocessed_description)",
                {
                    'vulnerability_id': vulnerability_id,
                    'repo_url': repo_url,
                    'description': description,
                    'published_timestamp': str(published_timestamp),
                    'preprocessed_description': preprocessed_description,
                })

        # add the references to the database
        if references is not None and len(references) > 0:
            add_vulnerability_references_to_database(connection, vulnerability_id, references, driver=driver, verbose=verbose)
    finally:
        # close the cursor on every path, including when a ValueError is raised above
        cursor.close()
    return
def get_vulnerability_data(vulnerability_id, vulnerabilities_df,
                           db_references_df):
    '''
    Gather the advisory data of a vulnerability

    A vulnerability that is in the index of vulnerabilities_df is read from the
    DataFrames; for any other vulnerability the content is extracted through
    database.extract_nvd_content and an empty list of references is returned.

    Returns:
        tuple: (description, published timestamp, preprocessed description, references)
    '''
    is_known = vulnerabilities_df is not None and vulnerability_id in list(
        vulnerabilities_df.index)

    if not is_known:
        # the references returned by the NVD extraction are not used here
        cve_description, cve_published_timestamp, _ = database.extract_nvd_content(
            vulnerability_id)
        preprocessed_description = rank.simpler_filter_text(cve_description)
        return cve_description, cve_published_timestamp, preprocessed_description, []

    # the row holds four values, of which the first (the repository URL) is not needed
    _, cve_description, cve_published_timestamp, preprocessed_description = vulnerabilities_df.loc[
        vulnerability_id]
    belongs_to_vulnerability = db_references_df.vulnerability_id == vulnerability_id
    references = list(db_references_df[belongs_to_vulnerability].url)
    return cve_description, cve_published_timestamp, preprocessed_description, references
def load_vulnerabilities():
    '''
    Read the tables used by the dashboard into DataFrames

    The vulnerabilities database (vulnerabilities_db_path) provides the tables
    vulnerabilities, vulnerability_references, advisory_references and fix_commits;
    the commits database (commits_db_path) provides the tags table.

    Returns:
        tuple: (vulnerabilities_df, db_references_df, advisory_references_df,
            tags_df, repository_url_df, fixes_df), where vulnerabilities_df is
            indexed by vulnerability_id and repository_url_df has one row per unique
            repository URL with the columns 'repo_url' and 'project_name'
    '''
    # only the connections are needed, the cursors that are returned as well are not used
    prospector_connection, _ = database.connect_with_database(commits_db_path)
    vulnerabilities_connection, _ = database.connect_with_vulnerabilities_database(
        vulnerabilities_db_path)

    print("Reading vulnerabilities")
    vulnerabilities_df = pd.read_sql(
        "SELECT * FROM vulnerabilities",
        vulnerabilities_connection).set_index("vulnerability_id")
    db_references_df = pd.read_sql(
        "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references",
        vulnerabilities_connection)
    advisory_references_df = pd.read_sql(
        "SELECT vulnerability_id, url FROM advisory_references",
        vulnerabilities_connection)
    fixes_df = pd.read_sql("SELECT * FROM fix_commits",
                           vulnerabilities_connection)

    tags_df = pd.read_sql("SELECT * FROM tags", prospector_connection)

    # Create repository_url_df: the project name is the filtered, lowercased text
    # of the URL without the scheme and with non-word characters turned into spaces
    repository_url_df = pd.DataFrame()
    for i, repo_url in enumerate(vulnerabilities_df.repo_url.unique()):
        repository_url_df.at[i, 'repo_url'] = repo_url
        repository_url_df.at[i, 'project_name'] = rank.simpler_filter_text(
            re.sub(r'^https?://|[^\w]', ' ', repo_url)).lower()

    # tokens that come from the hosting part of the URL say nothing about the project
    hosting_tokens = {'github', 'com', 'git', 'org'}
    repository_url_df['project_name'] = repository_url_df.apply(
        lambda row: ' '.join(token
                             for token in row['project_name'].split(' ')
                             if token not in hosting_tokens),
        axis=1)

    return vulnerabilities_df, db_references_df, advisory_references_df, tags_df, repository_url_df, fixes_df
def dashboard_page(state):
    '''
    Render the Prospector dashboard page through the `st` UI calls (presumably Streamlit)

    The page consists of three parts: an advisory record form, optional advanced
    settings once the advisory record has been confirmed, and the search itself,
    which ranks candidate commits and shows the top results.

    Input:
        state: session state object; this function reads and/or writes its attributes
            vulnerability_id, advisory_record_confirmed, advanced_settings,
            vulnerabilities_df, db_references_df, advisory_references_df, tags_df,
            repository_url_df and fixes_df, and calls state.clear()
    '''

    st.title("PROSPECTOR")

    st.subheader(
        "The search engine for fix-commits for security vulnerabilities in OSS"
    )
    st.write('By SAP - Antonino SABETTA & Daan HOMMERSOM')
    st.write('''
        How to use Prospector:
        \n1) Provide a vulnerability description, (GitHub) repository URL and a release date (or pick a CVE).
        \n2) Check whether Prospector fills in the rest correctly, and provide additional information if needed.
        \n3) Find security fixes!
    ''')

    # with st.beta_expander(label="Find out more", expanded=False):
    st.write('''
        The objective of Prospector is to minimize the (manual) effort needed for finding
        the fix commit of a known vulnerability in an open-source software project.
        Since these repositories can contain hundreds thousands commits, the commits are
        firstly filtered by only selecting all commits within two years before and
        one hundred days after the release date with a maximum of respectively 5215 and 100 commits.
        A study has shown that this selection has 93% recall.
        \n
        Firstly, an advisory record is created containing information on the vulnerability.
        This advisory record is used to select candidate commits. For these candidate commits,
        ranking vectors are computed. These ranking vectors consist of several components that
        can be used to predict whether a candidate commit is the fix commit we are looking for.
        These candidates are then ranked on this probability score.

        In 77.68% of the cases, the fix is in the top 5. In 84.03% in the top 10,
        and in 88.59% in the top 20.
    ''')

    # ---- advisory record form ----
    st.subheader("ADVISORY RECORD")
    # the identifier is upper-cased and kept in the state
    state.vulnerability_id = st.text_input(
        "Vulnerability identifyer:",
        value=state.vulnerability_id
        if state.vulnerability_id else '').upper()

    if state.vulnerability_id:
        try:
            cve_description, cve_published_timestamp, preprocessed_description, references = get_vulnerability_data(
                state.vulnerability_id, state.vulnerabilities_df,
                state.db_references_df)
        # NOTE(review): bare except - any failure of the lookup ends up here and the
        # user is asked for references; description and date fall back to '' and now
        except:
            references = st.text_input(
                "Please provide useful references (separated by commas)")
            references = references.split(',')
            cve_description, cve_published_timestamp, preprocessed_description = '', time.time(
            ), None
    else:
        # no identifier provided yet: empty description, current time, no references
        cve_description, cve_published_timestamp, preprocessed_description, references = '', time.time(
        ), None, []

    vulnerability_description = st.text_area("Vulnerability description",
                                             value=cve_description)
    # the suggested project name consists of the tokens that nlp tags as 'NNP'
    project_name = st.text_input(
        "Project name",
        value=' '.join([
            token.text for token in nlp(vulnerability_description)
            if token.tag_ == 'NNP'
        ]))
    # a repository URL is only suggested when there is a project name to search with
    repo_url = st.text_input("Repository URL",
                             value=map_description_to_repository_url(
                                 vulnerability_id=state.vulnerability_id,
                                 description=project_name,
                                 vulnerabilities_df=state.vulnerabilities_df,
                                 repository_url_df=state.repository_url_df)
                             if project_name != '' else '')
    published_date = st.date_input("Vulnerability published date",
                                   value=datetime.fromtimestamp(
                                       int(cve_published_timestamp)))
    # the selected date is converted back to an integer timestamp (time.mktime uses local time)
    published_timestamp = int(time.mktime(published_date.timetuple()))

    # once confirmed, the confirmation is kept in the state and the button is not drawn again
    state.advisory_record_confirmed = st.button(
        "CONFIRM ADVISORY RECORD"
    ) if not state.advisory_record_confirmed else True
    if state.advisory_record_confirmed:

        # option to clear the state
        if st.button("CLEAR FIELDS"):
            state.clear()

        # if it was a new vulnerability, add it to the DB
        if type(state.vulnerabilities_df) == type(
                None) or state.vulnerability_id not in list(
                    state.vulnerabilities_df.index):
            vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
                vulnerabilities_db_path)
            # description and published timestamp are passed positionally; no references are passed
            database.add_vulnerability_to_database(vulnerabilities_connection,
                                                   state.vulnerability_id,
                                                   repo_url,
                                                   vulnerability_description,
                                                   published_timestamp)

            # if it was not an NVD CVE, or the extraction failed
            if len(references) == 0:
                try:
                    cve_description, cve_published_timestamp, references = database.extract_nvd_content(
                        state.vulnerability_id)
                    references = [reference for reference in references]
                # NOTE(review): bare except; the text input below uses the same label as the
                # one in the advisory record form - confirm both can be drawn in the same run
                except:
                    references = st.text_input(
                        "Please provide useful references (separated by commas)"
                    )
                    references = references.split(',')

            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                state.vulnerability_id,
                references,
                driver=None)
            # NOTE(review): connect_with_commits_database is called unqualified here, while
            # other connections are made through the database module - defined outside this block
            prospector_connection, prospector_cursor = connect_with_commits_database(
                commits_db_path)
            database.add_tags_to_database(prospector_connection,
                                          tags=None,
                                          git_repo=None,
                                          repo_url=repo_url,
                                          verbose=True)
            # reload all DataFrames so that the newly added vulnerability is in the state
            state.vulnerabilities_df, state.db_references_df, state.advisory_references_df, state.tags_df, state.repository_url_df, state.fixes_df = load_vulnerabilities(
            )

        # gather values
        repository_tags = gather_tags(repo_url, state.tags_df)
        versions_in_description = filter.retreive_all_versions_from_description(
            vulnerability_description)
        # dict.fromkeys removes duplicate tags while keeping their order
        tags_in_description = list(
            dict.fromkeys([
                tag for version in versions_in_description
                for tag in filter.get_tag_for_version(repository_tags, version)
            ]))
        # from here on the references are the URLs stored for this vulnerability,
        # replacing the value gathered in the advisory record form
        references = [
            state.db_references_df.at[index, 'url'] for index in
            state.db_references_df[state.db_references_df.vulnerability_id ==
                                   state.vulnerability_id].index
        ]

        advisory_references = list(state.advisory_references_df[
            state.advisory_references_df.vulnerability_id ==
            state.vulnerability_id].url)

        # allow the user to influence the filtering
        state.advanced_settings = st.checkbox("Show advanced settings",
                                              state.advanced_settings)
        if state.advanced_settings:

            # NOTE (original author): the adding of references can have gone wrong
            first_commit_timestamp = rank.get_first_commit_timestamp(
                repo_url
            )  #@TODO: add a column to the database containing this value
            first_commit_date, today = datetime.fromtimestamp(
                int(first_commit_timestamp)).date(), datetime.fromtimestamp(
                    int(time.time())).date()
            # default interval: 730 days before up to 100 days after the published date,
            # limited to the range between the first commit date and today
            lower_bound = published_date - timedelta(
                days=730) if published_date - timedelta(
                    days=730) > first_commit_date else first_commit_date
            upper_bound = published_date + timedelta(
                days=100) if published_date + timedelta(
                    days=100) < today else today

            since, until = st.slider("Published date based interval",
                                     min_value=first_commit_date,
                                     max_value=today,
                                     value=(lower_bound, upper_bound))
            # the selected dates are converted to integer timestamps
            since, until = int(time.mktime(since.timetuple())), int(
                time.mktime(until.timetuple()))

            # references: additional ones are appended, after which the complete list
            # is passed to add_vulnerability_references_to_database
            additional_references = st.text_input(
                "Additional references (separated by commas)")
            if additional_references:
                references += additional_references.split(',')
                vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
                    vulnerabilities_db_path)
                database.add_vulnerability_references_to_database(
                    vulnerabilities_connection,
                    state.vulnerability_id,
                    references,
                    driver=None)

            # all references are selected by default
            selected_references = st.multiselect('Advisory references',
                                                 tuple(references),
                                                 default=tuple(references))

            # affected versions: the tags found in the description are preselected
            relevant_tags = st.multiselect(
                'Relevant tags',
                tuple(repository_tags),
                default=tuple(tags_in_description)
                if len(tags_in_description) != 0 else None)
            # st input int k
            k = st.number_input("The number of results to show",
                                min_value=1,
                                max_value=50,
                                value=10,
                                step=1)
        else:
            # without advanced settings: all references, the tags found in the
            # description, no explicit interval and 10 results
            selected_references = references
            relevant_tags = tags_in_description
            since, until = None, None
            k = 10

        # st.write('vulnerability_description:', vulnerability_description)
        # st.write('references_content:', references_content)
        # st.write('vulnerability_id:', state.vulnerability_id)
        # st.write('since - published_timestamp - until:', since, published_timestamp, until)
        # st.write('repo_url:', repo_url)
        # # st.write('references:', references)
        # # st.write('advisory_references:', advisory_references)
        # st.write('relevant_tags:', relevant_tags)

        # ---- the search itself ----
        if st.button("Search prospects!"):
            model, min_max_scaler = load_model()
            prospector_connection, prospector_cursor = connect_with_commits_database(
                commits_db_path)

            # the description may have been edited in the form, so it is preprocessed again
            preprocessed_description = rank.simpler_filter_text(
                vulnerability_description)

            # preprocessed content of the selected references of this vulnerability
            references_content = tuple(state.db_references_df[
                (state.db_references_df.vulnerability_id ==
                 state.vulnerability_id)
                & (state.db_references_df.url.isin(selected_references))].
                                       preprocessed_content)
            # reference stopwords and the project name tokens are removed before
            # extract_n_most_occurring_words is called with n=20
            references_content = rank.extract_n_most_occurring_words(
                rank.remove_forbidden_words_from_string(
                    string=' '.join(references_content),
                    forbidden_words=rank.reference_stopwords +
                    project_name.split(' ')),
                n=20)

            st.write(references_content)

            advisory_record = rank.Advisory_record(
                state.vulnerability_id,
                published_timestamp,
                repo_url,
                selected_references,
                references_content,
                advisory_references,
                vulnerability_description,
                prospector_connection,
                preprocessed_vulnerability_description=preprocessed_description,
                relevant_tags=relevant_tags,
                verbose=True,
                since=since,
                until=until)

            print(
                "\nGathering candidate commits and computing ranking vectors.")
            advisory_record.gather_candidate_commits()
            advisory_record.compute_ranking_vectors()

            # scaling some columns using the pretrained scaler, and some vulnerability specific
            # (the vulnerability specific columns get a MinMaxScaler fitted on these candidates)
            advisory_record.ranking_vectors[
                vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                    advisory_record.
                    ranking_vectors[vulnerability_specific_columns])
            advisory_record.ranking_vectors[
                universal_columns] = min_max_scaler.transform(
                    advisory_record.ranking_vectors[universal_columns])
            advisory_record.ranking_vectors.drop(columns=columns_to_drop,
                                                 inplace=True)

            advisory_record.ranked_candidate_commits = rank.rank_candidates(
                model, advisory_record.ranking_vectors)

            # the ranking vectors are indexed by commit_id only after the ranking
            advisory_record.ranking_vectors.set_index('commit_id',
                                                      inplace=True)
            output = prospector_main.advisory_record_to_output(
                advisory_record, model, prospector_cursor, k=k)
            tmp_download_link = download_link(
                output,
                'Prospector_results-{}.txt'.format(state.vulnerability_id),
                "Click here to download Prospector's results as a txt file!")

            st.header("Results")

            # the download link is passed with unsafe_allow_html=True
            st.markdown(tmp_download_link, unsafe_allow_html=True)

            st.write(
                "Showing the top {} candidates from {} candidates considered".
                format(k, len(advisory_record.ranking_vectors)))
            st.write(output)
Пример #9
0
def main(vulnerability_id,
         verbose,
         description=None,
         published_timestamp=None,
         repo_url=None,
         project_name=None,
         references=None,
         k=10,
         vulnerability_specific_scaling=False):
    model = load(model_path)
    universal_columns_scaler = load(min_max_scaler_path)

    # databases are created in the notebook database_creation.ipynb
    # the vulnerabilities database
    vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
        'data/prospector-vulnerabilities.db', verbose=verbose)
    # the commits database
    prospector_connection, prospector_cursor = database.connect_with_database(
        'data/prospector-commits.db', verbose=verbose)

    # if the vulnerability is already in the database
    if database.if_new_vulnerability(vulnerabilities_cursor,
                                     vulnerability_id) == False:
        vulnerability = vulnerabilities_cursor.execute(
            "SELECT * FROM vulnerabilities WHERE vulnerability_id = :vulnerability_id",
            {
                'vulnerability_id': vulnerability_id
            }).fetchone()

        # keep the manually provided value if it has been provided, otherwise select the one in the DB
        repo_url = repo_url if repo_url != None else vulnerability['repo_url']
        published_timestamp = published_timestamp if published_timestamp != None else vulnerability[
            'published_date']

        if description == None:
            description = vulnerability['description']
            preprocessed_description = vulnerability[
                'preprocessed_description']
        else:
            preprocessed_description = rank.simpler_filter_text(description)

        if references != None:
            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                vulnerability_id,
                references,
                driver=None,
                verbose=verbose)
        else:
            references = references if references != None else [
                nvd_reference['url']
                for nvd_reference in vulnerabilities_cursor.execute(
                    "SELECT url FROM vulnerability_references WHERE vulnerability_id = :vulnerability_id",
                    {'vulnerability_id': vulnerability_id})
            ]

    else:
        if verbose:
            print("Vulnerability {} is a new vulnerability".format(
                vulnerability_id))

        # gather information for the new vulnerability if needed
        if description == None or published_timestamp == None or references == None:
            try:
                nvd_description, nvd_published_timestamp, nvd_references = database.extract_nvd_content(
                    vulnerability_id)
            except:  #if the vulnerability is not in the NVD
                nvd_description, nvd_published_timestamp, nvd_references = None, None, None

            if description == None:
                if nvd_description == None:
                    # raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually.".format(vulnerability_id))
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually."
                        .format(vulnerability_id))
                    description = input()

                    if description == "SKIP!":
                        print('skipping this one')
                        return
                else:
                    description = nvd_description

            if published_timestamp == None:
                if nvd_published_timestamp == None:
                    # raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability timestamp manually.".format(vulnerability_id))
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability timestamp manually."
                        .format(vulnerability_id))
                    published_timestamp = input()
                else:
                    published_timestamp = nvd_published_timestamp

            if references == None:
                if nvd_references == None:
                    # raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually.".format(vulnerability_id))
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability references manually (comma seperated)."
                        .format(vulnerability_id))
                    references = input()
                    references = references.split(',')
                else:
                    references = nvd_references

        # determine the repo_url
        if repo_url == None:
            if verbose: print('Suggesting a repository URL')
            repo_url = rank.map_description_to_repository_url(
                vulnerabilities_connection, vulnerability_id, description)

            print(
                'Does the vulnerability affect the following repository: {} [Y/n]'
                .format(repo_url))
            choice = input()
            if choice.lower() in [
                    '', 'y', 'yes'
            ]:  #@TODO: can be a while, where it is either yes or no, not enter
                print('Confirmed')
            else:
                print('Provide the (GitHub) URL of the affected repository:')
                repo_url = input()
                repo_url = re.sub('\.git$|/$', '', repo_url)
            print('repo_url:', repo_url)

        # add to the database
        preprocessed_description = rank.simpler_filter_text(description)
        with vulnerabilities_connection:
            vulnerabilities_cursor.execute(
                "INSERT INTO vulnerabilities VALUES (:vulnerability_id, :repo_url, :description, :published_timestamp, :preprocessed_description)",
                {
                    'vulnerability_id': vulnerability_id,
                    'repo_url': repo_url,
                    'description': description,
                    'published_timestamp': str(published_timestamp),
                    'preprocessed_description': preprocessed_description
                })

        # add the references to the database
        database.add_vulnerability_references_to_database(
            vulnerabilities_connection,
            vulnerability_id,
            references,
            driver=None,
            verbose=verbose)

    # determine the project_name
    if project_name == None:
        if verbose: print('Suggesting a project name')
        project_name = rank.extract_project_name_from_repository_url(repo_url)
        print('Does the vulnerability affect the following project: {} [Y/n]'.
              format(project_name))
        choice = input()
        if choice.lower() in [
                '', 'y', 'yes'
        ]:  #@TODO: can be a while, where it is either yes or no, not enter
            print('Confirmed')
        else:
            print('Provide the name of the affected project:')
            project_name = input()
    references_for_query = ''
    if len(references) == 1:
        references_for_query = "('" + references[0] + "')"
    else:
        references_for_query = tuple(references)
    references_content = tuple(
        pd.read_sql(
            "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references WHERE url IN {} and vulnerability_id = '{}'"
            .format(references_for_query, vulnerability_id),
            vulnerabilities_connection).preprocessed_content)
    references_content = rank.extract_n_most_occurring_words(
        rank.remove_forbidden_words_from_string(
            string=' '.join(references_content),
            forbidden_words=rank.reference_stopwords +
            project_name.split(' ')),
        n=20)

    # @TODO: now adding all advisory references --> change to only using the provided references
    advisory_references = [
        advisory_reference['url']
        for advisory_reference in vulnerabilities_cursor.execute(
            "SELECT url FROM advisory_references WHERE vulnerability_id = :vulnerability_id",
            {'vulnerability_id': vulnerability_id})
    ]

    # creating advisory record
    advisory_record = rank.Advisory_record(
        vulnerability_id,
        published_timestamp,
        repo_url,
        references,
        references_content,
        advisory_references,
        description,
        prospector_connection,
        preprocessed_vulnerability_description=preprocessed_description,
        relevant_tags=None,
        verbose=verbose,
        since=None,
        until=None)

    if verbose:
        print("\nThe following advisory record has been created:")
        print(" - Vulnerability ID: {}".format(advisory_record.id))
        print(" - Vulnerability description: {}".format(
            advisory_record.description))
        print(" - Vulnerability published timestamp: {}".format(
            advisory_record.published_timestamp))
        print(" - Affected project: {}".format(advisory_record.project_name))
        print(" - Affected repository: {}".format(advisory_record.repo_url))
        print(" - References content extracted: {}".format(
            advisory_record.references_content))

    if verbose: print("\nGathering candidate commits:")
    advisory_record.gather_candidate_commits()

    if verbose: print("\nComputing ranking vectors:")
    advisory_record.compute_ranking_vectors(vulnerability_specific_scaling)

    if vulnerability_specific_scaling == False:
        if verbose:
            print(
                "\nscaling some columns using the pretrained scaler, and some vulnerability specific"
            )
        advisory_record.ranking_vectors[
            vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                advisory_record.ranking_vectors[vulnerability_specific_columns]
            )
        advisory_record.ranking_vectors[
            universal_columns] = universal_columns_scaler.transform(
                advisory_record.ranking_vectors[universal_columns])
    advisory_record.ranking_vectors.drop(columns=columns_to_drop, inplace=True)

    if verbose: print("\nRanking the candidate commits:")
    advisory_record.ranked_candidate_commits = rank.rank_candidates(
        model, advisory_record.ranking_vectors)

    if verbose: print('\nResults:')
    advisory_record.ranking_vectors.set_index('commit_id', inplace=True)
    output = advisory_record_to_output(advisory_record,
                                       model,
                                       prospector_cursor,
                                       k=k)
    print(output)

    # # succeeded
    vulnerabilities_connection.close()
    prospector_connection.close()
    return advisory_record
Пример #10
0
def add_commits_to_database(connection,
                            commit_ids,
                            git_repo=None,
                            repository_url=None,
                            driver=None,
                            with_message_references_content=False,
                            verbose=True):
    '''
    Add commits to the commits table, skipping the ones that are already
    stored for the given repository.

    Input:
        connection (sqlite3.connection): the connection to the database
        commit_ids (list/str): a list of commit_ids (a single ID may be given as a string)
        git_repo (git_explorer.core.Git): to use for extracting the content
        repository_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        driver: a webdriver can be provided to avoid javascript required pages
        with_message_references_content (bool): to add commits references (requires additional time)
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"

    Returns:
        None

    Raises:
        ValueError: when neither git_repo nor repository_url is provided
    '''
    if git_repo is None and repository_url is None:
        raise ValueError('Provide a git_repo or a repository_url')

    if git_repo is None:
        git_repo = Git(repository_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=True)

    if repository_url is None:
        repository_url = git_repo.get_url()
    # normalise the URL: drop a trailing ".git" or a trailing "/"
    repository_url = re.sub(r'\.git$|/$', '', repository_url)

    if isinstance(commit_ids, str):
        commit_ids = [commit_ids]
    if len(commit_ids) == 0:
        print('No commit IDs were provided')
        return

    # keep only unique ids, preserving their order
    commit_ids = list(dict.fromkeys(commit_ids))

    # Look up which commits are already stored. The values are bound as
    # parameters (no string formatting), and the lookup is chunked so the
    # number of bound variables stays below SQLite's per-statement limit.
    chunk_size = 500
    commits_already_in_the_db = set()
    for start in range(0, len(commit_ids), chunk_size):
        chunk = commit_ids[start:start + chunk_size]
        placeholders = ', '.join('?' * len(chunk))
        query = ("SELECT id FROM commits WHERE id IN ({}) "
                 "AND repository_url = ?".format(placeholders))
        commits_already_in_the_db.update(
            pd.read_sql(query, connection,
                        params=chunk + [repository_url]).id)

    commits_to_add = [
        commit_id for commit_id in commit_ids
        if commit_id not in commits_already_in_the_db
    ]
    if len(commits_to_add) == 0:
        return

    if verbose:
        print('    {} / {} are already in the database, now adding the rest.'.
              format(len(commits_already_in_the_db), len(commit_ids)))

    failed = 0
    cursor = connection.cursor()
    try:
        for commit_id in tqdm(commits_to_add):
            try:
                # initialize commit object
                commit = Commit(git_repo, commit_id)

                # message execution is combined with timestamp execution to
                # speed up the process: the last output line is the timestamp
                message = commit._exec.run(
                    ['git', 'log', '--format=%B%n%ct', '-n1', commit._id])
                timestamp = message.pop(-1)

                diff = commit._exec.run([
                    'git', 'diff', '--unified=1',
                    commit._id + "^.." + commit._id
                ])
                changed_files = get_changed_files_from_diff(diff)
                hunks = get_hunks_from_diff(diff)

                preprocessed_message = rank.simpler_filter_text(message)
                preprocessed_diff = rank.simpler_filter_text(
                    re.sub(
                        '[^A-Za-z0-9]+', ' ', ' '.join(
                            rank.extract_relevant_lines_from_commit_diff(
                                diff))))
                preprocessed_changed_files = rank.simpler_filter_text(
                    changed_files)

                if with_message_references_content:
                    # NOTE(review): extract_commit_message_reference_content is
                    # documented to return a list, and it is stored below
                    # without str() unlike the other raw columns -- confirm
                    # that the connection can bind it.
                    commit_message_reference_content = extract_commit_message_reference_content(
                        message, repository_url, driver)
                    preprocessed_commit_message_reference_content = rank.extract_n_most_occurring_words(
                        commit_message_reference_content, n=20)
                else:
                    commit_message_reference_content = None
                    preprocessed_commit_message_reference_content = None

                # add to database (committed or rolled back per commit)
                with connection:
                    cursor.execute(
                        "INSERT INTO commits VALUES (:repository_url, :id, :timestamp, :message, :changed_files, :diff, :hunks, :commit_message_reference_content, :preprocessed_message, :preprocessed_diff, :preprocessed_changed_files, :preprocessed_commit_message_reference_content)",
                        {
                            'repository_url':
                            repository_url,
                            'id':
                            commit_id,
                            'timestamp':
                            str(timestamp),
                            'message':
                            str(message),
                            'changed_files':
                            str(changed_files),
                            'diff':
                            str(diff),
                            'hunks':
                            str(hunks),
                            'commit_message_reference_content':
                            commit_message_reference_content,
                            'preprocessed_message':
                            preprocessed_message,
                            'preprocessed_diff':
                            preprocessed_diff,
                            'preprocessed_changed_files':
                            preprocessed_changed_files,
                            'preprocessed_commit_message_reference_content':
                            preprocessed_commit_message_reference_content
                        })
            except Exception as error:
                # best effort: one broken commit must not stop the others, but
                # Ctrl-C / SystemExit are no longer swallowed
                failed += 1
                print('    Failed to add commit {}: {}'.format(
                    commit_id, error))
    finally:
        cursor.close()

    if verbose:
        if failed:
            print('    {} / {} commits could not be added to the database.'.
                  format(failed, len(commits_to_add)))
        else:
            print('    All commits have been added to the database.')
    return
Пример #11
0
def add_vulnerability_references_to_database(connection,
                                             vulnerability_id,
                                             references,
                                             driver=None,
                                             verbose=True):
    '''
    Fetch the pages behind the references of a vulnerability and store their
    preprocessed content in the vulnerability_references table. The URLs
    linked on each fetched page are passed on to
    add_advisory_references_to_database.

    References that are already stored, that end in ".pdf", or that contain
    one of the test_url_terms are skipped.

    Input:
        connection (sqlite3.connection): the connection with the database
            (rows must be accessible by column name)
        vulnerability_id (str): the identifier of the vulnerability
        references (list/str): the (NVD) references
        driver: a webdriver can be provided to avoid javascript required pages
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"

    Returns:
        None
    '''
    if isinstance(references, str):
        references = [references]

    cursor = connection.cursor()
    try:
        for reference in references:
            # securityfocus.com requires a selection in a menu
            if 'securityfocus.com' in reference.strip('/.'):
                target = reference.strip('/.') + '/discuss'
            else:
                target = reference
            fetch_url = target.strip('.')  # can be end of the sentence
            stored_url = target.strip('/')  # the key used for the INSERT

            # Check both the raw reference and the form that is actually
            # stored, otherwise e.g. URLs with a trailing slash are fetched
            # and inserted again on every run.
            already_stored = cursor.execute(
                "SELECT EXISTS(SELECT 1 FROM vulnerability_references WHERE url IN (:url, :stored_url) AND vulnerability_id = :vulnerability_id LIMIT 1) AS 'exists';",
                {
                    'url': reference,
                    'stored_url': stored_url,
                    'vulnerability_id': vulnerability_id
                }).fetchone()['exists'] != 0
            if already_stored:
                if verbose:
                    print('    reference {} is already in the db'.format(
                        reference))
                continue

            # PDFs are skipped regardless of verbosity; only the message
            # depends on verbose
            if reference.endswith('.pdf'):
                if verbose:
                    print('    Skipping reference since reference is a pdf')
                continue

            if any(term in reference for term in test_url_terms):
                continue

            try:
                # random delay between requests, only when a request is made
                time.sleep(random.random())
                soup, reference_content = _fetch_reference_page(
                    fetch_url, driver)
                preprocessed_reference_content = rank.simpler_filter_text(
                    reference_content)

                # add to database
                with connection:
                    cursor.execute(
                        "INSERT INTO vulnerability_references VALUES (:url, :vulnerability_id, :preprocessed_content)",
                        {
                            'url':
                            stored_url,
                            'vulnerability_id':
                            vulnerability_id,
                            'preprocessed_content':
                            str(preprocessed_reference_content)
                        })

                try:
                    # add the urls referred to on these pages to the advisory references DB
                    urls_found = [
                        link.get('href').strip('/')
                        for link in soup.find_all('a')
                        if link.get('href') and 'http' in link.get('href')
                    ]
                    add_advisory_references_to_database(
                        connection, vulnerability_id, urls_found)
                except Exception:
                    print("Failed in adding advisory references")
            except Exception:
                # best effort: a reference that cannot be fetched or stored
                # must not stop the remaining ones
                print('    reference {} could not be added to the db'.format(
                    target))
    finally:
        cursor.close()
    return


def _fetch_reference_page(url, driver=None):
    '''
    Download a page and return (soup, text) for it.

    The page is requested with requests first; the webdriver (when given) is
    used if that request fails or if the page reports that it requires
    JavaScript (e.g. Apache pony mail). Without a webdriver a failed request
    is re-raised, so content of a previously fetched page is never reused.
    '''
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        text = ' '.join(soup.stripped_strings)
        if 'requires JavaScript enabled' not in text or driver is None:
            return soup, text
    except Exception:
        if driver is None:
            raise

    driver.get(url)
    time.sleep(0.5)  # give the page time to render
    soup = BeautifulSoup(driver.page_source, "html.parser")
    return soup, ' '.join(soup.stripped_strings)