示例#1
0
def get_software_name_and_version_from_cpe(cpe):
    """Derive a ``(software, version)`` pair of strings from a CPE string.

    The CPE is split on ``:`` and the first two fields are ignored.  The
    remaining fields are consumed until one is reached for which both
    ``utils.contain_letter`` and ``utils.contain_number`` are false.  Fields
    seen before the first digit-leading field make up the software name;
    that field and everything after it make up the version.

    When no version was collected the software text is handed to
    ``extract_windows_version``; when the software name is then empty the
    version text is handed to ``corner_case``.
    """
    software_words = []
    version_words = []

    for field in cpe.split(':')[2:]:
        # Stop at the first field that carries neither letters nor digits.
        if not (utils.contain_letter(field) or utils.contain_number(field)):
            break
        field = field.replace('_', ' ').replace('~', ' ')

        # Once a digit-leading field has been seen, every later field
        # belongs to the version as well.
        if field[0].isdigit() or version_words:
            version_words.append(field)
        else:
            software_words.append(field)

    software = ' '.join(software_words).strip()
    version = ' '.join(version_words).strip()

    if version == '':
        software, version = extract_windows_version(software)
    if software == '':
        software, version = corner_case(version)

    return software, version
示例#2
0
def judge_word_in_sentence_with_both_dots_and_numbers(sent):
    """Return True if any token of *sent* has both a number and a dot.

    The sentence is tokenized with ``nltk.word_tokenize``; a token qualifies
    when ``utils.contain_number`` accepts it and it contains a ``.``.
    """
    return any(
        utils.contain_number(token) and '.' in token
        for token in nltk.word_tokenize(sent)
    )
示例#3
0
def corner_case(version):
    """Split *version* at its last number-bearing word.

    Example input: '1024cms 1024 cms 1.4.2 beta'.  The words are scanned from
    right to left; at the first word accepted by ``utils.contain_number`` the
    text is cut, giving ``(words before it, that word and the words after)``,
    each joined with single spaces.  If no word qualifies the result is
    ``('', version)`` with *version* returned unchanged.
    """
    tokens = version.split()
    for pos in range(len(tokens) - 1, -1, -1):
        if utils.contain_number(tokens[pos]):
            return ' '.join(tokens[:pos]), ' '.join(tokens[pos:])
    return '', version
def clean_redundant_words_and_reserve_range(before_clean_list):
    """Filter every version string down to its version-relevant words.

    A word is kept when it
    (1) contains a number (``utils.contain_number``), or
    (2) contains no letter (``utils.contain_letter`` is false), or
    (3) belongs to the range word set below.

    The kept words of each string are re-joined with single spaces.  Empty
    results are dropped and duplicates collapse, because the results are
    collected in a set; the order of the returned list is therefore not
    defined.
    """
    # todo: enrich range word set
    range_word_set = {'before', 'older', 'prior', 'up', 'to', 'through', 'and', 'earlier',
                      'upper', 'higher', 'lower', 'including', 'since', 'onwards'}

    def _keep(word):
        # A word without letters passes whether or not it has a number,
        # so the check reduces to these three alternatives.
        return (utils.contain_number(word)
                or not utils.contain_letter(word)
                or word in range_word_set)

    cleaned = {
        ' '.join(word for word in version_str.split() if _keep(word))
        for version_str in before_clean_list
    }
    cleaned.discard('')
    return list(cleaned)
示例#5
0
def extract_pair_from_edb_title(raw_title):
    """Build a one-entry ``{software: version}`` dict from an EDB title.

    Only titles of the form ``"<software part> - <rest>"`` are handled, and
    only when the software part contains a number according to
    ``utils.contain_number`` (i.e. it may carry a version).  In every other
    case an empty dict is returned.

    The pair is obtained from ``get_pair_from_content_line_focus_official``,
    the version is cut at the first ``' - '``, range words are moved by
    ``move_range_from_software_to_version``, and both values are passed
    through ``encode_content`` before being stored.
    """
    if ' - ' not in raw_title:
        return {}

    # software_in_title might contain version
    software_in_title = raw_title.partition(' - ')[0]
    if not utils.contain_number(software_in_title):
        return {}

    content_line_version, content_line_software = (
        get_pair_from_content_line_focus_official(raw_title))

    # Keep only the text before the separator.  ``partition`` returns the
    # whole string when the separator is missing, whereas slicing with
    # ``find`` (which yields -1 in that case) would silently drop the last
    # character of the version.
    content_line_version = content_line_version.partition(' - ')[0]

    content_line_version, content_line_software = (
        move_range_from_software_to_version(content_line_version,
                                            content_line_software))
    return {
        encode_content(content_line_software):
        encode_content(content_line_version)
    }
示例#6
0
def get_pair_from_content_line_focus_official(line):
    """Split a content line into a lower-cased ``(version, software)`` pair.

    The line is lower-cased and split on whitespace.  A split index is then
    chosen by the first strategy that applies:

    1. the position right after the first ``windows`` / ``office`` word;
    2. the first word that starts with a dotted version such as ``1.2``,
       ``v1.2.3`` or ``1.x``.  A hit on the very first word is not
       distinguished from "no hit", because index 0 doubles as the
       not-found marker;
    3. the first word accepted by ``utils.contain_number`` that is not one
       of the architecture tags ``x64``, ``x86``, ``x86_64``; index 0 when
       there is no such word.

    The word list and the index are handed to ``get_right_part`` (version)
    and ``get_left_part`` (software).  The software text additionally goes
    through ``remove_duplicate_word_from_software`` before being returned.
    """
    line = line.lower()
    con_word_list = line.split()

    # Strategy 1: contains keyword software.
    keyword_loc = -1
    for word_idx, word in enumerate(con_word_list):
        if word in ('windows', 'office'):
            keyword_loc = word_idx + 1
            break

    if keyword_loc != -1:
        content_line_version = get_right_part(con_word_list, keyword_loc)
        content_line_software = get_left_part(con_word_list, keyword_loc)
    else:
        # Strategy 2: the n-th word is a version in 1.1.x format.
        version_loc = 0
        for word_idx, word in enumerate(con_word_list):
            if re.match(r'(v)?[\d]{1,2}((\.[\d]{1,2}){1,2}(\.x)?|\.x)',
                        word):
                version_loc = word_idx
                break

        if version_loc != 0:
            content_line_version = get_right_part(con_word_list, version_loc)
            content_line_software = get_left_part(con_word_list, version_loc)
        else:
            # Strategy 3: find the first word that contains a number.
            for word_idx, word in enumerate(con_word_list):
                # The words are already lower-cased.  Compare the word
                # itself: comparing the bound method ``word.lower`` against
                # strings is always unequal, which made this exclusion a
                # no-op.
                if utils.contain_number(word) and word not in (
                        'x64', 'x86', 'x86_64'):
                    version_loc = word_idx
                    break
            content_line_version = get_right_part(con_word_list, version_loc)
            content_line_software = get_left_part(con_word_list, version_loc)
            if content_line_version in content_line_software:
                print(
                    'ERROR if content_line_version in content_line_software:')

    return content_line_version.lower(), remove_duplicate_word_from_software(
        content_line_software).lower()
def remove_space_in_focus_version(version_str):
    """Glue a two-word version whose second word is a dotted continuation.

    When *version_str* consists of exactly two whitespace-separated words,
    the second one starts with ``.`` and its remainder is accepted by
    ``utils.contain_number``, every space character is removed from the
    string.  Otherwise the input is returned unchanged.
    """
    pieces = version_str.split()
    if len(pieces) != 2:
        return version_str

    tail = pieces[1]
    if tail.startswith('.') and utils.contain_number(tail[1:]):
        return version_str.replace(' ', '')
    return version_str