Example #1
def find_sentences(keywords_list, input_str, topn):
    import re
    from flashtext.keyword import KeywordProcessor
    from collections import Counter
    '''
    :param keywords_list: list of keywords, e.g. ['word1', ..., 'wordn']
    :param input_str: document text as a string
    :param topn: keep at most the top-n matching sentences per keyword
    :return: keywords_sentences: dict {'word1': ['sen1', 'sen2', ...], ...}
    '''
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(keywords_list)

    # Split the document into sentences on punctuation
    doc_cut_list = set(re.split('[?!…。?!]', input_str))  # deduplicate

    # Find the sentences that contain each keyword
    sentences_wordcount = {}
    for sentence in doc_cut_list:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if len(keywords_found) != 0:
            keywords_count = Counter(keywords_found)
            sentences_wordcount[sentence] = keywords_count

    keywords_sentences = {}
    for word in keywords_list:
        keywords_sentences[word] = [
            k for k, v in sentences_wordcount.items() if word in v.keys()
        ][0:topn]

    return keywords_sentences
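
A quick usage sketch for find_sentences (the keywords and text are made-up sample data):

keywords = ['深度学习', '自然语言处理']
doc = '深度学习是机器学习的一个分支。自然语言处理应用广泛!深度学习推动了自然语言处理的发展。'
print(find_sentences(keywords, doc, topn=2))
# e.g. {'深度学习': ['深度学习是机器学习的一个分支', ...], '自然语言处理': [...]}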
Example #2
 def test_file_format_two(self):
     keyword_processor = KeywordProcessor()
     keyword_processor.add_keyword_from_file('test/keywords_format_two.txt')
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed file format two test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format two test")
Example #3
 def test_list_loading(self):
     keyword_processor = KeywordProcessor()
     keyword_list = ["java", "product management"]
     keyword_processor.add_keywords_from_list(keyword_list)
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed list loading test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed list loading test")
Example #4
def extract_tag_frequency(target_date, target_file_path, target_dir_path):
    """
    Extract tag frequency
    :param      target_date:            Target date
    :param      target_file_path:       Target file path
    :param      target_dir_path:        Target directory path
    """
    import os
    import traceback
    from flashtext.keyword import KeywordProcessor

    target_tag_freq_dict = dict()
    target_tag_file_dict = dict()
    keyword_processor = KeywordProcessor()
    target_file = open(target_file_path)
    for tag in target_file:
        tag = ' ' + tag.strip() + ' '
        keyword_processor.add_keyword(tag)
    target_file.close()
    for dir_path, sub_dirs, files in os.walk(target_dir_path):
        for file_name in files:
            nlp_file_path = os.path.join(dir_path, file_name)
            try:
                nlp_file = open(nlp_file_path)
                for line in nlp_file:
                    line_list = line.strip().split('\t')
                    tag_sent = ' ' + line_list[2] + ' '
                    keywords_found = keyword_processor.extract_keywords(tag_sent)
                    for keyword in keywords_found:
                        keyword = keyword.strip()
                        if keyword not in target_tag_freq_dict:
                            target_tag_freq_dict[keyword] = 1
                        else:
                            target_tag_freq_dict[keyword] += 1
                        if keyword not in target_tag_file_dict:
                            target_tag_file_dict[keyword] = [nlp_file_path]
                        else:
                            if nlp_file_path not in target_tag_file_dict[keyword]:
                                target_tag_file_dict[keyword].append(nlp_file_path)
                nlp_file.close()
            except Exception:
                print(traceback.format_exc())
                print("[ERROR] Can't analyze {0}".format(nlp_file_path))
                continue
    frequency_output_file = open('{0}_frequency.txt'.format(target_date), 'w')
    sorted_tag_list = sorted(target_tag_freq_dict)
    for tag in sorted_tag_list:
        print('{0}\t{1}'.format(tag, target_tag_freq_dict[tag]),
              file=frequency_output_file)
    frequency_output_file.close()
    file_list_output_file = open('{0}_file_list.txt'.format(target_date), 'w')
    sorted_file_list = sorted(target_tag_file_dict)
    for tag in sorted_file_list:
        for file_nm in target_tag_file_dict[tag]:
            print('{0}\t{1}'.format(tag, file_nm), file=file_list_output_file)
    file_list_output_file.close()
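
A hypothetical invocation (the paths are illustrative; the tag file holds one tag per line, and each NLP file is tab-separated with the tagged sentence in its third column):

extract_tag_frequency('20200101', 'tags.txt', '/data/nlp_output')
# writes 20200101_frequency.txt and 20200101_file_list.txt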
Example #5
    def test_extract_keywords(self):
        """For each of the test case initialize a new KeywordProcessor.
        Add the keywords of the test case to the KeywordProcessor.
        Extract keywords and check if they match the expected result for the test case.

        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
            keywords_extracted = keyword_processor.extract_keywords(
                test_case['sentence'])
            self.assertEqual(
                keywords_extracted, test_case['keywords'],
                "keywords_extracted don't match the expected results for test case: {}"
                .format(test_id))
Example #6
    def test_dictionary_loading(self):
        keyword_processor = KeywordProcessor()
        keyword_dict = {
            "java": ["java_2e", "java programing"],
            "product management": ["product management techniques", "product management"]
        }
        keyword_processor.add_keywords_from_dict(keyword_dict)

        sentence = 'I know java_2e and product management techniques'
        keywords_extracted = keyword_processor.extract_keywords(sentence)
        self.assertEqual(keywords_extracted, ['java', 'product management'],
                         "Failed dictionary loading test")
        sentence_new = keyword_processor.replace_keywords(sentence)
        self.assertEqual(sentence_new, "I know java and product management",
                         "Failed dictionary loading test")
Example #7
def find_sentences_weight(word_weight_dict, input_str, topn):
    import re
    from flashtext.keyword import KeywordProcessor
    from collections import Counter
    '''
    :param word_weight_dict: dict {'word1': weight1, ..., 'wordn': weightn}
    :param input_str: document text as a string
    :param topn: return the top-n most important sentences
    :return: topn_sentences: list ['sen1', 'sen2', ...]
    '''
    keywords_list = list(word_weight_dict.keys())

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(keywords_list)

    input_str = re.sub(r'\s', '', input_str)
    doc_cut_list = set(re.split('[?!…。;?!]', input_str))  # deduplicate

    # Score each sentence by the keywords it contains
    sentences_wordcount = {}  # {'sentence': (value, ['word1', ..., 'wordn'])}
    for sentence in doc_cut_list:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if len(keywords_found) != 0:
            keywords_in_sentence = []
            sentence_value = 0
            keywords_count = Counter(keywords_found)
            for k, v in keywords_count.items():
                # value = sum of the weights of the keywords in the sentence
                sentence_value = sentence_value + v * word_weight_dict[k]
                keywords_in_sentence.append(k)
            # value = keyword count * summed weight / sentence length; drop this line if unwanted
            sentence_value = len(keywords_in_sentence) * sentence_value / len(sentence)
            sentences_wordcount[sentence] = (sentence_value,
                                             keywords_in_sentence)

    sentences_wordcount_sort = sorted(
        sentences_wordcount.items(), key=lambda x: x[1][0],
        reverse=True)  # [('sentence', (value, ['word1', ..., 'wordn'])), ...]
    topn_sentences = [s[0] for s in sentences_wordcount_sort[0:topn]]

    return topn_sentences
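
A usage sketch for the weighted variant (the weights and text are made-up sample data):

weights = {'深度学习': 0.9, '自然语言处理': 0.7}
doc = '深度学习是机器学习的一个分支。自然语言处理应用广泛!深度学习推动了自然语言处理的发展。'
print(find_sentences_weight(weights, doc, topn=1))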
Example #8
def extract_word_frequency(target_date, target_file_path, target_dir_path):
    """
    Extract word frequency
    :param      target_date:            Target date
    :param      target_file_path:       Target file path
    :param      target_dir_path:        Target directory path
    """
    import os
    import traceback
    from flashtext.keyword import KeywordProcessor

    target_word_freq_dict = dict()
    keyword_processor = KeywordProcessor()
    target_file = open(target_file_path)
    for tag in target_file:
        tag = tag.strip()
        keyword_processor.add_keyword(tag)
    target_file.close()
    for dir_path, sub_dirs, files in os.walk(target_dir_path):
        for file_name in files:
            if not file_name.endswith('_trx.txt'):
                continue
            trx_file_path = os.path.join(dir_path, file_name)
            try:
                trx_file = open(trx_file_path)
                for line in trx_file:
                    line = line.strip()
                    keywords_found = keyword_processor.extract_keywords(line)
                    for keyword in keywords_found:
                        keyword = keyword.strip()
                        if keyword not in target_word_freq_dict:
                            target_word_freq_dict[keyword] = 1
                        else:
                            target_word_freq_dict[keyword] += 1
                trx_file.close()
            except Exception:
                print(traceback.format_exc())
                print("[ERROR] Can't analyze {0}".format(trx_file_path))
                continue
    frequency_output_file = open('{0}_frequency.txt'.format(target_date), 'w')
    sorted_tag_list = sorted(target_word_freq_dict)
    for tag in sorted_tag_list:
        print('{0}\t{1}'.format(tag, target_word_freq_dict[tag]),
              file=frequency_output_file)
    frequency_output_file.close()
Example #9
def getSemanticNeighbors(entNameDict, entTextDict):
    semNeiDict = {i: set() for i in dm.entityDict.keys()}
    keyword_processor = KeywordProcessor()
    name2midDict = {}
    for mid in dm.entityDict.keys():
        if mid not in entNameDict.keys(): continue
        name = entNameDict[mid]
        if len(name) <= 3: continue
        keyword_processor.add_keyword(name)
        name2midDict[name] = mid

    for mid2 in entTextDict.keys():
        if mid2 not in dm.entityDict.keys(): continue
        text = entTextDict[mid2].lower()
        keywords_found = keyword_processor.extract_keywords(text)
        for key in keywords_found:
            mid = name2midDict[key]
            if mid != mid2:
                semNeiDict[mid].add(mid2)
                semNeiDict[mid2].add(mid)
    return semNeiDict
Example #10
def find_sentences_rule(keywords_list, doc_txt):
    import re
    from flashtext.keyword import KeywordProcessor

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(keywords_list)

    # Read the file and drop empty lines
    fie = [
        line.strip()
        for line in open(doc_txt, 'r', encoding='utf-8').readlines()
        if len(line) > 1
    ]
    # Join the lines with '*' so that lines without trailing punctuation
    # still get a split point, then split on punctuation
    article = ''
    for i in fie:
        article += i.strip() + '*'
    doc_cut_list = re.split('[?!…。?!*]', article)

    sentences_important = []
    for sentence in doc_cut_list:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if len(keywords_found) != 0:
            sentences_important.append(sentence)
    print(sentences_important)
    return sentences_important
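
A hypothetical call (doc.txt stands in for any UTF-8 text file, one or more sentences per line):

important = find_sentences_rule(['深度学习', '自然语言处理'], 'doc.txt')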
Example #11
from flashtext.keyword import KeywordProcessor

keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword('SQL injection')
keyword_processor.add_keyword('SQL injection',
                              ('vulnerability type', 'SQL injection'))
keyword_processor.add_keyword('cross-site scripting',
                              ('vulnerability type', 'cross-site scripting'))
keyword_processor.add_keyword('cross-site scripting', 'XSS')
# keyword_processor.add_keyword('parameter')
# keyword_processor.add_keyword('function')
# keyword_processor.add_keyword('variable')

keyword_dict = {"cross-site scripting": ["XSS"], "parametert": ["variabler"]}
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["parameter", "function", "variable"])

####keyword replacement
# keyword_processor.add_keyword('cross-site scripting', 'XSS')
# keyword_processor.replace_keywords('vulnerability is cross-site scripting')

keyword_processor.extract_keywords(
    'SQL injection vulnerability in the update_zone function in catalog/admin/geo_zones.php in osCommerce Online Merchant 2.3.3.4 and earlier allows remote administrators to execute arbitrary SQL commands via the zID parameter in a list action. '
)
Example #12
# (fragment: pdfReader and num_pages come from earlier PyPDF2 setup)
count = 0
text = ""
while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count += 1
    text += pageObj.extractText()

#now part for Java-keywords
#aho-corasick algorithm for string matching
#trie data-structure
keywords = open("keywords.txt", "r")
keys = keywords.read().splitlines()
keywords.close()
keyword_processor = KeywordProcessor()
for i in range(len(keys)):
    keyword_processor.add_keyword(keys[i])
# extract once, after all keywords have been added
keywords_found = keyword_processor.extract_keywords(text)
found = keywords_found


# Given a list of words, return a dictionary of
# word-frequency pairs.
def wordListToFreqDict(wordlist):
    wordfreq = [wordlist.count(p) for p in wordlist]
    return dict(zip(wordlist, wordfreq))


# Sort a dictionary of word-frequency pairs in
# order of descending frequency.
def sortFreqDict(freqdict):
    aux = [(freqdict[key], key) for key in freqdict]
    aux.sort(reverse=True)
    return aux
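
A small demonstration of the two helpers above (the word list is made-up sample data):

words = ['java', 'python', 'java', 'sql', 'java', 'python']
freq = wordListToFreqDict(words)  # {'java': 3, 'python': 2, 'sql': 1}
print(sortFreqDict(freq))         # [(3, 'java'), (2, 'python'), (1, 'sql')]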
Example #13
def classification(textSplit):
    keyword_cyber_security_risk = [
        "ad fraud", "cyberattack", "malware", "botnets", "CnC",
        "Command and Control", "compromised accounts", "hacking",
        "key logging", "phishing", "spyware", "worm", "trojan", "RAT", "APT",
        "adware", "fileless attack", "cryptocurrency mining", "spam",
        "ransomware", "denial-of-service", "sql injection",
        "man-in-the-middle", "compromised pcs", "spam site",
        "malicious payload", "apt", "advanced persistent threat", "spoofing",
        "virus", "slowloris", "xss", "cross-site scripting", "exploit",
        "vulnerability", "cve", "day zero", "backdoor", "blackhat",
        "bruteForce", "botNet", "cracking", "forensics", "scanning",
        "enumeration", "reconnaissance", "autorun worms",
        "advanced persistent threats", "attack vector", "blended attack",
        "browser hijacker", "brute force attack", "clickjacking",
        "command and control servers", "content spoofing",
        "cross site scripting", "xsrf", "data theft",
        "denial of service attack", "dictionary attack", "drive-by download",
        "email spoofing", "email virus", "form grabber", "identity theft",
        "insider attack", "keylogger", "likejacking", "man in the middle",
        "rootkit", "wabbit", "website spoofing", "ip", "tcp", "router",
        "network", "cisco"
    ]

    Adult_Content = [
        "pornography", "violence", "blood", "gore", "sex", "nudity", "erotic",
        "hardcore", "fetish", "intercourse", "explicit content", "hentai",
        "masturbation", "dick", 'pussy', "penis", "v****a", "anus", 'boobs',
        'p**n', 'xxx'
    ]

    keyword_Aggressive = [
        "attacking", "fighting", "invading", "assailing", "threaten",
        "slashing", "beating", "destroy", "destruction", 'assault'
    ]

    Keyword_arts = [
        "painting", "drawing", "ceramics", "pottery", "photography",
        "sculpture", "dance"
    ]

    keyword_automotive = [
        'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw', 'bugatti',
        'chevrolet', 'chrysler', 'citroen', 'dodge', 'ferrari', 'honda',
        'toyota', 'hyundai', 'kia', 'lamborghini', 'lexus', 'mazda',
        'mercedes', 'skoda', 'mitsubishi', 'nissan', 'porsche', 'subaru',
        'suzuki', 'tesla', 'volkswagen', 'volvo', 'horsepower', 'torque'
    ]

    keyword_cloud_service = [
        "cloud backup", "cloud storage", "cloud processing", "iaas", "paas",
        "saas", "aws", "azure", "google cloud", "amazon web services",
        "infrastructure as a service", "platform as a service",
        "software as a service", "cloud software", "IBM cloud", "vmware",
        "salesforce", "oracle cloud", "sap cloud", "alibaba cloud",
        "cloud service"
    ]

    keywords_IM = [
        "discord", "skype", "viber", "whatsapp", "facebook messenger",
        "wechat", "telegram", "line", "qq mobile"
    ]

    keyword_Criminal_Activities = [
        "arson", "assault", "bigamy", "blackmail", "bribery", "burglary",
        "child abuse", "conspiracy", "espionage", "forgery", "fraud",
        "genocide", "hijacking", "homicide", "kidnap", "manslaughter",
        "mugging", "murder", "kill", "perjury", "rape", "riot", "robbery",
        "shoplift", "slander", "smuggle", "treason", "trespass", "gang fights",
        "steal", "theft", "cyber crime", "corruption", "domestic", "violence",
        "ransom", "vandalism", "child abuse ", "terrorism", "militia",
        'insurgent', 'bombing', 'terrorist', 'make bomb', 'bomb making',
        'bombs'
    ]

    keyword_dating = [
        "online dating", "tinder", "okcupid", "valentines", "romantic",
        "roses", "presents", "anniversary", "rings", "dating ideas",
        "movie dates", 'wedding', 'hook up'
    ]

    keyword_softwareDevelopement = [
        "pycharm", "netbeans", "sqlite", "linux", "visual studio", "node.js",
        "codenvy", "angularjs", "eclipse", "react native", 'python', 'java',
        'c++', 'ruby on rails', 'flutter', 'javascript', 'html', 'maven',
        'node.js', 'html', 'css', 'php', 'database', 'sql', 'db', 'pip',
        'web development', 'code', 'debug', 'c#', 'kotlin', 'objective-c',
        'visual basic', 'perl', 'matlab', 'libraries', 'stack development',
        'backend', 'frontend', 'framework', 'software develop',
        'machine learning', 'tensorflow', 'AI', 'API',
        'application programming interface'
    ]

    keyword_Ecommerce_Tools = [
        "ecommerce website tools", "research tools", "business tools",
        "marketing tools", "analytics tools", "bigcommerce", "x-cart",
        "shopify", "woocommerce", "prestashop", "junglescout", "semrush",
        "ahrefs", "sourcify", "veeqo", "tickspot", "asana", "inventory source",
        "oberlo", "shipwire", "tradegecko", "shippingeasy", "wave", "ecomdash",
        "mailchimp", "campaign monitor", "feeds4", "active campaign",
        "bulk.ly", "buffer", "omnistar", "antavo", "smile.lo", "user testing",
        "wishpond", "klaviyo", "buzzstream", "exitbee", "metrilo", "storeya",
        "instasize", "visual website optimizer", "optimizely analytics",
        "google analytics", "neatly", 'search engine optimization', 'SEO'
    ]

    keyword_Entertainment = [
        "plays", "comedy", "puppet shows", "sports", "performance art",
        "animation", "karaoke", "video games", "dance", "magic",
        "television programs", "music", "acting", "nightclubs",
        "fashion shows", "netflix", "concerts", "circus", "parties",
        "symphonies", "theatre", "variety shows"
    ]

    keyword_Software_Downloads_Sites = [
        "download.com", "filehippo", "zdnet download", "softpedia", "tucows",
        "freewarefiles", "majorgeeks", "filecluster", "soft32", "torrent",
        "softonic", "freewarehome", "ninite", "download crew", "filehorse",
        "filepuma", "sourceforge", "software"
        "informer", "alternativeto"
    ]

    keyword_Finance_Accounting = [
        "Accounts payable", "accounts receivable", "accrued expense",
        "balance sheet", "book value", "equity", "inventory", "zoho books",
        "xero"
    ]

    keyword_Food_drinks = [
        "macdonald", "kfc", "grabfood", "subway", "jolibee", "coke", "laksa",
        "chicken rice", "yong tau foo", "buffet", "pizza", "bbq",
        "black pepper", "beef", "mutton", "curry", "nasi lemak", "carrot cake",
        "green tea", "bubble tea", "pudding jelly", "cake", "bread", "milo",
        "ice cream", "fishball"
    ]

    keyword_Gambling = [
        "poker", "roulette", "slot-machines", "bingo", "baccarat",
        "casino war", "craps", "carribean stud", "keno", "let it ride",
        'betting'
    ]

    keyword_government_legal = [
        "moe", "mof", "mha", "mfa", "mti", "msf", "mod", "mol", "mom", "moh",
        "mot", ".gov.sg", 'government', 'ministry of', 'minister',
        'minister of'
    ]

    keyword_Hobbies_Interests = [
        "Sports", "music", "travel", "fishing", "social work",
        "volunteer work", "painting", "dancing", "reading", "writing",
        "gardening", "animal care", "cooking", "bowling", "computer gaming",
        "fashion", "ice skating", "magic", "origami", "photography",
        "sculpting", "comedy", "winemaking", "yoga", "computer programming",
        "diving", "football", "basketball", "tennis", "badminton",
        "table tennis", "soccer", "rugby", "jogging", "marathon", "cycling",
        "rock climbing", "swimming", "cheerleading", "fencing", "laser tag",
        "darts", "eating", "sleeping", "hockey", "weightlifting", "volleyball",
        "martial arts", "hiking", "backpacking", "archery", "wrestling",
        "boxing", "poker", "chess"
    ]

    keyword_insurance = [
        "life insurance", "health insurance", "travel insurance",
        "home insurance", "child insurance", "maid insurance", "car insurance",
        "pet insurance", "personal accident insurance", "term life insurance",
        "whole life insurance", "ntuc income", "great eastern", "prudential",
        "AIA", "aviva", "savings plan", "integrated shield plan",
        "trip delays", "baggage delay", "lost items", "medical coverage",
        "missed flights"
    ]

    keyword_jobsearch = [
        "career@gov", "jobstreet", "gumtree", "indeed", "jobsdb", "stjobs",
        "mycareerfuture", "jobscentral", "linkedin", "startupjobs"
    ]

    Keyword_kids = [
        "hasbro", "nursery rhythms", "fox kids", "smiggle", "kiddy palace",
        "Playground", "toy r us", "avent", "enfagrow", "kinder joy"
    ]

    keyword_Military = [
        "army", "air force", "navy", "rank", "infrantry", "armour",
        "artillery", "special forces", "rangers", "guards", "military police",
        "signals", "combat engineers", "field engineers", "sar 21",
        "machine guns", "missile launcher", "weapons", "medic", "tanks",
        "fighter jets", "helicopter", "armoured vehicles", "rocket artillery",
        "armoured carriers", "sergeant", "officer", "encik"
    ]

    keyword_news_and_media = [
        "cna", "bbc", "thestraitstime", "thenewspaper", "mediacorp",
        "techredar", "asiaone", "yahoo", "msn", "flipboard", "twitter",
        "dailymail", "today", "thebusinesstimes", 'reporters'
    ]

    keyword_peer2peer = [
        "pirate bay", "kickass torrent", "torrent", "rarbg", "1337x",
        "torlock", "YTS", " qBittorrent", "Vuze", "Deluge", "uTorrent",
        "BitTorrent", "EZTV", "ETTV", "Popcorn Time", "LimeTorrents"
    ]

    keyword_pets = [
        "cat", "dog", "rabbit", "hamster", "fish", "bird", "guinea pig",
        "chinchilla", "cow", "chicken", "sheep", "lamb", "pig", "llama",
        "turtle", "tortoise", "frog"
    ]

    keywords_realEstate = [
        "hdb", "bungalow", "studio", "semi-detached", "condos", "landed",
        "propnex realty", "huttons asia", "era", "propseller", 'condominium',
        'apartment', 'mansionette', 'property guru', 'property agent'
    ]

    keyword_Search_engines = [
        'google', 'yahoo', 'bing', 'duckduckgo', 'wiki.com', 'gibiru',
        'boardreader', 'baidu', 'torsearch', 'ask.com'
    ]

    keywords_shopping = [
        "qoo10", "lazada", "shopee", "zalora", "taobao", "amazon", "carousell",
        "ebay", "redmart", "reebonz"
        "online shopping", "online sale", "free shipping", "free delivery",
        "next day delivery"
    ]

    Keyword_social = [
        "imgur", "facebook", "twitter", "instagram", "tumblr", "flicker",
        "google+", "youtube", "pinterest", "reddit", "snapchat", "baidu tieba",
        "skype", "telegram", "whatsapp", "hardwarezone", "forum"
    ]

    keyword_mediaStreaming = [
        "netflix", "youtube", "apple Tv", "chromecast", "subsonic",
        "audio galaxy", "tudou", "baidu", "dailymotion", "vimeo"
    ]

    keywords_trading_invest = [
        "stocks", "money", "profits", "srs", "blue-chip", "growth", "dividend",
        "nasdaq", "corporate bonds", "etf"
    ]

    Keyword_translation = [
        "google translate", "yandex", "babelfish", "tradukka", "linguee",
        "systranet", "permondo", "translatesonline.com"
    ]

    keyword_webhosting_isp_telco = [
        "singtel", "starhub", "m1", "circlelife", "tpg", "myrepublic",
        "viewquest", "alibaba", "apc", "amazon web"
    ]

    keyword_web_hosting = [
        "bluehost", "inmotion hosting", "hostgator", "hostinger", "godaddy",
        "tsohost", "wix", "siteground", "hostwinds", "weebly", "squarespace",
        "vodien", "a2 hosting", "dreamHost", "website hosting", "domain name",
        "namecheap", "host website", "domain registration", "whois",
        "website server", "apache", "nginx"
        "web host"
    ]

    keyword_proxies_vpn = [
        "expressvpn", "nordvpn", "ipvanish", "hotspot shield", "tunnelbear",
        "hidester", "hide.me", "proxysite.com", "kproxy", "VPNbook",
        "whoer.net", "megaproxy"
    ]

    keyword_webmail = [
        "gmail", "hotmail", "live", "yahoo", "outlook", "aol", "zoho",
        "protonmail"
    ]

    keyword_travel = [
        'booking.com', 'tripadvisor', 'expedia', 'airbnb', 'agoda',
        'priceline', 'skyscanner', 'kayak.com', 'makemytrip', 'cheapoair',
        'trivago', 'travelocity', 'orbitz', 'hotelurbano', 'book hotel',
        'air tickets', 'airfares', 'hotels', 'cheap flight', 'cheap hotel',
        'airline', 'flights'
    ]

    keyword_drugs = [
        'marijuana', 'opium', 'heroin', 'cocaine', 'barbiturates', 'meth',
        'ice', 'crystal meth', 'ecstacy', 'weed', 'cannabis'
    ]

    Keyword_weapons = [
        "gun", "sword", "machine gun", "butterfly knife", "rocket", "bazooka",
        "flamethrower", "pistol", "rifle", "grenade", "sniper"
    ]

    keyword_sports = [
        "soccer", "football", "tennis", "basketball", "hockey", "bowling",
        "table-tennis", "kayaking", "canoeing", "snorkeling", "diving",
        "swimming", "scuba-diving", 'martial arts'
    ]

    Keyword_religion = [
        "Buddihsm", "Hinduism", "Sikhism", "Christianity", "Islam", "Judaism",
        "Spiritism", "Shinto", "Taoism"
    ]

    Keyword_technology = [
        "cloud computing", "5g", "computer ai", "wireless", "ssd",
        "smartphone", "drones", "robots", "gaming", "smartwatch"
    ]

    keyword_cyber_security_solutions = [
        "identity and access management", "IAM", "cloud security",
        "risk and compliance management", "encryption", "data loss prevention",
        "DLP", "UTM", "unified threat management", "firewall", "antivirus",
        "antimalware", "IDS", "intrusion detection system",
        "intrusion prevention system", "IPS", "disaster recovery",
        "ddos mitigation", "cyber security solution", "IT security", "cisco",
        "symantec", "norton", "trend micro", "avast", "carbon black",
        "crowd strike", "fortinet", "palo alto", "splunk", "mcafee", "sophos",
        "proofpoint", "imperva", "fireye", "LogRythm", "Netskope", "trustwave"
    ]

    keyword_education = [
        ".edu", "coursera", "khan academy", "open culture", "udemy",
        "academic earth", "edx", "university", "polytechnic", "diploma",
        "bachelors", "degree", "phd", "masters", "professor", "scholarship",
        "schooling", "teaching", "learning", "education", "online learning",
        "distance learning", "institute"
    ]

    keyword_tobacco = [
        'marlboro', 'camel', 'cigarette', 'tobacco', 'lucky strike', 'winston',
        'dunhill', 'lung cancer', 'viceroy', 'smoking', 'vape', 'e-cigarette',
        'cigar', 'vaping', 'vaporiser', 'electronic cigarette'
    ]

    keywords=keyword_cyber_security_risk+Adult_Content+keyword_Aggressive+Keyword_arts+keyword_automotive+keyword_cloud_service+\
             keywords_IM+keyword_Criminal_Activities+keyword_dating+keyword_softwareDevelopement+keyword_Ecommerce_Tools+keyword_Entertainment+\
             keyword_Software_Downloads_Sites+keyword_Finance_Accounting+keyword_Food_drinks+keyword_Gambling+keyword_government_legal+\
             keyword_Hobbies_Interests+keyword_insurance+keyword_jobsearch+Keyword_kids+keyword_Military+keyword_news_and_media+\
             keyword_peer2peer+keyword_pets+keywords_realEstate+keyword_Search_engines+keywords_shopping+Keyword_social+keyword_mediaStreaming+\
             keywords_trading_invest+Keyword_translation+keyword_webhosting_isp_telco+keyword_web_hosting+keyword_proxies_vpn+keyword_webmail+keyword_travel+\
             keyword_drugs+Keyword_weapons+keyword_sports+Keyword_religion+Keyword_technology+keyword_cyber_security_solutions+keyword_education+keyword_tobacco

    # Build one processor per category, plus one over all keywords, then
    # classify by the category with the highest share of total matches.
    categories = [
        ('Cyber-Security Risk', keyword_cyber_security_risk),
        ('Adult Content', Adult_Content),
        ('Aggressive', keyword_Aggressive),
        ('Arts', Keyword_arts),
        ('Automotive', keyword_automotive),
        ('Cloud Services', keyword_cloud_service),
        ('Instant Messaging', keywords_IM),
        ('Criminal Activities', keyword_Criminal_Activities),
        ('Dating', keyword_dating),
        ('Software Development', keyword_softwareDevelopement),
        ('Ecommerce Tools', keyword_Ecommerce_Tools),
        ('Entertainment', keyword_Entertainment),
        ('Software Download Sites', keyword_Software_Downloads_Sites),
        ('Finance & Accounting', keyword_Finance_Accounting),
        ('Food and Drinks', keyword_Food_drinks),
        ('Gambling', keyword_Gambling),
        ('Government', keyword_government_legal),
        ('Hobbies and Interests', keyword_Hobbies_Interests),
        ('Insurance', keyword_insurance),
        ('Job Search', keyword_jobsearch),
        ('Kids', Keyword_kids),
        ('Military', keyword_Military),
        ('News & Media', keyword_news_and_media),
        ('Peer 2 Peer', keyword_peer2peer),
        ('Pets', keyword_pets),
        ('Real Estate', keywords_realEstate),
        ('Search Engine', keyword_Search_engines),
        ('Shopping', keywords_shopping),
        ('Social', Keyword_social),
        ('Media Streaming', keyword_mediaStreaming),
        ('Trading & Investment', keywords_trading_invest),
        ('Translation', Keyword_translation),
        ('WebHosting_ISP_Telco', keyword_webhosting_isp_telco),
        ('Webhosting', keyword_web_hosting),
        ('Proxies & VPN', keyword_proxies_vpn),
        ('Webmail', keyword_webmail),
        ('Travel', keyword_travel),
        ('Drugs', keyword_drugs),
        ('Weapons', Keyword_weapons),
        ('Sports', keyword_sports),
        ('Religion', Keyword_religion),
        ('Technology', Keyword_technology),
        ('Cyber-Security Technologies', keyword_cyber_security_solutions),
        ('Education', keyword_education),
        ('Tobacco', keyword_tobacco),
    ]

    kp_all = KeywordProcessor()
    for word in keywords:
        kp_all.add_keyword(word)
    total_matches = len(kp_all.extract_keywords(textSplit))
    if total_matches == 0:
        return 'None'

    Category = 'None'
    best_percentage = -1.0
    for name, word_list in categories:
        kp = KeywordProcessor()
        for word in word_list:
            kp.add_keyword(word)
        matches = len(kp.extract_keywords(textSplit))
        # percentage1 computes this category's share of the total matches
        per = float(percentage1(total_matches, matches))
        if per > best_percentage:  # on ties, the first (earlier) category wins
            Category = name
            best_percentage = per

    return Category
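
The snippet assumes a percentage1 helper that is not shown. A minimal definition consistent with how it is called (the share of a category's matches in the total) might be:

def percentage1(whole, part):
    return 100.0 * part / whole if whole else 0.0

print(classification('I love poker and roulette betting'))  # e.g. 'Gambling'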
Example #14
import re
import json

from flashtext.keyword import KeywordProcessor


class KeyParser:
    def __init__(self):
        self.num_with_text = re.compile(r"номер[еау][\s:]+[0-9]{3,12}")
        self.num_only = re.compile(r"[0-9]{3,12}")

        self.code_with_text = re.compile(r"код.+\s+сло.+[:= -]+[а-яА-Я ]{3,20}")
        self.code_only = re.compile(r"[а-яА-Я ]{3,20}")

        self.service_with_text = re.compile(r"(услуг(у|и)\s+(\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
        self.service_only = re.compile(r"((\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")

        self.tariff_with_text = re.compile(r"(тари(фы|ф)\s+(\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
        self.tariff_only = re.compile(r"((\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")

        synonims = {}
        with open("synonims.json", "r", encoding='utf-8') as syn_file:
            synonims = json.load(syn_file)
            self.yep_key_processor = KeywordProcessor()
            for synonim in synonims['yes']:
                self.yep_key_processor.add_keyword(synonim)

            self.nope_key_processor = KeywordProcessor()
            for synonim in synonims['not']:
                self.nope_key_processor.add_keyword(synonim)

            self.on_key_processor = KeywordProcessor()
            for synonim in synonims['on']:
                self.on_key_processor.add_keyword(synonim)

            self.off_key_processor = KeywordProcessor()
            for synonim in synonims['off']:
                self.off_key_processor.add_keyword(synonim)


    def find_num(self, text):
        search_ = self.num_with_text.findall(text)
        if len(search_) > 0:
            return self.num_only.findall(search_[0])[0].strip()

        search_ = self.num_only.findall(text)
        if len(search_) > 0:
            return search_[0].strip()
        return None

    def find_key(self, text):
        search_ = self.code_with_text.findall(text)
        if len(search_) > 0:
            return self.code_only.findall(search_[0])[1].strip()

        for line in reversed(text.splitlines()):
            search_ = self.code_only.findall(line)
            if len(search_) > 0:
                return search_[0].strip()
        return None

    def find_all_commands(self, message):
        text = message['body']
        return self.find_num(text), self.find_key(text)

    def find_bool(self, message):
        keywords_found = self.yep_key_processor.extract_keywords(message['body'])
        if len(keywords_found) > 0:
            return True

        keywords_found = self.nope_key_processor.extract_keywords(message['body'])
        if len(keywords_found) > 0:
            return False
        return None

    def find_tariff(self, message):
        text = message['body']
        used = list()
        matches = self.tariff_with_text.findall(text)  # guard against IndexError when nothing matches

        keywords_found = self.on_key_processor.extract_keywords(message['body'])
        used.extend(keywords_found)

        if matches:
            search_ = matches[0]
            used.append(search_[0])
            tarif_name = str(search_[0])
            tarif_names = self.tariff_only.findall(tarif_name)[0]
            if len(tarif_names) > 0:
                return tarif_names[0], used
        return None, used

    def find_service_changes(self, message):
        used = list()
        text = message['body']
        service_name = None
        mode = None

        keywords_found = self.on_key_processor.extract_keywords(message['body'])
        used.extend(keywords_found)

        matches = self.service_with_text.findall(text)  # guard against IndexError when nothing matches
        if matches:
            search_ = matches[0]
            used.append(search_[0])
            service_name = self.service_only.findall(search_[0])[0][0]

        keywords_found = self.on_key_processor.extract_keywords(message['body'])
        if len(keywords_found) > 0:
            mode = True

        keywords_found = self.off_key_processor.extract_keywords(message['body'])
        if len(keywords_found) > 0:
            mode = False
        return service_name, mode, used
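
A hypothetical round-trip, assuming a synonims.json file with "yes"/"not"/"on"/"off" synonym lists in the working directory:

parser = KeyParser()
msg = {'body': 'Подключите услугу "Интернет" на номере 12345678'}
print(parser.find_num(msg['body']))      # '12345678'
print(parser.find_service_changes(msg))  # (service name, on/off mode, matched fragments)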
Example #15
import pandas as pd
import requests
from flashtext.keyword import KeywordProcessor
from nltk.corpus import stopwords

# let's read in a couple of forum posts
forum_posts = pd.read_csv("../input/ForumMessages.csv")

# get a smaller sub-set for playing around with
sample_posts = forum_posts.Message[0:3]

# get data from list of top 5000 pypi packages (last 30 days)
url = 'https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json'
data = requests.get(url).json()

# get just the list of package names
list_of_packages = [data_item['project'] for data_item in data['rows']]

# create a KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(list_of_packages)

# remove english stopwords
keyword_processor.remove_keywords_from_list(stopwords.words('english'))

# remove custom stopwords
keyword_processor.remove_keywords_from_list(['http','kaggle'])

# test our keyword processor
for post in sample_posts:
    keywords_found = keyword_processor.extract_keywords(post, span_info=True)
    print(keywords_found)
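
With span_info=True each match comes back as a (clean_name, start_index, end_index) tuple rather than a bare string, so every hit can be located in the original post:

for keyword, start, end in keyword_processor.extract_keywords('I use numpy daily', span_info=True):
    print(keyword, start, end)  # numpy 6 11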
Example #16
import os
from collections import Counter

import jieba
from flashtext.keyword import KeywordProcessor


class SimWordCount(object):
    def __init__(self, input, keywords_list=None):

        # Initialize the keyword counter
        self.keyword_processor = KeywordProcessor()

        # Load the stopword dictionary
        stopwords = os.sep.join(
            [os.path.dirname(__file__), 'lib', 'stopwords_all.txt'])
        if not os.path.exists(stopwords):
            raise FileNotFoundError('stopwords file is not found!')
        self.stopwords_list = [
            line.strip().decode('utf-8')
            for line in open(stopwords, 'rb').readlines()
        ]

        # Add user-defined words to help the tokenizer
        if keywords_list:
            for word in keywords_list:
                jieba.add_word(word)

        # Load and tokenize the document
        self.seg_text = self._doc_cut(input)

        # Load the synonym dictionary (cilin)
        cilin = os.sep.join(
            [os.path.dirname(__file__), 'lib', 'new_cilin.txt'])
        if not os.path.exists(cilin):
            raise FileNotFoundError('synonym dictionary new_cilin.txt is not found!')
        self.file = cilin
        self.word_code = {}
        self.code_word = {}
        self.vocab = set()
        self._read_cilin()

        # Load the predefined similar-word groups
        if os.path.exists(
                os.sep.join([os.path.dirname(__file__), 'lib',
                             'simword.txt'])):
            self.simword_group = [
                line.strip().decode('utf-8-sig') for line in open(
                    os.sep.join([
                        os.path.dirname(__file__), 'lib', 'simword.txt'
                    ]), 'rb').readlines()
            ]
            self.simword_group = [
                group.split() for group in self.simword_group
            ]
        else:
            raise FileNotFoundError(
                'Add simword.txt to the current folder; its synonym groups are included in the statistics')

        # Load the blacklist
        if os.path.exists(
                os.sep.join(
                    [os.path.dirname(__file__), 'lib', 'black_list.txt'])):
            self.blacklist = [
                line.strip().decode('utf-8') for line in open(
                    os.sep.join([
                        os.path.dirname(__file__), 'lib', 'black_list.txt'
                    ]), 'rb').readlines()
            ]
        else:
            raise FileNotFoundError(
                'Add black_list.txt to the current folder; blacklisted words are excluded from the statistics')

        # Load the user dictionary for tokenization
        dict_path = os.sep.join(
            [os.path.dirname(__file__), 'lib', 'new_dict.txt'])
        try:
            jieba.load_userdict(dict_path)
        except FileNotFoundError:
            print('WARNING: for better tokenization, place a new_dict.txt '
                  'user dictionary in the current directory to assist jieba')

    def count(self,
              input,
              use_filter_words=True,
              topn=10,
              output_limit=1,
              count_only_singleword=True):
        '''
        :param input: input text                                type: str
        :param use_filter_words: apply the blacklist filter     True / False, default True
        :param topn: number of top groups (reserved)            type: int
        :param output_limit: minimum word length to report      type: int
        :param count_only_singleword: if True, skip the synonym-dictionary groups
        :return: keys, values - synonym groups and their counts, sorted by frequency
        '''
        # Validate the input parameters
        if topn < 1: raise ValueError('topn must be >= 1')
        if output_limit < 1: raise ValueError('output_limit must be >= 1')

        # Accept both booleans and the strings 'True'/'False'
        if isinstance(use_filter_words, str):
            if use_filter_words == 'True':
                use_filter_words = True
            elif use_filter_words == 'False':
                use_filter_words = False
            else:
                raise ValueError('use_filter_words must be True or False')

        words = set(self.seg_text)  # all distinct words in the text

        if count_only_singleword:  # count single-word frequencies only
            words_from_dict = set()
        else:  # also count words from the synonym dictionary
            words_from_dict = set(self.word_code.keys())  # distinct words in the synonym dictionary

        words_in_simgroup = set([
            word for elem in self.simword_group for word in elem
        ])  # distinct words in the user-defined groups
        # Words in neither the synonym dictionary nor the user-defined groups;
        # digits, symbols and single characters are filtered out
        words_notin_group = [
            [word]
            for word in list(words - words_from_dict - words_in_simgroup) if
            len(word) > output_limit and word >= '\u4e00' and word <= '\u9fff'
        ]

        # Register the user-defined words with flashtext
        dict_simgroup_result = {}
        if self.simword_group:
            self.keyword_processor.add_keywords_from_list(
                list(words_in_simgroup))
            keywords_found = self.keyword_processor.extract_keywords(input)
            keywords_count = Counter(keywords_found)
            # Count the frequency of each user-defined group
            for l in self.simword_group:
                word_count = 0
                for w in l:
                    word_count += keywords_count[w]
                if word_count != 0:
                    s = ','.join(l)
                    dict_simgroup_result[s] = word_count

        # Filter the results through the blacklist
        if use_filter_words:
            words_notin_filter = [word[0] for word in words_notin_group]
            words_notin_group = list(
                set(words_notin_filter) - set(self.blacklist))
            words_notin_group = [[word] for word in words_notin_group]
            words_from_dict = set(self.word_code.keys()) - set(self.blacklist)

            self.simword_group = [
                list(set(group) - set(self.blacklist))
                for group in self.simword_group
            ]  # also filter the user-defined groups

        text_word_dict = {}
        for word in words:
            if word in words_from_dict:
                if len(word) > output_limit:
                    text_word_dict[word] = self.word_code[word]

        words_group = self._get_words_group(text_word_dict)  # synonym groups
        # All words whose frequency must be counted (plain words plus synonym
        # groups); user-defined groups are counted by flashtext above
        words_group = words_notin_group + words_group

        dict_result = {}
        count = Counter(self.seg_text)
        for l in words_group:
            word_count = 0
            for w in l:
                word_count += count[w]
            s = ','.join(l)
            dict_result[s] = word_count

        if self.simword_group:
            all_dict_result = dict(dict_result, **dict_simgroup_result)
        else:
            all_dict_result = dict_result
        textgroup_count_sort = sorted(all_dict_result.items(),
                                      key=lambda x: x[1],
                                      reverse=True)  # groups sorted by frequency
        keys = [group[0] for group in textgroup_count_sort]
        values = [group[1] for group in textgroup_count_sort]
        return keys, values

    def count_span(self):
        # Compute each word's span across the document
        doc_cut = self.seg_text
        words = set(doc_cut)  # all distinct words in the text

        span_word = {}
        sum_span = len(doc_cut)  # total number of tokens in the document
        for word in words:
            first = doc_cut.index(word)  # first position in the document
            doc_cut.reverse()
            last = sum_span - doc_cut.index(word)  # last position in the document
            doc_cut.reverse()
            span_word[word] = (last - first + 1) / sum_span
        return span_word

    def _read_cilin(self):
        with open(self.file, 'r', encoding='gbk') as f:
            for line in f.readlines():
                res = line.split()
                code = res[0]  # sense code
                words = res[1:]  # the words sharing this code
                self.vocab.update(words)  # add the group to the vocabulary
                self.code_word[code] = words  # key: sense code, value: word group

                for w in words:
                    # Invert the mapping: key is the word, value its sense codes
                    if w in self.word_code.keys():
                        self.word_code[w].append(code)  # word already seen: append this code
                    else:
                        self.word_code[w] = [code]  # otherwise create a new entry

    @staticmethod
    def _get_words_group(word_label_dict):
        words_group = []
        values = [t for t in set(tuple(_)
                                 for _ in word_label_dict.values())]  # deduplicate
        for value in values:
            words = [
                k for k, v in word_label_dict.items() if tuple(v) == value
            ]  # identical code sets
            words_group.append(words)
        return words_group

    @staticmethod
    def _read_words_from_file(fn):
        f = open(fn, encoding='utf-8').readlines()
        words = [s.strip() for s in f if s.strip()]
        return words

    def _doc_cut(self, doc):
        seg_text = []
        for word in jieba.cut(doc, cut_all=False):
            if not word.strip():
                continue
            if word.strip() in self.stopwords_list:
                continue
            seg_text.append(word)
        return seg_text
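
A hypothetical instantiation, assuming the lib/ dictionaries the constructor loads (stopwords_all.txt, new_cilin.txt, simword.txt, black_list.txt) are all in place:

doc = open('article.txt', encoding='utf-8').read()  # illustrative path
swc = SimWordCount(doc, keywords_list=['机器学习'])
keys, values = swc.count(doc, use_filter_words=True)
print(list(zip(keys, values))[:10])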
Example #17
from flashtext.keyword import KeywordProcessor

# Keyword extraction
# add_keyword(keyword, clean_name) looks for `keyword` in the sentence and
# reports the `clean_name` replacement in the results
# English
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords(
    'I love Big Apple and Bay Area.')
print(keywords_found)
# Chinese
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('你好', '您好')  # first argument is the keyword, second the replacement
keyword_processor.add_keyword('不要')
keywords_found = keyword_processor.extract_keywords(
    '你好,请不要随便践踏草坪。')  # the extracted words are the replacements
print(keywords_found)

# Keyword replacement
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('你好', '您好')  # first argument is the keyword, second the replacement
new_sentence = keyword_processor.replace_keywords('你好,请不要随便践踏草坪。')
print(new_sentence)
Example #18
        # Remove unwanted terms from list (single letters, prepositions, stop-words etc.)
        abc_list = list(string.ascii_uppercase + string.ascii_lowercase)
        kwp.remove_keywords_from_list(abc_list)
        kwp.remove_keywords_from_list(word_boundary_list)

        for i in stopwords(f"{lang}"):
            kwp.remove_keyword(i)
            kwp.remove_keyword(i.capitalize())

        # Extract the terms
        with open(f"{outdir}/{lang}{suffix1}/{domain}{suffix2}full_tok.txt", "r", encoding="utf-8") as f:
            sentences = f.readlines()
#        print(sentences[-10:])
        results = []
        for line in tqdm(sentences):
            s = kwp.extract_keywords(line.rstrip())
            results.append(s)
        print(results[-10:])

        # Remove previously added terms from keyword processor
        for i in list_of_terms:
            kwp.remove_keyword(" ".join(i))

        # Write results:
        out = open(f"{outdir}/{lang}{suffix1}/{domain}{suffix2}label.txt","w",encoding="utf-8")

        for s in results:
            if re.match(r'^\s*$', "".join(s)):
                out.write("" + "\n")
            else:
                out.write("\t".join(s) + "\n")
        out.close()
Example #19
# coding=utf-8
# Rule of thumb: with more than ~500 keywords, flashtext is faster than compiled regex

from flashtext.keyword import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('PyTorch')
keyword_processor.add_keyword(keyword='and', clean_name='or')
keywords_found = keyword_processor.extract_keywords('I love Python and PyTorch.')
print(keywords_found)
# ['or', 'PyTorch']

keyword_processor.add_keyword(keyword='Python', clean_name='Tensorflow')
new_sentence = keyword_processor.replace_keywords('I love Python and PyTorch.')
print(new_sentence)
# I love Tensorflow or PyTorch.
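
A rough way to test the rule of thumb in the comment above (the keyword list and text are synthetic; timings vary by machine and corpus):

import re
import time
from flashtext.keyword import KeywordProcessor

keywords = ['keyword%d' % i for i in range(1000)]
text = 'filler text with keyword1 and keyword999 inside ' * 1000

kp = KeywordProcessor()
kp.add_keywords_from_list(keywords)
start = time.time()
kp.extract_keywords(text)
print('flashtext:', time.time() - start)

pattern = re.compile(r'\b(?:' + '|'.join(keywords) + r')\b')
start = time.time()
pattern.findall(text)
print('regex:', time.time() - start)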