Example #1
def get_kp(file):
    kp = KeywordProcessor()
    with open('../data/intents/' + file + '.csv', mode='r') as infile:
        reader = csv.reader(infile)
        for read in reader:
            kp.add_keyword(read[0], read[1])
    return kp
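
The layout of the intents CSV is not shown in this excerpt; the loop assumes each row carries the phrase to match in the first column and its clean name in the second, so a file like the following (illustrative contents, not from the original) would map both phrases to the intent 'greeting':

hello,greeting
hi there,greeting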
Example #2
def abusive_analysis(description):
    '''
    This function does all the preparation for keyword analysis and uses
    find_abuse and remove_stopwords as helper functions.

    Parameter:
        description
    '''
    #opening the text file of abusive words
    with open('bad_words_list.txt', 'r') as f:
        c = f.readlines()
    s = c[0].split(',')

    abusive_words = []
    for i in s:
        i = i[1:]
        abusive_words.append(i)

    keywords = np.array(abusive_words)
    kp0 = KeywordProcessor()  #creating object of KeywordProcessor

    for word in keywords:
        kp0.add_keyword(word)
    sw = stopwords.words('english')
    to_remove = [
        '[]', '', '1', '()', '||', '=', '.', ',', '\n', ':', ';', '\\', '//',
        '/'
    ]  #some additional stopwords provided manually

    for i in to_remove:
        sw.append(i)

    clean = remove_stopwords(description, sw)

    find_abuse(clean, kp0)
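
find_abuse and remove_stopwords are helper functions that are not included in this excerpt; a minimal sketch of what remove_stopwords might look like, assuming it takes the raw description and the stopword list and returns the cleaned text (name, signature and behaviour assumed, not from the original):

def remove_stopwords(description, sw):
    # hypothetical helper: drop stopwords and noise tokens from the description
    tokens = description.lower().split()
    return ' '.join(t for t in tokens if t not in sw)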
Example #3
def fastcleaner(docs, replacewords):
    '''
    Corpus cleaning tool: FlashText can quickly search and replace text across a large corpus
    INPUT  -> collection of documents (tokens separated by spaces), list of replacement pairs
    '''
    docs_new = []
    keyword_processor = KeywordProcessor()
    for word1, word2 in replacewords:
        keyword_processor.add_keyword(word1, word2)  # the first word is the term to find, the second is its replacement
    for doc in docs:
        docs_new.append(keyword_processor.replace_keywords(doc))
    return docs_new
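
A quick usage sketch for the function above (the inputs are illustrative, not from the original):

docs = ['I love Big Apple', 'Big Apple is huge']
print(fastcleaner(docs, [('Big Apple', 'New York')]))
# ['I love New York', 'New York is huge']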
Example #4
def extract_tag_frequency(target_date, target_file_path, target_dir_path):
    """
    Extract tag frequency
    :param      target_date:            Target date
    :param      target_file_path:       Target file path
    :param      target_dir_path:        Target directory path
    """
    target_tag_freq_dict = dict()
    target_tag_file_dict = dict()
    keyword_processor = KeywordProcessor()
    target_file = open(target_file_path)
    for tag in target_file:
        tag = ' ' + tag.strip() + ' '
        keyword_processor.add_keyword(tag)
    target_file.close()
    for dir_path, sub_dirs, files in os.walk(target_dir_path):
        for file_name in files:
            nlp_file_path = os.path.join(dir_path, file_name)
            try:
                nlp_file = open(nlp_file_path)
                for line in nlp_file:
                    line_list = line.strip().split('\t')
                    tag_sent = ' ' + line_list[2] + ' '
                    keywords_found = keyword_processor.extract_keywords(tag_sent)
                    for keyword in keywords_found:
                        keyword = keyword.strip()
                        if keyword not in target_tag_freq_dict:
                            target_tag_freq_dict[keyword] = 1
                        else:
                            target_tag_freq_dict[keyword] += 1
                        if keyword not in target_tag_file_dict:
                            target_tag_file_dict[keyword] = [nlp_file_path]
                        else:
                            if nlp_file_path not in target_tag_file_dict[keyword]:
                                target_tag_file_dict[keyword].append(nlp_file_path)
                nlp_file.close()
            except Exception:
                print(traceback.format_exc())
                print("[ERROR] Can't analyze {0}".format(nlp_file_path))
                continue
    frequency_output_file = open('{0}_frequency.txt'.format(target_date), 'w')
    sorted_tag_list = sorted(target_tag_freq_dict)
    for tag in sorted_tag_list:
        print('{0}\t{1}'.format(tag, target_tag_freq_dict[tag]), file=frequency_output_file)
    frequency_output_file.close()
    file_list_output_file = open('{0}_file_list.txt'.format(target_date), 'w')
    sorted_file_list = sorted(target_tag_file_dict)
    for tag in sorted_file_list:
        for file_nm in target_tag_file_dict[tag]:
            print('{0}\t{1}'.format(tag, file_nm), file=file_list_output_file)
    file_list_output_file.close()
Example #5
    def detect_rule_update(detect_list):
        logging.info('------------- flashtext keyword updating--------------')
        kp = KeywordProcessor()
        DetectThread.detect_regex_list = {}
        for rule_id in detect_list:
            rule = detect_list[rule_id]['regex']
            if rule.startswith('gfwmatch_'):
                rule = rule.replace('gfwmatch_', '')
                kp.add_keyword(rule)
            else:
                DetectThread.detect_regex_list[rule_id] = detect_list[rule_id]

        DetectThread.keyword_processor = kp

        logging.debug('------------- flashtext keyword res %s --------------' %
                      (kp.get_all_keywords()))
Example #6
def extract_word_frequency(target_date, target_file_path, target_dir_path):
    """
    Extract word frequency
    :param      target_date:            Target date
    :param      target_file_path:       Target file path
    :param      target_dir_path:        Target directory path
    """
    target_word_freq_dict = dict()
    keyword_processor = KeywordProcessor()
    target_file = open(target_file_path)
    for tag in target_file:
        tag = tag.strip()
        keyword_processor.add_keyword(tag)
    target_file.close()
    for dir_path, sub_dirs, files in os.walk(target_dir_path):
        for file_name in files:
            if not file_name.endswith('_trx.txt'):
                continue
            trx_file_path = os.path.join(dir_path, file_name)
            try:
                trx_file = open(trx_file_path)
                for line in trx_file:
                    line = line.strip()
                    keywords_found = keyword_processor.extract_keywords(line)
                    for keyword in keywords_found:
                        keyword = keyword.strip()
                        if keyword not in target_word_freq_dict:
                            target_word_freq_dict[keyword] = 1
                        else:
                            target_word_freq_dict[keyword] += 1
                trx_file.close()
            except Exception:
                print(traceback.format_exc())
                print("[ERROR] Can't analyze {0}".format(trx_file_path))
                continue
    frequency_output_file = open('{0}_frequency.txt'.format(target_date), 'w')
    sorted_tag_list = sorted(target_word_freq_dict)
    for tag in sorted_tag_list:
        print('{0}\t{1}'.format(tag, target_word_freq_dict[tag]),
              file=frequency_output_file)
    frequency_output_file.close()
Example #7
def getSemanticNeighbors(entNameDict, entTextDict):
    semNeiDict = {i: set() for i in dm.entityDict.keys()}
    keyword_processor = KeywordProcessor()
    name2midDict = {}
    for mid in dm.entityDict.keys():
        if mid not in entNameDict.keys(): continue
        name = entNameDict[mid]
        if len(name) <= 3: continue
        keyword_processor.add_keyword(name)
        name2midDict[name] = mid

    for mid2 in entTextDict.keys():
        if mid2 not in dm.entityDict.keys(): continue
        text = entTextDict[mid2].lower()
        keywords_found = keyword_processor.extract_keywords(text)
        for key in keywords_found:
            mid = name2midDict[key]
            if mid != mid2:
                semNeiDict[mid].add(mid2)
                semNeiDict[mid2].add(mid)
    return semNeiDict
Example #8
 def run(self):
     oracle = db_connection.Oracle(config.OracleConfig,
                                   failover=True,
                                   service_name=True)
     try:
         self.logger.info('[START] TA daemon process started')
         pid_list = list()
         # brand name keywords
         brand_keyword_dict = util.select_brand_keyword(self.logger, oracle)
         brand_keyword_processor = KeywordProcessor()
         for brand_keyword in brand_keyword_dict.keys():
             brand_keyword_processor.add_keyword(brand_keyword)
         # stopword keywords
         del_keyword_dict = util.select_del_keyword(self.logger, oracle)
         # sentiment keywords
         senti_keyword_dict = util.select_senti_keyword(self.logger, oracle)
         # HMD category overlap check rank
         category_rank_dict = util.select_hmd_category_rank(
             self.logger, oracle)
         while True:
             try:
                 job_list = self.make_job_list(oracle)
             except Exception:
                 self.logger.error(traceback.format_exc())
                 time.sleep(10)
                 continue
             for pid in pid_list[:]:
                 if not pid.is_alive():
                     pid_list.remove(pid)
             for job in job_list:
                 if len(pid_list) >= self.conf.process_max_limit:
                     self.logger.info('Processing Count is MAX....')
                     for pid in pid_list[:]:
                         if not pid.is_alive():
                             pid_list.remove(pid)
                     continue
                 # Job -> (rest_send_key, start_date, start_time, file_name, svc_type, team_id)
                 rest_send_key, start_date, start_time, file_name, svc_type, team_id = job
                 p = multiprocessing.Process(
                     target=do_task,
                     args=(job, brand_keyword_processor, del_keyword_dict,
                           senti_keyword_dict, category_rank_dict))
                 p.daemon = None
                 pid_list.append(p)
                 p.start()
                 # Update status
                 util.update_status(self.logger, oracle, '1', rest_send_key)
                 log_str = 'Execute rest_send_key = {0},'.format(
                     rest_send_key)
                 log_str += ' start_date = {0},'.format(start_date)
                 log_str += ' start_time = {0},'.format(start_time)
                 log_str += ' file_name = {0},'.format(file_name)
                 log_str += ' svc_type = {0},'.format(svc_type)
                 log_str += ' team_id = {0},'.format(team_id)
                 log_str += ' [PID={0}]'.format(p.pid)
                 self.logger.info(log_str)
                 time.sleep(self.conf.process_interval)
             time.sleep(0.1)
     except KeyboardInterrupt:
         self.logger.info('Daemon stopped by interrupt')
     except Exception:
         self.logger.error(traceback.format_exc())
     finally:
         try:
             oracle.disconnect()
         except Exception:
             pass
         finally:
             self.logger.info('[E N D] Daemon process stopped')
Example #9
class Preprocessing(object):
    def __init__(self):
        # init NLP
        self.nlp = Indonesian()

        # init flash text
        self.keyword_processor_slang_word = KeywordProcessor()
        self.keyword_processor_emoticon = KeywordProcessor()
        self.keyword_processor_meaning_text = KeywordProcessor()

        # init stemmer
        self.stemmer = StemmerFactory().create_stemmer()

        self.__init_flash_text_corpus()
        self.__init_custom_stop_word()

    def __init_flash_text_corpus(self):
        """ Init flash text corpus. """
        # build slang word corpus
        slang_words_raw = Repository.get_slang_word()
        for word in slang_words_raw.values:
            self.keyword_processor_slang_word.add_keyword(word[0], word[1])

        # build emoticon corpus
        emoticon_raw = constant.EMOTICON_LIST
        for key, values in emoticon_raw:
            for value in values:
                self.keyword_processor_emoticon.add_keyword(value, key)

        # build meaning word corpus
        meaning_words_raw = Repository.get_meaning_text()
        for word in meaning_words_raw.values:
            self.keyword_processor_meaning_text.add_keyword(word[0], word[1])

    def __init_custom_stop_word(self):
        """ Custom stop word for chat message content. """

        for stop_word in constant.STOP_WORD:
            self.nlp.vocab[stop_word].is_stop = True

        for stop_word in constant.EXC_STOP_WORD:
            self.nlp.vocab[stop_word].is_stop = False

    def cleaning(self, chat_message_list):
        """
        Pre-processing the content from ChatMessage.

        :param chat_message_list: dirty content from list of ChatMessage.
        :return: list of ChatMessage.
        """
        chat_message_list_temp = []

        if chat_message_list:
            logger.info('Pre-processing started...')
            start_time = time.time()
            chat_message_list = self.remove_repeated_message_from_agent(
                chat_message_list)
            for chat_message in chat_message_list:
                logger.info(f'BEFORE -> {chat_message.content}')
                content = self.__preprocessing_flow(chat_message.content)
                logger.info(f'AFTER -> {content}')
                chat_message.content = content
                if content.strip():
                    chat_message_list_temp.append(chat_message)

            logger.info(
                f'Pre-processing finished. {time.time() - start_time} seconds')
        else:
            logger.info('No chat message yet.')

        return chat_message_list_temp

    def cleaning_with_pipe(self, chat_message_list):
        """
        [DEPRECATED]
        Pre-processing the content from ChatMessage with multi threading from spaCy.

        :param chat_message_list: dirty content from list of ChatMessage.
        :return: list of ChatMessage.
        """

        if chat_message_list:
            logger.info('Pre-processing started...')
            start_time = time.time()
            index = 0

            chat_content_list = [
                chat_message.content for chat_message in chat_message_list
            ]
            for content in self.nlp.pipe(chat_content_list,
                                         n_threads=cpu_count()):
                chat_message_list[index].content = self.__preprocessing_flow(
                    content.text)
                index = index + 1

            logger.info(
                f'Pre-processing finished. {time.time() - start_time} seconds')
        else:
            logger.info('No chat message yet.')

        return chat_message_list

    def __preprocessing_flow(self, content):
        """ Preprocessing flow. """
        # normalize emoticon
        # content = PreprocessingUtilsV2.normalize_emoticon(content, self.keyword_processor_emoticon)

        content = str(content)

        # normalize url
        content = PreprocessingUtils.normalize_url(content)

        # remove url
        content = PreprocessingUtils.remove_url(content)

        # remove email
        content = PreprocessingUtils.remove_email(content)

        # remove digit number
        content = PreprocessingUtils.remove_digit_number(content)

        # case folding lower case
        content = PreprocessingUtils.case_folding_lowercase(content)

        # remove punctuation
        content = PreprocessingUtils.remove_punctuation(content)

        # remove repeated character
        content = PreprocessingUtils.remove_repeated_character(content)

        # normalize slang word
        content = PreprocessingUtilsV2.normalize_slang_word(
            content, self.keyword_processor_slang_word)

        # stemming, tokenize, remove stop word
        content = PreprocessingUtils.stemming(content, self.nlp, self.stemmer)

        # remove unused character
        content = PreprocessingUtils.remove_unused_character(content)

        # join negation word
        content = PreprocessingUtils.join_negation(content)

        # remove extra space between word
        content = PreprocessingUtils.remove_extra_space(content)

        # normalize word
        content = PreprocessingUtilsV2.normalize_meaning_word(
            content, self.keyword_processor_meaning_text)

        # remove stop word
        content = PreprocessingUtils.remove_stop_word(content, self.nlp)

        # TODO add another pre-processing if needed

        return content

    @staticmethod
    def identify_phrase(documents):
        """ documents : iterable of iterable of str """
        bigram = Phraser(
            Phrases(documents, min_count=5, delimiter=b'_', threshold=1))
        trigram = Phraser(
            Phrases(bigram[documents],
                    min_count=5,
                    delimiter=b'_',
                    threshold=1))

        for i in range(len(documents)):
            for token in bigram[documents[i]]:
                if '_' in token:
                    documents[i].append(token)
            for token in trigram[documents[i]]:
                if '_' in token:
                    documents[i].append(token)
        return documents

    @staticmethod
    def remove_repeated_message_from_agent(message_history_list):
        """ documents : removed repeated chat message if repeat more than constant.MESSAGE_TEMPLATE_MIN_COUNT"""
        message_template_list = []
        message_history_list_temp = []
        counter = collections.Counter()

        for chat_message in message_history_list:
            if chat_message.sender_role == constant.SENDER_ROLE_AGENT:
                counter[chat_message.content] += 1

        for key, value in counter.items():
            if value > constant.MESSAGE_TEMPLATE_MIN_COUNT:
                message_template_list.append(key)

        for chat_message in message_history_list:
            if chat_message.content not in message_template_list:
                message_history_list_temp.append(chat_message)

        return message_history_list_temp
Example #10
words_frequency = remove_clutter_words(words_frequency)

from flashtext.keyword import KeywordProcessor
from collections import Counter

all_keywords = []
word_processors = {}
print('word_processors')
for category in words_frequency.keys():
    all_keywords.extend(words_frequency[category])
    word_processor = KeywordProcessor()
    for word in words_frequency[category]:
        word_processor.add_keyword(word)
    word_processors[category] = word_processor
# remove duplicates
all_keywords = set(all_keywords)
all_keywords = list(all_keywords)
all_words_processor = KeywordProcessor()
for word in all_keywords:
    all_words_processor.add_keyword(word)


def compute_percentage(dum0, dumx):
    try:
        ans = float(dumx) / float(dum0)
        ans = ans * 100
        return ans
    except:
        return 0
Example #11
num_pages = pdfReader.numPages
count = 0
text = ""
while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count += 1
    text += pageObj.extractText()

#now part for Java-keywords
#aho-corasick algorithm for string matching
#trie data-structure
keywords = open("keywords.txt", "r")
keys = keywords.read().splitlines()
keyword_processor = KeywordProcessor()
for key in keys:
    keyword_processor.add_keyword(key)
keywords_found = keyword_processor.extract_keywords(text)
found = keywords_found


# Given a list of words, return a dictionary of
# word-frequency pairs.
def wordListToFreqDict(wordlist):
    wordfreq = [wordlist.count(p) for p in wordlist]
    return dict(zip(wordlist, wordfreq))


# Sort a dictionary of word-frequency pairs in
# order of descending frequency.
def sortFreqDict(freqdict):
    aux = [(freqdict[key], key) for key in freqdict]
    aux.sort(reverse=True)
    return aux
Example #12
from flashtext.keyword import KeywordProcessor

keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword('SQL injection')
keyword_processor.add_keyword('SQL injection',
                              ('vulnerability type', 'SQL injection'))
keyword_processor.add_keyword('cross-site scripting',
                              ('vulnerability type', 'cross-site scripting'))
keyword_processor.add_keyword('cross-site scripting', 'XSS')
# keyword_processor.add_keyword('parameter')
# keyword_processor.add_keyword('function')
# keyword_processor.add_keyword('variable')

keyword_dict = {"cross-site scripting": ["XSS"], "parametert": ["variabler"]}
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["parameter", "function", "variable"])

####keyword replacement
# keyword_processor.add_keyword('cross-site scripting', 'XSS')
# keyword_processor.replace_keywords('vulnerability is cross-site scripting')

keyword_processor.extract_keywords(
    'SQL injection vulnerability in the update_zone function in catalog/admin/geo_zones.php in osCommerce Online Merchant 2.3.3.4 and earlier allows remote administrators to execute arbitrary SQL commands via the zID parameter in a list action. '
)
Example #13
from flashtext.keyword import KeywordProcessor

# Extract keywords
# add_keyword(search_string, replacement): first the search string (e.g. '你好') is found in the sentence, and what gets reported is the replacement passed to add_keyword
# English
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords(
    'I love Big Apple and Bay Area.')
print(keywords_found)
# Chinese
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('你好', '您好')  # the first word is the term to find, the second is its replacement
keyword_processor.add_keyword('不要')
keywords_found = keyword_processor.extract_keywords(
    '你好,请不要随便践踏草坪。')  # the extracted keywords are the replacement values
print(keywords_found)

# Replace keywords
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('你好', '您好')  # the first word is the term to find, the second is its replacement
new_sentence = keyword_processor.replace_keywords('你好,请不要随便践踏草坪。')
print(new_sentence)
Example #14
def classification(textSplit):
    keyword_cyber_security_risk = [
        "ad fraud", "cyberattack", "malware", "botnets", "CnC",
        "Command and Control", "compromised accounts", "hacking",
        "key logging", "phishing", "spyware", "worm", "trojan", "RAT", "APT",
        "adware", "fileless attack", "cryptocurrency mining", "spam",
        "ransomware", "denial-of-service", "sql injection",
        "man-in-the-middle", "compromised pcs", "spam site",
        "malicious payload", "apt", "advanced persistent threat", "spoofing",
        "virus", "slowloris", "xss", "cross-site scripting", "exploit",
        "vulnerability", "cve", "day zero", "backdoor", "blackhat",
        "bruteForce", "botNet", "cracking", "forensics", "exploit",
        "scanning", "enumeration", "reconnaissance", "adware",
        "autorun worms", "advanced persistent threats", "attack vector",
        "backdoor", "blended attack", "botnet", "browser hijacker",
        "brute force attack", "clickjacking", "command and control servers",
        "content spoofing", "cross site scripting", "xss", "xsrf",
        "data theft", "denial of service attack", "dictionary attack",
        "drive-by download", "email spoofing", "email virus", "exploit",
        "form grabber", "identity theft", "insider attack", "keylogger",
        "likejacking", "malware", "man in the middle", "ransomware",
        "rootkit", "spam", "spyware", "SQL injection", "wabbit",
        "website spoofing", "ip", "tcp", "router", "network", "cisco"
    ]

    Adult_Content = [
        "pornography", "violence", "blood", "gore", "sex", "nudity", "erotic",
        "hardcore", "fetish", "intercourse", "explicit content", "hentai",
        "masturbation", "dick", 'pussy', "penis", "v****a", "anus", 'boobs',
        'p**n', 'xxx'
    ]

    keyword_Aggressive = [
        "attacking", "fighting", "invading", "assailing", "threaten",
        "slashing", "beating", "destroy", "destruction", 'assault'
    ]

    Keyword_arts = [
        "painting", "drawing", "ceramics", "pottery", "photography",
        "sculpture", "dance"
    ]

    keyword_automotive = [
        'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw', 'bugatti',
        'chevrolet', 'chrysler', 'citroen', 'dodge', 'ferrari', 'honda',
        'toyota', 'hyundai', 'kia', 'lamborghini', 'lexus', 'mazda',
        'mercedes', 'skoda', 'mitsubishi', 'nissan', 'porsche', 'subaru',
        'suzuki', 'tesla', 'volkswagen', 'volvo', 'horsepower', 'torque'
    ]

    keyword_cloud_service = [
        "cloud backup", "cloud storage", "cloud processing", "iaas", "paas",
        "saas", "aws", "azure", "google cloud", "amazon web services",
        "infrastructure as a service", "platform as a service",
        "software as a service", "cloud software", "IBM cloud", "vmware",
        "salesforce", "oracle cloud", "sap cloud", "alibaba cloud",
        "cloud service"
    ]

    keywords_IM = [
        "discord", "skype", "viber", "whatsapp", "facebook messenger",
        "wechat", "telegram", "line", "qq mobile"
    ]

    keyword_Criminal_Activities = [
        "arson", "assault", "bigamy", "blackmail", "bribery", "burglary",
        "child abuse", "conspiracy", "espionage", "forgery", "fraud",
        "genocide", "hijacking", "homicide", "kidnap", "manslaughter",
        "mugging", "murder", "kill", "perjury", "rape", "riot", "robbery",
        "shoplift", "slander", "smuggle", "treason", "trespass", "gang fights",
        "steal", "theft", "cyber crime", "corruption", "domestic", "violence",
        "ransom", "vandalism", "child abuse ", "terrorism", "militia",
        'insurgent', 'bombing', 'terrorist', 'make bomb', 'bomb making',
        'bombs'
    ]

    keyword_dating = [
        "online dating", "tinder", "okcupid", "valentines", "romantic",
        "roses", "presents", "anniversary", "rings", "dating ideas",
        "movie dates", 'wedding', 'hook up'
    ]

    keyword_softwareDevelopement = [
        "pycharm", "netbeans", "sqlite", "linux", "visual studio", "node.js",
        "codenvy", "angularjs", "eclipse", "react native", 'python', 'java',
        'c++', 'ruby on rails', 'flutter', 'javascript', 'html', 'maven',
        'node.js', 'html', 'css', 'php', 'database', 'sql', 'db', 'pip',
        'web development', 'code', 'debug', 'c#', 'kotlin', 'objective-c',
        'visual basic', 'perl', 'matlab', 'libraries', 'stack development',
        'backend', 'frontend', 'framework', 'software develop',
        'machine learning', 'tensorflow', 'AI', 'API',
        'application programming interface'
    ]

    keyword_Ecommerce_Tools = [
        "ecommerce website tools", "research tools", "business tools",
        "marketing tools", "analytics tools", "bigcommerce", "x-cart",
        "shopify", "woocommerce", "prestashop", "junglescout", "semrush",
        "ahrefs", "sourcify", "veeqo", "tickspot", "asana", "inventory source",
        "oberlo", "shipwire", "tradegecko", "shippingeasy", "wave", "ecomdash",
        "mailchimp", "campaign monitor", "feeds4", "active campaign",
        "bulk.ly", "buffer", "omnistar", "antavo", "smile.lo", "user testing",
        "wishpond", "klaviyo", "buzzstream", "exitbee", "metrilo", "storeya",
        "instasize", "visual website optimizer", "optimizely analytics",
        "google analytics", "neatly", 'search engine optimization', 'SEO'
    ]

    keyword_Entertainment = [
        "plays", "comedy", "puppet shows", "sports", "performance art",
        "animation", "karaoke", "video games", "dance", "magic",
        "television programs", "music", "acting", "nightclubs",
        "fashion shows", "netflix", "concerts", "circus", "parties",
        "symphonies", "theatre", "variety shows"
    ]

    keyword_Software_Downloads_Sites = [
        "download.com", "filehippo", "zdnet download", "softpedia", "tucows",
        "freewarefiles", "majorgeeks", "filecluster", "soft32", "torrent",
        "softonic", "freewarehome", "ninite", "download crew", "filehorse",
        "filepuma", "sourceforge", "software"
        "informer", "alternativeto"
    ]

    keyword_Finance_Accounting = [
        "Accounts payable", "accounts receivable", "accrued expense",
        "balance sheet", "book value", "equity", "inventory", "zoho books",
        "xero"
    ]

    keyword_Food_drinks = [
        "macdonald", "kfc", "grabfood", "subway", "jolibee", "coke", "laksa",
        "chicken rice", "yong tau foo", "buffet", "pizza", "bbq",
        "black pepper", "beef", "mutton", "curry", "nasi lemak", "carrot cake",
        "green tea", "bubble tea", "pudding jelly", "cake", "bread", "milo",
        "ice cream", "fishball"
    ]

    keyword_Gambling = [
        "poker", "roulette", "slot-machines", "bingo", "baccarat",
        "casino war", "craps", "carribean stud", "keno", "let it ride",
        'betting'
    ]

    keyword_government_legal = [
        "moe", "mof", "mha", "mfa", "mti", "msf", "mod", "mol", "mom", "moh",
        "mot", ".gov.sg", 'government', 'ministry of', 'minister',
        'minister of'
    ]

    keyword_Hobbies_Interests = [
        "Sports", "music", "travel", "fishing", "social work",
        "volunteer work", "painting", "dancing", "reading", "writing",
        "gardening", "animal care", "cooking", "bowling", "computer gaming",
        "fashion", "ice skating", "magic", "origami", "photography",
        "sculpting", "comedy", "winemaking", "yoga", "computer programming",
        "diving", "football", "basketball", "tennis", "badminton",
        "table tennis", "soccer", "rugby", "jogging", "marathon", "cycling",
        "rock climbing", "swimming", "cheerleading", "fencing", "laser tag",
        "darts", "eating", "sleeping", "hockey", "weightlifting", "volleyball",
        "martial arts", "hiking", "backpacking", "archery", "wrestling",
        "boxing", "poker", "chess"
    ]

    keyword_insurance = [
        "life insurance", "health insurance", "travel insurance",
        "home insurance", "child insurance", "maid insurance", "car insurance",
        "pet insurance", "personal accident insurance", "term life insurance",
        "whole life insurance", "ntuc income", "great eastern", "prudential",
        "AIA", "aviva", "savings plan", "integrated shield plan",
        "trip delays", "baggage delay", "lost items", "medical coverage",
        "missed flights"
    ]

    keyword_jobsearch = [
        "career@gov", "jobstreet", "gumtree", "indeed", "jobsdb", "stjobs",
        "mycareerfuture", "jobscentral", "linkedin", "startupjobs"
    ]

    Keyword_kids = [
        "hasbro", "nursery rhythms", "fox kids", "smiggle", "kiddy palace",
        "Playground", "toy r us", "avent", "enfagrow", "kinder joy"
    ]

    keyword_Military = [
        "army", "air force", "navy", "rank", "infrantry", "armour",
        "artillery", "special forces", "rangers", "guards", "military police",
        "signals", "combat engineers", "field engineers", "sar 21",
        "machine guns", "missile launcher", "weapons", "medic", "tanks",
        "fighter jets", "helicopter", "armoured vehicles", "rocket artillery",
        "armoured carriers", "sergeant", "officer", "encik"
    ]

    keyword_news_and_media = [
        "cna", "bbc", "thestraitstime", "thenewspaper", "mediacorp",
        "techredar", "asiaone", "yahoo", "msn", "flipboard", "twitter",
        "dailymail", "today", "thebusinesstimes", 'reporters'
    ]

    keyword_peer2peer = [
        "pirate bay", "kickass torrent", "torrent", "rarbg", "1337x",
        "torlock", "YTS", " qBittorrent", "Vuze", "Deluge", "uTorrent",
        "BitTorrent", "EZTV", "ETTV", "Popcorn Time", "LimeTorrents"
    ]

    keyword_pets = [
        "cat", "dog", "rabbit", "hamster", "fish", "bird", "guinea pig",
        "chinchilla", "cow", "chicken", "sheep", "lamb", "pig", "llama",
        "turtle", "tortoise", "frog"
    ]

    keywords_realEstate = [
        "hdb", "bungalow", "studio", "semi-detached", "condos", "landed",
        "propnex realty", "huttons asia", "era", "propseller", 'condominium',
        'apartment', 'mansionette', 'property guru', 'property agent'
    ]

    keyword_Search_engines = [
        'google', 'yahoo', 'bing', 'duckduckgo', 'wiki.com', 'gibiru',
        'boardreader', 'baidu', 'torsearch', 'ask.com'
    ]

    keywords_shopping = [
        "qoo10", "lazada", "shopee", "zalora", "taobao", "amazon", "carousell",
        "ebay", "redmart", "reebonz"
        "online shopping", "online sale", "free shipping", "free delivery",
        "next day delivery"
    ]

    Keyword_social = [
        "imgur", "facebook", "twitter", "instagram", "tumblr", "flicker",
        "google+", "youtube", "pinterest", "reddit", "snapchat", "baidu tieba",
        "skype", "telegram", "whatsapp", "hardwarezone", "forum"
    ]

    keyword_mediaStreaming = [
        "netflix", "youtube", "apple Tv", "chromecast", "subsonic",
        "audio galaxy", "tudou", "baidu", "dailymotion", "vimeo"
    ]

    keywords_trading_invest = [
        "stocks", "money", "profits", "srs", "blue-chip", "growth", "dividend",
        "nasdaq", "corporate bonds", "etf"
    ]

    Keyword_translation = [
        "google translate", "yandex", "babelfish", "tradukka", "linguee",
        "systranet", "permondo", "translatesonline.com"
    ]

    keyword_webhosting_isp_telco = [
        "singtel", "starhub", "m1", "circlelife", "tpg", "myrepublic",
        "viewquest", "alibaba", "apc", "amazon web"
    ]

    keyword_web_hosting = [
        "bluehost", "inmotion hosting", "hostgator", "hostinger", "godaddy",
        "tsohost", "wix", "siteground", "hostwinds", "weebly", "squarespace",
        "vodien", "a2 hosting", "dreamHost", "website hosting", "domain name",
        "namecheap", "host website", "domain registration", "whois",
        "website server", "apache", "nginx"
        "web host"
    ]

    keyword_proxies_vpn = [
        "expressvpn", "nordvpn", "ipvanish", "hotspot shield", "tunnelbear",
        "hidester", "hide.me", "proxysite.com", "kproxy", "VPNbook",
        "whoer.net", "megaproxy"
    ]

    keyword_webmail = [
        "gmail", "hotmail", "live", "yahoo", "outlook", "aol", "zoho",
        "protonmail"
    ]

    keyword_travel = [
        'booking.com', 'tripadvisor', 'expedia', 'airbnb', 'agoda',
        'priceline', 'skyscanner', 'kayak.com', 'makemytrip', 'cheapoair',
        'trivago', 'travelocity', 'orbitz', 'hotelurbano', 'book hotel',
        'air tickets', 'airfares', 'hotels', 'cheap flight', 'cheap hotel',
        'airline', 'flights'
    ]

    keyword_drugs = [
        'marijuana', 'opium', 'heroin', 'cocaine', 'barbiturates', 'meth',
        'ice', 'crystal meth', 'ecstacy', 'weed', 'cannabis'
    ]

    Keyword_weapons = [
        "gun", "sword", "machine gun", "butterfly knife", "rocket", "bazooka",
        "flamethrower", "pistol", "rifle", "grenade", "sniper"
    ]

    keyword_sports = [
        "soccer", "football", "tennis", "basketball", "hockey", "bowling",
        "table-tennis", "kayaking", "canoeing", "snorkeling", "diving",
        "swimming", "scuba-diving", 'martial arts'
    ]

    Keyword_religion = [
        "Buddihsm", "Hinduism", "Sikhism", "Christianity", "Islam", "Judaism",
        "Spiritism", "Shinto", "Taoism"
    ]

    Keyword_technology = [
        "cloud computing", "5g", "computer ai", "wireless", "ssd",
        "smartphone", "drones", "robots", "gaming", "smartwatch"
    ]

    keyword_cyber_security_solutions = [
        "identity and access management", "IAM", "cloud security",
        "risk and compliance management", "encryption", "data loss prevention",
        "DLP", "UTM", "unified threat management", "firewall", "antivirus",
        "antimalware", "IDS", "intrusion detection system",
        "intrusion prevention system", "IPS", "disaster recovery",
        "ddos mitigation", "cyber security solution", "IT security", "cisco",
        "symantec", "norton", "trend micro", "avast", "carbon black",
        "crowd strike", "fortinet", "palo alto", "splunk", "mcafee", "sophos",
        "proofpoint", "imperva", "fireye", "LogRythm", "Netskope", "trustwave"
    ]

    keyword_education = [
        ".edu", "coursera", "khan academy", "open culture", "udemy",
        "academic earth", "edx", "university", "polytechnic", "diploma",
        "bachelors", "degree", "phd", "masters", "professor", "scholarship",
        "schooling", "teaching", "learning", "education", "online learning",
        "distance learning", "institute"
    ]

    keyword_tobacco = [
        'marlboro', 'camel', 'cigarette', 'tobacco', 'lucky strike', 'winston',
        'dunhill', 'lung cancer', 'viceroy', 'smoking', 'vape', 'e-cigarette',
        'cigar', 'vaping', 'vaporiser', 'electronic cigarette'
    ]

    keywords=keyword_cyber_security_risk+Adult_Content+keyword_Aggressive+Keyword_arts+keyword_automotive+keyword_cloud_service+\
             keywords_IM+keyword_Criminal_Activities+keyword_dating+keyword_softwareDevelopement+keyword_Ecommerce_Tools+keyword_Entertainment+\
             keyword_Software_Downloads_Sites+keyword_Finance_Accounting+keyword_Food_drinks+keyword_Gambling+keyword_government_legal+\
             keyword_Hobbies_Interests+keyword_insurance+keyword_jobsearch+Keyword_kids+keyword_Military+keyword_news_and_media+\
             keyword_peer2peer+keyword_pets+keywords_realEstate+keyword_Search_engines+keywords_shopping+Keyword_social+keyword_mediaStreaming+\
             keywords_trading_invest+Keyword_translation+keyword_webhosting_isp_telco+keyword_web_hosting+keyword_proxies_vpn+keyword_webmail+keyword_travel+\
             keyword_drugs+Keyword_weapons+keyword_sports+Keyword_religion+Keyword_technology+keyword_cyber_security_solutions+keyword_education+keyword_tobacco

    kp0 = KeywordProcessor()
    kp1 = KeywordProcessor()
    kp2 = KeywordProcessor()
    kp3 = KeywordProcessor()
    kp4 = KeywordProcessor()
    kp5 = KeywordProcessor()
    kp6 = KeywordProcessor()
    kp7 = KeywordProcessor()
    kp8 = KeywordProcessor()
    kp9 = KeywordProcessor()
    kp10 = KeywordProcessor()
    kp11 = KeywordProcessor()
    kp12 = KeywordProcessor()
    kp13 = KeywordProcessor()
    kp14 = KeywordProcessor()
    kp15 = KeywordProcessor()
    kp16 = KeywordProcessor()
    kp17 = KeywordProcessor()
    kp18 = KeywordProcessor()
    kp19 = KeywordProcessor()
    kp20 = KeywordProcessor()
    kp21 = KeywordProcessor()
    kp22 = KeywordProcessor()
    kp23 = KeywordProcessor()
    kp24 = KeywordProcessor()
    kp25 = KeywordProcessor()
    kp26 = KeywordProcessor()
    kp27 = KeywordProcessor()
    kp28 = KeywordProcessor()
    kp29 = KeywordProcessor()
    kp30 = KeywordProcessor()
    kp31 = KeywordProcessor()
    kp32 = KeywordProcessor()
    kp33 = KeywordProcessor()
    kp34 = KeywordProcessor()
    kp35 = KeywordProcessor()
    kp36 = KeywordProcessor()
    kp37 = KeywordProcessor()
    kp38 = KeywordProcessor()
    kp39 = KeywordProcessor()
    kp40 = KeywordProcessor()
    kp41 = KeywordProcessor()
    kp42 = KeywordProcessor()
    kp43 = KeywordProcessor()
    kp44 = KeywordProcessor()
    kp45 = KeywordProcessor()
    for word in keywords:
        kp0.add_keyword(word)
    for word in keyword_cyber_security_risk:
        kp1.add_keyword(word)
    for word in Adult_Content:
        kp2.add_keyword(word)
    for word in keyword_Aggressive:
        kp3.add_keyword(word)
    for word in Keyword_arts:
        kp4.add_keyword(word)
    for word in keyword_automotive:
        kp5.add_keyword(word)
    for word in keyword_cloud_service:
        kp6.add_keyword(word)
    for word in keywords_IM:
        kp7.add_keyword(word)
    for word in keyword_Criminal_Activities:
        kp8.add_keyword(word)
    for word in keyword_dating:
        kp9.add_keyword(word)
    for word in keyword_softwareDevelopement:
        kp10.add_keyword(word)
    for word in keyword_Ecommerce_Tools:
        kp11.add_keyword(word)
    for word in keyword_Entertainment:
        kp12.add_keyword(word)
    for word in keyword_Software_Downloads_Sites:
        kp13.add_keyword(word)
    for word in keyword_Finance_Accounting:
        kp14.add_keyword(word)
    for word in keyword_Food_drinks:
        kp15.add_keyword(word)
    for word in keyword_Gambling:
        kp16.add_keyword(word)
    for word in keyword_government_legal:
        kp17.add_keyword(word)
    for word in keyword_Hobbies_Interests:
        kp18.add_keyword(word)
    for word in keyword_insurance:
        kp19.add_keyword(word)
    for word in keyword_jobsearch:
        kp20.add_keyword(word)
    for word in Keyword_kids:
        kp21.add_keyword(word)
    for word in keyword_Military:
        kp22.add_keyword(word)
    for word in keyword_news_and_media:
        kp23.add_keyword(word)
    for word in keyword_peer2peer:
        kp24.add_keyword(word)
    for word in keyword_pets:
        kp25.add_keyword(word)
    for word in keywords_realEstate:
        kp26.add_keyword(word)
    for word in keyword_Search_engines:
        kp27.add_keyword(word)
    for word in keywords_shopping:
        kp28.add_keyword(word)
    for word in Keyword_social:
        kp29.add_keyword(word)
    for word in keyword_mediaStreaming:
        kp30.add_keyword(word)
    for word in keywords_trading_invest:
        kp31.add_keyword(word)
    for word in Keyword_translation:
        kp32.add_keyword(word)
    for word in keyword_webhosting_isp_telco:
        kp33.add_keyword(word)
    for word in keyword_web_hosting:
        kp34.add_keyword(word)
    for word in keyword_proxies_vpn:
        kp35.add_keyword(word)
    for word in keyword_webmail:
        kp36.add_keyword(word)
    for word in keyword_travel:
        kp37.add_keyword(word)
    for word in keyword_drugs:
        kp38.add_keyword(word)
    for word in Keyword_weapons:
        kp39.add_keyword(word)
    for word in keyword_sports:
        kp40.add_keyword(word)
    for word in Keyword_religion:
        kp41.add_keyword(word)
    for word in Keyword_technology:
        kp42.add_keyword(word)
    for word in keyword_cyber_security_solutions:
        kp43.add_keyword(word)
    for word in keyword_education:
        kp44.add_keyword(word)
    for word in keyword_tobacco:
        kp45.add_keyword(word)
    x = textSplit
    y0 = len(kp0.extract_keywords(x))
    y1 = len(kp1.extract_keywords(x))
    y2 = len(kp2.extract_keywords(x))
    y3 = len(kp3.extract_keywords(x))
    y4 = len(kp4.extract_keywords(x))
    y5 = len(kp5.extract_keywords(x))
    y6 = len(kp6.extract_keywords(x))
    y7 = len(kp7.extract_keywords(x))
    y8 = len(kp8.extract_keywords(x))
    y9 = len(kp9.extract_keywords(x))
    y10 = len(kp10.extract_keywords(x))
    y11 = len(kp11.extract_keywords(x))
    y12 = len(kp12.extract_keywords(x))
    y13 = len(kp13.extract_keywords(x))
    y14 = len(kp14.extract_keywords(x))
    y15 = len(kp15.extract_keywords(x))
    y16 = len(kp16.extract_keywords(x))
    y17 = len(kp17.extract_keywords(x))
    y18 = len(kp18.extract_keywords(x))
    y19 = len(kp19.extract_keywords(x))
    y20 = len(kp20.extract_keywords(x))
    y21 = len(kp21.extract_keywords(x))
    y22 = len(kp22.extract_keywords(x))
    y23 = len(kp23.extract_keywords(x))
    y24 = len(kp24.extract_keywords(x))
    y25 = len(kp25.extract_keywords(x))
    y26 = len(kp26.extract_keywords(x))
    y27 = len(kp27.extract_keywords(x))
    y28 = len(kp28.extract_keywords(x))
    y29 = len(kp29.extract_keywords(x))
    y30 = len(kp30.extract_keywords(x))
    y31 = len(kp31.extract_keywords(x))
    y32 = len(kp32.extract_keywords(x))
    y33 = len(kp33.extract_keywords(x))
    y34 = len(kp34.extract_keywords(x))
    y35 = len(kp35.extract_keywords(x))
    y36 = len(kp36.extract_keywords(x))
    y37 = len(kp37.extract_keywords(x))
    y38 = len(kp38.extract_keywords(x))
    y39 = len(kp39.extract_keywords(x))
    y40 = len(kp40.extract_keywords(x))
    y41 = len(kp41.extract_keywords(x))
    y42 = len(kp42.extract_keywords(x))
    y43 = len(kp43.extract_keywords(x))
    y44 = len(kp44.extract_keywords(x))
    y45 = len(kp45.extract_keywords(x))

    Total_matches = y0
    per1 = float(percentage1(y0, y1))
    per2 = float(percentage1(y0, y2))
    per3 = float(percentage1(y0, y3))
    per4 = float(percentage1(y0, y4))
    per5 = float(percentage1(y0, y5))
    per6 = float(percentage1(y0, y6))
    per7 = float(percentage1(y0, y7))
    per8 = float(percentage1(y0, y8))
    per9 = float(percentage1(y0, y9))
    per10 = float(percentage1(y0, y10))
    per11 = float(percentage1(y0, y11))
    per12 = float(percentage1(y0, y12))
    per13 = float(percentage1(y0, y13))
    per14 = float(percentage1(y0, y14))
    per15 = float(percentage1(y0, y15))
    per16 = float(percentage1(y0, y16))
    per17 = float(percentage1(y0, y17))
    per18 = float(percentage1(y0, y18))
    per19 = float(percentage1(y0, y19))
    per20 = float(percentage1(y0, y20))
    per21 = float(percentage1(y0, y21))
    per22 = float(percentage1(y0, y22))
    per23 = float(percentage1(y0, y23))
    per24 = float(percentage1(y0, y24))
    per25 = float(percentage1(y0, y25))
    per26 = float(percentage1(y0, y26))
    per27 = float(percentage1(y0, y27))
    per28 = float(percentage1(y0, y28))
    per29 = float(percentage1(y0, y29))
    per30 = float(percentage1(y0, y30))
    per31 = float(percentage1(y0, y31))
    per32 = float(percentage1(y0, y32))
    per33 = float(percentage1(y0, y33))
    per34 = float(percentage1(y0, y34))
    per35 = float(percentage1(y0, y35))
    per36 = float(percentage1(y0, y36))
    per37 = float(percentage1(y0, y37))
    per38 = float(percentage1(y0, y38))
    per39 = float(percentage1(y0, y39))
    per40 = float(percentage1(y0, y40))
    per41 = float(percentage1(y0, y41))
    per42 = float(percentage1(y0, y42))
    per43 = float(percentage1(y0, y43))
    per44 = float(percentage1(y0, y44))
    per45 = float(percentage1(y0, y45))
    allP = [
        per1, per2, per3, per4, per5, per6, per7, per8, per9, per10, per11,
        per12, per13, per14, per15, per16, per17, per18, per19, per20, per21,
        per22, per23, per24, per25, per26, per27, per28, per29, per30, per31,
        per32, per33, per34, per35, per36, per37, per38, per39, per40, per41,
        per42, per43, per44, per45
    ]
    allP.sort(key=float)
    if y0 == 0:
        Category = 'None'
    else:
        if per1 >= allP[-1]:
            Category = 'Cyber-Security Risk'
        elif per2 >= allP[-1]:
            Category = 'Adult Content'
        elif per3 >= allP[-1]:
            Category = 'Aggressive'
        elif per4 >= allP[-1]:
            Category = 'Arts'
        elif per5 >= allP[-1]:
            Category = 'Automotive'
        elif per6 >= allP[-1]:
            Category = 'Cloud Services'
        elif per7 >= allP[-1]:
            Category = 'Instant Messaging'
        elif per8 >= allP[-1]:
            Category = 'Criminal Activities'
        elif per9 >= allP[-1]:
            Category = 'Dating'
        elif per10 >= allP[-1]:
            Category = 'Software Development'
        elif per11 >= allP[-1]:
            Category = 'Ecommerce Tools'
        elif per12 >= allP[-1]:
            Category = 'Entertainment'
        elif per13 >= allP[-1]:
            Category = 'Software Download Sites'
        elif per14 >= allP[-1]:
            Category = 'Finance & Accounting'
        elif per15 >= allP[-1]:
            Category = 'Food and Drinks'
        elif per16 >= allP[-1]:
            Category = 'Gambling'
        elif per17 >= allP[-1]:
            Category = 'Government'
        elif per18 >= allP[-1]:
            Category = 'Hobbies and Interests'
        elif per19 >= allP[-1]:
            Category = 'Insurance'
        elif per20 >= allP[-1]:
            Category = 'Job Search'
        elif per21 >= allP[-1]:
            Category = 'Kids'
        elif per22 >= allP[-1]:
            Category = 'Military'
        elif per23 >= allP[-1]:
            Category = 'News & Media'
        elif per24 >= allP[-1]:
            Category = 'Peer 2 Peer'
        elif per25 >= allP[-1]:
            Category = 'Pets'
        elif per26 >= allP[-1]:
            Category = 'Real Estate'
        elif per27 >= allP[-1]:
            Category = 'Search Engine'
        elif per28 >= allP[-1]:
            Category = 'Shopping'
        elif per29 >= allP[-1]:
            Category = 'Social'
        elif per30 >= allP[-1]:
            Category = 'Media Streaming'
        elif per31 >= allP[-1]:
            Category = 'Trading & Investment'
        elif per32 >= allP[-1]:
            Category = 'Translation'
        elif per33 >= allP[-1]:
            Category = 'WebHosting_ISP_Telco'
        elif per34 >= allP[-1]:
            Category = 'Webhosting'
        elif per35 >= allP[-1]:
            Category = 'Proxies & VPN'
        elif per36 >= allP[-1]:
            Category = 'Webmail'
        elif per37 >= allP[-1]:
            Category = 'Travel'
        elif per38 >= allP[-1]:
            Category = 'Drugs'
        elif per39 >= allP[-1]:
            Category = 'Weapons'
        elif per40 >= allP[-1]:
            Category = 'Sports'
        elif per41 >= allP[-1]:
            Category = 'Religion'
        elif per42 >= allP[-1]:
            Category = 'Technology'
        elif per43 >= allP[-1]:
            Category = 'Cyber-Security Technologies'
        elif per44 >= allP[-1]:
            Category = 'Education'
        elif per45 >= allP[-1]:
            Category = 'Tobacco'

    return Category
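
The 46 numbered KeywordProcessor objects and the long elif chain above can be collapsed into a single ordered mapping from category name to keyword list; a minimal sketch that mirrors the logic above (it assumes the same keyword lists and the percentage1 helper the function already calls, and relies on dict insertion order to keep the original tie-breaking):

def classify_compact(textSplit, category_keywords):
    # category_keywords: dict preserving the original category order,
    # e.g. {'Cyber-Security Risk': keyword_cyber_security_risk,
    #       'Adult Content': Adult_Content, ...}
    kp_all = KeywordProcessor()
    processors = {}
    for category, words in category_keywords.items():
        kp = KeywordProcessor()
        for word in words:
            kp.add_keyword(word)
            kp_all.add_keyword(word)
        processors[category] = kp

    total_matches = len(kp_all.extract_keywords(textSplit))
    if total_matches == 0:
        return 'None'

    scores = {
        category: float(percentage1(total_matches, len(kp.extract_keywords(textSplit))))
        for category, kp in processors.items()
    }
    best = max(scores.values())
    # ties resolve to the earliest category, mirroring the elif chain above
    for category, score in scores.items():
        if score >= best:
            return category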
Example #15
    all_words_chosen = random.sample(all_words, 5000)
    story = ' '.join(all_words_chosen)

    # get unique keywords from the list of words generated.
    unique_keywords_sublist = list(
        set(random.sample(all_words, keywords_length)))

    # compile regex
    # source: https://stackoverflow.com/questions/6116978/python-replace-multiple-strings
    rep = dict([(key, '_keyword_') for key in unique_keywords_sublist])
    compiled_re = re.compile("|".join(rep.keys()))

    # add keywords to flashtext
    keyword_processor = KeywordProcessor()
    for keyword in unique_keywords_sublist:
        keyword_processor.add_keyword(keyword, '_keyword_')

    # time the modules
    start = time.time()
    _ = keyword_processor.replace_keywords(story)
    mid = time.time()
    _ = compiled_re.sub(lambda m: rep[re.escape(m.group(0))], story)
    end = time.time()
    # print output
    print(
        str(keywords_length).ljust(6),
        '|',
        "{0:.5f}".format(mid - start).ljust(9),
        '|',
        "{0:.5f}".format(end - mid).ljust(9),
        '|',
Example #16
# coding=utf-8
# Rule of thumb: if len(keywordList) > 500, use flashtext; otherwise a plain regex is fast enough.

from flashtext.keyword import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('PyTorch')
keyword_processor.add_keyword(keyword='and', clean_name='or')
keywords_found = keyword_processor.extract_keywords('I love Python and PyTorch.')
print(keywords_found)
# ['or', 'PyTorch']

keyword_processor.add_keyword(keyword='Python', clean_name='Tensorflow')
new_sentence = keyword_processor.replace_keywords('I love Python and PyTorch.')
print(new_sentence)
# I love Tensorflow or PyTorch.
Example #17
class KeyParser:
    def __init__(self):
        self.num_with_text = re.compile(r"номер[еау][\s:]+[0-9]{3,12}")
        self.num_only = re.compile(r"[0-9]{3,12}")

        self.code_with_text = re.compile(r"код.+\s+сло.+[:= -]+[а-яА-Я ]{3,20}")
        self.code_only = re.compile(r"[а-яА-Я ]{3,20}")

        self.service_with_text = re.compile(r"(услуг(у|и)\s+(\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
        self.service_only = re.compile(r"((\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")

        self.tariff_with_text = re.compile(r"(тари(фы|ф)\s+(\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
        self.tariff_only = re.compile(r"((\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")

        synonims = {}
        with open("synonims.json", "r", encoding='utf-8') as syn_file:
            synonims = json.load(syn_file)
            self.yep_key_processor = KeywordProcessor()
            for synonim in synonims['yes']:
                self.yep_key_processor.add_keyword(synonim)

            self.nope_key_processor = KeywordProcessor()
            for synonim in synonims['not']:
                self.nope_key_processor.add_keyword(synonim)

            self.on_key_processor = KeywordProcessor()
            for synonim in synonims['on']:
                self.on_key_processor.add_keyword(synonim)

            self.off_key_processor = KeywordProcessor()
            for synonim in synonims['off']:
                self.off_key_processor.add_keyword(synonim)


    def find_num(self, text):
        search_  = self.num_with_text.findall(text)
        if len(search_) > 0:
            # print(search_)
            return self.num_only.findall(search_[0])[0].rstrip().lstrip()

        search_  = self.num_only.findall(text)
        if len(search_) > 0:
            return search_[0].rstrip().lstrip()
        return None

    def find_key(self, text):
        search_  = self.code_with_text.findall(text)
        if len(search_) > 0:
            return self.code_only.findall(search_[0])[1].rstrip().lstrip()

        for line in reversed(text.splitlines()):
            search_ = self.code_only.findall(line)
            if len(search_) > 0:
                return search_[0].rstrip().lstrip()
        return None

    def find_all_commands(self, message):
        text = message['body']
        return self.find_num(text), self.find_key(text)

    def find_bool(self, message):
        keywords_found = self.yep_key_processor.extract_keywords(message['body'])
        if len(keywords_found) > 0:
            return True

        keywords_found = self.nope_key_processor.extract_keywords(message['body'])
        if len(keywords_found) > 0:
            return False
        return None

    def find_tariff(self, message):
        text = message['body']
        used = list()
        search_  = self.tariff_with_text.findall(text)[0]
        used.append(search_[0])

        keywords_found = self.on_key_processor.extract_keywords(message['body'])
        used.extend(keywords_found)
        # print(search_[0])
        if len(search_) > 0:
            tarif_name = str(search_[0])
            tarif_names = self.tariff_only.findall(tarif_name)[0]
            if len(tarif_names) > 0:
                return tarif_names[0], used
        return None, used

    def find_service_changes(self, message):
        used = list()
        text = message['body']
        service_name = None
        mode = None

        keywords_found = self.on_key_processor.extract_keywords(message['body'])
        used.extend(keywords_found)

        search_  = self.service_with_text.findall(text)[0]
        used.append(search_[0])
        #print('search_: ',search_)

        if len(search_) > 0:
            service_name = self.service_only.findall(search_[0])[0][0]

        keywords_found = self.on_key_processor.extract_keywords(message['body'])
        if len(keywords_found) > 0:
            mode=True

        keywords_found = self.off_key_processor.extract_keywords(message['body'])
        if len(keywords_found) > 0:
            mode=False
        return service_name, mode, used
Example #18
import re
from pathos import multiprocessing
from flashtext.keyword import KeywordProcessor

title = open("nGramsCompNames.txt", "r")
sentences = open("sentences_short.txt", "r")
output = open("clean_sentences.txt", "w")

keyword_processor = KeywordProcessor(case_sensitive=True)
sentence = sentences.readlines()  # create an iterable object of sentenes

for line in title.readlines():
    string = line.split()
    keyword_processor.add_keyword(string[0].replace("_", " "), string[0])


def addUnderscore(sentence):
    target = keyword_processor.extract_keywords(sentence)
    new_line = keyword_processor.replace_keywords(sentence)
    return new_line


pool = multiprocessing.Pool(processes=4)  # number of worker processes
for result in pool.imap_unordered(addUnderscore, sentence):
    output.write(result)

output.close()
Example #19
def GenerateTimeSeriesFromGDELT_NewsURLs_CP4(folder_name):
    global prog
    global indexedKeywords
    global keywords_to_narratives
    global MainDF

    dfArticleNar = pd.read_csv(os.path.join(folder_name,'labeledArticlesWithURLs.csv'))
    #indexedUrlList = list(dfArticleNar.url.unique())

    for i, row in dfArticleNar.iterrows():
        keywords_to_narratives[row['url']] = [ row['narrative'] ]
        indexedKeywords.append(row['url'])
    
    # Create regex and compile it for text match
    # print('Creating the regex...')
    # NProcs = multiprocessing.cpu_count()
    # SliceSize = int(math.floor(len(indexedKeywords) / NProcs))
    # paramList = [ (i * SliceSize, indexedKeywords[i * SliceSize : (i + 1) * SliceSize]) for i in range(NProcs - 1) ]
    # paramList.append(( (NProcs - 1) * SliceSize, indexedKeywords[(NProcs - 1) * SliceSize :] ))
    # with multiprocessing.Pool(NProcs) as p:
    #     results = p.starmap(createRegex, paramList)
    # pattern = ''.join(results)[:-1]
    # print('Compiling the regex...')
    # prog = re.compile(pattern)

    #keyword processor
    prog = KeywordProcessor()
    for kw in indexedKeywords:
        prog.add_keyword(kw)

    TS = []

    #---
    for gdeltQueryFile in glob.glob(folder_name + '/*gdelt*json.gz'):
        print('\t\tcurrently collected rows ' + str(len(TS)))
        print('\t\tReading File : ' + gdeltQueryFile)
        MainDF = pd.read_json(gdeltQueryFile,compression='gzip',lines=True)
        MainDF = MainDF.drop(columns=['QuadClass', 'MonthYear', 'ActionGeo_ADM1Code', 'EventBaseCode',
                                        'EventCode', 'NumSources',
                                        'Actor1Geo_CountryCode', 'EventRootCode',
                                        'ActionGeo_Long', 'Actor1Geo_ADM2Code',
                                        'Actor1Code', 'ActionGeo_CountryCode', 
                                        'Actor1CountryCode', 'Actor1Geo_FeatureID', 'Actor2Geo_Long',
                                        'IsRootEvent', 'Actor2Geo_CountryCode', 'Actor1Geo_Type',
                                        'Actor2Geo_FeatureID', 'globaleventid', 'ActionGeo_Type',
                                        'Actor2Geo_ADM1Code', 'Actor1Geo_Long', 'Actor2Geo_ADM2Code',
                                        'Actor1Geo_ADM1Code', 'Actor2Code', 'dateadded', 'Actor2CountryCode',
                                        'Actor2Geo_Type', 'ActionGeo_FeatureID', 'filename',
                                        'FractionDate', 'Year', 'NumMentions', 'Actor2Geo_Lat', 'Actor1Geo_Lat',
                                        'ActionGeo_ADM2Code', 'ActionGeo_Lat', 'NumArticles',
                                        'Actor1Type1Code', 'Actor2Type1Code', 'Actor1Type2Code',
                                        'Actor2KnownGroupCode', 'Actor2Type2Code', 'Actor1KnownGroupCode',
                                        'Actor1Religion2Code', 'Actor1Religion1Code', 'Actor2Type3Code',
                                        'Actor2Religion2Code', 'Actor2Religion1Code', 'Actor2EthnicCode',
                                        'Actor1EthnicCode', 'Actor1Type3Code',
                                        'Actor1Name', 'Actor2Name', 'ActionGeo_FullName', 'Actor1Geo_FullName', 'Actor2Geo_FullName'
                                        ])
        print(MainDF)
        print(MainDF.columns)
        print('\t\tRows to Process ' + str(MainDF.shape[0]))
        NProcs = multiprocessing.cpu_count()
        print('\t\tNum of Procs ' + str(NProcs))
        sizePerProc = 6500
        print('\t\tRows per Proc ' + str(sizePerProc))
        rowsPerRun = sizePerProc * NProcs
        print('\t\tRows per Run ' + str(rowsPerRun))
        currentRowStart = 0
        currentRowEnd = rowsPerRun
        while currentRowEnd < MainDF.shape[0]:
            print('\t\tProcessing from ' + str(currentRowStart) + ' to ' + str(currentRowEnd))
            paramList = [(currentRowStart + i * sizePerProc, currentRowStart + (i + 1) * sizePerProc) for i in range(NProcs - 1) ]
            if currentRowEnd > paramList[-1][1]:
                paramList.append((paramList[-1][1], currentRowEnd ))
            print('\t\tAllocation:')
            s = ''
            for p in paramList:
                s += '[' + str(p[0]) + '-' + str(p[1]) + '),'
            print(s)
            with multiprocessing.Pool(NProcs) as p:
                results = p.starmap(processData, paramList)
            for pr in results:
                TS = TS + pr
            print('\t\tcurrently collected rows ' + str(len(TS)))
            currentRowStart = currentRowEnd
            currentRowEnd = currentRowStart + rowsPerRun
        
        print('\t\tdone looping the file.')
        if currentRowStart < MainDF.shape[0] and currentRowStart < currentRowEnd :
            thegap = MainDF.shape[0] - currentRowStart
            print('\t\tremaining rows ' + str(thegap) )
            numProcsReq = thegap // sizePerProc
            print('\t\tRequired Procs ' + str(numProcsReq))
            paramList = [(currentRowStart + i * sizePerProc, currentRowStart + (i+1) * sizePerProc ) for i in range(numProcsReq - 1)]
            #print(paramList)
            if paramList == []:
                paramList = [(currentRowStart, MainDF.shape[0])]
            if MainDF.shape[0] > paramList[-1][1]:
                paramList.append((paramList[-1][1], MainDF.shape[0]))
            #print(paramList)
            s = ''
            for p in paramList:
                s += '[' + str(p[0]) + '-' + str(p[1]) + '),'
            print(s)
            with multiprocessing.Pool(NProcs) as p:
                results = p.starmap(processData, paramList)
            for pr in results:
                TS = TS + pr
        
        print('\t\tcurrently collected rows ' + str(len(TS)))
            
    #---
    print('\t\tFinally collected rows ' + str(len(TS)))
    MainDF = pd.DataFrame(TS, columns=['time','actor','gs'])
    print("GDELT DataFrame Generation Done.")
    MainDF['time'] = MainDF.apply(lambda x: dt.datetime.strptime(x.time,"%Y-%m-%dT%H:%M:%S"),axis=1)
    MainDF = MainDF.sort_values(by='time')
    MainDF.to_csv(os.path.join(folder_name,'MACMProcessedGDELT.csv'))
    return pd.pivot_table(MainDF, columns=['actor'], index='time', values='gs',aggfunc=np.mean).resample("H").mean().ffill().fillna(0)