def abusive_analysis(description):
    '''
    Prepare the inputs for abusive-keyword analysis and run it.

    Builds a KeywordProcessor from the abusive-word list on disk, builds the
    stopword list, then delegates to the helper functions remove_stopwords
    and find_abuse.

    Parameter:
        description: the text to analyse; it is passed to remove_stopwords
            together with the stopword list.
    '''
    # Only the first line of the word file is used; entries are comma separated.
    with open('bad_words_list.txt', 'r') as word_file:
        first_line = word_file.readlines()[0]

    # The first character of every entry is dropped
    # (presumably a leading space or quote -- verify against the data file).
    abusive_words = [entry[1:] for entry in first_line.split(',')]
    keywords = np.array(abusive_words)

    # Register every abusive word with a fresh KeywordProcessor.
    kp0 = KeywordProcessor()
    for keyword in keywords:
        kp0.add_keyword(keyword)

    # English stopwords plus some additional stopwords provided manually.
    sw = stopwords.words('english')
    sw.extend([
        '[]', '', '1', '()', '||', '=', '.', ',', '\n', ':', ';', '\\', '//',
        '/'
    ])

    find_abuse(remove_stopwords(description, sw), kp0)
def test_replace_keywords(self):
    """Check replace_keywords against a regex-based reference replacement.

    A fresh KeywordProcessor is built for every test case from the case's
    keyword dictionary. The expected sentence is produced independently by
    substituting each keyword variant (longest first) with its clean name
    using a regex that only matches when the variant is not adjacent to a
    word character.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_dict = test_case['keyword_dict']
        sentence = test_case['sentence']

        processor = KeywordProcessor()
        processor.add_keywords_from_dict(keyword_dict)
        actual = processor.replace_keywords(sentence)

        # Invert {clean name: [variants]} into {variant: clean name}.
        variant_to_clean = {
            variant: clean_name
            for clean_name, variants in keyword_dict.items()
            for variant in variants
        }

        # Replace the longest variants first so that shorter variants cannot
        # consume part of a longer one.
        expected = sentence
        for variant in sorted(variant_to_clean, key=len, reverse=True):
            pattern = re.compile(
                r'(?<!\w){}(?!\w)'.format(re.escape(variant)))
            expected = pattern.sub(variant_to_clean[variant], expected)

        failure_message = (
            "new_sentence don't match the expected results for test case: {}"
            .format(test_id))
        self.assertEqual(actual, expected, failure_message)
def get_kp(file):
    """Build a KeywordProcessor from an intents CSV file.

    :param file: base name (without extension) of a CSV file located in
        ``../data/intents/``.
    :return: a KeywordProcessor in which, for every CSV row, the first column
        has been registered as a keyword and the second column as its
        clean name.
    """
    kp = KeywordProcessor()
    csv_path = '../data/intents/' + file + '.csv'
    with open(csv_path, mode='r') as infile:
        for row in csv.reader(infile):
            kp.add_keyword(row[0], row[1])
    return kp
def find_sentences(keywords_list, input_str, topn):
    """Find, for every keyword, the sentences of a text that contain it.

    :param keywords_list: list of keywords, e.g. ['word1', ..., 'wordn']
    :param input_str: the document text to search
    :param topn: maximum number of sentences to return per keyword
    :return: keywords_sentences: dict {'word1': ['sen1', 'sen2', ...], ...}
        with one entry for every keyword in keywords_list (an empty list
        when no sentence contains the keyword). Sentences are listed in the
        order in which they first appear in input_str.
    """
    from collections import OrderedDict

    from flashtext.keyword import KeywordProcessor

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(keywords_list)

    # Split into sentences on punctuation and drop duplicates. An ordered
    # de-duplication keeps document order, so the topn slice below is
    # deterministic (a plain set would make it vary between runs).
    sentences = OrderedDict.fromkeys(re.split('[?!…。?!]', input_str))

    # Index the sentences by the keywords found in them.
    sentences_by_keyword = {}
    for sentence in sentences:
        for keyword in set(keyword_processor.extract_keywords(sentence)):
            sentences_by_keyword.setdefault(keyword, []).append(sentence)

    return {
        word: sentences_by_keyword.get(word, [])[:topn]
        for word in keywords_list
    }
def fastcleaner(docs, replacewords):
    """Corpus cleaning helper: bulk search-and-replace with flashtext.

    :param docs: iterable of documents (strings whose words are separated
        by spaces)
    :param replacewords: iterable of (word, replacement) pairs; the first
        item is the word to locate, the second is what replaces it
    :return: list with one cleaned string per input document
    """
    keyword_processor = KeywordProcessor()
    # Register the replacement table once; it does not depend on the
    # document being processed.
    for target, replacement in replacewords:
        keyword_processor.add_keyword(target, replacement)
    return [keyword_processor.replace_keywords(doc) for doc in docs]
def extract_tag_frequency(target_date, target_file_path, target_dir_path):
    """
    Extract tag frequency

    Counts how often each tag from the target file occurs in the third
    tab-separated column of every file under the target directory, and
    records which files each tag occurred in. Two reports are written to the
    current directory: ``<target_date>_frequency.txt`` (tag, count) and
    ``<target_date>_file_list.txt`` (tag, file path), both sorted by tag.

    :param target_date: Target date (used as the output file name prefix)
    :param target_file_path: Target file path (one tag per line)
    :param target_dir_path: Target directory path (walked recursively)
    """
    target_tag_freq_dict = dict()
    target_tag_file_dict = dict()
    keyword_processor = KeywordProcessor()
    with open(target_file_path) as target_file:
        for tag in target_file:
            # Tags are registered padded with one space on each side; the
            # searched text is padded the same way below.
            keyword_processor.add_keyword(' ' + tag.strip() + ' ')
    for dir_path, _sub_dirs, files in os.walk(target_dir_path):
        for file_name in files:
            nlp_file_path = os.path.join(dir_path, file_name)
            try:
                # The context manager closes the file even when a line
                # fails to parse.
                with open(nlp_file_path) as nlp_file:
                    for line in nlp_file:
                        line_list = line.strip().split('\t')
                        tag_sent = ' ' + line_list[2] + ' '
                        for keyword in keyword_processor.extract_keywords(
                                tag_sent):
                            keyword = keyword.strip()
                            target_tag_freq_dict[keyword] = (
                                target_tag_freq_dict.get(keyword, 0) + 1)
                            tagged_files = target_tag_file_dict.setdefault(
                                keyword, [])
                            if nlp_file_path not in tagged_files:
                                tagged_files.append(nlp_file_path)
            except Exception:
                # Best effort: report the problem and move on to the next
                # file; counts gathered from this file so far are kept.
                print(traceback.format_exc())
                print("[ERROR] Can't analyze {0}".format(nlp_file_path))
                continue
    frequency_path = '{0}_frequency.txt'.format(target_date)
    with open(frequency_path, 'w') as frequency_output_file:
        for tag in sorted(target_tag_freq_dict):
            frequency_output_file.write(
                '{0}\t{1}\n'.format(tag, target_tag_freq_dict[tag]))
    file_list_path = '{0}_file_list.txt'.format(target_date)
    with open(file_list_path, 'w') as file_list_output_file:
        for tag in sorted(target_tag_file_dict):
            for file_nm in target_tag_file_dict[tag]:
                file_list_output_file.write(
                    '{0}\t{1}\n'.format(tag, file_nm))
def __init__(self):
    """Compile the extraction patterns and load the synonym matchers.

    Each ``*_with_text`` pattern matches a value together with its
    introducing word, the matching ``*_only`` pattern matches the bare
    value. Synonyms are read from ``synonims.json`` (keys 'yes', 'not',
    'on', 'off'); one KeywordProcessor is built per key.
    """
    # Number: 3 to 12 digits.
    self.num_with_text = re.compile(r"номер[еау][\s:]+[0-9]{3,12}")
    self.num_only = re.compile(r"[0-9]{3,12}")
    # Code word: 3 to 20 Cyrillic letters or spaces.
    self.code_with_text = re.compile(r"код.+\s+сло.+[:= -]+[а-яА-Я ]{3,20}")
    self.code_only = re.compile(r"[а-яА-Я ]{3,20}")
    # Quoted service name.
    self.service_with_text = re.compile(r"(услуг(у|и)\s+(\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
    self.service_only = re.compile(r"((\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
    # Quoted tariff name.
    self.tariff_with_text = re.compile(r"(тари(фы|ф)\s+(\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
    self.tariff_only = re.compile(r"((\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")

    with open("synonims.json", "r", encoding='utf-8') as syn_file:
        synonims = json.load(syn_file)

    def build_processor(words):
        # One keyword per synonym, no clean-name mapping.
        processor = KeywordProcessor()
        for word in words:
            processor.add_keyword(word)
        return processor

    self.yep_key_processor = build_processor(synonims['yes'])
    self.nope_key_processor = build_processor(synonims['not'])
    self.on_key_processor = build_processor(synonims['on'])
    self.off_key_processor = build_processor(synonims['off'])
def test_extract_keywords(self):
    """Check extract_keywords against the expected keywords of each case.

    Every test case gets its own KeywordProcessor, loaded from the case's
    keyword dictionary; the keywords extracted from the case's sentence must
    equal the case's 'keywords' entry.
    """
    for test_id, test_case in enumerate(self.test_cases):
        processor = KeywordProcessor()
        processor.add_keywords_from_dict(test_case['keyword_dict'])
        extracted = processor.extract_keywords(test_case['sentence'])
        failure_message = (
            "keywords_extracted don't match the expected results for test case: {}"
            .format(test_id))
        self.assertEqual(extracted, test_case['keywords'], failure_message)
def detect_rule_update(detect_list):
    """Rebuild the keyword and regex rule sets held on DetectThread.

    Rules whose 'regex' value starts with ``gfwmatch_`` are plain keywords:
    the prefix is removed and the remainder is registered with a new
    KeywordProcessor. All other rules are stored unchanged in
    ``DetectThread.detect_regex_list`` under their id.

    :param detect_list: mapping of rule id -> rule mapping with a 'regex'
        entry.
    """
    keyword_prefix = 'gfwmatch_'
    logging.info('------------- flashtext keyword updating--------------')
    kp = KeywordProcessor()
    DetectThread.detect_regex_list = {}
    for rule_id in detect_list:
        rule = detect_list[rule_id]
        pattern = rule['regex']
        if pattern.startswith(keyword_prefix):
            # Strip the prefix only; str.replace would also delete any later
            # occurrence of the marker inside the keyword itself.
            kp.add_keyword(pattern[len(keyword_prefix):])
        else:
            DetectThread.detect_regex_list[rule_id] = rule
    DetectThread.keyword_processor = kp
    logging.debug(
        '------------- flashtext keyword res %s --------------',
        kp.get_all_keywords())
def load():
    """Populate ``KeyProcessor.kp`` from the configured intents CSV.

    Does nothing when ``KeyProcessor.file`` is None. Otherwise a new
    KeywordProcessor replaces ``KeyProcessor.kp`` and, for every row of
    ``../../data/intents/<file>.csv``, the first column is registered as a
    keyword with the second column as its clean name.
    """
    if KeyProcessor.file is None:
        return

    # Publish the new processor before reading, as the rows are added to
    # the class attribute itself.
    KeyProcessor.kp = KeywordProcessor()
    csv_path = '../../data/intents/' + KeyProcessor.file + '.csv'
    with open(csv_path, mode='r') as infile:
        for row in csv.reader(infile):
            KeyProcessor.kp.add_keyword(row[0], row[1])
def __init__(self, stringList: list, isCaseSensitive: bool):
    """
    Initializes TextParse object with a list of strings, and if parser
    should be case sensitive as object attributes

    :param stringList: list of strings, stored as ``self.stringList``
    :param isCaseSensitive: case-sensitivity flag, stored as
        ``self.isCaseSensitive``
    """
    self.stringList = stringList
    self.isCaseSensitive = isCaseSensitive
    # Called after both attributes are set
    # (presumably it derives a normalized list from them -- defined elsewhere).
    self.setNormalizedList()
    # The processor must exist before setKeywords() runs
    # (presumably setKeywords registers keywords on it -- defined elsewhere).
    self.processor = KeywordProcessor()
    self.setKeywords()
def __init__(self, companynames_from_file=False, companynames_filepath=None):
    """Store the company-name settings and set up the keyword processor.

    :param companynames_from_file: stored as-is on the instance
        (presumably selects loading company names from a file -- confirm
        in _init_companynames)
    :param companynames_filepath: stored as-is on the instance
        (presumably the path used when loading from a file -- confirm)
    """
    self.companynames_from_file = companynames_from_file
    self.companynames_filepath = companynames_filepath
    self.concept_to_term = {}
    self.keyword_processor = KeywordProcessor()
    # Both helpers receive the same, freshly created processor.
    self._init_keyword_processor(self.keyword_processor)
    self._init_companynames(self.keyword_processor)
def find_sentences_weight(word_weight_dict, input_str, topn):
    """Return the topn most important sentences of a text.

    All whitespace is removed from the text before it is split into
    sentences on punctuation. A sentence's score is::

        (number of distinct keywords)
            * sum(occurrences * weight for each keyword)
            / len(sentence)

    Sentences without any keyword are ignored.

    :param word_weight_dict: dict {keyword: weight}
    :param input_str: the document text
    :param topn: number of highest-scoring sentences to return
    :return: list of sentences sorted by descending score; sentences with
        equal scores keep the order in which they appear in the text
    """
    from collections import Counter, OrderedDict

    from flashtext.keyword import KeywordProcessor

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list(word_weight_dict))

    input_str = re.sub(r'\s', '', input_str)
    # Ordered de-duplication: a plain set would make the order of equally
    # scored sentences vary between runs.
    sentences = OrderedDict.fromkeys(re.split('[?!…。;?!]', input_str))

    # Score every sentence that contains at least one keyword.
    scored_sentences = []
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if not keywords_found:
            continue
        keywords_count = Counter(keywords_found)
        # Weighted sum of the keywords occurring in the sentence.
        weighted_sum = sum(
            count * word_weight_dict[keyword]
            for keyword, count in keywords_count.items())
        # Scale by the number of distinct keywords and normalise by the
        # sentence length (drop this line to use the plain weighted sum).
        sentence_value = len(keywords_count) * weighted_sum / len(sentence)
        scored_sentences.append((sentence, sentence_value))

    # list.sort is stable, so ties stay in document order.
    scored_sentences.sort(key=lambda item: item[1], reverse=True)
    return [sentence for sentence, _ in scored_sentences[:topn]]
def extract_word_frequency(target_date, target_file_path, target_dir_path):
    """
    Extract word frequency

    Counts how often each word from the target file occurs in the
    ``*_trx.txt`` files under the target directory and writes the counts,
    sorted by word, to ``<target_date>_frequency.txt`` in the current
    directory.

    :param target_date: Target date (used as the output file name prefix)
    :param target_file_path: Target file path (one word per line)
    :param target_dir_path: Target directory path (walked recursively)
    """
    target_word_freq_dict = dict()
    keyword_processor = KeywordProcessor()
    with open(target_file_path) as target_file:
        for tag in target_file:
            keyword_processor.add_keyword(tag.strip())
    for dir_path, _sub_dirs, files in os.walk(target_dir_path):
        for file_name in files:
            if not file_name.endswith('_trx.txt'):
                continue
            trx_file_path = os.path.join(dir_path, file_name)
            try:
                # The context manager closes the file even when reading or
                # extraction fails part-way through.
                with open(trx_file_path) as trx_file:
                    for line in trx_file:
                        keywords_found = keyword_processor.extract_keywords(
                            line.strip())
                        for keyword in keywords_found:
                            keyword = keyword.strip()
                            target_word_freq_dict[keyword] = (
                                target_word_freq_dict.get(keyword, 0) + 1)
            except Exception:
                # Best effort: report the problem and move on to the next
                # file; counts gathered from this file so far are kept.
                print(traceback.format_exc())
                print("[ERROR] Can't analyze {0}".format(trx_file_path))
                continue
    frequency_path = '{0}_frequency.txt'.format(target_date)
    with open(frequency_path, 'w') as frequency_output_file:
        for tag in sorted(target_word_freq_dict):
            frequency_output_file.write(
                '{0}\t{1}\n'.format(tag, target_word_freq_dict[tag]))
def getSemanticNeighbors(entNameDict, entTextDict):
    """Link entities whose name is mentioned in another entity's text.

    Only entities present in ``dm.entityDict`` take part. Names of three
    characters or fewer are not registered as keywords. The relation is
    symmetric: when entity A's name is found in the lower-cased text of
    entity B (and A is not B), each is added to the other's neighbour set.

    :param entNameDict: mapping of entity id -> entity name
    :param entTextDict: mapping of entity id -> descriptive text
    :return: dict mapping every id in ``dm.entityDict`` to a set of
        neighbouring entity ids
    """
    known_entities = dm.entityDict
    semNeiDict = {entity_id: set() for entity_id in known_entities}

    # Register every sufficiently long entity name and remember its owner.
    keyword_processor = KeywordProcessor()
    name2midDict = {}
    for mid in known_entities:
        if mid in entNameDict:
            name = entNameDict[mid]
            if len(name) > 3:
                keyword_processor.add_keyword(name)
                name2midDict[name] = mid

    # Scan each known entity's text for the names of other entities.
    for text_mid, raw_text in entTextDict.items():
        if text_mid not in known_entities:
            continue
        for found_name in keyword_processor.extract_keywords(raw_text.lower()):
            name_mid = name2midDict[found_name]
            if name_mid != text_mid:
                semNeiDict[name_mid].add(text_mid)
                semNeiDict[text_mid].add(name_mid)
    return semNeiDict
def find_sentences_rule(keywords_list, doc_txt):
    """Return the sentences of a text file that contain any keyword.

    :param keywords_list: list of keywords to look for
    :param doc_txt: path of a UTF-8 text file
    :return: list of sentences (in document order) in which at least one
        keyword was found; the list is also printed
    """
    from flashtext.keyword import KeywordProcessor

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(keywords_list)

    # Read the non-empty lines; the context manager closes the file.
    with open(doc_txt, 'r', encoding='utf-8') as doc_file:
        lines = [line.strip() for line in doc_file if len(line) > 1]

    # Terminate every line with '*' so that lines without closing
    # punctuation are still split into separate sentences.
    article = ''.join(line + '*' for line in lines)
    doc_cut_list = re.split('[?!…。?!*]', article)

    sentences_important = [
        sentence for sentence in doc_cut_list
        if keyword_processor.extract_keywords(sentence)
    ]
    print(sentences_important)
    return sentences_important
def test_file_format_two(self):
    """Keywords loaded from 'test/keywords_format_two.txt' are extracted,
    and replacing them leaves the sentence unchanged."""
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword_from_file('test/keywords_format_two.txt')
    sentence = 'I know java and product management'
    keywords_extracted = keyword_processor.extract_keywords(sentence)
    # The failure messages name this test; they previously said
    # "format one", copied from the sibling test.
    self.assertEqual(keywords_extracted, ['java', 'product management'],
                     "Failed file format two test")
    sentence_new = keyword_processor.replace_keywords(sentence)
    self.assertEqual(sentence_new, "I know java and product management",
                     "Failed file format two test")
def test_list_loading(self):
    """Keywords loaded from a list are extracted, and replacing them leaves
    the sentence unchanged."""
    keyword_processor = KeywordProcessor()
    keyword_list = ["java", "product management"]
    keyword_processor.add_keywords_from_list(keyword_list)
    sentence = 'I know java and product management'
    keywords_extracted = keyword_processor.extract_keywords(sentence)
    # The failure messages name this test; they previously said
    # "file format one", copied from another test.
    self.assertEqual(keywords_extracted, ['java', 'product management'],
                     "Failed list loading test")
    sentence_new = keyword_processor.replace_keywords(sentence)
    self.assertEqual(sentence_new, "I know java and product management",
                     "Failed list loading test")
def create_keywordProcessor(list_of_terms,
                            remove_stopwords=True,
                            custom_stopword_list=None):
    """
    Creates a new flashtext KeywordProcessor and optionally does some
    lightweight text cleaning to remove stopwords, including any provided
    by the user.

    :param list_of_terms: list of keywords to register
    :param remove_stopwords: when true, the English stopwords are removed
        from the registered keywords
    :param custom_stopword_list: optional list of extra keywords to remove;
        these are removed regardless of ``remove_stopwords``
    :return: the configured KeywordProcessor
    """
    # create a KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list_of_terms)

    # remove English stopwords if requested
    if remove_stopwords:
        keyword_processor.remove_keywords_from_list(stopwords.words('english'))

    # remove custom stopwords; None (the default) or an empty list means
    # there is nothing to remove
    if custom_stopword_list:
        keyword_processor.remove_keywords_from_list(custom_stopword_list)

    return keyword_processor
def __init__(self):
    """Create the NLP object, keyword processors and stemmer, then load
    the keyword corpora and custom stop words."""
    # init NLP
    self.nlp = Indonesian()
    # init flash text: one KeywordProcessor each for slang words, emoticons
    # and meaning text
    self.keyword_processor_slang_word = KeywordProcessor()
    self.keyword_processor_emoticon = KeywordProcessor()
    self.keyword_processor_meaning_text = KeywordProcessor()
    # init stemmer
    self.stemmer = StemmerFactory().create_stemmer()
    # These run last, after the attributes above have been created.
    self.__init_flash_text_corpus()
    self.__init_custom_stop_word()
def __init__(self,
             client=None,
             index='narvalontology',
             stoplist=None,
             concept_type=None,
             include_misspelled=False):
    """Store the configuration and initialise the keyword processor.

    :param client: stored as-is on the instance
    :param index: stored as-is on the instance
    :param stoplist: stored on the instance; None becomes a new empty list
    :param concept_type: stored as-is on the instance
    :param include_misspelled: stored as-is on the instance
    """
    self.client = client
    self.index = index
    # A fresh list per instance when the caller supplies none.
    self.stoplist = [] if stoplist is None else stoplist
    self.concept_type = concept_type
    self.include_misspelled = include_misspelled

    self.concept_to_term = {}
    self.keyword_processor = KeywordProcessor()
    # Processor set-up first, ontology loading second.
    self.init_keyword_processor(self.keyword_processor)
    self.init_ontology(self.keyword_processor)
def test_dictionary_loading(self):
    """Keyword variants loaded from a dict are extracted as, and replaced
    by, their clean names."""
    keyword_processor = KeywordProcessor()
    keyword_dict = {
        "java": ["java_2e", "java programing"],
        "product management":
        ["product management techniques", "product management"]
    }
    keyword_processor.add_keywords_from_dict(keyword_dict)
    sentence = 'I know java_2e and product management techniques'
    keywords_extracted = keyword_processor.extract_keywords(sentence)
    # The failure messages name this test; they previously said
    # "file format one", copied from another test.
    self.assertEqual(keywords_extracted, ['java', 'product management'],
                     "Failed dictionary loading test")
    sentence_new = keyword_processor.replace_keywords(sentence)
    self.assertEqual(sentence_new, "I know java and product management",
                     "Failed dictionary loading test")
#%%
from flashtext.keyword import KeywordProcessor
import sqlite3

keyword_processor = KeywordProcessor()

QUERY_DIMING = """ select name from diming; """

conn = sqlite3.connect('./diming.sqlite')
try:
    cursor = conn.cursor()
    # Execute the SQL statement.
    cursor.execute(QUERY_DIMING)
    v = cursor.fetchall()
    print(v)
    # Commit the transaction (kept from the original; the query only reads).
    conn.commit()
finally:
    # Close the connection even when the query fails, e.g. because the
    # table does not exist.
    conn.close()
return words_per_category words_frequency = remove_clutter_words(words_frequency) from flashtext.keyword import KeywordProcessor from collections import Counter from flashtext.keyword import KeywordProcessor from collections import Counter all_keywords = [] word_processors = {} print('word_processors') for category in words_frequency.keys(): all_keywords.extend(words_frequency[category]) word_processor = KeywordProcessor() for word in words_frequency[category]: word_processor.add_keyword(word) word_processors[category] = word_processor # remove duplicates all_keywords = set(all_keywords) all_keywords = list(all_keywords) all_words_processor = KeywordProcessor() for word in all_keywords: all_words_processor.add_keyword(word) def compute_percentage(dum0, dumx): try: ans = float(dumx) / float(dum0) ans = ans * 100
class Preprocessing(object):
    """Cleans the text content of chat messages.

    Holds the NLP object, three flashtext keyword processors (slang words,
    emoticons, meaning text) and a stemmer, and applies a fixed sequence of
    normalisation steps to message content.
    """

    def __init__(self):
        """Create the NLP tools, then load corpora and custom stop words."""
        # init NLP
        self.nlp = Indonesian()
        # init flash text
        self.keyword_processor_slang_word = KeywordProcessor()
        self.keyword_processor_emoticon = KeywordProcessor()
        self.keyword_processor_meaning_text = KeywordProcessor()
        # init stemmer
        self.stemmer = StemmerFactory().create_stemmer()
        self.__init_flash_text_corpus()
        self.__init_custom_stop_word()

    def __init_flash_text_corpus(self):
        """ Init flash text corpus. """
        # Slang word corpus: first column is the keyword, second column
        # its clean name.
        for row in Repository.get_slang_word().values:
            self.keyword_processor_slang_word.add_keyword(row[0], row[1])
        # Emoticon corpus: every variant maps to its key.
        for key, variants in constant.EMOTICON_LIST:
            for variant in variants:
                self.keyword_processor_emoticon.add_keyword(variant, key)
        # Meaning word corpus: same two-column layout as the slang words.
        for row in Repository.get_meaning_text().values:
            self.keyword_processor_meaning_text.add_keyword(row[0], row[1])

    def __init_custom_stop_word(self):
        """ Custom stop word for chat message content. """
        vocab = self.nlp.vocab
        # Flag the additional stop words ...
        for stop_word in constant.STOP_WORD:
            vocab[stop_word].is_stop = True
        # ... then clear the flag for the excluded ones.
        for stop_word in constant.EXC_STOP_WORD:
            vocab[stop_word].is_stop = False

    def cleaning(self, chat_message_list):
        """
        Pre-processing the content from ChatMessage.

        Agent messages repeated too often are removed first. Each remaining
        message has its content replaced by the cleaned text; messages
        whose cleaned content is blank are left out of the result.

        :param chat_message_list: dirty content from list of ChatMessage.
        :return: list of ChatMessage.
        """
        cleaned_messages = []
        if not chat_message_list:
            logger.info('No chat message yet.')
            return cleaned_messages

        logger.info('Pre-processing started...')
        started_at = time.time()
        candidates = self.remove_repeated_message_from_agent(
            chat_message_list)
        for message in candidates:
            logger.info(f'BEFORE -> {message.content}')
            cleaned_content = self.__preprocessing_flow(message.content)
            logger.info(f'AFTER -> {cleaned_content}')
            message.content = cleaned_content
            if cleaned_content.strip():
                cleaned_messages.append(message)
        logger.info(
            f'Pre-processing finished. {time.time() - started_at} seconds')
        return cleaned_messages

    def cleaning_with_pipe(self, chat_message_list):
        """
        [DEPRECATED] Pre-processing the content from ChatMessage with
        multi threading from spaCy.

        Every message keeps its position and has its content replaced in
        place; nothing is filtered out.

        :param chat_message_list: dirty content from list of ChatMessage.
        :return: list of ChatMessage.
        """
        if not chat_message_list:
            logger.info('No chat message yet.')
            return chat_message_list

        logger.info('Pre-processing started...')
        started_at = time.time()
        raw_contents = [message.content for message in chat_message_list]
        piped_docs = self.nlp.pipe(raw_contents, n_threads=cpu_count())
        for position, doc in enumerate(piped_docs):
            chat_message_list[position].content = self.__preprocessing_flow(
                doc.text)
        logger.info(
            f'Pre-processing finished. {time.time() - started_at} seconds')
        return chat_message_list

    def __preprocessing_flow(self, content):
        """ Preprocessing flow: apply the cleaning steps in a fixed order. """
        # Emoticon normalisation is currently disabled:
        # content = PreprocessingUtilsV2.normalize_emoticon(content, self.keyword_processor_emoticon)
        text = str(content)
        # normalize url, then remove url
        text = PreprocessingUtils.normalize_url(text)
        text = PreprocessingUtils.remove_url(text)
        # remove email
        text = PreprocessingUtils.remove_email(text)
        # remove digit number
        text = PreprocessingUtils.remove_digit_number(text)
        # case folding lower case
        text = PreprocessingUtils.case_folding_lowercase(text)
        # remove punctuation
        text = PreprocessingUtils.remove_punctuation(text)
        # remove repeated character
        text = PreprocessingUtils.remove_repeated_character(text)
        # normalize slang word
        text = PreprocessingUtilsV2.normalize_slang_word(
            text, self.keyword_processor_slang_word)
        # stemming, tokenize, remove stop word
        text = PreprocessingUtils.stemming(text, self.nlp, self.stemmer)
        # remove unused character
        text = PreprocessingUtils.remove_unused_character(text)
        # join negation word
        text = PreprocessingUtils.join_negation(text)
        # remove extra space between word
        text = PreprocessingUtils.remove_extra_space(text)
        # normalize word
        text = PreprocessingUtilsV2.normalize_meaning_word(
            text, self.keyword_processor_meaning_text)
        # remove stop word
        text = PreprocessingUtils.remove_stop_word(text, self.nlp)
        # TODO add another pre-processing if needed
        return text

    @staticmethod
    def identify_phrase(documents):
        """
        Append detected bigram and trigram phrase tokens to each document.

        documents : iterable of iterable of str; the inner documents are
        modified in place and the same outer object is returned.
        """
        bigram_model = Phraser(
            Phrases(documents, min_count=5, delimiter=b'_', threshold=1))
        trigram_model = Phraser(
            Phrases(bigram_model[documents],
                    min_count=5,
                    delimiter=b'_',
                    threshold=1))
        for position in range(len(documents)):
            document = documents[position]
            # Tokens containing '_' are phrases joined by the phraser.
            for token in bigram_model[document]:
                if '_' in token:
                    document.append(token)
            # The trigram pass sees the document including the bigram
            # tokens appended just above.
            for token in trigram_model[document]:
                if '_' in token:
                    document.append(token)
        return documents

    @staticmethod
    def remove_repeated_message_from_agent(message_history_list):
        """
        Remove chat messages whose content an agent repeated more than
        constant.MESSAGE_TEMPLATE_MIN_COUNT times.

        Only agent messages are counted, but every message with a matching
        content is dropped, whoever sent it.
        """
        agent_content_counts = collections.Counter(
            message.content for message in message_history_list
            if message.sender_role == constant.SENDER_ROLE_AGENT)
        template_contents = [
            content for content, occurrences in agent_content_counts.items()
            if occurrences > constant.MESSAGE_TEMPLATE_MIN_COUNT
        ]
        return [
            message for message in message_history_list
            if message.content not in template_contents
        ]
# Exploratory script: registers vulnerability-related keywords with a
# flashtext KeywordProcessor and runs extraction on a sample description.
from flashtext.keyword import KeywordProcessor

keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword('SQL injection')
# The second argument is a (category, name) tuple instead of a plain string.
keyword_processor.add_keyword('SQL injection',
                              ('vulnerability type', 'SQL injection'))
keyword_processor.add_keyword('cross-site scripting',
                              ('vulnerability type', 'cross-site scripting'))
# NOTE(review): same keyword as the line above with a different second
# argument -- presumably this registration replaces the tuple; confirm.
keyword_processor.add_keyword('cross-site scripting', 'XSS')
# keyword_processor.add_keyword('parameter')
# keyword_processor.add_keyword('function')
# keyword_processor.add_keyword('variable')
# NOTE(review): "parametert" / "variabler" look like typos of
# "parameter" / "variable" -- confirm before relying on these entries.
keyword_dict = {"cross-site scripting": ["XSS"], "parametert": ["variabler"]}
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["parameter", "function", "variable"])
#### keyword replacement
# keyword_processor.add_keyword('cross-site scripting', 'XSS')
# keyword_processor.replace_keywords('vulnerability is cross-site scripting')
# The result of this call is not stored or printed.
keyword_processor.extract_keywords(
    'SQL injection vulnerability in the update_zone function in catalog/admin/geo_zones.php in osCommerce Online Merchant 2.3.3.4 and earlier allows remote administrators to execute arbitrary SQL commands via the zID parameter in a list action. '
)
# Extract the text of every page; the PDF is closed once the text has been
# read.
with open('JavaBasics-notes.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages
    count = 0
    text = ""
    while count < num_pages:
        pageObj = pdfReader.getPage(count)
        count += 1
        text += pageObj.extractText()

# Now the part for Java keywords: one keyword per line in keywords.txt,
# matched with flashtext (the original note describes this as Aho-Corasick
# style string matching on a trie data structure).
with open("keywords.txt", "r") as keywords:
    keys = keywords.read().splitlines()
keyword_processor = KeywordProcessor()
for key in keys:
    keyword_processor.add_keyword(key)
keywords_found = keyword_processor.extract_keywords(text)
found = keywords_found


# Given a list of words, return a dictionary of
# word-frequency pairs.
def wordListToFreqDict(wordlist):
    """Return {word: number of occurrences in wordlist}.

    Keys appear in order of first occurrence. Counting is done in a single
    pass instead of calling list.count once per element.
    """
    from collections import Counter

    return dict(Counter(wordlist))


# Sort a dictionary of word-frequency pairs in
# order of descending frequency.
# Ordered (category label, keywords) table used by classification().
#
# Order matters: when several categories tie for the highest share, the one
# listed first wins.
#
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, which turns two keywords into one that never
# matches.
_CATEGORY_KEYWORDS = (
    ("Cyber-Security Risk", (
        "ad fraud", "cyberattack", "malware", "botnets", "CnC",
        "Command and Control", "compromised accounts", "hacking",
        "key logging", "phishing", "spyware", "worm", "trojan", "RAT", "APT",
        "adware", "fileless attack", "cryptocurrency mining", "spam",
        "ransomware", "denial-of-service", "sql injection",
        "man-in-the-middle", "compromised pcs", "spam site",
        "malicious payload", "apt", "advanced persistent threat", "spoofing",
        "virus", "slowloris", "xss", "cross-site scripting", "exploit",
        "vulnerability", "cve", "day zero", "backdoor", "blackhat",
        "bruteForce", "botNet", "cracking", "forensics", "exploit",
        "scanning", "enumeration", "reconnaisance", "adware",
        "autorun worms", "advanced persistent threats", "attack vector",
        "backdoor", "blended attack", "botnet", "browser hijacker",
        "brute force attack", "clickjacking", "command and control servers",
        "content spoofing", "cross site scripting", "xss", "xsrf",
        "data theft", "denial of service attack", "dictionary attack",
        "drive-by download", "email spoofing", "email virus", "exploit",
        "form grabber", "identity theft", "insider attack", "keylogger",
        "likejacking", "malware", "mman in the middle", "ransomware",
        "rootkit", "spam", "spyware", "SQL injection", "wabbit",
        "website spoofing ", "ip", "tcp", "router", "network", "cisco",
    )),
    ("Adult Content", (
        "pornography", "violence", "blood", "gore", "sex", "nudity",
        "erotic", "hardcore", "fetish", "intercourse", "explicit content",
        "hentai", "masturbation", "dick", "pussy", "penis", "v****a", "anus",
        "boobs", "p**n", "xxx",
    )),
    ("Aggresive", (
        "attacking", "fighting", "invading", "assailing", "threaten",
        "slashing", "beating", "destroy", "destruction", "assault",
    )),
    ("Arts", (
        "painting", "drawing", "ceramics", "pottery", "photography",
        "sculpture", "dance",
    )),
    ("Automotive", (
        "alfa romeo", "aston martin", "audi", "bentley", "bmw", "bugatti",
        "chevrolet", "chrysler", "citroen", "dodge", "ferrari", "honda",
        "toyota", "hyundai", "kia", "lamborghini", "lexus", "mazda",
        "mercedes", "skoda", "mitsubishi", "nissan", "porsche", "subaru",
        "suzuki", "tesla", "volkswagen", "volvo", "horsepower", "torque",
    )),
    ("Cloud Services", (
        "cloud backup", "cloud storage", "cloud processing", "iaas", "paas",
        "saas", "aws", "azure", "google cloud", "amazon web services",
        "infrastructure as a service", "platform as a service",
        "software as a service", "cloud software", "IBM cloud", "vmware",
        "salesforce", "oracle cloud", "sap cloud", "alibaba cloud",
        "cloud service",
    )),
    ("Instant Messaging", (
        "discord", "skype", "viber", "whatsapp", "facebook messenger",
        "wechat", "telegram", "line", "qq mobile",
    )),
    ("Criminal Activities", (
        "arson", "assault", "bigamy", "blackmail", "bribery", "burglary",
        "child abuse", "conspiracy", "espionage", "forgery", "fraud",
        "genocide", "hijacking", "homicide", "kidnap", "manslaughter",
        "mugging", "murder", "kill", "perjury", "rape", "riot", "robbery",
        "shoplift", "slander", "smuggle", "treason", "trespass",
        "gang fights", "steal", "theft", "cyber crime", "corruption",
        "domestic", "violence", "ransom", "vandalism", "child abuse ",
        "terrorism", "militia", "insurgent", "bombing", "terrorist",
        "make bomb", "bomb making", "bombs",
    )),
    ("Dating", (
        "online dating", "tinder", "okcupid", "valentines", "romantic",
        "roses", "presents", "anniversary", "rings", "dating ideas",
        "movie dates", "wedding", "hook up",
    )),
    ("Software Development", (
        "pycharm", "netbeans", "sqlite", "linux", "visual studio", "node.js",
        "codenvy", "angularjs", "eclipse", "react native", "python", "java",
        "c++", "ruby on rails", "flutter", "javascript", "html", "maven",
        "node.js", "html", "css", "php", "database", "sql", "db", "pip",
        "web development", "code", "debug", "c#", "kotlin", "objective-c",
        "visual basic", "perl", "matlab", "libraries", "stack development",
        "backend", "frontend", "framework", "software develop",
        "machine learning", "tensorflow", "AI", "API",
        "application programming interface",
    )),
    ("Ecommerce Tools", (
        "ecommerce website tools", "research tools", "business tools",
        "marketing tools", "analytics tools", "bigcommerce", "x-cart",
        "shopify", "woocommerce", "prestashop", "junglescout", "semrush",
        "ahrefs", "sourcify", "veeqo", "tickspot", "asana",
        "inventory source", "oberlo", "shipwire", "tradegecko",
        "shippingeasy", "wave", "ecomdash", "mailchimp", "campaign monitor",
        "feeds4", "active campaign", "bulk.ly", "buffer", "omnistar",
        "antavo", "smile.lo", "user testing", "wishpond", "klaviyo",
        "buzzstream", "exitbee", "metrilo", "storeya", "instasize",
        "visual website optimizer", "optimizely analytics",
        "google analytics", "neatly", "search engine optimization", "SEO",
    )),
    ("Entertainment", (
        "plays", "comedy", "puppet shows", "sports", "performance art",
        "animation", "karaoke", "video games", "dance", "magic",
        "television programs", "music", "acting", "nightclubs",
        "fashion shows", "netflix", "concerts", "circus", "parties",
        "symphonies", "theatre", "variety shows",
    )),
    ("Software Download Sites", (
        "download.com", "filehippo", "zdnet download", "softpedia", "tucows",
        "freewarefiles", "majorgeeks", "filecluster", "soft32", "torrent",
        "softonic", "freewarehome", "ninite", "download crew", "filehorse",
        "filepuma", "sourceforge",
        # NOTE(review): originally written as "software" "informer" with no
        # separator, which concatenated to "softwareinformer". Assumed to
        # mean the site "Software Informer" — confirm.
        "software informer",
        "alternativeto",
    )),
    ("Finance & Accounting", (
        "Accounts payable", "accounts receivable", "accrued expense",
        "balance sheet", "book value", "equity", "inventory", "zoho books",
        "xero",
    )),
    ("Food and Drinks", (
        "macdonald", "kfc", "grabfood", "subway", "jolibee", "coke", "laksa",
        "chicken rice", "yong tau foo", "buffet", "pizza", "bbq",
        "black pepper", "beef", "mutton", "curry", "nasi lemak",
        "carrot cake", "green tea", "bubble tea", "pudding jelly", "cake",
        "bread", "milo", "ice cream", "fishball",
    )),
    ("Gambling", (
        "poker", "roulette", "slot-machines", "bingo", "baccarat",
        "casino war", "craps", "carribean stud", "keno", "let it ride",
        "betting",
    )),
    ("Government", (
        "moe", "mof", "mha", "mfa", "mti", "msf", "mod", "mol", "mom", "moh",
        "mot", ".gov.sg", "government", "ministry of", "minister",
        "minister of",
    )),
    ("Hobbies and Interests", (
        "Sports", "music", "travel", "fishing", "social work",
        "volunteer work", "painting", "dancing", "reading", "writing",
        "gardening", "animal care", "cooking", "bowling", "computer gaming",
        "fashion", "ice skating", "magic", "origami", "photography",
        "sculpting", "comedy", "winemaking", "yoga", "computer programming",
        "diving", "football", "basketball", "tennis", "badminton",
        "table tennis", "soccer", "rugby", "jogging", "marathon", "cycling",
        "rock climbing", "swimming", "cheerleading", "fencing", "laser tag",
        "darts", "eating", "sleeping", "hockey", "weightlifting",
        "volleyball", "martial arts", "hiking", "backpacking", "archery",
        "wrestling", "boxing", "poker", "chess",
    )),
    ("Insurance", (
        "life insurance", "health insurance", "travel insurance",
        "home insurance", "child insurance", "maid insurance",
        "car insurance", "pet insurance", "personal accident insurance",
        "term life insurance", "whole life insurance", "ntuc income",
        "great eastern", "prudential", "AIA", "aviva", "savings plan",
        "integrated shield plan", "trip delays", "baggage delay",
        "lost items", "medical coverage", "missed flights",
    )),
    ("Job Search", (
        "career@gov", "jobstreet", "gumtree", "indeed", "jobsdb", "stjobs",
        "mycareerfuture", "jobscentral", "linkedin", "startupjobs",
    )),
    ("Kids", (
        "hasbro", "nursery rhythms", "fox kids", "smiggle", "kiddy palace",
        "Playground", "toy r us", "avent", "enfagrow", "kinder joy",
    )),
    ("Military", (
        "army", "air force", "navy", "rank", "infrantry", "armour",
        "artillery", "special forces", "rangers", "guards",
        "military police", "signals", "combat engineers", "field engineers",
        "sar 21", "machine guns", "missile launcher", "weapons", "medic",
        "tanks", "fighter jets", "helicopter", "armoured vehicles",
        "rocket artillery", "armoured carriers", "sergeant", "officer",
        "encik",
    )),
    ("News & Media", (
        "cna", "bbc", "thestraitstime", "thenewspaper", "mediacorp",
        "techredar", "asiaone", "yahoo", "msn", "flipboard", "twitter",
        "dailymail", "today", "thebusinesstimes", "reporters",
    )),
    ("Peer 2 Peer", (
        "pirate bay", "kickass torrent", "torrent", "rarbg", "1337x",
        "torlock", "YTS", " qBittorrent", "Vuze", "Deluge", "uTorrent",
        "BitTorrent", "EZTV", "ETTV", "Popcorn Time", "LimeTorrents",
    )),
    ("Pets", (
        "cat", "dog", "rabbit", "hamster", "fish", "bird", "guinea pig",
        "chinchilla", "cow", "chicken", "sheep", "lamb", "pig", "llama",
        "turtle", "tortoise", "frog",
    )),
    ("Real Estate", (
        "hdb", "bungalow", "studio", "semi-detached", "condos", "landed",
        "propnex realty", "huttons asia", "era", "propseller", "condominium",
        "apartment", "mansionette", "property guru", "property agent",
    )),
    ("Search Engine", (
        "google", "yahoo", "bing", "duckduckgo", "wiki.com", "gibiru",
        "boardreader", "baidu", "torsearch", "ask.com",
    )),
    ("Shopping", (
        "qoo10", "lazada", "shopee", "zalora", "taobao", "amazon",
        "carousell", "ebay", "redmart", "reebonz", "online shopping",
        "online sale", "free shipping", "free delivery", "next day delivery",
    )),
    ("Social", (
        "imgur", "facebook", "twitter", "instagram", "tumblr", "flicker",
        "google+", "youtube", "pinterest", "reddit", "snapchat",
        "baidu tieba", "skype", "telegram", "whatsapp", "hardwarezone",
        "forum",
    )),
    ("Media Streaming", (
        "netflix", "youtube", "apple Tv", "chromecast", "subsonic",
        "audio galaxy", "tudou", "baidu", "dailymotion", "vimeo",
    )),
    ("Trading & Investment", (
        "stocks", "money", "profits", "srs", "blue-chip", "growth",
        "dividend", "nasdaq", "corporate bonds", "etf",
    )),
    ("Translation", (
        "google translate", "yandex", "babelfish", "tradukka", "linguee",
        "systranet", "permondo", "translatesonline.com",
    )),
    ("WebHosting_ISP_Telco", (
        "singtel", "starhub", "m1", "circlelife", "tpg", "myrepublic",
        "viewquest", "alibaba", "apc", "amazon web",
    )),
    ("Webhosting", (
        "bluehost", "inmotion hosting", "hostgator", "hostinger", "godaddy",
        "tsohost", "wix", "siteground", "hostwinds", "weebly", "squarespace",
        "vodien", "a2 hosting", "dreamHost", "website hosting",
        "domain name", "namecheap", "host website", "domain registration",
        "whois", "website server", "apache", "nginx", "web host",
    )),
    ("Proxies & VPN", (
        "expressvpn", "nordvpn", "ipvanish", "hotspot shield", "tunnelbear",
        "hidester", "hide.me", "proxysite.com", "kproxy", "VPNbook",
        "whoer.net", "megaproxy",
    )),
    ("Webmail", (
        "gmail", "hotmail", "live", "yahoo", "outlook", "aol", "zoho",
        "protonmail",
    )),
    ("Travel", (
        "booking.com", "tripadvisor", "expedia", "airbnb", "agoda",
        "priceline", "skyscanner", "kayak.com", "makemytrip", "cheapoair",
        "trivago", "travelocity", "orbitz", "hotelurbano", "book hotel",
        "air tickets", "airfares", "hotels", "cheap flight", "cheap hotel",
        "airline", "flights",
    )),
    ("Drugs", (
        "marijuana", "opium", "heroin", "cocaine", "barbiturates", "meth",
        "ice", "crystal meth", "ecstacy", "weed", "cannabis",
    )),
    ("Weapons", (
        "gun", "sword", "machine gun", "butterfly knife", "rocket",
        "bazooka", "flamethrower", "pistol", "rifle", "grenade", "sniper",
    )),
    ("Sports", (
        "soccer", "football", "tennis", "basketball", "hockey", "bowling",
        "table-tennis", "kayaking", "canoeing", "snorkeling", "diving",
        "swimming", "scuba-diving", "martial arts",
    )),
    ("Religion", (
        "Buddihsm", "Hinduism", "Sikhism", "Christianity", "Islam",
        "Judaism", "Spiritism", "Shinto", "Taoism",
    )),
    ("Technology", (
        "cloud computing", "5g", "computer ai", "wireless", "ssd",
        "smartphone", "drones", "robots", "gaming", "smartwatch",
    )),
    ("Cyber-Security Technologies", (
        "identity and access management", "IAM", "cloud security",
        "risk and compliance management", "encryption",
        "data loss prevention", "DLP", "UTM", "unified threat management",
        "firewall", "antivirus", "antimalware", "IDS",
        "intrusion detection system", "intrusion prevention system", "IPS",
        "disaster recovery", "ddos mitigation", "cyber security solution",
        "IT security", "cisco", "symantec", "norton", "trend micro", "avast",
        "carbon black", "crowd strike", "fortinet", "palo alto", "splunk",
        "mcafee", "sophos", "proofpoint", "imperva", "fireye", "LogRythm",
        "Netskope", "trustwave",
    )),
    ("Education", (
        ".edu", "coursera", "khan academy", "open culture", "udemy",
        "academic earth", "edx", "university", "polytechnic", "diploma",
        "bachelors", "degree", "phd", "masters", "professor", "scholarship",
        "schooling", "teaching", "learning", "education", "online learning",
        "distance learning", "institute",
    )),
    ("Tobacco", (
        "marlboro", "camel", "cigarette", "tobacco", "lucky strike",
        "winston", "dunhill", "lung cancer", "viceroy", "smoking", "vape",
        "e-cigarette", "cigar", "vaping", "vaporiser",
        "electronic cigarette",
    )),
)

# Lazily built (overall_processor, [(label, processor), ...]) pair; see
# _classification_processors().
_CLASSIFICATION_PROCESSORS = None


def _build_keyword_processor(words):
    """Return a new KeywordProcessor loaded with every keyword in *words*."""
    processor = KeywordProcessor()
    for word in words:
        processor.add_keyword(word)
    return processor


def _classification_processors():
    """Return the overall and per-category processors, building them once."""
    global _CLASSIFICATION_PROCESSORS
    if _CLASSIFICATION_PROCESSORS is None:
        # One processor over the union of all keywords gives the total count.
        overall = _build_keyword_processor(
            word for _, words in _CATEGORY_KEYWORDS for word in words
        )
        per_category = [
            (label, _build_keyword_processor(words))
            for label, words in _CATEGORY_KEYWORDS
        ]
        _CLASSIFICATION_PROCESSORS = (overall, per_category)
    return _CLASSIFICATION_PROCESSORS


def classification(textSplit):
    """Return the category whose keywords dominate *textSplit*.

    The text is scanned once with a processor holding every keyword (the
    total match count) and once per category. Each category count is turned
    into a share of the total with ``percentage1``, and the label of the
    category with the highest share is returned. Ties go to the category that
    appears first in ``_CATEGORY_KEYWORDS``.

    ``percentage1`` is defined elsewhere in this file; it is called as
    ``percentage1(total_matches, category_matches)`` and its result is
    coerced with ``float()``.
    NOTE(review): presumably it returns the category count as a percentage of
    the total — confirm.

    Args:
        textSplit: Text to classify; passed unchanged to
            ``KeywordProcessor.extract_keywords``.

    Returns:
        The category label as a string, or the string ``'None'`` (not the
        ``None`` object) when no keyword at all is found in the text.
    """
    overall_processor, category_processors = _classification_processors()

    total_matches = len(overall_processor.extract_keywords(textSplit))
    if total_matches == 0:
        # Return before computing any share so percentage1 is never called
        # with a zero total.
        return 'None'

    best_label = None
    best_share = None
    for label, processor in category_processors:
        category_matches = len(processor.extract_keywords(textSplit))
        share = float(percentage1(total_matches, category_matches))
        # Strict '>' keeps the earliest category when shares are equal.
        if best_share is None or share > best_share:
            best_label, best_share = label, share
    return best_label
import requests
from flashtext.keyword import KeywordProcessor
from nltk.corpus import stopwords

# Read in the forum posts and keep a small subset to experiment with.
forum_posts = pd.read_csv("../input/ForumMessages.csv")
sample_posts = forum_posts.Message[0:3]

# Fetch the list of top PyPI packages (last 30 days).
url = 'https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json'
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors before the body is parsed.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# Keep just the package names.
list_of_packages = [data_item['project'] for data_item in data['rows']]

# Build a KeywordProcessor from the package names.
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(list_of_packages)

# Drop package names that are English stopwords, plus custom stopwords.
keyword_processor.remove_keywords_from_list(stopwords.words('english'))
keyword_processor.remove_keywords_from_list(['http', 'kaggle'])

# Try the keyword processor on the sample posts.
for post in sample_posts:
    keywords_found = keyword_processor.extract_keywords(post, span_info=True)
    print(keywords_found)
import spacy import re import os import pandas as pd from flashtext.keyword import KeywordProcessor from fairseq.models.roberta import RobertaModel from fairseq.data.encoders.fastbpe import fastBPE from fairseq import options nlp = spacy.load("en_core_web_sm") ignore_word_regex = re.compile( r"(coronavirus|covid|corona|my|test|positive|negative|virus)") keyword_processor = KeywordProcessor(case_sensitive=True) def find_mask_word_per_tweet(text): words = [] for doc in nlp(text): if doc.is_stop: continue if len(doc.text) <= 1: continue if re.findall(ignore_word_regex, doc.text): continue if doc.pos_ in ["PROPN", "VERB", "NOUN"]: words.append(doc.text) # print(doc.text, doc.pos_) return words def _init_model(pretrain_model):