def clean_text(dirty_text, external_vocab_filename=None, external_vocab_level="no"):
    """(str, str or None, str) -> str

    external_vocab_level can be: no, with_ngrams, only. If you choose
    with_ngrams or only, you need to supply an external_vocab_filename.

    The text is cleaned in the following way:
    # substitute word1.word2 with word1. word2
    # split text into rough sentences based on '<\\%>'. This symbol was added to
      denote a new line in the original product description
    # split the rough sentences using a sentence tokenizer from nltk
    # if external_vocab_level is with_ngrams or only, combine the external-vocabulary
      ngrams into one token, e.g., short sleeves -> short_sleeves
    # concatenate all tokenized words into one string and return that string.

    An excerpt of text from the item looks like this:
    Sheath/Column One-Shoulder Short/Mini Bridesmaid Dress With Lace<\\%>SKU#:1020234<\\%>New Style Cocktail Dress<\\%> Color:The color as picture<\\%>Category:Bridesmaid Dress<\\%> Brand:Biggoldapple<\\%> Silhouette:Sheath/Column<\\%> Neckline:One-Shoulder<\\%> Hemline/Train:Short/Mini<\\%>
    """
    external_vocab_words = []
    if external_vocab_filename is not None:
        external_vocab_words = load_vocab_words(external_vocab_filename)

    # transform ngrams into tuples; ngrams are assumed to be separated by underscore: word1_word2
    external_vocab_with_tuples = [tuple(z.split('_')) for z in external_vocab_words]

    # multi-word tokenizer, more info: http://www.nltk.org/api/nltk.tokenize.html
    tokenizer_mwe = MWETokenizer(external_vocab_with_tuples)  # all external_vocab_words are added
    external_vocab_set = set(external_vocab_words)  # set for fast membership tests

    out_clean_text = ''

    # substitute word1.word2 with word1. word2
    dirty_text = re.sub(r"(\w[A-Z]|[a-z.])\.([^.)\s])", r"\1. \2", dirty_text)

    rough_sentences = dirty_text.split('<\\%>')  # rough sentences based on splitting by '<\\%>'

    sentences = []
    for rs in rough_sentences:
        rs = rs.replace("3/4", "3_4")  # keep the fraction 3/4 as the single token 3_4
        sentences.extend(SENT_TOKENIZER.tokenize(rs))  # sentences based on the NLTK tokenizer

    for sentence in sentences:
        words = WORD_TOKENIZER.tokenize(sentence.lower())  # tokenize into words, ignoring the external vocabulary
        if external_vocab_level == 'with_ngrams':
            # keep all words (even those not in the external vocabulary), but group its ngrams into one token
            words = tokenizer_mwe.tokenize(words)
        elif external_vocab_level == 'only':
            words = tokenizer_mwe.tokenize(words)  # group external-vocabulary ngrams into one token
            words = [w for w in words if w in external_vocab_set]  # only keep words in the external vocabulary
        words = [w for w in words if (not w.isdigit() or w == '3_4')]  # remove digit-only tokens, but keep 3_4
        out_clean_text += ' '.join(words) + '\n'
    return out_clean_text
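# A minimal, self-contained sketch of the ngram-grouping step used above. The
# vocabulary here is illustrative; clean_text itself relies on load_vocab_words,
# SENT_TOKENIZER, and WORD_TOKENIZER defined elsewhere in its module.
from nltk.tokenize import MWETokenizer

external_vocab_words = ["short_sleeves", "one_shoulder"]  # hypothetical vocab
tokenizer_mwe = MWETokenizer([tuple(z.split('_')) for z in external_vocab_words])

words = "dress with short sleeves and one shoulder".split()
print(tokenizer_mwe.tokenize(words))
# ['dress', 'with', 'short_sleeves', 'and', 'one_shoulder']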
def fit(self, X, **fit_params):
    """
    Iteratively contract bigrams (up to max_iterations times) that score
    higher on the score_function than min_score (and satisfy the other
    criteria set by the optional parameters).
    """
    self.tokenization_ = X
    n_tokens = sum(len(x) for x in X)
    for i in range(self.max_iterations):
        bigramer = BigramCollocationFinder.from_documents(self.tokenization_)

        if self.ignored_tokens is not None:
            ignore_fn = lambda w: w in self.ignored_tokens
            bigramer.apply_word_filter(ignore_fn)

        if self.excluded_token_regex is not None:
            exclude_fn = (lambda w: re.fullmatch(self.excluded_token_regex, w) is not None)
            bigramer.apply_word_filter(exclude_fn)

        if self.min_token_occurrences is not None:
            minocc_fn = lambda w: bigramer.word_fd[w] < self.min_token_occurrences
            bigramer.apply_word_filter(minocc_fn)

        if self.max_token_occurrences is not None:
            maxocc_fn = lambda w: bigramer.word_fd[w] > self.max_token_occurrences
            bigramer.apply_word_filter(maxocc_fn)

        if self.min_token_frequency is not None:
            minfreq_fn = (lambda w: bigramer.word_fd[w] < self.min_token_frequency * n_tokens)
            bigramer.apply_word_filter(minfreq_fn)

        if self.max_token_frequency is not None:
            maxfreq_fn = (lambda w: bigramer.word_fd[w] > self.max_token_frequency * n_tokens)
            bigramer.apply_word_filter(maxfreq_fn)

        if self.min_ngram_occurrences is not None:
            bigramer.apply_freq_filter(self.min_ngram_occurrences)

        new_grams = list(bigramer.above_score(self.score_function, self.min_score))

        if len(new_grams) == 0:
            break

        self.mtes_.append(new_grams)

        contracter = MWETokenizer(new_grams)
        self.tokenization_ = tuple(
            tuple(contracter.tokenize(doc)) for doc in self.tokenization_
        )

    return self
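# A minimal sketch of one contraction pass from fit() above, calling NLTK's
# collocation API directly. The toy corpus and the 7.0 threshold are
# illustrative, chosen so the repeated bigram scores above it.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import MWETokenizer

docs = [["new", "york", "is", "big"], ["i", "love", "new", "york"]]
finder = BigramCollocationFinder.from_documents(docs)
new_grams = list(finder.above_score(BigramAssocMeasures.likelihood_ratio, 7.0))
contracter = MWETokenizer(new_grams)
print([contracter.tokenize(doc) for doc in docs])
# e.g. [['new_york', 'is', 'big'], ['i', 'love', 'new_york']]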
def tokenize_and_remove_punct(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    # no MWEs are registered, so this tokenizer just passes the split tokens through
    mtokenizer = MWETokenizer()
    mwe = mtokenizer.tokenize(text.split())
    words = [t for t in mwe if t.isalpha()]
    return words
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, agg_score, pid = params
    idf_set = set(idf)
    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))
    phrases_score = {}
    for phrase in tqdm(list_phrases, desc='phrase-eval-{}'.format(pid), mininterval=10):
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        # aggregate the scores of the unigrams found in the phrase
        raw_tokenized = tokenizer.tokenize(tokens)
        tokenized_set = set(raw_tokenized)
        keywords = tokenized_set.intersection(unigram_set)
        for token in keywords:
            score += agg_score[token]
        score /= (1 + np.log(len(nonstop_tokens)))  # length normalization
        # tf-idf cosine similarity between the phrase and the target tokens
        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        vocab_index = {token: i for i, token in enumerate(vocab)}  # avoid repeated O(n) list.index calls
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)
        target_token_freq = Counter(target_token)
        for token in set(vocab).intersection(set(target_token)):
            target_vec[vocab_index[token]] = target_token_freq[token] / len(target_token) * idf[token]
        phrase_token_freq = Counter(tokens)
        for token in tokens:
            phrase_vec[vocab_index[token]] = phrase_token_freq[token] / len(tokens) * idf[token]
        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)
        phrases_score.update({phrase: {'score': score, 'eval': tfidf_sim}})
    # keep the ten phrases with the highest scores
    rearrange = {k: v['score'] for k, v in phrases_score.items()}
    top_10 = nlargest(10, rearrange, key=rearrange.get)
    return {key: phrases_score[key] for key in top_10}
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the cached file
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')

        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:  # For each sentence in the sentences
                # Tokenize the sentence with the regex tokenizer, then merge glossary MWEs
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                # Lower the case of all the tokens (already lowered above; kept for safety)
                tokens_lowercase = [x.lower() for x in tokens]
                # Lemmatize the sentence: find the POS tags, then lemmatize
                tokens_lowercase_tagged = nltk.pos_tag(tokens_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                                       for wrd, tag in tokens_lowercase_tagged]
                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]
                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]
                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
def tokenizer_sent(dataset):
    tokenizer = MWETokenizer()
    aspect_tokenized = []
    sentence_tokenized = []
    for i in range(len(dataset.index)):
        aspect_split = tuple(dataset['aspect_term'][i].lower().split())
        tokenizer.add_mwe(aspect_split)  # add_mwe mutates the tokenizer in place and returns None
        aspect_tokenized.append(aspect_split)
    for j in range(len(dataset.index)):
        tok = nltk.pos_tag(
            tokenizer.tokenize(dataset['text'][j].lower().split()))  # index with j, not the stale i
        sentence_tokenized.append(tok)
    return aspect_tokenized, sentence_tokenized
def multiword_tokenize(text, mwe):
    # The tokenizer splits contractions ("Don't" => 'Do', "n't"); the
    # sentence-delimiting ',' and '.' become tokens of their own
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    tokenized_text = tokenizer.tokenize(word_tokenize(text))
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
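# Example call to multiword_tokenize above (the MWE list is made up); the
# underscore-joined token is mapped back to its original surface form:
print(multiword_tokenize("I visited New York last week", ["New York"]))
# ['I', 'visited', 'New York', 'last', 'week']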
def sentence_filter(self, sentence):
    # Preliminary tokenization and cleaning of the sentence
    if self.language == 'chinese':
        import jieba.posseg as psg
        return psg.cut(sentence)  # jieba's POS interface does tokenization and cleaning in one step
    elif self.language == 'english':
        from nltk.tokenize import MWETokenizer  # use the MWE tokenizer
        tokenizer = MWETokenizer(self.userdict)  # add custom phrases, joined with underscore '_'
        nlp = spacy.load('en_core_web_sm')  # build the spaCy pipeline
        # for word in self.userdict:  # adding custom words to spaCy; seems to have no effect
        #     lex = nlp.vocab[word]
        # clean punctuation
        quote_double_pattern = re.compile('“|”')
        quote_single_pattern = re.compile('‘|’')
        punc_pattern = re.compile(
            "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,")
        sentence = re.sub(quote_double_pattern, '"', sentence)
        sentence = re.sub(quote_single_pattern, "'", sentence)  # mind the 's and s' cases; they cannot simply be deleted
        sentence = re.sub(punc_pattern, ' ', sentence)
        # nltk and spacy yield token results; pke would need the full sentence instead
        # return nlp(' '.join(sentence.split()))  # spacy only
        return nlp(' '.join(tokenizer.tokenize(sentence.lower().split())))  # nltk + spacy: merge phrases with nltk first, then tokenize with spaCy
        # return sentence  # pke
    elif self.language == 'japanese':
        mecab = MeCab.Tagger('')  # use MeCab's tokenizer directly; custom dictionaries cannot be added yet, so some proper nouns are missed (e.g. bit/coin)
        # clean punctuation
        punc_pattern = re.compile(
            "\xa0|\t|\n|\:|\;| — | - |\!|\@|\#|\$|\%|\^|\&|\*|\_|\?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|/|・|「|」|•|※")
        sentence = re.sub(punc_pattern, ' ', sentence)
        sentence = [
            (
                chunk.split('\t')[0],
                chunk.split('\t')[1].split(',')[0]
            )
            for chunk in mecab.parse(sentence).splitlines()[:-1]
        ]  # extract the surface form and POS from each parsed entry
        return sentence
def text_process_group(mess):
    """
    1. Lower-case the input
    2. Remove punctuation except '-'
    3. Apply the custom tokenizer
    4. Return the list of clean text words
    """
    mess = mess.lower()  # str.lower() returns a new string; the result must be assigned
    # \p{P} (Unicode punctuation) requires the third-party 'regex' module; stdlib 're' does not support it
    pattern = r"[^\P{P}-]+"
    new_mess = regex.sub(pattern, " ", mess, 0)
    tokenizer = MWETokenizer(all_list, separator=' ')
    token = tokenizer.tokenize(new_mess.split())
    sw = [x for x in token if x not in stopwords.words('english')]
    return sw
class AllKeywordsMatcher: def __init__(self, keywords: Set[str]): keywords_tuples = [tuple(k.split()) for k in keywords] self.keywords = keywords self._mwe_tokenizer = MWETokenizer(keywords_tuples, separator=" ") self._punc_regex = re.compile(r"[^\w\s]") def all_occurring_keywords(self, text: str) -> List[str]: text_without_punc = self._punc_regex.sub("", text) queried_text = self._mwe_tokenizer.tokenize(text_without_punc.split()) found_words = [word for word in queried_text if word in self.keywords] return found_words
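# Example use of AllKeywordsMatcher (the keyword set is illustrative). Note
# the matcher is case-sensitive, so the text here is already lower-cased to
# match the lower-cased keywords:
matcher = AllKeywordsMatcher({"machine learning", "python"})
print(matcher.all_occurring_keywords("she applies machine learning, in python!"))
# ['machine learning', 'python']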
def multi_word_tokenizer(relevant_words, text):
    mwetokenizer = MWETokenizer()
    # add tuples of words to the multi-word tokenizer
    for word in relevant_words:
        mwetokenizer.add_mwe(tuple(str(word).split()))
    # run the multi-word tokenization
    return mwetokenizer.tokenize(text)
def sentence_filter(self, sentence):
    tokenizer = MWETokenizer(self.userdict)  # add custom phrases, joined with underscore '_'
    nlp = spacy.load('en_core_web_sm')  # build the spaCy pipeline

    quote_double_pattern = re.compile('“|”')
    quote_single_pattern = re.compile('‘|’')
    punc_pattern = re.compile(
        "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,"
    )
    sentence = re.sub(quote_double_pattern, '"', sentence)
    sentence = re.sub(quote_single_pattern, "'", sentence)  # mind the 's and s' cases; they cannot simply be deleted
    sentence = re.sub(punc_pattern, ' ', sentence)
    return nlp(' '.join(tokenizer.tokenize(
        sentence.lower().split())))  # nltk + spacy: merge phrases with nltk first, then tokenize with spaCy
def search_term(term):
    # 'tweets', 'sheet', and the row index 'i' come from the enclosing scope
    for tweet in tweets:
        tt = tweet['text'].lower()
        tm = term.lower()
        if ' ' in term:
            # multi-word term: contracting it into one token shrinks the token
            # list, so a drop in length means the term occurred in the tweet
            tokenize_term = word_tokenize(tm)
            before_tokens = len(tt.split())
            tokenizer = MWETokenizer([tokenize_term], separator=" ")
            after_tokens = len(tokenizer.tokenize(tt.split()))
            if after_tokens < before_tokens:
                sheet['H' + str(i + 1)] = sheet['H' + str(i + 1)].value + 1
        else:
            # single-word term
            if tm in word_tokenize(tt):
                sheet['H' + str(i + 1)] = sheet['H' + str(i + 1)].value + 1
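# The counting trick above in isolation: contracting a multi-word term makes
# the token list shorter, so before > after means the term occurred
# (strings illustrative):
from nltk.tokenize import MWETokenizer

tt = "global warming is real".split()
before = len(tt)
after = len(MWETokenizer([("global", "warming")], separator=" ").tokenize(tt))
print(after < before)  # True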
def tokenize_sentence(self, string, max_sentence_len, with_labels=False): merger = MWETokenizer([('<', 'unk', '>')], separator = '') sentence = word_tokenize(string.strip()) # tokenize sentence sentence = merger.tokenize(sentence) # merge <unk> if with_labels: sentence = sentence[1:] sentence = [token.lower() for token in sentence] sentence = sentence[:max_sentence_len - 2] # cut sentence at max_sentence_length sentence = ['<sos>'] + sentence + ['<eos>'] # add start and end-of-sentence tags # pad the rest of the sentence padded_sentence = sentence.copy() padded_sentence.extend(['<pad>']*(max_sentence_len - len(sentence))) return sentence, padded_sentence
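# Quick check of the '<unk>' merge performed by tokenize_sentence above:
# word_tokenize splits the marker into '<', 'unk', '>', and the MWE merger
# with an empty separator joins the pieces back together.
from nltk.tokenize import MWETokenizer, word_tokenize

merger = MWETokenizer([('<', 'unk', '>')], separator='')
print(merger.tokenize(word_tokenize("the <unk> sat")))
# ['the', '<unk>', 'sat']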
def multiword_tokenize(text, mwe):
    # Initialize the MWETokenizer
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text with the project's own tokenize(); with plain NLTK one could use:
    # tokenized_text = tokenizer.tokenize(word_tokenize(text, language='french'))
    tokenized_text = tokenizer.tokenize(tokenize(text))
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    entityset = set(raw_list.split('\n'))
    tokenizer = MWETokenizer(separator=' ')

    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []

        with open('{}/{}'.format(args.input_dir, fname), 'r') as f:
            doc = f.readlines()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                spacy_doc = nlp(item_dict['text'])  # renamed from 'doc' to avoid shadowing the file lines above
                item_dict.update({'entityMentioned': mentioned_entity})
                unigram = [token.text for token in textacy.extract.ngrams(spacy_doc, n=1, filter_nums=True, filter_punct=True, filter_stops=True)]
                item_dict['unigram'] = unigram
                tokens = [token.text for token in spacy_doc]
                pos = [token.pos_ for token in spacy_doc]
                phrases = phrasemachine.get_phrases(tokens=tokens, postags=pos)
                item_dict['phrases'] = list(phrases['counts'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
def k_tokenizer(text):
    # encode() returns bytes in Python 3, so decode back before str.replace
    text = text.encode('ascii', errors='ignore').decode('ascii').replace('-', '')
    """ We should use a better way to remove non-english words """
    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]

    """ Synonyms using wordnet """
    mwe_tokenizer = MWETokenizer([('ios', '9')])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)

    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """
    """ train -> train_NN train_V """
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # fall back to noun for any unknown tag

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc = []
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc = []
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))
    return final_doc
def processText(self):
    stop_words = set(stopwords.words('english'))
    # tokenizer = nltk.RegexpTokenizer(r"\w+")
    tokenizer = MWETokenizer([('web', 'framework'), ('file', 'system'),
                              ('command', 'line')])  # for multi-word phrases
    text_process = TextToWord.removeUseless(self, self.text)
    # do not lower-case before tokenizing: POS tagging uses capitalization to recognize proper nouns
    text_token = tokenizer.tokenize(word_tokenize(text_process))
    text_token = TextToWord.lower(self, text_token)
    filtered_sentence = [w for w in text_token if w not in stop_words]
    tagged_sent = pos_tag(filtered_sentence)  # get each word's POS tag
    wnl = WordNetLemmatizer()
    text_processed = []
    custom_dictionary = ['fs']
    for tag in tagged_sent:
        if tag[0] in custom_dictionary:
            wordnet_pos = 'NNP'
        else:
            wordnet_pos = TextToWord.get_wordnet_pos(self, tag[1]) or wordnet.NOUN
        if wordnet_pos == 'NNP':  # proper nouns are not lemmatized
            text_processed.append(tag[0])
        else:
            text_processed.append(wnl.lemmatize(tag[0], pos=wordnet_pos))  # lemmatization
    with open('C:/Users/Admin/Documents/我的坚果云/NPM_Cate/material/delete-words',
              encoding="utf-8") as files:
        delete_words_list = files.read()
    delete_words = delete_words_list.split()
    # with open('C:/Users/Admin/Documents/我的坚果云/NPM_Cate/material/uninformative-words.json', 'r') as json_file:
    #     uninformative_words = json.load(json_file)
    # delete_words = set(delete_words + uninformative_words)
    text_processed = [w for w in text_processed if w not in delete_words]
    return text_processed  # already lower-cased above
def main():
    # Testing purposes
    test_sentence1 = "merhaba, [email protected] <html>!! www.abc.com #hello selam# nasılsınız: Milli Eğitim Bakanlığı 2.01.1997'de 20:02'de aradı"
    test_sentence2 = "www.assignment.com.tr adresine gir. [email protected] a Dr. hanıma mail at."
    test_sentence3 = "bizi new jersey bekler"

    # Multiword expressions
    # Test for including multiword expressions:
    mwe = MWETokenizer([('Milli', 'Eğitim', 'Bakanlığı'), ('Bilkent', 'Üniversitesi')], separator='_')
    tokenizer = RuleBasedTokenizer()
    list_of_tokens = tokenizer.tokenize(test_sentence2)
    mwe_list_of_tokens = mwe.tokenize(list_of_tokens)
    print(mwe_list_of_tokens)  # print the MWE-merged tokens, not the raw ones
def vectorize(patient):
    tokenizer = MWETokenizer([("bleeding", "gum"), ("chest", "pain"),
                              ("abdominal", "pain"), ("muscle", "pain"),
                              ("joint", "pain"), ("eye", "pain"),
                              ("nerve", "pain"), ("ligament", "pain"),
                              ("tendon", "pain"), ("bleeding", "nose")])
    # stemmed symptom vocabulary (what PorterStemmer produces) ...
    stemmed_symptoms = [
        "headach", "vomit", "nausea", "bleeding_gum", "itch", "rash", "fever",
        "diarrhea", "discomfort", "chest_pain", "abdominal_pain", "fatigu",
        "muscle_pain", "chill", "eye_pain", "joint_pain", "nerve_pain",
        "ligament_pain", "tendon_pain", "bleeding_nos"
    ]
    # ... and the unstemmed forms used for the synonym lookup
    # (renamed from 'dict'/'dict2', which shadowed the builtin)
    symptoms = [
        "headache", "vomit", "nausea", "bleeding_gum", "itch", "rash", "fever",
        "diarrhea", "discomfort", "chest_pain", "abdominal_pain", "fatigue",
        "muscle_pain", "chill", "eye_pain", "joint_pain", "nerve_pain",
        "ligament_pain", "tendon_pain", "bleeding_nose"
    ]
    synonyms_dict = [get_synonyms(symptoms[x]) for x in range(len(symptoms))]
    tokens = tokenizer.tokenize(word_tokenize(patient))
    ps = PorterStemmer()
    modified_tokens = [ps.stem(word) for word in tokens]
    token_set = []
    arra = [0] * len(stemmed_symptoms)
    for word in modified_tokens:
        if word in stemmed_symptoms:
            # direct hit on a stemmed symptom
            token_set.append(word)
            arra[stemmed_symptoms.index(word)] = 1
        else:
            # otherwise look the word up in each symptom's synonym list
            for x in range(len(stemmed_symptoms)):
                if word in synonyms_dict[x]:
                    token_set.append(word)
                    arra[x] = 1
    return arra
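# Illustrative call to vectorize above (assumes the get_synonyms helper it
# references is defined). 'fever' is matched directly and 'chest pain' is
# merged to 'chest_pain' before stemming:
vec = vectorize("The patient reports fever and chest pain")
print(vec)  # 1s at the 'fever' and 'chest_pain' positions, e.g. vec[6] == vec[9] == 1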
def trim_bio(text):
    # keywords to return
    keywords = []

    ## the important words used to look like this before being moved to a file:
    # important_words = [["data", "science"], ["data", "scientist"],
    #     ["machine", "learning"], ["data", "engineer"], ["data", "analytics"],
    #     ["artificial", "intelligence"], ["ai"], ["phd"], ["founder"],
    #     ["professor"], ["candidate"], ["ceo"], ["student"], ["engineer"],
    #     ["computer", "science"]]

    # load from file after custom edit
    df_keyword = pd.read_csv("data/keywords/df.csv")

    ## convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()

    ## format important words so that they can be registered with the tokenizer
    important_words = [x.split() for x in important_words]

    # initialize tokenizer and register the important words
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe(iw)

    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])

    # find important words among the tokens and collect them as keywords
    for iw in important_words:
        iw_joined = "_".join(iw)
        if iw_joined in tokens:
            keywords.append(iw_joined)

    return keywords
def suggest_commplete(inSentence):
    suggestion_sentences = []
    tokenizer = MWETokenizer([('hors', "d'oeuvre"), ('program', 'me')],
                             separator='+')  # re-joins expressions that were split on spaces
    personal_dictionary = os.path.abspath(os.path.join(CUR_DIR, 'resources', 'sg_words.txt'))
    d = enchant.DictWithPWL("en_US", personal_dictionary)  # en_US dictionary plus a personal word list
    meaning_words = tokenizer.tokenize(inSentence.split())
    for word in meaning_words:
        if word in [".", "?", ","]:
            continue
        word = re.sub(r"[(),!?\'\`@-]", "", word)
        if not d.check(word):
            # build one suggestion sentence per candidate replacement of the misspelled word
            for new_word in d.suggest(word):
                suggestion_sentences.append(inSentence.replace(word, new_word))
    return suggestion_sentences
def Tokenize(text):
    tokenizer = MWETokenizer(category.all())
    # str.find() returns -1 (which is truthy) when nothing is found,
    # so test membership with 'in' instead
    for word in category:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in sub_category:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in brand:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in article:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    token = tokenizer.tokenize(text.split())
    tokens = [word.replace("_", " ") for word in token]
    return tokens
def text_process(text):
    # number removal
    if text == -2:
        return ''
    body = re.sub(r'\d+', '', text)
    # punctuation removal, i.e. [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]
    punc = "/-'?!,#$%\'()*+-/:;<=>@\\^_`{|}~[]" + '""“”’'
    body = body.translate(body.maketrans(punc, " " * len(punc)))
    # lower-case the text
    body = body.lower()
    # multi-word tokenization
    multi_word_list = [('north', 'korea'), ('south', 'korea'),
                       ('north', 'korean'), ('south', 'korean'),
                       ('kim', 'jong', 'un'), ('park', 'geun', 'hye')]
    tokenizer = MWETokenizer()
    for mw in multi_word_list:
        tokenizer.add_mwe(mw)
    text = tokenizer.tokenize(body.split())
    # stopword removal
    stopset = set(stopwords.words('english'))
    text = [x for x in text if x not in stopset]
    text = [word for word in text if len(word) > 3]
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    lemma_text = [lemmatizer.lemmatize(x) for x in text]
    return lemma_text
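# Example call to text_process above: digits, stopwords, and tokens of length
# <= 3 are dropped, and the listed multi-word names are merged before
# lemmatization (input sentence is made up):
print(text_process("North Korea and South Korea resumed talks in 2018"))
# e.g. ['north_korea', 'south_korea', 'resumed', 'talk']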
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()

    entityset = set(raw_list.split('\n'))
    tokenizer = MWETokenizer(separator=' ')

    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []

        with open('{}/{}'.format(args.input_dir, fname), 'r') as f:
            doc = f.readlines()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            # skip sentences whose subjects are all pronouns (or missing)
            if set(item_dict['nsubj']).issubset(pronoun) or item_dict['nsubj'] == []:
                continue
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                item_dict.update({'entityMentioned': mentioned_entity})
                item_dict['iid'] = '{}{}{}'.format(item_dict['did'], item_dict['pid'], item_dict['sid'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
def remove_words_tuples_corpus(self, doclist, updated_ngramlist, filter_words):
    """
    Removes the words and ngrams (tuples) listed in filter_words from the
    given document list.

    Parameters
    ----------
    doclist: list
        list of word-tokenized documents, which may or may not have been
        mwetokenized with ngrams
    updated_ngramlist: list
        ngram list from which the ngrams to be filtered have already been
        removed via the remove_words_from_ngramlist method
    filter_words: list
        words and ngrams (tuples) to be filtered from the document list

    Returns
    -------
    doclist1: list
        list of filtered documents
    """
    # accepts a doclist which hasn't been updated with the mwetokenizer yet.
    # But if it has already been mwetokenized and ngrams were introduced with '_',
    # first convert the '_'-joined ngrams back to tuples
    doclist0 = [[tuple(token.split('_')) if '_' in token else token
                 for token in doc]
                for doc in doclist]  # was 'list_tokens', an undefined name

    # removing the required tokens from the documents
    print('total length of doclist =', len(doclist0))
    print('total words in filter list', len(filter_words))
    start_time = time.time()
    filter_words = set(filter_words)
    doclist1 = [[z for z in doc if z not in filter_words] for doc in doclist0]
    print("--- %s seconds for removal ---" % (time.time() - start_time))

    # recombining the filtered ngram list and doclist using the mwetokenizer
    start_time = time.time()
    mwe_tokenizer = MWETokenizer(updated_ngramlist)
    doclist1 = [mwe_tokenizer.tokenize(doc) for doc in doclist1]
    print("--- %s seconds for MWE ---" % (time.time() - start_time))
    return doclist1
def train_phrase_model_to_xlsx(corpus_file, out_file, phrases):
    with open(corpus_file, 'r', encoding='UTF-8') as f:
        raw = ' '.join(line.strip('\n').lower() for line in f)
    raw = re.sub(r"[{}]+".format(punctuation), '', raw)
    word_tokens = word_tokenize(raw)
    tokenizer = MWETokenizer([word_tokenize(phrase) for phrase in phrases],
                             separator='_')
    word_tokens = tokenizer.tokenize(word_tokens)
    stop_words = set(stopwords.words('english'))
    word_tokens = [token for token in word_tokens if token not in stop_words]
    # note: 'size' is the gensim < 4.0 keyword; in gensim >= 4.0 it is 'vector_size'
    model = word2vec.Word2Vec([word_tokens], size=100, window=5, min_count=1, workers=5)
    df = pd.DataFrame([model.wv.get_vector(word) for word in model.wv.vocab],
                      index=model.wv.vocab)
    df.to_excel(out_file)
def get_instances(text: str, idx: int = -1) -> List[Instance]: """ Return all candidate instances from the given marked text. A candidate instance must either be directly marked or (contain only titled-case words and have <= 3 words) :param text: marked text, each entity is marked by <p>...</p> :param idx: file index :return: a list of instances """ instances = [] tokenizer = MWETokenizer([('<', 'p', '>'), ('<', '/p', '>')], separator='') for sent in sent_tokenize(text): tok_w_label = tokenizer.tokenize(word_tokenize(sent)) tok, label = extract_label(tok_w_label) pos = [get_simple_pos(t[1]) for t in pos_tag(tok)] assert len(pos) == len(tok) instances += _get_instances(tok, pos, label, idx) return instances
def get_context(self, query_str, text, k=10):
    if query_str in text:
        tokenizer = MWETokenizer()
        query_str_tokens = tuple(query_str.split())
        query_str_dashed = "_".join(query_str_tokens)
        tokenizer.add_mwe(query_str_tokens)
        text_token = tokenizer.tokenize(text.split())
        try:
            t_start = text_token.index(query_str_dashed)
        except ValueError:
            return None, None, None
        t_end = t_start + 1  # exclusive end of the contracted query token
        start_index = max(t_start - k, 0)
        end_index = min(t_end + k, len(text_token))
        # t_end is already exclusive; slicing from t_end + 1 would drop the
        # token immediately after the query
        text_token_query = text_token[start_index:t_start] + text_token[t_end:end_index]
        context = " ".join(text_token_query)
        context_mention = text_token[start_index:t_start] + [query_str] + text_token[t_end:end_index]
        context_mention = " ".join(context_mention)
        return context, text_token_query, context_mention
    else:
        return None, None, None
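# The core trick from get_context above in isolation: contracting the query
# into a single token makes its position recoverable with list.index()
# (strings illustrative):
from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer()
tokenizer.add_mwe(("deep", "learning"))
tokens = tokenizer.tokenize("courses on deep learning and vision".split())
print(tokens.index("deep_learning"))  # 2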
flags = reduce(lambda ls, d: ls + d["flags"], states, [])
tokenizer = MWETokenizer(flags)

## Add flags in the token representation for each state
for state in states:
    state["token_flags"] = list(map(lambda phr: '_'.join(phr), state["flags"]))

for trans in transcript:
    line = trans["transcript"]
    line_no_punct = line.translate(str.maketrans('', '', PUNCT)).lower()
    tokens = tokenizer.tokenize(line_no_punct.split())
    # for each token, scan the states and keep (token, state) for the last
    # state whose flags contain the token; None when no state matches
    ls = list(
        map(
            lambda tok: reduce(
                lambda p_res, s: (tok, s["state"])
                if (tok in s["token_flags"]) else p_res, states, None),
            tokens))
    states_found = list(filter(None, ls))
    if states_found != []:
        timing = (trans["word_timings"][0]["start_time"],
                  trans["word_timings"][-1]["end_time"])
        print('\n', line, '\n', states_found, timing)
# basic search: looks for food words within tw_sentence
# match = re.search(r"[Cc]loud.?\W+(?:\w+\W+){0,4}?[Tt]aste.?(?:\w+\W+){0,4}?[Ll]ike[^.](?:\w+\W+)[^.]*", tw_sentence)

# basic2 search: looks for food words within tw_sentence in both directions
match = re.search(r"[^\.\!\?\n]*(?:[Tt]aste.?\W+(?:\w+\W+){0,2}?[Ll]ike[\s][Cc]loud.?|[Cc]loud.?\W+(?:\w+\W+){0,2}?[Tt]aste.?\W+(?:\w+\W+){0,2}?[Ll]ike.?\W+(?:\w+\W+))[^\.\!\?\n]*", tw_sentence)

# only output sentences that contain the phrase clouds taste like <food from database>
phrase = match.group() if match else None

if phrase:
    phrase = phrase.lower()
    mwe_tokens = mwe_tokenizer.tokenize(phrase.split())
    for mwe_token in mwe_tokens:
        for list_join_wd_permutation in list_join_wd_permutations:
            if mwe_token == list_join_wd_permutation:
                print(mwe_tokens)
                if list_join_wd_permutation not in counter:
                    print('Adding new food to dictionary...')
                    counter[list_join_wd_permutation] = 1
                else:
                    print('Incrementing existing food in dictionary...')
                    counter[list_join_wd_permutation] += 1
                print("Dictionary is: ", counter)
                print('---')
def extract_expressions(self, document, features=None):
    """Returns expressions from given features and multi-word expressions.

    In addition to passing a document into this method, MWEs or Multi-Word
    Expressions can be given to treat some multi words as one expression.

    >>> from document import ArthurDocument
    >>> pdf_path = base_path + '/test/test.pdf'
    >>> with open(pdf_path, 'rb') as f:
    ...     document = ArthurDocument(f.read())
    >>> features = document.get_features()[730:816,:]
    >>> print(document.get_text(features)) # doctest:+ELLIPSIS
    VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

    Multi-word expression should be detected:
    >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
    >>> expressions = clusterer.extract_expressions(document, features)
    >>> print(expressions[2]['text'])
    CROWN JEWEL

    x position should equal x of "C" from "CROWN JEWEL":
    >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
    True

    and width should equal the width of "CROWN JEWEL":
    >>> expr_width = expressions[2]['x1']-expressions[2]['x']
    >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
    >>> expr_width == ftr_width
    True

    Args:
        document(ArthurDocument): Document to extract data fields from.
        features(list): List of features containing data fields to extract.
            If not given, use all document features.

    Returns:
        np.array: An array of data_fields.

    Note:
        The Multi-Word Expressions are taken from self.mwes. Example value:
        `['property type', 'single family']`. With that list, both
        "property type" and "single family" will each be treated as a single
        expression.
    """
    mwes = self.mwes
    if features is None:
        features = document.get_features()
    text = document.get_text(features)
    for idx, mwe in enumerate(mwes):
        if isinstance(mwe, str):
            mwes[idx] = word_tokenize(mwe.lower())
        elif hasattr(mwe, '__iter__'):
            mwes[idx] = [x.lower() for x in mwe]
    tokenizer = MWETokenizer(mwes, separator=' ')
    tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

    expressions = []
    pos = 0
    for token in tokenized:
        # a token could be "deez nutz" while the text contains multiple spaces,
        # e.g. "deez  nutz", so split the token and find the positions of its
        # first and last characters.
        words = token.split()
        start_pos = text.lower().index(words[0], pos)
        for word in words:
            ipos = text.lower().index(word, pos)
            end_pos = ipos + len(word)
            pos = end_pos
        min_x = 0
        max_x = 0
        min_y = 0
        max_y = 0
        page = 0
        if len(features[start_pos:end_pos, :]) > 0:  # was len(... > 0), a misplaced parenthesis
            min_x = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x')]
            max_x = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x1')]
            min_y = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y')]
            max_y = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y1')]
            page = features[start_pos, ArthurDocument.get_feature_id('page')]
        expressions.append({
            'text': text[start_pos:end_pos],
            'x': min_x,
            'x1': max_x,
            'y': min_y,
            'y1': max_y,
            'page': page
        })
    return expressions
tool.append((0, 1)) abstracts.append(data[i]['abstract'] + ' ' + data[i]['title']) abstracts[i] = re.sub(r'https?:\/\/github\.com\S*', 'githuburl', abstracts[i]) abstracts[i] = re.sub(r'https?:\/\/bioconductor\.org\S*', 'bioconductorurl', abstracts[i]) abstracts[i] = re.sub(r'https?:\/\/sourceforge\.net\S*', 'sourceforgeurl', abstracts[i]) abstracts[i] = re.sub(r'https?:\/\/bitbucket\.org\S*', 'bitbucketurl', abstracts[i]) # remove all other urls abstracts[i] = re.sub(r'https?:\/\/\S*', 'url', abstracts[i]) abstracts[i] = re.sub(r'www\.\S*', 'url', abstracts[i]) # remove email addresses abstracts[i] = re.sub(r'\S*@\S*', ' ', abstracts[i]) # change slashes to spaces abstracts[i] = re.sub(r'\/', ' ', abstracts[i]) abstracts[i] = word_tokenize(abstracts[i]) abstracts[i] = tokenizer.tokenize(abstracts[i]) # filter stopwords and punctuation stopwords = nltk.corpus.stopwords.words('english') # lemmatize words #es = EnglishStemmer() # dictionary created to be used later when preprocessing testing data dictionary = set() fdist = FreqDist() #if not args.train: with open('dict.csv', 'r', newline='') as csvfile: reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
ps = PorterStemmer() print("Stemming:") for word in filter(lambda a: " " not in a, words): print("{} => {} / {} / {}".format(word, lmtzr.lemmatize(word), st.stem(word), ps.stem(word))) sentiment_bag.add(word) sentiment_bag.add(st.stem(word)) # I like this one the best # Process all the lists for (label, files) in sorted(makecloud.TRANSCRIPTS.items()): scores = [] print("{}:\n{}=".format(label, "=" * len(label))) target_words = [] for fname in files: scount = 0 tokens = word_tokenize(raw(fname)) tokens = mwe_tokenizer.tokenize(tokens) tokens = list(filter(st.stem, tokens)) bar = "" for t in tokens: if t in sentiment_bag: bar += "*" scount += 1 target_words.append(t) score = scount / len(tokens) print("{:35s} {:3.6f} {:4} {}".format(os.path.basename(fname), score, scount, bar)) scores.append(score) print("\n{} Average Score of {}: {:3.6f}".format(str.capitalize(label), len(scores), sum(scores) / len(scores))) print("\n\n") makecloud.cloud_for_document(outfile=label + ".png", fulltext=" ".join(target_words))
def umbc_sim(title1, title2):
    '''
    compares the similarity of title1 and title2
    :param title1:
    :param title2:
    :return: a similarity score between 0 and 1 (higher means more similar)
    '''
    title1 = title_prepocessing(title1)
    title2 = title_prepocessing(title2)

    tokenizer = MWETokenizer(wn_bst.multi_words_xpn())
    tokens1 = tokenizer.tokenize(title1.split())
    tagged1 = nltk.pos_tag(tokens1)
    tokens2 = tokenizer.tokenize(title2.split())
    tagged2 = nltk.pos_tag(tokens2)

    # remove tokens that are not supported by WordNet
    tagged1 = [x for x in tagged1 if not wn_bst.get_wordnet_pos(x[1]) == '']
    tagged2 = [x for x in tagged2 if not wn_bst.get_wordnet_pos(x[1]) == '']

    # use a matrix to store pairwise similarities for later use
    # (assumes both titles still contain WordNet-supported tokens, i.e. len1, len2 > 0)
    len1 = len(tagged1)
    len2 = len(tagged2)
    Matrix = np.zeros((len2, len1))
    result1 = {}
    result2 = {}

    # for each token of title1, find its most similar counterpart in title2
    for x in range(len1):
        token1 = tagged1[x][0]
        pos1 = tagged1[x][1]
        simi = 0
        counterpart1 = ''
        for y in range(len2):
            token2 = tagged2[y][0]
            pos2 = tagged2[y][1]
            Matrix[y, x] = sim(token1, pos1, token2, pos2)
            if Matrix[y, x] > simi:
                simi = Matrix[y, x]
                counterpart1 = token2
        penalty1 = umbc_penalty(token1, pos1, tokens1, simi, counterpart1)
        result1[token1] = {'sim': simi, 'p': penalty1, 'counter': counterpart1}

    # and symmetrically for each token of title2, reusing the stored matrix
    for y in range(len2):
        token2 = tagged2[y][0]
        pos2 = tagged2[y][1]
        simi = 0
        counterpart2 = ''
        for x in range(len1):
            if Matrix[y, x] > simi:
                simi = Matrix[y, x]
                counterpart2 = tagged1[x][0]
        penalty2 = umbc_penalty(token2, pos2, tokens2, simi, counterpart2)
        result2[token2] = {'sim': simi, 'p': penalty2, 'counter': counterpart2}

    sum1 = float(umbc_sum(result1))
    sum2 = float(umbc_sum(result2))
    score = sum1 / (2 * len1) + sum2 / (2 * len2)

    # clip the lower bound
    if score < 0:
        score = 0
    return score