def get_context(self, query_str, text, k=10):
    if query_str in text:
        tokenizer = MWETokenizer()
        query_str_tokens = tuple(query_str.split())
        query_str_dashed = "_".join(query_str_tokens)
        tokenizer.add_mwe(query_str_tokens)
        text_token = tokenizer.tokenize(text.split())
        try:
            t_start = text_token.index(query_str_dashed)
        except ValueError:
            return None, None, None
        t_end = t_start + 1
        start_index = max(t_start - k, 0)
        end_index = min(t_end + k, len(text_token))
        text_token_query = text_token[start_index:t_start] + text_token[t_end + 1:end_index]
        context = " ".join(text_token_query)
        context_mention = text_token[start_index:t_start] + [query_str] + text_token[t_end + 1:end_index]
        context_mention = " ".join(context_mention)
        return context, text_token_query, context_mention
    else:
        logging.info('error, query not in text')
        return None, None, None
def __init__(self, locations):
    self.tokenizer = MWETokenizer()
    self.time_tagger = TimeTagger()
    for a in locations:
        self.tokenizer.add_mwe(a.split())
    # Rules defined
    self.specials = {
        "ACTIVITY": activities.union(["driving", "flight"]),
        "REGION": regions,
        "KEYWORD": [word for word in all_keywords if ' ' in word],
        "LOCATION": locations,
        "QUANTITY": ["at least", "more than", "less than", "at most",
                     "not more than", "a number of"],
        "IN": ["in front of", "called"],
        "NN": [phrase.replace("_", " ") for phrase in list(phrases.keys())],
        "SPACE": ["living room", "fastfood restaurant", "restaurant kitchen",
                  "restaurant", "dining hall", "food court", "butchers shop",
                  "restaurant patio", "coffee shop", "room", "hotel room",
                  "kitchen", "office", "airport", "salon"],
        "POSITION": ["side", "foreground", "background", "right", "left", "image"],
        "TOBE": ["am", "is", "are", "be", "is being", "am being", "are being", "being"],
        "WAS": ["was", "were", "had been", "have been"],
        "TIMEPREP": ["prior to", "then", "earlier than", "later than", "sooner than"],
        "POSITION_PREP": ["near", "distance to"],
    }
    for tag in self.specials:
        for keyword in self.specials[tag]:
            if ' ' in keyword:
                self.tokenizer.add_mwe(keyword.split())
def timexTagAndTokenizeText(self, altText=None):
    """Two steps are required here, so if altText is specified, all steps are done
    inside the if statement so that incorrect dict entries aren't stored."""
    if altText is not None:
        raw = altText
        altOutputStep1 = self.timexTagText(raw)
        altOutputStep2 = self.wordTokenizeText(altOutputStep1)
        time_tagged_and_tokenizedText = MWETokenizer(
            mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
            separator='').tokenize(altOutputStep2)
        return time_tagged_and_tokenizedText
    else:
        # Tag all temporal expressions with TIMEX2 tags.
        # No need to open the file here, because it's opened in timexTagText().
        tagged = self.timexTagText()
        # Word-tokenize all text above.
        word_tagged = self.wordTokenizeText(tagged)
        # Consolidate all broken-apart TIMEX2 tags into single "words".
        if self.textList.get('timexTagAndTokenizeText') is None:
            self.textList['timexTagAndTokenizeText'] = [
                MWETokenizer(mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
                             separator='').tokenize(x)
                for x in word_tagged
            ]
        print(self.textList.get('timexTagAndTokenizeText'))
        return self.textList.get('timexTagAndTokenizeText')
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the previously tokenized documents from file
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)
        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
def segment(text, userdict_filepath="userdict2.txt", stopwords_filepath='stopwords.txt'):
    import nltk
    stopwords = [
        line.strip().lower()
        for line in open(stopwords_filepath, 'r', encoding='utf-8').readlines()
    ]
    final_list = []
    temp_list = []
    with open(userdict_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            temp_list.append(line.strip(' ').strip('\n'))
    temp = []
    for line in temp_list:
        for li in line.lower().split(' '):
            if len(li) != 0:
                temp.append(li.strip('\t'))
        final_list.append(tuple(temp))
        temp.clear()
    userdict_list = final_list
    tokenizer = MWETokenizer(userdict_list, separator=' ')
    seg_list = tokenizer.tokenize(nltk.word_tokenize(remove_symbols(text).lower()))
    seg_list_without_stopwords = []
    for word in seg_list:
        if word not in stopwords and word != '\t':
            seg_list_without_stopwords.append(word)
    return seg_list_without_stopwords
def __init__(self, ents=None, tag2ent=None, collocations=special_collocations,
             appos=collocations.appos):
    self.__tokenizer = TweetTokenizer(reduce_len=True)
    self.__collocations = collocations
    self.__tknzr = MWETokenizer(self.__collocations)
    self.__lemm = WordNetLemmatizer()
    self.__nlp = spacy.load("en_core_web_sm")
    if ents is None:
        self.__ents = {}
        if tag2ent is not None:
            raise ValueError("ent2tag and ents should be None or not None both")
        self.__tag2ent = {}
    else:
        if tag2ent is None:
            raise ValueError("ent2tag and ents should be None or not None both")
        self.__ents = ents
        self.__tag2ent = tag2ent
    self.__appos = appos
    for a in appos:
        self.__appos[a] = '_'.join(self.__appos[a].split())
    self.__punctuation = punctuation + "“”‘’‚"
    self.__stop_symbols = '←↓→↑'
def trim_bio(text):
    # keywords to return
    keywords = []
    # load from file after custom edit
    df_keyword = pd.read_csv(local_data + "data/keywords/df.csv")
    # convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()
    # format important words so that they can be registered to the tokenizer
    important_words = [x.split() for x in important_words]
    # initialize tokenizer and add important words
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe(iw)
    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])
    # find important words among the tokens and append them to keywords
    for iw in important_words:
        iw_joined = "_".join(iw)
        if iw_joined in tokens:
            keywords.append(iw_joined)
    return keywords
def __init__(self, filename):
    """Initializes a LyricsCleaner object."""
    self._filename = filename
    self._tokenizer = MWETokenizer()
    for word in SIGNAL_WORDS:
        self._tokenizer.add_mwe(('[', word, ']'))
    self._stemmer = LancasterStemmer()
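# A minimal standalone sketch of the bracket-merging idea above ("chorus" stands in
# for an entry of SIGNAL_WORDS, which is assumed to hold section labels): once the
# brackets and the word are separate tokens, the MWE entry re-joins them into one token.
t = MWETokenizer([('[', 'chorus', ']')], separator='')
print(t.tokenize("[ chorus ] sing it again".split()))
# ['[chorus]', 'sing', 'it', 'again']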
def tokenization(docs):
    documents = {}
    for doc in docs:
        document_plain = docs[doc]
        document_plain = document_plain.replace("/", "").replace("-", " ")
        # remove parenthesized numbers, e.g. "(12)"
        document_plain = re.sub(r'\([0-9]*\)', '', document_plain)
        relevant_words = []
        mwetokenizer = MWETokenizer()
        document_ner = spacy_nlp(document_plain)
        for element in document_ner.ents:
            # don't consider numbers
            if element.label_ != "CARDINAL":
                relevant_words.append(element)
        # for each relevant word that contains whitespace,
        # create a single token covering all of its words
        for word in relevant_words:
            token = str(word).split()
            if len(token) > 1:
                mwetokenizer.add_mwe(tuple(token))
        document_tokenized = word_tokenize(document_plain)
        document_retokenized = mwetokenizer.tokenize(document_tokenized)
        documents[doc] = document_retokenized
    return documents
def initialize_known_phrase_tokenization(phrases):
    from nltk.tokenize import MWETokenizer
    tokenizer = MWETokenizer()
    for phrase in phrases:
        if phrase:
            phrase_as_list = phrase.replace("_", " ").split()
            tokenizer.add_mwe(phrase_as_list)
    return tokenizer
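# A minimal usage sketch (the phrase list is made up, not from the original code):
# the returned tokenizer joins each known phrase into a single underscore-separated token.
tokenizer = initialize_known_phrase_tokenization(["machine_learning", "data science"])
print(tokenizer.tokenize("she studies machine learning and data science".split()))
# ['she', 'studies', 'machine_learning', 'and', 'data_science']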
def multiword_tokenizer(token_list, bigram_list):
    """Tokenize a list of unigram tokens into bigram tokens, given a list of bigrams.
    Bigrams are joined with "__"."""
    mwetokenizer = MWETokenizer(bigram_list, separator="__")
    return mwetokenizer.tokenize(token_list)
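# A minimal usage sketch (made-up tokens): any bigram from bigram_list that occurs
# in sequence is collapsed into one token joined by "__".
print(multiword_tokenizer(["new", "york", "city", "traffic"], [("new", "york")]))
# ['new__york', 'city', 'traffic']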
def fit(self, X, **fit_params):
    """
    Procedure to iteratively contract bigrams (up to max_collocation_iterations times)
    that score higher on the collocation_function than the min_collocation_score
    (and satisfy other criteria set out by the optional parameters).
    """
    self.tokenization_ = X
    n_tokens = sum([len(x) for x in X])
    for i in range(self.max_iterations):
        bigramer = BigramCollocationFinder.from_documents(self.tokenization_)
        if self.ignored_tokens is not None:
            ignore_fn = lambda w: w in self.ignored_tokens
            bigramer.apply_word_filter(ignore_fn)
        if self.excluded_token_regex is not None:
            exclude_fn = lambda w: re.fullmatch(self.excluded_token_regex, w) is not None
            bigramer.apply_word_filter(exclude_fn)
        if self.min_token_occurrences is not None:
            minocc_fn = lambda w: bigramer.word_fd[w] < self.min_token_occurrences
            bigramer.apply_word_filter(minocc_fn)
        if self.max_token_occurrences is not None:
            maxocc_fn = lambda w: bigramer.word_fd[w] > self.max_token_occurrences
            bigramer.apply_word_filter(maxocc_fn)
        if self.min_token_frequency is not None:
            minfreq_fn = lambda w: bigramer.word_fd[w] < self.min_token_frequency * n_tokens
            bigramer.apply_word_filter(minfreq_fn)
        if self.max_token_frequency is not None:
            maxfreq_fn = lambda w: bigramer.word_fd[w] > self.max_token_frequency * n_tokens
            bigramer.apply_word_filter(maxfreq_fn)
        if self.min_ngram_occurrences is not None:
            bigramer.apply_freq_filter(self.min_ngram_occurrences)
        new_grams = list(bigramer.above_score(self.score_function, self.min_score))
        if len(new_grams) == 0:
            break
        self.mtes_.append(new_grams)
        contracter = MWETokenizer(new_grams)
        self.tokenization_ = tuple(
            tuple(contracter.tokenize(doc)) for doc in self.tokenization_
        )
    return self
def clean_text(dirty_text, external_vocab_filename=None, external_vocab_level="no"):
    """(str, str, str) -> str

    external_vocab_level can be: no, with_ngrams, only.
    If you choose with_ngrams or only, you need to add an external_vocab_filename.

    The text is cleaned in the following way:
    # substitute word1.word2 by word1. word2
    # split text into rough sentences based on '<\\%>'. This symbol was added to denote
      a new line in the original product description
    # split the rough sentences using a sentence tokenizer from nltk
    # if zappos_ngrams is True, combine the zappos ngrams into one token,
      e.g., short sleeves -> short_sleeves
    # concatenate all tokenized words into one string and return the string.

    An excerpt of text from the item looks like this:

    Sheath/Column One-Shoulder Short/Mini Bridesmaid Dress With Lace<\\%>SKU#:1020234<\\%>New Style Cocktail Dress<\\%>
    Color:The color as picture<\\%>Category:Bridesmaid Dress<\\%> Brand:Biggoldapple<\\%>
    Silhouette:Sheath/Column<\\%> Neckline:One-Shoulder<\\%> Hemline/Train:Short/Mini<\\%>
    """
    external_vocab_words = []
    if external_vocab_filename is not None:
        external_vocab_words = load_vocab_words(external_vocab_filename)

    # transform ngrams into tuples; ngrams are assumed to be separated by underscore: word1_word2
    external_vocab_with_tuples = [tuple(z.split('_')) for z in external_vocab_words]

    # multi-word tokenizer, more info: http://www.nltk.org/api/nltk.tokenize.html
    tokenizer_mwe = MWETokenizer(external_vocab_with_tuples)  # all external_vocab_words are added

    out_clean_text = ''
    # substitute word1.word2 by word1. word2
    dirty_text = re.sub(r"(\w[A-Z]|[a-z.])\.([^.)\s])", r"\1. \2", dirty_text)
    rough_sentences = dirty_text.split('<\\%>')  # sentences based on splitting by '<\\%>'
    sentences = []
    for rs in rough_sentences:
        rs = rs.replace("3/4", "3_4")  # just to keep the 3/4 as 3_4
        sentences.extend(SENT_TOKENIZER.tokenize(rs))  # sentences based on NLTK tokenizer
    for sentence in sentences:
        # tokenize into words, ignoring that the zappos vocabulary exists
        words = WORD_TOKENIZER.tokenize(sentence.lower())
        if external_vocab_level == 'with_ngrams':
            # keep all words (even those not in zappos), but group zappos ngrams into one token
            words = tokenizer_mwe.tokenize(words)
        elif external_vocab_level == 'only':
            words = tokenizer_mwe.tokenize(words)  # group zappos ngrams into one token
            words = [w for w in words if w in external_vocab_words]  # only keep words in the zappos vocabulary
        words = [w for w in words if (not w.isdigit() or w == '3_4')]  # remove words that are just digits, but leave 3_4
        words_concat = ' '.join(words) + '\n'
        out_clean_text += words_concat
    return out_clean_text
def tokenize_and_remove_punct(text):
    # strip all punctuation characters
    text = text.translate(str.maketrans('', '', string.punctuation))
    # no MWEs are registered here, so this is effectively a whitespace pass-through
    mtokenizer = MWETokenizer()
    mwe = mtokenizer.tokenize(text.split())
    words = []
    for t in mwe:
        if t.isalpha():
            words.append(t)
    return words
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, agg_score, pid = params
    idf_list = [*idf]
    idf_set = set(idf_list)
    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))
    phrases_score = {}
    for phrase in tqdm(list_phrases, desc='phrase-eval-{}'.format(pid), mininterval=10):
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        raw_tokenized = tokenizer.tokenize(tokens)
        tokenized_set = set(raw_tokenized)
        keywords = tokenized_set.intersection(unigram_set)
        for token in keywords:
            score += agg_score[token]
        score /= (1 + np.log(len(nonstop_tokens)))
        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)
        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = target_token_freq[token] / len(target_token) * idf[token]
        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = phrase_token_freq[token] / len(tokens) * idf[token]
        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)
        phrases_score.update({phrase: {'score': score, 'eval': tfidf_sim}})
    rearrange = {}
    for k, v in phrases_score.items():
        rearrange.update({k: v['score']})
    top_10 = nlargest(10, rearrange, key=rearrange.get)
    return {key: phrases_score[key] for key in top_10}
def tokenizer_sent(dataset):
    tokenizer = MWETokenizer()
    aspect_tokenized = []
    sentence_tokenized = []
    for i in range(0, len(dataset.index)):
        aspect_split = tuple(dataset['aspect_term'][i].lower().split())
        # add_mwe returns None, so store the aspect tuple itself
        tokenizer.add_mwe(aspect_split)
        aspect_tokenized.append(aspect_split)
    for j in range(0, len(dataset.index)):
        tok = nltk.pos_tag(tokenizer.tokenize(dataset['text'][j].lower().split()))
        sentence_tokenized.append(tok)
    # return the collected aspect and sentence tokens
    return aspect_tokenized, sentence_tokenized
def multiword_tokenize(text, mwe):
    # word_tokenize splits contractions ("Don't" => 'Do', "n't") and keeps sentence
    # punctuation such as ',' and '.' as separate tokens
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    tokenized_text = tokenizer.tokenize(word_tokenize(text))
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
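# A minimal usage sketch (example strings are made up): protected multi-word
# expressions survive tokenization as single items and are restored to their original form.
print(multiword_tokenize("He lives in New York City.", ["New York City"]))
# ['He', 'lives', 'in', 'New York City', '.']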
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the previously tokenized documents from file
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                # Tokenize the sentence with the regex tokenizer, then with the MWETokenizer
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                # Lower the case of all the tokens
                token_lowercase = [x.lower() for x in tokens]
                # Lemmatize the sentence: find the POS tags, then lemmatize
                tokens_lowecase_tagged = nltk.pos_tag(token_lowercase)
                lammetized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                                       for wrd, tag in tokens_lowecase_tagged]
                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lammetized_sentence]
                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]
                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)
        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
def sentence_filter(self, sentence):
    # Preliminary tokenization and cleaning of the sentence
    if self.language == 'chinese':
        import jieba.posseg as psg
        # jieba's POS-tagging interface handles tokenization and cleaning directly
        return psg.cut(sentence)
    elif self.language == 'english':
        from nltk.tokenize import MWETokenizer  # use the MWE tokenizer
        # register custom phrases; words in a phrase are joined with '_'
        tokenizer = MWETokenizer(self.userdict)
        nlp = spacy.load('en_core_web_sm')  # build the spacy tokenizer
        # for word in self.userdict:  # adding custom words to spacy seems to have no effect
        #     lex = nlp.vocab[word]
        # clean punctuation
        quote_double_pattern = re.compile('“|”')
        quote_single_pattern = re.compile('‘|’')
        punc_pattern = re.compile(
            "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,")
        sentence = re.sub(quote_double_pattern, '"', sentence)
        # keep apostrophes: 's and s' cases cannot simply be deleted
        sentence = re.sub(quote_single_pattern, "'", sentence)
        sentence = re.sub(punc_pattern, ' ', sentence)
        # nltk + spacy give token results; pke expects the full sentence instead
        # return nlp(' '.join(sentence.split()))  # spacy only
        # nltk + spacy: first merge phrases with nltk, then tokenize with spacy
        return nlp(' '.join(tokenizer.tokenize(sentence.lower().split())))
        # return sentence  # pke
    elif self.language == 'japanese':
        # mecab's tokenizer produces the result directly; custom dictionaries cannot be added
        # for now, so some proper nouns are not recognized (e.g. "bitcoin" split as 比特/币)
        mecab = MeCab.Tagger('')
        # clean punctuation
        punc_pattern = re.compile(
            "\xa0|\t|\n|\:|\;| — | - |\!|\@|\#|\$|\%|\^|\&|\*|\_|\?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|/|・|「|」|•|※")
        sentence = re.sub(punc_pattern, ' ', sentence)
        # extract the surface form and POS from each parsed chunk
        sentence = [
            (chunk.split('\t')[0], chunk.split('\t')[1].split(',')[0])
            for chunk in mecab.parse(sentence).splitlines()[:-1]
        ]
        return sentence
def text_process_group(mess):
    """
    1. Lower case the input
    2. Remove punctuation except '-'
    3. Apply custom tokenizer
    4. Return column of clean text words
    """
    mess = mess.lower()
    # note: the \P{P} Unicode-property class requires the third-party `regex` module
    regex = r"[^\P{P}-]+"
    new_mess = re.sub(regex, " ", mess, 0)
    tokenizer = MWETokenizer(all_list, separator=' ')
    token = tokenizer.tokenize(new_mess.lower().split())
    sw = [x for x in token if x not in stopwords.words('english')]
    return sw
def sentence_filter(self, sentence):
    # register custom phrases; words in a phrase are joined with '_'
    tokenizer = MWETokenizer(self.userdict)
    nlp = spacy.load('en_core_web_sm')  # build the spacy tokenizer
    quote_double_pattern = re.compile('“|”')
    quote_single_pattern = re.compile('‘|’')
    punc_pattern = re.compile(
        "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,"
    )
    sentence = re.sub(quote_double_pattern, '"', sentence)
    # keep apostrophes: 's and s' cases cannot simply be deleted
    sentence = re.sub(quote_single_pattern, "'", sentence)
    sentence = re.sub(punc_pattern, ' ', sentence)
    # nltk + spacy: first merge phrases with nltk, then tokenize with spacy
    return nlp(' '.join(tokenizer.tokenize(sentence.lower().split())))
def multi_word_tokenizer(relevant_words, text):
    mwetokenizer = MWETokenizer()
    # add tuples of words to the multi-word tokenizer
    for word in relevant_words:
        token = str(word).split()
        mwetokenizer.add_mwe(tuple(token))
    # execute multi-word tokenization
    return mwetokenizer.tokenize(text)
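# A minimal usage sketch (made-up inputs): `text` is expected to be a list of word
# tokens, and every whitespace-separated phrase in relevant_words gets merged.
print(multi_word_tokenizer(["climate change"], ["talks", "on", "climate", "change", "stalled"]))
# ['talks', 'on', 'climate_change', 'stalled']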
def tokenize_sentence(self, string, max_sentence_len, with_labels=False):
    merger = MWETokenizer([('<', 'unk', '>')], separator='')
    sentence = word_tokenize(string.strip())  # tokenize sentence
    sentence = merger.tokenize(sentence)  # merge <unk>
    if with_labels:
        sentence = sentence[1:]
    sentence = [token.lower() for token in sentence]
    sentence = sentence[:max_sentence_len - 2]  # cut sentence at max_sentence_len
    sentence = ['<sos>'] + sentence + ['<eos>']  # add start- and end-of-sentence tags
    # pad the rest of the sentence
    padded_sentence = sentence.copy()
    padded_sentence.extend(['<pad>'] * (max_sentence_len - len(sentence)))
    return sentence, padded_sentence
def multiword_tokenize(text, mwe):
    # Initialize the MWETokenizer
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    # tokenized_text = tokenizer.tokenize(word_tokenize(text, language='French'))
    tokenized_text = tokenizer.tokenize(tokenize(text))
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
def spans(txt):
    # instantiate the tokenizer; calling tokenize on the class itself would fail
    # (no MWEs are registered here, so this is effectively plain word tokenization)
    tokens = MWETokenizer().tokenize(word_tokenize(txt))
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset, offset + len(token)
        offset += len(token)
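# A minimal usage sketch (made-up sentence): spans() yields each token together
# with its character offsets in the original string.
for token, start, end in spans("red apples taste good"):
    print(token, start, end)
# red 0 3
# apples 4 10
# taste 11 16
# good 17 21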
def __init__(self, address_and_gps):
    self.tokenizer = MWETokenizer()
    self.time_tagger = TimeTagger()
    for a in address_and_gps:
        self.tokenizer.add_mwe(a.split())
    # Rules defined
    self.specials = {
        "QUANTITY": ["at least", "more than", "less than", "at most",
                     "not more than", "a number of"],
        "IN": ["in front of", "called"],
        "NN": ["cafe sign", "traffic light", "fire hydrant", "stop sign",
               "parking meter", "baseball bat", "baseball glove", "cell phone",
               "teddy bear", "hair drier", "airport vehicles", "airport vehicle",
               "screen"],
        "SPACE": ["living room", "fastfood restaurant", "restaurant kitchen",
                  "restaurant", "dining hall", "food court", "butchers shop",
                  "restaurant patio", "coffee shop", "room", "hotel room",
                  "kitchen", "office", "airport", "salon"],
        "POSITION": ["side", "foreground", "background", "right", "left", "image"],
        "LOCATION": ["home", "school", "oslo", "norway", "hotel", "tromso",
                     "bank", "ireland", "china", "japan", "vietnam", "dcu",
                     "dublin", "dublin city university"],
        "TOBE": ["am", "is", "are", "be", "is being", "am being", "are being", "being"],
        "WAS": ["was", "were", "had been", "have been"],
        "TIMEPREP": ["prior to", "then"],
        "POSITION_PREP": ["near", "distance to"]
    }
    for tag in self.specials:
        for keyword in self.specials[tag]:
            if ' ' in keyword:
                self.tokenizer.add_mwe(keyword.split())
def _init_mwe_tokenizer(self):
    def multi_word_expressions():
        for entity in self.vocab:
            if entity.find(self._PHRASE_DELIMITER) != -1:
                yield entity.split(self._PHRASE_DELIMITER)
    it = multi_word_expressions()
    self._mwe_tokenizer = MWETokenizer(it)
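# A minimal standalone sketch of the same idea (the vocab and "_" delimiter are made up,
# not from the original class): MWETokenizer accepts any iterable of token sequences,
# so a generator over a delimited vocabulary works directly.
vocab = ["ice_cream", "tea", "hot_dog"]
mwes = (entity.split("_") for entity in vocab if "_" in entity)
mwe_tokenizer = MWETokenizer(mwes)
print(mwe_tokenizer.tokenize("he ordered a hot dog and ice cream".split()))
# ['he', 'ordered', 'a', 'hot_dog', 'and', 'ice_cream']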
def k_tokenizer(text):
    # Python 2 code: encode() returns a str here, so replace() works directly
    # (under Python 3 this would need a .decode() first)
    text = text.encode('ascii', errors='ignore').replace('-', '')
    """ We should use a better way to remove non-english words """
    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)
    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]
    """ Synonyms using wordnet """
    mwe_tokenizer = MWETokenizer([('ios', '9'), ])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)
    """ We might want to tokenize by sentence and then tag each sentence
        and aggregate the results """
    """ train -> train_NN train_V """
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc = []
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)
    # porter = PorterStemmer()
    # final_doc = []
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))
    return final_doc
def tokenize(multi_word_queries, text):
    """Returns a list of words that make up the text.

    Params: {text: String}
    Returns: List
    """
    lower_case = text.lower()
    tokenizer = RegexpTokenizer(r'not\s+very\s+[a-z]+|not\s+[a-z]+|no\s+[a-z]+|[a-z]+')
    result = tokenizer.tokenize(lower_case)
    multi_tokenizer = MWETokenizer([('working', 'out'), ('coffee', 'shops'),
                                    ('average', 'prices'), ('union', 'square'),
                                    ('real', 'estate'), ('ice', 'cream'),
                                    ('whole', 'foods'), ('co', 'op'),
                                    ('wall', 'street'), ('world', 'trade'),
                                    ('high', 'school'), ('dim', 'sum'),
                                    ('empire', 'state'), ('high', 'rise'),
                                    ('walk', 'ups')])
    if len(multi_word_queries) > 0:
        for tok in multi_word_queries:
            if len(tok.split('_')) > 1:
                multi_tokenizer.add_mwe(tuple(tok.split('_')))
    # add neighborhood names
    for n in neighborhood_name_phrases:
        multi_tokenizer.add_mwe(tuple(n.split('_')))
    result2 = multi_tokenizer.tokenize(result)
    return result2
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    entityset = set(raw_list.split('\n'))
    tokenizer = MWETokenizer(separator=' ')
    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []
        with open('{}/{}'.format(args.input_dir, fname), 'r') as f:
            doc = f.readlines()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                doc = nlp(item_dict['text'])
                item_dict.update({'entityMentioned': mentioned_entity})
                unigram = [token.text for token in
                           textacy.extract.ngrams(doc, n=1, filter_nums=True,
                                                  filter_punct=True, filter_stops=True)]
                item_dict['unigram'] = unigram
                tokens = [token.text for token in doc]
                pos = [token.pos_ for token in doc]
                phrases = phrasemachine.get_phrases(tokens=tokens, postags=pos)
                item_dict['phrases'] = list(phrases['counts'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
def iteratively_contract_bigrams(self):
    """
    Procedure to iteratively contract bigrams (up to max_collocation_iterations times)
    that score higher on the collocation_function than the min_collocation_score
    """
    for i in range(self.max_collocation_iterations):
        bigramer = BigramCollocationFinder.from_documents(self.tokens_by_sent())
        mwes = list(
            bigramer.above_score(
                self.collocation_score_function, self.min_collocation_score
            )
        )
        if len(mwes) == 0:
            break
        contracter = MWETokenizer(mwes)
        self.tokens_by_sent_by_doc_ = [
            contracter.tokenize_sents(doc) for doc in self.tokens_by_sent_by_doc()
        ]
" mean: {} median: {} stddev: {}".format( statistics.mean(counts), statistics.median(counts), statistics.stdev(counts) ) ) print("\n") # Read the words of interest words = open("emotion_words.txt").read().lower().split("\n") sentiment_bag = set() # Get the multi-word expression tokenizer and add each to the sentiment_bag mwe = set(filter(lambda a: " " in a, words)) print("Multi-word expressions in emotion words: {}".format(",".join(mwe))) # Create the MWE tokenizer mwe_tokenizer = MWETokenizer() for s in mwe: print("Add mwe ", s) mwe_tokenizer.add_mwe(s.split(" ")) sentiment_bag.add(s.replace(" ", "_")) lmtzr = WordNetLemmatizer() st = LancasterStemmer() ps = PorterStemmer() print("Stemming:") for word in filter(lambda a: " " not in a, words): print("{} => {} / {} / {}".format(word, lmtzr.lemmatize(word), st.stem(word), ps.stem(word))) sentiment_bag.add(word) sentiment_bag.add(st.stem(word)) # I like this one the best # Process all the lists
def extract_expressions(self, document, features=None):
    """Returns expressions from given features and multi-word expressions.

    In addition to passing a document into this method, MWEs or Multi-Word Expressions
    can be given to treat some multi words as one expression.

    >>> from document import ArthurDocument
    >>> pdf_path = base_path + '/test/test.pdf'
    >>> with open(pdf_path, 'rb') as f:
    ...     document = ArthurDocument(f.read())
    >>> features = document.get_features()[730:816,:]
    >>> print(document.get_text(features)) # doctest:+ELLIPSIS
    VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

    Multi-word expression should be detected:
    >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
    >>> expressions = clusterer.extract_expressions(document, features)
    >>> print(expressions[2]['text'])
    CROWN JEWEL

    x position should equal x of "C" from "CROWN JEWEL":
    >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
    True

    and width should equal the width of "CROWN JEWEL":
    >>> expr_width = expressions[2]['x1'] - expressions[2]['x']
    >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
    >>> expr_width == ftr_width
    True

    Args:
        document(ArthurDocument): Document to extract data fields from.
        features(list): List of features containing data fields to extract.
            If not given, use all document features.
        mwes(list): List of Multi-Word Expressions. Example value:
            `['property type', 'single family']`. With that list, both
            "property type" and "single family" will each be treated as single expressions.

    Returns:
        np.array: An array of data_fields.
    """
    mwes = self.mwes
    if features is None:
        features = document.get_features()
    text = document.get_text(features)
    for idx, mwe in enumerate(mwes):
        if isinstance(mwe, str):
            mwes[idx] = word_tokenize(mwe.lower())
        elif hasattr(mwe, '__iter__'):
            mwes[idx] = [x.lower() for x in mwe]
    tokenizer = MWETokenizer(mwes, separator=' ')
    tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

    expressions = []
    pos = 0
    for token in tokenized:
        # A token could be "deez nutz" while the text contains multiple spaces,
        # e.g. "deez  nutz", so split the token and find the positions of the
        # first and last characters.
        words = token.split()
        start_pos = text.lower().index(words[0], pos)
        for word in words:
            ipos = text.lower().index(word, pos)
            end_pos = ipos + len(word)
            pos = end_pos
        min_x = 0
        max_x = 0
        min_y = 0
        max_y = 0
        page = 0
        if len(features[start_pos:end_pos, :]) > 0:
            min_x = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x')]
            max_x = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x1')]
            min_y = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y')]
            max_y = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y1')]
            page = features[start_pos, ArthurDocument.get_feature_id('page')]
        expressions.append({
            'text': text[start_pos:end_pos],
            'x': min_x,
            'x1': max_x,
            'y': min_y,
            'y1': max_y,
            'page': page,
        })
    return expressions
                    help='Use as training data and create dictionary (default: test data)')
args = parser.parse_args()

data = []  # json output from extract.py
for line in fileinput.input(files='-'):
    data = json.loads(line)

miscFeatures = []  # other features
abstracts = []
tool = []

# tokenize the text
# pairs for conditional frequency distribution
pairs = []
tokenizer = MWETokenizer([('web', 'server'), ('software', 'package'),
                          ('R', 'package'), ('freely', 'available'),
                          ('source', 'code')])

for i in range(len(data)):
    # change github, bioconductor, sourceforge urls to unique words
    title = data[i]['title']
    colon = title.find(':')
    miscFeatures.append([])
    if colon >= 0:
        miscFeatures[i].append(1)
        pairs.append((i + 1, '-hascolon'))
        a = title[0:colon]
        b = a.lower()
        diff = sum(a[k] != b[k] for k in range(len(a)))
        # ratio = diff/len(a)
def umbc_sim(title1, title2):
    '''
    compares the similarity of title1 and title2
    :param title1:
    :param title2:
    :return: a similarity score (higher means more similar), clipped below at 0
    '''
    # print datetime.now(), " Preprocessing titles..."
    title1 = title_prepocessing(title1)
    title2 = title_prepocessing(title2)
    # print datetime.now(), " Tokenization and parsing starts..."
    tokenizer = MWETokenizer(wn_bst.multi_words_xpn())
    tokens1 = tokenizer.tokenize(title1.split())
    tagged1 = nltk.pos_tag(tokens1)
    tokens2 = tokenizer.tokenize(title2.split())
    tagged2 = nltk.pos_tag(tokens2)
    # remove tokens that are not supported by WordNet
    tagged1 = [x for x in tagged1 if not wn_bst.get_wordnet_pos(x[1]) == '']
    tagged2 = [x for x in tagged2 if not wn_bst.get_wordnet_pos(x[1]) == '']
    # use a matrix to store the pairwise results for later use
    len1 = len(tagged1)
    len2 = len(tagged2)
    Matrix = np.zeros((len2, len1))
    result1 = {}
    result2 = {}
    for x in range(len1):
        token1 = tagged1[x][0]
        pos1 = tagged1[x][1]
        simi = 0
        counterpart1 = ''
        for y in range(len2):
            token2 = tagged2[y][0]
            pos2 = tagged2[y][1]
            Matrix[y, x] = sim(token1, pos1, token2, pos2)
            if Matrix[y, x] > simi:
                simi = Matrix[y, x]
                counterpart1 = token2
        penalty1 = umbc_penalty(token1, pos1, tokens1, simi, counterpart1)
        result1[token1] = {'sim': simi, 'p': penalty1, 'counter': counterpart1}
    for y in range(0, len2):
        token2 = tagged2[y][0]
        pos2 = tagged2[y][1]
        simi = 0
        counterpart2 = ''
        for x in range(0, len1):
            if Matrix[y, x] > simi:
                simi = Matrix[y, x]
                counterpart2 = tagged1[x][0]
        penalty2 = umbc_penalty(token2, pos2, tokens2, simi, counterpart2)
        result2[token2] = {'sim': simi, 'p': penalty2, 'counter': counterpart2}
    sum1 = float(umbc_sum(result1))
    sum2 = float(umbc_sum(result2))
    score = sum1 / (2 * len1) + sum2 / (2 * len2)
    # cut upper and lower bound
    if score < 0:
        score = 0
    return score
whitespace_tokenizer = WhitespaceTokenizer()
wnl = WordNetLemmatizer()

from spacy.en import English
nlp = English()

# This is for multi-word phrases.
MWE = []
path = "/".join(os.path.realpath(__file__).split("/")[:-2]) + '/input/'
print 'path', path
with open(path + 'STREUSLE2.1-mwes.tsv') as f:
    for line in f.readlines():
        multiword_expression = line.split('\t')[0].split()[1:]
        MWE.append(multiword_expression)
MWE_tokenizer = MWETokenizer(MWE, separator='-')
# Add whatever additional custom multi-word expressions.
MWE_tokenizer.add_mwe(('dive', 'bar'))

# Stopwords
stops = set(stopwords.words("english") + stopwords.words("spanish"))
keep_list = ['after', 'during', 'not', 'between', 'other', 'over', 'under',
             'most', 'without', 'nor', 'no', 'very', 'against', 'don', 'aren']
stops = set([word for word in stops if word not in keep_list])

table = string.maketrans("", "")
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
whitespace_tokenizer = WhitespaceTokenizer()