def process(self, data):
    """new_dialog: [(a, 1/0), (a, 1/0)], new_meta: (a, b, topic), new_utt: [[a, b, c], ...]

    1 is own utt and 0 is the other speaker's utt.
    """
    new_dialog = []
    new_meta = []
    new_utts = []
    bod_utt = ["<s>", "<d>", "</s>"]
    all_lens = []

    for l in data:
        if self.config.use_merge:
            if self.config.merge_type == 'init':
                lower_utts = [(caller,
                               ["<tag>", "<s>"] + nltk.WordPunctTokenizer().tokenize(utt.lower()) + ["</s>"],
                               feat)
                              for caller, utt, feat in l["utts"]]
            elif self.config.merge_type == 'last':
                lower_utts = [(caller,
                               ["<s>"] + nltk.WordPunctTokenizer().tokenize(utt.lower()) + ["</s>", "<tag>"],
                               feat)
                              for caller, utt, feat in l["utts"]]
        else:
            lower_utts = [(caller,
                           ["<s>"] + nltk.WordPunctTokenizer().tokenize(utt.lower()) + ["</s>"],
                           feat)
                          for caller, utt, feat in l["utts"]]
        all_lens.extend([len(u) for c, u, f in lower_utts])

        a_age = float(l["A"]["age"]) / 100.0
        b_age = float(l["B"]["age"]) / 100.0
        a_edu = float(l["A"]["education"]) / 3.0
        b_edu = float(l["B"]["education"]) / 3.0
        vec_a_meta = [a_age, a_edu] + ([0, 1] if l["A"]["sex"] == "FEMALE" else [1, 0])
        vec_b_meta = [b_age, b_edu] + ([0, 1] if l["B"]["sex"] == "FEMALE" else [1, 0])

        # For the joint model we model both speakers together: caller A gets floor 0, otherwise 1.
        meta = (vec_a_meta, vec_b_meta, l["topic"])
        dialog = [(bod_utt, 0, None)] + [(utt, int(caller == "B"), feat)
                                         for caller, utt, feat in lower_utts]

        new_utts.extend([bod_utt] + [utt for caller, utt, feat in lower_utts])
        new_dialog.append(dialog)
        new_meta.append(meta)

    print("Max utt len %d, mean utt len %.2f"
          % (np.max(all_lens), float(np.mean(all_lens))))
    return new_dialog, new_meta, new_utts
def frombatch_pipeline(self, batch):  # TODO
    """@batch: a TensorFlow Dataset batch."""
    pre_process = [self.clean_punctuation(doc.decode("utf-8")) for doc in batch]
    logging.info('frombatch_pipeline: clean punctuation')
    pre_process = [self.split_camel_case_token(doc) for doc in pre_process]
    logging.info('frombatch_pipeline: camel case')
    pre_process = [doc.lower() for doc in pre_process]
    logging.info('frombatch_pipeline: lower case')
    pre_process = [doc.strip() for doc in pre_process]  # leading/trailing whitespace removed
    logging.info('frombatch_pipeline: white space removed')
    pre_process_tokens = [nltk.WordPunctTokenizer().tokenize(doc) for doc in pre_process]
    logging.info('frombatch_pipeline: WordPunctTokenizer')
    filtered_tokens = [self.stop_words(doc) for doc in pre_process_tokens]  # stop-word filtering
    logging.info('frombatch_pipeline: stop words')
    filtered_tokens = [self.stemmer(doc) for doc in filtered_tokens]  # stemming
    logging.info('frombatch_pipeline: stemming')
    filtered_tokens = [self.remove_terms(doc) for doc in filtered_tokens]  # remove special terms
    logging.info('frombatch_pipeline: removed special terms')
    # pre_process = [' '.join(doc) for doc in filtered_tokens]
    logging.info('frombatch_pipeline [END]')
    return filtered_tokens
def fromdocs_pipeline(self, docs):  # TODO
    """@docs: a list of strings, each representing a document or a piece of code."""
    pre_process = [self.clean_punctuation(doc) for doc in docs]
    logging.info('fromdocs_pipeline: clean punctuation')
    pre_process = [self.split_camel_case_token(doc) for doc in pre_process]
    logging.info('fromdocs_pipeline: camel case')
    pre_process = [doc.lower() for doc in pre_process]
    logging.info('fromdocs_pipeline: lower case')
    pre_process = [doc.strip() for doc in pre_process]  # leading/trailing whitespace removed
    logging.info('fromdocs_pipeline: white space removed')
    pre_process_tokens = [nltk.WordPunctTokenizer().tokenize(doc) for doc in pre_process]
    logging.info('fromdocs_pipeline: WordPunctTokenizer')
    filtered_tokens = [self.stop_words(doc) for doc in pre_process_tokens]  # stop-word filtering
    logging.info('fromdocs_pipeline: stop words')
    filtered_tokens = [self.stemmer(doc) for doc in filtered_tokens]  # stemming
    logging.info('fromdocs_pipeline: stemming')
    filtered_tokens = [self.remove_terms(doc) for doc in filtered_tokens]  # remove special terms
    logging.info('fromdocs_pipeline: removed special terms')
    pre_process = [' '.join(doc) for doc in filtered_tokens]
    logging.info('fromdocs_pipeline [END]')
    return pre_process
def isBulling(text):
    # Tokenize words
    wpt = nltk.WordPunctTokenizer()
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    matrix = []
    words = wpt.tokenize(text)

    # Remove punctuation
    clean_words = [word.lower() for word in words if word not in set(string.punctuation)]

    # Remove stop words and noisy tokens
    english_stops = nltk.corpus.stopwords.words('english')
    characters_to_remove = ["''", '``', "rt", "https", "’", "“", "”", "\u200b",
                            "--", "n't", "'s", "...", "//t.c"]
    clean_words = [word for word in clean_words if word not in english_stops]
    clean_words = [word for word in clean_words if word not in set(characters_to_remove)]

    # Lemmatise words and keep only alphanumerics, quotes, and spaces
    lemma_list = [wordnet_lemmatizer.lemmatize(word) for word in clean_words]
    s = ' '.join(lemma_list)
    s = re.sub('[^A-Za-z0-9" "]+', '', s)
    s = s.strip()
    matrix.append(s)

    # Vectorise with the pre-fitted TF-IDF vectoriser and classify with the pre-fitted SVM
    re_tv_matrix = tv.transform(matrix)
    re_tv_matrix = re_tv_matrix.toarray()
    OUTPUT = svm.predict(re_tv_matrix)            # hard class prediction (unused below)
    SEVERITY = svm.predict_proba(re_tv_matrix)    # class probabilities for the single input
    # print(re_tv_matrix)
    return SEVERITY[0][0]
def BOW(sentence):
    WPT = nltk.WordPunctTokenizer()
    # nltk.download('stopwords')
    # nltk.download('punkt')
    # nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    stop_word_list = nltk.corpus.stopwords.words('english')

    # Remove numbers and special characters from the sentence
    sentence = re.sub(r" \d+", " ", sentence)                   # digits
    sentence = re.sub(r'[0-9]+', "", sentence)                  # digits
    sentence = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", sentence)
    pattern = r"[{}]".format("-_)(,;:$%#.")                     # special characters
    sentence = re.sub(pattern, "", sentence)
    sentence = re.sub(r"[\']", "", sentence)
    sentence = re.sub(r"[/']", "", sentence)
    sentence = re.sub(r'\b[a-zA-Z]\b', '', sentence)            # remove single-letter words
    sentence = re.sub(r'\s+', ' ', sentence).strip()            # collapse repeated spaces

    # Lowercase
    sentence = sentence.lower().strip()

    # Tokenize and remove stop words
    tokens = WPT.tokenize(sentence)
    filtered_tokens = [token for token in tokens if token not in stop_word_list]

    # Lemmatize
    lemmas = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    sentence = ' '.join(lemmas)
    return sentence
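# Illustrative usage sketch (not part of the original snippet): assumes `nltk`, `re`,
# and WordNetLemmatizer are imported at module level as BOW expects, and that the
# NLTK 'stopwords' and 'wordnet' corpora have been downloaded.
if __name__ == "__main__":
    print(BOW("Cats are running faster than 2 dogs."))
    # expected output along the lines of: "cat running faster dog"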
def __init__(self):
    self.tokenizer = nltk.WordPunctTokenizer()  # nltk.RegexpTokenizer("[\w]", flags=re.UNICODE)
    self.stopwords = self.getStopWordList('./frame/stop-words-english4.txt')
    # Based on https://gist.github.com/alexbowe/
    self.sentence_re = r'''(?x)          # verbose regexp
        ([A-Z])(\.[A-Z])+\.?             # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*                     # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?               # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                         # ellipsis
        | [][.,;"'?():-_`]               # these are separate tokens
    '''
    # Grammar from this paper:
    # http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/PAPERS/pdf/PAPERS065.pdf
    self.grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}           # nouns and adjectives, terminated with nouns
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}           # NBARs connected by a preposition (in/of/etc.)
    """
    self.chunker = nltk.RegexpParser(self.grammar)
    self.toks = ""
    self.postoks = ""
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.stemmer = nltk.stem.porter.PorterStemmer()
    self.tree = ""
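# Illustrative sketch (not part of the original class): shows what the NP grammar
# above extracts from a small POS-tagged sentence, using a standalone RegexpParser.
if __name__ == "__main__":
    import nltk
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}
    """
    chunker = nltk.RegexpParser(grammar)
    tagged = [("deep", "JJ"), ("learning", "NN"), ("of", "IN"), ("language", "NN")]
    print(chunker.parse(tagged))  # prints the chunked parse tree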
def filter_text_with_marks(text):
    text = text.replace('"', "'").replace('\\', '')
    # This is a bit dangerous, but it fixes a particularly annoying case:
    # text = text.replace('**', '*')
    puncts = re.escape(string.punctuation)
    text = re.sub(r'([%s])(\*)' % puncts, r'\g<1> \g<2>', text)
    text = re.sub(r'(\*)([%s])' % puncts, r'\g<1> \g<2>', text)

    tokenizer = nltk.WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)

    new_text = []
    new_marks = []
    b = False
    for i in range(len(tokens)):
        tk_i = tokens[i]
        if tk_i != '*':
            if '*' in tk_i:
                print(tk_i, tokens)
            new_text.append(tk_i)
        if tk_i == '*' and b is False:
            b = True
        elif tk_i == '*' and b is True:
            b = False
        if tk_i == '*' and b is True:
            new_marks.append(len(new_text))

    new_text = ' '.join(new_text).replace('*', '')
    return new_text, new_marks
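# Illustrative usage sketch (not part of the original snippet): assumes `re`, `string`,
# and `nltk` are imported as the function expects. The returned marks record the token
# position in the cleaned text where each *starred* span begins.
if __name__ == "__main__":
    cleaned, marks = filter_text_with_marks("I *really* like this film.")
    print(cleaned)   # e.g. "I really like this film ."
    print(marks)     # e.g. [1]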
def process(self, data):
    """TODO"""
    new_dialog = []
    new_meta = []
    new_utts = []
    bod_utt = ["<s>", "<d>", "</s>"]  # TODO: what do we do about the topic for this?
    all_lens = []
    for l in data:
        lower_utts = [["<s>"] + nltk.WordPunctTokenizer().tokenize(utt.lower()) + ["</s>"]
                      for utt in l]  # for utt in l['utts']
        all_lens.extend([len(u) for u in lower_utts])

        # dialog = [(bod_utt, 0)] + [(utt, int(ind == len(lower_utts) - 2))
        #                            for ind, utt in enumerate(lower_utts)]
        dialog = [(utt, int(ind == len(lower_utts) - 2))
                  for ind, utt in enumerate(lower_utts)]
        new_utts.extend([bod_utt] + lower_utts)
        new_dialog.append(dialog)

    print("Max utt len %d, mean utt len %.2f"
          % (np.max(all_lens), float(np.mean(all_lens))))
    return new_dialog, new_utts
def nltk_hacking(text_file=OUTPUT_FILE):
    f = open(text_file).read()
    f = clean(f)
    f = unescape(f)
    text = nltk.WordPunctTokenizer().tokenize(f)
    txt = nltk.Text(text)
    return txt
def transform(self, X, y=None):
    wpt = nltk.WordPunctTokenizer()
    tokenized_corpus = [wpt.tokenize(document) for document in X['normalize_review_text']]

    # Set values for various parameters
    feature_size = 100      # word vector dimensionality
    window_context = 5      # context window size
    min_word_count = 2      # minimum word count
    sample = 1e-3           # downsample setting for frequent words

    self.word2vec = Word2Vec(tokenized_corpus,
                             size=feature_size,
                             window=window_context,
                             min_count=min_word_count,
                             sample=sample,
                             iter=10)
    vocabulary = set(self.word2vec.wv.index2word)
    features = [self._col_transform(tokenized_sentence, self.word2vec, vocabulary, num_features=100)
                for tokenized_sentence in tokenized_corpus]
    return np.array(features)
def clean_data(train_data):
    with open("stopwords.txt", 'r') as f:
        stopwords = f.read().split("\n")
    w_punc = nltk.WordPunctTokenizer()
    stopwords.append('br')
    stopwords = [re.sub(r'[^\w\s\d]', '', sw.lower()) for sw in stopwords]

    vocab_dict = []
    vocab_set = set()
    for idx, review in enumerate(train_data):
        review = review.decode('utf-8')
        review = review.replace('\n', ' ')
        review = re.sub(r" \d+", " ", review)
        pattern = r"[{}]".format("-?!,.;:/<>'\(\)\"\"")
        review = re.sub(pattern, " ", review)
        review = review.lower()
        review = review.strip()
        tokens = w_punc.tokenize(review)
        filtered_tokens = [token for token in tokens if token not in stopwords]
        review = ' '.join(filtered_tokens)
        train_data[idx] = review

        word_list = review.split(" ")
        word_dict = {}
        for w in word_list:
            vocab_set.add(w)
            word_dict[w] = review.count(w)
        vocab_dict.append(word_dict)
    return train_data, vocab_dict, vocab_set
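# Illustrative usage sketch (not part of the original snippet): clean_data expects byte
# strings (it calls .decode('utf-8')) and reads a local "stopwords.txt"; the toy file
# written below is a stand-in for whatever stop-word list the original project ships.
if __name__ == "__main__":
    with open("stopwords.txt", "w") as f:
        f.write("the\nis\na\nof")
    reviews = [b"The movie is a great one!", b"A waste of 2 hours."]
    cleaned, vocab_dict, vocab_set = clean_data(reviews)
    print(cleaned)  # e.g. ['movie great one', 'waste hours']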
def main():
    _file = input("Enter the file name for the words/synonyms you want to find: ")
    read = open(_file, 'r')
    real_file = read.read()
    # print(real_file)
    count = 0
    _word = input("Enter the word that you want to search or find: ")
    new_corp = nltk.WordPunctTokenizer().tokenize(real_file)
    # _pos = nltk.pos_tag(nltk.word_tokenize(_word))
    # print(_pos)
    # thing = nltk.wsd.lesk(lang_, _word)
    # print(thing)
    # print(new_corp)
    for x in new_corp:
        # print(x)
        for _syns in wordnet.synsets(_word):
            for i in _syns.lemmas():
                print(i.name())
                if x == i.name() or x == _word:
                    if i.name() == _word:
                        break
                    else:
                        if x == _word:
                            count += 1
                        continue
    print("Your corpus has listed", count, "of the word", _word, "and its synonyms")
def normalization_data(self):
    import nltk
    nltk.download('stopwords')
    import re
    import numpy as np
    WPT = nltk.WordPunctTokenizer()
    stop_word_list = nltk.corpus.stopwords.words('turkish')
    from snowballstemmer import stemmer
    from TurkishStemmer import TurkishStemmer
    stemmer = TurkishStemmer()

    yorumlar = []
    for i in range(0, len(self.df)):
        # Keep only Turkish letters and apostrophes
        yorum = re.sub(
            "[^AaBbCcÇçDdEeFfGgĞğHhİiIıJjKkLlMmNnOoÖöPpRrSsŞşTtUuÜüVvYyZz']",
            ' ', self.df['text'][i])
        # Then drop the apostrophes as well
        yorum = re.sub("[']", '', yorum)
        yorum = yorum.lower()
        yorum = yorum.strip()
        yorum = yorum.split()
        yorum = [stemmer.stem(word) for word in yorum if word not in stop_word_list]
        yorum = ' '.join(yorum)
        yorumlar.append(yorum)
    # print(yorumlar)
    return yorumlar
def normalize(cls, text):
    # Eliminate non-word characters
    text = re.sub(r"[^\w\s]", "", text)
    # Replace the unicode replacement character that shows up in "Decor"
    pattern = re.compile(u'\uFFFD', re.UNICODE)
    text = re.sub(pattern, "e", text)

    # Exceptions - handling some phrases separately
    # TODO: do this more elegantly
    # Exclude "Home Theater", put it together with electronics
    text = text.replace("Home Theater", "Electronics")
    # Exclude "Cloud Player for Home"
    text = text.replace("Cloud Player for Home", "Cloud Player")
    text = text.replace("Home & Portable Audio", "Portable Audio")
    # print(text)

    stopset = set(stopwords.words('english'))  # ["and", "the", "&", "for", "of", "on", "as", "to", "in"]
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(text)
    clean = [token.lower() for token in tokens
             if token.lower() not in stopset and len(token) > 0]
    final = [stemmer.stem(word) for word in clean]

    # Fill the tokens and stems dictionaries
    for token in tokens:
        cls.upper[token.lower()] = token
    for index in range(len(final)):
        if final[index] not in cls.stems:
            cls.stems[final[index]] = cls.upper[clean[index]]
    return final
def __init__(self, data: pd.DataFrame, feature_column: str, target_column: str,
             cls2idx, max_len=128):
    self.tokenizer = nltk.WordPunctTokenizer()
    self.data = data
    self.feature_column = feature_column
    self.target_column = target_column
    self.max_len = max_len
    self.cls2idx = cls2idx
def preprocess_text(self, document):
    document = re.sub(r'\W', ' ', str(document))
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    document = re.sub(r'^b\s+', '', document)
    document = document.lower()

    tokens = document.split()
    tokens = [self.stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]

    preprocessed_text = ' '.join(tokens)
    word_punctuation_tokenizer = nltk.WordPunctTokenizer()
    word_tokenized_corpus = word_punctuation_tokenizer.tokenize(preprocessed_text)
    return word_tokenized_corpus
def create_fields_tuples():
    tokenizer = nltk.WordPunctTokenizer()
    fields_tuples = [
        ('words', WordsField(tokenize=tokenizer.tokenize)),
        ('target', TagsField()),
    ]
    return fields_tuples
def text_preprocess(data):
    tokenizer = nltk.WordPunctTokenizer()
    porter_stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    comp = re.compile('[^A-Z^a-z ]')
    for i in range(len(data)):
        data_item = data[i]
        if data_item is None:
            data[i] = ''
            continue
        # Clean with a regular expression
        data_item = comp.sub('', data_item)
        # Split into words
        data_item = tokenizer.tokenize(str(data_item).lower())
        # Remove stop words
        data_item = [word for word in data_item if word not in stop_words]
        # Stemming
        for w_idx in range(len(data_item)):
            data_item[w_idx] = porter_stemmer.stem(data_item[w_idx])
        data[i] = ' '.join(data_item)
    return data
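# Illustrative usage sketch (not part of the original snippet): assumes the module-level
# imports the function relies on (nltk, re, PorterStemmer, stopwords) and that the NLTK
# 'stopwords' corpus has been downloaded.
if __name__ == "__main__":
    docs = ["Running quickly through the 2 parks!", None]
    print(text_preprocess(docs))
    # expected output along the lines of: ['run quickli park', '']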
def convert_kb(data_dir, data_type, vocab_dict, pad_id=0, unk_id=2):
    kb_text_file_path = os.path.join(data_dir, data_type + "_kb_text.txt")
    kb_out_text_file_path = os.path.join(data_dir, data_type + "_kb_text.pkl")
    kb_out_len_file_path = os.path.join(data_dir, data_type + "_kb_text_len.pkl")
    kb_out_file_path = os.path.join(data_dir, data_type + "_kb_text_both.pkl")

    wpt = nltk.WordPunctTokenizer()
    kb_text_file = open(kb_text_file_path, 'r')
    kb_utterance = []
    kb_seq_len = []
    for utterance in kb_text_file:
        utterance = utterance.strip()
        utterance_words = wpt.tokenize(utterance)
        utterance_word_ids = []
        for word in utterance_words:
            if word in vocab_dict:
                word_id = vocab_dict[word]
            elif word == '':
                word_id = pad_id  # corner case
            else:
                word_id = unk_id
            utterance_word_ids.append(word_id)
        length_utterance, utterance_word_ids = pad_or_clip_utterance(utterance_word_ids)
        kb_utterance.append(utterance_word_ids)
        kb_seq_len.append(length_utterance)
        # length_utterance, utterance_word_ids = pad_or_clip_utterance(utterance_word_ids)

    kb_corpora = [kb_seq_len, kb_utterance]
    with open(kb_out_file_path, 'wb') as f:
        pkl.dump(kb_corpora, f, protocol=pkl.HIGHEST_PROTOCOL)
def remove_stop_words(doc):
    wpt = nltk.WordPunctTokenizer()
    stop_words = nltk.corpus.stopwords.words('english')
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens).lower()
    return doc
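# Illustrative usage sketch (not part of the original snippet): assumes `nltk` is
# imported and the 'stopwords' corpus is downloaded. Note that filtering happens before
# lowercasing, so a capitalised stop word such as "This" survives.
if __name__ == "__main__":
    sample = "This is a simple example of the stop word removal step"
    print(remove_stop_words(sample))
    # expected output along the lines of: "this simple example stop word removal step"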
def _normalize_document(self, sentence):
    wlp = nltk.WordPunctTokenizer()
    stopword_list = stopwords.words('english')
    stopword_list.remove('not')
    stopword_list.remove('no')
    word_list = words.words()
    lemmatizer = nltk.WordNetLemmatizer()

    # Lower-case and remove special characters / extra whitespace
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence, flags=re.I | re.A)
    sentence = sentence.strip().lower()
    # Tokenizing
    sentence = wlp.tokenize(sentence)
    # Lemmatizing
    sentence = [lemmatizer.lemmatize(x) for x in sentence]
    # Removing stop words and out-of-vocabulary tokens
    sentence = [x for x in sentence
                if x.lower() not in stopword_list and x.lower() in word_list]
    return ' '.join(sentence)
def process(self, data):
    new_dialog = []
    new_meta = []
    new_utts = []
    all_lens = []
    new_persona = []
    new_persona_word = []
    for l in data:
        lower_utts = [(caller,
                       ["<s>"] + nltk.WordPunctTokenizer().tokenize(utt.lower()) + ["</s>"],
                       feat)
                      for caller, utt, feat in l["utts"]]
        all_lens.extend([len(u) for c, u, f in lower_utts])

        vec_a_meta = [0, 0] + [0, 0]
        vec_b_meta = [0, 0] + [0, 0]
        meta = (vec_a_meta, vec_b_meta, 'NULL')
        dialog = [(utt, int(caller == "A"), feat) for caller, utt, feat in lower_utts]

        new_utts.extend([utt for caller, utt, feat in lower_utts])
        new_dialog.append(dialog)
        new_meta.append(meta)
        new_persona.append([p.split(' ') for p in l['persona']])
        new_persona_word.append(l['persona_word'])

    print("Max utt len %d, mean utt len %.2f"
          % (np.max(all_lens), float(np.mean(all_lens))))
    return new_dialog, new_meta, new_utts, new_persona, new_persona_word
def __init__(self, context_size, max_images, user_start_id, user_end_id,
             sys_start_id, sys_end_id, unk_id, pad_id, start_id, end_id,
             cutoff=-1):
    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger('prepare_data_for_hred')
    self.context_size = context_size
    self.max_images = max_images
    self.unk_id = unk_id
    self.pad_id = pad_id
    self.user_start_id = user_start_id
    self.user_end_id = user_end_id
    self.sys_start_id = sys_start_id
    self.sys_end_id = sys_end_id
    self.start_id = start_id
    self.end_id = end_id
    self.user_start_sym = '<u>'
    self.user_end_sym = '</u>'
    self.sys_start_sym = '<s>'
    self.sys_end_sym = '</s>'
    self.pad_symbol = '<pad>'
    self.unk_symbol = '<unk>'
    self.start_sym = '<sos>'
    self.end_sym = '<eos>'
    self.cutoff = cutoff  # vocab frequency cutoff
    self.dir_path = None
    self.data_type = None
    self.vocab_file = None
    self.vocab_dict = None
    self.word_counter = None
    self.context_text_file = None
    self.context_image_file = None
    self.user_state_file = None
    self.target_text_file = None
    self.wpt = nltk.WordPunctTokenizer()
def process(self, data):
    """new_dialog: [(a, 1/0), (a, 1/0)], new_utt: [[a, b, c], ...]

    1 is own utt and 0 is the other speaker's utt.
    """
    new_dialog = []
    new_utts = []
    bod_utt = ["<s>", "<d>", "</s>"]  # indicator of the start of a dialog
    all_lens = []
    for l in data:
        lower_utts = [["<s>"] + nltk.WordPunctTokenizer().tokenize(utt.lower()) + ["</s>"]
                      for utt in l.split('__eou__')[:-1]]
        all_lens.extend([len(u) for u in lower_utts])

        dialog = [(bod_utt, 0)]
        floor = 1
        for utt in lower_utts:
            floor = floor + 1
            dialog = dialog + [(utt, int(floor % 2 == 0))]

        new_utts.extend([bod_utt] + [utt for utt in lower_utts])
        new_dialog.append(dialog)

    print("Max utt len %d, mean utt len %.2f"
          % (np.max(all_lens), float(np.mean(all_lens))))
    return new_dialog, new_utts
def prepare_mul_ref(self, dir='./data/test_multi_ref.json'):
    import json
    import nltk
    with open(dir) as f:
        mul_raws = json.load(f)

    self.mul_ref_outs = []
    self.mul_ref_das = []
    for raw in mul_raws:
        raw_outs = raw['responses']
        raw_das = raw['resp_dialog_acts']
        out_tokens = [["<s>"] + nltk.WordPunctTokenizer().tokenize(raw_out.lower()) + ["</s>"]
                      for raw_out in raw_outs]
        out_ids = [[self.rev_vocab.get(t, self.rev_vocab["<unk>"]) for t in line]
                   for line in out_tokens]

        tmp_outs = []
        for out_id in out_ids:
            if len(out_id) >= 40:
                # Clip to 40 tokens, keeping the closing "</s>"
                tmp_outs.append(np.array(out_id[0:39] + [out_id[-1]]))
            else:
                tmp_outs.append(np.array(out_id))

        tmp_das = [self.rev_da_vocab[raw_da] for raw_da in raw_das]
        assert len(tmp_das) == len(tmp_outs)
        self.mul_ref_outs.append(tmp_outs)
        self.mul_ref_das.append(tmp_das)
def spell_correction(document, vocab):
    # Spell correction with suggestions
    with open(vocab, 'rb') as f:
        vocab = pickle.load(f)
    with open(document, 'rb') as f:
        rawtext = pickle.load(f)

    tokens = nltk.WordPunctTokenizer().tokenize(rawtext)
    tokens = [x for x in tokens if x]
    # print(tokens)
    # print(vocab)
    wrongwords = [word for word in tokens if word not in vocab]
    error_location = [i for i, token in enumerate(tokens) if token in wrongwords]
    # print(wrongwords)

    suggestions = {}
    mindistances_sugg = {}
    bestmatch = {}
    for word in wrongwords:
        mindistances = []
        mindistances_word = []
        for v in vocab:
            if edit_distance(word, v) <= 4:
                mindistances.append(v)
                mindistances_word.append(edit_distance(word, v))
        suggestions[word] = mindistances
        try:
            mindistances_sugg[word] = min(mindistances_word)
        except ValueError:
            pass  # no candidate within the distance threshold
        # print(mindistances_word)

    for word in wrongwords:
        dist = mindistances_sugg.get(word)
        if dist is None:
            bestmatch[word] = []
            continue
        key = [x for x in suggestions[word] if edit_distance(word, x) == dist]
        bestmatch[word] = key
    # print(bestmatch)

    big_table = list(zip(wrongwords, error_location, bestmatch.values()))
    df = pd.DataFrame(big_table)
    df.columns = ["wrongspellings", "location", "correction"]
    # print(error_location)
    # print(big_table)

    for word in wrongwords:
        print(word + "....?", "did you mean .........", suggestions[word])
        print("best match ......", bestmatch[word],
              "please do humanity a favor and go to school boy .....................")
    return df
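# Illustrative usage sketch (not part of the original snippet): spell_correction reads
# pickled inputs, so the toy file names below are stand-ins; it also assumes `pickle`,
# `pandas as pd`, `nltk`, and `edit_distance` (from nltk.metrics) are imported at module level.
if __name__ == "__main__":
    with open("toy_vocab.pkl", "wb") as f:
        pickle.dump(["hello", "world", "spelling", "is", "hard"], f)
    with open("toy_doc.pkl", "wb") as f:
        pickle.dump("helo wrld speling is hard", f)
    print(spell_correction("toy_doc.pkl", "toy_vocab.pkl"))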
def __init__(self, word2vec, data, labels, train=True, max_length=32):
    self.data = data
    self.labels = labels
    self.tokenizer = nltk.WordPunctTokenizer()
    self.max_length = max_length
    self.vocab = set(self.tokenizer.tokenize(" ".join(d for d in self.data)))
    self.word2idx = {word: idx + 1 for idx, word in enumerate(self.vocab)}
def __init__(self, list_content: list):
    self.model = KeyedVectors.load_word2vec_format(
        './nlp/GoogleNews-vectors-negative300.bin', binary=True)
    self.stop_words = set(stopwords.words('english'))
    self.list_content = list_content
    self.wpt = nltk.WordPunctTokenizer()
    self.word2weight = None
    self.corpus = []
def create_fields_tuples():
    # Note that words and words_hyp share the same field, therefore when we
    # call words_field.build_vocab() we are creating a shared vocab.
    # Hence, words.vocab == words_hyp.vocab
    tokenizer = nltk.WordPunctTokenizer()
    words_field = WordsField(tokenize=tokenizer.tokenize)
    fields_tuples = [
        ('words', words_field),
        ('words_hyp', words_field),
        ('target', TagsField()),
    ]
    return fields_tuples
def create_fields_tuples():
    # If you choose tokenizer='spacy', please install the en package first:
    #   python3 -m spacy download en
    tokenizer = nltk.WordPunctTokenizer()
    # tokenizer = nltk.TreebankWordTokenizer()
    fields_tuples = [
        ('words', fields.WordsField(tokenize=tokenizer.tokenize)),
        ('target', fields.TagsField()),
    ]
    return fields_tuples