def create_vocab(self, data):
    assert self.split == 'train', \
        "Vocabulary can only be created for training file."

    w2c = OrderedCounter()
    w2i, i2w = dict(), dict()

    special_tokens = [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN]
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    for program in data:
        tokens = program.split()
        w2c.update(tokens)

    for w, c in w2c.items():
        if c > self.min_occ:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

    assert len(w2i) == len(i2w)
    print("Vocabulary of %i keys created." % len(w2i))

    vocab = dict(w2i=w2i, i2w=i2w)
    return vocab
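All of the builders in this listing rely on an OrderedCounter helper plus a handful of module-level imports (io, os, json, and NLTK's TweetTokenizer; some snippets additionally use numpy, pandas, and tqdm). A minimal sketch of that helper, assuming the standard collections recipe; the exact definition in each source project may differ:

import io
import json
import os
from collections import Counter, OrderedDict

from nltk.tokenize import TweetTokenizer  # word tokenizer used by most builders below


class OrderedCounter(Counter, OrderedDict):
    """Counter that remembers the order in which elements are first seen."""

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, OrderedDict(self))

    def __reduce__(self):
        return self.__class__, (OrderedDict(self),)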
def _create_vocab(self): assert self.split == 'train', "Vocablurary can only be created for training file." w2c = OrderedCounter() w2i = dict() i2w = dict() special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>'] for st in special_tokens: i2w[len(w2i)] = st w2i[st] = len(w2i) patients = np.load(self.raw_data_path)[()] for patient in patients.keys(): for visit in patients[patient]: w2c.update(visit) for w, c in w2c.items(): if c > self.min_occ and w not in special_tokens: i2w[len(w2i)] = w w2i[w] = len(w2i) assert len(w2i) == len(i2w) print("Vocablurary of %i keys created." %len(w2i)) vocab = dict(w2i=w2i, i2w=i2w) with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace')) self._load_vocab()
def _create_vocab(self): assert self.split == 'train', "Vocabulary can only be created for training file." tokenizer = PunktSentenceTokenizer(preserve_case=False) w2c = OrderedCounter() w2i = dict() i2w = dict() special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>'] for st in special_tokens: i2w[len(w2i)] = st w2i[st] = len(w2i) with open(self.raw_data_path, 'r') as file: for i, line in enumerate(file): words = tokenizer.tokenize(line) w2c.update(words) for w, c in w2c.items(): if c > self.min_occ and w not in special_tokens: i2w[len(w2i)] = w w2i[w] = len(w2i) assert len(w2i) == len(i2w) print("Vocabulary of %i keys created." % len(w2i)) vocab = dict(w2i=w2i, i2w=i2w) with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace')) self._load_vocab()
def f_create_vocab(self, vocab_obj, train_reviews):
    tokenizer = TweetTokenizer(preserve_case=False)

    w2c = OrderedCounter()
    w2i = dict()
    i2w = dict()

    special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    # train_reviews = train_df.review
    max_line = self.m_max_line
    line_i = 0
    for review in train_reviews:
        words = tokenizer.tokenize(review)
        w2c.update(words)

        if line_i > max_line:
            break
        line_i += 1

    print("max line", max_line)

    for w, c in w2c.items():
        if c > self.m_min_occ and w not in special_tokens:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

    print("len(i2w)", len(i2w))

    vocab_obj.f_set_vocab(w2i, i2w)
def _create_vocab(self): assert self.split == 'train', "Vocablurary can only be created for training file." tokenizer = TweetTokenizer(preserve_case=False) w2c = OrderedCounter() w2i = dict() i2w = dict() special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>'] for st in special_tokens: i2w[len(w2i)] = st w2i[st] = len(w2i) data_folder = self.raw_data_path + "/" + self.split line_num = 0 for filename in os.listdir(data_folder): if "news.en" not in filename: continue if line_num > self.max_line: break full_filename = os.path.join(data_folder, filename) print("file", full_filename) file = open(full_filename, "r") print("max line", self.max_line) for i, line in enumerate(file): words = tokenizer.tokenize(line) w2c.update(words) line_num += 1 if line_num > self.max_line: break print("line_num", line_num) for w, c in w2c.items(): if c > self.min_occ and w not in special_tokens: i2w[len(w2i)] = w w2i[w] = len(w2i) assert len(w2i) == len(i2w) print("Vocablurary of %i keys created." % len(w2i)) vocab = dict(w2i=w2i, i2w=i2w) with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace')) self._load_vocab()
def _create_vocab(self): assert self.split == 'train', "Vocablurary can only be created for training file." tokenizer = TweetTokenizer(preserve_case=False) self.w2c = OrderedCounter() self.w2i = dict() self.i2w = dict() special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>'] for st in special_tokens: self.i2w[len(self.w2i)] = st self.w2i[st] = len(self.w2i) labels = ['0', '1'] for l in labels: print("updating vocab with sentences of label {}".format(l)) file = open(self.raw_data_path + l, 'r') num_lines = self.num_lines_0 if l=='0' else self.num_lines_1 for i, line in enumerate(tqdm(file, total=num_lines)): if(i == num_lines): break words = tokenizer.tokenize(line) if(len(words) > self.max_sequence_length): continue self.w2c.update(words) file.close() print("done creating w2c") for w, c in tqdm(self.w2c.items()): if c > self.min_occ and w not in special_tokens: self.i2w[len(self.w2i)] = w self.w2i[w] = len(self.w2i) print("done creating w2i") assert len(self.w2i) == len(self.i2w) print("Vocablurary of %i keys created." % len(self.w2i)) vocab = dict(w2i=self.w2i, i2w=self.i2w) with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace')) self._load_vocab() self.v_size = len(self.w2i)
def create_vocab(self, vocab_file):
    self.vocab_file = vocab_file
    df = pd.DataFrame()

    tokenizer = TweetTokenizer(preserve_case=False)

    w2c = OrderedCounter()
    w2i = dict()
    i2w = dict()
    preprocess_word = Preprocess_Word()

    special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    with open(self.vocab_file, 'r') as file:
        lines = []
        for i, line in enumerate(file):
            lines.append(line)
            # line = rewrite_to_toklen(line)
            # words = tokenizer.tokenize(line)
            # words = [c for c in line]
            words = preprocess_word.to_words(line)
            w2c.update(words)

    for w, c in w2c.items():
        if c > self.min_occ and w not in special_tokens:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)
    assert len(w2i) == len(i2w)

    print("Vocabulary of %i keys created." % len(w2i))

    self.w2i = w2i
    self.i2w = i2w
    self.dump()

    if self.train_with_vocab:
        df['url'] = lines
        self.create_data(df)
    elif self.train_file is not None:
        df = pd.read_csv(self.train_file, names=['url'])
        self.create_data(df)
def _create_vocab(self, raw_data_file, **kwargs):
    assert self.split == 'train', "Vocabulary can only be created from training data."
    print("Creating new vocabulary.")

    tokenizer = TweetTokenizer(preserve_case=False)

    w2c = OrderedCounter()
    w2i = dict()
    i2w = dict()

    special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    with open(raw_data_file) as file:
        for i, line in enumerate(file):
            line = self._preprocess(line)
            question, answer = line.split('|||')

            question = tokenizer.tokenize(question)
            question = question[:self.max_utterance_length]

            answer = tokenizer.tokenize(answer)
            answer = answer[:self.max_utterance_length - 1]  # sos or eos token will be added

            words = question + answer
            w2c.update(words)

            if i > 1000000:
                break

    for w, c in w2c.items():
        if c > self.min_occ:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)
    assert len(w2i) == len(i2w)

    print("Vocabulary of %i keys created." % len(w2i))

    vocab_file_path = os.path.join(self.root, self.vocab_file_name)
    vocab = dict(w2i=w2i, i2w=i2w)
    with io.open(vocab_file_path, 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))

    self._load_vocab()
def __generate_features_dictionaries__(self):
    for feature in self.getFeatureNames():
        self.features_dicts[feature] = OrderedCounter()

    for graph in self.graphs:
        for edge in graph.get_edges():
            self.__basic_features_dictionaries__(edge)

    self.__add_basic_missing_tags__()
def f_create_vocab(self, vocab_obj):
    # assert self.split == 'train', "Vocabulary can only be created for training file."

    tokenizer = TweetTokenizer(preserve_case=False)

    w2c = OrderedCounter()
    w2i = dict()
    i2w = dict()

    special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    with open(self.m_raw_train_data_path, 'r') as file:
        max_i = 0
        for i, line in enumerate(file):
            words = tokenizer.tokenize(line)
            w2c.update(words)
            max_i = i
            if i > self.m_max_line:
                break
        print("max_i", max_i)

    for w, c in w2c.items():
        if c > self.m_min_occ and w not in special_tokens:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)
    assert len(w2i) == len(i2w)

    print("Vocabulary of %i keys created." % len(w2i))

    vocab = dict(w2i=w2i, i2w=i2w)
    with io.open(os.path.join(self.m_data_dir, self.m_vocab_file), 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))

    print("len(i2w)", len(i2w))

    vocab_obj.f_set_vocab(w2i, i2w)
def _create_vocab(self): assert self.split == 'train', "Vocabulary can only be created for training file." tokenizer = TweetTokenizer(preserve_case=False) w2c = OrderedCounter() w2i = dict() i2w = dict() special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>'] for st in special_tokens: i2w[len(w2i)] = st w2i[st] = len(w2i) df = pandas.read_csv(self.raw_data_path) for i in range(len(df)): poem = df.iloc[i]["Poem"] poem = poem.replace("\r\r\n", " <nl> ") words = tokenizer.tokenize(poem) # Filter out poems that don't have newlines if words.count('<nl>') <= 2: continue w2c.update(words) for w, c in w2c.items(): if c >= self.min_occ and w not in special_tokens: i2w[len(w2i)] = w w2i[w] = len(w2i) assert len(w2i) == len(i2w) print("Vocabulary of %i keys created." % len(w2i)) vocab = dict(w2i=w2i, i2w=i2w) with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace')) self._load_vocab()
def create_vocab(self, hints, test_hints):
    w2i = dict()
    i2w = dict()
    w2c = OrderedCounter()

    special_tokens = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    for hint in hints:
        hint_tokens = hint.split()
        w2c.update(hint_tokens)

    if test_hints is not None:
        for hint in test_hints:
            hint_tokens = hint.split()
            w2c.update(hint_tokens)

    # Sort tokens so that different instantiations of the dataset are compatible.
    for w, c in sorted(list(w2c.items())):
        i2w[len(w2i)] = w
        w2i[w] = len(w2i)

    assert len(w2i) == len(i2w) == len(w2c) + len(special_tokens)

    vocab = dict(w2i=w2i, i2w=i2w, w2c=w2c)
    self.vocab = vocab
    logging.info('Created vocab with %d words.' % len(w2c))
def create_vocab(self, hints, test_hints):
    w2i = dict()
    i2w = dict()
    w2c = OrderedCounter()

    special_tokens = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    for hint in hints:
        hint_tokens = hint.split()
        w2c.update(hint_tokens)

    if test_hints is not None:
        for hint in test_hints:
            hint_tokens = hint.split()
            w2c.update(hint_tokens)

    for w, c in list(w2c.items()):
        i2w[len(w2i)] = w
        w2i[w] = len(w2i)

    assert len(w2i) == len(i2w)

    vocab = dict(w2i=w2i, i2w=i2w)
    self.vocab = vocab
    logging.info('Created vocab with %d words.' % len(w2c))
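Once built, the w2i/i2w maps are typically used to numericalize token sequences, falling back to the <unk> index for out-of-vocabulary words and padding to a fixed length (the _create_data routine of the Yelp class further below does exactly this). A minimal, hypothetical helper sketching that pattern; encode is not part of the snippets above, and whitespace splitting stands in for whatever tokenizer a given project uses:

def encode(tokens, w2i, max_len,
           pad='<pad>', sos='<sos>', eos='<eos>', unk='<unk>'):
    """Illustrative sketch: map a token list to fixed-length input/target id lists."""
    inputs = [sos] + tokens[:max_len - 1]
    targets = tokens[:max_len - 1] + [eos]
    length = len(inputs)
    inputs += [pad] * (max_len - length)
    targets += [pad] * (max_len - length)
    input_ids = [w2i.get(w, w2i[unk]) for w in inputs]
    target_ids = [w2i.get(w, w2i[unk]) for w in targets]
    return input_ids, target_ids, length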
def _create_vocab(self): assert self.split == 'train', "Vocablurary can only be created for training file." tokenizer = TweetTokenizer(preserve_case=False) w2c = OrderedCounter() w2i = dict() i2w = dict() special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>'] for st in special_tokens: i2w[len(w2i)] = st w2i[st] = len(w2i) #print("PATH: ", self.raw_data_path) if self.rows > 0: file = pd.read_csv(self.raw_data_path, nrows=self.rows)['text'] else: file = pd.read_csv(self.raw_data_path)['text'] #print("Data size: ", file.shape, ) file = file.dropna(axis=0) for i, line in enumerate(file): #if i == 27054: # continue words = tokenizer.tokenize(line) w2c.update(words) for w, c in w2c.items(): if c > self.min_occ and w not in special_tokens: i2w[len(w2i)] = w w2i[w] = len(w2i) assert len(w2i) == len(i2w) #print("Vocabulary of %i keys created." % len(w2i)) vocab = dict(w2i=w2i, i2w=i2w) with io.open(os.path.join(self.vocab_directory, self.vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace')) self._load_vocab()
def _create_vocab(self, vocab_obj, train_reviews):
    tokenizer = TweetTokenizer(preserve_case=False)

    w2c = OrderedCounter()
    w2i = dict()
    i2w = dict()

    special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    # train_reviews = train_df.review
    max_line = self.m_max_line
    line_i = 0
    for review in train_reviews:
        words = tokenizer.tokenize(review)
        w2c.update(words)

        if line_i > max_line:
            break
        line_i += 1

    print("max line", max_line)

    for w, c in w2c.items():
        if c > self.m_min_occ and w not in special_tokens:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

    # print("vocabulary of %i keys created" % len(w2i))
    # vocab = dict(w2i=w2i, i2w=i2w)
    # with io.open(os.path.join(self.m_data_dir, self.m_vocab_file), 'wb') as vocab_file:
    #     data = json.dumps(vocab, ensure_ascii=False)
    #     vocab_file.write(data.encode('utf8', 'replace'))

    print("len(i2w)", len(i2w))

    vocab_obj.f_set_vocab(w2i, i2w)
def _create_vocab(self, dataset_raw_file, **kwargs):
    assert self.split == 'train', "Vocabulary can only be created from training file."

    tokenizer = TweetTokenizer(preserve_case=False)

    w2c = OrderedCounter()
    w2i = dict()
    i2w = dict()

    # add special tokens to vocab
    special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    with open(dataset_raw_file, 'r') as file:
        # read data and count token occurrences
        for line in file.readlines():
            tokens = tokenizer.tokenize(line)
            w2c.update(tokens)

    # create vocab with tokens above the minimum occurrence threshold
    for w, c in w2c.items():
        if c > self.min_occ:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

    vocab = dict(w2i=w2i, i2w=i2w)

    # save vocab to file
    vocab_file_path = os.path.join(self.root, 'vocab.json')
    with io.open(vocab_file_path, 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))

    print("Vocabulary created with %i tokens. Minimum occurrence criterion = %i."
          % (len(w2i), self.min_occ))

    self._load_vocab()
def _create_vocab(self): assert self.split == 'train', "Vocablurary can only be created for training file." w2c = OrderedCounter() w2i = dict() i2w = dict() special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>'] for st in special_tokens: i2w[len(w2i)] = st w2i[st] = len(w2i) # NOTE: vocab中 で特殊トークンはConditionEncoderでも横断的に現れそうなので、 # そのidは統一しておきたい assert w2i['<pad>'] == PAD_INDEX assert w2i['<unk>'] == UNK_INDEX assert w2i['<sos>'] == SOS_INDEX assert w2i['<eos>'] == EOS_INDEX with open(self.raw_data_path, 'r') as file: for i, line in enumerate(file): words = self.tokenize(line) w2c.update(words) for w, c in w2c.items(): if c > self.min_occ and w not in special_tokens: i2w[len(w2i)] = w w2i[w] = len(w2i) assert len(w2i) == len(i2w) print("Vocablurary of %i keys created." % len(w2i)) vocab = dict(w2i=w2i, i2w=i2w) with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace')) self._load_vocab()
def _create_combined_vocab(self):
    # this function uses both snli + yelp to create vocab
    assert self.split == 'train', "Vocabulary can only be created for training file."

    tokenizer = TweetTokenizer(preserve_case=False)

    w2c = OrderedCounter()
    w2i = dict()
    i2w = dict()

    special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    # first for yelp
    with open(self.yelp_raw_data_path, 'r') as file:
        for i, line in enumerate(tqdm(file, total=self.num_lines)):
            if i == self.num_lines:
                break
            words = tokenizer.tokenize(line)
            w2c.update(words)

    # now for snli
    with open(self.snli_raw_data_path, 'r') as file:
        for i, line in enumerate(tqdm(file, total=self.num_lines)):
            if i == self.num_lines:
                break
            words = tokenizer.tokenize(line)
            w2c.update(words)

    for w, c in tqdm(w2c.items()):
        if c > self.min_occ and w not in special_tokens:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)
    assert len(w2i) == len(i2w)

    print("Vocabulary of %i keys created." % len(w2i))

    vocab = dict(w2i=w2i, i2w=i2w)
    with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))

    self._load_vocab()
class Yelp(Dataset):

    def __init__(self, split, create_data=False, have_vocab=False, **kwargs):
        super().__init__()
        self.data_dir = "./data/yelp/"
        self.save_model_path = "./saved_vae_models"
        self.split = split
        if split == "train":
            self.num_lines_0 = 176787
            self.num_lines_1 = 267314
        else:
            self.num_lines_0 = 50278
            self.num_lines_1 = 76392
        self.filter_sentiment_words = True
        self.filter_stop_words = True
        self.embedding_size = 300
        self.max_sequence_length = 15
        self.min_occ = kwargs.get('min_occ', 2)
        self.have_vocab = have_vocab
        self.raw_data_path = "./data/yelp/sentiment." + split + '.'
        self.preprocessed_data_file = 'yelp.' + split + '.json'
        self.vocab_file = 'yelp.vocab.json'
        self.path_to_w2v_embds = './data/yelp/yelp_w2v_embeddings'
        self.path_to_w2v_weights = './data/yelp/yelp_w2v_weights'

        if create_data:
            print("Creating new %s yelp data." % split.upper())
            self._create_data()
        elif not os.path.exists(os.path.join(self.data_dir, self.preprocessed_data_file)):
            print("%s preprocessed file not found at %s. Creating new." % (
                split.upper(), os.path.join(self.data_dir, self.preprocessed_data_file)))
            self._create_data()
        else:
            print("Found preprocessed files, no need to create data!")
            self._load_data()

        # load bow vocab
        with open("./data/yelp/bow.json") as f:
            self.bow_filtered_vocab_indices = json.load(f)

        self.bow_hidden_dim = len(self.bow_filtered_vocab_indices)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        idx = str(idx)
        return {
            'input': np.asarray(self.data[idx]['input']),
            'target': np.asarray(self.data[idx]['target']),
            'bow': self._get_bow_representations(self.data[idx]['input']),
            # 'label': np.asarray(self.data[idx]['label']),
            # we need to make it 2 dim to match the predicted label dim.
            'label': np.asarray([1 - self.data[idx]['label'], self.data[idx]['label']]),
            'length': self.data[idx]['length']
        }

    @property
    def vocab_size(self):
        return len(self.w2i)

    @property
    def pad_idx(self):
        return self.w2i['<pad>']

    @property
    def sos_idx(self):
        return self.w2i['<sos>']

    @property
    def eos_idx(self):
        return self.w2i['<eos>']

    @property
    def unk_idx(self):
        return self.w2i['<unk>']

    def get_w2i(self):
        return self.w2i

    def get_i2w(self):
        return self.i2w

    def _load_data(self, vocab=True):
        print("loading preprocessed json data...")
        with open(os.path.join(self.data_dir, self.preprocessed_data_file), 'r') as file:
            self.data = json.load(file)
        if vocab:
            with open(os.path.join(self.data_dir, self.vocab_file), 'r') as file:
                vocab = json.load(file)
            self.w2i, self.i2w = vocab['w2i'], vocab['i2w']

    def _load_vocab(self):
        with open(os.path.join(self.data_dir, self.vocab_file), 'r') as vocab_file:
            vocab = json.load(vocab_file)
        self.w2i, self.i2w = vocab['w2i'], vocab['i2w']

    def _create_data(self):
        if not self.have_vocab and self.split == 'train':
            print("creating vocab for train!")
            self._create_vocab()
            print("finished creating vocab!")

            print("creating bow vocab for train!")
            self.create_bow_vocab(self.w2i)
            print("finished creating bow vocab!")

            print("creating w2v embs matrix")
            self.create_w2v_weight_matrix()
            print("finished creating w2v embs matrix!")
        else:
            self._load_vocab()
            print("loaded vocab from mem!")

        tokenizer = TweetTokenizer(preserve_case=False)
        data = defaultdict(dict)

        labels = ['0', '1']
        for l in labels:
            print("import data with label {}".format(l))
            file = open(self.raw_data_path + l, 'r')
            num_lines = self.num_lines_0 if l == '0' else self.num_lines_1
            for i, line in enumerate(tqdm(file, total=num_lines)):
                if i == num_lines:
                    break

                words = tokenizer.tokenize(line)
                # filter out sentences longer than this limit
                if len(words) > self.max_sequence_length:
                    continue

                input = ['<sos>'] + words
                input = input[:self.max_sequence_length]

                target = words[:self.max_sequence_length - 1]
                target = target + ['<eos>']

                assert len(input) == len(target), "%i, %i" % (len(input), len(target))
                length = len(input)

                input.extend(['<pad>'] * (self.max_sequence_length - length))
                target.extend(['<pad>'] * (self.max_sequence_length - length))

                input = [self.w2i.get(w, self.w2i['<unk>']) for w in input]
                target = [self.w2i.get(w, self.w2i['<unk>']) for w in target]

                id = len(data)
                data[id]['input'] = input
                data[id]['label'] = int(l)
                data[id]['target'] = target
                data[id]['length'] = length
            file.close()

        # shuffle the combined data
        print("Shuffling the combined data!")
        data = self.shuffle(data)

        with io.open(os.path.join(self.data_dir, self.preprocessed_data_file), 'wb') as preprocessed_data_file:
            data = json.dumps(data, ensure_ascii=False)
            preprocessed_data_file.write(data.encode('utf8', 'replace'))

        self._load_data(vocab=False)

    def shuffle(self, data):
        keys = [i for i in range(len(data))]
        random.shuffle(keys)
        data_shuffled = defaultdict(dict)
        i = 0
        for k in keys:
            if data[k] is None:
                print("error in shuffle")
                exit()
            data_shuffled[i] = data[k]
            i = i + 1
        return data_shuffled

    def _create_vocab(self):
        assert self.split == 'train', "Vocabulary can only be created for training file."
        tokenizer = TweetTokenizer(preserve_case=False)

        self.w2c = OrderedCounter()
        self.w2i = dict()
        self.i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            self.i2w[len(self.w2i)] = st
            self.w2i[st] = len(self.w2i)

        labels = ['0', '1']
        for l in labels:
            print("updating vocab with sentences of label {}".format(l))
            file = open(self.raw_data_path + l, 'r')
            num_lines = self.num_lines_0 if l == '0' else self.num_lines_1
            for i, line in enumerate(tqdm(file, total=num_lines)):
                if i == num_lines:
                    break
                words = tokenizer.tokenize(line)
                if len(words) > self.max_sequence_length:
                    continue
                self.w2c.update(words)
            file.close()
        print("done creating w2c")

        for w, c in tqdm(self.w2c.items()):
            if c > self.min_occ and w not in special_tokens:
                self.i2w[len(self.w2i)] = w
                self.w2i[w] = len(self.w2i)
        print("done creating w2i")
        assert len(self.w2i) == len(self.i2w)

        print("Vocabulary of %i keys created." % len(self.w2i))

        vocab = dict(w2i=self.w2i, i2w=self.i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
        self.v_size = len(self.w2i)

    def create_w2v_weight_matrix(self):
        self.emb_matrix = np.zeros((self.v_size, self.embedding_size))

        # load the pretrained word embeddings
        w2v_model = KeyedVectors.load_word2vec_format(self.path_to_w2v_embds)

        found = 0
        not_found = 0
        for index in range(self.v_size):
            word = self.i2w[str(index)]
            if w2v_model.has_index_for(word):
                self.emb_matrix[index] = w2v_model.get_vector(word)
                found += 1
            else:
                self.emb_matrix[index] = np.random.randn(self.embedding_size)
                # print("word: {} was not found ".format(word))
                not_found += 1

        np.save(self.path_to_w2v_weights, self.emb_matrix)
        print("Done creating w2v embedding matrix. {} found and {} unfound".format(found, not_found))

    def _get_bow_representations(self, text_sequence):
        """
        Returns the BoW representation of a sequence.
        """
        # self.bow_hidden_dim = len(self.bow_filtered_vocab_indices)
        sequence_bow_representation = np.zeros(shape=self.bow_hidden_dim, dtype=np.float32)

        # Iterate over each word in the sequence
        for index in text_sequence:
            if str(index) in self.bow_filtered_vocab_indices:
                bow_index = self.bow_filtered_vocab_indices[str(index)]
                sequence_bow_representation[bow_index] += 1

        # removing normalisation because the loss becomes too low with it;
        # it won't change correctness anyway
        sequence_bow_representation /= np.max([np.sum(sequence_bow_representation), 1])

        return np.asarray(sequence_bow_representation)

    def create_bow_vocab(self, word_index):
        """
        Creates a dict of vocab indices of non-stopwords and non-sentiment words.
        """
        blacklisted_words = set()
        bow_filtered_vocab_indices = dict()
        # The '|' operator on sets in python acts as a union operator
        # blacklisted_words |= set(self.predefined_word_index.values())

        if self.filter_sentiment_words:
            blacklisted_words |= self._get_sentiment_words()

        if self.filter_stop_words:
            blacklisted_words |= self._get_stopwords()

        allowed_vocab = word_index.keys() - blacklisted_words
        i = 0
        for word in allowed_vocab:
            vocab_index = word_index[word]
            bow_filtered_vocab_indices[vocab_index] = i
            i += 1

        self.bow_hidden_dim = len(allowed_vocab)
        print("Created word index blacklist for BoW")
        print("BoW size: {}".format(self.bow_hidden_dim))

        # saving bow vocab
        with open('./data/yelp/bow.json', 'w') as json_file:
            json.dump(bow_filtered_vocab_indices, json_file)
        print("Saved bow.json at {}".format('./data/yelp/bow.json'))

    def _get_sentiment_words(self):
        """
        Returns all the sentiment words (positive and negative) which are
        excluded from the main vocab to form the BoW vocab.
        """
        with open(file='./data/lexicon/positive-words.txt', mode='r',
                  encoding='ISO-8859-1') as pos_sentiment_words_file, \
             open(file='./data/lexicon/negative-words.txt', mode='r',
                  encoding='ISO-8859-1') as neg_sentiment_words_file:
            pos_words = pos_sentiment_words_file.readlines()
            neg_words = neg_sentiment_words_file.readlines()
            words = pos_words + neg_words
        words = set(word.strip() for word in words)
        return words

    def _get_stopwords(self):
        """
        Returns all the stopwords which are excluded from the main vocab
        to form the BoW vocab.
        """
        nltk_stopwords = set(stopwords.words('english'))
        sklearn_stopwords = stop_words.ENGLISH_STOP_WORDS

        all_stopwords = set()
        # The '|' operator on sets in python acts as a union operator.
        # NOTE: spacy_stopwords is assumed to be defined at module level
        # (e.g. spacy.lang.en.stop_words.STOP_WORDS).
        all_stopwords |= spacy_stopwords
        all_stopwords |= nltk_stopwords
        all_stopwords |= sklearn_stopwords

        return all_stopwords
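Since the Yelp class above implements __len__ and __getitem__ and pads every example to max_sequence_length, it can be fed directly to a standard PyTorch DataLoader. A minimal usage sketch; the batch size and constructor flags below are illustrative assumptions, not values from the source:

from torch.utils.data import DataLoader

# Hypothetical usage: build the training split and iterate over collated batches.
train_ds = Yelp(split='train', create_data=False)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

for batch in train_loader:
    inputs = batch['input']    # (batch, max_sequence_length) token ids
    targets = batch['target']  # shifted targets ending in <eos>
    bow = batch['bow']         # bag-of-words vector over the filtered BoW vocab
    labels = batch['label']    # two-dimensional (1 - label, label) encoding
    break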
def _create_vocab(self, bert):
    assert self.split == 'train', "Vocabulary can only be created for training file."

    tokenizer = TweetTokenizer(preserve_case=False)

    w2c = OrderedCounter()
    a2c = OrderedCounter()
    w2i = dict()
    i2w = dict()
    a2i = dict()
    i2a = dict()

    if bert:
        self.pad = '[PAD]'
        self.unk = '[UNK]'
        self.sos = '[CLS]'
        self.eos = '[SEP]'
    else:
        self.pad = '[PAD]'
        self.unk = '[UNK]'
        self.sos = '[SOS]'
        self.eos = '[EOS]'

    special_tokens = [self.pad, self.unk, self.sos, self.eos]
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)
        i2a[len(a2i)] = st
        a2i[st] = len(a2i)

    with open(self.raw_definition_path, 'r') as file:
        for i, line in enumerate(file):
            words = tokenizer.tokenize(line)
            w2c.update(words)

    for w, c in w2c.items():
        if c > self.min_occ and w not in special_tokens:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)
    assert len(w2i) == len(i2w)

    with open(self.raw_word_path, 'r') as file:
        for i, line in enumerate(file):
            words = list(line.strip())
            a2c.update(words)

    for w, c in a2c.items():
        if c > self.min_occ and w not in special_tokens:
            i2a[len(a2i)] = w
            a2i[w] = len(a2i)
    assert len(a2i) == len(i2a)

    print("Vocabulary of %i keys created." % len(w2i))
    print("Alphabet of %i keys created." % len(a2i))

    vocab = dict(w2i=w2i, i2w=i2w, a2i=a2i, i2a=i2a)
    with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))

    self._load_vocab(bert)
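The builder above maintains two maps: w2i over definition tokens and a2i over the characters of the defined words. A small hypothetical sketch of how a (word, definition) pair might be numericalized with them; the helper name is an assumption, and whitespace splitting stands in for the TweetTokenizer used above:

def encode_pair(word, definition, w2i, a2i, unk='[UNK]'):
    """Illustrative sketch: map a word to character ids and its definition to token ids."""
    char_ids = [a2i.get(ch, a2i[unk]) for ch in word.strip()]
    token_ids = [w2i.get(tok, w2i[unk]) for tok in definition.lower().split()]
    return char_ids, token_ids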