def get_augmented_data(train_txt_path, augmentation, alpha, n_aug=1):
    # Cache augmented sentences so each (augmentation, alpha) pair is only computed once.
    output_pkl_path = train_txt_path.parent.joinpath(
        f"train_aug_{augmentation}_data_{alpha}.pkl")
    if not output_pkl_path.exists():
        print(f"creating {output_pkl_path}")
        with open(train_txt_path, 'r') as f:
            lines = f.readlines()
        sentence_to_augmented_sentences = {}
        for line in lines:
            parts = line[:-1].split('\t')  # format: label \t sentence
            sentence = parts[1]
            if augmentation == 'swap':
                augmented_sentences = eda.get_swap_sentences(sentence, n_aug, alpha)
            elif augmentation == 'insert':
                augmented_sentences = eda.get_insert_sentences(sentence, n_aug, alpha)
            elif augmentation == 'delete':
                # assumes eda provides get_delete_sentences, mirroring the
                # swap/insert helpers (the original branch called
                # get_insert_sentences, an apparent copy-paste slip)
                augmented_sentences = eda.get_delete_sentences(sentence, n_aug, alpha)
            else:
                raise ValueError(f"unknown augmentation: {augmentation}")
            sentence_to_augmented_sentences[sentence] = augmented_sentences
        utils_common.save_pickle(output_pkl_path, sentence_to_augmented_sentences)
    return utils_common.load_pickle(output_pkl_path)
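
# For context, a minimal sketch of what an EDA-style swap helper (like
# eda.get_swap_sentences above) typically does; the real eda module is not
# shown here, so the exact signature and behavior are assumptions based on
# Wei & Zou's EDA: swap roughly alpha * len(words) random word pairs and
# return n_aug augmented copies.
import random

def _swap_sentences_sketch(sentence, n_aug, alpha):
    words = sentence.split()
    if len(words) < 2:
        return [sentence] * n_aug
    n_swaps = max(1, int(alpha * len(words)))
    augmented_sentences = []
    for _ in range(n_aug):
        new_words = list(words)
        for _ in range(n_swaps):
            # Pick two distinct positions and swap them.
            i, j = random.sample(range(len(new_words)), 2)
            new_words[i], new_words[j] = new_words[j], new_words[i]
        augmented_sentences.append(' '.join(new_words))
    return augmented_sentences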

def get_x_y(txt_path, embedding_path):
    # Load (label \t sentence) lines and look up each sentence's precomputed embedding.
    with open(txt_path) as f:
        lines = f.readlines()
    string_to_embedding = utils_common.load_pickle(embedding_path)
    x = np.zeros((len(lines), 768))  # 768 = BERT-base hidden size
    y = np.zeros((len(lines), ))
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = int(parts[0])
        string = parts[1]
        assert string in string_to_embedding
        x[i, :] = string_to_embedding[string]
        y[i] = label
    x, y = shuffle(x, y, random_state=0)
    return x, y
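
# A minimal usage sketch for get_x_y (the paths here are hypothetical; in
# this repo they would come from config): fit a linear probe on the frozen
# BERT features.
# from sklearn.linear_model import LogisticRegression
# train_x, train_y = get_x_y(train_txt_path, embedding_path)
# test_x, test_y = get_x_y(test_txt_path, embedding_path)
# clf = LogisticRegression(max_iter=1000).fit(train_x, train_y)
# print(clf.score(test_x, test_y))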

def get_split_train_embedding_dict(sentence_to_augmented_sentences,
                                   train_txt_path, augmentation, alpha):
    # Cache BERT embeddings for every original and augmented sentence.
    embeddings_dict_path = train_txt_path.parent.joinpath(
        f"train_aug_{augmentation}_embeddings_{alpha}.pkl")
    if not embeddings_dict_path.exists():
        print(f"creating {embeddings_dict_path}")
        string_to_embedding = {}
        for sentence, augmented_sentences in tqdm(
                sentence_to_augmented_sentences.items()):
            string_to_embedding[sentence] = get_embedding(sentence, tokenizer, model)
            for augmented_sentence in augmented_sentences:
                string_to_embedding[augmented_sentence] = get_embedding(
                    augmented_sentence, tokenizer, model)
        utils_common.save_pickle(embeddings_dict_path, string_to_embedding)
    return utils_common.load_pickle(embeddings_dict_path)

    return last_hidden_states


def save_word_to_embedding_pickle(word_to_aoa, output_path):
    # Embed every word that appears both in the BERT vocab and in the
    # AoA / abstractness lexicon.
    bert_vocab = tokenizer.get_vocab()
    print(f"{len(bert_vocab)} in bert vocab, "
          f"{len(word_to_aoa)} in aoa or abstractness")
    word_to_embedding = {}
    for word in tqdm(list(word_to_aoa.keys())):
        if word in bert_vocab:
            word_to_embedding[word] = get_embedding(word, tokenizer, model)
    # save_pickle takes (path, obj), matching its use elsewhere in this repo.
    utils_common.save_pickle(output_path, word_to_embedding)
    print(f"{len(word_to_embedding)} words saved")


if __name__ == "__main__":
    # word_to_aoa = utils_common.load_pickle(config.aoa_dict_path)
    # save_word_to_embedding_pickle(word_to_aoa, config.aoa_embedding_path)
    # word_to_embedding = utils_common.load_pickle(config.aoa_embedding_path)

    word_to_abstractness = utils_common.load_pickle(
        config.abstractness_dict_path)
    save_word_to_embedding_pickle(word_to_abstractness,
                                  config.abstractness_embedding_path)
    word_to_embedding = utils_common.load_pickle(
        config.abstractness_embedding_path)
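
# The body of get_embedding is not shown in this excerpt (only its final
# `return last_hidden_states` survives above). For reference, a minimal
# sketch of what such a function could look like with the HuggingFace
# transformers API; the pooling choice (taking the [CLS] vector) is an
# assumption, not necessarily what this repo does.
import torch

def _get_embedding_sketch(text, tokenizer, model):
    # Tokenize with special tokens and add a batch dimension.
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs[0]  # shape: (1, seq_len, 768) for BERT-base
    # Use the [CLS] token's final hidden state as the sentence embedding.
    return last_hidden_states[0, 0, :].numpy()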

def get_split_train_x_y(train_txt_path, train_subset, seed_num, setup, alpha):
    # Map each training setup to the augmentations it uses ('-mtl' variants
    # share the same augmentations but add an auxiliary task downstream).
    setup_to_augmentations = {
        'swap': ['swap'],
        'delete': ['delete'],
        'insert': ['insert'],
        'swap-mtl': ['swap'],
        'delete-mtl': ['delete'],
        'insert-mtl': ['insert'],
        'three_aug': ['delete', 'insert', 'swap'],
        'three_aug-mtl': ['delete', 'insert', 'swap'],
        'vanilla': []}
    augmentations = setup_to_augmentations[setup]

    # Collect augmented sentences and their embeddings for each augmentation.
    big_dict_aug_sentences = {}
    big_dict_embeddings = {}
    for augmentation in augmentations:
        sentence_to_augmented_sentences = get_augmented_data(
            train_txt_path, augmentation, alpha)
        string_to_embedding = utils_bert.get_split_train_embedding_dict(
            sentence_to_augmented_sentences, train_txt_path, augmentation, alpha)
        big_dict_aug_sentences[augmentation] = sentence_to_augmented_sentences
        big_dict_embeddings[augmentation] = string_to_embedding

    sentence_to_label = get_sentence_to_label(train_txt_path)
    sentences = list(sentence_to_label.keys())
    labels = [sentence_to_label[sentence] for sentence in sentences]
    original_sentence_to_embedding = utils_common.load_pickle(
        train_txt_path.parent.joinpath("train_embeddings.pkl"))

    # Stratified subsample of the training set.
    train_sentences, _, train_labels, _ = train_test_split(
        sentences, labels, train_size=train_subset,
        random_state=seed_num, stratify=labels)

    # Build train_x_np: original embeddings first, then one block per augmentation.
    train_x = []
    aug_train_x_dict = {augmentation: [] for augmentation in augmentations}
    for train_sentence in train_sentences:
        train_x.append(original_sentence_to_embedding[train_sentence])
        for augmentation in augmentations:
            sentence_to_augmented_sentences = big_dict_aug_sentences[augmentation]
            string_to_embedding = big_dict_embeddings[augmentation]
            # n_aug=1, so take the first (only) augmented sentence.
            aug_sentence = sentence_to_augmented_sentences[train_sentence][0]
            aug_train_x_dict[augmentation].append(string_to_embedding[aug_sentence])
    for augmentation in augmentations:
        train_x += aug_train_x_dict[augmentation]
    train_x_np = np.asarray(train_x)

    # Build train_y_np: original labels, duplicated once per augmentation block.
    train_labels_dup = list(train_labels)
    for _ in augmentations:
        train_labels_dup += train_labels
    train_y_np = np.asarray(train_labels_dup)

    # Build auxiliary labels: one class for the original embeddings plus one
    # per augmentation, assigned block by block in the same order as train_x.
    num_classes_aux = 1 + len(augmentations)
    train_labels_aux = []
    for y_aux in range(num_classes_aux):
        for _ in range(len(train_sentences)):
            train_labels_aux.append(y_aux)
    train_y_aux_np = np.asarray(train_labels_aux)

    return train_x_np, train_y_np, train_y_aux_np, num_classes_aux
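
# Worked example of the returned shapes (hypothetical numbers): with 100
# sampled training sentences and setup='three_aug-mtl' (augmentations =
# ['delete', 'insert', 'swap']), train_x_np stacks 4 blocks of 100
# embeddings each (original + one block per augmentation), giving shape
# (400, 768); train_y_np repeats the 100 class labels 4 times; and
# train_y_aux_np is [0]*100 + [1]*100 + [2]*100 + [3]*100, marking which
# block each row came from, with num_classes_aux == 4.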