def separate_vocab(vocab_path, roots):
    vocab_name = os.path.splitext(vocab_path)[0]
    vocab = get_vocab(vocab_path)
    print("Got vocabulary with {:d} elements.".format(len(vocab)))
    write_train_test_vocabs(vocab, roots, vocab_name)
    print("Divided vocabulary for train and test.")
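# get_vocab is not defined in this snippet. A minimal sketch of what a
# file-based variant could look like, assuming a plain-text vocabulary file
# with one token per line (the name and file format are illustrative):
def get_vocab_sketch(vocab_path):
    with open(vocab_path, encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}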
def data_process(config):
    train_data, test_data = util.get_data(config['data_name'])
    vocab2index = util.get_vocab(
        train_data["text"] + test_data["text"], max_size=config["vocab_size"])
    train_data = train_data.map(lambda e: util.encode_sentence(
        e["text"], vocab2index, config))
    train_data.set_format(type='torch', columns=['input_ids', 'label'])
    test_data = test_data.map(lambda e: util.encode_sentence(
        e["text"], vocab2index, config))
    test_data.set_format(type='torch', columns=['input_ids', 'label'])
    train_dl = DataLoader(
        train_data, batch_size=config['batch_size'], shuffle=True)
    valid_dl = DataLoader(test_data, batch_size=config['batch_size'])
    pretrained_emb = util.load_glove('glove.6B.300d.txt')
    pretrained_embeddings = util.get_emb_matrix(
        pretrained_emb, vocab2index, emb_size=config['embed_dim'])
    keywords_matrix = [pretrained_emb[k] for k in config["keywords"]]
    related_embeddings = util.create_relatedness_matrix(
        keywords_matrix, pretrained_embeddings)
    print(f'embedding matrix shape: {pretrained_embeddings.shape}')
    print(f'relatedness matrix shape: {related_embeddings.shape}')
    return train_dl, valid_dl, pretrained_embeddings, related_embeddings
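# util.encode_sentence is only referenced above. A plausible sketch of what it
# might do, assuming fixed-length padding/truncation and an <UNK> fallback
# (the 'max_len' key, the index values, and the name are illustrative):
def encode_sentence_sketch(text, vocab2index, config, unk_index=1, pad_index=0):
    tokens = text.lower().split()
    ids = [vocab2index.get(tok, unk_index) for tok in tokens[:config['max_len']]]
    ids = ids + [pad_index] * (config['max_len'] - len(ids))
    # returning a dict lets datasets.map add an 'input_ids' column
    return {'input_ids': ids}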
    a label: int 1 or 0
'''
raw_train_vua = data_parser.load_raw_train_vua()
raw_test_vua = data_parser.load_raw_test_vua()
print('VUA dataset division: ', len(raw_train_vua), len(raw_test_vua))

"""
2. Data preparation
"""
'''
2. 1 get vocabulary and glove embeddings in raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_train_vua + raw_test_vua)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train2.hdf5', 'r')
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)

'''
2. 2 embed the datasets
'''
random.seed(0)
for i in range(len(raw_train_vua)):
    raw_train_vua[i][2] = index_sequence(pos2idx, raw_train_vua[i][2])
for i in range(len(raw_val_vua)):
    raw_val_vua[i][2] = index_sequence(pos2idx, raw_val_vua[i][2])
print('size of training set, validation set: ', len(raw_train_vua), len(raw_val_vua))

"""
2. Data preparation
"""
'''
2. 1 get vocabulary and glove embeddings in raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_train_vua)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# no suffix embeddings for sequence labeling
suffix_embeddings = None

'''
2. 2 embed the datasets
'''
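# get_word2idx_idx2word is not shown in these snippets. A minimal sketch
# consistent with the "<PAD>: 0, <UNK>: 1" comment above (illustrative, not
# necessarily the repository's exact implementation):
def get_word2idx_idx2word_sketch(vocab):
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    idx2word = {0: '<PAD>', 1: '<UNK>'}
    for word in sorted(vocab):  # sorted for a deterministic index assignment
        idx = len(word2idx)
        word2idx[word] = idx
        idx2word[idx] = word
    return word2idx, idx2word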
        verb_label = int(line[5])
        label_seq[verb_idx] = verb_label
        # idx2pos = {0: 'words that are not focus verbs', 1: 'focus verb'}
        pos_seq[verb_idx] = 1
        raw_mohx.append([sentence.strip(), label_seq, pos_seq])

print('MOH-X dataset division: ', len(raw_mohx))

"""
2. Data preparation
"""
'''
2. 1 get vocabulary and glove embeddings in raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_mohx)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# set elmos_mohx = None to exclude elmo vectors; the embedding_dim in the later
# model initialization also needs to change in that case
elmos_mohx = h5py.File('../elmo/MOH-X_cleaned.hdf5', 'r')

'''
2. 2 embed the datasets
'''
random.seed(0)
random.shuffle(raw_mohx)
for word_seq, publication_id in raw_test_rcc:
    output.append({'publication_id': publication_id, 'sentence': word_seq})
logging.info("Writing on new csv file...")
writer.writerows(output)
logging.info(
    'size of test set: {}, annotated by brute-force test set: {}, to-be-found test set: {}'.format(
        len(raw_test_rcc) + len(test_annotated), len(test_annotated), len(raw_test_rcc)))

# logging.info("Read vocabulary info from {}".format(args.vocab_info_path))
# with open(args.vocab_info_path, "rb+") as infile:
#     word2idx, idx2word = pickle.load(infile)
vocab = get_vocab(raw_test_rcc + test_annotated)
word2idx, idx2word = get_word2idx_idx2word(vocab)
logging.info("Loading glove embeddings")
glove_embeddings = get_embedding_matrix(word2idx, idx2word, args, normalization=False)
if using_GPU:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", 0)
else:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", -1)

############
# labeling #
############
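# The ElmoEmbedder used above is allennlp's (allennlp.commands.elmo); the third
# constructor argument selects the CUDA device (0 = first GPU, -1 = CPU). A
# small usage sketch, with illustrative paths and tokens: embed_sentence
# returns a (3, num_tokens, 1024) array, one 1024-d vector per token for each
# of the three biLM layers.
from allennlp.commands.elmo import ElmoEmbedder

demo_elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", -1)
demo_vectors = demo_elmo.embed_sentence(["This", "is", "a", "sentence", "."])
print(demo_vectors.shape)  # expected: (3, 5, 1024)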
def label_data_with_vocab(vocab_path, file_path):
    vocab = get_vocab(vocab_path)
    files = do_dirty_labeling_for_file(file_path, os.path.splitext(file_path)[0], vocab)
    print("Divided dataset into train and test.")
    return files
# normal version
with open('../Poetry/poetry.csv') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        raw_poetry.append([line[1].strip(), int(line[2]), int(line[3])])
print('Poetry dataset size: ', len(raw_poetry))

"""
2. Data preparation
"""
'''
2. 1 get vocabulary and glove embeddings in raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_poetry)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_poetry = None
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)

'''
2. 2 embed the datasets
'''
random.seed(0)  # set a seed
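# get_embedding_matrix is not defined in these snippets. A rough sketch of the
# idea, assuming GloVe vectors already loaded into a dict; glove_dict,
# embedding_dim, and the optional L2 normalization are illustrative assumptions:
import numpy as np
import torch
import torch.nn as nn

def get_embedding_matrix_sketch(word2idx, idx2word, glove_dict,
                                embedding_dim=300, normalization=False):
    # start from small random vectors for words missing from GloVe
    matrix = np.random.uniform(-0.05, 0.05, (len(word2idx), embedding_dim))
    matrix[word2idx['<PAD>']] = 0.0  # keep the padding row at zero
    for word, idx in word2idx.items():
        if word in glove_dict:
            vec = np.asarray(glove_dict[word])
            if normalization:
                vec = vec / (np.linalg.norm(vec) + 1e-8)
            matrix[idx] = vec
    return nn.Embedding.from_pretrained(torch.tensor(matrix, dtype=torch.float))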
    a label: int 1 or 0
'''
raw_train_toefl = data_parser.load_raw_train_toefl()[0]
raw_test_toefl = data_parser.load_raw_test_toefl()[0]
print('TOEFL dataset division: ', len(raw_train_toefl), len(raw_test_toefl))

"""
2. Data preparation
"""
'''
2. 1 get vocabulary and glove embeddings in raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_train_toefl + raw_test_toefl)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_toefl = h5py.File('../elmo/TOEFL_train.hdf5', 'r')
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)

'''
2. 2 embed the datasets
'''
random.seed(0)
        labeled = line[3]
        assert (len(word_seq) == len(label_seq))
        raw_train_bruteforce_rcc.append(
            [publication_id, word_seq, label_seq, labeled])

train_rcc = []
for raw_sent, raw_sent_brute in zip(raw_train_rcc, raw_train_bruteforce_rcc):
    pub_id = raw_sent[0]
    word_seq = raw_sent[1]
    label_seq = raw_sent[2]
    if raw_sent[3] == 'N' and raw_sent_brute[3] == 'Y':
        label_seq = raw_sent_brute[2]
    assert (len(word_seq) == len(label_seq))
    train_rcc.append([word_seq, label_seq, pub_id])

vocab = get_vocab(train_rcc)
word2idx, idx2word = get_word2idx_idx2word(vocab)
logging.info("Loading glove embeddings")
glove_embeddings = get_embedding_matrix(word2idx, idx2word, args, normalization=False)
if using_GPU:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", 0)
else:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", -1)

# logging.info("embed test data with glove and elmo vectors")
# embedded_rcc = []
# for example in tqdm(train_rcc, total=len(train_rcc)):
# normal version
with open('../datasets/TroFi/TroFi_formatted_all3737.csv') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        raw_trofi.append([line[1].strip(), int(line[2]), int(line[3])])
print('TroFi dataset size: ', len(raw_trofi))

"""
2. Data preparation
"""
'''
2. 1 get vocabulary and glove embeddings in raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_trofi)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# set elmos_trofi = None to exclude elmo vectors
# elmos_trofi = h5py.File('../elmo/TroFi3737.hdf5', 'r')
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)

'''
2. 2 embed the datasets
'''
import csv

import numpy as np
import matplotlib

Gutenberg = []
with open('../Poetry/corpus_to_be_labeled.csv') as t:
    lines = csv.reader(t)
    next(lines)
    for line in lines:
        # Gutenberg.append([line[1].strip(), int(float(line[2])), int(float(line[3]))])
        Gutenberg.append([line[1].strip(), int(line[2])])
print('Poetry dataset size: ', len(Gutenberg))

# vocab is a set of words
vocab = get_vocab(Gutenberg)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_poetry = None
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)

'''
2. 2 embed the datasets
'''
# random.seed(0)  # set a seed
df = read_data()

# Splitting the data into train, validation and test
msk = np.random.rand(len(df)) < 0.8
df_train = df[msk]
msk2 = np.random.rand(len(df_train)) < 0.8
df_validate = df_train[~msk2]
df_train = df_train[msk2]
df_test = df[~msk]
print("Train size: %s" % len(df_train))
print("Validation size: %s" % len(df_validate))
print("Test size: %s" % len(df_test))

# Creating the English and Hebrew vocabularies
eng_vocab, rev_eng_vocab = get_vocab(df["english_sentences"],
                                     addtional_tokens=[PAD_TOKEN, OOV_TOKEN],
                                     top=None)
heb_vocab, rev_heb_vocab = get_vocab(df["hebrew_sentences"],
                                     addtional_tokens=[PAD_TOKEN, OOV_TOKEN,
                                                       SENTENCE_START_TOKEN,
                                                       SENTENCE_END_TOKEN],
                                     top=None)


def vectorize_dataset(df):
    """Vectorize the data into encoder input, decoder input and decoder target."""
    vect_eng_sentences = vectorize_sentences(df["english_sentences"], eng_vocab,
                                             encode=True, reverse=True)
    decoder_input_data = vectorize_sentences(df["hebrew_sentences"], heb_vocab,
                                             add_prefix_token=SENTENCE_START_TOKEN,
                                             encode=True)
    decoder_target_data = np.array(
        [np.concatenate((x[1:], [heb_vocab[SENTENCE_END_TOKEN]]), axis=0)
         for x in decoder_input_data])
    return vect_eng_sentences, decoder_input_data, decoder_target_data
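# vectorize_sentences is not shown here; a minimal sketch consistent with how
# it is called above. The tokenization and OOV handling are assumptions; vocab
# maps token -> index, and OOV_TOKEN/np come from the surrounding snippet.
def vectorize_sentences_sketch(sentences, vocab, add_prefix_token=None,
                               encode=True, reverse=False):
    vectorized = []
    for sentence in sentences:
        tokens = sentence.split()
        if add_prefix_token is not None:
            tokens = [add_prefix_token] + tokens
        if reverse:
            tokens = list(reversed(tokens))
        if encode:
            tokens = [vocab.get(tok, vocab[OOV_TOKEN]) for tok in tokens]
        vectorized.append(np.array(tokens))
    return vectorized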