'''  # NOTE(review): this delimiter appears to close a comment string opened above this section - confirm '''
raw_train_vua = data_parser.load_raw_train_vua()
raw_test_vua = data_parser.load_raw_test_vua()
print(
    'VUA dataset division: ',
    len(raw_train_vua),
    len(raw_test_vua),
)

# ----------------------------------------------------------------------
# 2. Data preparation
# ----------------------------------------------------------------------

# 2.1 Get vocabulary and GloVe embeddings for the raw dataset.

# The vocabulary is collected over the train and test splits together
# (the original note describes it as a set of words).
vocab = get_vocab(raw_train_vua + raw_test_vua)

# Two lookup dictionaries; per the original note, <PAD> maps to 0 and
# <UNK> maps to 1.
word2idx, idx2word = get_word2idx_idx2word(vocab)

# GloVe embeddings (described in the original note as an nn.Embedding),
# requested with normalization switched off.
glove_embeddings = get_embedding_matrix(
    word2idx,
    idx2word,
    normalization=False,
)

# ELMo embeddings file for the VUA training split, opened read-only.
# The handle is not closed anywhere in this section.
elmos_train_vua = h5py.File('../elmo/VUA_train2.hdf5', 'r')

# Suffix embeddings: there are 2 suffix tags and each embedding has
# dimension 50.
suffix_embeddings = nn.Embedding(2, 50)

# 2.2 Embed the datasets.

# Seed the generator first so the in-place shuffle of the training split
# produces the same order on every run.
random.seed(0)
random.shuffle(raw_train_vua)
# ----------------------------------------------------------------------
# Metaphor Identification (disabled)
# ----------------------------------------------------------------------
# The two calls below are kept commented out, as in the original; the
# same four names are instead taken from get_data() further down.
# train_sample_ms, train_sam_sen_ms = get_metaphor_feature(train_data)
# test_sample_ms, test_sam_sen_ms = get_metaphor_feature(test_data)

# Identifiers of the train and test splits handed to get_data().
train_dp = 'train_an'
test_dp = 'test_an'

# Each get_data() call yields four values: the data, its vocabulary, and
# the two metaphor-feature collections that are later passed to
# TextDataset.
(
    train_data,
    train_vocab,
    train_sample_ms,
    train_sam_sen_ms,
) = get_data(train_dp)
(
    test_data,
    test_vocab,
    test_sample_ms,
    test_sam_sen_ms,
) = get_data(test_dp)

# ----------------------------------------------------------------------
# Data Embedding
# optional: Bert or Glove. Default Glove
# ----------------------------------------------------------------------
# The index mappings are built from the training vocabulary only;
# test_vocab is not used in this section.
word2idx, idx2word = get_word2idx_idx2word(train_vocab)
glove_embeddings = get_embedding_matrix(
    word2idx,
    idx2word,
    normalization=False,
)

# Both splits are embedded with the same word2idx / GloVe pair.
train_embedded_text, train_labels = embed_sentences(
    train_data,
    word2idx,
    glove_embeddings,
)
test_embedded_text, test_labels = embed_sentences(
    test_data,
    word2idx,
    glove_embeddings,
)

# ----------------------------------------------------------------------
# Produce Dataset & DataLoader
# ----------------------------------------------------------------------
# Argument order for TextDataset: embedded text, sample-level features,
# sample/sentence-level features, labels.
train_dataset = TextDataset(
    train_embedded_text,
    train_sample_ms,
    train_sam_sen_ms,
    train_labels,
)
test_dataset = TextDataset(
    test_embedded_text,
    test_sample_ms,
    test_sam_sen_ms,
    test_labels,
)