glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# set elmos_mohx=None to exclude elmo vectors. Also need to change the
# embedding_dim in the later model initialization.
elmos_mohx = h5py.File('../elmo/MOH-X_cleaned.hdf5', 'r')

'''
2.2 embed the datasets
'''
random.seed(0)
random.shuffle(raw_mohx)
# the second argument is the pos sequence, which we don't need here
embedded_mohx = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                         glove_embeddings, elmos_mohx, None),
                  example[2], example[1]]
                 for example in raw_mohx]

'''
2.3 10-fold cross validation
'''
# separate the embedded sentences, pos sequences and labels into three lists,
# in order to pass them into TextDataset as arguments
sentences = [example[0] for example in embedded_mohx]
poss = [example[1] for example in embedded_mohx]
labels = [example[2] for example in embedded_mohx]
# ten_folds is a list of 10 tuples, each tuple is
# (list_of_embedded_sentences, list_of_pos_sequences, list_of_labels)
ten_folds = []
fold_size = int(647 / 10)  # MOH-X has 647 examples, so fold_size == 64
for i in range(10):
    ten_folds.append((sentences[i * fold_size:(i + 1) * fold_size],
                      poss[i * fold_size:(i + 1) * fold_size],
                      labels[i * fold_size:(i + 1) * fold_size]))
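# Added sketch (not part of the original script): one way to consume ten_folds
# for cross-validation. It assumes TextDataset takes (sentences, pos_seqs,
# labels), matching the VUA DataLoader setup below; everything else is
# standard Python.
for fold_i in range(10):
    val_sents, val_poss, val_labels = ten_folds[fold_i]
    train_sents, train_poss, train_labels = [], [], []
    for j in range(10):
        if j == fold_i:
            continue
        s, p, l = ten_folds[j]
        train_sents.extend(s)
        train_poss.extend(p)
        train_labels.extend(l)
    train_dataset_mohx = TextDataset(train_sents, train_poss, train_labels)
    val_dataset_mohx = TextDataset(val_sents, val_poss, val_labels)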
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# no suffix embeddings for sequence labeling
suffix_embeddings = None

'''
2.2 embed the datasets
'''
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_train_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                              glove_embeddings, elmos_train_vua,
                                              suffix_embeddings),
                       example[2], example[1]]
                      for example in raw_train_vua]
embedded_val_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                            glove_embeddings, elmos_val_vua,
                                            suffix_embeddings),
                     example[2], example[1]]
                    for example in raw_val_vua]

'''
2.3 set up Dataloader for batching
'''
# Separate the inputs (embedded_sequence), pos sequences and labels in the
# indexed train set.
# embedded_train_vua: embedded_sentence, pos, labels
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua],
                                [example[2] for example in embedded_train_vua])
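# Added sketch (not in the original snippet): the matching validation dataset
# and the DataLoaders themselves. TextDataset.collate_fn and the batch size of
# 64 are assumptions, patterned on the RNN_Testset DataLoader in the labeling
# script below.
val_dataset_vua = TextDataset([example[0] for example in embedded_val_vua],
                              [example[1] for example in embedded_val_vua],
                              [example[2] for example in embedded_val_vua])
train_dataloader_vua = DataLoader(dataset=train_dataset_vua, batch_size=64,
                                  shuffle=True, collate_fn=TextDataset.collate_fn)
val_dataloader_vua = DataLoader(dataset=val_dataset_vua, batch_size=64,
                                shuffle=False, collate_fn=TextDataset.collate_fn)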
# elmo_embeddings
# set elmos_mohx=None to exclude elmo vectors. Also need to change the
# embedding_dim in the later model initialization.
elmos_mohx = h5py.File('../elmo/MOH-X_cleaned.hdf5', 'r')
bert_mohx = None
suffix_embeddings = None
# suffix_embeddings = nn.Embedding(15, 50)

'''
2.2 embed the datasets
'''
# the second argument is the pos sequence, which we don't need here
embedded_mohx = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                         glove_embeddings, elmos_mohx,
                                         bert_mohx, suffix_embeddings),
                  example[2], example[1]]
                 for example in raw_mohx]

# 100 runs of 10-fold cross validation
# for valid in range(100):
'''
2.3 10-fold cross validation
'''
# separate the embedded sentences, pos sequences and labels into three lists,
# in order to pass them into TextDataset as arguments
sentences = [example[0] for example in embedded_mohx]
poss = [example[1] for example in embedded_mohx]
labels = [example[2] for example in embedded_mohx]
# ten_folds is a list of 10 tuples, each tuple is
# (list_of_embedded_sentences, list_of_pos_sequences, list_of_labels)
ten_folds = []
fold_size = int(647 / 10)  # MOH-X has 647 examples, so fold_size == 64
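# Added note (not in the original): to enable the optional suffix embeddings,
# replace `suffix_embeddings = None` above with the commented line before the
# embedding step, and widen the model's embedding_dim by 50
# (e.g. 300 + 1024 + 50). The 15 suffix classes come from that commented line.
# suffix_embeddings = nn.Embedding(15, 50)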
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
if using_GPU:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", 0)
else:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", -1)

############
# labeling #
############
embedded_test_rcc = []
logging.info("embed test data with glove and elmo vectors")
for example in tqdm(raw_test_rcc, total=len(raw_test_rcc)):
    embedded_test_rcc.append([example[1],
                              embed_indexed_sequence(example[0], word2idx,
                                                     glove_embeddings, elmo)])
# pickle.dump(embedded_test_rcc, open('./labeler_embedd_temp.data', "wb+"), protocol=-1)
# with open('./labeler_embedd_temp.data', "rb+") as infile:
#     embedded_test_rcc = pickle.load(infile)

logging.info("Set up Dataloader")
test_dataset_rcc = RNN_Testset([example[0] for example in embedded_test_rcc],   # pub_id
                               [example[1] for example in embedded_test_rcc])   # embedded sentence
# Set up a DataLoader for the test dataset
test_dataloader_rcc = DataLoader(dataset=test_dataset_rcc,
                                 batch_size=args.batch_size,
                                 collate_fn=RNN_Testset.collate_fn)
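# Added sketch (an assumption, not RNN_Testset's actual code): a collate_fn of
# this shape typically pads the variable-length embedded sentences in a batch
# and keeps the true lengths so the RNN can run on packed sequences.
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn_sketch(batch):
    pub_ids = [pub_id for pub_id, _ in batch]
    seqs = [torch.as_tensor(seq) for _, seq in batch]
    lengths = torch.tensor([s.shape[0] for s in seqs])
    padded = pad_sequence(seqs, batch_first=True)  # (batch, max_len, embed_dim)
    return pub_ids, padded, lengths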
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
# elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# no suffix embeddings for sequence labeling
suffix_embeddings = None

'''
2.2 embed the datasets
'''
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_train_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                              glove_embeddings, None,
                                              suffix_embeddings),
                       example[2], example[1]]
                      for example in raw_train_vua]
embedded_val_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                            glove_embeddings, None,
                                            suffix_embeddings),
                     example[2], example[1]]
                    for example in raw_val_vua]

'''
2.3 set up Dataloader for batching
'''
# Separate the inputs (embedded_sequence), pos sequences and labels in the
# indexed train set.
# embedded_train_vua: embedded_sentence, pos, labels
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua],
                                [example[2] for example in embedded_train_vua])
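# Added sketch (not in the original): with the ELMo files commented out and
# None passed above, each token is represented by GloVe alone, so the model's
# embedding_dim must shrink from 300 + 1024 to 300. RNNSequenceModel follows
# the training snippet further down; any remaining constructor arguments of
# the real model are elided here.
RNNseq_model = RNNSequenceModel(num_classes=2, embedding_dim=300, hidden_size=300)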
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# pos_embeddings: the pos embedding dimension is 50
# pos_embeddings = nn.Embedding(len(pos2idx), 50)
pos_embeddings = None

'''
2.2 embed the datasets
'''
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_train_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                              glove_embeddings, elmos_train_vua,
                                              pos_embeddings),
                       example[2], example[1]]
                      for example in raw_train_vua]
embedded_val_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                            glove_embeddings, elmos_val_vua,
                                            pos_embeddings),
                     example[2], example[1]]
                    for example in raw_val_vua]

'''
2.3 set up Dataloader for batching
'''
# Separate the inputs (embedded_sequence), pos sequences and labels in the
# indexed train set.
# embedded_train_vua: embedded_sentence, pos, labels
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua],
                                [example[2] for example in embedded_train_vua])
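# Added note (not in the original): to enable the optional POS embeddings,
# replace `pos_embeddings = None` above with the commented nn.Embedding line
# before the embedding step. Each token's POS tag then contributes a learned
# 50-dim vector, so the model's embedding_dim must grow by 50
# (e.g. 300 + 1024 + 50). pos2idx is the POS-tag vocabulary assumed there.
# pos_embeddings = nn.Embedding(len(pos2idx), 50)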
suffix_embeddings = None
# suffix_embeddings = nn.Embedding(15, 50)

'''
2.2 embed the datasets
'''
random.seed(0)
random.shuffle(raw_trofi)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True
embedded_trofi = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                          glove_embeddings, elmos_trofi,
                                          bert_trofi, suffix_embeddings),
                   example[2], example[1]]
                  for example in raw_trofi]

'''
2.3 10-fold cross validation
'''
# separate the embedded sentences, pos sequences and labels into three lists,
# in order to pass them into TextDataset as arguments
sentences = [example[0] for example in embedded_trofi]
poss = [example[1] for example in embedded_trofi]
labels = [example[2] for example in embedded_trofi]
# ten_folds is a list of 10 tuples, each tuple is
# (list_of_embedded_sentences, list_of_pos_sequences, list_of_labels)
ten_folds = []
fold_size = int(3737 / 10)  # TroFi has 3737 examples, so fold_size == 373
for i in range(10):
    ten_folds.append((sentences[i * fold_size:(i + 1) * fold_size],
                      poss[i * fold_size:(i + 1) * fold_size],
                      labels[i * fold_size:(i + 1) * fold_size]))
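# Added note (not in the original): PyTorch exposes two further switches that
# the seeding block above does not set; both are optional and only matter for
# stricter reproducibility.
torch.cuda.manual_seed_all(0)           # seed every visible GPU, not just the current one
torch.backends.cudnn.benchmark = False  # pin cuDNN to the same algorithms on every run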
# no suffix embeddings for sequence labeling
suffix_embeddings = None

'''
2.2 embed the datasets
'''
# random.seed(0)
# random.shuffle(raw_train_vua)
sentence_to_index_train = ast.literal_eval(elmos_train_vua['sentence_to_index'][0])
labels = [example[1] for example in raw_train_vua]
poss = [example[2] for example in raw_train_vua]
sentences = [embed_indexed_sequence(example[0], example[2], word2idx,
                                    glove_embeddings, elmos_train_vua,
                                    suffix_embeddings,
                                    sentence_to_index_train[example[0]])
             for example in raw_train_vua]


def train_model(train_dataloader_vua, val_dataloader_vua, fold_num):
    optimal_f1s = []
    optimal_ps = []
    optimal_rs = []
    optimal_accs = []
    predictions_all = []
    RNNseq_model = RNNSequenceModel(num_classes=2,
                                    embedding_dim=300 + 1024,
                                    hidden_size=300)
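# Added sketch (not in the original): how the sentence_to_index map is used.
# The ELMo hdf5 file stores one activation tensor per sentence, keyed by a
# string index, plus a 'sentence_to_index' entry mapping each raw sentence to
# its key. The layer/shape layout (3, seq_len, 1024) is the usual allennlp
# output and an assumption here.
sent = raw_train_vua[0][0]
elmo_key = sentence_to_index_train[sent]
elmo_layers = elmos_train_vua[elmo_key][()]  # numpy array of this sentence's ELMo layers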
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# set elmos_trofi=None to exclude elmo vectors. Also need to change the
# embedding_dim in the later model initialization.
# elmos_trofi = h5py.File('../elmo/TroFi3737.hdf5', 'r')

'''
2.2 embed the datasets
'''
random.seed(0)
random.shuffle(raw_trofi)
# the second argument is the pos sequence, which we don't need here
embedded_trofi = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                          glove_embeddings, None),
                   example[2], example[1]]
                  for example in raw_trofi]

'''
2.3 10-fold cross validation
'''
# separate the embedded sentences, pos sequences and labels into three lists,
# in order to pass them into TextDataset as arguments
sentences = [example[0] for example in embedded_trofi]
poss = [example[1] for example in embedded_trofi]
labels = [example[2] for example in embedded_trofi]
# ten_folds is a list of 10 tuples, each tuple is
# (list_of_embedded_sentences, list_of_pos_sequences, list_of_labels)
ten_folds = []
fold_size = int(3737 / 10)  # TroFi has 3737 examples, so fold_size == 373
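# Added note (not in the original): int(3737 / 10) == 373, so the last 7
# shuffled examples never enter any fold. A hedged, equivalent alternative that
# derives the size from the data instead of a hard-coded corpus count:
fold_size = len(embedded_trofi) // 10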