# Map each child (fine) label to an index and back.
for i, l in enumerate(list(children)):
    label_to_index[l] = i
    index_to_label[i] = l

# Build the pad-token dictionary shared by the coarse and fine tokenizers and persist it.
doc_start_ind, pad_token_dict = create_pad_token_dict(p, parent_to_child, coarse_tokenizer, fine_tokenizer)
print(pad_token_dict, doc_start_ind)
pickle.dump(pad_token_dict, open(fine_label_path + "/pad_token_dict.pkl", "wb"))
print("Pad token dict Dumped")

# Restrict to documents labelled with this parent's children; the coarse model
# sees every such document under the single parent label p.
temp_df = df[df.label.isin(children)].reset_index(drop=True)
temp_coarse_lbls = [p] * len(temp_df.text.values)
temp_coarse_label_to_index = {p: 0}

# Tokenize the same documents twice: once for the coarse model (all under parent label p)
# and once for the fine model.
coarse_input_ids, coarse_attention_masks, _ = gpt2_tokenize(coarse_tokenizer, temp_df.text.values, temp_coarse_lbls,
                                                            pad_token_dict, temp_coarse_label_to_index)
fine_input_ids, fine_attention_masks = gpt2_fine_tokenize(fine_tokenizer, temp_df, index_to_label, pad_token_dict)

# Pair the coarse and fine encodings sample-by-sample.
dataset = TensorDataset(coarse_input_ids, coarse_attention_masks, fine_input_ids, fine_attention_masks)
train_dataloader, validation_dataloader = create_data_loaders(dataset, batch_size=1)

# Load the class-exclusive documents for each child label.
label_to_exclusive_dataloader = {}
for ch in children:
    child_df = pickle.load(open(pkl_dump_dir + "exclusive_" + str(iteration) + "it/" + ch + ".pkl", "rb"))
    # for i in range(1, iteration + 1):
    #     temp_child_df = pickle.load(open(pkl_dump_dir + "exclusive_" + str(i) + "it/" + ch + ".pkl", "rb"))
    #     if i == 1:
    #         child_df = temp_child_df
    #     else:
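# NOTE: minimal illustrative sketch, not the repository's train() routine.
# It only shows how a batch from the paired coarse/fine dataloader above can be
# unpacked; the tensor order follows the TensorDataset construction above, and the
# helper name _unpack_coarse_fine_batch is hypothetical.
def _unpack_coarse_fine_batch(batch, device):
    # Order matches TensorDataset(coarse_input_ids, coarse_attention_masks,
    # fine_input_ids, fine_attention_masks).
    coarse_ids = batch[0].to(device)
    coarse_mask = batch[1].to(device)
    fine_ids = batch[2].to(device)
    fine_mask = batch[3].to(device)
    return coarse_ids, coarse_mask, fine_ids, fine_mask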
# Assemble the training frame from the class-exclusive documents of each child label.
for i, l in enumerate(list(children)):
    label_to_index[l] = i
    index_to_label[i] = l
    if fine_dir_name == "fine":
        temp_df = pickle.load(open(pkl_dump_dir + "exclusive_" + str(iteration) + "it/" + l + ".pkl", "rb"))
    else:
        temp_df = pickle.load(
            open(pkl_dump_dir + "exclusive_ceonly_" + str(iteration) + "it/" + l + ".pkl", "rb"))
    temp_df["label"] = [l] * len(temp_df)
    if i == 0:
        df = temp_df
    else:
        df = pd.concat([df, temp_df])

doc_start_ind, pad_token_dict = create_pad_token_dict(p, parent_to_child, coarse_tokenizer, fine_tokenizer)
print(pad_token_dict, doc_start_ind)

input_ids, attention_masks, labels = gpt2_tokenize(fine_tokenizer, df.text.values, df.label.values, pad_token_dict,
                                                   label_to_index)

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.
train_dataloader, validation_dataloader = create_data_loaders(dataset, batch_size=4)

# Every child label shares the same document-start index.
doc_start_ind_dict = {}
for ch in children:
    doc_start_ind_dict[ch] = doc_start_ind

model = train(fine_model, fine_tokenizer, train_dataloader, validation_dataloader, index_to_label, pad_token_dict,
              doc_start_ind_dict, device)
test_generate(model, fine_tokenizer, set(children), pad_token_dict, device)
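# NOTE: create_data_loaders is defined elsewhere in the repository. The sketch
# below only illustrates what it is assumed to do, given the 90-10 split noted
# above: a random split wrapped in PyTorch DataLoaders. The name
# _create_data_loaders_sketch and its body are illustrative, not the actual
# implementation.
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, random_split

def _create_data_loaders_sketch(dataset, batch_size):
    # 90% train / 10% validation split.
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    # Shuffle training batches; read validation batches sequentially.
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
    return train_dataloader, validation_dataloader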