Example #1
        # Map each child (fine) label to an index and back.
        for i, l in enumerate(list(children)):
            label_to_index[l] = i
            index_to_label[i] = l

        doc_start_ind, pad_token_dict = create_pad_token_dict(p, parent_to_child, coarse_tokenizer, fine_tokenizer)
        print(pad_token_dict, doc_start_ind)

        pickle.dump(pad_token_dict, open(fine_label_path + "/pad_token_dict.pkl", "wb"))
        print("Pad token dict Dumped")

        # Keep only documents labelled with this parent's children; they all share the coarse label p.
        temp_df = df[df.label.isin(children)].reset_index(drop=True)
        temp_coarse_lbls = [p] * len(temp_df.text.values)
        temp_coarse_label_to_index = {p: 0}

        # Tokenize every document with both the coarse and the fine GPT-2 tokenizers.
        coarse_input_ids, coarse_attention_masks, _ = gpt2_tokenize(coarse_tokenizer, temp_df.text.values,
                                                                    temp_coarse_lbls, pad_token_dict,
                                                                    temp_coarse_label_to_index)
        fine_input_ids, fine_attention_masks = gpt2_fine_tokenize(fine_tokenizer, temp_df, index_to_label,
                                                                  pad_token_dict)
        # Combine the coarse and fine encodings into a single TensorDataset.
        dataset = TensorDataset(coarse_input_ids, coarse_attention_masks, fine_input_ids, fine_attention_masks)

        train_dataloader, validation_dataloader = create_data_loaders(dataset, batch_size=1)

        # For each child label, load its "exclusive" documents for this iteration (used to build per-label dataloaders).
        label_to_exclusive_dataloader = {}
        for ch in children:
            child_df = pickle.load(open(pkl_dump_dir + "exclusive_" + str(iteration) + "it/" + ch + ".pkl", "rb"))
            # for i in range(1, iteration + 1):
            #     temp_child_df = pickle.load(open(pkl_dump_dir + "exclusive_" + str(i) + "it/" + ch + ".pkl", "rb"))
            #     if i == 1:
            #         child_df = temp_child_df
            #     else:
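
Both examples delegate batching to a create_data_loaders helper that is not shown in these excerpts. Below is a minimal sketch of such a helper, assuming it performs the 90/10 random train/validation split mentioned in the comment of Example #2 and wraps both splits in PyTorch DataLoaders; the split ratio, sampler choices, and function body are assumptions, not the repository's actual implementation.

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, random_split

def create_data_loaders(dataset, batch_size=4):
    # Assumed 90/10 split into training and validation subsets.
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Training batches are sampled in random order; validation batches sequentially.
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset),
                                       batch_size=batch_size)
    return train_dataloader, validation_dataloader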
Example #2
        # Map each child (fine) label to an index and back.
        for i, l in enumerate(list(children)):
            label_to_index[l] = i
            index_to_label[i] = l
            # Load the documents exclusively assigned to child label l in this iteration
            # (the source directory depends on fine_dir_name) and tag them with that label.
            if fine_dir_name == "fine":
                temp_df = pickle.load(open(pkl_dump_dir + "exclusive_" + str(iteration) + "it/" + l + ".pkl", "rb"))
            else:
                temp_df = pickle.load(
                    open(pkl_dump_dir + "exclusive_ceonly_" + str(iteration) + "it/" + l + ".pkl", "rb"))
            temp_df["label"] = [l] * len(temp_df)
            if i == 0:
                df = temp_df
            else:
                df = pd.concat([df, temp_df])

        doc_start_ind, pad_token_dict = create_pad_token_dict(p, parent_to_child, coarse_tokenizer, fine_tokenizer)
        print(pad_token_dict, doc_start_ind)

        input_ids, attention_masks, labels = gpt2_tokenize(fine_tokenizer, df.text.values, df.label.values,
                                                           pad_token_dict, label_to_index)

        # Combine the training inputs into a TensorDataset.
        dataset = TensorDataset(input_ids, attention_masks, labels)

        # Create a 90-10 train-validation split.
        train_dataloader, validation_dataloader = create_data_loaders(dataset, batch_size=4)

        # Every child label shares the same document-start index.
        doc_start_ind_dict = {}
        for ch in children:
            doc_start_ind_dict[ch] = doc_start_ind

        model = train(fine_model, fine_tokenizer, train_dataloader, validation_dataloader, index_to_label,
                      pad_token_dict, doc_start_ind_dict, device)
        test_generate(model, fine_tokenizer, set(children), pad_token_dict, device)
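
The excerpts assume that coarse_tokenizer, fine_tokenizer, fine_model, and device already exist. The following is a rough setup sketch using Hugging Face Transformers; the checkpoint names and the special pad/start tokens are assumptions and the repository may configure them differently (for instance, create_pad_token_dict suggests label-specific pad tokens are added later).

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One GPT-2 tokenizer per level of the hierarchy; the extra special tokens are assumed choices.
coarse_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", bos_token="<|startoftext|>",
                                                 eos_token="<|endoftext|>", pad_token="<|pad|>")
fine_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", bos_token="<|startoftext|>",
                                               eos_token="<|endoftext|>", pad_token="<|pad|>")

# Language model to be fine-tuned on the fine-grained labels; its embedding matrix
# is resized because tokens were added to the tokenizer.
fine_model = GPT2LMHeadModel.from_pretrained("gpt2")
fine_model.resize_token_embeddings(len(fine_tokenizer))
fine_model.to(device)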