Example #1
        pickle.dump(pad_token_dict, open(fine_label_path + "/pad_token_dict.pkl", "wb"))
        print("Pad token dict Dumped")

        # Keep only documents whose fine label is one of parent p's children and
        # relabel them all with the single coarse label p.
        temp_df = df[df.label.isin(children)].reset_index(drop=True)
        temp_coarse_lbls = [p] * len(temp_df.text.values)
        temp_coarse_label_to_index = {p: 0}

        coarse_input_ids, coarse_attention_masks, _ = gpt2_tokenize(coarse_tokenizer, temp_df.text.values,
                                                                    temp_coarse_lbls, pad_token_dict,
                                                                    temp_coarse_label_to_index)
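        # Tokenize the same documents with the fine-grained tokenizer as well;
        # both the coarse and fine encodings feed one TensorDataset below.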
        fine_input_ids, fine_attention_masks = gpt2_fine_tokenize(fine_tokenizer, temp_df, index_to_label,
                                                                  pad_token_dict)
        dataset = TensorDataset(coarse_input_ids, coarse_attention_masks, fine_input_ids, fine_attention_masks)

        train_dataloader, validation_dataloader = create_data_loaders(dataset, batch_size=1)

        label_to_exclusive_dataloader = {}
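        # Build one DataLoader per child label over its exclusive documents,
        # loaded from the current iteration's pickle dump.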
        for ch in children:
            child_df = pickle.load(open(pkl_dump_dir + "exclusive_" + str(iteration) + "it/" + ch + ".pkl", "rb"))
            # for i in range(1, iteration + 1):
            #     temp_child_df = pickle.load(open(pkl_dump_dir + "exclusive_" + str(i) + "it/" + ch + ".pkl", "rb"))
            #     if i == 1:
            #         child_df = temp_child_df
            #     else:
            #         child_df = pd.concat([child_df, temp_child_df])
            temp_child_lbls = [ch] * len(child_df.text.values)
            child_exc_input_ids, child_exc_attention_masks = basic_gpt2_tokenize(fine_tokenizer, child_df.text.values,
                                                                                 temp_child_lbls, pad_token_dict)
            child_exc_dataset = TensorDataset(child_exc_input_ids, child_exc_attention_masks)
            dataloader = DataLoader(
                # The rest of this call is an assumed completion of the truncated
                # example: wrap each child's exclusive dataset in its own DataLoader.
                child_exc_dataset,
                sampler=RandomSampler(child_exc_dataset),
                batch_size=1)
            label_to_exclusive_dataloader[ch] = dataloader
Example #2
        # Load each child's exclusive pseudo-labelled documents for this iteration
        # and pool them into a weak-supervision dataframe.
        for ch in parent_to_child[p]:
            temp_df = pickle.load(
                open(
                    pkl_dump_dir + "exclusive/" + algo + "/" + str(iteration) +
                    "it/" + ch + ".pkl", "rb"))
            temp_df["label"] = [ch] * len(temp_df)
            if df_weaksup is None:
                df_weaksup = temp_df
            else:
                df_weaksup = pd.concat([df_weaksup, temp_df])

    df = pd.concat([df, df_weaksup])
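    # Tokenize the combined corpus (original + weakly supervised documents).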
    coarse_input_ids, coarse_attention_masks = basic_gpt2_tokenize(
        tokenizer, df.text.values, df.label.values, pad_token_dict)
    # Combine the training inputs into a TensorDataset.
    dataset = TensorDataset(coarse_input_ids, coarse_attention_masks)

    # Create a 90-10 train-validation split.
    coarse_train_dataloader, coarse_validation_dataloader = create_data_loaders(
        dataset, batch_size=4)

    model = train(model, tokenizer, coarse_train_dataloader,
                  coarse_validation_dataloader, doc_start_ind, all_labels,
                  device, pad_token_dict)
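    # Qualitative check of the fine-tuned model via test-time generation per label.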
    test_generate(model, tokenizer, all_labels, pad_token_dict, device)

    tokenizer.save_pretrained(tok_path)
    torch.save(model, model_path + model_name)
    pickle.dump(pad_token_dict, open(pkl_dump_dir + "pad_token_dict.pkl",
                                     "wb"))