        test_df.to_pickle(test_pre_path)
    else:
        train_df = pd.read_pickle(train_pre_path)
        val_df = pd.read_pickle(val_pre_path)
        test_df = pd.read_pickle(test_pre_path)
        ## infer the data (text) column: the one column that is not a target
        data_column = list(set(train_df.columns) - set(args["targets"]))[0]

    if run_tokenization:
        ## do tokenization
        print("Tokenize")
        tokenizer = Tokenizer(tokenizeStr=tokenizer_model[0],
                              fasttextFile=args["fasttext_file"],
                              doLower=args["doLower"])
        train_df[data_column] = tokenizer.fit_transform(train_df[data_column])
        val_df[data_column] = tokenizer.transform(val_df[data_column])
        test_df[data_column] = tokenizer.transform(test_df[data_column])

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        train_df.to_pickle(train_tok_path)
        val_df.to_pickle(val_tok_path)
        test_df.to_pickle(test_tok_path)

    else:
        train_df = pd.read_pickle(train_tok_path)
        val_df = pd.read_pickle(val_tok_path)
        test_df = pd.read_pickle(test_tok_path)

    ## for testing purposes
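The example does not show how run_tokenization is set. A minimal sketch of one way to derive such a flag from the cached tokenized pickles follows; the helper name and the existence checks are assumptions for illustration, not part of the original code.

import os

## hypothetical helper: only re-run tokenization when the cached
## tokenized pickles (paths as in the example above) are missing
def needs_tokenization(data_path, train_tok_path, val_tok_path, test_tok_path):
    temp_dir = os.path.join(data_path, "temp")
    cached = [train_tok_path, val_tok_path, test_tok_path]
    return not (os.path.isdir(temp_dir)
                and all(os.path.isfile(p) for p in cached))

## usage (assumed): run_tokenization = needs_tokenization(
##     args["data_path"], train_tok_path, val_tok_path, test_tok_path)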
Example #2
        np.save(val_pre_path.format("data"), val_data, allow_pickle=True)
        np.save(test_pre_path.format("data"), test_data, allow_pickle=True)
        np.save(train_pre_path.format("target"),
                train_target,
                allow_pickle=True)
        np.save(val_pre_path.format("target"), val_target, allow_pickle=True)
        np.save(test_pre_path.format("target"), test_target, allow_pickle=True)

    if run_tokenization:
        ## do tokenization
        print("Tokenize")
        tokenizer = Tokenizer(args=tokenizer_model,
                              fasttextFile=args["fasttext_file"],
                              doLower=args["doLower"])
        train_data = tokenizer.fit_transform(train_data)
        val_data = tokenizer.transform(val_data)
        test_data = tokenizer.transform(test_data)

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        if sparse.issparse(train_data):
            sparse.save_npz(train_tok_path.format("data"), train_data)
        else:
            np.save(train_tok_path.format("data"), train_data)
        np.save(train_tok_path.format("target"), train_target)
        if sparse.issparse(val_data):
            sparse.save_npz(val_tok_path.format("data"), val_data)
        else:
            np.save(val_tok_path.format("data"), val_data)
        np.save(val_tok_path.format("target"), val_target)
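Because the tokenized features are written either with sparse.save_npz (which appends .npz) or with np.save (which appends .npy), a loader has to mirror that branching. A minimal sketch, assuming the same "{}"-style path templates as above and that the templates carry no file extension:

import os
import numpy as np
from scipy import sparse

## hypothetical loader mirroring the save logic above:
## sparse features -> .npz via sparse.load_npz,
## dense features and targets -> .npy via np.load
def load_split(tok_path_template):
    data_path = tok_path_template.format("data")
    target_path = tok_path_template.format("target")
    if os.path.isfile(data_path + ".npz"):
        data = sparse.load_npz(data_path + ".npz")
    else:
        data = np.load(data_path + ".npy", allow_pickle=True)
    target = np.load(target_path + ".npy", allow_pickle=True)
    return data, target

## usage (assumed): train_data, train_target = load_split(train_tok_path)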