Example #1
import logging

import pandas as pd
from simpletransformers.classification import ClassificationArgs, ClassificationModel

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data
train_df = load_rte_data_file("data/train.jsonl")
eval_df = pd.read_json("data/eval_df", lines=True, orient="records")
test_df = pd.read_json("data/test_df", lines=True, orient="records")

model_args = ClassificationArgs()
model_args.eval_batch_size = 8
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = 1000
model_args.learning_rate = 4e-4
model_args.manual_seed = 4
model_args.max_seq_length = 256
model_args.multiprocessing_chunksize = 5000
model_args.no_cache = True
model_args.no_save = True
model_args.num_train_epochs = 10
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.gradient_accumulation_steps = 2
model_args.train_custom_parameters_only = False
model_args.labels_list = ["not_entailment", "entailment"]
model_args.wandb_project = "RTE - Hyperparameter Optimization"
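The args above are only a configuration object; a minimal sketch of how they would typically be consumed follows (the "roberta" model type and "roberta-base" checkpoint are assumptions, not part of the original snippet):

model = ClassificationModel("roberta",       # assumed model type
                            "roberta-base",  # assumed base checkpoint
                            args=model_args)
model.train_model(train_df, eval_df=eval_df)  # eval_df is needed because evaluate_during_training is enabled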

Example #2
import numpy as np
from simpletransformers.classification import ClassificationArgs, ClassificationModel

# df (the full dataset), train_df and wandb_config are assumed to be defined earlier
# in the original script, e.g. populated by a wandb sweep run.
train_df.columns = ["text", "labels"]

eval_df = df.iloc[wandb_config["samples"]:, :]

eval_df.columns = ["text", "labels"]

model_args = ClassificationArgs()
model_args.num_train_epochs = wandb_config["epochs"]
model_args.eval_batch_size = wandb_config["eval_batch_size"]
model_args.train_batch_size = wandb_config["train_batch_size"]
model_args.wandb_project = "transformer-aes"
model_args.wandb_kwargs = {
    "name": "{}-{}".format(wandb_config["model"], wandb_config["samples"])
}
model_args.learning_rate = wandb_config["lr"]
model_args.model = wandb_config["model"]
model_args.samples = wandb_config["samples"]
# model_args.max_seq_length = wandb_config["max_seq_length"]
model_args.regression = True
model_args.no_save = True
model_args.overwrite_output_dir = True
model_args.logging_steps = 1
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.evaluate_during_training_steps = int(np.ceil(
    (wandb_config["samples"] / wandb_config["train_batch_size"]) * 10))
model_args.use_cached_eval_features = True

model = ClassificationModel(wandb_config["model"],
                            wandb_config["save"],
                            num_labels=1,  # regression uses a single output label; this and args= are assumed, the original snippet is truncated here
                            args=model_args)
Example #3
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data
train_df = load_rte_data_file("data/train.jsonl")
eval_df = pd.read_json("data/eval_df", lines=True, orient="records")
test_df = pd.read_json("data/test_df", lines=True, orient="records")

model_args = ClassificationArgs()
model_args.eval_batch_size = 32
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = -1
model_args.learning_rate = 0.00003173
model_args.manual_seed = 4
model_args.max_seq_length = 256
model_args.multiprocessing_chunksize = 5000
model_args.no_cache = True
model_args.no_save = True
model_args.num_train_epochs = 40
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.gradient_accumulation_steps = 2
model_args.train_custom_parameters_only = False
model_args.labels_list = ["not_entailment", "entailment"]
model_args.output_dir = "vanilla_output"
model_args.best_model_dir = "vanilla_output/best_model"
model_args.wandb_project = "RTE - Hyperparameter Optimization"
Example #4
import json

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationArgs, ClassificationModel

# remove_stopwords and convert_emojis are assumed to be project-level text
# preprocessing helpers.


def bert_training(model_type, model_base, train_data, early_stop,
                  early_stop_delta, overwrite, epoch, batch_size,
                  learning_rate, output):

    # Bringing in the training data
    with open(train_data, 'r') as json_file:
        json_list = list(json_file)

    # Parse one JSON object per line into a list of dicts
    train = []
    for json_str in json_list:
        train.append(json.loads(json_str))

    # Data cleaning
    train_labels = [train[i]['label'] for i in range(len(train))]

    train_response = [
        remove_stopwords(convert_emojis(train[i]['response']))
        for i in range(len(train))
    ]

    # Split data into training and test sets
    labels_train, labels_test, response_train, response_test = train_test_split(
        train_labels, train_response, test_size=0.2, random_state=42)

    # Convert SARCASM/NO SARCASM labels into 1s and 0s
    labels_train_pd = (pd.DataFrame(labels_train) == 'SARCASM').astype(int)
    labels_test_pd = (pd.DataFrame(labels_test) == 'SARCASM').astype(int)
    response_train_pd = pd.DataFrame(response_train)
    response_test_pd = pd.DataFrame(response_test)

    train_bert = pd.DataFrame({
        'text':
        response_train_pd[0].replace(r'\n', ' ', regex=True),
        'label':
        labels_train_pd[0]
    })

    eval_bert = pd.DataFrame({
        'text':
        response_test_pd[0].replace(r'\n', ' ', regex=True),
        'label':
        labels_test_pd[0]
    })

    model_args = ClassificationArgs()
    model_args.use_early_stopping = early_stop
    model_args.early_stopping_delta = early_stop_delta
    model_args.overwrite_output_dir = overwrite
    model_args.num_train_epochs = epoch
    model_args.train_batch_size = batch_size
    model_args.learning_rate = learning_rate
    model_args.output_dir = output

    # Create a TransformerModel
    model = ClassificationModel(model_type,
                                model_base,
                                use_cuda=False,
                                args=model_args)

    # Train the model
    model.train_model(train_bert)

    # Evaluate the model
    model.eval_model(eval_bert)
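A minimal usage sketch for bert_training; every argument value below is an illustrative assumption rather than a value taken from the original code:

bert_training(model_type="bert",
              model_base="bert-base-uncased",
              train_data="data/train.jsonl",
              early_stop=True,
              early_stop_delta=0.01,
              overwrite=True,
              epoch=3,
              batch_size=16,
              learning_rate=4e-5,
              output="outputs/sarcasm_bert")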

Example #5

# Preparing train data
# train_df = load_rte_data_file("data/train.jsonl")
train_df = pd.read_json("data/augmented_train.jsonl",
                        lines=True,
                        orient="records")
eval_df = pd.read_json("data/eval_df", lines=True, orient="records")
test_df = pd.read_json("data/test_df", lines=True, orient="records")

model_args = ClassificationArgs()
model_args.eval_batch_size = 32
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = -1
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False
model_args.learning_rate = 1e-5
model_args.manual_seed = 4
model_args.max_seq_length = 256
model_args.multiprocessing_chunksize = 5000
model_args.no_cache = True
model_args.num_train_epochs = 3
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.gradient_accumulation_steps = 2
model_args.labels_list = ["not_entailment", "entailment"]
model_args.output_dir = "default_output"
model_args.best_model_dir = "default_output/best_model"
model_args.wandb_project = "RTE - Hyperparameter Optimization"
model_args.wandb_kwargs = {"name": "augmented-default"}
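Unlike the earlier no_save runs, this configuration keeps output_dir and best_model_dir, so the checkpoint selected during training can be reloaded afterwards. A hedged sketch, with the model type and base checkpoint assumed and test_df assumed to carry a "text" column like the other splits:

model = ClassificationModel("roberta", "roberta-base", args=model_args)  # type/checkpoint assumed
model.train_model(train_df, eval_df=eval_df)

# Reload the best checkpoint chosen during evaluation and predict on the test split.
best_model = ClassificationModel("roberta", "default_output/best_model", args=model_args)
predictions, raw_outputs = best_model.predict(test_df["text"].tolist())
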
Example #6
import logging
import statistics

from sklearn.model_selection import KFold
from simpletransformers.classification import ClassificationArgs, ClassificationModel

# load_single_lang and eval are assumed to be project-level helpers; eval here is a
# custom metric function returning {'acc': ..., 'avg_f1': ...}, not the Python built-in.


def cross_validation(lang):
    print(lang)
    model_name = 'EMBEDDIA/crosloengual-bert'

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    # Train and evaluation data need to be in a pandas DataFrame containing at least two columns.
    # If the DataFrame has a header, it should contain a 'text' and a 'labels' column.
    # If no header is present, the DataFrame should contain at least two columns,
    # with the first column being the text (type str) and the second column being the label (type int).
    accs = []
    f1s = []
    df = load_single_lang(lang)

    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(df.index):
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]

        # hyperparameters
        model_args = ClassificationArgs()
        model_args.logging_steps = 1000000
        model_args.save_eval_checkpoints = False
        model_args.save_steps = 1000000
        model_args.no_cache = True
        model_args.save_model_every_epoch = False

        model_args.num_train_epochs = 1
        model_args.learning_rate = 2e-4
        model_args.train_batch_size = 32
        model_args.overwrite_output_dir = True
        '''
        model_args.train_custom_parameters_only = True
        model_args.custom_parameter_groups = [
            {
                "params": ["classifier.weight"],
                "lr": 2e-4,
            },
            {
                "params": ["classifier.bias"],
                "lr": 2e-4,
                "weight_decay": 0.0,
            },
        ]
        '''

        # Create a ClassificationModel
        model = ClassificationModel('bert',
                                    model_name,
                                    num_labels=3,
                                    args=model_args)
        print(model.get_named_parameters())

        # Train the model
        print('Training ...')
        model.train_model(df_train)

        # Evaluate the model
        print('Evaluating ...')
        predictions, raw_outputs = model.predict(df_test['text'].values)
        out = eval(df_test['labels'].values, predictions)
        accs.append(out['acc'])
        f1s.append(out['avg_f1'])

        del model

    # write results to file
    with open('results_csebert.txt', 'a+') as f:
        f.write("{} {} {}\n".format(lang, statistics.mean(accs),
                                    statistics.mean(f1s)))
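A hedged usage sketch for the 10-fold routine above; the language codes are illustrative:

for lang in ["sl", "hr", "en"]:  # illustrative language codes
    cross_validation(lang)
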
Example #7
def fit_and_evaluate(train_langs, test_lang):
    print(train_langs, test_lang)
    model_name = 'EMBEDDIA/crosloengual-bert'

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    # Train and evaluation data need to be in a pandas DataFrame containing at least two columns.
    # If the DataFrame has a header, it should contain a 'text' and a 'labels' column.
    # If no header is present, the DataFrame should contain at least two columns,
    # with the first column being the text (type str) and the second column being the label (type int).
    df_train, df_test = load_dataset(train_langs,
                                     test_lang,
                                     train_on_test_lang=True)

    # hyperparameters
    model_args = ClassificationArgs()
    model_args.logging_steps = 1000000
    model_args.save_eval_checkpoints = False
    model_args.save_steps = 1000000
    model_args.no_cache = True
    model_args.save_model_every_epoch = False

    model_args.num_train_epochs = 1
    model_args.learning_rate = 2e-4
    model_args.train_batch_size = 32
    model_args.overwrite_output_dir = True
    '''
    model_args.train_custom_parameters_only = True
    model_args.custom_parameter_groups = [
        {
            "params": ["classifier.weight"],
            "lr": 2e-4,
        },
        {
            "params": ["classifier.bias"],
            "lr": 2e-4,
            "weight_decay": 0.0,
        },
    ]
    '''

    # Create a ClassificationModel
    model = ClassificationModel('bert',
                                model_name,
                                num_labels=3,
                                args=model_args)
    print(model.get_named_parameters())

    # Train the model
    print('Training ...')
    model.train_model(df_train)

    # Evaluate the model
    print('Evaluating ...')
    predictions, raw_outputs = model.predict(df_test['text'].values)
    out = eval(df_test['labels'].values, predictions)

    # write results to file
    with open('results_csebert.txt', 'a+') as f:
        f.write("{} {} {}\n".format(train_langs, test_lang, out))

    del model
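
A corresponding usage sketch for the cross-lingual variant, again with illustrative language codes:

fit_and_evaluate(train_langs=["sl", "hr"], test_lang="en")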