Example #1
def main(_params):
    # keep the run parameters in a module-level global so the helper
    # functions (prepare_data, run_train, run_test) can read them
    global params
    params = _params

    train_df, test_df, dev_df, labels, num_labels, label_map, data_dir = prepare_data(
    )

    data_args, model_args, config, tokenizer = prepare_config_and_tokenizer(
        data_dir, labels, num_labels, label_map)

    # ## Create Dataset Objects

    train_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=data_args['overwrite_cache'],  # True
        mode=Split.train,
        data_size=params["data_size"])

    eval_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=data_args['overwrite_cache'],
                              mode=Split.dev,
                              data_size=params["data_size"])

    print(len(train_dataset), len(eval_dataset))

    # Train top-model using the Trainer API
    trainer, model = run_train(train_dataset, eval_dataset, config, model_args,
                               labels, num_labels, label_map)

    gc.collect()
    torch.cuda.empty_cache()

    # ## Prepare test data, run trainer over test data and print metrics

    # pass overwrite_cache=True so that fresh predictions can be made simply by replacing test.txt
    test_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=True,
                              mode=Split.test,
                              data_size=params["data_size"])

    run_test(trainer, model, train_dataset, train_df, label_map)
    run_test(trainer, model, eval_dataset, dev_df, label_map)
    run_test(trainer, model, test_dataset, test_df, label_map)
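
A minimal usage sketch for the function above, assuming it lives in the same module as prepare_data, run_train and run_test. The snippet itself only reads params["data_size"]; any other keys would be consumed by those helpers, so the dictionary below is purely illustrative.

# Hypothetical driver for Example #1. Only "data_size" is read directly by the
# snippet; its value here is an assumption.
if __name__ == "__main__":
    example_params = {
        "data_size": 1000,  # number of examples per split to load (assumed meaning)
    }
    main(example_params)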
Example #2
def main(_params):
    # expose the run parameters as a module-level global for the helper functions
    global params
    params = _params
    '''
    params['seed_value'] = args.seed_value
    params['set_seed'] = args.set_seed
    '''
    wb_run = wandb.init(project="NER", name=params['exp_name'] + "_init")
    if params['set_seed']:
        random_seed_set(params['seed_value'])

    train_df, test_df, dev_df, labels, num_labels, label_map, data_dir, wt = prepare_data(
    )

    data_args, model_args, config, tokenizer = prepare_config_and_tokenizer(
        data_dir, labels, num_labels, label_map)

    if 'add_vocab' in params:
        process_entity(tokenizer, train_df)
        process_entity(tokenizer, dev_df)
        process_entity(tokenizer, test_df)

    # ## Create Dataset Objects

    # collect extra arguments for the datasets/model; class weights and the
    # optional top-model choice are attached below
    xargs = params.get('xargs') or {}
    xargs['wt'] = wt
    print('Got class weights')
    xargs["top_model"] = params.get("top_model")

    train_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=data_args['overwrite_cache'],  # True
        mode=Split.train,
        data_size=params["data_size"],
        xargs=xargs)

    eval_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=data_args['overwrite_cache'],
                              mode=Split.dev,
                              data_size=100)

    # ## Prepare test data, run trainer over test data and print metrics

    # pass overwrite_cache=True so that fresh predictions can be made simply by replacing test.txt
    test_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=True,
                              mode=Split.test,
                              data_size=100)

    print(len(train_dataset), len(eval_dataset), len(test_dataset))
    wb_run.finish()

    # Train top-model using the Trainer API
    if params.get("hyp"):
        run_hyperp(train_dataset, eval_dataset, config, model_args, labels,
                   num_labels, label_map, tokenizer, xargs)
        return

    trainer, model = run_train(train_dataset, eval_dataset, config, model_args,
                               labels, num_labels, label_map, tokenizer, xargs)

    gc.collect()
    torch.cuda.empty_cache()

    wb_run = wandb.init(project="NER", name=params['exp_name'] + "_summary")
    report = run_test(trainer, model, train_dataset, train_df, label_map)
    wandb.run.summary["train_report"] = report
    report = run_test(trainer, model, eval_dataset, dev_df, label_map)
    wandb.run.summary["val_report"] = report
    report = run_test(trainer, model, test_dataset, test_df, label_map)
    wandb.run.summary["test_report"] = report
    wandb.run.summary["model"] = model.__repr__()
    wandb.run.summary["data"] = {
        "train": train_dataset.__len__(),
        "val": eval_dataset.__len__(),
        "test": test_dataset.__len__(),
        "wt": wt
    }
    params["model_type"] = params["model_type"].name
    wandb.run.summary["params"] = params
    wb_run.finish()
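
A hedged driver sketch for the second variant. The keys listed are the ones this snippet itself touches (exp_name, set_seed, seed_value, data_size, model_type, and the optional add_vocab, xargs, top_model and hyp switches); every example value, and the ModelType enum spelling, is an assumption rather than the project's real configuration.

# Hypothetical driver for Example #2 (illustrative values only).
if __name__ == "__main__":
    example_params = {
        "exp_name": "ner_bert_run1",   # prefix for the two wandb runs
        "set_seed": True,              # if True, random_seed_set(seed_value) is called
        "seed_value": 42,
        "data_size": 1000,             # passed to the training NerDataset
        "model_type": ModelType.BERT,  # assumed enum; the snippet only logs .name
        "top_model": "crf",            # forwarded to the datasets via xargs (assumed value)
        # "add_vocab": True,           # if present, process_entity() runs on each split
        # "hyp": True,                 # if truthy, run_hyperp() replaces normal training
        # "xargs": {},                 # extra dataset/model arguments, merged with wt
    }
    main(example_params)

Both sketches rely on the module-level helpers (prepare_data, NerDataset, Split, run_train, run_test, and for Example #2 also wandb, random_seed_set, process_entity, run_hyperp) that the examples call but do not define.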