def main(config, results):
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    if config.clap:
        ds = ClapDataset(config, tokenizer)
    else:
        ds = Dataset(config, tokenizer)

    label_names = ds.label_names
    model_config = tu.load_model_config(config)

    interp_out_file = Path(config.interp_out_file) if config.interp_out_file else None

    for i, (train_dataloader, (valid_dataloader, valid_df)) in enumerate(ds.get_train_valid_dataloaders(include_valid_df=True)):
        print(f"------------------ BEGIN ITER {i} -----------------------")
        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        run_name = f"fold{i}"

        experiment = Experiment(config, model, tokenizer, label_names=label_names, run_name=run_name, results=results)
        global_step, tr_loss = experiment.train(
            train_dataloader, valid_dataloader=valid_dataloader)


        if interp_out_file:
            interp_df = experiment.interpret(valid_dataloader, valid_df, label_names=label_names)
            with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                print(interp_df)
            interp_df.to_csv(interp_out_file.with_name(f"{interp_out_file.name}_iter{i}"), index=False)

        results = experiment.results
        # experiment.evaluate('valid', valid_dataloader)
        print(f"================== DONE ITER {i} =======================\n\n")

    return results
Пример #2
0
def main(config, results):

    logger.warning('Unclassified threshold: %s', config.self_train_thresh)

    ds = TDDataset(config,
                   binary=True,
                   self_train_thresh=config.self_train_thresh,
                   keyword_masking_frac=config.keyword_masking_frac)

    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    label_names = ds.label_names

    #project_name = 'emf-2.4.1'
    project_name = config.single_project

    iter_obj = [
        (project_name, *ds.get_train_valid_dataloaders(
            tokenizer, project_name, include_valid_df=True))
    ] if project_name else ds.get_fold_dataloaders(tokenizer,
                                                   include_valid_df=True)

    interp_out_file = Path(
        config.interp_out_file) if config.interp_out_file else None

    # for train_dataloader, valid_dataloader in [ds.get_train_valid_dataloaders(tokenizer, project_name)]:
    for project_name, train_dataloader, (valid_dataloader,
                                         valid_df) in iter_obj:
        print(
            f"------------------ BEGIN PROJECT {project_name} -----------------------"
        )

        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(config,
                                model,
                                tokenizer,
                                label_names=label_names,
                                run_name=project_name,
                                results=results)
        global_step, tr_loss = experiment.train(
            train_dataloader, valid_dataloader=valid_dataloader)

        if interp_out_file:
            interp_df = experiment.interpret(valid_dataloader, valid_df)
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                print(interp_df)
            interp_df.to_csv(interp_out_file.with_name(
                f"{project_name}_{interp_out_file.name}"),
                             index=False)

        results = experiment.results

        # experiment.evaluate('valid', valid_dataloader)

        print(
            f"================== DONE PROJECT {project_name} =======================\n\n"
        )
    return results