def main(config, results):
    """Train one model per fold and return the results of the last fold.

    Args:
        config: Run configuration. Passed to the ``tu``/``util`` helpers,
            ``Dataset`` and ``Experiment``; ``config.device`` is read here.
        results: Results object given to the first fold's ``Experiment``.
            Each later fold receives the ``results`` of the fold before it.

    Returns:
        ``experiment.results`` of the final fold, or ``results`` unchanged if
        the dataset yields no folds.
    """
    tokenizer_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, tokenizer_config)
    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names

    # The model config is loaded a second time after the tokenizer was built.
    # NOTE(review): presumably so that nothing done while loading the
    # tokenizer leaks into the config used for the models -- confirm.
    model_config = tu.load_model_config(config)

    folds = dataset.get_train_valid_dataloaders(include_valid_df=True)
    for fold, (train_loader, valid_pair) in enumerate(folds):
        # The validation dataframe is yielded alongside the loader but is not
        # needed in this entry point.
        valid_loader, _valid_df = valid_pair

        print(f"------------------ BEGIN ITER {fold} -----------------------")

        # Fresh model for every fold, seeded after it is moved to the device.
        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=f"fold{fold}",
            results=results,
        )
        _global_step, _tr_loss = experiment.train(
            train_loader,
            valid_dataloader=valid_loader,
        )

        # Thread the results into the next fold's Experiment.
        results = experiment.results

        print(f"================== DONE ITER {fold} =======================\n\n")

    return results
def main(config, results):
    """Train one model per fold of the dataset selected by ``config.dataset``.

    The fold generator is looked up dynamically on the ``Dataset`` instance as
    ``get_<config.dataset>_train_valid_dataloaders``.

    Args:
        config: Run configuration. ``config.dataset`` selects the dataloader
            factory and ``config.device`` the device the model is moved to.
        results: Results object given to the first fold's ``Experiment``.
            Each later fold receives the ``results`` of the fold before it.

    Returns:
        ``experiment.results`` of the final fold, or ``results`` unchanged if
        no folds are yielded.
    """
    tokenizer_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, tokenizer_config)
    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names

    make_folds = getattr(dataset, f"get_{config.dataset}_train_valid_dataloaders")
    folds = make_folds(include_valid_df=True)

    for fold, (train_loader, valid_pair) in enumerate(folds):
        valid_loader, _valid_df = valid_pair  # dataframe not used here

        print(f"------------------ BEGIN ITER {fold} -----------------------")

        # The original model config has to be reloaded for every fold to avoid
        # a vocabulary size mismatch caused by custom tokens.
        fold_model_config = tu.load_model_config(config)
        model = tu.load_model(config, fold_model_config)
        # Make the embedding matrix match the tokenizer's current size.
        model.resize_token_embeddings(len(tokenizer))
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=f"fold{fold}",
            results=results,
        )
        _global_step, _tr_loss = experiment.train(
            train_loader,
            valid_dataloader=valid_loader,
        )

        # Thread the results into the next fold's Experiment.
        results = experiment.results

        print(f"================== DONE ITER {fold} =======================\n\n")

    return results
def main(config, results):
    """Run a single train/valid/test experiment on the configured dataset.

    The dataloaders come from the ``Dataset`` method named
    ``get_<config.dataset>_train_valid_test_dataloaders``.

    Args:
        config: Run configuration. ``config.dataset`` selects the dataloader
            factory and ``config.device`` the device the model is moved to.
        results: Results object handed to the ``Experiment``.

    Returns:
        ``experiment.results`` after training.
    """
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names

    make_loaders = getattr(
        dataset, f"get_{config.dataset}_train_valid_test_dataloaders")
    train_loader, valid_loader, test_loader = make_loaders()

    model = tu.load_model(config, model_config)
    # Make the embedding matrix match the tokenizer's current size.
    model.resize_token_embeddings(len(tokenizer))
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(
        config,
        model,
        tokenizer,
        label_names=label_names,
        results=results,
    )
    _global_step, _tr_loss = experiment.train(
        train_loader,
        valid_dataloader=valid_loader,
        test_dataloader=test_loader,
    )
    return experiment.results
def main(config, results):
    """Run active learning on a fixed, stratified 80/20 hold-out split.

    Args:
        config: Run configuration. Reads ``config.clap`` (dataset class),
            ``config.run_configs`` (key into ``RUN_CONFIGS``) and
            ``config.seed`` (random state of the split).
        results: Results object forwarded to ``active_learn``.

    Returns:
        Whatever ``active_learn`` returns.
    """
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    dataset_cls = ClapDataset if config.clap else Dataset
    ds = dataset_cls(config, tokenizer)

    # No back-translated frames are supplied in this setting.
    backtrans_dfs = []
    run_configs = RUN_CONFIGS[config.run_configs]

    # Cross-validation is not used here: a single shuffled split, stratified
    # on the label id, with 20% of the rows held out for testing.
    train_valid_df, test_df = train_test_split(
        ds.df,
        test_size=0.2,
        shuffle=True,
        stratify=ds.df.label_id,
        random_state=config.seed,
    )

    return active_learn(
        config,
        model_config,
        tokenizer,
        results,
        ds.label_names,
        test_df,
        train_valid_df,
        backtrans_dfs,
        ds.get_dataloader,
        run_configs,
        balance=True,
    )
def main(config, results):
    """Train a single model, then save it under ``comment_code_shuffle``.

    Args:
        config: Run configuration. ``config.device`` selects the device the
            model is moved to.
        results: Results object handed to the ``Experiment``.

    Returns:
        ``experiment.results`` as captured right after training and before
        the model is saved.
    """
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names
    train_loader, valid_loader = dataset.get_train_valid_dataloaders()

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(
        config,
        model,
        tokenizer,
        label_names=label_names,
        results=results,
    )
    _global_step, _tr_loss = experiment.train(
        train_loader,
        valid_dataloader=valid_loader,
    )

    # Capture the results before saving, then save to the path resolved by
    # util.models_path().
    trained_results = experiment.results
    save_path = util.models_path('comment_code_shuffle')
    experiment.save_model(save_path)
    return trained_results
def main(config, results):
    """Run active learning on ``SentiDataset`` with back-translated train data.

    Args:
        config: Run configuration. ``config.run_configs`` is the key looked up
            in ``RUN_CONFIGS``.
        results: Results object forwarded to ``active_learn``.

    Returns:
        Whatever ``active_learn`` returns.
    """
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    dataset = SentiDataset(config, tokenizer)

    # Back-translations of the 'train' split for every language the dataset
    # lists in ALL_BACKTRANS_LANGS.
    backtrans_dfs = dataset.load_backtrans_dfs(dataset.ALL_BACKTRANS_LANGS, 'train')
    run_configs = RUN_CONFIGS[config.run_configs]

    return active_learn(
        config,
        model_config,
        tokenizer,
        results,
        dataset.label_names,
        dataset.test_df,
        dataset.train_valid_df,
        backtrans_dfs,
        dataset.get_dataloader,
        run_configs,
    )
def main(config, results):
    """Train one model per fold and optionally dump per-fold interpretations.

    When ``config.interp_out_file`` is set, the validation set of every fold
    is interpreted, the resulting frame is printed in full and written as CSV
    next to that path, with ``_iter<fold>`` appended to the file name.

    Args:
        config: Run configuration. Reads ``config.clap`` (dataset class),
            ``config.interp_out_file`` and ``config.device``.
        results: Results object given to the first fold's ``Experiment``.
            Each later fold receives the ``results`` of the fold before it.

    Returns:
        ``experiment.results`` of the final fold, or ``results`` unchanged if
        the dataset yields no folds.
    """
    tokenizer_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, tokenizer_config)

    dataset_cls = ClapDataset if config.clap else Dataset
    dataset = dataset_cls(config, tokenizer)
    label_names = dataset.label_names

    # The model config is loaded a second time after the tokenizer was built.
    # NOTE(review): presumably so that nothing done while loading the
    # tokenizer leaks into the config used for the models -- confirm.
    model_config = tu.load_model_config(config)

    if config.interp_out_file:
        interp_out_file = Path(config.interp_out_file)
    else:
        interp_out_file = None

    folds = dataset.get_train_valid_dataloaders(include_valid_df=True)
    for fold, (train_loader, valid_pair) in enumerate(folds):
        valid_loader, valid_df = valid_pair

        print(f"------------------ BEGIN ITER {fold} -----------------------")

        # Fresh model for every fold, seeded after it is moved to the device.
        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=f"fold{fold}",
            results=results,
        )
        _global_step, _tr_loss = experiment.train(
            train_loader,
            valid_dataloader=valid_loader,
        )

        if interp_out_file:
            interp_df = experiment.interpret(
                valid_loader, valid_df, label_names=label_names)
            # Lift pandas' row/column display limits only while printing.
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                print(interp_df)
            fold_csv = interp_out_file.with_name(
                f"{interp_out_file.name}_iter{fold}")
            interp_df.to_csv(fold_csv, index=False)

        # Thread the results into the next fold's Experiment.
        results = experiment.results

        print(f"================== DONE ITER {fold} =======================\n\n")

    return results
def main(config, results):
    """Train on ``SentiDataset`` and evaluate on one or more named test sets.

    The regular test set is always evaluated under the name ``'test'``.
    Further test sets are added depending on the configuration flags:
    ``config.jira`` -> ``'JIRA'``, ``config.app_reviews`` -> ``'AppReviews'``,
    ``config.sentidata_so`` -> ``'StackOverflow (SentiData)'``.

    Args:
        config: Run configuration. Reads the three flags above and
            ``config.device``.
        results: Results object handed to the ``Experiment``.

    Returns:
        ``experiment.results`` after training.

    Note:
        Sets the pandas option ``display.max_rows`` to ``None`` for the whole
        process; the option is not restored afterwards.
    """
    pd.set_option('display.max_rows', None)

    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    ds = SentiDataset(config, tokenizer)

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    train_dataloader, valid_dataloader = ds.get_train_valid_dataloaders()

    # Build the test dataloader once and reuse it. The previous code called
    # ds.get_test_dataloader() three times and threw two of the results away.
    # NOTE(review): assumes get_test_dataloader() is a plain factory without
    # side effects that the other dataloader getters rely on -- confirm.
    test_dataloader = ds.get_test_dataloader()
    test_dataloaders = {'test': test_dataloader}

    if config.jira:
        # A (dataloader, options) pair: predictions for this set are passed
        # through ds.neutral_to_negative via pred_label_ids_func.
        test_dataloaders['JIRA'] = (
            ds.get_jira_dataloader(),
            dict(pred_label_ids_func=ds.neutral_to_negative),
        )
    if config.app_reviews:
        test_dataloaders['AppReviews'] = ds.get_app_reviews_dataloader()
    if config.sentidata_so:
        test_dataloaders['StackOverflow (SentiData)'] = (
            ds.get_stack_overflow_dataloader())

    experiment = Experiment(
        config,
        model,
        tokenizer,
        label_names=ds.label_names,
        results=results,
    )
    # The dict of named test sets is passed as test_dataloader.
    experiment.train(
        train_dataloader,
        valid_dataloader=valid_dataloader,
        test_dataloader=test_dataloaders,
    )
    return experiment.results
def main(config, results):
    """Train, save, reload and re-evaluate a model as a save/load sanity run.

    The model is trained against the dataset's "fake" validation loader, then
    evaluated on the real test and validation loaders (``'test_final'``,
    ``'valid_final'``). It is saved to ``'test_model_complexity'``, loaded
    back from that path and evaluated again (``'test_final_reloaded'``,
    ``'valid_final_reloaded'``).

    Args:
        config: Run configuration. ``config.model_path`` is overwritten with
            the save path and is NOT restored afterwards; ``config.device``
            selects the device.
        results: Results object handed to the first ``Experiment``.

    Returns:
        ``results`` of the second (reloaded) experiment.
    """
    save_path = 'test_model_complexity'

    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    ds = Dataset(config, tokenizer)
    label_names = ds.label_names
    train_dataloader = ds.get_train_dataloader()
    fake_valid_dataloader = ds.get_fake_valid_dataloader()

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config, model, tokenizer,
                            label_names=label_names, results=results)
    # Training only sees the fake validation loader; the real validation and
    # test loaders are created afterwards and used for evaluation only.
    experiment.train(train_dataloader, valid_dataloader=fake_valid_dataloader)

    valid_dataloader = ds.get_valid_dataloader()
    test_dataloader = ds.get_test_dataloader()
    experiment.evaluate('test_final', test_dataloader)
    experiment.evaluate('valid_final', valid_dataloader)
    experiment.save_model(save_path)

    # Point the config at the checkpoint that was just written. The attribute
    # is assigned inside ``with config:``, as in the original code.
    with config:
        config.model_path = save_path
    model = tu.load_model(config, model_config)
    model.to(config.device)

    # Logger.warn() is a deprecated alias and was removed in Python 3.13.
    logger.warning('#################################### =========================')

    # Hand the first experiment's results to the second one, so the
    # '*_final' metrics are kept next to the '*_final_reloaded' ones.
    # Previously the original ``results`` argument was passed again.
    # NOTE(review): no-op if Experiment records into ``results`` in place.
    experiment = Experiment(config, model, tokenizer,
                            label_names=label_names,
                            results=experiment.results)
    experiment.evaluate('test_final_reloaded', test_dataloader)
    experiment.evaluate('valid_final_reloaded', valid_dataloader)
    return experiment.results
def main(config, results):
    """Run one training experiment with validation and test dataloaders.

    Args:
        config: Run configuration. ``config.device`` selects the device the
            model is moved to.
        results: Results object handed to the ``Experiment``.

    Returns:
        ``experiment.results`` after training.
    """
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names

    train_loader, valid_loader = dataset.get_train_valid_dataloaders()
    test_loader = dataset.get_test_dataloader()

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(
        config,
        model,
        tokenizer,
        label_names=label_names,
        results=results,
    )
    _global_step, _tr_loss = experiment.train(
        train_loader,
        valid_dataloader=valid_loader,
        test_dataloader=test_loader,
    )
    return experiment.results
def main():
    """Train one model per ``CadoDataset`` train/valid split, with a shared test set.

    The configuration is obtained from ``get_config(parse_args)``, then
    ``logging_steps`` is forced to 50 and ``train_epochs`` to 5. Each split's
    ``Experiment`` receives the results of the previous split (``None`` for
    the first one).

    Returns:
        None. The accumulated results are not returned to the caller.
    """
    config = get_config(parse_args)
    util.set_seed(config)

    # Override two settings; attributes are assigned inside ``with config:``.
    with config:
        config.logging_steps = 50
        config.train_epochs = 5
    print("model is now", config.model_path)

    ds = CadoDataset(config)
    label_names = ds.label_names
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    results = None
    # One test dataloader, shared by every split.
    test_dataloader = ds.get_test_dataloader(tokenizer)

    splits = ds.get_all_train_valid_dataloaders(tokenizer, include_valid_df=True)
    for i, (train_dataloader, (valid_dataloader, _valid_df)) in enumerate(splits):
        print(f"------------------ BEGIN ITER {i} -----------------------")

        # Fresh model for every split, seeded after it is moved to the device.
        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        # Runs are named after the single class being trained, or 'multi'
        # when config.single_class is falsy.
        run_name = f"{config.single_class or 'multi'}_{i}"
        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=run_name,
            results=results,
        )
        experiment.train(
            train_dataloader,
            valid_dataloader=valid_dataloader,
            test_dataloader=test_dataloader,
        )

        # Thread the results into the next split's Experiment.
        results = experiment.results

        print(f"================== DONE ITER {i} =======================\n\n")
def main(config, results):
    """Train one binary ``TDDataset`` model per project and return the results.

    With ``config.single_project`` set, only that project is run, using
    ``ds.get_train_valid_dataloaders``. Otherwise every item yielded by
    ``ds.get_fold_dataloaders`` is run. When ``config.interp_out_file`` is
    set, each run's validation set is interpreted, printed in full and
    written as CSV next to that path, with ``<project>_`` prefixed to the
    file name.

    Args:
        config: Run configuration. Reads ``self_train_thresh``,
            ``keyword_masking_frac``, ``single_project``, ``interp_out_file``
            and ``device``.
        results: Results object given to the first project's ``Experiment``.
            Each later project receives the ``results`` of the one before it.

    Returns:
        ``experiment.results`` of the last project run, or ``results``
        unchanged if nothing was iterated.
    """
    logger.warning('Unclassified threshold: %s', config.self_train_thresh)

    dataset = TDDataset(
        config,
        binary=True,
        self_train_thresh=config.self_train_thresh,
        keyword_masking_frac=config.keyword_masking_frac,
    )
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    label_names = dataset.label_names

    single_project = config.single_project
    if single_project:
        # One-element list shaped like the items of get_fold_dataloaders():
        # (project, train_loader, (valid_loader, valid_df)).
        loaders = dataset.get_train_valid_dataloaders(
            tokenizer, single_project, include_valid_df=True)
        project_runs = [(single_project, *loaders)]
    else:
        project_runs = dataset.get_fold_dataloaders(
            tokenizer, include_valid_df=True)

    if config.interp_out_file:
        interp_out_file = Path(config.interp_out_file)
    else:
        interp_out_file = None

    for project, train_loader, (valid_loader, valid_df) in project_runs:
        print(
            f"------------------ BEGIN PROJECT {project} -----------------------"
        )

        # Fresh model for every project, seeded after it is moved to the device.
        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=project,
            results=results,
        )
        _global_step, _tr_loss = experiment.train(
            train_loader,
            valid_dataloader=valid_loader,
        )

        if interp_out_file:
            interp_df = experiment.interpret(valid_loader, valid_df)
            # Lift pandas' row/column display limits only while printing.
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                print(interp_df)
            project_csv = interp_out_file.with_name(
                f"{project}_{interp_out_file.name}")
            interp_df.to_csv(project_csv, index=False)

        # Thread the results into the next project's Experiment.
        results = experiment.results

        print(
            f"================== DONE PROJECT {project} =======================\n\n"
        )

    return results