def cleanup_outdated_predictions(results_folder: str = None, dry_run=True, ignore_filter: str = 'dummy'):
    folder = 'data/results/{}'.format(results_folder) if results_folder else get_result_folders()[-1]
    result_files = get_result_filenames_from_folder(folder)
    for result_file in result_files:
        if ignore_filter and ignore_filter in result_file:
            continue
        prediction_file = '{}/predictions/{}'.format(
            folder, filename_utils.get_filename_only(result_file))
        predictions_exist = os.path.exists(prediction_file)
        if not predictions_exist:
            LOGGER.warning('Did not find prediction file for: {}'.format(result_file))
            continue
        with open(result_file, 'rb') as f:
            result_data = pickle.load(f)
        with open(prediction_file, 'rb') as f:
            r = pickle.load(f)
        result_git_commit = result_data['meta_data']['git_commit']
        git_commit = r['meta_data']['git_commit']
        if result_git_commit != git_commit:
            if dry_run:
                LOGGER.info('Outdated prediction: {}. dry_run=True, so it will not get deleted'.format(prediction_file))
                continue
            LOGGER.info('Outdated prediction: {}. Deleting'.format(prediction_file))
            os.remove(prediction_file)
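
# A minimal usage sketch (the folder name is hypothetical; omitting it falls
# back to the most recent results folder):
cleanup_outdated_predictions(results_folder='2018-05-01', dry_run=True)   # only log outdated predictions
cleanup_outdated_predictions(results_folder='2018-05-01', dry_run=False)  # actually delete them
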
def preprocess_args(config):
    config['device'] = get_device()
    config['n_classes'] = 2 if config['loss_func'] == 'ce' else 1

    # Check all provided paths:
    if not os.path.exists(config['data_path']):
        raise ValueError("[!] ERROR: Dataset path does not exist")
    else:
        LOGGER.info("Data path checked..")

    if not os.path.exists(config['model_path']):
        LOGGER.warning("Creating checkpoint path for saved models at: {}\n".format(config['model_path']))
        os.makedirs(config['model_path'])
    else:
        LOGGER.info("Model save path checked..")

    if 'config' in config:
        if not os.path.isfile(config['config']):
            raise ValueError("[!] ERROR: config JSON path does not exist")
        else:
            LOGGER.info("config JSON path checked..")

    if not os.path.exists(config['vis_path']):
        LOGGER.warning("Creating checkpoint path for Tensorboard visualizations at: {}\n".format(config['vis_path']))
        os.makedirs(config['vis_path'])
    else:
        LOGGER.info("Tensorboard Visualization path checked..")
        LOGGER.info("Cleaning Visualization path of older tensorboard files...\n")
        # shutil.rmtree(config['vis_path'])

    # Print args
    print("\n" + "x" * 50 + "\n\nRunning training with the following parameters: \n")
    for key, value in config.items():
        if not key.endswith('transf'):
            print(key + ' : ' + str(value))
    print("\n" + "x" * 50)

    # config['vis_path'] = os.path.join(config['vis_path'], '{}_conf{}'.format(config['pretrained_model_file'], config['confounder_repeat']))
    config['writer'] = SummaryWriter(config['vis_path'])
    set_seed(config['seed'])
    return config
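
# A minimal usage sketch (all paths and values below are hypothetical; the
# config must contain at least the keys preprocess_args reads):
config = preprocess_args({
    'loss_func': 'ce',             # selects n_classes = 2
    'data_path': 'data/',          # must already exist, else ValueError
    'model_path': 'checkpoints/',  # created if missing
    'vis_path': 'runs/exp1/',      # created if missing; gets the SummaryWriter
    'seed': 42,
})
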
def add_word(word, weights):
    word_id = len(result.vocab)
    if word in result.vocab:
        logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
        return
    if counts is None:
        # most common scenario: no vocab file given. just make up some bogus counts, in descending order
        result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
    elif word in counts:
        # use count from the vocab file
        result.vocab[word] = Vocab(index=word_id, count=counts[word])
    else:
        # vocab file given, but word is missing -- set count to None (TODO: or raise?)
        logger.warning("vocabulary file is incomplete: '%s' is missing", word)
        result.vocab[word] = Vocab(index=word_id, count=None)
    result.syn0[word_id] = weights
    result.index2word.append(word)
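
# A minimal, self-contained sketch of the enclosing scope that add_word assumes
# (it is a closure inside a word2vec-format loader; Result/Vocab here are
# hypothetical stand-ins for the real gensim objects):
import logging
import numpy as np

logger = logging.getLogger(__name__)
fname = 'vectors.bin'   # hypothetical path, only used in warning messages
counts = None           # no separate vocab-counts file given
vocab_size, vector_size = 2, 3

class Vocab:
    def __init__(self, index, count):
        self.index, self.count = index, count

class Result:
    def __init__(self):
        self.vocab, self.index2word = {}, []
        self.syn0 = np.zeros((vocab_size, vector_size), dtype=np.float32)

result = Result()
add_word('hello', np.array([0.1, 0.2, 0.3], dtype=np.float32))  # registers word 0
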
                    help='Hidden size for race and gender')
args, unparsed = parser.parse_known_args()
config = args.__dict__
wandb.config.update(config)

config['device'] = get_device()
config['n_classes'] = 2 if config['loss_func'] == 'ce' else 1

# Check all provided paths:
if not os.path.exists(config['data_path']):
    raise ValueError("[!] ERROR: Dataset path does not exist")
else:
    LOGGER.info("Data path checked..")

if not os.path.exists(config['model_path']):
    LOGGER.warning("Creating checkpoint path for saved models at: {}\n".format(config['model_path']))
    os.makedirs(config['model_path'])
else:
    LOGGER.info("Model save path checked..")

if 'config' in config:
    if not os.path.isfile(config['config']):
        raise ValueError("[!] ERROR: config JSON path does not exist")
    else:
        LOGGER.info("config JSON path checked..")

if not os.path.exists(config['vis_path']):
    LOGGER.warning("Creating checkpoint path for Tensorboard visualizations at: {}\n".format(config['vis_path']))
    os.makedirs(config['vis_path'])
else:
    LOGGER.info("Tensorboard Visualization path checked..")
def run_classification_task(task: ExperimentTask, cfo: ClassificationOptions, experiment_config: dict):
    helper.set_random_seed()
    args = cfo

    result_filename_tmpl = filename_utils.get_result_filename_for_task(
        task, experiment_config=experiment_config, cfo=cfo)
    result_file = '{}/{}'.format(cfo.results_folder, result_filename_tmpl)
    predictions_file = '{}/{}'.format(cfo.predictions_folder, result_filename_tmpl)
    classifier_file = '{}/{}'.format(cfo.classifier_folder, result_filename_tmpl)

    if not cfo.force and os.path.exists(result_file):
        return

    time_checkpoints = {}

    def add_time_checkpoint(name):
        time_checkpoints[name] = time()

    add_time_checkpoint('start')
    X, Y, estimator, param_grid = task.fn()
    add_time_checkpoint('retrieved_data')

    # A good heuristic of whether it's a gram matrix is whether the dimensions are the same
    is_precomputed = isinstance(X, np.ndarray) and X.shape[0] == X.shape[1]
    # This is also a heuristic
    is_dummy = 'classifier__strategy' in param_grid

    # Add classifiers, instantiate transformer classes and merge with experiment config
    param_grid = experiment_helper.prepare_param_grid(task, param_grid, experiment_config)
    LOGGER.info('ParamGrid: {}\n\n'.format(pipeline_helper.remove_complex_types(param_grid)))

    X_train, Y_train, X_test, Y_test, train_i, test_i = X, Y, [], [], range(len(X)), []

    if not is_dummy:  # and cfo.create_predictions:
        # Hold out validation set for predictions
        try:
            X_train, X_test, Y_train, Y_test, train_i, test_i = train_test_split(
                X, Y,
                test_size=cfo.prediction_test_size,
                is_precomputed=is_precomputed,
            )
        except Exception as e:
            LOGGER.warning('Could not split dataset for predictions')
            LOGGER.exception(e)

    def get_cv(splits):
        if splits == -1:
            _, _, _, _, X_train_i, X_test_i = train_test_split(
                X_train, Y_train, test_size=0.33, is_precomputed=is_precomputed)
            cv = [(X_train_i, X_test_i)]
        else:
            cv = sklearn.model_selection.StratifiedKFold(
                n_splits=splits, shuffle=True, random_state=constants.RANDOM_SEED)
        return cv

    add_time_checkpoint('split_data')
    cv = get_cv(cfo.n_splits)

    should_refit = np.all([
        # not cfo.use_nested_cross_validation,
        not is_dummy,
        # cfo.create_predictions or cfo.save_best_clf
    ])

    gscv = GridSearchCV(estimator=estimator,
                        param_grid=param_grid,
                        cv=cv,
                        scoring=cfo.scoring,
                        n_jobs=cfo.n_jobs,
                        verbose=cfo.verbose,
                        refit=cfo.refit if should_refit else False)

    if cfo.use_nested_cross_validation and not is_dummy:
        cv_nested = get_cv(cfo.n_splits_nested)
        LOGGER.info('Using nested cross-validation')
        scores = sklearn.model_selection.cross_validate(
            gscv, X, Y,
            scoring=cfo.scoring,
            cv=cv_nested,
            n_jobs=cfo.n_jobs_outer,
            verbose=cfo.verbose,
            return_train_score=True)
        result = dict(scores, **param_grid)
        add_time_checkpoint('fitted_nested')
        results_helper.save_results(result, result_file, args, time_checkpoints=time_checkpoints)
        return

    gscv_result = gscv.fit(X_train, Y_train)
    add_time_checkpoint('fitted_gridsearch')

    if not is_dummy and cfo.create_predictions:
        if not len(X_test):
            LOGGER.warning('Validation set for prediction has no items')
        else:
            try:
                # Retrain the best classifier and get predictions on the validation set
                Y_test_pred = gscv_result.best_estimator_.predict(X_test)
                add_time_checkpoint('predicted')
                results_helper.save_results(
                    {
                        'gscv_result': remove_coefs_from_results(gscv_result.cv_results_),
                        'all_params': remove_coefs_from_results(param_grid),
                        'best_params': remove_coefs_from_results(gscv_result.best_params_),
                        'Y_real': Y_test,
                        'Y_pred': Y_test_pred,
                        'X_test': X_test,
                    },
                    predictions_file, args, time_checkpoints=time_checkpoints)
            except Exception as e:
                LOGGER.warning('Error while trying to retrain best classifier')
                LOGGER.exception(e)

    if cfo.save_best_clf:
        best_estimator = gscv_result.best_estimator_
        try:
            results_helper.save_results(
                {
                    'params': gscv_result.best_params_,
                    'classifier': best_estimator,
                },
                classifier_file, args, time_checkpoints=time_checkpoints)
        except Exception as e:
            LOGGER.warning('Error while saving best estimator: {}'.format(e))
            LOGGER.exception(e)

    add_time_checkpoint('finished')
    results_helper.save_results(gscv_result.cv_results_, result_file, args, time_checkpoints=time_checkpoints)
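
# Why is_precomputed matters above, in a minimal sketch: a precomputed gram
# matrix has to be split along BOTH axes, and the held-out block needs kernel
# values against the training samples (all names below are hypothetical).
import numpy as np

K = np.random.rand(10, 10)              # K[i, j] = k(x_i, x_j)
train_i, test_i = np.arange(8), np.arange(8, 10)
K_train = K[np.ix_(train_i, train_i)]   # (8, 8): train vs. train
K_test = K[np.ix_(test_i, train_i)]     # (2, 8): test vs. train
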
    '--model',
    type=str,
    default="BERT",
    help='Name of the model to use (BERT, RoBERTa, ELECTRA, ALBERT)')
parser.add_argument('--lr_head',
                    type=float,
                    default=1e-4,
                    help='Learning rate for the MLP head')
parser.add_argument('--num_layers_freeze',
                    type=int,
                    default=0,
                    help='Number of layers to freeze in BERT')
args, unparsed = parser.parse_known_args()
if len(unparsed) > 0:
    LOGGER.warning("There have been unprocessed parser arguments: " + str(unparsed))
config = args.__dict__
config = TrainerTemplate.preprocess_args(config)
# config['no_model_checkpoints'] = (config['no_model_checkpoints'] or config['debug'])

config['model'] = config['model'].lower()
assert config['model'] in MODEL_DICT, \
    "Given model is not known. Please choose between the following: " + str(MODEL_DICT.keys())
config['model'] = MODEL_DICT[config['model']]

# Tokenize
tokenizer = config['model']["tokenizer"].from_pretrained(config['model']['pretrain'])
tokenizer_func = partial(tokenizer,
                         max_length=config['max_txt_len'],
                         padding='longest',
def get_results(folder=None,
                results_directory=constants.RESULTS_FOLDER,
                log_progress=tqdm.tqdm_notebook,
                exclude_filter=None,
                include_filter=None,
                remove_split_cols=True,
                remove_rank_cols=True,
                remove_fit_time_cols=True,
                filter_out_experiment=None,
                ignore_experiments=True,
                only_load_dataset=None,
                fetch_predictions=False):
    '''
    Retrieves results from the result folder.

    Note: This function _seriously_ has to be refactored!

    Args:
        folder: the results folder. If not specified, defaults to the most recent results folder
        results_directory: the base folder
        log_progress: function to log the progress. Takes an iterable and yields its items
        exclude_filter: which files to exclude
        include_filter: which files to include
        remove_split_cols: whether to drop the per-split CV result columns
        remove_rank_cols: whether to drop the rank columns of the CV results
        remove_fit_time_cols: whether to drop the fit-time columns
        filter_out_experiment: if set, only load results belonging to this experiment
        ignore_experiments: whether to skip experiment result files (unless filter_out_experiment is set)
        only_load_dataset: only load results for these datasets
        fetch_predictions: whether to also retrieve the predictions and calculate scores on them

    Returns:
        pd.DataFrame: the results
    '''
    result_folders = get_result_folders(results_directory)
    folder = 'data/results/{}'.format(folder) if folder else result_folders[-1]
    result_files = get_result_filenames_from_folder(folder)

    if filter_out_experiment:
        result_files = [
            x for x in result_files
            if _get_experiment_name_from_filename(x) == filter_out_experiment
        ]

    if ignore_experiments and not filter_out_experiment:
        result_files = [x for x in result_files if 'experiment_' not in x]

    if only_load_dataset is not None:
        result_files = [
            x for x in result_files
            if filename_utils.get_dataset_from_filename(x) in only_load_dataset
        ]

    data_ = []
    for result_file in log_progress(result_files) if log_progress else result_files:
        if include_filter and include_filter not in result_file:
            continue
        if exclude_filter and exclude_filter in result_file:
            continue
        if '_nested_' in result_file:
            LOGGER.warning('Encountered nested CV result file. Currently not implemented. File: {}'.format(result_file))
            continue

        dataset_name = filename_utils.get_dataset_from_filename(result_file)
        with open(result_file, 'rb') as f:
            result_data = pickle.load(f)

        remove_transformer_classes(result_data)

        result_file = filename_utils.get_filename_only(result_file)
        result = result_data if 'params' in result_data else result_data['results']
        assert 'params' in result

        result = clean_result_keys(result)
        for idx, el in enumerate(result['params']):
            result['params'][idx] = clean_result_keys(el)

        prediction_file = '{}/predictions/{}'.format(
            folder, filename_utils.get_filename_only(result_file))
        predictions_exist = os.path.exists(prediction_file)

        num_results = len(result['params'])
        result['prediction_file_exists'] = [predictions_exist] * num_results

        if fetch_predictions and not predictions_exist:
            LOGGER.warning('fetch_predictions=True but could not find prediction: {}'.format(prediction_file))

        # Fetch predictions and check whether the git commits are the same.
        # Also, calculate the prediction scores
        if fetch_predictions and predictions_exist:
            with open(prediction_file, 'rb') as f:
                r = pickle.load(f)
            result_git_commit = result_data['meta_data']['git_commit']
            git_commit = r['meta_data']['git_commit']
            if not git_commit == result_git_commit:
                LOGGER.warning('Mismatched git commit for prediction/result file! Prediction: {}, Result: {}'.format(git_commit, result_git_commit))
            else:
                prediction = r['results']
                Y_real, Y_pred, X_test = prediction['Y_real'], prediction['Y_pred'], prediction['X_test']
                scores = calculate_scores(Y_real, Y_pred)
                for name, val in scores.items():
                    result['prediction_score_{}'.format(name)] = [val] * num_results
                result['prediction_file'] = [prediction_file] * num_results

        def is_graph_dataset():
            graph_file_types = [
                constants.TYPE_CONCEPT_MAP, constants.TYPE_COOCCURRENCE, 'graph_extra'
            ]
            is_graph_dataset_ = False
            for x in graph_file_types:
                if '_{}_'.format(x) in result_file:
                    is_graph_dataset_ = True
                    break
            return is_graph_dataset_

        result['combined'] = np.any([
            'graph_combined__dataset_' in result_file,
            'graph_text_combined__dataset_' in result_file
        ])

        # GRAPH
        if is_graph_dataset():
            is_cooccurrence_dataset = constants.TYPE_COOCCURRENCE in result_file
            result['type'] = constants.TYPE_COOCCURRENCE if is_cooccurrence_dataset else constants.TYPE_CONCEPT_MAP
            result['lemmatized'] = '_lemmatized_' in result_file
            result['kernel'] = get_kernel_from_filename(result_file)

            # Co-Occurrence
            if is_cooccurrence_dataset:
                parts = re.findall(r'cooccurrence_(.+?)_(.+?)_', result_file)[0]
                assert len(parts) == 2
                result['window_size'], result['words'] = parts
            # Concept Maps
            else:
                result['words'] = 'concepts'
        # DUMMY
        elif 'dummy' in result_file:
            result['type'] = 'dummy'
            result['words'] = 'dummy'
        # TEXT
        else:
            result['type'] = 'text'
            result['words'] = ['all'] * num_results

        if 'time_checkpoints' in result_data:
            timestamps = result_data['time_checkpoints']
            timestamps = sorted(timestamps.items(), key=lambda x: x[1])
            start = timestamps[0][1]
            end = timestamps[-1][1]
            result['timestamps'] = [timestamps] * num_results
            result['time'] = [end - start] * num_results

        result['filename'] = result_file
        result['dataset'] = dataset_name

        # Add meta data
        info = {}
        if 'results' in result_data:
            info = {
                'info__' + k: v
                for k, v in result_data.get('meta_data', result_data).items()
                if k != 'results'
            }
        result = dict(result, **{k: [v] * num_results for k, v in info.items()})
        data_.append(result)

    df_all = None
    for d in data_:
        result_df = pd.DataFrame(d)
        df_all = result_df if df_all is None else pd.concat([df_all, result_df])

    if df_all is None or not len(df_all):
        LOGGER.warning('Did not retrieve results! Aborting')
        return None

    # Remove cols ('_time' is a suffix, so it needs re.search instead of the start-anchored re.match)
    df_all = df_all[[
        x for x in df_all.columns.tolist()
        if (not remove_split_cols or not re.match(r'^split\d', x)) and
           (not remove_fit_time_cols or not re.search(r'_time$', x)) and
           (not remove_rank_cols or not re.match(r'rank_', x))
    ]]

    # Change the column order
    prio_columns = ['dataset', 'type', 'combined']
    low_prio_columns = ['params', 'filename'] + [
        c for c in df_all.columns if c.startswith('std_') or c.startswith('mean_')
    ]
    columns = df_all.columns.tolist()
    for c in prio_columns + low_prio_columns:
        columns.remove(c)
    return df_all.reset_index(drop=True)[prio_columns + columns + low_prio_columns]
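
# A minimal usage sketch (assumes the standard sklearn cv_results_ columns,
# e.g. 'mean_test_score', survive the column cleanup above):
df = get_results(fetch_predictions=True)
if df is not None:
    best = df.sort_values('mean_test_score', ascending=False).head()
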
    '--model',
    type=str,
    default="BERT",
    help='Name of the model to use (BERT, RoBERTa, ELECTRA, ALBERT, ...)')
parser.add_argument('--lr_head',
                    type=float,
                    default=1e-4,
                    help='Learning rate for the MLP head')
parser.add_argument('--num_layers_freeze',
                    type=int,
                    default=0,
                    help='Number of layers to freeze in BERT')
args, unparsed = parser.parse_known_args()
if len(unparsed) > 0:
    LOGGER.warning("There have been unprocessed parser arguments: " + str(unparsed))
config = args.__dict__
config = TrainerTemplate.preprocess_args(config)

config['model'] = config['model'].lower()
assert config['model'] in MODEL_DICT, \
    "Given model is not known. Please choose between the following: " + str(MODEL_DICT.keys())
config['model'] = MODEL_DICT[config['model']]

# Tokenize
tokenizer = config['model']["tokenizer"].from_pretrained(config['model']['pretrain'])
tokenizer_func = partial(tokenizer,
                         max_length=config['max_txt_len'],
                         padding='longest',
                         truncation=True,
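
# Sketch of the MODEL_DICT shape the lookup above assumes (the entries here are
# hypothetical; the real mapping is defined elsewhere in the repo):
from transformers import BertTokenizer, RobertaTokenizer

MODEL_DICT = {
    'bert': {'tokenizer': BertTokenizer, 'pretrain': 'bert-base-uncased'},
    'roberta': {'tokenizer': RobertaTokenizer, 'pretrain': 'roberta-base'},
}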