def process_dataset(dataset_name, args):
    try:
        embeddings_file = '{}/{}.npy'.format(args.embedding_save_path, dataset_name)
        if not args.force and os.path.exists(embeddings_file):
            return

        start_time = time()
        LOGGER.info('{:30} Starting'.format(dataset_name))

        X, Y = dataset_helper.get_dataset(dataset_name=dataset_name)
        X = preprocessing.preprocess_text_spacy(X, n_jobs=args.n_jobs_spacy)
        X = [[word.text.lower().strip() for word in doc] for doc in X]

        model = w2v_d2v.train_w2v(
            X,
            min_count=args.embedding_min_count,
            size=args.embedding_size,
            iter=args.embedding_iter,
            workers=args.n_jobs_w2v)
        word_vectors = model.wv
        del model

        # Note: the vectors are written with pickle despite the .npy extension.
        with open(embeddings_file, 'wb') as f:
            pickle.dump(word_vectors, f)

        duration_in_s = time_utils.seconds_to_human_readable(time() - start_time)
        LOGGER.info('{:30} Finished (time={})'.format(dataset_name, duration_in_s))
    except Exception as e:
        LOGGER.exception(e)
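# Hedged usage sketch for process_dataset above, not part of the original
# pipeline: the attribute names mirror exactly what the function reads from
# `args`, but the values, paths, and the 'ng20' dataset name are illustrative
# assumptions.
from argparse import Namespace

w2v_args = Namespace(
    embedding_save_path='data/embeddings',  # hypothetical output directory
    force=False,
    n_jobs_spacy=4,
    n_jobs_w2v=4,
    embedding_min_count=2,
    embedding_size=100,
    embedding_iter=5)
process_dataset('ng20', w2v_args)  # 'ng20' is a hypothetical dataset name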
def save_training_meta(args):
    # Commented out, since rank is not saved to args; the training scripts
    # already safeguard the call to save_training_meta.
    # if args.rank > 0:
    #     return

    # args is an EasyDict object, treat it the same as a normal dict
    os.makedirs(join(args.output_dir, 'log'), exist_ok=True)
    os.makedirs(join(args.output_dir, 'ckpt'), exist_ok=True)

    # training args
    save_args_path = join(args.output_dir, 'log', 'hps.json')
    save_json(vars(args), save_args_path, save_pretty=True)

    # model args
    model_config = load_json(args.model_config)
    save_model_config_path = join(args.output_dir, 'log', 'model_config.json')
    save_json(model_config, save_model_config_path, save_pretty=True)

    # git info
    try:
        LOGGER.info("Waiting on git info....")
        c = subprocess.run(["git", "rev-parse", "--abbrev-ref", "HEAD"],
                           timeout=10, stdout=subprocess.PIPE)
        git_branch_name = c.stdout.decode().strip()
        LOGGER.info("Git branch: %s", git_branch_name)
        c = subprocess.run(["git", "rev-parse", "HEAD"],
                           timeout=10, stdout=subprocess.PIPE)
        git_sha = c.stdout.decode().strip()
        LOGGER.info("Git SHA: %s", git_sha)
        git_dir = abspath(dirname(__file__))
        git_status = subprocess.check_output(
            ['git', 'status', '--short'],
            cwd=git_dir, universal_newlines=True).strip()
        with open(join(args.output_dir, 'log', 'git_info.json'), 'w') as writer:
            json.dump(
                {'branch': git_branch_name,
                 'is_dirty': bool(git_status),
                 'status': git_status,
                 'sha': git_sha},
                writer, indent=4)
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
        LOGGER.exception(e)
        LOGGER.warning("Git info not found. Saving code into zip instead...")

    # Save a copy of the codebase.
    # !!! Do not store heavy files in the codebase when using this.
    code_dir = dirname(dirname(realpath(__file__)))
    code_zip_filename = os.path.join(args.output_dir, "code.zip")
    LOGGER.info(f"Saving code from {code_dir} to {code_zip_filename}...")
    make_zipfile(code_dir, code_zip_filename,
                 enclosing_dir="code",
                 exclude_dirs_substring="results",
                 exclude_dirs=["results", "debug_results", "__pycache__"],
                 exclude_extensions=[".pyc", ".ipynb", ".swap"])
    LOGGER.info("Saving code done.")
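# Standalone sketch of the git-metadata capture used in save_training_meta:
# the same three git invocations, stdlib only, runnable in any git checkout.
# The function name collect_git_info is this sketch's own.
import subprocess

def collect_git_info(cwd='.', timeout=10):
    def run(*cmd):
        return subprocess.run(cmd, cwd=cwd, timeout=timeout,
                              stdout=subprocess.PIPE).stdout.decode().strip()

    status = run('git', 'status', '--short')
    return {
        'branch': run('git', 'rev-parse', '--abbrev-ref', 'HEAD'),
        'sha': run('git', 'rev-parse', 'HEAD'),
        'is_dirty': bool(status),
        'status': status,
    }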
async def _handle_internal_error(self, ctx, ex):
    """Handler for internal errors; logs them into the server text room."""
    # format_exception() already returns newline-terminated lines.
    ex_content = "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))
    LOGGER.exception(
        f"Unexpected exception on event: {ctx.message.content}. "
        f"Args: {ctx.args} Kwargs: {ctx.kwargs}. Content: {ex_content}"
    )
    if not self.logging_channel:
        return
    await self.logging_channel.send(content="```" + ex_content + "```")
async def on_error(self, event, *args, **kwargs):
    """Overall exception handler."""
    type_, value, tb = sys.exc_info()
    # format_exception() already returns newline-terminated lines.
    ex_content = "".join(traceback.format_exception(type_, value, tb))
    LOGGER.exception(
        f"Unexpected exception on event: {event}. "
        f"Args: {args} Kwargs: {kwargs}. Content: {ex_content}"
    )
    if not self.logging_channel:
        return
    await self.logging_channel.send(content="```" + ex_content + "```")
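# Sketch of how a handler with this signature is typically wired up in
# discord.py: on_error is picked up automatically when defined on a Client or
# commands.Bot subclass. The logging_channel attribute is this codebase's own
# convention, shown here only as an assumption.
from discord.ext import commands

class LoggingBot(commands.Bot):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Resolved later, e.g. in on_ready() via self.get_channel(channel_id)
        self.logging_channel = None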
def save_training_meta(args):
    if args.rank > 0:
        return

    # exist_ok=True also covers the case where output_dir already exists but
    # the subdirectories do not (the original `if not exists(...)` guard
    # skipped makedirs entirely in that case).
    os.makedirs(join(args.output_dir, "log"), exist_ok=True)
    os.makedirs(join(args.output_dir, "ckpt"), exist_ok=True)

    with open(join(args.output_dir, "log", "hps.json"), "w") as writer:
        json.dump(vars(args), writer, indent=4)
    with open(args.model_config) as reader:
        model_config = json.load(reader)
    with open(join(args.output_dir, "log", "model.json"), "w") as writer:
        json.dump(model_config, writer, indent=4)

    # git info
    try:
        LOGGER.info("Waiting on git info....")
        c = subprocess.run(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
            timeout=10,
            stdout=subprocess.PIPE,
        )
        git_branch_name = c.stdout.decode().strip()
        LOGGER.info("Git branch: %s", git_branch_name)
        c = subprocess.run(
            ["git", "rev-parse", "HEAD"], timeout=10, stdout=subprocess.PIPE
        )
        git_sha = c.stdout.decode().strip()
        LOGGER.info("Git SHA: %s", git_sha)
        git_dir = abspath(dirname(__file__))
        git_status = subprocess.check_output(
            ["git", "status", "--short"], cwd=git_dir, universal_newlines=True
        ).strip()
        with open(join(args.output_dir, "log", "git_info.json"), "w") as writer:
            json.dump(
                {
                    "branch": git_branch_name,
                    "is_dirty": bool(git_status),
                    "status": git_status,
                    "sha": git_sha,
                },
                writer,
                indent=4,
            )
    except subprocess.TimeoutExpired as e:
        LOGGER.exception(e)
        LOGGER.warning("Git info not found. Moving right along...")
async def _handle_internal_error(self, ctx, ex):
    """Handler for internal errors; logs them into the server text room."""
    # format_exception() already returns newline-terminated lines.
    ex_content = "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))

    # Ignore errors caused by re-registering cogs on restart.
    # TODO: fix this properly
    if "is already registered." in ex_content:
        return

    LOGGER.exception(
        f"Unexpected exception on event: {ctx.message.content}. "
        f"Args: {ctx.args} Kwargs: {ctx.kwargs}. Content: {ex_content}"
    )
    if not self.logging_channel:
        return
    await self.logging_channel.send(content="```" + ex_content + "```")
async def on_error(self, event, *args, **kwargs):
    """Overall exception handler."""
    type_, value, tb = sys.exc_info()
    # format_exception() already returns newline-terminated lines.
    ex_content = "".join(traceback.format_exception(type_, value, tb))

    # Ignore errors caused by re-registering cogs on restart.
    # TODO: fix this properly
    if "is already registered." in ex_content:
        return

    LOGGER.exception(
        f"Unexpected exception on event: {event}. "
        f"Args: {args} Kwargs: {kwargs}. Content: {ex_content}"
    )
    if not self.logging_channel:
        return
    await self.logging_channel.send(content="```" + ex_content + "```")
def save_training_meta(args):
    if args.rank > 0:
        return

    # exist_ok=True also covers the case where output_dir already exists but
    # the subdirectories do not.
    os.makedirs(join(args.output_dir, 'log'), exist_ok=True)
    os.makedirs(join(args.output_dir, 'ckpt'), exist_ok=True)

    with open(join(args.output_dir, 'log', 'hps.json'), 'w') as writer:
        json.dump(vars(args), writer, indent=4)
    with open(args.model_config) as reader:
        model_config = json.load(reader)
    with open(join(args.output_dir, 'log', 'model.json'), 'w') as writer:
        json.dump(model_config, writer, indent=4)

    return  # no need to store git info; everything below is intentionally unreachable

    # git info
    try:
        LOGGER.info("Waiting on git info....")
        c = subprocess.run(["git", "rev-parse", "--abbrev-ref", "HEAD"],
                           timeout=10, stdout=subprocess.PIPE)
        git_branch_name = c.stdout.decode().strip()
        LOGGER.info("Git branch: %s", git_branch_name)
        c = subprocess.run(["git", "rev-parse", "HEAD"],
                           timeout=10, stdout=subprocess.PIPE)
        git_sha = c.stdout.decode().strip()
        LOGGER.info("Git SHA: %s", git_sha)
        git_dir = abspath(dirname(__file__))
        git_status = subprocess.check_output(
            ['git', 'status', '--short'],
            cwd=git_dir, universal_newlines=True).strip()
        with open(join(args.output_dir, 'log', 'git_info.json'), 'w') as writer:
            json.dump(
                {'branch': git_branch_name,
                 'is_dirty': bool(git_status),
                 'status': git_status,
                 'sha': git_sha},
                writer, indent=4)
    except subprocess.TimeoutExpired as e:
        LOGGER.exception(e)
        LOGGER.warning("Git info not found. Moving right along...")
def process_dataset(dataset_name, args):
    LOGGER.info('{:15} - Start'.format(dataset_name))

    LOGGER.info('{:15} - Retrieving trained embedding'.format(dataset_name))
    pre_trained_embedding = embeddings.get_embedding_model(
        args.pre_trained_embedding,
        binary=False,
        first_line_header=True,
        with_gensim=True)

    try:
        trained_embedding = dataset_helper.get_w2v_embedding_for_dataset(dataset_name)
    except FileNotFoundError as e:
        LOGGER.exception(e)
        return

    cmap_cache_files = dataset_helper.get_all_cached_graph_datasets(
        dataset_name=dataset_name, graph_type=constants.TYPE_CONCEPT_MAP)
    coo_cache_files = [
        x for x in dataset_helper.get_all_cached_graph_datasets(
            dataset_name=dataset_name, graph_type=constants.TYPE_COOCCURRENCE)
        if 'all' in x
    ]
    if not len(cmap_cache_files) or not len(coo_cache_files):
        return

    used_graphs = [cmap_cache_files[0], coo_cache_files[0]]

    LOGGER.info('{:15} - Retrieving dataset'.format(dataset_name))
    all_labels = set()
    for graph_cache_file in used_graphs:
        X, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        X = graph_helper.get_graphs_only(X)
        all_labels |= graph_helper.get_all_node_labels_uniq(X, as_sorted_list=False)

    LOGGER.info('{:15} - Resolving embeddings'.format(dataset_name))
    (embeddings_pre_trained, not_found_pre_trained_coreferenced, not_found_trained,
     not_found_pre_trained, lookup, similar_els) = \
        embeddings.get_embeddings_for_labels_with_lookup(
            all_labels, trained_embedding, pre_trained_embedding)

    LOGGER.info('{:15} - Missing'.format(dataset_name))
    for label, s in [('trained', not_found_trained),
                     ('pre_trained', not_found_pre_trained),
                     ('after_coreference', not_found_pre_trained_coreferenced)]:
        LOGGER.info('\t{:20} {:>6}'.format(label, len(s)))

    embedding_file = '{}/{}.w2v.txt'.format(args.embeddings_result_folder, dataset_name)
    embeddings.save_embedding_dict(embeddings_pre_trained, embedding_file)
    embeddings_pre_trained = embeddings.load_word2vec_format(
        fname=embedding_file, binary=False)

    LOGGER.info('{:15} - Co-reference resolution'.format(dataset_name))
    max_topn = max(args.topn)
    similar_labels = coreference.get_most_similar_labels(
        all_labels, embeddings_pre_trained, max_topn)

    for topn in args.topn:
        for threshold in args.merge_threshold:
            LOGGER.info('{:15} - Co-reference resolution: topn: {}, threshold: {}'
                        .format(dataset_name, topn, threshold))
            clique_lookup = coreference.create_label_cliques_by_similarity(
                similar_labels, threshold=threshold, topn=topn)
            new_lookup = embeddings.merge_lookups(clique_lookup, lookup)
            with open('{}/{}.threshold-{}.topn-{}.label-lookup.npy'.format(
                    args.embeddings_result_folder, dataset_name, threshold, topn),
                    'wb') as f:
                pickle.dump(new_lookup, f)

    LOGGER.info('{:15} - Finished'.format(dataset_name))
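# Sketch of the save-then-reload round trip above, assuming the saved file is
# in word2vec text format and gensim is available; the filename and query word
# are placeholders, and most_similar() is the standard gensim call for the
# nearest-label lookup that feeds the co-reference step.
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('embeddings.w2v.txt', binary=False)
print(kv.most_similar('network', topn=5))  # nearest labels by cosine similarity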
def run_classification_task(task: ExperimentTask, cfo: ClassificationOptions,
                            experiment_config: dict):
    helper.set_random_seed()
    args = cfo

    result_filename_tmpl = filename_utils.get_result_filename_for_task(
        task, experiment_config=experiment_config, cfo=cfo)

    result_file = '{}/{}'.format(cfo.results_folder, result_filename_tmpl)
    predictions_file = '{}/{}'.format(cfo.predictions_folder, result_filename_tmpl)
    classifier_file = '{}/{}'.format(cfo.classifier_folder, result_filename_tmpl)

    if not cfo.force and os.path.exists(result_file):
        return

    time_checkpoints = {}

    def add_time_checkpoint(name):
        time_checkpoints[name] = time()

    add_time_checkpoint('start')
    X, Y, estimator, param_grid = task.fn()
    add_time_checkpoint('retrieved_data')

    # A good heuristic for whether X is a gram matrix: both dimensions are the same
    is_precomputed = isinstance(X, np.ndarray) and X.shape[0] == X.shape[1]
    # This is also a heuristic
    is_dummy = 'classifier__strategy' in param_grid

    # Add classifiers, instantiate transformer classes and merge with experiment config
    param_grid = experiment_helper.prepare_param_grid(task, param_grid, experiment_config)

    LOGGER.info('ParamGrid: {}\n\n'.format(
        pipeline_helper.remove_complex_types(param_grid)))

    X_train, Y_train, X_test, Y_test, train_i, test_i = (
        X, Y, [], [], range(len(X)), [])

    if not is_dummy:  # and cfo.create_predictions:
        # Hold out validation set for predictions
        try:
            X_train, X_test, Y_train, Y_test, train_i, test_i = train_test_split(
                X, Y,
                test_size=cfo.prediction_test_size,
                is_precomputed=is_precomputed)
        except Exception as e:
            LOGGER.warning('Could not split dataset for predictions')
            LOGGER.exception(e)

    def get_cv(splits):
        if splits == -1:
            _, _, _, _, X_train_i, X_test_i = train_test_split(
                X_train, Y_train, test_size=0.33, is_precomputed=is_precomputed)
            cv = [(X_train_i, X_test_i)]
        else:
            # Use the requested number of splits (the original always read
            # cfo.n_splits here, which silently ignored n_splits_nested).
            cv = sklearn.model_selection.StratifiedKFold(
                n_splits=splits, shuffle=True, random_state=constants.RANDOM_SEED)
        return cv

    add_time_checkpoint('split_data')
    cv = get_cv(cfo.n_splits)

    should_refit = np.all([
        # not cfo.use_nested_cross_validation,
        not is_dummy,
        # cfo.create_predictions or cfo.save_best_clf
    ])

    gscv = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring=cfo.scoring,
        n_jobs=cfo.n_jobs,
        verbose=cfo.verbose,
        refit=cfo.refit if should_refit else False)

    if cfo.use_nested_cross_validation and not is_dummy:
        cv_nested = get_cv(cfo.n_splits_nested)
        LOGGER.info('Using nested cross-validation')
        scores = sklearn.model_selection.cross_validate(
            gscv, X, Y,
            scoring=cfo.scoring,
            cv=cv_nested,
            n_jobs=cfo.n_jobs_outer,
            verbose=cfo.verbose,
            return_train_score=True)
        result = dict(scores, **param_grid)
        add_time_checkpoint('fitted_nested')
        results_helper.save_results(result, result_file, args,
                                    time_checkpoints=time_checkpoints)
        return

    gscv_result = gscv.fit(X_train, Y_train)
    add_time_checkpoint('fitted_gridsearch')

    if not is_dummy and cfo.create_predictions:
        if not len(X_test):
            LOGGER.warning('Validation set for prediction has no items')
        else:
            try:
                # Use the refitted best classifier to predict on the held-out set
                Y_test_pred = gscv_result.best_estimator_.predict(X_test)
                add_time_checkpoint('predicted')
                results_helper.save_results(
                    {
                        'gscv_result': remove_coefs_from_results(gscv_result.cv_results_),
                        'all_params': remove_coefs_from_results(param_grid),
                        'best_params': remove_coefs_from_results(gscv_result.best_params_),
                        'Y_real': Y_test,
                        'Y_pred': Y_test_pred,
                        'X_test': X_test,
                    },
                    predictions_file, args, time_checkpoints=time_checkpoints)
            except Exception as e:
                LOGGER.warning('Error while trying to retrain best classifier')
                LOGGER.exception(e)

    if cfo.save_best_clf:
        best_estimator = gscv_result.best_estimator_
        try:
            results_helper.save_results(
                {
                    'params': gscv_result.best_params_,
                    'classifier': best_estimator,
                },
                classifier_file, args, time_checkpoints=time_checkpoints)
        except Exception as e:
            LOGGER.warning('Error while saving best estimator: {}'.format(e))
            LOGGER.exception(e)

    add_time_checkpoint('finished')
    results_helper.save_results(gscv_result.cv_results_, result_file, args,
                                time_checkpoints=time_checkpoints)
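# Minimal sketch of the precomputed-gram-matrix path that the square-matrix
# heuristic above enables, using a stock sklearn SVC rather than this
# project's estimator pipeline (an assumption, not the codebase's actual
# setup). GridSearchCV slices the rows and columns of K per fold because
# kernel='precomputed' marks the estimator as pairwise.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

rng = np.random.RandomState(0)
features = rng.rand(40, 5)
K = features @ features.T           # square n x n gram matrix -> "precomputed"
y = rng.randint(0, 2, size=40)

gscv = GridSearchCV(SVC(kernel='precomputed'), {'C': [0.1, 1, 10]}, cv=3)
gscv.fit(K, y)
print(gscv.best_params_)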
def process_graph_cache_file(graph_cache_file, args):
    graph_cache_filename = graph_cache_file.split('/')[-1].rsplit('.')[0]
    dataset = filename_utils.get_dataset_from_filename(graph_cache_file)

    # Check the full path for '.phi.': the extension-stripped filename can
    # never contain a dot, so the original test on it was always False.
    if '.phi.' in graph_cache_file or not filter_utils.file_should_be_processed(
            graph_cache_filename, args.include_filter, args.exclude_filter,
            args.limit_dataset):
        return

    LOGGER.info('{:15} starting ({})'.format(dataset, graph_cache_filename))

    fast_wl_trans = FastWLGraphKernelTransformer(
        h=args.wl_h, use_early_stopping=False, truncate_to_highest_label=False)

    try:
        phi_graph_cache_file = graph_cache_file.replace('.npy', '.phi.npy')

        X_graphs, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        X_graphs = graph_helper.get_graphs_only(X_graphs)

        # Kernel: WL
        if args.use_wl:
            used_phi_graph_cache_file = phi_graph_cache_file
            splitted_phi_graph_cache_file = phi_graph_cache_file.replace(
                '.phi', '.splitted.phi')
            phi_same_label_graph_cache_file = phi_graph_cache_file.replace(
                dataset, '{}_same-label'.format(dataset)).replace(
                    '.phi', '.splitted.phi')

            # Stop here if all files have already been created
            if not args.force and np.all([
                    os.path.exists(x) for x in [
                        splitted_phi_graph_cache_file, used_phi_graph_cache_file,
                        phi_same_label_graph_cache_file
                    ]
            ]):
                return

            X_, Y_ = np.array(np.copy(X_graphs)), np.array(np.copy(Y))
            if args.wl_sort_classes:
                X_, Y_ = sort(X_, Y_, by=Y_)

            num_vertices = len(graph_helper.get_all_node_labels(X_))
            fast_wl_trans.set_params(phi_dim=num_vertices)

            X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
                np.copy(X_), np.copy(Y_), stratify=Y_, test_size=args.wl_test_size)
            X_train, Y_train = sort(X_train, Y_train, by=Y_train)
            X_test, Y_test = sort(X_test, Y_test, by=Y_test)

            # Splitted version
            if args.force or not os.path.exists(splitted_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans).set_params(same_label=True)
                phi_train = t.fit_transform(np.copy(X_train))
                phi_test = t.transform(np.copy(X_test))
                with open(splitted_phi_graph_cache_file, 'wb') as f:
                    pickle.dump(
                        (phi_train, phi_test, X_train, X_test, Y_train, Y_test), f)

            # Splitted, same label
            if args.force or not os.path.exists(phi_same_label_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                phi_train = t.fit_transform(X_train)
                phi_test = t.transform(X_test)
                with open(phi_same_label_graph_cache_file, 'wb') as f:
                    pickle.dump(
                        (phi_train, phi_test, X_train, X_test, Y_train, Y_test), f)

            # Whole dataset
            if args.force or not os.path.exists(used_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                with open(used_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((t.fit_transform(X_), Y_), f)

        # Kernel: spgk
        if args.use_spgk:
            for depth in args.spgk_depth:
                spgk_graph_cache_file = graph_cache_file.replace(
                    '.npy', '.spgk-{}.gram.npy'.format(depth))
                if args.force or not os.path.exists(spgk_graph_cache_file):
                    K = spgk.transform(X_graphs, depth=depth)
                    with open(spgk_graph_cache_file, 'wb') as f:
                        pickle.dump((K, Y), f)
    except Exception as e:
        LOGGER.exception(e)

    LOGGER.info('{:15} finished ({})'.format(dataset, graph_cache_filename))
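# Sketch of the clone-then-fit/transform pattern used above, with a stock
# sklearn transformer standing in for FastWLGraphKernelTransformer: clone()
# yields a fresh unfitted copy per variant, fit_transform fixes the feature
# space on the training split, and transform reuses it on the test split, so
# no fitted state leaks between the cached files.
import sklearn.base
from sklearn.feature_extraction.text import CountVectorizer

base = CountVectorizer()
t = sklearn.base.clone(base)                  # fresh, unfitted copy
phi_train = t.fit_transform(['a b', 'b c'])   # learns the vocabulary
phi_test = t.transform(['c a'])               # reuses it; no refitting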
def start_tasks(args, all_tasks: typing.List[ExperimentTask],
                classification_options: ClassificationOptions,
                experiment_config: dict):
    filtered_task_types = (experiment_config['params_per_type'].keys()
                           if experiment_config else None)

    if experiment_config.get('limit_dataset', None) is not None:
        limit_dataset = experiment_config['limit_dataset']
    else:
        limit_dataset = args.limit_dataset

    limit_graph_type = experiment_config.get('limit_graph_type', None)

    def should_process_task(task: ExperimentTask):
        # Dataset filter
        is_filtered_by_dataset = (
            limit_dataset and
            filename_utils.get_dataset_from_filename(task.name) not in limit_dataset)

        # Task type filters
        is_filtered_by_include_filter = (
            args.task_type_include_filter and
            task.type not in args.task_type_include_filter)
        is_filtered_by_exclude_filter = (
            args.task_type_exclude_filter and
            task.type in args.task_type_exclude_filter)
        is_filtered_by_name_filter = (
            args.task_name_filter and args.task_name_filter not in task.name)
        is_filtered_by_param_options = (
            filtered_task_types and task.type not in filtered_task_types)
        is_filtered_by_graph_type = (
            limit_graph_type and
            graph_helper.get_graph_type_from_filename(task.name)
            not in [None] + limit_graph_type)

        # Do not process tasks that have already been calculated (unless args.force == True)
        created_files = [
            '{}/{}'.format(
                args.results_folder,
                filename_utils.get_result_filename_for_task(
                    task, experiment_config, cfo=classification_options))
        ]
        is_filtered_by_file_exists = (
            not args.force and
            np.any([os.path.exists(file) for file in created_files]))

        should_process = not np.any([
            is_filtered_by_graph_type,
            is_filtered_by_dataset,
            is_filtered_by_include_filter,
            is_filtered_by_name_filter,
            is_filtered_by_file_exists,
            is_filtered_by_exclude_filter,
            is_filtered_by_param_options,
        ])
        return should_process

    def print_tasks(tasks: typing.List[ExperimentTask]):
        for task in tasks:
            print('\t{t.type:26} {dataset:18} {t.name}'.format(
                t=task, dataset=filename_utils.get_dataset_from_filename(task.name)))
        print('\n')

    # Filter out tasks
    tasks = sorted(
        [task for task in all_tasks if should_process_task(task)],
        key=lambda x: filename_utils.get_dataset_from_filename(x.name))

    if args.dry_run:
        print('All tasks:')
        print_tasks(all_tasks)
        print('Filtered tasks:')
        print_tasks(tasks)

        print('# tasks per type (filtered/unfiltered)')
        task_type_counter_unfiltered = collections.Counter([t.type for t in all_tasks])
        task_type_counter_filtered = collections.Counter([t.type for t in tasks])
        for task_type, unfiltered_count in task_type_counter_unfiltered.items():
            print('\t{:25} {:2}/{:2}'.format(
                task_type,
                task_type_counter_filtered.get(task_type, 0),
                unfiltered_count))
        print('\n')

        print('Only doing a dry-run. Exiting.')
        return

    num_tasks = len(tasks)
    for task_idx, t in enumerate(tasks):
        def print_task(msg=''):
            LOGGER.info('Task {idx:>2}/{num_tasks}: {t.type:30} - {t.name:40} - {msg}'
                        .format(idx=task_idx + 1, num_tasks=num_tasks, t=t, msg=msg))

        start_time = time()
        print_task('Started')
        try:
            task_runner.run_classification_task(t, classification_options,
                                                experiment_config)
            gc.collect()
        except Exception as e:
            print_task('Error: {}'.format(e))
            LOGGER.exception(e)

        elapsed_seconds = time() - start_time
        print_task('Finished (time={})'.format(
            time_utils.seconds_to_human_readable(elapsed_seconds)))
        gc.collect()

    LOGGER.info('Finished!')