def draw_learning_curve(solution_file, prediction_files,
                        scoring_function, output_dir, basename, start):
    """Draw learning curve for one task."""
    solution = read_array(solution_file)  # numpy array
    scores = []
    timestamps = []
    for prediction_file in prediction_files:
        timestamp = os.path.getmtime(prediction_file)
        prediction = read_array(prediction_file)  # numpy array
        if solution.shape != prediction.shape:
            raise ValueError("Bad prediction shape {}".format(prediction.shape))
        score = scoring_function(solution, prediction)
        scores.append(score)
        timestamps.append(timestamp)
    # Sort the two lists according to timestamps
    sorted_pairs = sorted(zip(timestamps, scores))
    start = sorted_pairs[0][0]  # note: overrides the `start` argument with the first timestamp
    X = [t - start + 1 for t, _ in sorted_pairs]  # X is on a log scale, so the first x is set to 1
    Y = [s for _, s in sorted_pairs]
    # Add the origin as the first point of the curve
    X.insert(0, 1)
    Y.insert(0, 0)
    # Truncate X using X_max
    X_max = TIME_BUDGET
    Y_max = 1
    log_X = [np.log(x + 1) / np.log(X_max + 1) for x in X if x <= X_max]  # log_X in [0, 1]
    log_X_max = 1
    X = X[:len(log_X)]
    Y = Y[:len(log_X)]
    # Draw the learning curve
    plt.clf()
    fig, ax = plt.subplots(figsize=(7, 7.07))
    ax.plot(X, Y, marker="o", label="Test score", markersize=3)
    # Add a point on the final line using the last prediction
    X.append(TIME_BUDGET)
    Y.append(Y[-1])
    log_X.append(1)
    if len(log_X) >= 2:
        alc = area_under_learning_curve(log_X, Y)
    else:
        alc = 0
    ax.fill_between(X, Y, color='cyan')
    ax.plot(X[-2:], Y[-2:], '--')  # draw a dashed line out to the last prediction
    plt.title("Task: " + basename +
              " - Current normalized ALC: " + format(alc, '.4f'))
    plt.xlabel('time/second (log scale)')
    plt.xlim(left=1, right=X_max)
    plt.xscale('log')
    plt.ylabel('score (2*BAC - 1)')
    plt.ylim(bottom=-0.01, top=1)
    ax.grid(True, zorder=5)
    plt.legend()
    fig_name = get_fig_name(basename)
    path_to_fig = os.path.join(output_dir, fig_name)
    plt.savefig(path_to_fig)
    return alc
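# `area_under_learning_curve` is called above but not defined in this file.
# A minimal sketch of what it is assumed to compute: trapezoidal integration
# of the scores over the normalized log-time axis (log_X in [0, 1]); the real
# helper may differ.
import numpy as np

def area_under_learning_curve(log_X, Y):
    """Hypothetical sketch: area under the curve (log_X, Y), trapezoidal rule."""
    return np.trapz(Y, log_X)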
def compute_error_bars(self, n=10):
    """Compute error bars on the evaluation with bootstrap.

    Args:
      n: number of times to compute the score (more means more precision)
    Returns:
      (mean, std, var) of the bootstrapped scores
    """
    try:
        scoring_function = self.scoring_functions['nauc']
        solution = self.solution
        last_prediction = read_array(self.prediction_files_so_far[-1])
        assert len(solution) == len(last_prediction)
        l = len(solution)
        scores = []
        for _ in range(n):  # number of scorings
            new_solution = []
            new_predictions = []
            for _ in range(l):  # bootstrap resampling with replacement
                i = randrange(l)
                new_solution.append(solution[i])
                new_predictions.append(last_prediction[i])
            scores.append(scoring_function(np.array(new_solution),
                                           np.array(new_predictions)))
        mean = np.mean(scores)
        std = np.std(scores)
        var = np.var(scores)
        return mean, std, var
    except Exception:  # not able to compute error bars
        return -1, -1, -1
def compute_alc_error_bars(self, n=10):
    """Return mean, std and variance of the ALC score over n runs.

    n learning curves are built: for each timestamp, the score is computed
    on a bootstrap resample of y_true and y_pred. Within one curve, the same
    bootstrap indices are kept across all prediction timestamps.

    Args:
      n: number of times to compute the score (more means more precision)
    Returns:
      (mean, std, var)
    """
    try:
        scoring_function = self.scoring_functions['nauc']
        solution = self.solution
        alc_scores = []
        for _ in range(n):  # n learning curves to compute
            scores = []
            size = solution.shape[0]
            idx = np.random.randint(0, size, size)  # bootstrap indices
            for prediction_file in self.prediction_files_so_far:
                prediction = read_array(prediction_file)
                scores.append(scoring_function(solution[idx], prediction[idx]))
            # Create a new learning curve from the resampled scores
            learning_curve = LearningCurve(timestamps=self.relative_timestamps,
                                           scores=scores,  # list of AUC scores
                                           time_budget=self.time_budget)
            alc_scores.append(learning_curve.get_alc())
        return np.mean(alc_scores), np.std(alc_scores), np.var(alc_scores)
    except Exception:  # not able to compute error bars
        return -1, -1, -1
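# `LearningCurve` is referenced above without its definition. A minimal
# sketch, assuming a step curve over a time axis transformed to [0, 1]
# (t0 = 60 is an assumed reference time; the real class may differ):
import numpy as np

class LearningCurve:
    def __init__(self, timestamps, scores, time_budget):
        self.timestamps = timestamps    # relative timestamps in seconds
        self.scores = scores            # one score per prediction
        self.time_budget = time_budget  # total time budget in seconds

    def _transform(self, t, t0=60.0):
        # Map time in [0, time_budget] to [0, 1] on a log scale
        return np.log(1 + t / t0) / np.log(1 + self.time_budget / t0)

    def get_alc(self):
        # Step curve: each score holds until the next prediction arrives,
        # and the last score holds until the time budget is exhausted
        X = [self._transform(t) for t in self.timestamps] + [1.0]
        return sum(s * (X[i + 1] - X[i]) for i, s in enumerate(self.scores))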
def get_solution(solution_dir):
    """Get the solution array from the solution directory."""
    solution_names = sorted(ls(os.path.join(solution_dir, '*.solution')))
    if len(solution_names) != 1:  # expect exactly one solution file
        logger.warning("{} solution files found: {}! "
                       .format(len(solution_names), solution_names) +
                       "Return `None` as solution.")
        return None
    solution_file = solution_names[0]
    solution = read_array(solution_file)
    return solution
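# The helpers `ls` and `read_array` are used throughout but not defined here.
# Minimal sketches (the names match the usage above; the implementations are
# assumptions):
import glob
import numpy as np

def ls(filename_pattern):
    # Shell-style expansion of a glob pattern
    return sorted(glob.glob(filename_pattern))

def read_array(filename):
    # Load a whitespace-separated numeric file into a numpy array
    return np.loadtxt(filename)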
def draw_learning_curve(solution_dir, prediction_files, scoring_function,
                        output_dir, basename, start,
                        is_multiclass_task, time_budget):
    """Draw learning curve for one task."""
    solution = get_solution(solution_dir)  # numpy array
    scores = []
    roc_auc_scores = []
    _, timestamps = get_timestamps(prediction_dir)  # `prediction_dir` comes from enclosing scope
    if is_multiclass_task:
        accuracy_scores = []
    for prediction_file in prediction_files:
        prediction = read_array(prediction_file)  # numpy array
        if solution.shape != prediction.shape:
            raise ValueError("Bad prediction shape: {}. ".format(prediction.shape) +
                             "Expected shape: {}".format(solution.shape))
        scores.append(scoring_function(solution, prediction))
        try:
            # ROC AUC is not defined when only one class is present in y_true
            roc_auc_scores.append(roc_auc_score(solution, prediction))
        except ValueError:
            roc_auc_scores.append(-1)
        if is_multiclass_task:
            accuracy_scores.append(accuracy(solution, prediction))
    # Sort the lists according to timestamps
    sorted_pairs = sorted(zip(timestamps, scores))
    roc_auc_sorted_pairs = sorted(zip(timestamps, roc_auc_scores))
    time_used = -1
    if len(timestamps) > 0:
        time_used = sorted_pairs[-1][0] - start
        latest_score = sorted_pairs[-1][1]
        latest_roc_auc = roc_auc_sorted_pairs[-1][1]
        logger.info("(2 * AUC - 1) of the latest prediction is {:.4f}."
                    .format(latest_score))
        if latest_roc_auc != -1:
            logger.info("ROC AUC of the latest prediction is {:.4f}."
                        .format(latest_roc_auc))
        if is_multiclass_task:
            sorted_pairs_acc = sorted(zip(timestamps, accuracy_scores))
            latest_acc = sorted_pairs_acc[-1][1]
            logger.info("Accuracy of the latest prediction is {:.4f}."
                        .format(latest_acc))
    X = [t for t, _ in sorted_pairs]
    Y = [s for _, s in sorted_pairs]
    alc, ax = plot_learning_curve(X, Y, start_time=start,
                                  time_budget=time_budget,
                                  task_name=basename)
    fig_name = get_fig_name(basename)
    path_to_fig = os.path.join(output_dir, fig_name)
    plt.savefig(path_to_fig)
    plt.close()
    return alc, time_used
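# `plot_learning_curve` is shared by the variants above but its body is not
# shown. A minimal sketch under the same conventions as the first
# `draw_learning_curve` (shift times so the first x >= 1, extend the last
# score to the budget, ALC over the normalized log-time axis); the real
# helper may differ:
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(X, Y, start_time=0, time_budget=7200, task_name=None):
    rel_X = [x - start_time + 1 for x in X]  # log scale: first x >= 1
    Ys = list(Y)
    rel_X.append(time_budget)                # extend the curve to the budget
    Ys.append(Ys[-1] if Ys else 0)
    log_X = [np.log(x + 1) / np.log(time_budget + 1) for x in rel_X]
    alc = np.trapz(Ys, log_X) if len(log_X) >= 2 else 0
    fig, ax = plt.subplots(figsize=(7, 7.07))
    ax.plot(rel_X, Ys, marker='o', markersize=3)
    ax.set_xscale('log')
    ax.set_xlabel('time/second (log scale)')
    ax.set_ylabel('score')
    if task_name:
        ax.set_title("Task: {} - ALC: {:.4f}".format(task_name, alc))
    return alc, ax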
def compute_error_bars(self, n=10):
    """Compute error bars on the evaluation with bootstrap.

    Args:
      n: number of times to compute the score (more means more precision)
    Returns:
      (mean, std, var)
    """
    try:
        scoring_function = self.scoring_functions['nauc']
        solution = self.solution
        last_prediction = read_array(self.prediction_files_so_far[-1])
        scores = compute_scores_bootstrap(scoring_function, solution,
                                          last_prediction, n=n)
        return np.mean(scores), np.std(scores), np.var(scores)
    except Exception:  # not able to compute error bars
        return -1, -1, -1
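# `compute_scores_bootstrap` factors out the resampling loop from the earlier
# version of this method. A sketch of the assumed helper, sampling indices
# with replacement while keeping solution/prediction rows paired:
import numpy as np

def compute_scores_bootstrap(scoring_function, solution, prediction, n=10):
    scores = []
    size = len(solution)
    for _ in range(n):
        idx = np.random.randint(0, size, size)  # bootstrap indices
        scores.append(scoring_function(solution[idx], prediction[idx]))
    return scores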
def draw_learning_curve(solution_dir, prediction_files, scoring_function,
                        output_dir, basename, start,
                        is_multiclass_task, time_budget):
    """Draw learning curve for one task."""
    solution = get_solution(solution_dir)  # numpy array
    scores = []
    roc_auc_scores = []
    _, timestamps = get_timestamps(prediction_dir)  # `prediction_dir` comes from enclosing scope
    for prediction_file in prediction_files:
        prediction = read_array(prediction_file)  # numpy array
        # if solution.shape != prediction.shape:
        #     raise ValueError("Bad prediction shape: {}. ".format(prediction.shape) +
        #                      "Expected shape: {}".format(solution.shape))
        scores.append(scoring_function(solution, prediction))
    # Sort the lists according to timestamps
    sorted_pairs = sorted(zip(timestamps, scores))
    time_used = -1
    if len(timestamps) > 0:
        time_used = sorted_pairs[-1][0] - start
        latest_score = sorted_pairs[-1][1]
        logger.info("Balanced accuracy of the latest prediction is {:.4f}."
                    .format(latest_score))
    # if is_multiclass_task:
    #     sorted_pairs_acc = sorted(zip(timestamps, accuracy_scores))
    #     latest_acc = sorted_pairs_acc[-1][1]
    #     logger.info("Accuracy of the latest prediction is {:.4f}."
    #                 .format(latest_acc))
    X = [t for t, _ in sorted_pairs]
    Y = [s for _, s in sorted_pairs]
    alc, ax = plot_learning_curve(X, Y, start_time=start,
                                  time_budget=time_budget,
                                  task_name=basename)
    fig_name = get_fig_name(basename)
    path_to_fig = os.path.join(output_dir, fig_name)
    plt.savefig(path_to_fig)
    plt.close()
    return alc, time_used
def compute_score_per_prediction(self):
    """For each new prediction found, compute its score using `self.solution`
    and the scoring functions in `self.scoring_functions`. Then concatenate
    the list of new predictions to the list of predictions resolved so far.
    """
    for score_name in self.scoring_functions:
        scoring_function = self.scoring_functions[score_name]
        if score_name != 'accuracy' or self.is_multiclass_task:
            new_scores = [scoring_function(self.solution, read_array(pred))
                          for pred in self.new_prediction_files]
            if score_name in self.scores_so_far:
                self.scores_so_far[score_name] += new_scores
            else:
                self.scores_so_far[score_name] = new_scores
    # If new predictions are found, update state variables
    if self.new_prediction_files:
        self.prediction_files_so_far += self.new_prediction_files
        num_preds = len(self.prediction_files_so_far)
        self.relative_timestamps = self.get_relative_timestamps()[:num_preds]
        self.learning_curve = self.get_learning_curve()
        self.new_prediction_files = []
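# A sketch of the `get_learning_curve` method used above, assuming it builds
# a `LearningCurve` (see the earlier sketch) from the 'nauc' scores tracked
# in `self.scores_so_far`; the actual constructor arguments are assumptions:
def get_learning_curve(self, score_name='nauc'):
    return LearningCurve(timestamps=self.relative_timestamps,
                         scores=self.scores_so_far.get(score_name, []),
                         time_budget=self.time_budget)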
while time.time() < ingestion_start + time_budget:
    if not ingestion_is_alive(prediction_dir):
        logger.info("Detected that the ingestion program has stopped "
                    "running because an 'end.txt' file was written by "
                    "ingestion. Stop scoring now.")
        break
    time.sleep(1)
    # Get the list of prediction files
    prediction_files = get_prediction_files(prediction_dir)
    num_preds_new = len(prediction_files)
    if num_preds_new > num_preds:
        new_prediction_files = get_new_prediction_files(
            prediction_dir, prediction_files_so_far)
        new_scores = [scoring_function(solution, read_array(pred))
                      for pred in new_prediction_files]
        prediction_files_so_far += new_prediction_files
        logger.info("[+] New prediction found. Now number of predictions "
                    "made = {}".format(num_preds_new))
        scores_so_far += new_scores
        score = update_score_and_learning_curve(
            prediction_dir, basename, ingestion_start, solution_dir,
            scoring_function, score_dir, is_multiclass_task)
        num_preds = num_preds_new
        logger.info("Current area under learning curve for {}: {:.4f}"
                    .format(basename, score))
else:
    # When the time budget is used up, kill ingestion
scores_so_far = []
num_preds = 0
while time.time() < ingestion_start + time_budget:
    if not ingestion_is_alive(prediction_dir):
        logger.info("Detected that the ingestion program has stopped "
                    "running because an 'end.txt' file was written by "
                    "ingestion. Stop scoring now.")
        break
    time.sleep(1)
    # Get the list of prediction files
    prediction_files = get_prediction_files(prediction_dir)
    num_preds_new = len(prediction_files)
    if num_preds_new > num_preds:
        new_prediction_files = get_new_prediction_files(prediction_dir,
                                                        prediction_files_so_far)
        new_scores = [scoring_function(solution, read_array(pred))
                      for pred in new_prediction_files]
        prediction_files_so_far += new_prediction_files
        logger.info("[+] New prediction found. Now number of predictions "
                    "made = {}".format(num_preds_new))
        scores_so_far += new_scores
        score = update_score_and_learning_curve(prediction_dir, basename,
                                                ingestion_start, solution_dir,
                                                scoring_function, score_dir,
                                                is_multiclass_task)
        num_preds = num_preds_new
        logger.info("Current area under learning curve for {}: {:.4f}"
                    .format(basename, score))
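# Both loops above poll `ingestion_is_alive`. Judging from the log message,
# ingestion is considered finished once it writes 'end.txt' into the
# prediction directory; a minimal sketch under that assumption:
import os

def ingestion_is_alive(prediction_dir):
    return not os.path.isfile(os.path.join(prediction_dir, 'end.txt'))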
# Is the first post file available?
firstpost_name = os.path.join(pred_dir, '%s.firstpost' % dataset_name)
if os.path.isfile(firstpost_name):
    # Record the creation time of the firstpost file
    start_time = os.path.getmtime(firstpost_name)

    # Make the directory if necessary
    util.mkdir(os.path.join(score_dir, dataset_name, algo_name))

    # Load info about this dataset
    info_file = os.path.join(data_dir, dataset_name,
                             '%s_public.info' % dataset_name)
    info = libscores.get_info(info_file)
    # FIXME HACK
    info['metric'] = 'auc_metric'
    # END FIXME HACK

    # Load the solution for this dataset
    solution_file = os.path.join(data_dir, dataset_name,
                                 '%s_%s.solution' % (dataset_name, TEST))
    solution = libscores.read_array(solution_file)

    # For each set of predictions
    prediction_files = util.ls(os.path.join(
        pred_dir, '%s_%s_*.predict' % (dataset_name, TEST)))
    for prediction_file in prediction_files:
        # Time of file creation since algorithm start
        file_time = os.path.getmtime(prediction_file) - start_time

        # Open the predictions
        prediction = libscores.read_array(prediction_file)

        # Check that the predictions match the shape of the solution
        if solution.shape != prediction.shape:
            raise ValueError("Mismatched prediction shape {} vs. {}"
                             .format(prediction.shape, solution.shape))

        # Score
        if info['metric'] in ('r2_metric', 'a_metric'):
            # Remove NaN and Inf for regression
            solution = libscores.sanitize_array(solution)
            prediction = libscores.sanitize_array(prediction)
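# `libscores.sanitize_array` is used above to keep regression metrics finite.
# A sketch of the assumed behavior (replace NaN with a mid-range value and
# clip +/-Inf to the observed finite range; the real implementation may
# differ):
import numpy as np

def sanitize_array(array):
    array = np.asarray(array, dtype=float)
    finite = array[np.isfinite(array)]
    lo = finite.min() if finite.size else 0.0
    hi = finite.max() if finite.size else 0.0
    return np.nan_to_num(array, nan=(lo + hi) / 2.0, posinf=hi, neginf=lo)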
score_name = 'set%s_score' % set_num
# Extract the dataset name from the file name
basename = solution_file[-solution_file[::-1].index(filesep):
                         -solution_file[::-1].index('.') - 1]
try:
    # Get the last prediction from the res subdirectory (must end with '.predict')
    predict_files = ls(os.path.join(input_dir, 'res', basename + '*.predict'))
    if not predict_files:
        raise IOError('Missing prediction file {}'.format(basename))
    predict_file = predict_files[-1]
    predict_name = predict_file[-predict_file[::-1].index(filesep):
                                -predict_file[::-1].index('.') - 1]
    # Read the solution and prediction values into numpy arrays
    solution = read_array(solution_file)
    prediction = read_array(predict_file)
    if solution.shape != prediction.shape:
        raise ValueError("Bad prediction shape {}".format(prediction.shape))
    try:
        # Compute the score prescribed by the metric file
        score = scoring_function(solution, prediction)
        print("======= Set %d" % set_num + " (" + predict_name.capitalize() +
              "): score(" + score_name + ")=%0.12f =======" % score)
        html_file.write("======= Set %d" % set_num + " (" +
                        predict_name.capitalize() + "): score(" +
                        score_name + ")=%0.12f =======\n" % score)
    except Exception:
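# The reversed-string slicing above is hard to read. For paths with a single
# trailing extension it is equivalent to the standard-library form below;
# `extract_basename` is a hypothetical helper name, not part of this code base:
import os

def extract_basename(path):
    # e.g. '/res/mini.predict' -> 'mini'
    return os.path.splitext(os.path.basename(path))[0]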
# Is the first post file available?
firstpost_name = os.path.join(pred_dir, '%s.firstpost' % dataset_name)
if os.path.isfile(firstpost_name):
    # Record the creation time of the firstpost file
    start_time = os.path.getmtime(firstpost_name)

    # Make the directory if necessary
    util.mkdir(os.path.join(score_dir, dataset_name, algo_name))

    # Load info about this dataset
    info_file = os.path.join(data_dir, dataset_name,
                             '%s_public.info' % dataset_name)
    info = libscores.get_info(info_file)

    # Load the solution for this dataset
    solution_file = os.path.join(data_dir, dataset_name,
                                 '%s_%s.solution' % (dataset_name, TEST))
    solution = libscores.read_array(solution_file)

    # For each set of predictions
    prediction_files = util.ls(os.path.join(
        pred_dir, '%s_%s_*.predict' % (dataset_name, TEST)))
    for prediction_file in prediction_files:
        # Time of file creation since algorithm start
        file_time = os.path.getmtime(prediction_file) - start_time

        # Open the predictions
        prediction = libscores.read_array(prediction_file)

        # Check that the predictions match the shape of the solution
        if solution.shape != prediction.shape:
            raise ValueError("Mismatched prediction shape {} vs. {}"
                             .format(prediction.shape, solution.shape))

        # Score
# especially when Docker image time is not synced with host time
start = os.path.getmtime(detailed_results_filepath)
start_str = time.ctime(start)
print_log("Start scoring program at " + start_str)

# Get the metric
scoring_function = autodl_bac
metric_name = "Area under Learning Curve"

# Get all the solution files from the solution directory
solution_names = sorted(ls(os.path.join(solution_dir, '*.solution')))
if len(solution_names) != 1:  # expect exactly one solution file
    raise ValueError("{} solution files found: {}!"
                     .format(len(solution_names), solution_names))
solution_file = solution_names[0]
solution = read_array(solution_file)
is_multiclass_task = is_multiclass(solution)

# Extract the dataset name from the file name
basename = get_basename(solution_file)
nb_preds = {x: 0 for x in solution_names}
scores = {x: 0 for x in solution_names}

# Use the 'duration.txt' file to detect if the ingestion program exits early
duration_filepath = os.path.join(prediction_dir, 'duration.txt')

# Begin the scoring process, along with the ingestion program.
# Monitor training processes while the time budget is not exhausted.
known_prediction_files = {}
while time.time() < start + TIME_BUDGET:
    time.sleep(0.5)
    # Get the list of prediction files
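# `is_multiclass` is called above on the solution array. A sketch of the
# assumed check: a task is treated as multiclass when every row of the
# one-hot solution matrix has exactly one positive entry; the real helper
# may differ:
import numpy as np

def is_multiclass(solution):
    return bool(np.all(solution.sum(axis=1) == 1))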
def _main(args):
    scoring_start = time.time()
    logger.debug("Parsed args are: " + str(args))
    logger.debug("-" * 50)
    solution_dir = args.solution_dir
    prediction_dir = args.prediction_dir
    score_dir = args.score_dir
    wrong_class_log_dir = args.wrong_class_log_dir

    # Create the output directories if they do not already exist
    if not os.path.isdir(score_dir):
        os.mkdir(score_dir)
    if os.path.exists(wrong_class_log_dir):
        shutil.rmtree(wrong_class_log_dir)
    os.mkdir(wrong_class_log_dir)

    # Write the initial score as `missing_score`
    write_score(score_dir, missing_score, duration=0)
    logger.debug("Using solution_dir: " + str(solution_dir))
    logger.debug("Using prediction_dir: " + str(prediction_dir))
    logger.debug("Using score_dir: " + str(score_dir))

    # Wait up to 30 seconds for ingestion to start and write 'start.txt';
    # otherwise, raise an exception.
    wait_time = 30
    ingestion_info = None
    for i in range(wait_time):
        ingestion_info = get_ingestion_info(prediction_dir)
        if ingestion_info is not None:
            logger.info("Detected the start of ingestion after {} "
                        "seconds. Start scoring.".format(i))
            break
        time.sleep(1)
    else:
        raise IngestionError("[-] Failed: scoring didn't detect the start "
                             "of ingestion after {} seconds."
                             .format(wait_time))

    # Get the ingestion start time
    ingestion_start = ingestion_info['start_time']
    logger.debug("Ingestion start time: {}".format(ingestion_start))
    logger.debug("Scoring start time: {}".format(scoring_start))

    # Get the ingestion PID
    ingestion_pid = ingestion_info['ingestion_pid']
    # Get time_budget from start.txt
    time_budget = ingestion_info['time_budget']

    # Get the metric
    scoring_function = autodl_acc_weighted
    metric_name = "Area under Learning Curve"

    # Get the solution
    solution = get_solution(solution_dir)
    # Check if the task is multiclass (i.e. with one-hot labels)
    is_multiclass_task = is_multiclass(solution)
    # Extract the dataset name from the file name
    basename = get_task_name(solution_dir)

    scoring_success = True
    wrong_class_info_output = ""
    try:
        # Begin the scoring process, along with the ingestion program.
        # Monitor training processes while the time budget is not exhausted.
        prediction_files_so_far = []
        scores_so_far = []
        num_preds = 0
        while (ingestion_is_alive(prediction_dir) and
               is_process_alive(ingestion_pid)):
            time.sleep(1)
            # Get the list of prediction files
            prediction_files = get_prediction_files(prediction_dir)
            num_preds_new = len(prediction_files)
            if num_preds_new > num_preds:
                new_prediction_files = get_new_prediction_files(
                    prediction_dir, prediction_files_so_far)
                new_scores = [scoring_function(solution, read_array(pred))
                              for pred in new_prediction_files]
                prediction_files_so_far += new_prediction_files
                logger.info("[+] New prediction found. Now number of "
                            "predictions made = {}".format(num_preds_new))
                scores_so_far += new_scores
                score = update_score_and_learning_curve(
                    prediction_dir, basename, 0, solution_dir,
                    scoring_function, score_dir, is_multiclass_task,
                    time_budget)

                ###############################################
                # y_hat statistics output
                prediction = read_array(new_prediction_files[-1])
                # Column 0 holds the predicted class index; convert it to a
                # one-hot matrix matching the solution shape
                _pred = []
                for pred_i in prediction:
                    pred_y = int(pred_i[0])
                    pred_onehot = [0] * solution.shape[1]
                    pred_onehot[pred_y] = 1
                    _pred.append(pred_onehot)
                prediction = np.array(_pred)
                output_class_info, y_wrong_percent_per_class, y_wrong_per_class_id = \
                    y_hat_statistic(prediction, solution)
                output_class_info = (
                    "=======================================================prediction {}==========================================================\n".format(num_preds_new)
                    + output_class_info)
                wrong_class_info_output += output_class_info
                wrong_class_log_i_path = os.path.join(
                    wrong_class_log_dir,
                    "wrong_class_prediction_{}.log".format(num_preds_new))
                with open(wrong_class_log_i_path, 'w') as f1:
                    f1.write(output_class_info)
                plt.bar(x=range(solution.shape[1]),
                        height=y_wrong_percent_per_class)
                fig_name = ("wrong_class_distribution_predict " +
                            str(num_preds_new) + ".png")
                path_to_fig = os.path.join(wrong_class_log_dir, fig_name)
                plt.savefig(path_to_fig)
                plt.close()
                ###############################################

                num_preds = num_preds_new
                logger.info("Current area under learning curve for {}: {:.4f}"
                            .format(basename, score))
    except Exception as e:
        scoring_success = False
        logger.error("[-] Error occurred in scoring:\n" + str(e),
                     exc_info=True)

    ###############################################
    wrong_class_log_path = os.path.join(wrong_class_log_dir,
                                        "wrong_class_log_total.log")
    with open(wrong_class_log_path, 'w') as f1:
        f1.write(wrong_class_info_output)
    ###############################################

    score = update_score_and_learning_curve(prediction_dir, basename, 0,
                                            solution_dir, scoring_function,
                                            score_dir, is_multiclass_task,
                                            time_budget)
    logger.info("Final area under learning curve for {}: {:.4f}"
                .format(basename, score))

    # Write the detailed results page one last time, without auto-refreshing
    write_scores_html(score_dir, auto_refresh=False)

    # Use the 'end.txt' file to detect whether the ingestion program ended
    end_filepath = os.path.join(prediction_dir, 'end.txt')
    if not scoring_success:
        logger.error("[-] Some error occurred in the scoring program. "
                     "Please see the output/error log of the Scoring Step.")
    elif not os.path.isfile(end_filepath):
        logger.error("[-] No 'end.txt' file was produced by ingestion. "
                     "Ingestion or scoring may not have terminated normally.")
    else:
        with open(end_filepath, 'r') as f:
            end_info_dict = yaml.safe_load(f)
        ingestion_duration = end_info_dict['ingestion_duration']
        if end_info_dict['ingestion_success'] == 0:
            logger.error("[-] Some error occurred in the ingestion program. "
                         "Please see the output/error log of the Ingestion Step.")
        else:
            logger.info("[+] Successfully finished scoring! "
                        "Scoring duration: {:.2f} sec. "
                        .format(time.time() - scoring_start) +
                        "Ingestion duration: {:.2f} sec. "
                        .format(ingestion_duration) +
                        "The score of your algorithm on the task "
                        "'{}' is: {:.6f}.".format(basename, score))
    logger.info("[Scoring terminated]")
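# `y_hat_statistic` is assumed to summarize, per true class, how often the
# last prediction is wrong. A minimal sketch consistent with the three return
# values unpacked above (text summary, wrong fraction per class, indices of
# wrong samples per class); the real helper may differ:
import numpy as np

def y_hat_statistic(prediction, solution):
    y_true = np.argmax(solution, axis=1)
    y_pred = np.argmax(prediction, axis=1)
    wrong_percent, wrong_ids, lines = [], [], []
    for c in range(solution.shape[1]):
        in_class = (y_true == c)
        wrong = in_class & (y_pred != y_true)
        percent = float(wrong.sum()) / max(int(in_class.sum()), 1)
        wrong_percent.append(percent)
        wrong_ids.append(np.where(wrong)[0])
        lines.append("class {}: {:.2%} of samples misclassified"
                     .format(c, percent))
    return "\n".join(lines), wrong_percent, wrong_ids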