def main(*args):
    s3_client = s3Client(S3_BUCKET, SENDER_EMAIL, RECIPIENT_EMAIL)
    gs_client = gsClient(VFXPY_SPREADSHEET_KEY, COMMUNITY_SPREADSHEET_KEY)
    print("Starting...")
    packages = remove_irrelevant_packages(get_packages(gs_client))
    save_to_file(s3_client, packages)
    generate_svg_wheel(s3_client, packages)
    compare_and_notify(s3_client, gs_client)
    print("Exiting...")
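# Presumably this entry point is invoked from the command line; a minimal
# sketch (forwarding argv is an assumption, since main ignores *args here):
if __name__ == '__main__':
    import sys
    main(*sys.argv[1:])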
def clean_data():
    """
    Do a simple first pass over the data. The main goal is cleaning: remove
    garbled records and repair erroneous line breaks inside text fields.
    """
    logging.info("clean data begin")
    # Temporary list of parsed records
    tmp_data_list = []
    tmp_line = ""
    with open(file_path, "rb") as file:
        for line in file:
            # Some records contain stray line breaks, so a line that does not
            # start with an ID is a wrapped fragment of the previous record.
            # Repair strategy: after reading an ID, do not process that line
            # immediately; keep reading and appending the following lines
            # until the next ID appears, then process the buffered record.
            # Decode bytes to a string and strip newline characters.
            line = line.decode(encoding="utf-8").replace("\r\n", "\n").replace("\n", "")
            # Coarse filtering
            if not line.startswith("\"ID\"") and re.match(data_pattern, line) is None:
                # Neither the header row nor an ID-prefixed line: it is a
                # wrongly wrapped continuation of the previous record, so
                # buffer it and read the next line.
                tmp_line += line
                continue
            else:
                # Reached the next ID, so process the previously buffered record.
                if tmp_line.strip():
                    data, need = __parse_data(tmp_line)
                    if need:
                        tmp_data_list.append(data)
                    # Write to file in chunks to avoid holding everything in memory.
                    if len(tmp_data_list) > page_size:
                        save_to_file(tmp_data_list, pure_data_file_path)
                        tmp_data_list.clear()
                # Start buffering the next record.
                tmp_line = line.replace("\r\n", "\n").replace("\n", "")
    # The last record has no following ID to trigger processing,
    # so handle it explicitly at the end.
    if tmp_line.strip():
        data, need = __parse_data(tmp_line)
        if need:
            tmp_data_list.append(data)
        # Write to file in chunks to avoid holding everything in memory.
        if len(tmp_data_list) > page_size:
            save_to_file(tmp_data_list, pure_data_file_path)
            tmp_data_list.clear()
    # Finally, flush the trailing chunk of fewer than 1000 records.
    save_to_file(tmp_data_list, pure_data_file_path)
    tmp_data_list.clear()
    logging.info("clean data end")
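# clean_data flushes chunks through save_to_file, which is assumed to append.
# A hypothetical sketch of such a writer (the project's real save_to_file is
# defined elsewhere and may differ):
import json

def append_chunk(records, path):
    # Append one chunk of cleaned records, one JSON object per line.
    with open(path, "a", encoding="utf-8") as out:
        for record in records:
            out.write(json.dumps(record, ensure_ascii=False) + "\n")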
res_false = res[number_of_tests].loc[res[number_of_tests].correct_pred == False]
res_unseen = res_unseen_bay[number_of_tests]
all_uncs = [
    [res_correct[unc_name], res_false[unc_name], res_unseen[unc_name]]
    for unc_name in unc_names
]
unc_labels = ('true', 'false', 'unseen')

for idx, (unc_name, uncs) in enumerate(zip(unc_names, all_uncs)):
    fig = plt.figure(figsize=(7, 5))
    # ax = fig.add_subplot(len(unc_names), 1, idx+1)
    ax = fig.add_subplot(111)
    ax.set_title(
        f'{unc_name}, rho={arguments["rho"]}, std={arguments["std_prior"]}, '
        f'T={number_of_tests}, {arguments["loss_type"]}\n'
        f'Accuracy: {round(res[number_of_tests].correct_pred.mean()*100, 2)}%'
    )
    uncs_to_show = [unc.loc[unc < 1 * unc.max()] for unc in uncs]  # computed but currently unused
    g.plot_uncertainty(
        ax,
        unc_name,
        uncs,
        unc_labels,
    )
    ax.legend()

    image_dir = save_path / f'{arguments["trainset"]}/images/{arguments["loss_type"]}/{unc_name}'
    image_dir.mkdir(exist_ok=True, parents=True)
    fig.savefig(image_dir / f'{arguments["loss_type"]}_{unc_name}_rho{arguments["rho"]}'
                            f'_std{arguments["std_prior"]}_T{number_of_tests}.png')

    pickle_dir = save_path / f'{arguments["trainset"]}/pickles/{arguments["loss_type"]}/{unc_name}'
    pickle_dir.mkdir(exist_ok=True, parents=True)
    u.save_to_file(fig, pickle_dir / f'{arguments["loss_type"]}_{unc_name}_rho{arguments["rho"]}'
                                     f'_std{arguments["std_prior"]}_T{number_of_tests}.pkl')
    plt.close(fig)

print('This exp time elapsed:', round(time() - start_exp), 's')
print('Total time elapsed:', round(time() - start_tot), 's')
start_exp = time()
def main(*args):
    packages = remove_irrelevant_packages(get_top_packages(), TO_CHART)
    annotate_wheels(packages)
    save_to_file(packages)
    generate_svg_wheel(packages, TO_CHART)
def main(
        exp_nbs=exp_nbs,
        path_to_results=save_path,
        path_to_exps=path_to_exps,
        n=n,
        nb_of_runs=nb_of_runs,
        number_of_tests=number_of_tests,
        verbose=verbose,
        nb_of_random=nb_of_random,
        do_recompute_outputs=do_recompute_outputs,
        save_csv=save_csv,
        device='cpu',
):
    save_path = pathlib.Path(path_to_results)
    if do_recompute_outputs:
        save_path = save_path / 'recomputed'
    else:
        save_path = save_path / 'saved_from_polyaxon'
    save_path.mkdir(exist_ok=True, parents=True)

    if not do_recompute_outputs:
        nb_of_runs = 1

    if not os.path.exists(save_path / 'deadzones.pkl'):
        deadzones = pd.DataFrame(columns=['group_nb', 'exp_nb', 'unc_name'])
    else:
        deadzones = load_from_file(save_path / 'deadzones.pkl')
    deadzones.to_csv(save_path / 'deadzones.csv')

    recomputed_exps = []
    start_time = time()
    for repeat_idx in range(nb_of_runs):
        for exp_nb in exp_nbs:
            print(f'Repeat number {repeat_idx + 1} / {nb_of_runs}, Exp nb {exp_nb}')
            arguments = get_args(exp_nb, path_to_exps)
            determinist = arguments.get('rho', 'determinist') == 'determinist'

            def recompute_outputs(deadzones):
                bay_net_trained, arguments, group_nb = get_trained_model_and_args_and_groupnb(
                    exp_nb, exp_path=path_to_exps)
                bay_net_trained.to(device)
                arguments['number_of_tests'] = number_of_tests

                all_eval_outputs, _ = get_seen_outputs_and_labels(
                    bay_net_trained,
                    arguments,
                    device=device,
                    verbose=verbose,
                )
                all_outputs_unseen = get_unseen_outputs(
                    bay_net_trained,
                    arguments,
                    nb_of_random,
                    device=device,
                    verbose=verbose,
                )
                if determinist:
                    dzs = get_deadzones(all_eval_outputs, all_outputs_unseen,
                                        get_all_uncertainty_measures_not_bayesian, n)
                    iterator = zip(['us', 'pe'], dzs)
                else:
                    dzs = get_deadzones(all_eval_outputs, all_outputs_unseen,
                                        get_all_uncertainty_measures_bayesian, n)
                    iterator = zip(['vr', 'pe', 'mi'], dzs)

                for unc_name, dz in iterator:
                    deadzones = deadzones.append(pd.DataFrame.from_dict({
                        'group_nb': [group_nb],
                        'exp_nb': [exp_nb],
                        'trainset': [arguments.get('trainset', 'mnist')],
                        'type_of_unseen': [arguments['type_of_unseen']],
                        'epoch': [arguments['epoch']],
                        'number_of_tests': [arguments['number_of_tests']],
                        'unc_name': [unc_name],
                        f'dz_{n}': [dz],
                    }))
                return deadzones

            if do_recompute_outputs:
                deadzones = recompute_outputs(deadzones)
            else:
                try:
                    results, arguments, group_nb = get_res_args_groupnb(exp_nb, exp_path=path_to_exps)
                except RuntimeError as e:
                    if str(e) == "Attempting to deserialize object on a CUDA device but torch.cuda.is_available() " \
                                 "is False. If you are running on a CPU-only machine, please use torch.load with " \
                                 "map_location='cpu' to map your storages to the CPU.":
                        # recompute_outputs takes and returns the deadzones frame
                        deadzones = recompute_outputs(deadzones)
                        recomputed_exps.append(exp_nb)
                        continue
                    else:
                        raise e

                def seen_and_unseen_and_n(results, unc, n):
                    return (
                        results.get(get_unc_key(results.columns, f'seen {unc}'),
                                    [torch.tensor([-1], dtype=torch.float)])[0],
                        results.get(get_unc_key(results.columns, f'unseen {unc}'),
                                    [torch.tensor([-1], dtype=torch.float)])[0],
                        n,
                    )

                try:
                    dz_pe = get_deadzone_from_unc(*seen_and_unseen_and_n(results, 'pe', n))
                except Exception:
                    # Fall back when the 'pe' uncertainty is missing from the results
                    dz_pe = -1

                if determinist:
                    dz_us = get_deadzone_from_unc(*seen_and_unseen_and_n(results, 'us', n))
                    iterator = zip(['us', 'pe'], [dz_us, dz_pe])
                else:
                    dz_vr = get_deadzone_from_unc(*seen_and_unseen_and_n(results, 'vr', n))
                    dz_mi = get_deadzone_from_unc(*seen_and_unseen_and_n(results, 'mi', n))
                    iterator = zip(['vr', 'pe', 'mi'], [dz_vr, dz_pe, dz_mi])

                for unc_name, dz in iterator:
                    deadzones = deadzones.append(pd.DataFrame.from_dict({
                        'deadzone_number': [n],
                        'group_nb': [group_nb],
                        'trainset': [arguments.get('trainset', 'mnist')],
                        'exp_nb': [exp_nb],
                        'type_of_unseen': [arguments['type_of_unseen']],
                        'epoch': [arguments['epoch']],
                        'number_of_tests': [arguments['number_of_tests']],
                        'unc_name': [unc_name],
                        f'dz_{n}': [dz],
                    }))

    print(f'Time Elapsed:{round(time() - start_time)} s.')

    deadzones.exp_nb = deadzones.exp_nb.astype('int')
    if save_csv:
        save_to_file(arguments, save_path / 'arguments.pkl')
        deadzones = deadzones.sort_values('exp_nb')
        deadzones.to_pickle(save_path / 'deadzones.pkl')
        deadzones.to_csv(save_path / 'deadzones.csv')
    print(deadzones)
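# The RuntimeError branch above guards against loading CUDA-saved checkpoints
# on a CPU-only machine. A minimal sketch of the fallback the message suggests
# (the checkpoint path is a placeholder, not from this repo):
import torch

def load_checkpoint_cpu_safe(path):
    # map_location='cpu' remaps CUDA storages during deserialization,
    # so loading works whether or not a GPU is available.
    return torch.load(path, map_location='cpu')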
                    default=20)
parser.add_argument('--loss_type', help='which loss to use',
                    choices=['exp', 'uniform', 'criterion'], type=str, default='uniform')
parser.add_argument('--std_prior', help='the standard deviation of the prior',
                    type=float, default=0.1)
parser.add_argument('--delta', help='probability upper bound of error higher than risk',
                    type=float)
args = parser.parse_args()

save_to_file(vars(args), './output/arguments.pkl')

trainset = args.trainset
rho = args.rho
epoch = args.epoch
batch_size = args.batch_size
number_of_tests = args.number_of_tests
loss_type = args.loss_type
std_prior = args.std_prior
stds_prior = (std_prior, std_prior)
delta = args.delta
risks = np.linspace(0.01, 0.5, 50)

if torch.cuda.is_available():
    device = 'cuda'
else:
      f'Mutual Information:{unseen_mi.mean()}')

res = pd.concat((res, pd.DataFrame.from_dict({
    'sigma_initial': [log(1 + exp(rho))],
    'seen_uncertainty_vr': [eval_vr],
    'seen_uncertainty_pe': [eval_pe],
    'seen_uncertainty_mi': [eval_mi],
    'unseen_uncertainty_vr': [unseen_vr],
    'unseen_uncertainty_pe': [unseen_pe],
    'unseen_uncertainty_mi': [unseen_mi],
})), axis=1)

convert_df_to_cpu(res)

save_to_file(arguments, f'{output_file}/arguments.pkl')
if args.save_loss:
    save_to_file(loss, f'{output_file}/loss.pkl')
if args.save_observables:
    save_to_file(observables, f'{output_file}/TrainingLogs.pkl')
if args.save_outputs:
    torch.save(all_outputs_unseen, f'{output_file}/unseen_outputs.pt')
    torch.save(all_outputs_eval, f'{output_file}/seen_outputs.pt')
# torch.save(res, f'{output_file}/results.pt')
res.to_pickle(f'{output_file}/results.pkl')
torch.save(bay_net.state_dict(), f'{output_file}/final_weights.pt')
torch.save(observables.max_weights, f'{output_file}/best_weights.pt')
pd.DataFrame.from_dict({k: [v] for k, v in arguments.items()}).to_csv(f'{output_file}/arguments.csv')
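# For reference, 'sigma_initial' above applies the softplus reparameterization
# sigma = log(1 + exp(rho)), which keeps a standard deviation strictly positive
# while rho ranges over all reals. A standalone sketch:
import math

def sigma_from_rho(rho):
    # Softplus: strictly positive output for any real rho.
    return math.log(1.0 + math.exp(rho))

assert sigma_from_rho(0.0) == math.log(2.0)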
def train_vcae(n_epochs, model, train_iterator, val_iterator, optimizer, device,
               criterion, save_best=True, verbose=True, is_nf=False, nf=None):
    model_name = 'NormalizingFlow' + model.__class__.__name__ if is_nf else model.__class__.__name__
    writer, experiment_name, best_model_path = setup_experiment(model_name, log_dir="./tb")

    mb = master_bar(range(n_epochs))
    train_losses, val_losses = [], []
    best_val_loss = float('+inf')

    for epoch in mb:
        train_loss = run_epoch(model, train_iterator, optimizer, criterion, mb,
                               phase='train', epoch=epoch, writer=writer,
                               is_nf=is_nf, nf=nf, device=device)
        val_loss = run_epoch(model, val_iterator, None, criterion, mb,
                             phase='val', epoch=epoch, writer=writer,
                             is_nf=is_nf, nf=nf, device=device)

        # Save logs
        dict_saver = {}
        dict_saver.update({'train_loss_mean': train_loss})
        dict_saver.update({'test_loss_mean': val_loss})
        file_to_save_path = ''.join([LOG_PATH, FILE_NAME, experiment_name, FILE_EXCITON])
        save_to_file(file_to_save_path, dict_saver)

        # Save the best model
        if save_best and (val_loss < best_val_loss):
            best_val_loss = val_loss
            save_model(nf if is_nf else model, best_model_path)

        if verbose:
            # Append to lists for real-time plotting
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            # Update the notebook progress bars and plot
            mb.main_bar.comment = f'EPOCHS, best_loss:{best_val_loss}'
            mb.child.comment = f"train_loss:{round(train_loss, 3)}, val_loss:{round(val_loss, 3)}"
            plot_loss_update(epoch, n_epochs, mb, train_losses, val_losses)

    return best_model_path
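# A hypothetical invocation of train_vcae; `vae`, `train_loader`, and
# `val_loader` are placeholders for objects built elsewhere in the project.
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()
best_model_path = train_vcae(
    n_epochs=20,
    model=vae.to(device),
    train_iterator=train_loader,
    val_iterator=val_loader,
    optimizer=optimizer,
    device=device,
    criterion=criterion,
)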
from src.utils import get_file_and_dir_path_in_dir, load_from_file, save_to_file

path_to_exps = 'output/determinist_cifar10'

files, _ = get_file_and_dir_path_in_dir(path_to_exps, 'arguments.pkl')
for file in files:
    args = load_from_file(file)
    args['number_of_tests'] = 1
    print(file, 'changed')
    save_to_file(args, file)
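# For context, a minimal pickle-based sketch of how load_from_file and
# save_to_file are used here; the real implementations live in src.utils
# and may differ.
import pickle

def load_from_file_sketch(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_to_file_sketch(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)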
if show_fig or save_fig:
    for exp_nb in exp_nbs:
        fig = plot_acc_cov(number_of_tests_to_print, exp_nb, results_train, figsize=figsize)
        arguments = get_args(exp_nb, path)
        fig.suptitle(
            f'Trainset: Acc-Coverage, w.r.t. nb of tests and uncertainty measure\n'
            f'{dict({k: v for k, v in arguments.items() if k not in ["type_of_unseen", "number_of_tests"]})}',
            wrap=True)
        if save_fig:
            save_png_path.mkdir(exist_ok=True, parents=True)
            save_pkl_path.mkdir(exist_ok=True, parents=True)
            fig.savefig(save_png_path / f'{exp_nb}-acc-coverage-train.png')
            save_to_file(fig, save_pkl_path / f'{exp_nb}-acc-coverage-train.pkl')
        if show_fig:
            fig.show()

results_eval = pd.read_csv(save_csv_path / 'results_eval.csv')

if show_fig or save_fig:
    for exp_nb in exp_nbs:
        fig = plot_acc_cov(number_of_tests_to_print, exp_nb, results_eval, figsize=figsize)
        arguments = get_args(exp_nb, path)
        fig.suptitle(
            f'Testset: Acc-Coverage, w.r.t. nb of tests and uncertainty measure\n'
            f'{dict({k: v for k, v in arguments.items() if k not in ["type_of_unseen", "number_of_tests"]})}',
            wrap=True)
                                arguments, group_nb, exp_nb)
fig1 = utils.compute_density_train_seen_unseen(
    arguments=arguments,
    all_outputs_train=all_outputs_train,
    all_outputs_seen=all_outputs_seen,
    all_outputs_unseen=all_outputs_unseen,
    show_fig=show_fig,
    save_fig=save_fig,
    save_path=save_path_hists,
    figsize=figsize,
)
if save_fig:
    print('Saving figure...')
    fig1.savefig(save_path_hists)
    save_to_file(fig1, str(save_path_hists).replace('png', 'pkl'))
    print('Figure saved.')
if show_fig:
    print('Showing figures...')
    fig1.show()
    print('Figure shown.')
print('Done')

if do_train_correct_false:
    print('Do train correct false...')
    save_path_hists = get_save_path(save_path, 'train_correct_false',
                                    arguments, group_nb, exp_nb)
    fig2 = utils.compute_density_correct_false(
        arguments=arguments,
        all_outputs=all_outputs_train,
# ax4.set_title(f'softmax output unseen. VR: {round(vr_unseen.item(), 4)}, PE: {round(pe_unseen.item(), 4)}, MI: {round(mi_unseen.item(), 4)}')
if is_cifar:
    ax4.set_xticks(range(10))
    ax4.set_xticklabels(cifar_labels)
    ax4.tick_params(axis='x', rotation=45)
fig.show()

save_path = 'results/images/softmax_output'
save_path = pathlib.Path(save_path)

save_fig = False
if save_fig:
    save_path.mkdir(exist_ok=True, parents=True)
    fig.savefig(save_path / f'softmax_output_{exp}_{img_index_seen}.png')
    u.save_to_file(fig, save_path / f'softmax_output_{exp}_{img_index_seen}.pkl')

# %% Compute det outputs

reload_modules()
trainset = 'cifar10'
res_det = pd.DataFrame()

det_net_trained, arguments, _ = su.get_trained_model_and_args_and_groupnb(
    f'determinist_{trainset}', f'output/')
evalloader_seen = su.get_evalloader_seen(arguments, shuffle=False)

labels, all_outputs = e.eval_bayesian(
    det_net_trained,
    evalloader_seen,
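# The .pkl files written above hold pickled matplotlib Figure objects. A
# hedged round-trip sketch (the file name is a placeholder, and the matplotlib
# version used to unpickle should match the one that pickled the figure):
import pickle

with open('softmax_output_example.pkl', 'rb') as f:
    fig = pickle.load(f)
fig.show()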
# cur_auc = metrics.auc(xs_means.iloc[xs_means.argsort()], ys_means.iloc[xs_means.argsort()])
axs.set_title(
    title +
    f'Bay AUC: {round(100 * aucs[idx_unc].mean(), 2)} +- '
    f'{round(100 * 1.97 * aucs[idx_unc].std() / 5, 2)} %, T={number_of_tests}'
)
axs.legend(handles=legend_elements)
fig.suptitle('')

if save_fig:
    (save_path / f'{unc_name}').mkdir(exist_ok=True, parents=True)
    fig.savefig(save_path / f'{unc_name}' /
                f'roc_{typ}_{arg["loss_type"]}_{unc_name}_T{number_of_tests}.png')
    save_to_file(fig, save_path / f'{unc_name}' /
                 f'roc_{typ}_{arg["loss_type"]}_{unc_name}_T{number_of_tests}.pkl')
    lp = arg['loss_type']
    print(f"Fig saved in {save_path / f'{unc_name}' / f'roc_{typ}_{lp}_{unc_name}_T{number_of_tests}.png'}")
plt.close(fig)

# res.loc[
#     [res.rho == rho, res.std_prior == std_prior, res.T == number_of_tests, res.unc_name == unc_name],
#     [f'{trainset}+_{typ}']
# ] = [aucs[idx_unc].mean(), 1.97 * aucs[idx_unc].std() / 5]

print(aucs.mean(1))
print(aucs.std(1) * 1.97 / 5)
print(aucs_det.mean(1))