def prep_pdbbind_all(normalize_x_kwargs: Optional[Dict[str, Any]] = None,
                     normalize_y_kwargs: Optional[Dict[str, Any]] = None,
                     shuffle_seed: int = SEED_DEF) -> TVT:
    url = r'https://raw.githubusercontent.com/guanghelee/iclr20-lcn/master/data/PDBbind.pkl.gz'
    path = f'{DATASETS_ROOT}/pdbbind/PDBbind.pkl.gz'
    # Fetch the pickled splits and load the train/val/test arrays.
    DataLoader.download(url, path)
    with gzip.open(path, 'rb') as f:
        xtrain, ytrain, xval, yval, xtest, ytest = pickle.load(f)
    train = Dataset(xtrain.astype(np.float32), ytrain.astype(np.float32),
                    name='pdbbind', copy=False, autoshrink_y=True)
    validn = Dataset(xval.astype(np.float32), yval.astype(np.float32),
                     name='pdbbind', copy=False, autoshrink_y=True)
    test = Dataset(xtest.astype(np.float32), ytest.astype(np.float32),
                   name='pdbbind', copy=False, autoshrink_y=True)
    return DataLoader.normalize_all_datasets(train, validn, test,
                                             normalize_x_kwargs,
                                             normalize_y_kwargs)

def prep_cpuactiv(category, seed) -> Dataset[np.ndarray]:
    with open('{}/cpuactiv/cpu_act{}.npz'.format(DATASETS_ROOT, seed - 1),
              'rb') as f:
        npz = np.load(f)
        x, y = (npz[f'X_{category}'].astype(np.float32),
                npz[f'y_{category}'].astype(np.float32))
    return Dataset(x, y, name='cpuactiv', copy=False, autoshrink_y=True)

def prep_abalone(category, seed) -> Dataset[np.ndarray]:
    # Use a context manager so the file handle is closed after loading.
    with open('{}/abalone/abalone{}.npz'.format(DATASETS_ROOT, seed - 1),
              'rb') as f:
        npz = np.load(f)
        x, y = (npz[f'X_{category}'].astype(np.float32),
                npz[f'y_{category}'].astype(np.float32))
    return Dataset(x, y, name='abalone', copy=False, autoshrink_y=True)

def prep_ctslice(category, seed) -> Dataset[np.ndarray]:
    # Disabled: the loading code below is unreachable until the raise is removed.
    raise NotImplementedError
    with open('{}/ctslice/ctslice{}.npz'.format(DATASETS_ROOT, seed - 1),
              'rb') as f:
        npz = np.load(f)
        x, y = (npz[f'X_{category}'].astype(np.float32),
                npz[f'y_{category}'].astype(np.float32))
    return Dataset(x, y, name='ctslice', copy=False, autoshrink_y=True)

def prep_ailerons(category: str) -> Dataset[np.ndarray]:
    df = pd.read_csv(f'{DATASETS_ROOT}/ailerons/ailerons.{category}',
                     sep=',', header=None)
    x = np.array(df.iloc[:, :-1], dtype=np.float32)
    # The last column is the target; it is scaled up by 1e4.
    y = np.array(df.iloc[:, -1], dtype=np.float32) * 1e4
    return Dataset(x, y, name='ailerons', copy=False, autoshrink_y=True)

def prep_yah(category: Optional[str]) -> Dataset[np.ndarray]:
    if category in ['train', 'test', 'val']:
        x = np.load('{}/yah/yah-{}-x.npy'.format(
            DATASETS_ROOT, category)).astype(np.float32)
        y = np.load('{}/yah/yah-{}-y.npy'.format(
            DATASETS_ROOT, category)).astype(np.float32)
        return Dataset(x, y, name='yah', copy=False, autoshrink_y=True)
    else:
        raise ValueError('category must be in ["train", "test", "val"]')

def prep_year(category: Optional[str]) -> Dataset[np.ndarray]:
    if category in ['train', 'test']:
        x = np.load(f'{DATASETS_ROOT}/year/year-{category}-x.npy').astype(
            np.float32)
        y = np.load(f'{DATASETS_ROOT}/year/year-{category}-y.npy').astype(
            np.float32)
        return Dataset(x, y, name='year', copy=False, autoshrink_y=True)
    else:
        raise ValueError('category must be in ["train", "test"]')

def prep_hiv(category: Optional[str]) -> Dataset[np.ndarray]:
    if category in ['train', 'val', 'test']:
        if category == 'valid' or category == 'val':
            category = 'valid'
        url = rf'https://raw.githubusercontent.com/guanghelee/iclr20-lcn/master/data/HIV_split/{category}.fgp2048.csv'
        path = f'{DATASETS_ROOT}/hiv/{category}.fgp2048.csv'
        DataLoader.download(url, path)
        df = pd.read_csv(path)
        # Each entry of the 'smiles' column is a string of 0/1 fingerprint
        # bits, expanded here into a dense feature row.
        x = np.array([[int(j) for j in i]
                      for i in df['smiles'].values]).astype(np.float32)
        y = df['HIV_active'].values.astype(np.int64)
        return Dataset(x, y, name='hiv', copy=False, autoshrink_y=True)
    else:
        raise ValueError('category must be in ["train", "test", "val"]')

def _prep_generic(dataset_name: str,
                  category: Optional[str]) -> Dataset[np.ndarray]:
    if category in ['train', 'val', 'test']:
        x = np.load(
            f'{DATASETS_ROOT}/{dataset_name}/{dataset_name}-{category}-x.npy'
        ).astype(np.float32)
        y = np.load(
            f'{DATASETS_ROOT}/{dataset_name}/{dataset_name}-{category}-y.npy'
        ).astype(np.int64 if DataLoader.is_classification(dataset_name) else
                 np.float32)
        return Dataset(x, y, name=dataset_name, copy=False, autoshrink_y=True)
    else:
        raise ValueError('category must be in ["train", "test", "val"]')

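# Usage sketch for _prep_generic (hypothetical dataset name; assumes the
# standard on-disk layout {DATASETS_ROOT}/<name>/<name>-<category>-{x,y}.npy
# produced by the repo's preprocessing scripts):
#
#   train_ds = _prep_generic('mydata', 'train')
#   val_ds = _prep_generic('mydata', 'val')
#   test_ds = _prep_generic('mydata', 'test')
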
def read_libsvm_format(file_path: str,
                       n_features: int,
                       n_classes: int,
                       name: str = '',
                       shuffle_seed: Optional[int] = 1) -> Dataset[np.ndarray]:
    # n_classes > 0 means classification; otherwise treat targets as regression.
    is_classification = (n_classes > 0)
    with open(file_path, 'r') as f:
        content = f.read()
    assert ': ' not in content, 'Error while reading: {}'.format(file_path)
    content = content.replace(': ', ':')
    content = content.strip()
    lines = content.split('\n')
    lines = [line.strip() for line in lines]

    x = np.zeros((len(lines), n_features), dtype=np.float32)
    y = np.zeros((len(lines), ),
                 dtype=np.int64 if is_classification else np.float32)
    for line_idx, line in enumerate(lines):
        for unit_idx, unit in enumerate(line.split()):
            if unit_idx == 0:
                # The first token on each line is the label/target.
                assert ':' not in unit
                if is_classification:
                    y[line_idx] = int(unit.strip())
                else:
                    y[line_idx] = float(unit.strip())
            else:
                # Remaining tokens are sparse "feature:value" pairs (1-based).
                feat, val = unit.strip().split(':')
                feat: int = int(feat)
                val: float = float(val)
                x[line_idx][feat - 1] = val

    if is_classification:
        # To get classes in [0..n_classes-1]
        y = y - np.min(y)
    return Dataset(x, y, name=name, copy=False).shuffle(seed=shuffle_seed)

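# read_libsvm_format expects whitespace-separated LIBSVM lines: the first
# token is the label (classification) or target (regression), followed by
# sparse 1-based "feature:value" pairs that fill column feature-1 of the
# dense matrix. Hypothetical file contents:
#
#   +1 3:0.5 7:1.25
#   -1 1:2.0
#
# Hypothetical call for a binary classification file with 10 features
# (n_classes=0 would instead read the first token as a regression target):
#
#   ds = read_libsvm_format('/path/to/data.libsvm', n_features=10,
#                           n_classes=2, name='mydata')
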
def get_best_model(
        log_path: str,
        mss: ModelSearchSet,
        train_data: Dataset,
        validn_data: Dataset,
        test_data: Dataset,
        get_dataset_kwargs: Dict[str, Any],
        exp_dir: str = '../out',
        devices_info: List[Tuple[int, int]] = [
            (-1, 1)
        ],  # (device_id, max_proc_cnt), -1 for cpu
        show_hpsearch_stats: bool = True):
    logger = Logger(log_path)
    if LOG_DATASET_STATS:
        logger.log(train_data.get_stats(title='Train Data'))
        logger.log(validn_data.get_stats(title='Validation Data'))
        logger.log(test_data.get_stats(title='Test Data'))

    # Prepare process-specific device ids, hyperparams and datasets
    proc_device_id: List[int]
    nprocs: int

    # Prep proc_device_id: round-robin over devices until each device has
    # been assigned its max_proc_cnt worker slots.
    proc_device_id = []
    device_ids_helper = {}
    for info in devices_info:
        device_ids_helper[info[0]] = info[1]
    rem = True
    while rem:
        rem = False
        for device_id in device_ids_helper:
            if device_ids_helper[device_id] > 0:
                rem = True
                proc_device_id.append(device_id)
                device_ids_helper[device_id] -= 1

    # Prep proc_hps: split the hyperparameter configs as evenly as possible
    # across workers; CPU-only models run all configs in the current process.
    proc_hps = []
    if mss.model_class not in CPU_MODELS:
        total_proc_cnt = len(proc_device_id)
        proc_hp_cnt = np.full(total_proc_cnt, len(mss.hps) // total_proc_cnt)
        proc_hp_cnt[:len(mss.hps) % total_proc_cnt] += 1
        agg = 0
        for cnt in proc_hp_cnt:
            if cnt > 0:
                proc_hps.append(mss.hps[agg:agg + cnt])
                agg += cnt
    else:
        proc_hps = [mss.hps]
    nprocs = len(proc_hps)
    proc_device_id = proc_device_id[:nprocs]

    # Run the processes
    l: 'multiprocessing.synchronize.Lock' = Lock()
    cmlock: 'multiprocessing.synchronize.Lock' = Lock()
    started_hps = Value('i', 0)
    started_hps_lk: 'multiprocessing.synchronize.Lock' = Lock()
    finished_hps = Value('i', 0)
    total_hps = len(mss.hps)
    start_time = time.time()

    assert (train_data.name == validn_data.name) and (
        validn_data.name == test_data.name) and (
            test_data.name == get_dataset_kwargs['dataset_name'])
    assert (train_data.shuffle_seed == validn_data.shuffle_seed) and (
        validn_data.shuffle_seed == test_data.shuffle_seed) and (
            test_data.shuffle_seed == get_dataset_kwargs['shuffle_seed'])

    model_search_summary_path = '{}/{}/{}-search-summary{}{}.csv'.format(
        exp_dir, EXP_LOGS_DIR, get_dataset_kwargs['dataset_name'], DESC_SEP,
        exp_dir.split(DESC_SEP)[-1])
    assert not os.path.exists(model_search_summary_path)

    logger.log(f'\nExperiment dir: {os.path.abspath(exp_dir)}\n')
    logger.log(f'Hyperparam configs: {total_hps}\n')
    logger.log(f'Number of processes: {nprocs}\n')

    args = (l, cmlock, started_hps, started_hps_lk, finished_hps, total_hps,
            start_time, mss.model_class, proc_hps, train_data, validn_data,
            test_data, get_dataset_kwargs, exp_dir, proc_device_id, log_path,
            model_search_summary_path, mss.use_lforb)
    if mss.model_class not in CPU_MODELS:
        spawn(get_best_model_aux, args=args, nprocs=nprocs)
    else:
        get_best_model_aux(0, *args)

    if show_hpsearch_stats:
        hpsearch_stats(model_search_summary_path,
                       get_dataset_kwargs['dataset_name'])

    logger.log('\n==================================\n')
    logger.log('Model search summary saved to: {}\n'.format(
        os.path.abspath(model_search_summary_path)))
    logger.log('==================================\n\n')
    logger.close()

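# Illustration of the round-robin device allocation above (values are
# hypothetical): devices_info=[(0, 2), (1, 2), (-1, 1)] expands to
# proc_device_id=[0, 1, -1, 0, 1], i.e. GPU 0 and GPU 1 each host up to two
# worker processes and one worker runs on the CPU; the hyperparameter configs
# in mss.hps are then split as evenly as possible across those workers.
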
def get_best_model_aux(
        proc_num: int,
        l: 'multiprocessing.synchronize.Lock',  # lock for coordinating MAIN_PROGRESS_FILE logging
        cmlock: 'multiprocessing.synchronize.Lock',  # common lock for coordinating all other logging (typically for small messages)
        started_hps: Value,
        started_hps_lk: 'multiprocessing.synchronize.Lock',
        finished_hps: Value,
        total_hps: int,
        start_time: float,
        model_class: Type[LearnablePredictor],
        proc_hps: List[List[Dict[str, Any]]],
        train_data: Dataset,
        validn_data: Dataset,
        test_data: Dataset,
        get_dataset_kwargs: Dict[str, Any],
        exp_dir: str,
        proc_device_id: List[int],
        log_path: str,
        model_search_summary_path: str,
        use_lforb: bool):
    # Prep process-specific data
    hps = proc_hps[proc_num]
    device_ids = None if proc_device_id[proc_num] == -1 else [
        proc_device_id[proc_num]
    ]
    assert train_data.n_labels == 1
    compute_auc = Utils.is_binary_labels(train_data.to_ndarray()['y'])
    Utils.cmlog = functools.partial(Utils.safe_log,
                                    path='{}/{}/cm.log'.format(
                                        exp_dir, EXP_LOGS_DIR),
                                    l=cmlock)
    logger = Logger(log_path)

    for i, hp in enumerate(hps):
        # Reserve a globally unique config index for this hyperparam config.
        with started_hps_lk:
            config_idx = started_hps.value
            started_hps.value += 1

        hp = hp.copy()
        seed = hp['seed']
        np.random.seed(seed)
        torch.manual_seed(seed)
        hp.pop('seed')

        # Prep
        try:
            model = model_class(**hp, device_ids=device_ids)  # type: ignore
        except TypeError:
            model = model_class(**hp)  # type: ignore
        model._use_lforb = use_lforb
        config, config_dir, img_save_prefix = setup_config_dir(
            model, train_data.name, train_data.shuffle_seed, exp_dir,
            config_idx)

        # Save data for easy reproducibility (currently used in mstd computation)
        repro_data = {
            'get_dataset_kwargs': get_dataset_kwargs,
            'model_class': model_class,
            'seed': seed,
            'hp': hp
        }
        with open('{}/{}.pkl'.format(config_dir, CONFIG_REPRO_DATA_FILE),
                  'wb') as f:
            pickle.dump(repro_data, f, protocol=pickle.HIGHEST_PROTOCOL)

        # Set what model.acc() should do
        acc_func, acc_func_type = Utils.get_acc_def(
            DataLoader.is_classification(get_dataset_kwargs['dataset_name']),
            hp.get('criterion', None))
        model.acc_func = acc_func
        model.acc_func_type = acc_func_type

        model.train(train_data, validn_data, test_data)
        stats = process_model(model, train_data, validn_data, test_data,
                              config_dir, config_idx, seed, compute_auc)

        # Log
        l.acquire()
        finished_hps.value += 1
        logger.log('\n>> [{:.2f}% ({}/{})]:\nRan: (cidx={}): {}\n'.format(
            finished_hps.value * 100 / total_hps, finished_hps.value,
            total_hps, config_idx, config))
        logger.log('Config dir: {}\n'.format(os.path.abspath(config_dir)))
        logger.log('\ntrain_acc={:.5f}%\n'.format(stats['train_acc']))
        logger.log('validn_acc={:.5f}%\n'.format(stats['validn_acc']))
        logger.log('test_acc={:.5f}%\n'.format(stats['test_acc']))
        if compute_auc:
            logger.log('train_auc={:.5f}\n'.format(stats['train_auc']))
            logger.log('validn_auc={:.5f}\n'.format(stats['validn_auc']))
            logger.log('test_auc={:.5f}\n'.format(stats['test_auc']))
        if isinstance(model, DTExtractablePredictor):
            if not model._is_pure_dt:
                logger.log('dt_train_acc={:.5f}%\n'.format(
                    stats['dt_train_acc']))
                logger.log('dt_validn_acc={:.5f}%\n'.format(
                    stats['dt_validn_acc']))
                logger.log('dt_test_acc={:.5f}%\n'.format(
                    stats['dt_test_acc']))
                if compute_auc:
                    logger.log('dt_train_auc={:.5f}\n'.format(
                        stats['dt_train_auc']))
                    logger.log('dt_validn_auc={:.5f}\n'.format(
                        stats['dt_validn_auc']))
                    logger.log('dt_test_auc={:.5f}\n'.format(
                        stats['dt_test_auc']))
        if isinstance(model, DTExtractablePredictor) and isinstance(
                model, DGTPredictor):
            if not model._is_pure_dt:
                logger.log('cdt_train_acc={:.5f}%\n'.format(
                    stats['cdt_train_acc']))
                logger.log('cdt_validn_acc={:.5f}%\n'.format(
                    stats['cdt_validn_acc']))
                logger.log('cdt_test_acc={:.5f}%\n'.format(
                    stats['cdt_test_acc']))
        total_time = time.time() - start_time
        per_hp_time = total_time / finished_hps.value
        rem_time = per_hp_time * (total_hps - finished_hps.value)
        logger.log('Time: per hp={}, total={}, rem={}\n'.format(
            td(seconds=per_hp_time), td(seconds=total_time),
            td(seconds=rem_time)))
        Utils.append_linedict_to_csv(stats, model_search_summary_path)
        l.release()

        if use_lforb:
            # Re-evaluate with the best (rather than last) checkpoint under a
            # new config index offset by total_hps.
            assert hp['use_last_model']
            old_logs_dir = model.logs_dir
            config_idx += total_hps
            _, config_dir, _ = setup_config_dir(model, train_data.name,
                                                train_data.shuffle_seed,
                                                exp_dir, config_idx)
            model.load_best_model(old_logs_dir)
            hp['use_last_model'] = False
            repro_data = {
                'get_dataset_kwargs': get_dataset_kwargs,
                'model_class': model_class,
                'seed': seed,
                'hp': hp
            }
            with open('{}/{}.pkl'.format(config_dir, CONFIG_REPRO_DATA_FILE),
                      'wb') as f:
                pickle.dump(repro_data, f, protocol=pickle.HIGHEST_PROTOCOL)
            stats = process_model(model, train_data, validn_data, test_data,
                                  config_dir, config_idx, seed, compute_auc)
            l.acquire()
            Utils.append_linedict_to_csv(stats, model_search_summary_path)
            l.release()

    logger.close()