Example #1
    def prep_pdbbind_all(normalize_x_kwargs: Optional[Dict[str, Any]] = None,
                         normalize_y_kwargs: Optional[Dict[str, Any]] = None,
                         shuffle_seed: int = SEED_DEF) -> TVT:
        url = r'https://raw.githubusercontent.com/guanghelee/iclr20-lcn/master/data/PDBbind.pkl.gz'
        path = f'{DATASETS_ROOT}/pdbbind/PDBbind.pkl.gz'
        DataLoader.download(url, path)

        with gzip.open(path, 'rb') as f:
            xtrain, ytrain, xval, yval, xtest, ytest = pickle.load(f)

        train = Dataset(xtrain.astype(np.float32),
                        ytrain.astype(np.float32),
                        name='pdbbind',
                        copy=False,
                        autoshrink_y=True)
        validn = Dataset(xval.astype(np.float32),
                         yval.astype(np.float32),
                         name='pdbbind',
                         copy=False,
                         autoshrink_y=True)
        test = Dataset(xtest.astype(np.float32),
                       ytest.astype(np.float32),
                       name='pdbbind',
                       copy=False,
                       autoshrink_y=True)

        return DataLoader.normalize_all_datasets(train, validn, test,
                                                 normalize_x_kwargs,
                                                 normalize_y_kwargs)
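A hypothetical call sketch for the loader above, assuming it is exposed as a DataLoader static method and that the TVT return type unpacks into (train, validation, test) Dataset objects; the normalization kwargs are left as None rather than guessed:

    # Hypothetical usage; the DataLoader attribute access and TVT unpacking are assumptions.
    train, validn, test = DataLoader.prep_pdbbind_all(
        normalize_x_kwargs=None,  # forwarded to DataLoader.normalize_all_datasets
        normalize_y_kwargs=None)
    print(train.name)  # 'pdbbind'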
Example #2
 def prep_cpuactiv(category, seed) -> Dataset[np.ndarray]:
     with open('{}/cpuactiv/cpu_act{}.npz'.format(DATASETS_ROOT, seed - 1),
               'rb') as f:
         npz = np.load(f)
         x, y = npz[f'X_{category}'].astype(
             np.float32), npz[f'y_{category}'].astype(np.float32)
     return Dataset(x, y, name='cpuactiv', copy=False, autoshrink_y=True)
Example #3
 def prep_abalone(category, seed) -> Dataset[np.ndarray]:
     # Open via a context manager so the file handle is closed; the npz members
     # are read while the file is still open, as in prep_cpuactiv.
     with open('{}/abalone/abalone{}.npz'.format(DATASETS_ROOT, seed - 1),
               'rb') as f:
         npz = np.load(f)
         x, y = npz[f'X_{category}'].astype(
             np.float32), npz[f'y_{category}'].astype(np.float32)
     return Dataset(x, y, name='abalone', copy=False, autoshrink_y=True)
Example #4
 def prep_ctslice(category, seed) -> Dataset[np.ndarray]:
     raise NotImplementedError  # ctslice prep is disabled; the loading code below is currently unreachable
     with open('{}/ctslice/ctslice{}.npz'.format(DATASETS_ROOT, seed - 1),
               'rb') as f:
         npz = np.load(f)
         x, y = npz[f'X_{category}'].astype(
             np.float32), npz[f'y_{category}'].astype(np.float32)
     return Dataset(x, y, name='ctslice', copy=False, autoshrink_y=True)
Example #5
    def prep_ailerons(category: str) -> Dataset[np.ndarray]:
        df = pd.read_csv(f'{DATASETS_ROOT}/ailerons/ailerons.{category}',
                         sep=',',
                         header=None)

        x = np.array(df.iloc[:, :-1], dtype=np.float32)
        y = np.array(df.iloc[:, -1], dtype=np.float32) * 1e4
        return Dataset(x, y, name='ailerons', copy=False, autoshrink_y=True)
Example #6
    def prep_yah(category: Optional[str]) -> Dataset[np.ndarray]:
        if category in ['train', 'test', 'val']:
            x = np.load('{}/yah/yah-{}-x.npy'.format(
                DATASETS_ROOT, category)).astype(np.float32)
            y = np.load('{}/yah/yah-{}-y.npy'.format(
                DATASETS_ROOT, category)).astype(np.float32)

            return Dataset(x, y, name='yah', copy=False, autoshrink_y=True)
        else:
            raise ValueError('category must be in ["train", "test", "val"]')
Example #7
    def prep_year(category: Optional[str]) -> Dataset[np.ndarray]:
        if category in ['train', 'test']:
            x = np.load(f'{DATASETS_ROOT}/year/year-{category}-x.npy').astype(
                np.float32)
            y = np.load(f'{DATASETS_ROOT}/year/year-{category}-y.npy').astype(
                np.float32)

            return Dataset(x, y, name='year', copy=False, autoshrink_y=True)
        else:
            raise ValueError('category must be in ["train", "test"]')
Example #8
    def prep_hiv(category: Optional[str]) -> Dataset[np.ndarray]:
        if category in ['train', 'val', 'test']:
            if category == 'val':
                category = 'valid'

            url = rf'https://raw.githubusercontent.com/guanghelee/iclr20-lcn/master/data/HIV_split/{category}.fgp2048.csv'
            path = f'{DATASETS_ROOT}/hiv/{category}.fgp2048.csv'
            DataLoader.download(url, path)

            df = pd.read_csv(path)
            x = np.array([[int(j) for j in i]
                          for i in df['smiles'].values]).astype(np.float32)
            y = df['HIV_active'].values.astype(np.int64)

            return Dataset(x, y, name='hiv', copy=False, autoshrink_y=True)
        else:
            raise ValueError('category must be in ["train", "test", "val"]')
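Despite its name, the 'smiles' column of {category}.fgp2048.csv evidently holds fingerprint bit-strings (one character per bit), and the nested comprehension above expands each bit into a feature column. A self-contained illustration with made-up data:

    # Made-up data illustrating how a bit-string column becomes a float matrix.
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'smiles': ['0101', '1100'], 'HIV_active': [0, 1]})
    x = np.array([[int(bit) for bit in fp] for fp in df['smiles'].values],
                 dtype=np.float32)
    y = df['HIV_active'].values.astype(np.int64)
    print(x)  # [[0. 1. 0. 1.]
              #  [1. 1. 0. 0.]]
    print(y)  # [0 1]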
Example #9
    def _prep_generic(dataset_name: str,
                      category: Optional[str]) -> Dataset[np.ndarray]:
        if category in ['train', 'val', 'test']:
            x = np.load(
                f'{DATASETS_ROOT}/{dataset_name}/{dataset_name}-{category}-x.npy'
            ).astype(np.float32)
            y = np.load(
                f'{DATASETS_ROOT}/{dataset_name}/{dataset_name}-{category}-y.npy'
            ).astype(np.int64 if DataLoader.
                     is_classification(dataset_name) else np.float32)

            return Dataset(x,
                           y,
                           name=dataset_name,
                           copy=False,
                           autoshrink_y=True)

        else:
            raise ValueError('category must be in ["train", "test", "val"]')
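Examples #2 through #7 are dataset-specific instances of the convention that `_prep_generic` captures: one `.npy` pair per split, named `<name>-<split>-x.npy` / `<name>-<split>-y.npy`, with labels stored as int64 for classification datasets and float32 for regression. A toy round-trip of that convention; the directory and arrays below are made up for illustration:

    # Hypothetical round-trip of the <name>-<split>-{x,y}.npy convention.
    import os
    import tempfile
    import numpy as np

    root = tempfile.mkdtemp()
    name, split = 'toy', 'train'
    os.makedirs(f'{root}/{name}', exist_ok=True)
    np.save(f'{root}/{name}/{name}-{split}-x.npy',
            np.random.rand(5, 3).astype(np.float32))
    np.save(f'{root}/{name}/{name}-{split}-y.npy',
            np.array([0, 1, 0, 1, 1], dtype=np.int64))

    x = np.load(f'{root}/{name}/{name}-{split}-x.npy').astype(np.float32)
    y = np.load(f'{root}/{name}/{name}-{split}-y.npy').astype(np.int64)  # float32 for regression
    print(x.shape, y.dtype)  # (5, 3) int64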
Example #10
    def read_libsvm_format(
            file_path: str,
            n_features: int,
            n_classes: int,
            name: str = '',
            shuffle_seed: Optional[int] = 1) -> Dataset[np.ndarray]:
        is_classification = (n_classes > 0)

        with open(file_path, 'r') as f:
            content = f.read()
        # The single replace(': ', ':') below cannot repair a colon followed by
        # two spaces, so bail out early if that pattern appears.
        assert ':  ' not in content, 'Error while reading: {}'.format(
            file_path)

        content = content.replace(': ', ':')
        content = content.strip()
        lines = content.split('\n')
        lines = [line.strip() for line in lines]

        x = np.zeros((len(lines), n_features), dtype=np.float32)
        y = np.zeros((len(lines), ),
                     dtype=np.int64 if is_classification else np.float32)

        for line_idx, line in enumerate(lines):
            for unit_idx, unit in enumerate(line.split()):
                if unit_idx == 0:
                    assert ':' not in unit
                    if is_classification:
                        y[line_idx] = int(unit.strip())
                    else:
                        y[line_idx] = float(unit.strip())
                else:
                    feat, val = unit.strip().split(':')
                    feat: int = int(feat)
                    val: float = float(val)
                    x[line_idx][feat - 1] = val

        if is_classification:
            # To get classes in [0..n_classes-1]
            y = y - np.min(y)

        return Dataset(x, y, name=name, copy=False).shuffle(seed=shuffle_seed)
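For reference, a worked example of the LIBSVM line format that the parser above consumes: the first token is the label, followed by 1-indexed feature:value pairs, and unspecified features stay zero. The line below is invented:

    # Standalone illustration of parsing one LIBSVM-format line.
    import numpy as np

    line = '2 1:0.5 7:1.25'  # label, then 1-based feature:value pairs
    n_features = 8

    tokens = line.split()
    label = int(tokens[0])  # use float(...) instead for regression targets
    row = np.zeros(n_features, dtype=np.float32)
    for unit in tokens[1:]:
        feat, val = unit.split(':')
        row[int(feat) - 1] = float(val)  # features are 1-indexed in the file

    print(label, row)  # 2 [0.5 0. 0. 0. 0. 0. 1.25 0.]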
Example #11
def get_best_model(
        log_path: str,
        mss: ModelSearchSet,
        train_data: Dataset,
        validn_data: Dataset,
        test_data: Dataset,
        get_dataset_kwargs: Dict[str, Any],
        exp_dir: str = '../out',
        devices_info: List[Tuple[int, int]] = [(-1, 1)],  # (device_id, max_proc_cnt), -1 for cpu
        show_hpsearch_stats: bool = True):
    logger = Logger(log_path)
    if LOG_DATASET_STATS:
        logger.log(train_data.get_stats(title='Train Data'))
        logger.log(validn_data.get_stats(title='Validation Data'))
        logger.log(test_data.get_stats(title='Test Data'))

    # Prepare process-specific device ids, hyperparams and datasets
    proc_device_id: List[int]
    nprocs: int

    # Prep proc_device_id
    proc_device_id = []
    device_ids_helper = {}
    for info in devices_info:
        device_ids_helper[info[0]] = info[1]
    rem = True
    while rem:
        rem = False
        for device_id in device_ids_helper:
            if device_ids_helper[device_id] > 0:
                rem = True
                proc_device_id.append(device_id)
                device_ids_helper[device_id] -= 1

    # Prep proc_hps
    proc_hps = []
    if mss.model_class not in CPU_MODELS:
        total_proc_cnt = len(proc_device_id)
        proc_hp_cnt = np.full(total_proc_cnt, len(mss.hps) // total_proc_cnt)
        proc_hp_cnt[:len(mss.hps) % total_proc_cnt] += 1
        agg = 0
        for cnt in proc_hp_cnt:
            if cnt > 0:
                proc_hps.append(mss.hps[agg:agg + cnt])
                agg += cnt
    else:
        proc_hps = [mss.hps]
    nprocs = len(proc_hps)
    proc_device_id = proc_device_id[:nprocs]

    # Run the processes
    l: 'multiprocessing.synchronize.Lock' = Lock()
    cmlock: 'multiprocessing.synchronize.Lock' = Lock()
    started_hps = Value('i', 0)
    started_hps_lk: 'multiprocessing.synchronize.Lock' = Lock()
    finished_hps = Value('i', 0)
    total_hps = len(mss.hps)
    start_time = time.time()
    assert (train_data.name
            == validn_data.name) and (validn_data.name == test_data.name) and (
                test_data.name == get_dataset_kwargs['dataset_name'])
    assert (train_data.shuffle_seed == validn_data.shuffle_seed) and (
        validn_data.shuffle_seed
        == test_data.shuffle_seed) and (test_data.shuffle_seed
                                        == get_dataset_kwargs['shuffle_seed'])

    model_search_summary_path = '{}/{}/{}-search-summary{}{}.csv'.format(
        exp_dir, EXP_LOGS_DIR, get_dataset_kwargs['dataset_name'], DESC_SEP,
        exp_dir.split(DESC_SEP)[-1])
    assert not os.path.exists(model_search_summary_path)

    logger.log(f'\nExperiment dir: {os.path.abspath(exp_dir)}\n')
    logger.log(f'Hyperparam configs: {total_hps}\n')
    logger.log(f'Number of processes: {nprocs}\n')

    args = (l, cmlock, started_hps, started_hps_lk, finished_hps, total_hps,
            start_time, mss.model_class, proc_hps, train_data, validn_data,
            test_data, get_dataset_kwargs, exp_dir, proc_device_id, log_path,
            model_search_summary_path, mss.use_lforb)
    if mss.model_class not in CPU_MODELS:
        spawn(get_best_model_aux, args=args, nprocs=nprocs)
    else:
        get_best_model_aux(0, *args)

    if show_hpsearch_stats:
        hpsearch_stats(model_search_summary_path,
                       get_dataset_kwargs['dataset_name'])

    logger.log('\n==================================\n')
    logger.log('Model search summary saved to: {}\n'.format(
        os.path.abspath(model_search_summary_path)))
    logger.log('==================================\n\n')
    logger.close()
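The devices_info bookkeeping above interleaves devices round-robin, so worker slots are spread across devices before any single device reaches its max_proc_cnt. A standalone sketch of that expansion; the inputs below are hypothetical:

# Standalone sketch of the round-robin expansion of devices_info.
# (-1, 1) means one CPU worker; (0, 2) means up to two workers on GPU 0.
from typing import List, Tuple

def expand_devices(devices_info: List[Tuple[int, int]]) -> List[int]:
    remaining = {device_id: max_proc_cnt for device_id, max_proc_cnt in devices_info}
    proc_device_id: List[int] = []
    progressed = True
    while progressed:
        progressed = False
        for device_id in remaining:
            if remaining[device_id] > 0:
                proc_device_id.append(device_id)
                remaining[device_id] -= 1
                progressed = True
    return proc_device_id

print(expand_devices([(-1, 1), (0, 2)]))  # [-1, 0, 0]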
Example #12
def get_best_model_aux(
        proc_num: int,
        l: 'multiprocessing.synchronize.Lock',  # lock for coordinating MAIN_PROGRESS_FILE logging
        cmlock: 'multiprocessing.synchronize.Lock',  # common lock for coordinating all other logging (typically small messages)
        started_hps: Value,
        started_hps_lk: 'multiprocessing.synchronize.Lock',
        finished_hps: Value,
        total_hps: int,
        start_time: float,
        model_class: Type[LearnablePredictor],
        proc_hps: List[List[Dict[str, Any]]],
        train_data: Dataset,
        validn_data: Dataset,
        test_data: Dataset,
        get_dataset_kwargs: Dict[str, Any],
        exp_dir: str,
        proc_device_id: List[int],
        log_path: str,
        model_search_summary_path: str,
        use_lforb: bool):
    # Prep process-specific data
    hps = proc_hps[proc_num]
    device_ids = None if proc_device_id[proc_num] == -1 else [
        proc_device_id[proc_num]
    ]

    assert train_data.n_labels == 1
    compute_auc = Utils.is_binary_labels(train_data.to_ndarray()['y'])

    Utils.cmlog = functools.partial(Utils.safe_log,
                                    path='{}/{}/cm.log'.format(
                                        exp_dir, EXP_LOGS_DIR),
                                    l=cmlock)

    logger = Logger(log_path)
    for i, hp in enumerate(hps):
        with started_hps_lk:
            config_idx = started_hps.value
            started_hps.value += 1

        hp = hp.copy()
        seed = hp['seed']
        # print('seed: {}, type(seed): {}'.format(seed, type(seed)), flush=True)
        np.random.seed(seed)
        torch.manual_seed(seed)
        hp.pop('seed')

        # Prep
        try:
            model = model_class(**hp, device_ids=device_ids)  # type: ignore
        except TypeError:
            model = model_class(**hp)  # type: ignore
        model._use_lforb = use_lforb
        config, config_dir, img_save_prefix = setup_config_dir(
            model, train_data.name, train_data.shuffle_seed, exp_dir,
            config_idx)

        # Save data for easy reproducibility (currently used in mstd computation)
        repro_data = {
            'get_dataset_kwargs': get_dataset_kwargs,
            'model_class': model_class,
            'seed': seed,
            'hp': hp
        }
        with open('{}/{}.pkl'.format(config_dir, CONFIG_REPRO_DATA_FILE),
                  'wb') as f:
            pickle.dump(repro_data, f, protocol=pickle.HIGHEST_PROTOCOL)

        # Set what model.acc() should do
        acc_func, acc_func_type = Utils.get_acc_def(
            DataLoader.is_classification(get_dataset_kwargs['dataset_name']),
            hp.get('criterion', None))
        model.acc_func = acc_func
        model.acc_func_type = acc_func_type

        model.train(train_data, validn_data, test_data)
        stats = process_model(model, train_data, validn_data, test_data,
                              config_dir, config_idx, seed, compute_auc)

        # Log
        l.acquire()

        finished_hps.value += 1
        logger.log('\n>> [{:.2f}% ({}/{})]:\nRan: (cidx={}): {}\n'.format(
            finished_hps.value * 100 / total_hps, finished_hps.value,
            total_hps, config_idx, config))
        logger.log('Config dir: {}\n'.format(os.path.abspath(config_dir)))

        logger.log('\ntrain_acc={:.5f}%\n'.format(stats['train_acc']))
        logger.log('validn_acc={:.5f}%\n'.format(stats['validn_acc']))
        logger.log('test_acc={:.5f}%\n'.format(stats['test_acc']))
        if compute_auc:
            logger.log('train_auc={:.5f}\n'.format(stats['train_auc']))
            logger.log('validn_auc={:.5f}\n'.format(stats['validn_auc']))
            logger.log('test_auc={:.5f}\n'.format(stats['test_auc']))

        if isinstance(model, DTExtractablePredictor):
            if not model._is_pure_dt:
                logger.log('dt_train_acc={:.5f}%\n'.format(
                    stats['dt_train_acc']))
                logger.log('dt_validn_acc={:.5f}%\n'.format(
                    stats['dt_validn_acc']))
                logger.log('dt_test_acc={:.5f}%\n'.format(
                    stats['dt_test_acc']))
                if compute_auc:
                    logger.log('dt_train_auc={:.5f}\n'.format(
                        stats['dt_train_auc']))
                    logger.log('dt_validn_auc={:.5f}\n'.format(
                        stats['dt_validn_auc']))
                    logger.log('dt_test_auc={:.5f}\n'.format(
                        stats['dt_test_auc']))

        if isinstance(model, DTExtractablePredictor) and isinstance(
                model, DGTPredictor):
            if not model._is_pure_dt:
                logger.log('cdt_train_acc={:.5f}%\n'.format(
                    stats['cdt_train_acc']))
                logger.log('cdt_validn_acc={:.5f}%\n'.format(
                    stats['cdt_validn_acc']))
                logger.log('cdt_test_acc={:.5f}%\n'.format(
                    stats['cdt_test_acc']))

        total_time = time.time() - start_time
        per_hp_time = total_time / finished_hps.value
        rem_time = per_hp_time * (total_hps - finished_hps.value)
        logger.log('Time: per hp={}, total={}, rem={}\n'.format(
            td(seconds=per_hp_time), td(seconds=total_time),
            td(seconds=rem_time)))

        Utils.append_linedict_to_csv(stats, model_search_summary_path)
        l.release()

        if use_lforb:
            assert hp['use_last_model']

            old_logs_dir = model.logs_dir
            config_idx += total_hps
            _, config_dir, _ = setup_config_dir(model, train_data.name,
                                                train_data.shuffle_seed,
                                                exp_dir, config_idx)
            model.load_best_model(old_logs_dir)

            hp['use_last_model'] = False
            repro_data = {
                'get_dataset_kwargs': get_dataset_kwargs,
                'model_class': model_class,
                'seed': seed,
                'hp': hp
            }
            with open('{}/{}.pkl'.format(config_dir, CONFIG_REPRO_DATA_FILE),
                      'wb') as f:
                pickle.dump(repro_data, f, protocol=pickle.HIGHEST_PROTOCOL)

            stats = process_model(model, train_data, validn_data, test_data,
                                  config_dir, config_idx, seed, compute_auc)

            l.acquire()
            Utils.append_linedict_to_csv(stats, model_search_summary_path)
            l.release()

    logger.close()
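The try/except TypeError around the model constructor above lets one code path handle both models that accept device_ids and models that do not take that argument. A minimal sketch of the same fallback, using stand-in classes rather than the project's model types:

# Minimal sketch of the constructor fallback; both classes are stand-ins.
class GpuModel:
    def __init__(self, lr: float, device_ids=None):
        self.lr, self.device_ids = lr, device_ids

class CpuOnlyModel:
    def __init__(self, lr: float):
        self.lr = lr

def build(model_class, hp, device_ids):
    try:
        return model_class(**hp, device_ids=device_ids)
    except TypeError:  # model_class does not accept device_ids
        return model_class(**hp)

print(build(GpuModel, {'lr': 0.1}, [0]).device_ids)  # [0]
print(type(build(CpuOnlyModel, {'lr': 0.1}, [0])))   # <class '__main__.CpuOnlyModel'>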