def my_callback_func(cur_param_vec,
                         is_init=False,
                         alg_state_kwargs=alg_state_kwargs):
        # Update step counter, timer, etc.
        if not is_init:
            alg_state_kwargs.update(
                update_alg_state_kwargs(**alg_state_kwargs))
        if do_print_now(**alg_state_kwargs) or do_save_now(**alg_state_kwargs):
            cur_loss_val = loss_func_wrt_paramvec_and_step(cur_param_vec)
            alg_state_kwargs['cur_loss_val'] = cur_loss_val

        if do_print_now(**alg_state_kwargs):
            pprint(make_status_string(
                **alg_state_kwargs))  # assume cur_loss_val is inside
            save_status_to_txt_files(**alg_state_kwargs)
            alg_state_kwargs.update(
                update_alg_state_kwargs_after_print(**alg_state_kwargs))

        if do_save_now(**alg_state_kwargs):
            param_dict = param_tfm_manager.unflatten_to_common_param_dict(
                cur_param_vec, **dim_P)
            if save_func_wrt_param_dict is not None:
                save_func_wrt_param_dict(param_dict=param_dict,
                                         **alg_state_kwargs)
            if callback_func_wrt_param_dict is not None:
                callback_func_wrt_param_dict(
                    param_dict=param_dict,
                    losstrain_ttl=alg_state_kwargs.get('cur_loss_val',
                                                       init_loss_val),
                    alg_state_kwargs=alg_state_kwargs,
                    **callback_kwargs)
            alg_state_kwargs.update(
                update_alg_state_kwargs_after_save(**alg_state_kwargs))
Example #2
def load_df_from_all_folders_matching_list_of_patterns(
        list_of_path_patterns=None,
        legend_name=None,
        y_ind=0,
        column_names=None,
        query_str=None,
        task_ids=None,
        **kwargs):
    pprint(">>> BEGIN load_df_from_all_folders_that_match_pattern")
    list_of_match_df = list()
    for path_pattern in list_of_path_patterns:
        cur_alg_df = load_df_from_all_folders_that_match_pattern(
            path_pattern,
            y_ind=y_ind,
            task_ids=task_ids,
            column_names=column_names)
        if query_str is not None:
            cur_alg_df = cur_alg_df.query(query_str).copy()

        # Append to list of all matching dataframes
        list_of_match_df.append(cur_alg_df)

    # Concatenate all matching DataFrames into one
    all_matching_runs_df = pd.concat(list_of_match_df)
    pprint("<<< END   load_df_from_all_folders_matching_list_of_patterns")
    return all_matching_runs_df
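
A minimal usage sketch: each glob pattern below is a hypothetical placeholder for a folder of experiment results; one DataFrame is loaded per pattern via load_df_from_all_folders_that_match_pattern, then all are concatenated.

# Hypothetical result paths, column names, and query for illustration only
all_df = load_df_from_all_folders_matching_list_of_patterns(
    list_of_path_patterns=['/results/alg_A-*/', '/results/alg_B-*/'],
    column_names=['LAP', 'Y_ERROR_RATE'],
    query_str="LAP > 10")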
Example #3
def make_best_task_df(df,
                      target_query="SPLIT_NAME == 'VALID' and LAP > 50",
                      score_colname='Y_ERROR_RATE',
                      score_ranking_func=np.argmin,
                      default_score=None,
                      verbose=False):
    ''' Find best task for each unique job in provided df.

    Returns
    -------
    best_df : dataframe of best tasks for each unique job
    '''
    if default_score is None:
        default_score = fetch_default_score(score_ranking_func.__name__)
    best_task_df_list = list()
    job_paths = np.unique(df['JOB_PATH'].values)
    for job_path in job_paths:
        if job_path is None:
            continue
        job_df = df.query("JOB_PATH == '%s'" % job_path)
        taskids = np.unique(job_df['TASKID'].values)
        best_score_idx = np.zeros_like(taskids, dtype=np.int32)
        best_score = default_score * np.ones_like(taskids, dtype=np.float64)
        for tt, taskidstr in enumerate(taskids):
            task_df = job_df.query(target_query +
                                   " and TASKID == '%s'" % taskidstr)
            if task_df.shape[0] < 1:
                continue
            if not np.all(np.isfinite(task_df[score_colname].values)):
                pprint(task_df[score_colname].values)
            best_score_idx[tt] = score_ranking_func(
                task_df[score_colname].values)
            best_score[tt] = task_df[score_colname].values[best_score_idx[tt]]
        best_task_idx = score_ranking_func(best_score)
        best_task_df = job_df.query("TASKID == '%s'" % taskids[best_task_idx])
        best_task_df_list.append(best_task_df)
        if verbose:
            pprint(job_path)
            pprint("best task: %s" % best_task_idx)
    return pd.concat(best_task_df_list)
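
fetch_default_score is not defined in this section; a minimal sketch consistent with its use above would map the ranking function's name to a worst-case sentinel, so that jobs whose tasks have no matching rows can never win the ranking:

import numpy as np

def fetch_default_score(ranking_func_name):
    # Hypothetical helper: argmin means lower is better, so the sentinel
    # default is +inf; otherwise assume higher is better and use -inf.
    if ranking_func_name == 'argmin':
        return np.inf
    return -np.inf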
Example #4
def minimize(
        loss_func_wrt_paramvec_and_step=None,
        grad_func_wrt_paramvec_and_step=None,
        save_func_wrt_param_dict=None,
        callback_func_wrt_param_dict=None,
        callback_kwargs=None,
        param_tfm_manager=None,
        dim_P=None,
        init_param_dict=None,
        step_direction='steepest',
        step_size=0.01,
        decay_rate=1.0,
        decay_interval=25,
        decay_staircase=0,
        b1=0.9,
        b2=0.999,
        eps=1e-8,
        max_l2_norm_of_grad_per_entry=10.0,
        **kwargs):
    pprint('[grad_descent_minimizer] Begin training...')
    pprint('--step_direction %s' % step_direction)
    pprint('--step_size  %.3f' % step_size)
    pprint('--decay_rate %.3f' % decay_rate)

    # Parse user input
    step_direction = str(step_direction)
    assert step_direction in ['adam', 'steepest']
    step_size = float(step_size)
    decay_rate = float(decay_rate)
    decay_staircase = int(decay_staircase)
    decay_interval = float(decay_interval)
    b1 = float(b1)
    b2 = float(b2)
    eps = float(eps)

    # Convert provided common param dict
    # to a flat 1D array with unconstrained values
    param_vec = param_tfm_manager.flatten_to_differentiable_param_vec(
        init_param_dict,
        **dim_P)

    # Warmup
    start_time_sec = time.time()
    init_loss_val = loss_func_wrt_paramvec_and_step(param_vec, step_id=0)
    loss_eval_time_sec = time.time() - start_time_sec
    pprint("Loss     @ init: %8.3f sec | val %.6e" % (
        loss_eval_time_sec, init_loss_val))
    pprint("Params   @ init: %8s     | %5d params | l2 norm / entry %.4e" % (
        ' ',
        param_vec.size,
        calc_l2_norm_of_vector_per_entry(param_vec)))

    start_time_sec = time.time()
    init_grad_vec = grad_func_wrt_paramvec_and_step(param_vec, step_id=0)
    elapsed_time_sec = time.time() - start_time_sec
    init_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(init_grad_vec)
    pprint("Gradient @ init: %8.3f sec | %5d params | l2 norm / entry %.4e" % (
        elapsed_time_sec, init_grad_vec.size, init_grad_norm_per_entry))

    # Create settings that track algorithm state
    # cur_step, cur_lap, n_laps, n_steps, etc.
    alg_state_kwargs = init_alg_state_kwargs(
        cur_step=0.0,
        **kwargs)
    n_steps = alg_state_kwargs['n_steps']    
    if 'output_path' in alg_state_kwargs:
        laps_to_save_str, steps_to_save_str = calc_laps_when_snapshots_saved(
            return_str=True,
            keep_first=5,
            keep_last=5,
            **alg_state_kwargs)
        pprint("Snapshots will be saved at intervals:")
        pprint("   laps: %s" % laps_to_save_str)
        pprint("  steps: %s" % steps_to_save_str)
        pprint("Snapshot saved to --output_path:\n%s" % (
            alg_state_kwargs['output_path']))

    # Adam estimates of gradient mean/variance
    m = np.zeros_like(param_vec)
    v = np.zeros_like(param_vec)

    cur_step_size = step_size
    cur_loss_val = init_loss_val
    cur_grad_norm_per_entry = init_grad_norm_per_entry
    for step_id in range(0, n_steps + 1):
        if step_id > 0:
            grad_vec = grad_func_wrt_paramvec_and_step(param_vec, step_id=step_id)

            cur_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(grad_vec)
            assert np.isfinite(cur_grad_norm_per_entry)
            if cur_grad_norm_per_entry > max_l2_norm_of_grad_per_entry:
                warn_msg = (
                    'WARNING: gradient clipping enforced.'
                    + '\n cur l2 norm / entry = %.2e'
                    + '\n new l2 norm / entry = %.2e')
                pprint(warn_msg % (
                    cur_grad_norm_per_entry,
                    max_l2_norm_of_grad_per_entry))
                grad_vec *= max_l2_norm_of_grad_per_entry / cur_grad_norm_per_entry
                cur_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(grad_vec)
            assert cur_grad_norm_per_entry <= max_l2_norm_of_grad_per_entry

            # Decay learning rate, like tensorflow's exponential decay
            if decay_staircase:
                cur_step_count = int(step_id) // int(decay_interval)
            else:
                cur_step_count = float(step_id) / float(decay_interval)
            cur_step_size = step_size * decay_rate ** (cur_step_count)

            if step_direction == 'adam':
                g = grad_vec
                m = (1 - b1) * g      + b1 * m  # First  moment estimate.
                v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
                mhat = m / (1 - b1**(step_id))    # Bias correction.
                vhat = v / (1 - b2**(step_id))
                step_vec = -1.0 * cur_step_size * mhat / (np.sqrt(vhat) + eps)
            elif step_direction == 'steepest':
                step_vec = -1.0 * cur_step_size * grad_vec
            else:
                raise ValueError("Unrecognized step_direction: %s" % step_direction)
            param_vec = param_vec + step_vec
            assert np.all(np.isfinite(param_vec))

            # Update step counter, timer, etc.
            alg_state_kwargs = update_alg_state_kwargs(
                **alg_state_kwargs)

        if do_print_now(**alg_state_kwargs):
            cur_loss_val = loss_func_wrt_paramvec_and_step(param_vec, step_id=step_id)
            pprint(make_status_string(
                cur_loss_val=cur_loss_val,
                cur_grad_norm_per_entry=cur_grad_norm_per_entry,
                **alg_state_kwargs))
            save_status_to_txt_files(
                cur_loss_val=cur_loss_val,
                cur_grad_norm_per_entry=cur_grad_norm_per_entry,
                cur_step_size=cur_step_size,
                **alg_state_kwargs)
            alg_state_kwargs = update_alg_state_kwargs_after_print(
                **alg_state_kwargs)

        if do_save_now(**alg_state_kwargs):
            param_dict = param_tfm_manager.unflatten_to_common_param_dict(
                param_vec, **dim_P)
            if save_func_wrt_param_dict is not None:
                save_func_wrt_param_dict(
                    param_dict=param_dict,
                    **alg_state_kwargs)
            if callback_func_wrt_param_dict is not None:
                callback_func_wrt_param_dict(
                    param_dict=param_dict,
                    losstrain_ttl=cur_loss_val,
                    alg_state_kwargs=alg_state_kwargs,
                    **callback_kwargs)
            alg_state_kwargs = update_alg_state_kwargs_after_save(
                **alg_state_kwargs)

    param_dict = param_tfm_manager.unflatten_to_common_param_dict(
        param_vec, **dim_P)
    pprint('[grad_descent_minimizer] Done with training.')
    return param_dict, alg_state_kwargs    
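
For readers who want the Adam update in isolation, here is a self-contained sketch of the same arithmetic used in the loop above (step_id must start at 1 for the bias correction to be defined):

import numpy as np

def adam_update(param_vec, grad_vec, m, v, step_id,
                step_size=0.01, b1=0.9, b2=0.999, eps=1e-8):
    # Exponential moving averages of the gradient and its elementwise square
    m = (1 - b1) * grad_vec + b1 * m
    v = (1 - b2) * (grad_vec ** 2) + b2 * v
    # Bias-corrected moment estimates
    mhat = m / (1 - b1 ** step_id)
    vhat = v / (1 - b2 ** step_id)
    return param_vec - step_size * mhat / (np.sqrt(vhat) + eps), m, v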
Example #5
def train_and_eval_clf_with_best_params_via_grid_search(
    classifier_name='logreg',
    param_grid_dict=None,
    datasets_by_split=None,
    verbose=True,
    feat_colnames=None,
    feat_preproc_grid_dict=None,
    y_col_id=0,
    y_orig_col_id=0,
    y_col_name='',
    output_path='/tmp/',
    max_grid_search_steps=None,
    class_weight_opts='',
    c_logspace_arg_str='',
    random_state=8675309,
    n_bootstraps=5000,
    seed_bootstrap=42,
    bootstrap_stratify_pos_and_neg=True,
):
    (make_classifier, score_classifier, calc_best_idx,
        make_clf_report, make_csv_row_dict, make_interp_report) = \
            make_constructor_and_evaluator_funcs(
                classifier_name,
                n_bootstraps=n_bootstraps,
                seed_bootstrap=seed_bootstrap,
                bootstrap_stratify_pos_and_neg=bootstrap_stratify_pos_and_neg)
    if param_grid_dict is None:
        param_grid_dict = default_param_grid(
            classifier_name, c_logspace_arg_str=c_logspace_arg_str)
        if class_weight_opts == 'balanced':
            if 'class_weight' in param_grid_dict:
                param_grid_dict['class_weight'].insert(0, 'balanced')
    if isinstance(feat_preproc_grid_dict, dict):
        param_grid_dict.update(feat_preproc_grid_dict)

    n_grid = 1
    for key, val_list in param_grid_dict.items():
        n_grid *= len(val_list)
    if verbose:
        if max_grid_search_steps:
            pprint('Max   configs in grid search: %d' % max_grid_search_steps)
        pprint('Total configs in grid search: %d' % n_grid)

    param_generator = make_param_dict_generator(param_grid_dict)

    clf_list = list()
    param_dict_list = list()
    score_list = list()
    start_time = time.time()

    x_tr, y_tr = make_nonnan_xy_for_target(datasets_by_split['train'],
                                           y_col_id)
    x_va, y_va = make_nonnan_xy_for_target(datasets_by_split['valid'],
                                           y_col_id)
    x_te, y_te = make_nonnan_xy_for_target(datasets_by_split['test'], y_col_id)
    for ii, param_dict in enumerate(param_generator):
        np.random.seed(random_state)
        clf = make_classifier(feat_colnames=feat_colnames,
                              random_state=random_state,
                              **param_dict)

        clf.fit(x_tr, y_tr)
        score = score_classifier(clf, x_va, y_va)
        clf_list.append(clf)
        score_list.append(score)
        param_dict_list.append(param_dict)

        if verbose:
            tr_score = score_classifier(clf, x_tr, y_tr)

            elapsed_time = time.time() - start_time
            param_str = str(param_dict)
            param_str = param_str.replace('),', '  ')
            for badstr in ['OrderedDict', '[', ']', '(', ')', ',']:
                param_str = param_str.replace(badstr, '')
            pprint("%4d/%d %10.2f sec va_auc %.4f   tr_auc %.4f  %s" %
                   (1 + ii, n_grid, elapsed_time, score, tr_score, param_str))

        if max_grid_search_steps and ((ii + 1) >= max_grid_search_steps):
            if verbose:
                pprint("Exceed max_grid_search_steps. Break!")
            break

    best_id = calc_best_idx(score_list)
    best_score = score_list[best_id]
    best_param_dict = param_dict_list[best_id]
    best_clf = clf_list[best_id]

    if verbose:
        pprint("------")
        pprint(" best param dict, using function " + calc_best_idx.__name__)
        pprint("------")
        pprint("va_auc = %.4f %s" % (best_score, str(best_param_dict)))

    ## Tune the decision threshold, if applicable
    if isinstance(best_clf.named_steps['clf'], ThresholdClassifier):
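        # NOTE: 'valid' is deliberately last in this list, so after the loop
        # yproba_class1 holds probabilities for the validation split
        # (the assert further below depends on this ordering).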
        for cur_split_name, x_split in [
            ('train', x_tr),
            ('test', x_te),
            ('valid', x_va),
        ]:
            yproba_class1 = best_clf.predict_proba(x_split)[:, 1]
            if verbose:
                pprint("Percentiles of clf Pr(y=1) on SPLIT = %s..." %
                       cur_split_name)
                perc_str_list = list()
                for perc in [0, 1, 10, 25, 50, 75, 90, 99, 100]:
                    perc_str = "%3d%% %.4f" % (
                        perc, np.percentile(yproba_class1, perc))
                    perc_str_list.append(perc_str)
                pprint("  " + " ".join(perc_str_list))

        ## DEPRECATED
        #thr_min = np.maximum(0.001, [1])
        #thr_max = np.minimum(0.999, np.unique(yproba_class1)[-2])
        #thr_grid = np.linspace(thr_min, thr_max, num=101)

        ## Grid search on validation over possible threshold values
        # Make sure all candidates at least provide
        # one instance of each class (positive and negative)
        assert cur_split_name == 'valid'
        nontrivial_thr_vals = np.unique(yproba_class1)[1:-1]

        if nontrivial_thr_vals.size > 100:
            # Too many possible thr values for a typical compute budget
            thr_grid = np.linspace(nontrivial_thr_vals[0],
                                   nontrivial_thr_vals[-1], 100)
        else:
            # Just look at all possible thresholds
            # that give distinct operating points.
            thr_grid = nontrivial_thr_vals
        if verbose:
            pprint("Searching thresholds...")
            pprint("thr_grid = %.4f, %.4f, %.4f ... %.4f, %.4f" %
                   (thr_grid[0], thr_grid[1], thr_grid[2], thr_grid[-2],
                    thr_grid[-1]))
        score_grid = np.zeros_like(thr_grid, dtype=np.float64)
        acc_grid = np.zeros_like(thr_grid, dtype=np.float64)
        tmp_clf = copy.deepcopy(best_clf)
        for gg, thr in enumerate(thr_grid):
            tmp_clf.named_steps['clf'].set_threshold(thr)
            yhat = tmp_clf.predict(x_va)
            score_grid[gg] = f1_score(y_va, yhat, pos_label=1)
            acc_grid[gg] = accuracy_score(y_va, yhat)
        gg_best = np.argmax(score_grid)
        best_clf.named_steps['clf'].set_threshold(thr_grid[gg_best])
        if verbose:
            pprint("------")
            pprint(" best threshold by f1 score on validation")
            pprint("------")
            pprint("thr = %.4f f1_score %.4f acc_score %.4f" % (
                thr_grid[gg_best],
                score_grid[gg_best],
                acc_grid[gg_best],
            ))

    if verbose:
        pprint('')
        pprint(make_clf_report(best_clf, x_tr, y_tr, y_col_name + '_train'))
        pprint(make_clf_report(best_clf, x_va, y_va, y_col_name + '_valid'))
        pprint(make_clf_report(best_clf, x_te, y_te, y_col_name + '_test'))
    ireport = make_interp_report(best_clf, feat_colnames, y_col_name)
    if len(ireport) > 0:
        clf_ireport_path = os.path.join(
            output_path, 'clf_%d_interpretation.txt' % (y_orig_col_id))
        with open(clf_ireport_path, 'w') as f:
            f.write(ireport)
        if verbose:
            pprint(ireport)

    # Write the classifier obj to disk
    if classifier_name != 'k_nearest_neighbors':
        clf_path = os.path.join(output_path,
                                'clf_%d_object.dump' % (y_orig_col_id))
        joblib.dump(best_clf, clf_path, compress=1)
        pprint("wrote clf object to file via joblib:")
        pprint(clf_path)

    clf_repr_path = os.path.join(output_path,
                                 'clf_%d_repr.txt' % (y_orig_col_id))
    with open(clf_repr_path, 'w') as f:
        f.write(repr(best_clf) + "\n")
    clf_repr_path = os.path.join(
        output_path, 'clf_%d_best_param_dict_repr.txt' % (y_orig_col_id))
    with open(clf_repr_path, 'w') as f:
        f.write(repr(best_param_dict) + "\n")

    if verbose:
        pprint("completed clf saving after %11.2f sec" %
               (time.time() - start_time))

    if os.path.exists(output_path):
        for ss, split in enumerate(['valid', 'test', 'train']):
            csv_fpath = os.path.join(
                output_path, 'clf_%d_callback_%s.csv' % (y_orig_col_id, split))

            x_cursplit, y_cursplit = make_nonnan_xy_for_target(
                datasets_by_split[split], y_col_id=y_col_id)
            row_dict = make_csv_row_dict(best_clf, x_cursplit, y_cursplit,
                                         y_col_name, split, classifier_name)
            csv_df = pd.DataFrame([row_dict], columns=row_dict.keys())
            csv_df.to_csv(csv_fpath, index=False)

            if hasattr(best_clf, 'predict_proba'):
                for nbins in [6, 10, 20]:
                    fig_fpath = os.path.join(
                        output_path, 'clf_%d_calibration_%02dbin_%s.pdf' %
                        (y_orig_col_id, nbins, split))

                    info_per_bin = calc_calibration_info(best_clf,
                                                         x_cursplit,
                                                         y_cursplit,
                                                         bins=nbins)
                    plot_binary_clf_calibration_curve_and_histograms(
                        info_per_bin=info_per_bin)
                    plt.savefig(fig_fpath, bbox_inches='tight', pad_inches=0)
            if verbose:
                elapsed_time = time.time() - start_time
                pprint("eval %d/%d on %5s split done after %11.2f sec" %
                       (ss + 1, 3, split, elapsed_time))
                pprint("wrote csv file: " + csv_fpath)
    return best_clf, best_param_dict
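
make_param_dict_generator is not shown in this section; a minimal sketch consistent with how it is consumed above (one OrderedDict per point of the Cartesian grid, matching the n_grid count) might be:

import itertools
from collections import OrderedDict

def make_param_dict_generator(param_grid_dict):
    # Hypothetical reconstruction: yield every combination of grid values,
    # preserving key order so printed configs stay readable.
    keys = list(param_grid_dict.keys())
    for vals in itertools.product(*(param_grid_dict[k] for k in keys)):
        yield OrderedDict(zip(keys, vals))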
Example #6
def read_args_from_stdin_and_run():
    ''' Main executable function to train and evaluate classifier.

    Post Condition
    --------------
    AUC and other eval info printed to stdout.
    Trained classifier object saved to --output_path via joblib
    (skipped for k_nearest_neighbors).
    '''
    if not sys.stdin.isatty():
        for line in sys.stdin.readlines():
            line = line.strip()
            sys.argv.extend(line.split(' '))
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder containing:" +
                        " *.npy files: X_train, y_train, P_train"
                        " *.txt files: X_colnames.txt and y_colnames.txt")
    parser.add_argument(
        '--output_path',
        default='/tmp/',
        type=str,
        help="Path to folder to hold output from classifier. Includes:" +
        " perf_metric*.txt files: auc_train.txt & auc_test.txt" +
        " settings.txt: description of all settings to reproduce.")
    parser.add_argument('--feature_arr_names',
                        type=str,
                        default='X',
                        help='Name of feature files to use for training')
    parser.add_argument('--features_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder with extra feature files")
    parser.add_argument(
        '--target_arr_name',
        default='Y',
        type=str,
    )
    parser.add_argument(
        '--target_names',
        default='all',
        type=str,
        help='Name of response/intervention to test.' +
        ' To try specific interventions, write names separated by commas.' +
        ' To try all interventions, use special name "all"')
    parser.add_argument(
        '--n_folds',
        default=1,
        type=int,
        help='Number of folds for cross validation during classification.')
    parser.add_argument('--classifier_name',
                        default='logistic_regression',
                        choices=[
                            'k_nearest_neighbors', 'mlp',
                            'logistic_regression', 'extra_trees',
                            'svm_with_linear_kernel', 'svm_with_rbf_kernel'
                        ],
                        help='Name of classifier')
    parser.add_argument(
        '--class_weight_opts',
        choices=['none', 'balanced'],
        default='none',
    )
    parser.add_argument('--max_grid_search_steps',
                        default=None,
                        type=int,
                        help='max number of steps for grid search')
    parser.add_argument('--frac_labels_train',
                        default=1.0,
                        type=float,
                        help='Fraction of the training data to use')
    parser.add_argument('--c_logspace_arg_str',
                        default="-6,4,7",
                        type=str,
                        help='Comma-sep list of args to np.logspace')
    parser.add_argument('--seed',
                        default=8675309,
                        type=int,
                        help='Seed for random number generation')
    parser.add_argument('--seed_bootstrap',
                        default=42,
                        type=int,
                        help='Seed for bootstrap')
    parser.add_argument('--n_bootstraps',
                        default=5000,
                        type=int,
                        help='Number of samples for bootstrap conf. intervals')
    parser.add_argument('--bootstrap_stratify_pos_and_neg',
                        default=True,
                        type=int,
                        help='Whether to stratify examples or not')
    args, unk_list = parser.parse_known_args()
    arg_dict = vars(args)

    dataset_path = arg_dict['dataset_path']
    for key, val in arg_dict.items():
        if arg_dict['output_path'].count('$' + key):
            arg_dict['output_path'] = \
                arg_dict['output_path'].replace('$' + key, str(val))
    if not os.path.exists(arg_dict['output_path']):
        mkpath(arg_dict['output_path'])

    config_pprint_logging(arg_dict['output_path'],
                          txtfile='stdout_%s.txt' % arg_dict['target_names'])
    pprint('[run_classifier says:] Parsing args ...')

    # Parse possible preprocessors
    feat_preproc_grid_dict = dict()
    for key, val in zip(unk_list[::2], unk_list[1::2]):
        if key.startswith('--preproc_'):
            feat_preproc_grid_dict[key[2:]] = str(val).split(',')
            pprint(key + " : " + val)
            arg_dict[key[2:]] = val

    for key in feat_preproc_grid_dict.keys():
        ii = unk_list.index('--' + key)
        del unk_list[ii + 1]
        del unk_list[ii]
    if len(unk_list) > 0:
        pprint("UNKNOWN ARGS (ignored)")
        for key in unk_list:
            pprint(key)

    # Set default seed for numpy
    np.random.seed(arg_dict['seed'])

    # Write parsed args to plain-text file
    # so we can exactly reproduce later
    with open(os.path.join(arg_dict['output_path'], 'settings.txt'), 'w') as f:
        for key, val in arg_dict.items():
            f.write(key + ' = ' + str(val) + '\n')
            pprint(key + ' = ' + str(val))
    with open(os.path.join(arg_dict['output_path'], 'args.txt'), 'w') as f:
        for key, val in arg_dict.items():
            f.write('--' + key + ' ' + str(val) + '\n')
    pprint('')

    feat_path_list = [arg_dict['dataset_path'], arg_dict['features_path']]

    pprint('[run_classifier says:] Loading dataset ...')
    start_time = time.time()
    feature_arr_names = arg_dict['feature_arr_names'].split(',')
    pprint('feature_arr_names:')
    feat_colnames_by_arr = OrderedDict()
    for feat_arr_name in feature_arr_names:
        pprint(feat_arr_name)
        cur_feat_colnames = None
        for feat_path in feat_path_list:
            colname_fpath = os.path.join(feat_path,
                                         feat_arr_name + '_colnames.txt')
            if os.path.exists(colname_fpath):
                cur_feat_colnames = \
                    [str(feat_arr_name + ":") + s
                        for s in load_list_of_unicode_from_txt(colname_fpath)]
                break
        feat_colnames_by_arr[feat_arr_name] = cur_feat_colnames

    target_arr_name = arg_dict['target_arr_name']
    all_target_names = load_list_of_strings_from_txt(
        os.path.join(arg_dict['dataset_path'],
                     target_arr_name + '_colnames.txt'))

    target_names = arg_dict['target_names']
    if target_names == 'all':
        target_names = all_target_names
        target_cols = np.arange(len(all_target_names)).tolist()
    else:
        target_names = target_names.split(',')
        target_cols = list()
        for name in target_names:
            assert name in all_target_names
            target_cols.append(all_target_names.index(name))

    datasets_by_split = dict()
    for split_name in ['train', 'valid', 'test']:
        datasets_by_split[split_name] = dict()
        split_dataset = datasets_by_split[split_name]

        # Load Y
        dense_fpath = os.path.join(dataset_path,
                                   target_arr_name + "_%s.npy" % split_name)
        y = np.asarray(np.load(dense_fpath), order='C',
                       dtype=np.float32)  # 0/1/nan
        if y.ndim < 2:
            y = y[:, np.newaxis]
        assert y.ndim == 2
        assert y.shape[1] == len(all_target_names)
        split_dataset['y'] = y[:, target_cols]
        assert split_dataset['y'].shape[1] == len(target_cols)

        # Load X
        x_list = list()
        for feat_arr_name in feature_arr_names:
            for ii, feat_path in enumerate(feat_path_list):
                dense_fpath = os.path.join(
                    feat_path, feat_arr_name + "_%s.npy" % split_name)
                sparse_fpath = os.path.join(
                    feat_path, feat_arr_name + "_csr_%s.npz" % split_name)
                x_cur = None
                try:
                    if os.path.exists(sparse_fpath):
                        pprint("Loading sparse features from: %s" %
                               sparse_fpath)
                        x_cur = load_csr_matrix(sparse_fpath)
                        assert np.all(np.isfinite(x_cur.data))
                        break
                    else:
                        x_cur = np.asarray(np.load(dense_fpath),
                                           order='C',
                                           dtype=np.float64)
                        if x_cur.ndim < 2:
                            x_cur = np.atleast_2d(x_cur).T
                        assert np.all(np.isfinite(x_cur))
                        break
                except IOError as e:
                    if ii == len(feat_path_list) - 1:
                        # Couldn't find desired file in any feat_path
                        raise e
                    else:
                        # Try the next feat_path in the list
                        pass
            if x_cur is not None:
                if feat_colnames_by_arr[feat_arr_name] is not None:
                    feat_dim = len(feat_colnames_by_arr[feat_arr_name])
                    print('feat name, %s, feat_dim %d' %
                          (feat_arr_name, feat_dim))
                    print('x_cur shape', x_cur.shape[1])
                    assert x_cur.shape[1] == feat_dim
                else:
                    # Add dummy colnames
                    feat_dim = x_cur.shape[1]
                    n_sig_digits = np.maximum(3,
                                              int(np.ceil(np.log10(feat_dim))))
                    fmt_str = "%s_%0" + str(n_sig_digits) + "d"
                    feat_colnames_by_arr[feat_arr_name] = [
                        fmt_str % (feat_arr_name, fid)
                        for fid in range(feat_dim)
                    ]
                x_list.append(x_cur)

        if isinstance(x_list[0], np.ndarray):
            split_dataset['x'] = np.hstack(x_list)
        else:
            split_dataset['x'] = scipy.sparse.hstack(x_list, format='csr')

        # Use only a fraction of the training dataset, if specified
        frac_labels_train = arg_dict['frac_labels_train']
        if split_name == 'train' and frac_labels_train < 1.0:
            # Same random seed taken from bow_dataset.py
            data_prng = np.random.RandomState(int(42))
            n_rows = y.shape[0]

            # Note: does not handle truly missing labels
            indexed_rows = np.arange(n_rows)
            shuffled_rows = data_prng.permutation(indexed_rows)
            n_visible = int(np.ceil(frac_labels_train * n_rows))
            visible_rows = shuffled_rows[:n_visible]
            split_dataset['x'] = split_dataset['x'][visible_rows, :]
            split_dataset['y'] = split_dataset['y'][visible_rows, :]

        assert split_dataset['x'].ndim == 2
        assert split_dataset['x'].shape[0] == split_dataset['y'].shape[0]
        assert (isinstance(split_dataset['x'], np.ndarray)
                or isinstance(split_dataset['x'], scipy.sparse.csr_matrix))

        if split_name == 'train':
            # Flatten feat colnames into single list
            feat_colnames = sum(feat_colnames_by_arr.values(), [])
            assert isinstance(feat_colnames, list)
            assert len(feat_colnames) == split_dataset['x'].shape[1]
            if len(feat_colnames) > 10:
                pprint('x colnames: %s ... %s' % (' '.join(
                    feat_colnames[:5]), ' '.join(feat_colnames[-5:])))
            else:
                pprint('x colnames: %s' % ' '.join(feat_colnames))
            pprint('y colnames: %s' % ' '.join(target_names))

        pprint('---- %5s dataset summary' % split_name)
        pprint('%9d total examples' % y.shape[0])
        pprint('y : %d x %d targets' % split_dataset['y'].shape)
        pprint('x : %d x %d features' % split_dataset['x'].shape)

        for c in range(len(target_names)):
            y_c = split_dataset['y'][:, c]
            nan_bmask = np.isnan(y_c)
            pos_bmask = y_c == 1
            neg_bmask = y_c == 0
            pprint('target %s :' % target_names[c])
            pprint('    %6d pos examples | %.3f' %
                   (np.sum(pos_bmask), calcfrac(pos_bmask)))
            pprint('    %6d neg examples | %.3f' %
                   (np.sum(neg_bmask), calcfrac(neg_bmask)))
            pprint('    %6d NaN examples | %.3f' %
                   (np.sum(nan_bmask), calcfrac(nan_bmask)))
            assert (nan_bmask.sum() + pos_bmask.sum() + neg_bmask.sum()
                    == neg_bmask.size)

    elapsed_time = time.time() - start_time
    pprint('[run_classifier says:] dataset loaded after %.2f sec.' %
           elapsed_time)

    n_cols = len(target_names)
    for c in range(n_cols):
        pprint('[run_classifier says:] train for target %s' % target_names[c])
        train_and_eval_clf_with_best_params_via_grid_search(
            arg_dict['classifier_name'],
            datasets_by_split=datasets_by_split,
            y_col_id=c,
            y_orig_col_id=all_target_names.index(target_names[c]),
            y_col_name=target_names[c],
            feat_colnames=feat_colnames,
            feat_preproc_grid_dict=feat_preproc_grid_dict,
            output_path=arg_dict['output_path'],
            max_grid_search_steps=arg_dict['max_grid_search_steps'],
            class_weight_opts=arg_dict['class_weight_opts'],
            c_logspace_arg_str=arg_dict['c_logspace_arg_str'],
            random_state=arg_dict['seed'],
            seed_bootstrap=arg_dict['seed_bootstrap'],
            n_bootstraps=arg_dict['n_bootstraps'],
            bootstrap_stratify_pos_and_neg=arg_dict[
                'bootstrap_stratify_pos_and_neg'],
        )
        elapsed_time = time.time() - start_time
        pprint('[run_classifier says:] target %s completed after %.2f sec' %
               (target_names[c], elapsed_time))
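
The '--preproc_' convention above is easy to miss: any unknown flag whose name starts with --preproc_ is split on commas and becomes one axis of the feature-preprocessing grid. A standalone illustration (the flag names here are made up):

unk_list = ['--preproc_norm', 'l1,l2', '--some_other_flag', '1']
feat_preproc_grid_dict = {
    key[2:]: str(val).split(',')
    for key, val in zip(unk_list[::2], unk_list[1::2])
    if key.startswith('--preproc_')}
assert feat_preproc_grid_dict == {'preproc_norm': ['l1', 'l2']}

Example #7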
def calc_perf_metrics_for_snapshot_param_dict(
        param_dict=None,
        topics_KV=None,
        w_CK=None,
        datasets_by_split=None,
        model_hyper_P=None,
        dim_P=None,
        alg_state_kwargs=None,
        output_path=None,
        cur_lap=0.0,
        cur_step=None,
        elapsed_time_sec=0.0,
        losstrain_ttl=None,
        verbose_timings=False,
        disable_output=False,
        do_force_update_w_CK=0,
        perf_metrics_pi_optim_kwargs=None,
        **unused_kwargs):
    ''' Compute performance metrics at provided topic model param dict.

    Returns
    -------
    info_dict : dict
        Contains all perf. metric information.

    Post Condition
    --------------
    Row appended to CSV files in output_path/
        * snapshot_perf_metrics_train.csv
        * snapshot_perf_metrics_valid.csv
        * snapshot_perf_metrics_test.csv
    '''
    if perf_metrics_pi_optim_kwargs is None:
        perf_metrics_pi_optim_kwargs = dict()

    etimes = OrderedDict()
    etimes = start_timer_segment(etimes, 'total')

    # Unpack parameters
    if param_dict is not None:
        topics_KV = param_dict['topics_KV']
        w_CK = param_dict['w_CK']
    if topics_KV is None:
        raise ValueError("topics_KV should not None")
    if not np.all(np.isfinite(topics_KV)):
        raise ValueError("topics_KV should not be NaN or Inf")
    if w_CK is None:
        raise ValueError("w_CK should not None")
    if not np.all(np.isfinite(w_CK)):
        raise ValueError("w_CK should not be NaN or Inf")
    # Track norms of params (crude debugging tool)
    l1_norm_logtopics = np.mean(np.abs(np.log(topics_KV.flatten())))
    l1_norm_w = np.mean(np.abs(w_CK.flatten()))

    # Unpack hyperparams
    alpha = model_hyper_P['alpha']
    tau = model_hyper_P['tau']
    lambda_w = model_hyper_P['lambda_w']
    weight_y = model_hyper_P['weight_y']

    # Unpack state kwargs
    if alg_state_kwargs is not None:
        output_path = alg_state_kwargs['output_path']
        cur_lap = alg_state_kwargs['cur_lap']
        cur_step = alg_state_kwargs['cur_step']
        elapsed_time_sec = alg_state_kwargs['elapsed_time_sec']

    # TODO check if dataset is semisupervised
    y_DC = datasets_by_split['train']['y_DC']
    n_labels = y_DC.shape[1]
    u_y_vals = np.unique(y_DC.flatten())
    if u_y_vals.size <= 2 and np.union1d([0.0, 1.0], u_y_vals).size == 2:
        output_data_type = 'binary'
    else:
        output_data_type = 'real'

    # Count, for each pair of vocab words, how many docs contain both
    _, ndocs_csc_VV = coh.calc_pairwise_cooccurance_counts(
        dataset=datasets_by_split['train'])

    split_names = ['train', 'valid', 'test']
    for split_name in split_names:
        etimes = start_timer_segment(etimes, '%s_calc_lossmap' % split_name)
        ans_dict = pc_toolbox.model_slda.slda_loss__cython.calc_loss__slda(
            dataset=datasets_by_split[split_name],
            topics_KV=topics_KV,
            w_CK=w_CK,
            LP=None,
            weight_x=1.0,
            weight_y=1.0,
            alpha=alpha,
            tau=tau,
            lambda_w=lambda_w,
            pi_estimation_mode='missing_y',
            pi_estimation_weight_y=0.0,
            return_dict=True,
            **perf_metrics_pi_optim_kwargs)
        etimes = stop_timer_segment(etimes, '%s_calc_lossmap' % split_name)
        assert 'summary_msg' in ans_dict

        # Extract doc-topic features
        assert 'pi_DK' in ans_dict
        pi_DK = ans_dict.pop('pi_DK')

        info_dict = OrderedDict([
            ('step', float(cur_step)),
            ('lap', float(cur_lap)),
            ('elapsed_time_sec', float(elapsed_time_sec)),
            ('logpdf_x_pertok', -1 * ans_dict['uloss_x__pertok']),
            ('logpdf_y_perdoc', -1 * ans_dict['uloss_y__perdoc']),
            ('lossmap_ttl_pertok', ans_dict['loss_ttl']),
            ('lossmap_x_pertok', ans_dict['loss_x']),
            ('lossmap_y_pertok', ans_dict['loss_y']),
            ('lossmap_pi_pertok', ans_dict['loss_pi']),
            ('lossmap_topic_pertok', ans_dict['loss_topics']),
            ('lossmap_w_pertok', ans_dict['loss_w']),
        ])
        if losstrain_ttl is not None:
            info_dict['losstrain_ttl'] = float(losstrain_ttl)

        ## Compute y metrics
        # Case 1/2: binary
        etimes = start_timer_segment(etimes, '%s_calc_y_metrics' % split_name)
        assert 'y_proba_DC' in ans_dict
        if output_data_type.count('binary'):
            y_proba_DC = ans_dict.pop('y_proba_DC')
            C = y_proba_DC.shape[1]
            assert np.nanmin(y_proba_DC) >= 0.0
            assert np.nanmax(y_proba_DC) <= 1.0
            for c in range(n_labels):
                ytrue_c_D = datasets_by_split[split_name]['y_DC'][:, c]
                yproba_c_D = y_proba_DC[:, c]
                # Keep only finite values
                rowmask = np.logical_and(np.isfinite(yproba_c_D),
                                         np.isfinite(ytrue_c_D))
                ytrue_c_D = ytrue_c_D[rowmask]
                yproba_c_D = yproba_c_D[rowmask]
                if ytrue_c_D.size == 0:
                    raise ValueError("Label id c=%d has no observed y values" %
                                     c)

                yhat_c_D = np.asarray(yproba_c_D > 0.5, dtype=ytrue_c_D.dtype)

                # Error rate
                error_rate_y__c = np.sum(np.logical_xor(ytrue_c_D, yhat_c_D))
                error_rate_y__c /= float(ytrue_c_D.size)
                info_dict['y_%d_error_rate' % c] = error_rate_y__c

                # Area under precision-recall curve (average precision)
                try:
                    #roc_auc_y__c = roc_auc_score(ytrue_c_D, yproba_c_D)
                    auprc_y__c = average_precision_score(
                        ytrue_c_D, yproba_c_D)
                except ValueError:
                    # Error occurs when not enough examples of each label
                    auprc_y__c = 0.0
                info_dict['y_%d_auprc' % c] = auprc_y__c

        # Case 2/2: real values
        elif output_data_type.count('real'):
            # Remember, y_proba_DC is really estimated mean of y_DC
            y_est_DC = ans_dict.pop('y_proba_DC')
            for c in range(n_labels):
                y_true_c_D = datasets_by_split[split_name]['y_DC'][:, c]
                y_est_c_D = y_est_DC[:, c]
                # Keep only finite values
                rowmask = np.logical_and(np.isfinite(y_true_c_D),
                                         np.isfinite(y_est_c_D))
                y_true_c_D = y_true_c_D[rowmask]
                y_est_c_D = y_est_c_D[rowmask]
                if y_true_c_D.size == 0:
                    raise ValueError("Label id c=%d has no observed y values" %
                                     c)
                # Compute RMSE
                rmse = np.sqrt(np.mean(np.square(y_true_c_D - y_est_c_D)))
                info_dict['y_%d_rmse' % c] = rmse
        etimes = stop_timer_segment(etimes, '%s_calc_y_metrics' % split_name)

        ## Compute vb lower bound on logpdf x
        etimes = start_timer_segment(etimes,
                                     '%s_calc_lb_logpdf_x' % split_name)
        lb_logpdf_x, lb_logpdf_x_pertok = calc_elbo_for_many_docs(
            dataset=datasets_by_split[split_name],
            topics_KV=topics_KV,
            alpha=alpha,
            init_name_list=['warm'],
            init_pi_DK=pi_DK,
            verbose=False,
            do_trace_elbo=False,
        )
        etimes = stop_timer_segment(etimes, '%s_calc_lb_logpdf_x' % split_name)
        info_dict['elbo_logpdf_x_pertok'] = lb_logpdf_x_pertok

        ## COHERENCE
        etimes = start_timer_segment(etimes,
                                     '%s_calc_coher_metrics' % split_name)
        K = topics_KV.shape[0]
        npmi_K = np.zeros(K)
        for k in range(K):
            # Select at most 20 vocab words per topic.
            # But if fewer than that cover 90% of the mass, take only those.
            top_vocab_ids = np.argsort(-1 * topics_KV[k])[:20]
            cumsum_mass = np.cumsum(topics_KV[k, top_vocab_ids])
            m = np.searchsorted(cumsum_mass, 0.9)
            top_vocab_ids = top_vocab_ids[:(m + 1)]
            npmi_K[k], _ = \
                coh.calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic(
                    ndocs_csc_VV=ndocs_csc_VV,
                    top_vocab_ids=top_vocab_ids,
                    pair_smooth_eps=0.1)
        if K < 10:
            perc_list = [0, 50, 100]
        else:
            perc_list = [0, 10, 50, 90, 100]
        for perc in perc_list:
            pstr = '%06.2f' % perc
            info_dict['topic_npmi_p' + pstr] = np.percentile(npmi_K, perc)

        etimes = stop_timer_segment(etimes,
                                    '%s_calc_coher_metrics' % split_name)

        info_dict['losstrain_weight_y'] = weight_y
        info_dict['alpha'] = alpha
        info_dict['tau'] = tau
        info_dict['lambda_w'] = lambda_w

        info_dict['n_states'] = float(topics_KV.shape[0])
        info_dict['l1norm_w'] = float(l1_norm_w)
        info_dict['l1norm_logtopics'] = float(l1_norm_logtopics)

        info_df = pd.DataFrame([info_dict])
        col_order = info_dict.keys()
        ppinfo_str = info_df.to_csv(
            None,
            float_format='% 20.12g',
            na_rep='%20s' % 'nan',
            index=False,
            header=False,
            columns=col_order)  # relying on an ordered dict here
        info_str = info_df.to_csv(
            None,
            float_format='% .12g',
            na_rep='nan',
            index=False,
            header=False,
            columns=col_order)  # relying on an ordered dict here
        assert np.max(list(map(len, col_order))) <= 20
        if not disable_output:
            csv_fpath = os.path.join(
                output_path, 'snapshot_perf_metrics_%s.csv' % split_name)
            ppcsv_fpath = os.path.join(
                output_path,
                'pretty_snapshot_perf_metrics_%s.csv' % split_name)

            if int(cur_step) == 0:
                with open(csv_fpath, 'w') as f:
                    header_str = ','.join(['%s' % s for s in col_order])
                    f.write(header_str + "\n")
                with open(ppcsv_fpath, 'w') as f:
                    header_str = ','.join(['%20s' % s for s in col_order])
                    f.write(header_str + "\n")
            with open(csv_fpath, 'a') as f:
                f.write(info_str)
            with open(ppcsv_fpath, 'a') as f:
                f.write(ppinfo_str)

            pi_summary_txt_fpath = os.path.join(
                output_path,
                'perf_metrics_pi_optim_summaries_%s.txt' % split_name)
            lap_prefix = 'lap %011.3f  ' % cur_lap
            with open(pi_summary_txt_fpath, 'a') as f:
                f.write(lap_prefix + ans_dict['summary_msg'] + "\n")

    # Write timings to txt file for comparison
    msg = pprint_timer_segments(etimes, prefix='lap%011.3f' % (cur_lap))
    if verbose_timings:
        pprint(msg)
    if not disable_output:
        timings_txt = os.path.join(output_path, 'timings_for_perf_metrics.txt')
        with open(timings_txt, 'a') as f:
            f.write(msg)
    return info_dict
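
One subtlety above is the binary-vs-real label check: labels count as binary exactly when their unique observed values form a subset of {0.0, 1.0}. A standalone demonstration of that test:

import numpy as np

for u_y_vals in [np.array([0.]), np.array([0., 1.]), np.array([0., 0.5])]:
    is_binary = (u_y_vals.size <= 2
                 and np.union1d([0.0, 1.0], u_y_vals).size == 2)
    print(u_y_vals, '->', 'binary' if is_binary else 'real')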
Example #8
def get_stratified_subsample_ids(y_DC=None,
                                 n_subsamples=1000,
                                 min_per_label=5,
                                 seed=42,
                                 verbose=False):
    ''' Get row ids of examples to keep in subsample for initializing weights

    Returns
    -------
    doc_ids : 1D array of ids

    Examples
    --------
    >>> y_DC = np.zeros((1000, 3))
    >>> y_DC[200:205, 0] = 1
    >>> y_DC[400:405, 1] = 1
    >>> y_DC[:995, 2] = 1
    >>> mask = get_stratified_subsample_ids(y_DC, 10, min_per_label=5)
    >>> mask.tolist()
    [200, 201, 202, 203, 204, 400, 401, 402, 403, 404, 995, 996, 997, 998, 999]
    >>> np.sum(y_DC[mask] == 0, axis=0).tolist()
    [10, 10, 10]
    >>> np.sum(y_DC[mask] == 1, axis=0).tolist()
    [5, 5, 5]
    '''
    n_labels = y_DC.shape[1]
    n_examples = y_DC.shape[0]
    if n_subsamples >= n_examples:
        return np.arange(n_examples)
    # If here, we actually need to subsample

    # Make version of y_DC where 1 is the minority class in EVERY column
    sums_total = np.sum(y_DC, axis=0)
    need_flip = sums_total / n_examples > 0.5
    y_DC[:, need_flip] = 1.0 - y_DC[:, need_flip]
    sums_total[need_flip] = n_examples - sums_total[need_flip]

    keep_mask = np.zeros(y_DC.shape[0], dtype=bool)
    sums_subsample = np.sum(y_DC[keep_mask], axis=0)
    for c in range(n_labels):
        if sums_subsample[c] < min_per_label \
                and sums_subsample[c] < sums_total[c]:
            n_more = int(np.minimum(min_per_label, sums_total[c]))
            on_ids = np.flatnonzero(y_DC[:, c])[:n_more]
            keep_mask[on_ids] = True
    size = np.sum(keep_mask)
    if size < n_subsamples:
        prng = np.random.RandomState(seed)
        eligible_ids = np.flatnonzero(keep_mask == 0)
        chosen_ids = prng.choice(eligible_ids,
                                 n_subsamples - size,
                                 replace=False)
        keep_mask[chosen_ids] = 1
        size = np.sum(keep_mask)
    assert size >= n_subsamples
    sums_subsample = np.sum(y_DC[keep_mask], axis=0)
    if verbose:
        pprint('Minority examples per label in dataset of size %d' %
               n_examples)
        pprint(' '.join(['%4d' % val for val in sums_total]))
        pprint('Minority examples per label in subsample of size %d:' % size)
        pprint(' '.join(['%4d' % val for val in sums_subsample]))
    return np.flatnonzero(keep_mask)
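
The column-flip step above guarantees that the value 1 marks the minority class in every column before the stratified picking begins; a small demonstration:

import numpy as np

y_DC = np.array([[1., 0.], [1., 0.], [1., 1.], [0., 0.]])
sums_total = np.sum(y_DC, axis=0)              # [3., 1.]
need_flip = sums_total / y_DC.shape[0] > 0.5   # [True, False]
y_DC[:, need_flip] = 1.0 - y_DC[:, need_flip]
sums_total[need_flip] = y_DC.shape[0] - sums_total[need_flip]
assert np.all(sums_total <= y_DC.shape[0] / 2.0)  # 1 is now the minority class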
Example #9
def calc_nef_map_pi_DK(dataset=None,
                       topics_KV=None,
                       alpha=None,
                       nef_alpha=None,
                       init_pi_DK=None,
                       n_seconds_between_print=-1,
                       active_proba_thr=0.005,
                       return_info=False,
                       calc_pi_d_K=calc_nef_map_pi_d_K,
                       **some_pi_estimation_kwargs):
    ''' Extract doc-topic probability features for every doc in dataset.

    Args
    ----
    dataset : dict with array fields
        'n_docs' : int, non-negative
            number of documents in dataset
        'word_id_U' : 1D array, size U, dtype=int
            vocab ids for each doc-term pair in dataset
        'word_ct_U' : 1D array, size U, dtype=float
            counts for each doc-term pair in dataset
        'doc_indptr_Dp1' : 1D array, size D+1, type=int
            indptr / fenceposts delineating where individual docs begin/end
    topics_KV : 2D array, size K x V, rows sum to one
        probability of each word v appearing under each topic k
    alpha : float, positive value
        concentration parameter of Dirichlet prior on doc-topic probas
    
    Returns
    -------
    pi_DK : 2D array, size D x K
        Each row has positive entries and sums to one.
    info_dict : dict
        Only returned if called with return_info=True
    '''
    # Parse pi estimation kwargs
    pi_estimation_kwargs = dict(**DefaultDocTopicOptKwargs)
    for key in pi_estimation_kwargs.keys():
        if key in some_pi_estimation_kwargs:
            val = DefaultDocTopicOptKwargs[key]
            if isinstance(val, float):
                pi_estimation_kwargs[key] = float(
                    some_pi_estimation_kwargs[key])
            else:
                pi_estimation_kwargs[key] = int(some_pi_estimation_kwargs[key])

    assert topics_KV is not None
    K = int(topics_KV.shape[0])

    n_docs = dataset['n_docs']
    doc_indptr_Dp1 = dataset['doc_indptr_Dp1']
    word_id_U = dataset['word_id_U']
    word_ct_U = dataset['word_ct_U']

    pi_DK = np.zeros((n_docs, K))
    n_docs_converged = 0
    n_docs_restarted = 0
    iters_per_doc = np.zeros(n_docs, dtype=np.int32)
    n_active_per_doc = np.zeros(n_docs, dtype=np.int32)
    restarts_per_doc = np.zeros(n_docs, dtype=np.int32)
    step_size_per_doc = np.zeros(n_docs, dtype=np.float32)
    dist_per_doc = np.zeros(n_docs, dtype=np.float32)
    loss_per_doc = np.zeros(n_docs, dtype=np.float32)

    is_time = False
    start_time_sec = time.time()
    last_print_sec = start_time_sec
    for d in range(n_docs):
        start_d = doc_indptr_Dp1[d]
        stop_d = doc_indptr_Dp1[d + 1]

        if init_pi_DK is None:
            init_pi_d_K = None
        else:
            init_pi_d_K = init_pi_DK[d]

        # MCH: Cannot autograd when doing this kind of assignment
        pi_DK[d,:], info_dict = \
            calc_pi_d_K(
                word_id_U[start_d:stop_d],
                word_ct_U[start_d:stop_d],
                topics_KV=topics_KV,
                alpha=alpha,
                nef_alpha=nef_alpha,
                init_pi_d_K=init_pi_d_K,
                **pi_estimation_kwargs)
        if return_info or n_seconds_between_print > 0:
            n_active_per_doc[d] = \
                np.sum(pi_DK[d,:] > active_proba_thr)
            n_docs_restarted += info_dict['n_restarts'] > 0
            n_docs_converged += info_dict['did_converge']
            iters_per_doc[d] = info_dict['n_iters']
            step_size_per_doc[d] = info_dict['pi_step_size']
            try:
                dist_per_doc[d] = info_dict['cur_L1_diff']
            except KeyError:
                dist_per_doc = None
            try:
                restarts_per_doc[d] = info_dict['n_restarts']
            except KeyError:
                restarts_per_doc = None
            try:
                loss_per_doc[d] = info_dict['loss']
            except KeyError:
                pass

            cur_time_sec = time.time()
            if n_seconds_between_print > 0:
                is_time = cur_time_sec - last_print_sec > n_seconds_between_print
            is_last = (d + 1) == n_docs
            if is_last or is_time:
                msg = make_readable_summary_for_pi_DK_estimation(
                    elapsed_time_sec=cur_time_sec - start_time_sec,
                    n_docs=n_docs,
                    n_docs_completed=d + 1,
                    n_docs_converged=n_docs_converged,
                    n_docs_restarted=n_docs_restarted,
                    iters_per_doc=iters_per_doc,
                    n_active_per_doc=n_active_per_doc,
                    dist_per_doc=dist_per_doc,
                    restarts_per_doc=restarts_per_doc,
                    step_size_per_doc=step_size_per_doc,
                    loss_per_doc=loss_per_doc)

                last_print_sec = cur_time_sec
                if n_seconds_between_print > 0:
                    pprint(msg)
    if return_info:
        agg_info_dict = dict(summary_msg=msg,
                             iters_per_doc=iters_per_doc,
                             n_active_per_doc=n_active_per_doc,
                             dist_per_doc=dist_per_doc,
                             restarts_per_doc=restarts_per_doc,
                             step_size_per_doc=step_size_per_doc,
                             loss_per_doc=loss_per_doc,
                             loss=np.sum(loss_per_doc),
                             alpha=alpha)
        return pi_DK, agg_info_dict
    else:
        return pi_DK
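
The dataset dict format described in the docstring above can be built by hand. A tiny two-document example over a four-word vocabulary (values chosen for illustration only):

import numpy as np

dataset = dict(
    n_docs=2,
    # doc 0 contains word 0 twice and word 3 once;
    # doc 1 contains word 1 once and word 2 five times
    word_id_U=np.array([0, 3, 1, 2], dtype=np.int32),
    word_ct_U=np.array([2.0, 1.0, 1.0, 5.0]),
    doc_indptr_Dp1=np.array([0, 2, 4], dtype=np.int32))

Example #10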
def minimize(loss_func_wrt_paramvec_and_step=None,
             grad_func_wrt_paramvec_and_step=None,
             save_func_wrt_param_dict=None,
             callback_func_wrt_param_dict=None,
             callback_kwargs=None,
             param_tfm_manager=None,
             dim_P=None,
             init_param_dict=None,
             n_line_search_steps=10,
             n_terms_approx_hessian=10,
             **kwargs):
    """ Minimize provided loss function using L-BFGS algorithm

    Returns
    -------
    param_dict : dict
        Contains estimated parameters that minimize the loss
    alg_state_dict : dict
        Contains algorithm information (num steps completed, etc.)
    """
    pprint('[scipy_lbfgs_minimizer] Begin training...')
    pprint('--n_line_search_steps  %d' % int(n_line_search_steps))
    pprint('--n_terms_approx_hessian %d' % int(n_terms_approx_hessian))

    # Parse user input
    n_line_search_steps = int(n_line_search_steps)
    n_terms_approx_hessian = int(n_terms_approx_hessian)

    # Convert provided common param dict
    # to a flat 1D array with unconstrained values
    param_vec = param_tfm_manager.flatten_to_differentiable_param_vec(
        init_param_dict, **dim_P)

    # Warmup
    start_time_sec = time.time()
    init_loss_val = loss_func_wrt_paramvec_and_step(param_vec, step_id=0)
    loss_eval_time_sec = time.time() - start_time_sec
    pprint("Loss     @ init: %8.3f sec | val %.6e" %
           (loss_eval_time_sec, init_loss_val))
    pprint("Params   @ init: %8s     | %5d params | l2 norm / entry %.4e" %
           (' ', param_vec.size, calc_l2_norm_of_vector_per_entry(param_vec)))
    start_time_sec = time.time()
    init_grad_vec = grad_func_wrt_paramvec_and_step(param_vec, step_id=0)
    elapsed_time_sec = time.time() - start_time_sec
    init_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(init_grad_vec)
    pprint("Gradient @ init: %8.3f sec | %5d params | l2 norm / entry %.4e" %
           (elapsed_time_sec, init_grad_vec.size, init_grad_norm_per_entry))

    # Create settings that track algorithm state
    # cur_step, cur_lap, n_laps, n_steps, etc.
    alg_state_kwargs = init_alg_state_kwargs(cur_step=0.0, **kwargs)
    n_steps = alg_state_kwargs['n_steps']
    if 'output_path' in alg_state_kwargs:
        laps_to_save_str, steps_to_save_str = calc_laps_when_snapshots_saved(
            return_str=True, keep_first=5, keep_last=5, **alg_state_kwargs)
        pprint("Snapshots will be saved at intervals:")
        pprint("   laps: %s" % laps_to_save_str)
        pprint("  steps: %s" % steps_to_save_str)
        pprint("Snapshot saved to --output_path:\n%s" %
               (alg_state_kwargs['output_path']))

    # Translate settings into scipy's specific options format.
    # ftol=0 and gtol=0 disable scipy's internal convergence tests,
    # so the run length is governed by maxiter/maxfun alone.
    options_dict = dict(
        maxiter=n_steps,
        maxfun=n_line_search_steps * n_steps,
        maxcor=n_terms_approx_hessian,
        maxls=n_line_search_steps,
        ftol=0.0,
        gtol=0.0,
    )
    alg_state_kwargs['cur_loss_val'] = init_loss_val

    ## Define callback run after every L-BFGS step,
    # which prints progress and saves parameter snapshots
    # at the relevant steps
    def my_callback_func(cur_param_vec,
                         is_init=False,
                         alg_state_kwargs=alg_state_kwargs):
        # Update step counter, timer, etc.
        if not is_init:
            alg_state_kwargs.update(
                update_alg_state_kwargs(**alg_state_kwargs))
        if do_print_now(**alg_state_kwargs) or do_save_now(**alg_state_kwargs):
            cur_loss_val = loss_func_wrt_paramvec_and_step(cur_param_vec)
            alg_state_kwargs['cur_loss_val'] = cur_loss_val

        if do_print_now(**alg_state_kwargs):
            pprint(make_status_string(
                **alg_state_kwargs))  # assume cur_loss_val is inside
            save_status_to_txt_files(**alg_state_kwargs)
            alg_state_kwargs.update(
                update_alg_state_kwargs_after_print(**alg_state_kwargs))

        if do_save_now(**alg_state_kwargs):
            param_dict = param_tfm_manager.unflatten_to_common_param_dict(
                cur_param_vec, **dim_P)
            if save_func_wrt_param_dict is not None:
                save_func_wrt_param_dict(param_dict=param_dict,
                                         **alg_state_kwargs)
            if callback_func_wrt_param_dict is not None:
                callback_func_wrt_param_dict(
                    param_dict=param_dict,
                    losstrain_ttl=alg_state_kwargs.get('cur_loss_val',
                                                       init_loss_val),
                    alg_state_kwargs=alg_state_kwargs,
                    **callback_kwargs)
            alg_state_kwargs.update(
                update_alg_state_kwargs_after_save(**alg_state_kwargs))
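
    # Note: the callback mutates the closed-over alg_state_kwargs dict in
    # place, so the final algorithm state remains visible to the enclosing
    # function after scipy.optimize.minimize returns.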

    ## Run training ...
    my_callback_func(param_vec, is_init=True)
    if n_steps > 0:
        opt_result_obj = scipy.optimize.minimize(
            loss_func_wrt_paramvec_and_step,
            param_vec,
            method='l-bfgs-b',
            jac=grad_func_wrt_paramvec_and_step,
            options=options_dict,
            callback=my_callback_func)
        pprint('[scipy_lbfgs_minimizer] msg %s' % opt_result_obj.message)
        param_vec = opt_result_obj.x
        # Relies on alg_state_kwargs already being defined in callback
        my_callback_func(param_vec)

    param_dict = param_tfm_manager.unflatten_to_common_param_dict(
        param_vec, **dim_P)
    pprint('[scipy_lbfgs_minimizer] Done with training.')
    return param_dict, alg_state_kwargs
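
# --- Added sketch (not from the original source) ---
# A self-contained illustration of the same scipy L-BFGS-B options that
# `minimize` above constructs; the quadratic `toy_loss`/`toy_grad` are
# made-up stand-ins for the real sLDA loss and gradient closures.
import numpy as np
import scipy.optimize

def toy_loss(x):
    # Convex quadratic with minimum at x = 3
    return float(np.sum((x - 3.0) ** 2))

def toy_grad(x):
    return 2.0 * (x - 3.0)

toy_result = scipy.optimize.minimize(
    toy_loss, np.zeros(5), method='l-bfgs-b', jac=toy_grad,
    options=dict(maxiter=100, maxfun=10 * 100, maxcor=10, maxls=10,
                 ftol=0.0, gtol=0.0))
assert np.allclose(toy_result.x, 3.0, atol=1e-4)
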
def estimate_w_CK__given_pi_DK(
        dataset=None,
        pi_DK=None,
        lambda_w=0.001,
        seed=42,
        prefix='',
        verbose=False,
        **kwargs):
    """ Estimate regression weights from provided probability features.

    Uses sklearn's regularized regressors under the hood.

    Returns
    -------
    w_CK : 2D array, size C x K
        Regression weights
    """

    K = pi_DK.shape[1]
    C = int(dataset['n_labels'])
    if verbose:
        pprint('%s Fitting %d regressions...' % (
            prefix, C))

    w_CK = np.zeros((C, K))

    # Detect output type: treat labels as binary only if every observed
    # value lies in {0.0, 1.0}; otherwise fall back to real-valued regression
    u_y_vals = np.unique(dataset['y_DC'].flatten())
    if u_y_vals.size <= 2 and np.union1d([0.0, 1.0], u_y_vals).size == 2:
        output_data_type = 'binary'
    else:
        output_data_type = 'real'

    if 'y_rowmask' in dataset:
        y_DC = dataset['y_DC'][1 == dataset['y_rowmask']]
        pi_DK = pi_DK[1 == dataset['y_rowmask']]
        u_y_vals = np.unique(y_DC.sum(axis=1))
        assert u_y_vals.size > 1
    else:
        y_DC = dataset['y_DC']

    for c in xrange(C):
        # Do a quick regression to get initial weights!
        if output_data_type.count('binary') > 0:
            clf = LogisticRegression(
                fit_intercept=False,
                C=0.5/lambda_w,
                random_state=seed,
                )
        else:
            # NOTE: presumably an alias for sklearn.linear_model.Ridge
            # created at import time (sklearn itself has no RidgeRegression)
            clf = RidgeRegression(
                fit_intercept=False,
                alpha=lambda_w,
                random_state=seed,
                )

        clf.fit(pi_DK, y_DC[:, c])
        w_CK[c] = clf.coef_
        if verbose:
            pprint('  w_CK[%d, :5]=' % c + ' '.join(['% .2f' % w for w in w_CK[c, :5]]))
            pprint('  label id %d / %d done with lambda_w = %.5f' % (
                c+1, C, lambda_w))
    return w_CK
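
# --- Added sketch (not from the original source) ---
# Toy illustration of the per-label regression loop above, assuming binary
# labels; note lambda_w maps onto sklearn's inverse strength C = 0.5 / lambda_w.
import numpy as np
from sklearn.linear_model import LogisticRegression

toy_rng = np.random.RandomState(42)
toy_pi_DK = toy_rng.rand(200, 4)                      # D x K features
toy_y_DC = (toy_pi_DK[:, :2] > 0.5).astype(np.int32)  # D x C binary targets
toy_lambda_w = 0.001
toy_w_CK = np.zeros((toy_y_DC.shape[1], toy_pi_DK.shape[1]))
for toy_c in range(toy_y_DC.shape[1]):
    toy_clf = LogisticRegression(fit_intercept=False, C=0.5 / toy_lambda_w)
    toy_clf.fit(toy_pi_DK, toy_y_DC[:, toy_c])
    toy_w_CK[toy_c] = toy_clf.coef_[0]
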
def read_args_from_stdin_and_run():
    ''' Main executable function to evaluate a pretrained classifier.

    Post Condition
    --------------
    AUC and other eval info printed to stdout
    and written to files inside --pretrained_clf_path.
    '''
    if not sys.stdin.isatty():
        for line in sys.stdin.readlines():
            line = line.strip()
            sys.argv.extend(line.split(' '))
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder containing:" +
                        " *.npy files: X_train, y_train, P_train"
                        " *.txt files: X_colnames.txt and y_colnames.txt")
    parser.add_argument(
        '--pretrained_clf_path',
        default='/tmp/',
        type=str,
        help="Path to folder to hold output from classifier. Includes:" +
        " perf_metric*.txt files: auc_train.txt & auc_test.txt" +
        " settings.txt: description of all settings to reproduce.")
    parser.add_argument('--split_names', default='test')
    parser.add_argument('--split_nicknames', default='evaltest')

    parser.add_argument('--features_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder with SSAMfeat*.npy files")
    parser.add_argument(
        '--target_arr_name',
        default='Y',
        type=str,
    )
    parser.add_argument(
        '--target_names',
        default='all',
        type=str,
        help='Name of response/intervention to test.' +
        ' To try specific interventions, write names separated by commas.' +
        ' To try all interventions, use special name "all"')
    parser.add_argument('--seed_bootstrap',
                        default=42,
                        type=int,
                        help='Seed for bootstrap')
    parser.add_argument('--n_bootstraps',
                        default=5000,
                        type=int,
                        help='Number of samples for bootstrap conf. intervals')
    parser.add_argument('--bootstrap_stratify_pos_and_neg',
                        default=True,
                        type=int,
                        help='Whether to stratify examples or not')
    args, unk_list = parser.parse_known_args()
    arg_dict = vars(args)

    dataset_path = arg_dict['dataset_path']
    assert os.path.exists(arg_dict['pretrained_clf_path'])
    output_path = arg_dict['pretrained_clf_path']

    clf_opts = list()
    # Read back the settings.txt written at training time
    # so evaluation exactly matches the trained classifier's configuration
    with open(os.path.join(output_path, 'settings.txt'), 'r') as f:
        for line in f.readlines():
            line = line.strip()
            clf_opts.append(line.split(' = '))
    clf_opts = dict(clf_opts)

    feat_path_list = [arg_dict['dataset_path'], arg_dict['features_path']]

    pprint('[run_classifier says:] Loading dataset ...')
    start_time = time.time()
    feature_arr_names = clf_opts['feature_arr_names'].split(',')
    pprint('feature_arr_names:')
    feat_colnames_by_arr = OrderedDict()
    for feat_arr_name in feature_arr_names:
        pprint(feat_arr_name)
        cur_feat_colnames = None
        for feat_path in feat_path_list:
            colname_fpath = os.path.join(feat_path,
                                         feat_arr_name + '_colnames.txt')
            if os.path.exists(colname_fpath):
                cur_feat_colnames = \
                    [unicode(feat_arr_name + ":") + s
                        for s in load_list_of_unicode_from_txt(colname_fpath)]
                break
        feat_colnames_by_arr[feat_arr_name] = cur_feat_colnames

    target_arr_name = arg_dict['target_arr_name']
    all_target_names = load_list_of_strings_from_txt(
        os.path.join(arg_dict['dataset_path'],
                     target_arr_name + '_colnames.txt'))
    target_names = arg_dict['target_names']
    if target_names == 'all':
        target_names = all_target_names
        target_cols = np.arange(len(all_target_names)).tolist()
    else:
        target_names = target_names.split(',')
        target_cols = list()
        for name in target_names:
            assert name in all_target_names
            target_cols.append(all_target_names.index(name))

    datasets_by_split = dict()
    split_nicknames = arg_dict['split_nicknames'].split(',')
    split_names = arg_dict['split_names'].split(',')

    for nickname, split_name in zip(split_nicknames, split_names):
        datasets_by_split[nickname] = dict()
        split_dataset = datasets_by_split[nickname]

        # Load Y
        dense_fpath = os.path.join(dataset_path,
                                   target_arr_name + "_%s.npy" % split_name)
        y = np.asarray(np.load(dense_fpath), order='C', dtype=np.int32)
        if y.ndim < 2:
            y = y[:, np.newaxis]
        assert y.ndim == 2
        assert y.shape[1] == len(all_target_names)
        split_dataset['y'] = y[:, target_cols]
        assert split_dataset['y'].shape[1] == len(target_cols)

        # Load X
        x_list = list()
        for feat_arr_name in feature_arr_names:
            x_cur = None

            def fpath_generator():
                for feat_path in feat_path_list:
                    for sname in [nickname, split_name]:
                        dense_fpath = os.path.join(
                            feat_path, feat_arr_name + "_" + sname + ".npy")
                        sparse_fpath = os.path.join(
                            feat_path,
                            feat_arr_name + "_csr_" + sname + ".npz")
                        yield dense_fpath, sparse_fpath

            ds_path_list = list(fpath_generator())
            for ii, (dense_fpath, sparse_fpath) in enumerate(ds_path_list):
                try:
                    if os.path.exists(sparse_fpath):
                        x_cur = load_csr_matrix(sparse_fpath)
                        assert np.all(np.isfinite(x_cur.data))
                        break
                    else:
                        x_cur = np.asarray(np.load(dense_fpath),
                                           order='C',
                                           dtype=np.float64)
                        if x_cur.ndim < 2:
                            x_cur = np.atleast_2d(x_cur).T
                        assert np.all(np.isfinite(x_cur))
                        break
                except IOError as e:
                    if ii == len(ds_path_list) - 1:
                        # Couldn't find desired file in any feat_path
                        raise e
                    else:
                        # Try the next feat_path in the list
                        pass
            if x_cur is not None:
                if feat_colnames_by_arr[feat_arr_name] is not None:
                    feat_dim = len(feat_colnames_by_arr[feat_arr_name])
                    assert x_cur.shape[1] == feat_dim
                else:
                    # Add dummy colnames
                    feat_dim = x_cur.shape[1]
                    n_sig_digits = np.maximum(3,
                                              int(np.ceil(np.log10(feat_dim))))
                    fmt_str = "%s_%0" + str(n_sig_digits) + "d"
                    feat_colnames_by_arr[feat_arr_name] = [
                        fmt_str % (feat_arr_name, fid)
                        for fid in range(feat_dim)
                    ]
                x_list.append(x_cur)

        if isinstance(x_list[0], np.ndarray):
            split_dataset['x'] = np.hstack(x_list)
        else:
            split_dataset['x'] = scipy.sparse.hstack(x_list, format='csr')

        assert split_dataset['x'].ndim == 2
        assert split_dataset['x'].shape[0] == split_dataset['y'].shape[0]
        assert (isinstance(split_dataset['x'], np.ndarray)
                or isinstance(split_dataset['x'], scipy.sparse.csr_matrix))

        if split_name == split_names[0]:
            # Flatten feat colnames into single list
            feat_colnames = sum(feat_colnames_by_arr.values(), [])
            assert isinstance(feat_colnames, list)
            assert len(feat_colnames) == split_dataset['x'].shape[1]

            print('y colnames: %s' % ' '.join(target_names))
            if len(feat_colnames) > 10:
                print('x colnames: %s ... %s' % (' '.join(
                    feat_colnames[:5]), ' '.join(feat_colnames[-5:])))
            else:
                print('x colnames: %s' % ' '.join(feat_colnames))

        print('---- %5s dataset summary' % split_name)
        print('%9d total examples' % y.shape[0])
        print('y : %d x %d targets' % split_dataset['y'].shape)
        print('x : %d x %d features' % split_dataset['x'].shape)

        for c in xrange(len(target_names)):
            y_c = split_dataset['y'][:, c]
            print('target %s : frac pos %.3f' %
                  (target_names[c], np.mean(y_c)))
            print('    %6d pos examples' % np.sum(y_c == 1))
            print('    %6d neg examples' % np.sum(y_c == 0))

    elapsed_time = time.time() - start_time
    print('[run_classifier says:] dataset loaded after %.2f sec.' %
          elapsed_time)

    n_cols = len(target_names)
    for c in xrange(n_cols):
        print('[eval_pretrained_classifier says:] eval for target %s' %
              target_names[c])
        eval_pretrained_clf(
            classifier_name=clf_opts['classifier_name'],
            classifier_path=arg_dict['pretrained_clf_path'],
            datasets_by_split=datasets_by_split,
            y_col_id=c,
            y_orig_col_id=all_target_names.index(target_names[c]),
            y_col_name=target_names[c],
            feat_colnames=feat_colnames,
            output_path=arg_dict['pretrained_clf_path'],
            seed_bootstrap=arg_dict['seed_bootstrap'],
            n_bootstraps=arg_dict['n_bootstraps'],
            bootstrap_stratify_pos_and_neg=arg_dict[
                'bootstrap_stratify_pos_and_neg'],
        )
        elapsed_time = time.time() - start_time
        print(
            '[eval_pretrained_classifier says:] target %s completed after %.2f sec'
            % (target_names[c], elapsed_time))
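
# Usage sketch (hypothetical; the script filename is a guess, and the paths
# are placeholders): this entry point appends flags read from stdin to
# sys.argv, so it might be driven like
#   echo "--dataset_path /my/data --pretrained_clf_path /my/clf" \
#       | python eval_pretrained_classifier.py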
Example #13
def select_best_from_many_runs(legend_name=None,
                               results_path_patterns=None,
                               output_path=None,
                               txt_src_path=None,
                               target_y_name=None,
                               all_y_names=None,
                               col_names_to_use_at_selection=['N_STATES'],
                               col_names_to_keep="",
                               col_names_to_keep_per_split="",
                               min_lap_to_use_at_selection=10,
                               split_name_to_use_at_selection='VALID',
                               selection_score_colname='LOSS_X',
                               selection_score_ranking_func='argmin',
                               unk_list=None,
                               **kwargs):
    """
    """
    provided_arg_dict = dict(**locals())

    # Create output_path on disk
    if output_path.count("$"):
        for key, val in locals().items():
            if output_path.count('$' + key):
                output_path = output_path.replace("$" + key, str(val))
    if not os.path.exists(output_path):
        mkpath(output_path)

    # Setup logging
    suffix = "__select_best__y_target=%s_score=%s_legend=%s" % (
        target_y_name, selection_score_colname, legend_name)
    config_pprint_logging(output_path, txtfile='stdout%s.txt' % suffix)

    # Write parsed args to plain-text file
    # so we can exactly reproduce later
    this_script_prefix = '[select_best.py says:]'
    pprint("%s Parsing args ..." % this_script_prefix)
    with open(os.path.join(output_path, 'settings%s.txt' % suffix), 'w') as f:
        for key, val in provided_arg_dict.items():
            f.write(key + ' = ' + str(val) + '\n')
            pprint(key + ' = ' + str(val))
    with open(os.path.join(output_path, 'args%s.txt' % suffix), 'w') as f:
        for key, val in provided_arg_dict.items():
            f.write('--' + key + ' ' + str(val) + '\n')
    pprint('')

    # Parse unknown args
    if unk_list is not None and len(unk_list) > 0:
        pprint("UNKNOWN ARGS (ignored)")
        for key in unk_list:
            pprint(key)
        del unk_list

    # Parse target y names
    target_y_name = unicode(target_y_name)
    if not isinstance(all_y_names, list):
        if os.path.exists(all_y_names):
            all_y_names = load_list_of_unicode_from_txt(all_y_names)
        else:
            all_y_names = map(unicode, all_y_names.split(","))

    def force_list_of_strings(val):
        if not isinstance(val, list):
            val = map(str, val.split(","))
        return val

    results_path_patterns = force_list_of_strings(results_path_patterns)
    col_names_to_use_at_selection = force_list_of_strings(
        col_names_to_use_at_selection)
    col_names_to_keep = force_list_of_strings(col_names_to_keep)
    col_names_to_keep_per_split = force_list_of_strings(
        col_names_to_keep_per_split)

    # Load df for all runs that match the query
    all_matching_runs_df = load_df_from_all_folders_matching_list_of_patterns(
        list_of_path_patterns=results_path_patterns,
        legend_name=legend_name,
        y_ind=all_y_names.index(target_y_name),
        column_names=COLUMN_NAMES,
        task_ids=range(1, 10),
    )
    all_matching_runs_df['TARGET_LABEL_NAME'] = target_y_name

    # Support composite scores spelled as a formula like
    # "=(0.5*LOSS_X)+(0.5*Y_ERROR_RATE)": each additive term is
    # (coef*COLNAME), summed into a new score column
    if selection_score_colname.startswith("="):
        formula = selection_score_colname.lstrip("=")
        all_matching_runs_df[selection_score_colname] = 0.0
        add_ops = formula.split("+")
        for op in add_ops:
            coef, colname = op.lstrip('(').rstrip(')').split("*")
            coef = float(coef)
            all_matching_runs_df[
                selection_score_colname] += coef * all_matching_runs_df[
                    colname].values

    if selection_score_ranking_func is None:
        selection_score_ranking_func = get_score_ranking_function_for_colname(
            selection_score_colname)
    elif selection_score_ranking_func == 'argmax':
        selection_score_ranking_func = np.argmax
    else:
        # Any other value (including the default 'argmin') maps to argmin
        selection_score_ranking_func = np.argmin

    ## Create dataframe with only the best task at each legend name
    best_df = select_best_df_at_each_value_of_specific_vars(
        all_matching_runs_df,
        legend_name=legend_name,
        keys=col_names_to_use_at_selection,
        query_min_lap=min_lap_to_use_at_selection,
        score_colname=selection_score_colname,
        score_ranking_func=selection_score_ranking_func,
        target_splitname=split_name_to_use_at_selection,
    )
    row_dict_list = list()
    # Write the legend names to output path
    for cur_legend_name in np.unique(best_df['LEGEND_NAME_ASCII'].values):

        ## Make symlink to best run's task_path directory
        cur_query_str = ("LEGEND_NAME_ASCII == '%s' and IS_BEST_SNAPSHOT > 0" %
                         (cur_legend_name))
        # Prepare existing path
        best_snapshot_df = best_df.query(cur_query_str)
        assert best_snapshot_df.shape[0] == len(SPLIT_NAMES)
        best_task_path = best_snapshot_df['TASK_PATH_AT_BEST_SNAPSHOT'].values[
            0]
        best_task_path = best_task_path.rstrip(os.path.sep)
        assert os.path.exists(best_task_path)
        # Prepare symlink path
        job_path = "best_snapshot_run-legend_name=%s" % (
            cur_legend_name.replace(" ", "_"))
        cur_symlink_output_job_path = os.path.join(output_path, job_path)
        mkpath(cur_symlink_output_job_path)
        cur_symlink_output_task_path = os.path.join(output_path, job_path,
                                                    'best_task')
        # Remove any old version
        if os.path.islink(cur_symlink_output_task_path):
            os.unlink(cur_symlink_output_task_path)
        # Finally, make the symlink happen
        os.symlink(best_task_path, cur_symlink_output_task_path)
        pprint("\nLEGEND_NAME %s" % cur_legend_name)
        pprint("NEW BEST TASK PATH:\n%s" % cur_symlink_output_task_path)

        ## Make symlink to best snapshot directory

        # Prepare existing snapshot path (download content if necessary)
        snapshot_path = make_snapshot_path_for_lap(
            task_path=best_snapshot_df['TASK_PATH_AT_BEST_SNAPSHOT'].values[0],
            lap=best_snapshot_df['LAP_AT_BEST_SNAPSHOT'].values[0],
        )
        if not os.path.exists(snapshot_path):
            download_snapshot(snapshot_path)
        # Prepare new symlink path
        cur_symlink_snapshot_path = os.path.join(cur_symlink_output_job_path,
                                                 'best_snapshot')
        # Remove any old version
        if os.path.islink(cur_symlink_snapshot_path):
            os.unlink(cur_symlink_snapshot_path)
        # Finally, make the symlink happen
        os.symlink(snapshot_path, cur_symlink_snapshot_path)
        pprint("NEW BEST SNAPSHOT PATH:\n%s" % cur_symlink_snapshot_path)

        ## If needed, make brand new snapshot with only target y column
        if len(all_y_names) > 1 and target_y_name != 'avg':
            GP = load_param_dict_at_specific_snapshot(
                snapshot_path=snapshot_path)
            new_GP = dict(**GP)
            new_GP['w_CK'] = GP['w_CK'][all_y_names.index(target_y_name), :][
                np.newaxis, :]
            save_topic_model_snapshot(output_path=cur_symlink_output_job_path,
                                      prefix='targety=%s' % (target_y_name),
                                      **new_GP)

        ## Append to .csv file
        row_dict = OrderedDict()
        row_dict['LEGEND_NAME'] = legend_name
        for key in col_names_to_use_at_selection:
            row_dict[key] = best_snapshot_df[key].values[0]
        for key in col_names_to_keep:
            row_dict[key] = best_snapshot_df[key].values[0]

        for split_name in SPLIT_NAMES:
            best_split_df = best_snapshot_df.query("SPLIT_NAME == '%s'" %
                                                   split_name)
            assert best_split_df.shape[0] == 1
            assert isinstance(col_names_to_keep_per_split, list)
            for key in col_names_to_keep_per_split:
                split_key = "%s_%s" % (split_name.upper(), key)
                row_dict[split_key] = best_split_df[key].values[0]
        row_dict['LAP'] = best_snapshot_df['LAP'].values[0]
        row_dict['LABEL_NAME'] = best_snapshot_df['TARGET_LABEL_NAME'].values[
            0]
        row_dict['SNAPSHOT_SRCFILE'] = cur_symlink_snapshot_path
        row_dict['TXTSRCFILES_PATH'] = txt_src_path
        row_dict_list.append(row_dict)

    pprint("\nWriting csv file documenting all best snapshots for legend %s" %
           (legend_name))
    my_df = pd.DataFrame(row_dict_list)
    basename = "best_snapshots_%s.csv" % legend_name
    csv_fpath = os.path.join(output_path, basename)
    my_df.to_csv(csv_fpath, columns=row_dict_list[0].keys(), index=False)
    pprint("WROTE CSV FILE:\n%s" % csv_fpath)
Example #14
def make_best_job_df(df,
                     target_query="SPLIT_NAME == 'VALID' and LAP > 50",
                     target_splitname='VALID',
                     score_colname='Y_ERROR_RATE',
                     score_ranking_func=np.argmin,
                     verbose=False):
    ''' Find single best task among all jobs in provided df.
    
    Returns
    -------
    best_job_df : dataframe of best single task
    '''
    default_score = fetch_default_score(score_ranking_func.__name__)
    job_paths = np.unique(df['JOB_PATH'].values)

    best_task_idstr_list = ['' for a in range(len(job_paths))]
    best_score_idx = np.zeros_like(job_paths, dtype=np.int32)
    best_score = default_score * np.ones_like(job_paths, dtype=np.float64)
    best_lap_idx = np.zeros_like(job_paths, dtype=np.float64)
    for jj, job_path in enumerate(job_paths):
        if job_path is None:
            continue

        cur_job_best_df = make_best_task_df(
            df.query("JOB_PATH == '%s'" % job_path),
            target_query=target_query,
            score_colname=score_colname,
            score_ranking_func=score_ranking_func,
            default_score=default_score,
            verbose=verbose)

        # Narrow down to the target split, after the minimum lap
        cur_job_best_df = cur_job_best_df.query(target_query)
        if verbose:
            pprint(job_path.split(os.path.sep)[-1])

        if cur_job_best_df.shape[0] < 1:
            if verbose:
                pprint('    skipped. Too small to satisfy query.')
            continue

        split_name_chk = np.unique(cur_job_best_df['SPLIT_NAME'].values)
        assert len(split_name_chk) == 1
        assert split_name_chk[0].lower() == target_splitname.lower()

        best_task_idstr_list[jj] = str(cur_job_best_df['TASKID'].values[0])
        best_score_idx[jj] = score_ranking_func(
            cur_job_best_df[score_colname].values)
        best_score[jj] = cur_job_best_df[score_colname].values[
            best_score_idx[jj]]
        best_lap_idx[jj] = cur_job_best_df['LAP'].values[best_score_idx[jj]]
        if verbose:
            print("    best %s = %.4f at lap %9.3f of task %s" %
                  (score_colname, best_score[jj], best_lap_idx[jj],
                   best_task_idstr_list[jj]))

    # No tasks/jobs exist that satisfy target_query.
    # This can happen when runs haven't gone long enough yet.
    if np.allclose(best_score, default_score):
        return None

    best_job_idx = score_ranking_func(best_score)
    best_job_df = df.query(
        "JOB_PATH == '%s' and TASKID == '%s'" %
        (job_paths[best_job_idx], best_task_idstr_list[best_job_idx])).copy()
    best_job_df['SCORE_AT_BEST_SNAPSHOT'] = best_score[best_job_idx]
    best_job_df['LAP_AT_BEST_SNAPSHOT'] = best_lap_idx[best_job_idx]
    best_job_df['IS_BEFORE_BEST_SNAPSHOT'] = np.asarray(
        best_job_df['LAP'].values.copy() <= best_lap_idx[best_job_idx],
        dtype=np.int32)
    best_job_df['TASK_PATH_AT_BEST_SNAPSHOT'] = os.path.join(
        job_paths[best_job_idx], best_task_idstr_list[best_job_idx])
    best_job_df['IS_BEST_SNAPSHOT'] = np.asarray(
        best_job_df['LAP'].values.copy() == best_lap_idx[best_job_idx],
        dtype=np.int32)
    best_job_df['FRAC_PROGRESS'] = \
        1.0 * best_job_df['LAP'].values.copy() \
        / np.max(best_job_df['LAP'].values)
    return best_job_df
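
# Note (inferred from usage above): score_ranking_func is np.argmin for
# loss/error-rate columns and np.argmax for AUC-like columns, and
# fetch_default_score presumably returns a sentinel no real score beats,
# e.g. for argmin:
#   scores = np.array([np.inf, 0.12, 0.08, np.inf])  # inf = skipped job
#   np.argmin(scores)  # -> 2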
Example #15
def select_best_df_at_each_value_of_specific_vars(
        df,
        legend_name='Gibbs_LDA',
        keys=['N_STATES'],
        disp_keys=None,
        no_legend_keys=[],
        query="SPLIT_NAME == '$target_splitname' and LAP >= $query_min_lap",
        query_min_lap=5,
        target_splitname='VALID',
        score_colname='LOSS_X',
        score_ranking_func=np.argmin,
        **kwargs):
    ''' Produce dataframe of best runs at each value of specific variables.

    Args
    ----
    df : pandas DataFrame
        Each row represents a snapshot during training.
    legend_name : string
        Nickname of all runs provided.
    keys : list of strings
        Column names of specified variables used for best run selection.

    Returns
    -------
    best_df : pandas DataFrame
    '''
    if disp_keys is None:
        disp_keys = ['LAP_AT_BEST_SNAPSHOT', 'TASKID'] + keys
    query = query.replace("$query_min_lap", str(query_min_lap))
    query = query.replace("$target_splitname", str(target_splitname))
    pprint("Finding snapshots with %s of %s" %
           (score_ranking_func.__name__, score_colname))
    pprint("Among snapshots satisfying query: %s" % query)

    def expand_query_str_list(cur_list, new_vals):
        new_list = list()
        if len(cur_list) == 0:
            for new_q_str in new_vals:
                new_list.append(new_q_str)
        else:
            for q_str in cur_list:
                for new_q_str in new_vals:
                    new_list.append(q_str + " and " + new_q_str)
        return new_list

    query_str_list = list()
    pprint("Finding best task for each possible combo of these legend keys:")
    for key in keys:
        is_finite_mask = np.logical_not(pd.isnull(df[key].values))
        if np.sum(is_finite_mask) > 0:
            u_vals = np.unique(df[key].values[is_finite_mask]).tolist()
        else:
            u_vals = []
        if not np.all(is_finite_mask):
            u_vals += [np.nan]
        new_queries = list()
        for u_val in u_vals:
            if isinstance(u_val, str):
                new_query_str = "%s == '%s'" % (key, u_val)
            elif np.isfinite(u_val):
                new_query_str = "%s == %s" % (key, u_val)
            else:
                # NaN bucket: "col != col" matches only NaN entries
                new_query_str = "%s != %s" % (key, key)
            new_queries.append(new_query_str)
        if len(new_queries) == 1:
            if len(query_str_list) < 1:
                query_str_list.extend(new_queries)
            continue
        pprint("    %s: %s" % (key, ','.join(map(str, u_vals))))
        query_str_list = expand_query_str_list(query_str_list, new_queries)

    best_df_list = list()
    for query_str in query_str_list:
        best_job_df = make_best_job_df(df.query(query_str),
                                       target_query=query,
                                       score_colname=score_colname,
                                       score_ranking_func=score_ranking_func,
                                       target_splitname=target_splitname,
                                       **kwargs)

        if best_job_df is None:
            pprint("NO BEST TASK AVAILABLE FOR %s + %s" %
                   (legend_name, query_str))
            continue

        # _UNIQUE_LEGEND_NAME distinctly identifies each "best job",
        # like 'Gibbs_LDA K == 5'.
        # LEGEND_NAME may be simpler, with duplicates,
        # like 'Gibbs_LDA' for each of K in [5, 10, 20].

        cur_queries = query_str.split('and')
        cur_legend_name = legend_name
        cur_ulegend_name = legend_name
        for cur_query_str in cur_queries:
            is_bad = False
            for no_leg_key in no_legend_keys:
                if cur_query_str.count(no_leg_key) > 0:
                    is_bad = True
            cur_ulegend_name += " " + cur_query_str.strip()
            if not is_bad:
                cur_legend_name += " " + cur_query_str.strip()
        best_job_df['_UNIQUE_LEGEND_NAME'] = cur_ulegend_name
        best_job_df['LEGEND_NAME'] = cur_legend_name
        best_df_list.append(best_job_df)
    best_df = pd.concat(best_df_list)

    pprint("ON SPLIT %s:" % (target_splitname))
    q_df = best_df.query("IS_BEST_SNAPSHOT > 0 and SPLIT_NAME == '%s'" %
                         target_splitname)
    disp_df = q_df[[score_colname] + disp_keys]
    disp_df = disp_df.apply(pd.to_numeric, errors='ignore')
    pprint(
        disp_df.to_string(index=False,
                          header=True,
                          float_format=lambda x: ' %.3f' % float(x)))

    best_df.reset_index(inplace=True)
    best_df = simplify_best_df_and_make_unicode_friendly(best_df)
    best_df.reset_index(inplace=True)
    return best_df
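
# --- Added sketch (not from the original source) ---
# Toy illustration of the query cross-product built by expand_query_str_list
# above (simplified: the real code also handles NaNs and skips keys with a
# single unique value to keep legend names short).
toy_queries = []
for toy_key, toy_vals in [('N_STATES', [5, 10]), ('ALPHA', [1.1, 2.2])]:
    toy_new = ["%s == %s" % (toy_key, v) for v in toy_vals]
    toy_queries = toy_new if not toy_queries else [
        q + " and " + n for q in toy_queries for n in toy_new]
# toy_queries ->
#   ['N_STATES == 5 and ALPHA == 1.1', 'N_STATES == 5 and ALPHA == 2.2',
#    'N_STATES == 10 and ALPHA == 1.1', 'N_STATES == 10 and ALPHA == 2.2']
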
Example #16
def init_param_dict(dataset=None,
                    topics_KV=None,
                    w_CK=None,
                    n_states=None,
                    init_name=None,
                    init_name_topics='rand_docs',
                    init_name_w='regress_given_topics',
                    init_model_path=None,
                    max_n_docs=100000,
                    min_n_docs_per_label=10,
                    seed=0,
                    alpha=1.1,
                    tau=1.1,
                    lambda_w=.001,
                    verbose=True,
                    **kwargs):
    ''' Create initial param dict for slda optimization problem.

    Returns
    -------
    init_params_dict : dict, with fields
        topics_KV : 2D array, K x V
        w_CK : 2D array, C x K
    '''
    n_states = int(n_states)
    lambda_w = float(lambda_w)
    tau = float(tau)
    alpha = float(alpha)

    # Parse init_name
    # For backwards compat: init_name means same thing as init_name_topics
    if init_name is not None:
        init_name_topics = init_name
    del init_name

    if str(init_model_path).lower() != 'none':
        pprint('[init_params] Loading from init_model_path ...')

        if init_model_path.count('snapshot'):
            initfromdisk_param_dict = load_topic_model_param_dict(
                snapshot_path=init_model_path)
        else:
            if init_model_path.endswith(os.path.sep):
                init_model_path = os.path.join(init_model_path,
                                               'param_dict.dump')
            initfromdisk_param_dict = joblib.load(init_model_path)
        topics_KV = initfromdisk_param_dict['topics_KV']
        if 'w_CK' in initfromdisk_param_dict:
            w_CK = initfromdisk_param_dict['w_CK']

    if topics_KV is None or topics_KV.shape[0] < n_states:
        pprint('[init_params] Running init_topics_KV %s ...' %
               (init_name_topics))
        topics_KV = init_topics_KV(
            dataset=dataset,
            topics_KV=topics_KV,
            n_states=n_states,
            seed=seed,
            init_name=init_name_topics,
            alpha=alpha,
            tau=tau,
        )

    if w_CK is None or w_CK.shape[1] < n_states:
        pprint('[init_params] Running init_w_CK %s ...' % (init_name_w))
        if init_name_w.count('regress'):
            assert dataset['n_docs'] < 1e6  # don't want this too big

            pprint('[init_params] Regress Step 1/2: Extract pi_DK...')
            pi_DK = calc_nef_map_pi_DK(dataset,
                                       topics_KV=topics_KV,
                                       alpha=alpha,
                                       n_seconds_between_print=600)

            prefix = '[init_params] Regress Step 2/2:'
            w_CK = estimate_w_CK__given_pi_DK(
                dataset=dataset,
                pi_DK=pi_DK,
                lambda_w=lambda_w,
                prefix=prefix,
                verbose=verbose,
            )
        else:
            raise ValueError("Unsupported init_name_w: " + init_name_w)

    assert topics_KV is not None
    assert w_CK is not None
    assert topics_KV.shape[0] == n_states
    assert w_CK.shape[1] == n_states
    pprint('[init_params] Done. Created init_param_dict.')
    return dict(w_CK=w_CK, topics_KV=topics_KV)
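
# Usage sketch (hypothetical dataset dict; shapes follow the asserts above):
#   GP = init_param_dict(dataset=dataset, n_states=10,
#                        init_name_topics='rand_docs')
#   GP['topics_KV'].shape  # (10, V)
#   GP['w_CK'].shape       # (C, 10)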