def run():
    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # Load data
    print('Loading NT data ...')
    nt = pd.read_csv(os.path.join(DATAPATH, FILENAME), sep=',')
    mm = pd.read_csv(os.path.join(DATAPATH, GENEMAPFILE), sep='\t')

    # Extract lincs cols from the whole dataset
    nt = nt[['Sample'] + mm['gdc'].tolist()]

    # Map lincs gene names and sort genes alphabetically
    col_mapping = dict(zip(mm['gdc'], mm['symbol']))
    nt = nt.rename(columns=col_mapping)
    nt = nt[['Sample'] + sorted(nt.columns[1:].tolist())]

    # Shuffle and extract the target label
    nt = nt.sample(n=nt.shape[0], axis=0, replace=False,
                   random_state=SEED).reset_index(drop=True)
    nt['Sample'] = nt['Sample'].map(lambda s: s.split('-')[-1]).values
    nt.rename(columns={'Sample': 'y'}, inplace=True)

    print(nt['y'].value_counts())
    nt.to_csv(os.path.join(OUTDIR, f'{APP}_data'), sep='\t', index=False)
def run(args):
    print(args)
    datapath = args.datapath
    modeldir = args.modeldir
    tempdir = args.tempdir
    n_shuffles = args.n_shuffles
    # col_set = args.col_set
    col_set = ['C', 'F']

    # Create necessary dirs
    utils.make_dir(tempdir)  #  os.makedirs(tempdir, exist_ok=True)

    # Load data
    data = pd.read_csv(datapath, sep=None, engine='python')  # let pandas infer the delimiter
    # ydata = data.iloc[:, 0].values
    xdata = data.iloc[:, 1:]  # keep as a DataFrame (a .values array has no .columns)
    features = xdata.columns

    # # Scale data
    # scaler = StandardScaler()
    # xdata = scaler.fit_transform(xdata)
    # xdata = pd.DataFrame(xdata, columns=features)

    # Load trained keras model
    # (no need to compile the model for inference)
    model_name = 'keras_model.json'
    weights_name = 'keras_weights.h5'

    model_path = os.path.join(modeldir, model_name)
    weights_path = os.path.join(modeldir, weights_name)

    print('\nLoading model from ... {}'.format(model_path))
    with open(model_path, 'r') as json_file:
        model = json_file.read()
    keras_model = model_from_json(model)
    keras_model.name = 'trained_model'

    # Load weights into new model
    print('Loading weights from ... {}\n'.format(weights_path))
    keras_model.load_weights(weights_path)

    # Shuffle and predict
    # pred_df = infer_with_col_shuffle_multi(model=keras_model, xdata=xdata, col=col, n_shuffles=n_shuffles)
    pred_df = pfi_utils.shuf_and_pred_multi(model=keras_model,
                                            xdata=xdata,
                                            col_set=col_set,
                                            n_shuffles=n_shuffles)

    # ## ----------------------------------------------------------------------
    # pred_df = pd.DataFrame(index=range(len(xdata)), columns=range(n_shuffles))

    # for s in range(n_shuffles):
    #     # Execute infer code
    #     preds = infer_with_col_shuffle(model, xdata, col)
    #     pred_df.iloc[:, s] = preds
    # ## ----------------------------------------------------------------------

    # Write out results
    # pred_df.to_csv(os.path.join(tempdir, 'col.' + str(col) + '.csv'), sep='\t', index=False)
    pred_df.to_csv(os.path.join(tempdir,
                                'col.' + str('-'.join(col_set)) + '.csv'),
                   sep='\t',
                   index=False)
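
# `pfi_utils.shuf_and_pred_multi` is not shown in this snippet. Based on the
# commented-out single-column loop above, a minimal sketch of the idea might
# look like this (names and details are assumptions, not the actual pfi_utils
# API): permute the whole column set jointly `n_shuffles` times and collect
# the model predictions for each permutation.
def shuf_and_pred_multi_sketch(model, xdata, col_set, n_shuffles, seed=0):
    rng = np.random.RandomState(seed)
    pred_df = pd.DataFrame(index=range(len(xdata)), columns=range(n_shuffles))
    for s in range(n_shuffles):
        x = xdata.copy()
        idx = rng.permutation(len(x))
        x.loc[:, col_set] = x[col_set].to_numpy()[idx]  # shuffle the set jointly
        preds = model.predict(x.values)
        if preds.ndim > 1 and preds.shape[1] > 1:
            preds = np.argmax(preds, axis=1)  # softmax probs -> class labels
        pred_df.iloc[:, s] = preds
    return pred_df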
def run(args):
    # TODO: log out the args
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_ff_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    logger = set_logger(
        filename=os.path.join(OUTDIR, f'{APP}_main_logfile.log'))

    # ==========  Load data  ==========
    print('\nLoad TC data ...')

    # ---------- Load data ----------
    # y_enc = pd.read_csv(YENC_PATH, sep='\t')
    ## data = pd.read_csv(DATAPATH, sep='\t')
    ## xdata = data.iloc[:, 1:].copy()
    ## ydata = data.iloc[:, 0].copy()
    data_train = pd.read_csv(DATAPATH_TR, sep=',')
    data_val = pd.read_csv(DATAPATH_VL, sep=',')
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')

    mm = pd.read_csv('/vol/ml/apartin/Benchmarks/Data/Pilot1/lincs1000.tsv',
                     sep='\t')

    data_train = data_train[['case_id', 'cancer_type'] +
                            mm['gdc'].tolist()]  # Extract lincs from the whole dataset
    data_val = data_val[['case_id', 'cancer_type'] +
                        mm['gdc'].tolist()]  # Extract lincs from the whole dataset
    print(data_train.shape)
    print(data_val.shape)

    if args.bootstrap_cols > -1:
        ## xdata = xdata.sample(n=args.bootstrap_cols, axis=1, random_state=SEED)  # Take a subset of cols
        y_tmp = data_train.iloc[:, 0]
        x_tmp = data_train.iloc[:, 1:].sample(n=args.bootstrap_cols,
                                              axis=1,
                                              random_state=SEED)
        data_train = pd.concat([y_tmp, x_tmp], axis=1)
        data_val = data_val[data_train.columns]
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')
    ##features = xdata.columns

    ##print('data.shape', data.shape)
    ##print(data.iloc[:3, :4])

    ##print('\nxdata.shape', xdata.shape)
    ##print('np.unique(ydata)', np.unique(ydata))

    ##scaler = StandardScaler()
    ##xdata = scaler.fit_transform(xdata)
    ##xdata = pd.DataFrame(xdata, columns=features)

    ##xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2, random_state=SEED, shuffle=True, stratify=ydata)

    # ==========  RF classifier  ==========
    logger.info('RF classifier ...')
    logger.info('-----------------')

    # ---------- Get the data ----------
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # ---------- Train RF classifier ----------
    logger.info('Train RF Classifier ...')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    logger.info(
        f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10].values)
    print('pred', yvl_preds[:10])
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # TODO: finish this ...
    # df_conf = utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['type'].values,
    #                                       title=f'{APP}_confusion', savefig=True, img_name=f'{APP}_confusion')

    # ---------- Feature importance ----------
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    max_cols=max_cols,
                                    title='RF Classifier (FI using MDI)')
    rf_fi = utils.get_rf_fi(rf_model, columns=features)
    rf_fi.to_csv(os.path.join(OUTDIR, f'{APP}_rf_fi.csv'), index=False)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')

    # PFI
    logger.info('Compute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='RF Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='RF Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_rf')

    # ==========  NN classifier  ==========
    logger.info('                 ')
    logger.info('NN classifier ...')
    logger.info('-----------------')

    # ---------- Get the data ----------
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    n_classes = len(np.unique(ytr))
    ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # ---------- Train NN classifier ----------
    logger.info('Training NN Classifier...')
    keras_model = create_nn_classifier(n_features=xtr.shape[1],
                                       n_classes=n_classes)
    history = keras_model.fit(xtr,
                              ytr,
                              epochs=epoch,
                              batch_size=batch,
                              verbose=0)
    # utils.plot_keras_learning(history, figsize = (10, 8), savefig=True,
    #                           img_name=os.path.join(OUTDIR, 'learning_with_lr'))
    score = keras_model.evaluate(xvl, yvl,
                                 verbose=False)[-1]  # compute the val loss
    logger.info('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='macro')))

    # ---------- Feature importance ----------
    # PFI
    logger.info('Compute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=keras_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='NN Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='NN Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_nn')
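
# `pfi.PFI.compute_pfi` is defined in the external pfi package. The
# "PFI MDA: f1-score" plots above follow the standard mean-decrease-in-accuracy
# recipe; a minimal sketch of that recipe (illustrative only, assuming an
# sklearn-style `predict` that returns class labels) could be:
def pfi_mda_sketch(model, xvl, yvl, n_shuffles=20):
    baseline = f1_score(yvl, model.predict(xvl), average='micro')
    imp = {}
    for col in xvl.columns:
        drops = []
        for _ in range(n_shuffles):
            x = xvl.copy()
            x[col] = np.random.permutation(x[col].values)  # break the feature-target link
            drops.append(baseline - f1_score(yvl, model.predict(x), average='micro'))
        imp[col] = np.mean(drops)  # mean decrease of the f1-score
    return pd.Series(imp).sort_values(ascending=False)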
# Example #4
def run(args):
    print(args)
    datapath = args.datapath
    modeldir = args.modeldir
    tempdir = args.tempdir
    n_shuffles = args.n_shuffles
    corr_th = CORR_THRES

    # Create necessary dirs
    utils.make_dir(tempdir)  # os.makedirs(tempdir, exist_ok=True)  # python 3
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)   # python 3

    # =======  Load dataset  =======
    data = pd.read_csv(datapath, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()
    features = xdata.columns

    print('data.shape', data.shape)
    print(data.iloc[:3, :4])
    print('np.unique(ydata)', np.unique(ydata))

    # Scale data
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    # Split data
    if 'classification' in datapath.split(os.sep)[-1]:
        print('classification')
        xtr, xvl, ytr, yvl = train_test_split(xdata,
                                              ydata,
                                              test_size=0.2,
                                              random_state=SEED,
                                              stratify=ydata)
    elif 'regression' in datapath.split(os.sep)[-1]:
        print('regression')
        xtr, xvl, ytr, yvl = train_test_split(xdata,
                                              ydata,
                                              test_size=0.2,
                                              random_state=SEED)
    else:
        raise ValueError(
            "datapath filename must contain 'classification' or 'regression'")

    # =======  Load trained keras model  =======
    # (no need to compile the model for inference)
    model_name = 'keras_model.json'
    weights_name = 'keras_weights.h5'

    model_path = os.path.join(modeldir, model_name)
    weights_path = os.path.join(modeldir, weights_name)

    print(f'\nLoading model from ... {model_path}')
    with open(model_path, 'r') as json_file:
        model = json_file.read()
    keras_model = model_from_json(model)
    keras_model.name = 'trained_model'

    # Load weights into new model
    print('\nLoading weights from ... {}'.format(weights_path))
    keras_model.load_weights(weights_path)

    # =======  Feature importance  =======
    # Compute correlated features subgroups
    col_sets = pfi_utils.get_fea_groups(xvl, th=corr_th, toplot=False)

    if len(col_sets) == 0:
        col_sets = [[c] for c in xdata.columns.tolist()]
    else:
        # Unique cols that appear in the correlated col sets
        cols_unq_req = {col for col_set in col_sets for col in col_set}
        cols_other = set(xdata.columns.tolist()).difference(cols_unq_req)
        # Sort sets by length (longest first), then add the remaining cols as singletons
        col_sets = sorted(col_sets, key=len, reverse=True)
        col_sets.extend([[c] for c in cols_other])

    # Create df that stores feature importance
    fi_var = pd.DataFrame(index=range(len(col_sets)), columns=['cols', 'imp'])
    fi_score = pd.DataFrame(index=range(len(col_sets)),
                            columns=['cols', 'imp', 'std'])

    # Iter over col sets (col set per node)
    print('Iterate over col sets to compute importance ...')
    for i, col_set in enumerate(col_sets):
        # pred_df = infer_with_col_shuffle_multi(model=keras_model, xdata=xdata, col=col, n_shuffles=n_shuffles)
        pred_df = pfi_utils.shuf_and_pred_multi(model=keras_model,
                                                xdata=xdata,
                                                col_set=col_set,
                                                n_shuffles=n_shuffles)

        # ## ----------------------------------------------------------------------
        # pred_df = pd.DataFrame(index=range(xdata.shape[0]), columns=range(n_shuffles))
        # for s in range(n_shuffles):
        #     # Execute infer code
        #     # TODO: still need to decide regarding the output ...
        #     preds = shuf_and_pred(model=keras_model, xdata=xdata, col=col)
        #     pred_df.iloc[:, s] = preds
        # ## ----------------------------------------------------------------------

        pred_df.to_csv(os.path.join(tempdir,
                                    'col.' + str('-'.join(col_set)) + '.csv'),
                       sep='\t',
                       index=False)

        fi_var.loc[i, 'cols'] = ','.join(col_set)  # col
        fi_var.loc[i, 'imp'] = pred_df.var(axis=1).mean()

    fi_var['imp'] = fi_var['imp'] / fi_var['imp'].sum()
    fi_var = fi_var.sort_values('imp', ascending=False).reset_index(drop=True)
    fi_var.to_csv(os.path.join(OUTDIR, 'fi_var.csv'), index=False)
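
# `pfi_utils.get_fea_groups` is external; judging by how it is used (a
# correlation threshold `th` that yields sets of correlated columns), a rough
# sketch of the grouping idea might be (assumed semantics, not the actual
# implementation):
def get_fea_groups_sketch(x, th=0.9):
    cor = x.corr().abs()
    np.fill_diagonal(cor.values, 0.0)  # ignore self-correlation
    col_sets, seen = [], set()
    for col in cor.columns:
        if col in seen:
            continue
        group = [col] + cor.index[cor[col] >= th].tolist()
        group = [c for c in group if c not in seen]
        if len(group) > 1:  # keep only sets with at least one correlated partner
            col_sets.append(group)
            seen.update(group)
    return col_sets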
# Example #5
def run(args):
    print(args)
    n_samples = args.n_samples
    n_classes = args.n_classes
    n_features = args.n_features
    n_informative = args.n_informative

    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # Build classification dataset
    print('\n======= Generate classification data =======')
    xdata, ydata = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=N_REDUNDANT,  # generated as random linear combinations of the informative features
        n_repeated=N_REPEATED,  # duplicates drawn randomly from the informative and redundant features
        shift=None,
        scale=None,
        random_state=SEED,
        shuffle=False)

    ydata = pd.DataFrame(ydata).rename(columns={0: 'y'})
    # xdata = pd.DataFrame(xdata)
    xdata = pd.DataFrame(
        xdata, columns=list(string.ascii_uppercase[:xdata.shape[1]]))
    data = pd.concat([ydata, xdata], axis=1)
    data = data.sample(data.shape[0], axis=0, replace=False, random_state=SEED)

    features = xdata.columns
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=TEST_SIZE,
                                          random_state=SEED,
                                          shuffle=True,
                                          stratify=ydata)
    data_train = pd.concat([ytr, xtr], axis=1)
    data_val = pd.concat([yvl, xvl], axis=1)

    # Sort val data by class label
    data_val = data_val.sort_values('y', ascending=True).reset_index(drop=True)

    print('data.shape      ', data.shape)
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(data.iloc[:3, :4])

    if (N_REDUNDANT == 0) and (N_REPEATED == 0):
        # data.to_csv(os.path.join(OUTDIR, 'data_classification'), sep='\t', float_format=np.float16, index=False)
        data_train.to_csv(os.path.join(OUTDIR, 'data_classification_train'),
                          sep='\t',
                          float_format=np.float16,
                          index=False)
        data_val.to_csv(os.path.join(OUTDIR, 'data_classification_val'),
                        sep='\t',
                        float_format=np.float16,
                        index=False)
    else:
        # data.to_csv(os.path.join(OUTDIR, 'data_classification_corr'), sep='\t', float_format=np.float16, index=False)
        data_train.to_csv(os.path.join(OUTDIR,
                                       'data_classification_corr_train'),
                          sep='\t',
                          float_format=np.float16,
                          index=False)
        data_val.to_csv(os.path.join(OUTDIR, 'data_classification_corr_val'),
                        sep='\t',
                        float_format=np.float16,
                        index=False)

    # Build regression dataset
    print('\n======= Generate regression data =======')
    xdata, ydata = make_regression(n_samples=n_samples,
                                   n_targets=1,
                                   n_features=n_features,
                                   n_informative=n_informative,
                                   bias=0.0,
                                   effective_rank=None,
                                   tail_strength=0.5,
                                   noise=0.0,
                                   coef=False,
                                   random_state=SEED,
                                   shuffle=True)

    ydata = pd.DataFrame(ydata).rename(columns={0: 'y'})
    # xdata = pd.DataFrame(xdata)
    xdata = pd.DataFrame(
        xdata, columns=list(string.ascii_uppercase[:xdata.shape[1]]))
    data = pd.concat([ydata, xdata], axis=1)
    data = data.sample(data.shape[0], replace=False, random_state=SEED)

    features = xdata.columns
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=0.2,
                                          random_state=SEED,
                                          shuffle=True)
    data_train = pd.concat([ytr, xtr], axis=1)
    data_val = pd.concat([yvl, xvl], axis=1)

    print('data.shape      ', data.shape)
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(data.iloc[:3, :4])

    if (N_REDUNDANT == 0) and (N_REPEATED == 0):
        # data.to_csv(os.path.join(OUTDIR, 'data_regression'), sep='\t', float_format=np.float16, index=False)
        data_train.to_csv(os.path.join(OUTDIR, 'data_regression_train'),
                          sep='\t',
                          float_format=np.float16,
                          index=False)
        data_val.to_csv(os.path.join(OUTDIR, 'data_regression_val'),
                        sep='\t',
                        float_format=np.float16,
                        index=False)
    else:
        # data.to_csv(os.path.join(OUTDIR, 'data_regression_corr'), sep='\t', float_format=np.float16, index=False)
        data_train.to_csv(os.path.join(OUTDIR, 'data_regression_corr_train'),
                          sep='\t',
                          float_format=np.float16,
                          index=False)
        data_val.to_csv(os.path.join(OUTDIR, 'data_regression_corr_val'),
                        sep='\t',
                        float_format=np.float16,
                        index=False)
# Example #6
def run(args):
    # TODO: log out the args
    print(f'\n{args}')
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols_plot = args.max_cols_plot

    # Create necessary dirs
    # dataset = DATAPATH_TR.split('_')[-1]  # TODO: clean/fix
    OUTDIR = os.path.join(file_path, f'results_{APP}_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    logger = set_logger(
        filename=os.path.join(OUTDIR, f'{APP}_main_logfile.log'))

    # ==========  Load data  ==========
    print('\n======= Load TC data =======')
    y_enc = pd.read_csv(YENC_PATH, sep='\t')
    data_train = pd.read_csv(DATAPATH_TR, sep='\t')
    data_val = pd.read_csv(DATAPATH_VL, sep='\t')
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')

    if args.bootstrap_cols > -1:
        y_tmp = data_train.iloc[:, 0]
        x_tmp = data_train.iloc[:, 1:].sample(n=args.bootstrap_cols,
                                              axis=1,
                                              random_state=SEED)
        data_train = pd.concat([y_tmp, x_tmp], axis=1)
        data_val = data_val[data_train.columns]
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')

    # Compute corr matrix
    # cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    # fig = utils.plot_cor_heatmap(cor)
    # fig.savefig(os.path.join(OUTDIR, f'{APP}_feature_corr.png'), bbox_inches='tight')

    # # k-fold scheme
    # kfolds = 5
    # if kfolds == 1:
    #     skf = StratifiedShuffleSplit(n_splits=kfolds, test_size=0.2, random_state=SEED)
    # else:
    #     skf = StratifiedKFold(n_splits=kfolds, shuffle=False, random_state=SEED)

    # # Run k-fold CV
    # best_model = None
    # best_model_id = 0
    # best_score = 0
    # df_scores = pd.DataFrame(index=range(kfolds), columns=['kfold', 'f1_micro', 'f1_macro'])

    # for f, (train_idx, val_idx) in enumerate(skf.split(xdata, ydata)):
    #     print(f'\nFold {f + 1}/{kfolds} ...\n')

    #     print('train_idx', train_idx)
    #     print('val_idx', val_idx)

    #     # Split data
    #     xtr, xvl = xdata[train_idx], xdata[val_idx]
    #     ytr, yvl = ydata[train_idx], ydata[val_idx]

    #     rf_model = RandomForestClassifier(n_estimators=150, max_features='sqrt', random_state=SEED)  # min_samples_split=3,
    #     rf_model.fit(xtr, ytr)
    #     score = rf_model.score(xvl, yvl)
    #     print(f'Prediction score (mean accuracy): {score:.4f}')

    #     yvl_preds = rf_model.predict(xvl)
    #     print('true', yvl[:7])
    #     print('pred', yvl_preds[:7])
    #     print(f"f1_score micro: {f1_score(y_true=yvl, y_pred=yvl_preds, average='micro'):.3f}")
    #     print(f"f1_score macro: {f1_score(y_true=yvl, y_pred=yvl_preds, average='macro'):.3f}")
    #     tmp_df = pd.DataFrame({'yvl': yvl, 'yvl_preds': yvl_preds})
    #     tmp_df.to_csv(os.path.join(OUTDIR, f'preds_cv_{f}.csv'), index=False)

    #     # Plot feature importance
    #     indices, fig = utils.plot_rf_fi(rf_model, n_features_toplot=15, title='FI RF Classifier')
    #     fi = utils.get_rf_fi(rf_model)
    #     fi.to_csv(os.path.join(OUTDIR, 'rf_classifier_fi.csv'), index=False)
    #     fig.savefig(os.path.join(OUTDIR, 'rf_classifier_fi.png'), bbox_inches='tight')

    #     # Compute scores
    #     df_scores.loc[f, 'kfold'] = f + 1
    #     df_scores.loc[f, 'f1_micro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')
    #     df_scores.loc[f, 'f1_macro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')

    #     # Save best model
    #     ## if val_scores.iloc[f, 0] < best_score:
    #     if best_score < df_scores.loc[f, 'f1_micro']:
    #         best_score = df_scores.loc[f, 'f1_micro']
    #         best_model = rf_model
    #         best_model_id = f

    # print(df_scores)
    # model = best_model

    # ==========  RF classifier  ==========
    logger.info('------- Data for RF Classifier -------')
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy().values
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy().values
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # ---------- Train RF classifier ----------
    logger.info('------- Train RF Classifier -------')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      min_samples_leaf=5,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    logger.info(
        f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    #print('true', yvl[:10].values)
    print('true', yvl[:10])
    print('pred', yvl_preds[:10])
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    utils.plot_confusion_matrix(y_true=yvl,
                                y_pred=yvl_preds,
                                labels=y_enc['label'].values,
                                title=f'{APP}_confusion_rf',
                                savefig=True,
                                img_name=os.path.join(
                                    OUTDIR, f'{APP}_confusion_rf.png'))

    # ---------- MDI and PFI from RF ----------
    print('\n------- MDI and PFI from RF classifier -------')
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    max_cols_plot=max_cols_plot,
                                    title='RF Classifier (FI using MDI)',
                                    errorbars=False,
                                    plot_direction='v',
                                    color='darkorange')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')
    rf_fi = utils.get_rf_fi(rf_model, columns=features)
    rf_fi.to_csv(os.path.join(OUTDIR, f'{APP}_rf_fi.csv'), index=False)

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     y_enc=y_enc,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols_plot=max_cols_plot,
                             title='RF Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_score_fi(max_cols_plot=max_cols_plot,
                               title='RF Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_fimap(figsize=(20, 7),
                            n_top_cols=10,
                            title='RF PFI Map',
                            drop_correlated=True)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_map.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_rf')

    # ==========  NN classifier  ==========
    logger.info('                 ')
    logger.info('------- Data for NN Classifier -------')
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    n_classes = len(np.unique(ytr))
    ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # ---------- Train NN classifier ----------
    logger.info('------- Train NN Classifier -------')
    keras_model = create_nn_classifier(n_features=xtr.shape[1],
                                       n_classes=n_classes)

    # callback_list = [EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0,
    #                                mode='auto', baseline=None, restore_best_weights=True)]
    callback_list = [
        ReduceLROnPlateau(monitor='val_loss',
                          factor=0.1,
                          patience=10,
                          verbose=1,
                          mode='auto',
                          min_delta=0.0001,
                          cooldown=0,
                          min_lr=0),
        ModelCheckpoint(filepath=os.path.join(OUTDIR, f'{APP}_nn_model'),
                        monitor='val_loss',
                        verbose=0,
                        save_best_only=True,
                        save_weights_only=False,
                        mode='auto',
                        period=1)
    ]

    history = keras_model.fit(xtr,
                              ytr,
                              validation_data=(xvl, yvl),
                              epochs=epoch,
                              batch_size=batch,
                              verbose=1,
                              callbacks=callback_list)
    # utils.plot_keras_learning(history, figsize = (10, 8), savefig=True,
    #                           img_name=os.path.join(OUTDIR, 'learning_with_lr'))

    score = keras_model.evaluate(xvl, yvl,
                                 verbose=False)[-1]  # compute the val loss
    logger.info('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='macro')))

    # Reshape target (required for confusion matrix and PFI)
    if yvl_preds.ndim > 1 and yvl_preds.shape[1] > 1:  # classification: probs -> class labels
        yvl_preds = np.argmax(yvl_preds, axis=1)
    if yvl.ndim > 1 and yvl.shape[1] > 1:  # one-hot targets -> class labels
        yvl = np.argmax(yvl, axis=1)

    utils.plot_confusion_matrix(y_true=yvl,
                                y_pred=yvl_preds,
                                labels=y_enc['label'].values,
                                title=f'{APP}_confusion_nn',
                                savefig=True,
                                img_name=os.path.join(
                                    OUTDIR, f'{APP}_confusion_nn.png'))

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=keras_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     y_enc=y_enc,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols_plot=max_cols_plot,
                             title='NN Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_var.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_score_fi(max_cols_plot=max_cols_plot,
                               title='NN Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_score.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_fimap(figsize=(20, 7),
                            n_top_cols=10,
                            title='NN PFI Map',
                            drop_correlated=True)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_map.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_nn')
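
# `set_logger` is defined elsewhere in the project. A minimal sketch that
# matches how it is used above (returns a logger whose .info() calls land in
# the given file; echoing to the console is an assumption) could be:
import logging

def set_logger_sketch(filename):
    logger = logging.getLogger('pfi_main')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(filename)
    fh.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logger.addHandler(fh)
    logger.addHandler(logging.StreamHandler())  # also echo to the console
    return logger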
# Example #7
def run():
    # print(args)
    n_shuffles = 20
    corr_th = 1

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_cor{corr_th}_runtime')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # ==========  RF classifier  ==========
    print('\nLoad TC data ...')

    # ---------- Load data ----------
    data = pd.read_csv(DATAPATH, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()
    features = xdata.columns

    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2, random_state=SEED, shuffle=True, stratify=ydata)
    print('\nxtr.shape', xtr.shape)
    print('xvl.shape', xvl.shape)

    # ---------- Feature importance from RF and PFI ----------
    print('\nCompute PFI ...')
    n_samples = np.linspace(start=int(xvl.shape[0]/4), stop=xvl.shape[0], num=4, dtype=int) 
    n_cols = np.linspace(start=int(xvl.shape[1]/4), stop=xvl.shape[1], num=4, dtype=int)
    print(n_samples)
    print(n_cols)

    tt = pd.DataFrame(index=range(len(n_samples) * len(n_cols)),
                      columns=['n_samples', 'n_cols', 'time (sec)', 'time (min)'])

    t_run = time.time()
    cnt = 0
    for i, s in enumerate(n_samples):
        for j, c in enumerate(n_cols):
            print(f'(n_samples, n_cols): ({s}, {c})')
            xtr_ = xtr.iloc[:, :c]
            xvl_ = xvl.iloc[:s, :c]
            yvl_ = yvl[:s]
            # print('xtr_.shape', xtr_.shape)
            # print('xvl_.shape', xvl_.shape)
            # print('yvl_.shape', yvl_.shape)
            
            rf_model = RandomForestClassifier(n_estimators=150, max_features='sqrt', random_state=SEED)
            rf_model.fit(xtr_, ytr)

            fi_obj = pfi.PFI(model=rf_model, xdata=xvl_, ydata=yvl_, n_shuffles=n_shuffles, outdir=OUTDIR)
            fi_obj.gen_col_sets(th=corr_th, toplot=False, verbose=False)
            
            t0 = time.time()
            fi_obj.compute_pfi(ml_type='c', verbose=False)
            t = time.time()-t0
            tt.loc[cnt, ['n_samples', 'n_cols', 'time (sec)', 'time (min)']] = np.array([s, c, t, t/60])
            cnt += 1

    tt.to_csv(os.path.join(OUTDIR, 'tt.csv'), index=False)
    print(f'\nTotal run time:  {(time.time()-t_run)/60:.3f} mins')
# Example #8
def run(args):
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # ==========  RF classifier  ==========
    print('\nLoad NT data ...')

    # ---------- Load data ----------
    data = pd.read_csv(DATAPATH, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()

    if args.bootstrap_cols > -1:
        xdata = xdata.sample(n=args.bootstrap_cols, axis=1,
                             random_state=SEED)  # Take a subset of cols
    features = xdata.columns

    print('data.shape', data.shape)
    print(data.iloc[:3, :4])

    print('\nxdata.shape', xdata.shape)
    print('np.unique(ydata)', np.unique(ydata))

    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=0.2,
                                          random_state=SEED,
                                          shuffle=True,
                                          stratify=ydata)

    # # k-fold scheme
    # kfolds = 5
    # if kfolds == 1:
    #     skf = StratifiedShuffleSplit(n_splits=kfolds, test_size=0.2, random_state=SEED)
    # else:
    #     skf = StratifiedKFold(n_splits=kfolds, shuffle=False, random_state=SEED)

    # # Run k-fold CV
    # best_model = None
    # best_model_id = 0
    # best_score = 0
    # df_scores = pd.DataFrame(index=range(kfolds), columns=['kfold', 'f1_micro', 'f1_macro'])

    # for f, (train_idx, val_idx) in enumerate(skf.split(xdata, ydata)):
    #     print(f'\nFold {f + 1}/{kfolds} ...\n')

    #     print('train_idx', train_idx)
    #     print('val_idx', val_idx)

    #     # Split data
    #     xtr, xvl = xdata[train_idx], xdata[val_idx]
    #     ytr, yvl = ydata[train_idx], ydata[val_idx]

    #     rf_model = RandomForestClassifier(n_estimators=150, max_features='sqrt', random_state=SEED)  # min_samples_split=3,
    #     rf_model.fit(xtr, ytr)
    #     score = rf_model.score(xvl, yvl)
    #     print(f'Prediction score (mean accuracy): {score:.4f}')

    #     yvl_preds = rf_model.predict(xvl)
    #     print('true', yvl[:7])
    #     print('pred', yvl_preds[:7])
    #     print(f"f1_score micro: {f1_score(y_true=yvl, y_pred=yvl_preds, average='micro'):.3f}")
    #     print(f"f1_score macro: {f1_score(y_true=yvl, y_pred=yvl_preds, average='macro'):.3f}")
    #     tmp_df = pd.DataFrame({'yvl': yvl, 'yvl_preds': yvl_preds})
    #     tmp_df.to_csv(os.path.join(OUTDIR, f'preds_cv_{f}.csv'), index=False)

    #     # Plot feature importance
    #     indices, fig = utils.plot_rf_fi(rf_model, n_features_toplot=15, title='FI RF Classifier')
    #     fi = utils.get_rf_fi(rf_model)
    #     fi.to_csv(os.path.join(OUTDIR, 'rf_classifier_fi.csv'), index=False)
    #     fig.savefig(os.path.join(OUTDIR, 'rf_classifier_fi.png'), bbox_inches='tight')

    #     # Compute scores
    #     df_scores.loc[f, 'kfold'] = f + 1
    #     df_scores.loc[f, 'f1_micro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')
    #     df_scores.loc[f, 'f1_macro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')

    #     # Save best model
    #     ## if val_scores.iloc[f, 0] < best_score:
    #     if best_score < df_scores.loc[f, 'f1_micro']:
    #         best_score = df_scores.loc[f, 'f1_micro']
    #         best_model = rf_model
    #         best_model_id = f

    # print(df_scores)
    # model = best_model

    # ---------- Train classifier ----------
    print('\nTrain RF Classifier ...')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    print(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10].values)
    print('pred', yvl_preds[:10])
    print('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    print('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # TODO: finish this ...
    # df_conf = utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['type'].values,
    #                                       title=f'{APP}_confusion', savefig=True, img_name=f'{APP}_confusion')

    # Compute corr matrix
    # cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    # fig = utils.plot_cor_heatmap(cor)
    # fig.savefig(os.path.join(OUTDIR, f'{APP}_feature_corr.png'), bbox_inches='tight')

    # ---------- Feature importance from RF and PFI ----------
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    max_cols=max_cols,
                                    title='RF Classifier (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')

    # PFI
    print('\nCompute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c')
    # logger.info(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='RF Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='RF Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=APP)
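
# `utils.get_rf_fi` is not shown; given that its output is saved as a feature
# importance table, a plausible sketch (column names are assumptions) is:
def get_rf_fi_sketch(rf_model, columns):
    fi = pd.DataFrame({'feature': columns,
                       'importance': rf_model.feature_importances_})
    return fi.sort_values('importance', ascending=False).reset_index(drop=True)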
# Example #9
def run(args):
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # ==========  Load classification data  ==========
    print('\n======== Load classification data ========')
    data_train = pd.read_csv(DATAPATH_CLASSIFICATION_TRAIN, sep='\t')
    data_val = pd.read_csv(DATAPATH_CLASSIFICATION_VAL, sep='\t')
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(f'\ndata_train:\n{data_train.iloc[:3, :4]}')
    print(f'\ndata_val:\n{data_val.iloc[:3, :4]}')

    # ==========  RF classifier  ==========
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')

    # Compute corr matrix
    cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    fig = utils.plot_cor_heatmap(cor)
    fig.savefig(os.path.join(OUTDIR, 'feature_corr_classification.png'),
                bbox_inches='tight')

    # ---------- Train classifier ----------
    print('\n------- Train RF Classifier -------')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    print(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:5].values)
    print('pred', yvl_preds[:5])
    print('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    print('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    yvl_preds_p = rf_model.predict_proba(xvl)
    print(f'yvl_preds_p:\n{yvl_preds_p[:5]}')

    utils.plot_confusion_matrix(y_true=yvl,
                                y_pred=yvl_preds,
                                labels=yvl.unique(),
                                title='RF Classifier (Confusion)',
                                savefig=True,
                                img_name=os.path.join(
                                    OUTDIR, 'rf_classifier_confusion.png'))

    # ---------- MDI and PFI from RF ----------
    print('\n------- MDI and PFI from RF classifier -------')
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    title='RF Classifier (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_fi.png'),
                bbox_inches='tight')

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=False)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(title='RF Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_var.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_score_fi(title='RF Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_score.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_score_fi_p(title='RF Classifier (PFI MDA: p-score)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_score_p.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name='rf_classifier')

    # ==========  NN classifier  ==========
    # print('\nLoad classification data ...')

    # ---------- Load data ----------
    # data = pd.read_csv(DATAPATH_CLASSIFICATION, sep='\t')
    # xdata = data.iloc[:, 1:].copy()
    # ydata = data.iloc[:, 0].copy()
    # features = xdata.columns

    # print('data.shape', data.shape)
    # print(data.iloc[:3, :4])

    # print('\nxdata.shape', xdata.shape)
    # print('np.unique(ydata)', np.unique(ydata))

    # n_classes = len(np.unique(ydata))
    # ydata = keras.utils.to_categorical(ydata, num_classes=n_classes)

    # scaler = StandardScaler()
    # xdata = scaler.fit_transform(xdata)
    # xdata = pd.DataFrame(xdata, columns=features)

    # xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2, random_state=SEED, shuffle=True, stratify=ydata)

    # n_classes = len(np.unique(ydata))
    # ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    # yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # print('\nTrain NN Classifier ...')
    # keras_model = create_nn_classifier(n_features=xtr.shape[1], n_classes=n_classes)
    # history = keras_model.fit(xtr, ytr, epochs=epoch, batch_size=batch, verbose=0)
    # score = keras_model.evaluate(xvl, yvl, verbose=False)[-1]  # compute the val loss
    # print(f'Prediction score (val loss): {score:.4f}')

    # yvl_preds = keras_model.predict(xvl)
    # print('true', np.argmax(yvl[:10], axis=1))
    # print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    # print('f1_score micro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='micro')))
    # print('f1_score macro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='macro')))

    # # ---------- Feature importance from RF and PFI ----------
    # # PFI
    # print('\nCompute PFI (NN classifier) ...')
    # t0 = time.time()
    # fi_obj = pfi.PFI(model=keras_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles)
    # fi_obj.gen_col_sets(th=corr_th, toplot=False)
    # fi_obj.compute_pfi(ml_type='c', verbose=False)
    # print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # # Plot and save PFI
    # fig = fi_obj.plot_var_fi(title='NN Classifier (PFI var)', ylabel='Importance (relative)')
    # fig.savefig(os.path.join(OUTDIR, 'nn_classifier_pfi_var.png'), bbox_inches='tight')
    # fig = fi_obj.plot_score_fi(title='NN Classifier (PFI MDA: f1-score)', ylabel='Importance (score decrease)')
    # fig.savefig(os.path.join(OUTDIR, 'nn_classifier_pfi_score.png'), bbox_inches='tight')

    # # Dump resutls
    # fi_obj.dump(path=OUTDIR, name='nn_classifier')

    # ==========  Load regression data  ==========
    print('\n======== Load regression data ========')

    data_train = pd.read_csv(DATAPATH_REGRESSION_TRAIN, sep='\t')
    data_val = pd.read_csv(DATAPATH_REGRESSION_VAL, sep='\t')
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(f'\ndata_train:\n{data_train.iloc[:3, :4]}')
    print(f'\ndata_val:\n{data_val.iloc[:3, :4]}')

    # ==========  RF regressor  ==========
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')

    # Compute corr matrix
    cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    fig = utils.plot_cor_heatmap(cor)
    fig.savefig(os.path.join(OUTDIR, 'feature_corr_regression.png'),
                bbox_inches='tight')

    # ---------- Train regressor ----------
    print('\n------- Train RF Regressor -------')
    rf_model = RandomForestRegressor(n_estimators=150,
                                     min_samples_leaf=5,
                                     max_features='sqrt',
                                     random_state=SEED)
    rf_model.fit(xtr, ytr)
    score = rf_model.score(xvl, yvl)
    print(f'Prediction score (r_square): {score:.4f}')

    # ---------- Feature importance from RF and PFI ----------
    print('\n------- MDI and PFI from RF regressor -------')
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    title='RF Regressor (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_fi.png'),
                bbox_inches='tight')

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='r', verbose=False)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(title='RF Regressor (PFI var)')
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(title='RF Regressor (PFI MDA)')
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name='rf_regressor')
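
# `utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)` is external; from
# its signature and its use with the correlation heatmap, a minimal sketch is:
def compute_cor_mat_sketch(x, zero_diag=True, decimals=5):
    cor = x.corr().round(decimals)
    if zero_diag:
        np.fill_diagonal(cor.values, 0.0)  # blank the diagonal for the heatmap
    return cor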
# Example #10
def run():
    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # Load data
    dataset = 'raw'
    df_rna, meta = utils.load_lincs1000(dataset=dataset, sources=['gdc'])

    # Specify col name of the target variable (cancer type)
    target_col_name = 'ctype'

    # Drop small classes
    min_class_size = 300
    df_rna, df_rna_small = utils.drop_samples_on_class_count(df=df_rna, y=meta[target_col_name],
                                                             min_class_size=min_class_size)
    df_rna, meta = utils.update_df_and_meta(df_rna, meta, on='Sample')
    print(f'\n{meta[target_col_name].value_counts()}')    

    # Balance classes
    class_size = min_class_size
    df_rna, y_out, dropped_classes = utils.balance_df(df=df_rna, y=meta[target_col_name],
                                                      class_size=class_size, seed=SEED)
    df_rna, meta = utils.update_df_and_meta(df_rna, meta, on='Sample')
    print(f'\n{meta[target_col_name].value_counts()}')

    print(f'\ndf_rna.shape {df_rna.shape}')
    print(f'meta.shape   {meta.shape}')

    # Create the class `other`
    # df_other = df_rna_small.sample(min_class_size, random_state=SEED)
    # df_rna = pd.concat([df_rna, df_other], axis=0)
    # df_rna, meta = utils.update_df_and_meta(df_rna, meta, on='Sample')
    # print(f'df_rna.shape {df_rna.shape}')
    # print(meta[target_col_name].value_counts())

    # Encode target variable
    ydata = meta['ctype'].values
    y_enc = LabelEncoder()
    ydata = y_enc.fit_transform(ydata)
    y_enc = pd.DataFrame(data={'label': np.arange(0, len(y_enc.classes_)), 'type': y_enc.classes_})
    y_enc.to_csv(os.path.join(OUTDIR, f'{APP}_y_enc'), sep='\t', index=False)

    # Permute data
    xdata = df_rna.iloc[:, 1:].copy()
    shuf_idx = np.random.permutation(xdata.shape[0])
    xdata = xdata.iloc[shuf_idx].reset_index(drop=True)
    ydata = pd.Series(ydata[shuf_idx], name='y')
    features = xdata.columns

    # Drop low var cols
    xdata, idx = utils.drop_low_var_cols(xdata, verbose=True)
    features = xdata.columns    

    # Split train/val
    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2, random_state=SEED, shuffle=True, stratify=ydata)
    # print(xtr.index[:5])
    # print(ytr.index[:5])
    xtr, ytr = xtr.reset_index(drop=True), ytr.reset_index(drop=True)
    xvl, yvl = xvl.reset_index(drop=True), yvl.reset_index(drop=True)

    # Scale data
    scaler = StandardScaler()
    xtr = scaler.fit_transform(xtr)
    xvl = scaler.transform(xvl)
    xtr = pd.DataFrame(xtr, columns=features)
    xvl = pd.DataFrame(xvl, columns=features)
    # print('xtr.var(axis=0).mean()', xtr.var(axis=0).mean())
    # print('xvl.var(axis=0).mean()', xvl.var(axis=0).mean())

    # Concat
    # data = pd.concat([pd.DataFrame(ydata), xdata], axis=1)
    data_train = pd.concat([pd.DataFrame(ytr), xtr], axis=1)
    data_val = pd.concat([pd.DataFrame(yvl), xvl], axis=1)
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')

    # Save
    # data.to_csv(os.path.join(OUTDIR, f'{APP}_data'), sep='\t', index=False)
    data_train.to_csv(os.path.join(OUTDIR, f'{APP}_data_train_{dataset}'), sep='\t', index=False)
    data_val.to_csv(os.path.join(OUTDIR, f'{APP}_data_val_{dataset}'), sep='\t', index=False)
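
# `utils.drop_low_var_cols` is external; based on its name and its
# (dataframe, index) return value, a rough sketch might be (the variance
# threshold is an assumption):
def drop_low_var_cols_sketch(x, th=1e-8, verbose=False):
    keep = x.var(axis=0) > th
    if verbose:
        print(f'Dropped {int((~keep).sum())} low-variance cols')
    return x.loc[:, keep], np.where(keep.values)[0]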
# Example #11
def run(args):
    print(args)
    epochs = args.epochs
    batch = args.batch

    # ==========  NN classifier  ==========
    print('\nLoading classification dataset...')

    # Load data
    data = pd.read_csv(DATAPATH_CLASSIFICATION, sep='\t')
    print('data.shape', data.shape)
    print(data.iloc[:3, :5])

    xdata = data.iloc[:, 1:].values
    ydata = data.iloc[:, 0].values
    print('np.unique(ydata)', np.unique(ydata))

    n_classes = len(np.unique(ydata))
    ydata = keras.utils.to_categorical(ydata, num_classes=n_classes)

    # Scale data
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)

    # Split data
    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=0.2,
                                          random_state=SEED,
                                          shuffle=True,
                                          stratify=ydata)

    print('\nTrain NN Classifier ...')
    keras_model = create_nn_classifier(n_features=xtr.shape[1],
                                       n_classes=n_classes)
    history = keras_model.fit(xtr,
                              ytr,
                              epochs=epochs,
                              batch_size=batch,
                              verbose=0)
    score = keras_model.evaluate(xvl, yvl,
                                 verbose=False)[-1]  # compute the val loss
    print('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    print('f1_score micro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='micro')))
    print('f1_score macro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='macro')))

    # Create dir to save the model
    MODELDIR = os.path.join(file_path, 'keras_model_classifier')
    utils.make_dir(MODELDIR)

    # Save the model architecture to JSON
    print('\nSave keras model (classifier) ...')
    model_json = keras_model.to_json()
    model_path = os.path.join(MODELDIR, 'keras_model.json')
    with open(model_path, 'w') as json_file:
        json_file.write(model_json)

    # Save the trained weights to HDF5
    print('Save keras weights ...')
    weights_path = os.path.join(MODELDIR, 'keras_weights.h5')
    keras_model.save_weights(weights_path)

    # ==========  NN regressor  ==========
    print('\nLoad regression dataset ...')

    # Load data
    data = pd.read_csv(DATAPATH_REGRESSION, sep='\t')
    print('data.shape', data.shape)
    print(data.iloc[:3, :5])

    ydata = data.iloc[:, 0].values
    xdata = data.iloc[:, 1:].values

    # Scale data
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)

    # Split data
    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=0.2,
                                          random_state=SEED,
                                          shuffle=True)

    print('\nTrain NN Regressor ...')
    keras_model = create_nn_regressor(n_features=xtr.shape[1])
    history = keras_model.fit(xtr,
                              ytr,
                              epochs=epochs,
                              batch_size=batch,
                              verbose=0)
    score = keras_model.evaluate(xvl, yvl,
                                 verbose=False)[-1]  # compute the val loss
    print('Prediction score (val loss): {:.4f}'.format(score))

    # Create dir to save the model
    MODELDIR = os.path.join(file_path, 'keras_model_regressor')
    utils.make_dir(MODELDIR)

    # Save the model architecture to JSON
    print('\nSave keras model (regressor) ...')
    model_json = keras_model.to_json()
    model_path = os.path.join(MODELDIR, 'keras_model.json')
    with open(model_path, 'w') as json_file:
        json_file.write(model_json)

    # Save the trained weights to HDF5
    print('Save keras weights ...')
    weights_path = os.path.join(MODELDIR, 'keras_weights.h5')
    keras_model.save_weights(weights_path)
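
# `create_nn_classifier` and `create_nn_regressor` are defined elsewhere in the
# project. Minimal sketches consistent with how they are called above follow;
# the layer sizes, dropout, and optimizer are assumptions, not the actual
# architectures.
from keras.models import Sequential
from keras.layers import Dense, Dropout

def create_nn_classifier_sketch(n_features, n_classes):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(n_features,)),
        Dropout(0.2),
        Dense(n_classes, activation='softmax'),
    ])
    # categorical_crossentropy matches the to_categorical targets used above
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

def create_nn_regressor_sketch(n_features):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(n_features,)),
        Dense(1),  # single continuous output
    ])
    model.compile(optimizer='adam', loss='mse')
    return model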