示例#1
0
def main(argv):
    """Evaluate the heuristic sleep/wake labels against the annotated labels.

    Args:
        argv: Positional arguments ``[infile, outdir]``. ``infile`` is a CSV
            with at least the columns ``label``, ``heuristic``, ``user``,
            ``timestamp`` and ``filename``; ``outdir`` receives
            ``heuristic_classification.csv``.

    Raises:
        ValueError: If, after filtering, a ``label`` or ``heuristic`` value is
            neither ``'Wake'`` nor ``'Sleep'`` (raised by ``list.index``).
    """
    infile, outdir = argv[0], argv[1]

    sleep_states = ['Wake', 'Sleep']
    excluded_labels = ['Wake_ext', 'Nonwear']
    sleep_stage_labels = ['NREM 1', 'NREM 2', 'NREM 3', 'REM']

    # Drop epochs that are not part of the Wake-vs-Sleep comparison.
    df = pd.read_csv(infile)
    keep_mask = ~df['label'].isin(excluded_labels)
    df = df[keep_mask].reset_index()

    # Collapse every sleep stage into the single 'Sleep' class.
    df['binary_label'] = df['label']
    df.loc[df['label'].isin(sleep_stage_labels), 'binary_label'] = 'Sleep'

    # Encode both label columns as indices into sleep_states.
    y_true = np.array([sleep_states.index(name) for name in df['binary_label']])
    y_pred = np.array([sleep_states.index(name) for name in df['heuristic']])

    # The heuristic yields hard decisions; express them as one-hot
    # "probabilities", the form passed to the reporting helper below.
    n_samples = len(y_pred)
    y_pred_onehot = np.zeros((n_samples, len(sleep_states)))
    y_pred_onehot[np.arange(n_samples), y_pred] = 1

    # A single pseudo-fold holding the whole dataset.
    single_fold = (df['user'], df['timestamp'], df['filename'], y_true,
                   y_pred_onehot)
    predictions = [single_fold]

    report_path = os.path.join(outdir, 'heuristic_classification.csv')
    cv_save_classification_result(predictions, sleep_states, report_path)
示例#2
0
def main(argv):
    """Train and evaluate SMOTE-balanced random forests with nested CV.

    The outer loop is a 5-fold ``GroupKFold`` over users and estimates model
    performance. Inside each outer fold the training data is standardised,
    oversampled with SMOTE, and a ``RandomizedSearchCV`` over 5 stratified
    folds picks the random-forest hyperparameters. For every outer fold the
    fitted ``[scaler, search]`` pair is pickled to ``<outdir>/models``.

    Args:
        argv: Positional arguments ``[infile, mode, dataset, outdir]``.
            ``infile`` is the feature CSV. ``mode`` is ``'binary'``
            (Wake/Sleep), ``'nonwear'`` (Wear/Nonwear) or anything else for
            the five sleep states. ``dataset`` (``argv[2]``) is accepted but
            not used. ``outdir`` receives the CSV reports.
    """
    infile = argv[0]
    mode = argv[1]  # binary or multiclass or nonwear
    outdir = argv[3]

    resultdir = os.path.join(outdir, 'models')
    if not os.path.exists(resultdir):
        os.makedirs(resultdir)

    # Read data file and retain data only corresponding to 5 sleep states or nonwear
    df = pd.read_csv(infile,
                     dtype={
                         'label': object,
                         'user': object,
                         'position': object,
                         'dataset': object
                     })
    if mode == 'binary':
        states = ['Wake', 'Sleep']
        collate_states = ['NREM 1', 'NREM 2', 'NREM 3', 'REM']
        df.loc[df['label'].isin(collate_states), 'label'] = 'Sleep'
    elif mode == 'nonwear':
        states = ['Wear', 'Nonwear']
        collate_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
        df.loc[df['label'].isin(collate_states), 'label'] = 'Wear'
    else:
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']

    # reset_index gives a 0..n-1 index, so the positional indices produced by
    # the CV splitters can also be used to index the Series below.
    df = df[df['label'].isin(states)].reset_index()

    print('... Number of data samples: %d' % len(df))
    ctr = Counter(df['label'])
    for cls in ctr:
        print('%s: %d (%0.2f%%)' % (cls, ctr[cls], ctr[cls] * 100.0 / len(df)))

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_min', 'ENMO_max', 'ENMO_mad',
        'ENMO_entropy1', 'ENMO_entropy2', 'ENMO_prevdiff', 'ENMO_nextdiff',
        'angz_mean', 'angz_std', 'angz_min', 'angz_max', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prevdiff', 'angz_nextdiff',
        'LIDS_mean', 'LIDS_std', 'LIDS_min', 'LIDS_max', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prevdiff', 'LIDS_nextdiff'
    ]

    ######################## Partition the datasets #######################

    # Nested cross-validation - outer CV for estimating model performance
    # Inner CV for estimating model hyperparameters

    # Split data based on users, not on samples, for outer CV
    # Use Stratified CV for inner CV to ensure similar label distribution
    ts = df['timestamp']
    X = df[feat_cols].values
    y = np.array([states.index(i) for i in df['label']])
    groups = df['user']
    fnames = df['filename']

    # Outer CV
    balanced_pred = []
    balanced_imp = []
    outer_cv_splits = 5
    inner_cv_splits = 5
    group_kfold = GroupKFold(n_splits=outer_cv_splits)
    out_fold = 0
    for train_indices, test_indices in group_kfold.split(X, y, groups):
        out_fold += 1
        out_fold_X_train = X[train_indices, :]
        out_fold_X_test = X[test_indices, :]
        out_fold_y_train = y[train_indices]
        out_fold_y_test = y[test_indices]
        out_fold_users_test = groups[test_indices]
        out_fold_ts_test = ts[test_indices]
        out_fold_fnames_test = fnames[test_indices]

        # Inner CV
        strat_kfold = StratifiedKFold(n_splits=inner_cv_splits,
                                      random_state=0,
                                      shuffle=True)

        # An "imbalanced" baseline (StandardScaler + class-weighted random
        # forest in a Pipeline, without resampling) used to run here; it is
        # disabled and only the SMOTE-balanced variant is evaluated.

        ################## Balancing with SMOTE ###################

        # Fit the scaler on the outer training fold only, then apply it to
        # both partitions.
        scaler = StandardScaler()
        scaler.fit(out_fold_X_train)
        out_fold_X_train_sc = scaler.transform(out_fold_X_train)
        out_fold_X_test_sc = scaler.transform(out_fold_X_test)

        # Resample training data
        print('Fold' + str(out_fold) + ' - Balanced: SMOTE')
        # Imblearn - Undersampling techniques ENN and Tomek are too slow and
        # difficult to parallelize
        # So stick only with oversampling techniques
        smote = SMOTE(random_state=0, n_jobs=-1, sampling_strategy='all')
        out_fold_X_train_resamp, out_fold_y_train_resamp = \
            smote.fit_resample(out_fold_X_train_sc, out_fold_y_train)

        # NOTE(review): the inner folds are drawn from the already-resampled
        # data, so inner validation folds contain synthetic samples.
        custom_resamp_cv_indices = list(
            strat_kfold.split(out_fold_X_train_resamp,
                              out_fold_y_train_resamp))

        # Note: imblearn Pipeline is slow and sklearn pipeline yields poor results
        clf = RandomForestClassifier(class_weight='balanced',
                                     max_depth=None,
                                     random_state=0)

        print('Fold' + str(out_fold) + ' - Balanced: Hyperparameter search')
        search_params = {
            'n_estimators': [50, 100, 200, 300, 500],
            'max_depth': [5, 10, None]
        }
        cv_clf = RandomizedSearchCV(estimator=clf,
                                    param_distributions=search_params,
                                    cv=custom_resamp_cv_indices,
                                    scoring='f1_macro',
                                    n_iter=5,
                                    n_jobs=-1,
                                    verbose=2)
        cv_clf.fit(out_fold_X_train_resamp, out_fold_y_train_resamp)

        # Persist the scaler together with the search object; the context
        # manager guarantees the file is flushed and closed.
        model_path = os.path.join(
            resultdir,
            'fold' + str(out_fold) + '_' + mode + '_balanced_RF.sav')
        with open(model_path, 'wb') as model_file:
            pickle.dump([scaler, cv_clf], model_file)

        out_fold_y_test_pred = cv_clf.predict_proba(out_fold_X_test_sc)
        print('Fold' + str(out_fold) + ' - Balanced', cv_clf.best_params_)

        balanced_pred.append(
            (out_fold_users_test, out_fold_ts_test, out_fold_fnames_test,
             out_fold_y_test, out_fold_y_test_pred))
        balanced_imp.append(cv_clf.best_estimator_.feature_importances_)

    print('############## Balanced classification ##############')
    # Save balanced classification reports
    cv_save_feat_importances_result(
        balanced_imp, feat_cols,
        os.path.join(outdir, mode + '_balanced_feat_imp.csv'))
    cv_save_classification_result(
        balanced_pred, states,
        os.path.join(outdir, mode + '_balanced_classification.csv'))
示例#3
0
def main(argv):
    """Train and evaluate the FCN sleep classifier with user-wise 5-fold CV.

    NOTE(review): the ``argv`` parameter is never read. All settings come
    from a name ``args`` that is not defined in this function (presumably a
    module-level argparse namespace - confirm). Attributes read from it:
    ``indir``, ``mode``, ``outdir``, ``lr``, ``num_epochs``, ``batchsize``,
    ``num_channels``, ``feat_channels`` and ``maxnorm``.

    For each fold the function partitions users into train/val/test, fits a
    model, reloads the best checkpoint (by ``val_f1``), predicts on the test
    partition, and writes/prints classification reports.
    """
    indir = args.indir
    mode = args.mode  # binary or multiclass or nonwear
    outdir = args.outdir

    # NOTE(review): there is no else branch, so any other mode leaves
    # `states` undefined and the next statement raises NameError.
    if mode == 'multiclass':
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Wake_ext']
    elif mode == 'binary':
        states = ['Wake', 'Sleep', 'Wake_ext']
        collate_states = ['NREM 1', 'NREM 2', 'NREM 3', 'REM']
    elif mode == 'nonwear':
        states = ['Wear', 'Nonwear']
        collate_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']

    # 'Wake_ext' is kept in `states` for indexing but is not a class the
    # model predicts.
    valid_states = [state for state in states if state != 'Wake_ext']
    num_classes = len(valid_states)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    resultdir = os.path.join(outdir, mode, 'models')
    if not os.path.exists(resultdir):
        os.makedirs(resultdir)

    # Read data from disk
    data = pd.read_csv(os.path.join(indir, 'features_30.0s.csv'))
    labels = data['label'].values
    users = data['user'].values
    # Collapse the fine-grained labels into the coarse class for this mode.
    if mode == 'binary':
        labels = np.array(
            ['Sleep' if lbl in collate_states else lbl for lbl in labels])
    elif mode == 'nonwear':
        labels = np.array(
            ['Wear' if lbl in collate_states else lbl for lbl in labels])

    # Read raw data
    # The memmap shape is taken from the first row of the datashape CSV.
    shape_df = pd.read_csv(os.path.join(indir, 'datashape_30.0s.csv'))
    num_samples = shape_df['num_samples'].values[0]
    seqlen = shape_df['num_timesteps'].values[0]
    n_channels = shape_df['num_channels'].values[0]
    raw_data = np.memmap(os.path.join(indir, 'rawdata_30.0s.npz'),
                         dtype='float32',
                         mode='r',
                         shape=(num_samples, seqlen, n_channels))

    # Hyperparameters
    lr = args.lr  # learning rate
    num_epochs = args.num_epochs
    batch_size = args.batchsize
    max_seqlen = 1504
    num_channels = args.num_channels  # number of raw data channels
    feat_channels = args.feat_channels  # Add ENMO, z-angle and LIDS as additional channels

    # Use nested cross-validation based on users
    # Outer CV
    # NOTE(review): unique_users is shuffled but not referenced again in this
    # function; the fold assignment below iterates user_cnt instead.
    unique_users = list(set(users))
    random.shuffle(unique_users)
    cv_splits = 5
    # (user, count) pairs over rows with a valid label, most samples first.
    user_cnt = Counter(users[np.isin(labels, valid_states)]).most_common()
    # Target fold size; note it is derived from ALL rows, not only the rows
    # with a valid label that user_cnt counts.
    samp_per_fold = len(users) // cv_splits

    # Get users to be used in test for each fold such that each fold has similar
    # number of samples
    # Greedy assignment: each user goes to the fold with the largest remaining
    # deficit. If no fold has a positive deficit, idx stays -1 and the user is
    # appended to the last fold.
    fold_users = [[] for i in range(cv_splits)]
    fold_cnt = [[] for i in range(cv_splits)]
    for user, cnt in user_cnt:
        idx = -1
        maxdiff = 0
        for j in range(cv_splits):
            if (samp_per_fold - sum(fold_cnt[j])) > maxdiff:
                maxdiff = samp_per_fold - sum(fold_cnt[j])
                idx = j
        fold_users[idx].append(user)
        fold_cnt[idx].append(cnt)

    predictions = []
    # 'Wake'/'Wake_ext' only exist in the multiclass and binary state lists.
    if mode != 'nonwear':
        wake_idx = states.index('Wake')
        wake_ext_idx = states.index('Wake_ext')
    for fold in range(cv_splits):
        print('Evaluating fold %d' % (fold + 1))
        test_users = fold_users[fold]
        trainval_users = [(key, val) for key, val in user_cnt
                          if key not in test_users]
        random.shuffle(trainval_users)
        # validation data is approximately 10% of total samples
        val_samp = 0.1 * sum([tup[1] for tup in user_cnt])
        nval = 0
        val_sum = 0
        # Take shuffled users until their sample count reaches the target.
        # NOTE(review): no bounds check - raises IndexError if the non-test
        # users together hold fewer than val_samp samples.
        while (val_sum < val_samp):
            val_sum += trainval_users[nval][1]
            nval += 1
        val_users = [key for key, val in trainval_users[:nval]]
        train_users = [key for key, val in trainval_users[nval:]]
        print('#users: Train = {:d}, Val = {:d}, Test = {:d}'.format(
            len(train_users), len(val_users), len(test_users)))

        # Create partitions
        # make a copy to change wake_ext for this fold
        # Labels become indices into `states`; anything else is marked -1.
        fold_labels = np.array(
            [states.index(lbl) if lbl in states else -1 for lbl in labels])
        train_indices = get_partition(raw_data,
                                      fold_labels,
                                      users,
                                      train_users,
                                      states,
                                      mode,
                                      is_train=True)
        val_indices = get_partition(raw_data, fold_labels, users, val_users,
                                    states, mode)
        test_indices = get_partition(raw_data, fold_labels, users, test_users,
                                     states, mode)
        nsamples = len(train_indices) + len(val_indices) + len(test_indices)
        print('Train: {:0.2f}%, Val: {:0.2f}%, Test: {:0.2f}%'\
                .format(len(train_indices)*100.0/nsamples, len(val_indices)*100.0/nsamples,\
                        len(test_indices)*100.0/nsamples))

        # Class weights are computed on training samples excluding Wake_ext.
        # NOTE(review): class_wts is not referenced again in this function.
        if mode != 'nonwear':
            chosen_indices = train_indices[
                fold_labels[train_indices] != wake_ext_idx]
        else:
            chosen_indices = train_indices
        class_wts = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(fold_labels[chosen_indices]),
            y=fold_labels[chosen_indices])

        # Rename wake_ext as wake for training samples
        # Only training rows are relabelled; val/test rows keep their label.
        if mode != 'nonwear':
            rename_indices = train_indices[fold_labels[train_indices] ==
                                           wake_ext_idx]
            fold_labels[rename_indices] = wake_idx

        print('Train', Counter(np.array(fold_labels)[train_indices]))
        print('Val', Counter(np.array(fold_labels)[val_indices]))
        print('Test', Counter(np.array(fold_labels)[test_indices]))

        # Data generators for computing statistics
        stat_gen = DataGenerator(train_indices, raw_data, fold_labels, valid_states, partition='stat',\
                                  batch_size=batch_size, seqlen=seqlen, n_channels=num_channels, feat_channels=feat_channels,\
                                  n_classes=num_classes, shuffle=True)
        # Training-set statistics are saved per fold and passed to the
        # train/val/test generators below.
        mean, std = stat_gen.fit()
        np.savez(os.path.join(resultdir, 'Fold' + str(fold + 1) + '_stats'),
                 mean=mean,
                 std=std)

        # Data generators for train/val/test
        # Only the training generator shuffles, augments and balances.
        train_gen = DataGenerator(train_indices, raw_data, fold_labels, valid_states, partition='train',\
                                  batch_size=batch_size, seqlen=seqlen, n_channels=num_channels, feat_channels=feat_channels,\
                                  n_classes=num_classes, shuffle=True, augment=True, aug_factor=0.75, balance=True,
                                  mean=mean, std=std)
        val_gen = DataGenerator(val_indices, raw_data, fold_labels, valid_states, partition='val',\
                                batch_size=batch_size, seqlen=seqlen, n_channels=num_channels, feat_channels=feat_channels,\
                                n_classes=num_classes, mean=mean, std=std)
        test_gen = DataGenerator(test_indices, raw_data, fold_labels, valid_states, partition='test',\
                                 batch_size=batch_size, seqlen=seqlen, n_channels=num_channels, feat_channels=feat_channels,\
                                 n_classes=num_classes, mean=mean, std=std)

        # Create model
        # Use batchnorm as first step since computing mean and std
        # across entire dataset is time-consuming
        model = FCN(input_shape=(seqlen, num_channels + feat_channels),
                    max_seqlen=max_seqlen,
                    num_classes=len(valid_states),
                    norm_max=args.maxnorm)
        #print(model.summary())
        model.compile(optimizer=Adam(lr=lr),
                      loss=focal_loss(),
                      metrics=['accuracy', macro_f1])

        # Train model
        # Use callback to compute F-scores over entire validation data
        metrics_cb = Metrics(val_data=val_gen, batch_size=batch_size)
        # Use early stopping and model checkpoints to handle overfitting and save best model
        # NOTE(review): only a checkpoint callback is configured here; no
        # early-stopping callback is passed to fit() below.
        model_checkpt = ModelCheckpoint(os.path.join(resultdir,'fold'+str(fold+1)+'_'+mode+'-{epoch:02d}-{val_f1:.4f}.h5'),\
                                                     monitor='val_f1',\
                                                     mode='max', save_best_only=True)
        batch_renorm_cb = BatchRenormScheduler(len(train_gen))
        history = model.fit(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=1,
            shuffle=False,
            callbacks=[batch_renorm_cb, metrics_cb, model_checkpt],
            workers=2,
            max_queue_size=20,
            use_multiprocessing=False)

        # Plot training history
        # The validation macro-F1 curve comes from the Metrics callback, not
        # from the Keras history.
        plot_results(fold+1, history.history['loss'], history.history['val_loss'],\
                     os.path.join(resultdir,'Fold'+str(fold+1)+'_'+mode+'_loss.jpg'), metric='Loss')
        plot_results(fold+1, history.history['accuracy'], history.history['val_accuracy'],\
                     os.path.join(resultdir,'Fold'+str(fold+1)+'_'+mode+'_accuracy.jpg'), metric='Accuracy')
        plot_results(fold+1, history.history['macro_f1'], metrics_cb.val_f1,\
                     os.path.join(resultdir,'Fold'+str(fold+1)+'_'+mode+'_macro_f1.jpg'), metric='Macro F1')

        # Predict probabilities on the test data using the best saved model
        best_model_file, epoch, val_f1 = get_best_model(resultdir, fold + 1)
        print('Predicting with model saved at Epoch={:d} with val_f1={:0.4f}'.
              format(epoch, val_f1))
        model.load_weights(os.path.join(resultdir, best_model_file))
        probs = model.predict(test_gen)
        # NOTE(review): y_pred is not referenced again in this function.
        y_pred = probs.argmax(axis=1)
        y_true = fold_labels[test_indices]
        # Each entry: (users, timestamps, filenames, indices, y_true, probs).
        predictions.append(
            (users[test_indices], data.iloc[test_indices]['timestamp'],
             data.iloc[test_indices]['filename'], test_indices, y_true, probs))

        # Save user report
        # `predictions` holds every fold processed so far, so this per-fold
        # file and report cover folds 1..fold+1, not only the current fold.
        cv_save_classification_result(
            predictions,
            valid_states,
            os.path.join(
                resultdir, 'fold' + str(fold + 1) + '_deeplearning_' + mode +
                '_results.csv'),
            method='dl')
        cv_get_classification_report(predictions,
                                     mode,
                                     valid_states,
                                     method='dl')

    # Final report over all folds.
    cv_get_classification_report(predictions, mode, valid_states, method='dl')

    # Save user report
    cv_save_classification_result(predictions,
                                  valid_states,
                                  os.path.join(
                                      resultdir,
                                      'deeplearning_' + mode + '_results.csv'),
                                  method='dl')
示例#4
0
def main(args):
    """Evaluate random-forest sleep classifiers on a separate test dataset.

    Two test modes are supported via ``args.testmode``:

    * ``'pretrain'``: every file in ``args.modeldir`` is unpickled as a
      ``(scaler, search)`` pair and applied to the test data as-is.
    * anything else ("finetune"): for each of 5 user-grouped folds of the
      training data, a scaler is fit, the fold is balanced with SMOTE, and a
      ``RandomizedSearchCV`` with a single train->validation split chooses
      the hyperparameters; the fitted search object is pickled and applied
      to the test data.

    Args:
        args: Namespace providing ``outdir``, ``mode``, ``test``,
            ``testmode``, ``modeldir`` (pretrain only) and ``train``/``val``
            (finetune only).
    """
    model_root = os.path.join(args.outdir, 'models')
    if not os.path.exists(model_root):
        os.makedirs(model_root)

    if args.mode == 'binary':
        sleep_states = ['Wake', 'Sleep', 'Nonwear']
    else:
        sleep_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_min', 'ENMO_max', 'ENMO_mad',
        'ENMO_entropy1', 'ENMO_entropy2', 'ENMO_prevdiff', 'ENMO_nextdiff',
        'angz_mean', 'angz_std', 'angz_min', 'angz_max', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prevdiff', 'angz_nextdiff',
        'LIDS_mean', 'LIDS_std', 'LIDS_min', 'LIDS_max', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prevdiff', 'LIDS_nextdiff'
    ]

    X_test, y_test, users_test, ts_test, fnames_test = get_data(args.test,
                                                                feat_cols,
                                                                sleep_states,
                                                                mode=args.mode)
    predictions = []
    feat_imp = []
    if args.testmode == 'pretrain':
        # 'pretrained' - use pretrained models on test data
        for fname in os.listdir(args.modeldir):
            # SECURITY: unpickling executes arbitrary code - args.modeldir
            # must only contain trusted model files.
            with open(os.path.join(args.modeldir, fname), 'rb') as model_file:
                scaler, clf = pickle.load(model_file)
            X_test_sc = scaler.transform(X_test)
            y_pred = clf.predict_proba(X_test_sc)
            predictions.append(
                (users_test, ts_test, fnames_test, y_test, y_pred))
            feat_imp.append(clf.best_estimator_.feature_importances_)
    else:
        # NOTE(review): 'tranfer' looks like a typo for 'transfer'; kept
        # because it is part of the on-disk output path.
        resultdir = os.path.join(args.outdir, 'models', 'tranfer', args.mode)
        if not os.path.exists(resultdir):
            os.makedirs(resultdir)
        # 'finetune'- use models tuned using validation data from same distribution as test data
        X_train, y_train, users_train, _, _ = get_data(args.train,
                                                       feat_cols,
                                                       sleep_states,
                                                       mode=args.mode)
        X_val, y_val, _, _, _ = get_data(args.val,
                                         feat_cols,
                                         sleep_states,
                                         mode=args.mode)

        outer_cv_splits = 5
        group_kfold = GroupKFold(n_splits=outer_cv_splits)
        fold = 0
        # Only the training side of each split is used; the held-out part of
        # the training data is ignored because evaluation is on X_test.
        for train_indices, _ in group_kfold.split(X_train, y_train,
                                                  users_train):
            fold += 1
            fold_X_train = X_train[train_indices, :]
            fold_y_train = y_train[train_indices]

            # Scale features
            scaler = StandardScaler()
            scaler.fit(fold_X_train)
            X_train_sc = scaler.transform(fold_X_train)
            X_val_sc = scaler.transform(X_val)
            X_test_sc = scaler.transform(X_test)

            # Balance training samples using SMOTE
            smote = SMOTE(random_state=0, n_jobs=-1, sampling_strategy='all')
            X_train_resamp, y_train_resamp = smote.fit_resample(
                X_train_sc, fold_y_train)
            X_concat = np.concatenate((X_train_resamp, X_val_sc), axis=0)
            y_concat = np.concatenate((y_train_resamp, y_val), axis=0)

            # Get suitable parameters using validation data
            clf = RandomForestClassifier(class_weight='balanced',
                                         max_depth=None,
                                         random_state=0)
            search_params = {
                'n_estimators': [50, 100, 200, 300, 500],
                'max_depth': [5, 10, None]
            }
            # A single predefined split: fit on the resampled training rows,
            # score on the validation rows appended after them.
            n_train = X_train_resamp.shape[0]
            cv_indices = [(np.arange(n_train),
                           np.arange(n_train, X_concat.shape[0]))]
            cv_clf = RandomizedSearchCV(estimator=clf,
                                        param_distributions=search_params,
                                        cv=cv_indices,
                                        scoring='f1_macro',
                                        n_iter=5,
                                        n_jobs=-1,
                                        verbose=2)
            cv_clf.fit(X_concat, y_concat)

            # NOTE(review): only the search object is saved here, whereas the
            # 'pretrain' branch above expects a (scaler, model) pair - these
            # files cannot be loaded by that branch.
            model_path = os.path.join(
                resultdir,
                'fold_' + str(fold) + '_' + args.testmode + '_RF.sav')
            with open(model_path, 'wb') as model_file:
                pickle.dump(cv_clf, model_file)
            y_pred = cv_clf.predict_proba(X_test_sc)

            predictions.append(
                (users_test, ts_test, fnames_test, y_test, y_pred))
            feat_imp.append(cv_clf.best_estimator_.feature_importances_)

    cv_save_feat_importances_result(
        feat_imp, feat_cols,
        os.path.join(
            args.outdir,
            'transfer_' + args.mode + '_' + args.testmode + '_feat_imp.csv'))
    cv_save_classification_result(
        predictions, sleep_states,
        os.path.join(
            args.outdir, 'transfer_' + args.mode + '_' + args.testmode +
            '_classification.csv'))
def main(argv):
  """Nested-CV random forest with per-user SMOTE balancing.

  The outer loop is a 5-fold GroupKFold over users. Within each outer fold
  the training data is standardised, SMOTE is applied separately to every
  training user, and a RandomizedSearchCV over 5 user-grouped inner folds
  selects the random-forest hyperparameters. In 'multiclass' mode the labels
  are one-hot encoded, so the forest is fit as one binary problem per state
  and the positive-class probabilities are stacked into one matrix.

  Args:
    argv: Positional arguments [infile, mode, dataset, outdir]. `mode` is
      'binary', 'nonwear' or 'multiclass'; argv[2] (dataset) is accepted but
      not used. NOTE(review): any other mode string keeps the five sleep
      states but takes the single-output code path - confirm that is intended.

  Raises:
    ValueError: If no training user of a fold could be resampled.
  """
  infile = argv[0]
  mode = argv[1] # binary or multiclass or nonwear
  outdir = argv[3]

  resultdir = os.path.join(outdir,'models')
  if not os.path.exists(resultdir):
    os.makedirs(resultdir)

  # Read data file and retain data only corresponding to 5 sleep states or nonwear
  df = pd.read_csv(infile, dtype={'label':object, 'user':object,
                   'position':object, 'dataset':object})
  if mode == 'binary':
    states = ['Wake', 'Sleep']
    collate_states = ['NREM 1', 'NREM 2', 'NREM 3', 'REM']
    df.loc[df['label'].isin(collate_states), 'label'] = 'Sleep'
  elif mode == 'nonwear':
    states = ['Wear', 'Nonwear']
    collate_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    df.loc[df['label'].isin(collate_states), 'label'] = 'Wear'
  else:
    states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']

  df = df[df['label'].isin(states)].reset_index()

  print('... Number of data samples: %d' % len(df))
  ctr = Counter(df['label'])
  for cls in ctr:
    print('%s: %d (%0.2f%%)' % (cls,ctr[cls],ctr[cls]*100.0/len(df)))

  feat_cols = ['ENMO_mean','ENMO_std','ENMO_range','ENMO_mad',
               'ENMO_entropy1','ENMO_entropy2', 'ENMO_prev30diff', 'ENMO_next30diff',
               'ENMO_prev60diff', 'ENMO_next60diff', 'ENMO_prev120diff', 'ENMO_next120diff',
               'angz_mean','angz_std','angz_range','angz_mad',
               'angz_entropy1','angz_entropy2', 'angz_prev30diff', 'angz_next30diff',
               'angz_prev60diff', 'angz_next60diff', 'angz_prev120diff', 'angz_next120diff',
               'LIDS_mean','LIDS_std','LIDS_range','LIDS_mad',
               'LIDS_entropy1','LIDS_entropy2', 'LIDS_prev30diff', 'LIDS_next30diff',
               'LIDS_prev60diff', 'LIDS_next60diff', 'LIDS_prev120diff', 'LIDS_next120diff']

  ######################## Partition the datasets #######################

  # Nested cross-validation - outer CV for estimating model performance
  # Inner CV for estimating model hyperparameters

  # Split data based on users, not on samples, for both outer and inner CV
  ts = df['timestamp']
  X = df[feat_cols].values
  y = np.array([states.index(i) for i in df['label']])
  groups = df['user']
  fnames = df['filename']

  encoder = OneHotEncoder()
  scorer = make_scorer(average_precision_score, average='macro')

  # Outer CV
  balanced_pred = []; balanced_imp = []
  outer_cv_splits = 5; inner_cv_splits = 5
  outer_group_kfold = GroupKFold(n_splits=outer_cv_splits)
  out_fold = 0
  for train_indices, test_indices in outer_group_kfold.split(X,y,groups):
    out_fold += 1
    out_fold_X_train = X[train_indices,:]; out_fold_X_test = X[test_indices,:]
    out_fold_y_train = y[train_indices]; out_fold_y_test = y[test_indices]
    out_fold_users_train = groups[train_indices]; out_fold_users_test = groups[test_indices]
    out_fold_ts_test = ts[test_indices]
    out_fold_fnames_test = fnames[test_indices]

    # Class weights from the un-resampled training labels. Keyword arguments
    # are required by current scikit-learn, and weights are keyed by the
    # actual class value so a class missing from the fold cannot shift them.
    if mode != 'multiclass':
      present = np.unique(out_fold_y_train)
      weights = compute_class_weight(class_weight='balanced', classes=present,
                                     y=out_fold_y_train)
      class_wt = {int(c): w for c, w in zip(present, weights)}
    else:
      # One {0: w_neg, 1: w_pos} dict per state for the multi-output forest.
      class_wt = []
      for cls in range(len(states)):
        class_train = (out_fold_y_train == cls).astype(np.int32)
        present = np.unique(class_train)
        weights = compute_class_weight(class_weight='balanced', classes=present,
                                       y=class_train)
        class_wt.append({int(c): w for c, w in zip(present, weights)})

    # Inner CV
    ################## Balancing with SMOTE ###################
    scaler = StandardScaler()
    scaler.fit(out_fold_X_train)
    out_fold_X_train_sc = scaler.transform(out_fold_X_train)
    out_fold_X_test_sc = scaler.transform(out_fold_X_test)

    # Imblearn - Undersampling techniques ENN and Tomek are too slow and
    # difficult to parallelize
    # So stick only with oversampling techniques
    print('Fold'+str(out_fold)+' - Balanced: SMOTE')
    smote = SMOTE(random_state=0, n_jobs=-1, sampling_strategy='all')
    # Resample training data for each user
    train_users = list(set(out_fold_users_train))
    resamp_X = []; resamp_y = []; resamp_users = []
    for i,user in enumerate(train_users):
      user_mask = np.asarray(out_fold_users_train == user)
      user_X = out_fold_X_train_sc[user_mask]
      user_y = out_fold_y_train[user_mask]
      if len(set(user_y)) == 1:
        print('%d/%d: %s has only one class' % (i+1,len(train_users),user))
        print(Counter(user_y))
        continue
      try:
        user_X_resamp, user_y_resamp = smote.fit_resample(user_X, user_y)
      except Exception:
        # Best effort: skip users SMOTE cannot handle (e.g. too few samples
        # of a class) without swallowing KeyboardInterrupt/SystemExit.
        print('%d/%d: %s failed to fit' % (i+1,len(train_users),user))
        print(Counter(user_y))
        continue
      resamp_X.append(user_X_resamp)
      resamp_y.append(user_y_resamp)
      resamp_users.append(np.array([user] * len(user_X_resamp)))

    if not resamp_X:
      raise ValueError('Fold%d: no training user could be resampled with SMOTE'
                       % out_fold)
    # Concatenate once instead of growing the arrays inside the loop.
    out_fold_X_train_resamp = np.concatenate(resamp_X, axis=0)
    out_fold_y_train_resamp = np.concatenate(resamp_y)
    out_fold_users_train_resamp = np.concatenate(resamp_users)

    # Shuffle resampled data
    resamp_indices = np.arange(len(out_fold_X_train_resamp))
    np.random.shuffle(resamp_indices)
    out_fold_X_train_resamp = out_fold_X_train_resamp[resamp_indices]
    out_fold_y_train_resamp = out_fold_y_train_resamp[resamp_indices]
    out_fold_users_train_resamp = out_fold_users_train_resamp[resamp_indices]

    # Inner folds are grouped by user as well.
    inner_group_kfold = GroupKFold(n_splits=inner_cv_splits)
    custom_resamp_cv_indices = list(
        inner_group_kfold.split(out_fold_X_train_resamp, out_fold_y_train_resamp,
                                out_fold_users_train_resamp))

    # Note: imblearn Pipeline is slow and sklearn pipeline yields poor results
    clf = RandomForestClassifier(class_weight=class_wt,
                             max_depth=None, random_state=0)

    print('Fold'+str(out_fold)+' - Balanced: Hyperparameter search')
    search_params = {'n_estimators':[100,150,200,300,400,500],
                 'max_depth': [5,10,15,20,None]}
    cv_clf = RandomizedSearchCV(estimator=clf, param_distributions=search_params,
                            cv=custom_resamp_cv_indices, scoring=scorer,
                            n_iter=10, n_jobs=-1, verbose=2)
    if mode == 'multiclass':
      # toarray() returns a plain ndarray; todense() would return np.matrix,
      # which recent scikit-learn releases reject.
      out_fold_y_train_resamp = encoder.fit_transform(
          out_fold_y_train_resamp.reshape(-1,1)).toarray()
    cv_clf.fit(out_fold_X_train_resamp, out_fold_y_train_resamp)
    print(cv_clf.best_estimator_)
    joblib.dump([scaler,cv_clf], os.path.join(resultdir,\
                'fold'+str(out_fold)+'_'+ mode + '_balanced_RF.sav'))
    out_fold_y_test_pred = cv_clf.predict_proba(out_fold_X_test_sc)
    if mode == 'multiclass': # collect probabilities from each binary classification
      # predict_proba returns one array per state; keep the positive column.
      out_fold_y_test_pred = np.hstack(
          [cls_prob[:,1].reshape(-1,1) for cls_prob in out_fold_y_test_pred])

    print('Fold'+str(out_fold)+' - Balanced', cv_clf.best_params_)

    balanced_pred.append((out_fold_users_test, out_fold_ts_test, out_fold_fnames_test,
                          out_fold_y_test, out_fold_y_test_pred))
    balanced_imp.append(cv_clf.best_estimator_.feature_importances_)

  print('############## Balanced classification ##############')
  # Save balanced classification reports
  cv_save_feat_importances_result(balanced_imp, feat_cols,
                   os.path.join(outdir, mode + '_balanced_feat_imp.csv'))
  cv_save_classification_result(balanced_pred, states,
                   os.path.join(outdir, mode + '_balanced_classification.csv'))
示例#6
0
def main(argv):
    """Fine-tune a pretrained ResNet with nested, user-grouped cross-validation.

    For every outer fold (GroupKFold over users) the function runs a
    Hyperband hyperparameter search on the training portion, rebuilds the best
    model, re-loads the pretrained weights into its "model" layer, trains it
    with a held-out 10% of the training users as validation data, and predicts
    on the outer test fold. Per-fold and overall results are written under
    ``<outdir>/<mode>/lr-..._batchsize-...``.

    Args:
        argv: Not used by this function.
            NOTE(review): every setting is read from a module-level ``args``
            object (presumably an argparse namespace) - confirm it is defined
            before ``main`` is called.

    Raises:
        ValueError: If ``args.mode`` is not one of 'multiclass', 'binary' or
            'nonwear', or if the pretrained model has no layer named "model".
    """
    indir = args.indir
    mode = args.mode  # binary or multiclass or nonwear
    modeldir = args.modeldir
    outdir = args.outdir

    if mode == 'multiclass':
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    elif mode == 'binary':
        states = ['Wake', 'Sleep']
        collate_states = ['NREM 1', 'NREM 2', 'NREM 3', 'REM']
    elif mode == 'nonwear':
        states = ['Wear', 'Nonwear']
        collate_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    else:
        # Previously an unknown mode surfaced later as a NameError on `states`.
        raise ValueError(
            "mode must be 'multiclass', 'binary' or 'nonwear', got %r" % (mode,))

    num_classes = len(states)

    os.makedirs(outdir, exist_ok=True)
    # This directory is not written to below; it is still created so the
    # on-disk layout stays the same as before.
    os.makedirs(os.path.join(outdir, mode, 'models'), exist_ok=True)

    # Load pretrained model and pull out the weights of its "model" layer
    pretrained_model = load_model(os.path.join(modeldir,
                                               'pretrained_model.h5'))
    pretrained_resnet_weights = None
    for layer in pretrained_model.layers:
        if layer.name == "model":
            pretrained_resnet_weights = layer.get_weights()
    if pretrained_resnet_weights is None:
        raise ValueError(
            'pretrained_model.h5 has no layer named "model" to take weights from')

    # Hyperparameters
    num_epochs = args.num_epochs
    num_channels = args.num_channels  # number of raw data channels
    feat_channels = args.feat_channels  # Add ENMO, z-angle and LIDS as additional channels
    batchsize = args.batchsize  # Batch size
    # NOTE(review): hp_epochs is read but never used; the Hyperband oracle
    # below hard-codes max_epochs=3 - confirm which one is intended.
    hp_epochs = args.hp_epochs  # No. of hyperparameter validation epochs
    lr = args.lr  # Learning rate

    # NOTE(review): '{:4f}' is a field width of 4, not a precision of 4
    # ('{:.4f}'); left untouched because it determines the output path.
    resultdir = os.path.join(outdir, mode,
                             'lr-{:4f}_batchsize-{:d}'.format(lr, batchsize))
    os.makedirs(resultdir, exist_ok=True)

    model_hyperparam = {}
    model_hyperparam['maxnorm'] = [0.5, 1.0, 2.0, 3.0]
    model_hyperparam['dense_units'] = {'min': 100, 'max': 700, 'step': 50}
    model_hyperparam['dropout'] = [0.1, 0.2, 0.3]
    model_hyperparam['lr'] = [1e-3, 1e-4]

    # Read data from disk
    data = pd.read_csv(os.path.join(indir, 'all_train_features_30.0s.csv'))
    ts = data['timestamp']
    fnames = data['filename']
    labels = data['label']
    users = data['user'].astype(str).values
    if mode == 'binary':
        labels = np.array(
            ['Sleep' if lbl in collate_states else lbl for lbl in labels])
    elif mode == 'nonwear':
        labels = np.array(
            ['Wear' if lbl in collate_states else lbl for lbl in labels])

    # Get valid values for CV split. Comparing through an object array keeps
    # the comparison elementwise even though the labels are strings and the
    # sentinel is an int.
    valid_mask = np.asarray(labels, dtype=object) != -1
    valid_indices = data[valid_mask].index.values

    # dummy values for partition as raw data cannot be loaded to memory
    X = data[['ENMO_mean', 'ENMO_std', 'ENMO_mad']].values[valid_indices]
    y = labels[valid_indices]
    groups = users[valid_indices]

    # Read raw data; the handle is kept under its own name so that later
    # `with open(...)` blocks cannot rebind it, and it is closed at the end.
    raw_fp = h5py.File(os.path.join(indir, 'all_train_rawdata_30.0s.h5'), 'r')
    raw_data = raw_fp['data']
    _, seqlen, _ = raw_data.shape

    # Read raw data statistics
    stats = np.load(os.path.join(modeldir, "stats.npz"))
    mean = stats['mean']
    std = stats['std']

    # Use nested cross-validation based on users
    # Outer CV
    outer_cv_splits = 5
    inner_cv_splits = 5
    out_fold = 0
    predictions = []
    outer_group_kfold = GroupKFold(n_splits=outer_cv_splits)
    for train_indices, test_indices in outer_group_kfold.split(X, y, groups):
        # Remove the tuner's working directory left over from the last fold
        if os.path.exists('untitled_project'):
            shutil.rmtree('untitled_project')
        out_fold += 1
        print('Evaluating outer fold %d' % (out_fold))
        # train_indices/test_indices are positions within the valid subset;
        # map them back to row indices of `data`.
        out_fold_train_indices = valid_indices[train_indices]
        out_fold_test_indices = valid_indices[test_indices]
        out_fold_y_test = labels[out_fold_test_indices]
        out_fold_users_train = users[out_fold_train_indices]
        out_fold_users_test = users[out_fold_test_indices]
        out_fold_ts_test = ts[out_fold_test_indices]
        out_fold_fnames_test = fnames[out_fold_test_indices]

        # Build a hypermodel for hyperparam search
        hyperModel = ResnetHyperModel(hyperparam=model_hyperparam, seqlen=seqlen,
                                      channels=num_channels + feat_channels,
                                      pretrained_wts=pretrained_resnet_weights,
                                      num_classes=num_classes)
        tuner = CVTuner(hypermodel=hyperModel,
                        oracle=kerastuner.oracles.Hyperband(
                            objective='val_loss', max_epochs=3),
                        cv=inner_cv_splits,
                        states=states,
                        num_classes=num_classes,
                        seqlen=seqlen,
                        num_channels=num_channels,
                        feat_channels=feat_channels,
                        mean=mean,
                        std=std)
        tuner.search(data=raw_data,
                     labels=labels,
                     users=users,
                     indices=out_fold_train_indices,
                     batch_size=batchsize)

        # Train fold with best hyperparameters
        best_hp = tuner.get_best_hyperparameters()[0]
        hp_report_path = os.path.join(
            resultdir, 'fold' + str(out_fold) + '_hyperparameters.txt')
        with open(hp_report_path, "w") as hp_file:
            hp_file.write("Maxnorm = {:.2f}\n".format(
                best_hp.values['maxnorm']))
            hp_file.write("Learning rate = {:.4f}\n".format(
                best_hp.values['lr']))
            hp_file.write("Preclassification layer units = {:d}\n".format(
                best_hp.values['preclassification']))
            hp_file.write("Dropout = {:.2f}\n".format(
                best_hp.values['dropout']))
        print(best_hp.values)
        model = tuner.hypermodel.build(best_hp)
        for layer in model.layers:
            if layer.name == "model":
                layer.set_weights(pretrained_resnet_weights)
        print(model.summary())

        # Data generators
        # Split train data into train and validation data based on users
        trainval_users = list(set(out_fold_users_train))
        random.shuffle(trainval_users)
        ntrainval_users = len(trainval_users)
        nval_users = int(0.1 * ntrainval_users)
        ntrain_users = ntrainval_users - nval_users
        train_users = np.array(trainval_users[:ntrain_users])
        val_users = np.array(trainval_users[ntrain_users:])
        # Compute the validation rows first: the second statement overwrites
        # out_fold_train_indices, which both masks are aligned with.
        out_fold_val_indices = out_fold_train_indices[np.isin(
            out_fold_users_train, val_users)]
        out_fold_train_indices = out_fold_train_indices[np.isin(
            out_fold_users_train, train_users)]

        train_gen = DataGenerator(out_fold_train_indices, raw_data, labels, states, partition='train',
                                  batch_size=batchsize, seqlen=seqlen, n_channels=num_channels,
                                  feat_channels=feat_channels, n_classes=num_classes, shuffle=True,
                                  augment=True, aug_factor=0.75, balance=True,
                                  mean=mean, std=std)
        val_gen = DataGenerator(out_fold_val_indices, raw_data, labels, states, partition='val',
                                batch_size=batchsize, seqlen=seqlen, n_channels=num_channels,
                                feat_channels=feat_channels, n_classes=num_classes,
                                mean=mean, std=std)
        test_gen = DataGenerator(out_fold_test_indices, raw_data, labels, states, partition='test',
                                 batch_size=batchsize, seqlen=seqlen, n_channels=num_channels,
                                 feat_channels=feat_channels, n_classes=num_classes,
                                 mean=mean, std=std)

        # Use callback to compute F-scores over entire validation data
        metrics_cb = Metrics(val_data=val_gen, batch_size=batchsize)
        # Use early stopping and model checkpoints to handle overfitting and save best model
        model_checkpt = ModelCheckpoint(
            os.path.join(resultdir,
                         'fold'+str(out_fold)+'_'+mode+'-{epoch:02d}-{val_f1:.4f}.h5'),
            monitor='val_f1',
            mode='max', save_best_only=True)
        batch_renorm_cb = BatchRenormScheduler(len(train_gen))
        history = model.fit(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=1,
            shuffle=False,
            callbacks=[batch_renorm_cb, metrics_cb, model_checkpt],
            workers=2,
            max_queue_size=20,
            use_multiprocessing=False)

        # Plot training history
        # NOTE(review): out_fold is already 1-based here, so out_fold+1 looks
        # off by one - confirm against plot_results before changing it.
        plot_results(out_fold+1, history.history['loss'], history.history['val_loss'],
                     os.path.join(resultdir, 'Fold'+str(out_fold)+'_'+mode+'_loss.jpg'), metric='Loss')
        plot_results(out_fold+1, history.history['accuracy'], history.history['val_accuracy'],
                     os.path.join(resultdir, 'Fold'+str(out_fold)+'_'+mode+'_accuracy.jpg'), metric='Accuracy')
        plot_results(out_fold+1, history.history['macro_f1'], metrics_cb.val_f1,
                     os.path.join(resultdir, 'Fold'+str(out_fold)+'_'+mode+'_macro_f1.jpg'), metric='Macro F1')

        # Predict probability on the outer test fold using best model
        best_model_file, epoch, val_f1 = get_best_model(resultdir, out_fold)
        print('Predicting with model saved at Epoch={:d} with val_f1={:0.4f}'.
              format(epoch, val_f1))
        model.load_weights(os.path.join(resultdir, best_model_file))
        probs = model.predict(test_gen)
        y_true = out_fold_y_test
        # All entries refer to the same rows of `data` (out_fold_test_indices),
        # matching how y_true and test_gen were built.
        predictions.append(
            (out_fold_users_test, out_fold_ts_test, out_fold_fnames_test,
             out_fold_test_indices, y_true, probs))

        # Save user report (cumulative over the folds processed so far)
        cv_save_classification_result(
            predictions,
            states,
            os.path.join(
                resultdir, 'fold' + str(out_fold) + '_deeplearning_' + mode +
                '_results.csv'),
            method='dl')
        cv_get_classification_report(predictions, mode, states, method='dl')

    cv_get_classification_report(predictions, mode, states, method='dl')

    # Save user report
    cv_save_classification_result(predictions,
                                  states,
                                  os.path.join(
                                      resultdir,
                                      'deeplearning_' + mode + '_results.csv'),
                                  method='dl')

    # Raw data is no longer needed once all folds have been evaluated
    raw_fp.close()
def _collate_positive_class_proba(proba):
    """Turn per-output ``predict_proba`` results into one 2-D array.

    When ``proba`` is a list with one (n_samples, 2) array per class, column 1
    of each array is stacked into an (n_samples, n_classes) array. Anything
    that is not a list is returned unchanged.
    """
    if isinstance(proba, list):
        return np.column_stack([cls_proba[:, 1] for cls_proba in proba])
    return proba


def _flat_model_proba(model_path, x_test, mode):
    """Load a saved ``[scaler, classifier]`` pair and return class probabilities."""
    # joblib.load accepts a path, so no file handle is left open here.
    scaler, clf = joblib.load(model_path)
    proba = clf.predict_proba(scaler.transform(x_test))
    if mode == 'multiclass':
        proba = _collate_positive_class_proba(proba)
    return proba


def main(argv):
    """Apply saved feature-engineering models to a test set and save results.

    Args:
        argv: ``[infile, modeldir, mode, ensemble, outdir]`` where
            ``infile`` is the feature CSV, ``modeldir`` holds the saved
            models (files whose name contains ``mode`` are used), ``mode`` is
            'binary', 'nonwear', 'multiclass' or 'hierarchical', ``ensemble``
            is 0 (use a single model) or 1 (average all matching models), and
            ``outdir`` receives ``<mode>_test_classification.csv``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
        NotImplementedError: If ``mode`` is 'hierarchical' and ``ensemble`` is
            0; that combination has no prediction path.
        FileNotFoundError: If no file in ``modeldir`` matches ``mode``.
    """
    infile = argv[0]
    modeldir = argv[1]
    mode = argv[2]
    ensemble = int(argv[3])  # 0 - use best model, 1 - use ensemble
    outdir = argv[4]

    method = 'feat_eng'
    collate_states = []
    collated_label = None
    if mode == 'binary':
        states = ['Wake', 'Sleep']
        collate_states = ['NREM 1', 'NREM 2', 'NREM 3', 'REM']
        collated_label = 'Sleep'
    elif mode == 'nonwear':
        states = ['Wear', 'Nonwear']
        collate_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
        collated_label = 'Wear'
    elif mode == 'multiclass':
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    elif mode == 'hierarchical':
        method = 'hierarchical'
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']
    else:
        # Previously an unknown mode surfaced later as a NameError on `states`.
        raise ValueError(
            "mode must be 'binary', 'nonwear', 'multiclass' or "
            "'hierarchical', got %r" % (mode,))

    if mode == 'hierarchical' and not ensemble:
        # Fail before any work is done; this combination used to crash at the
        # very end with an unbound `y_pred`.
        raise NotImplementedError(
            'hierarchical mode is only supported with ensemble=1')

    df = pd.read_csv(infile)
    if collate_states:
        df.loc[df['label'].isin(collate_states), 'label'] = collated_label

    if mode == 'hierarchical':
        # Class hierarchy for sleep stages
        class_hierarchy = {
            ROOT: {"Wear", "Nonwear"},
            "Wear": {"Wake", "Sleep"},
            "Sleep": {"NREM", "REM"},
            "NREM": {"Light", "NREM 3"},
            "Light": {"NREM 1", "NREM 2"}
        }

        graph = DiGraph(class_hierarchy)
        classes = [node for node in graph.nodes if node != ROOT]

    df = df[df['label'].isin(states)].reset_index()

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_range', 'ENMO_mad', 'ENMO_entropy1',
        'ENMO_entropy2', 'ENMO_prev30diff', 'ENMO_next30diff',
        'ENMO_prev60diff', 'ENMO_next60diff', 'ENMO_prev120diff',
        'ENMO_next120diff', 'angz_mean', 'angz_std', 'angz_range', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prev30diff', 'angz_next30diff',
        'angz_prev60diff', 'angz_next60diff', 'angz_prev120diff',
        'angz_next120diff', 'LIDS_mean', 'LIDS_std', 'LIDS_range', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prev30diff', 'LIDS_next30diff',
        'LIDS_prev60diff', 'LIDS_next60diff', 'LIDS_prev120diff',
        'LIDS_next120diff'
    ]

    ts_test = df['timestamp']
    x_test = df[feat_cols].values
    y_test = df['label']
    if mode != 'hierarchical':
        y_test = np.array([states.index(i) for i in y_test])
    users_test = df['user']
    fnames_test = df['filename']

    N = x_test.shape[0]

    # Sorted so that the model picked in single-model mode does not depend on
    # the arbitrary order returned by os.listdir.
    model_fnames = sorted(
        fname for fname in os.listdir(modeldir) if mode in fname)
    if not model_fnames:
        raise FileNotFoundError(
            'no model file containing %r found in %s' % (mode, modeldir))

    if ensemble:
        y_pred = None
        for fold, fname in enumerate(model_fnames):
            print('Processing fold ' + str(fold + 1))
            model_path = os.path.join(modeldir, fname)
            if mode != 'hierarchical':
                fold_y_pred = _flat_model_proba(model_path, x_test, mode)
            else:
                # NOTE: unpickling executes code from the file; only load
                # model files from a trusted source.
                with open(model_path, 'rb') as model_file:
                    cv_clf = pickle.load(model_file)
                cv_clf = cv_clf.best_estimator_
                fold_y_pred = cv_clf.predict(x_test)
                fold_y_pred_prob = cv_clf.predict_proba(x_test)
                with multi_labeled(y_test, fold_y_pred, cv_clf.named_steps['clf'].graph_) \
                                      as (y_test_, y_pred_, graph_, classes_):
                    # The result columns follow the class order reported by
                    # multi_labeled, so the state list is replaced by it.
                    states = classes_
                    y_test_ = fill_ancestors(y_test_, graph=graph_)
                    # Reorder probability columns from `classes` order to
                    # `classes_` order.
                    fold_y_pred_ = np.zeros(fold_y_pred_prob.shape)
                    for new_idx, label in enumerate(classes_):
                        old_idx = classes.index(label)
                        fold_y_pred_[:, new_idx] = fold_y_pred_prob[:, old_idx]
                fold_y_pred = fold_y_pred_

            # Accumulate prediction probabilities
            if y_pred is None:
                y_pred = np.zeros((N, len(states)))
            y_pred += fold_y_pred

        # Get average predictions
        y_pred = y_pred / float(len(model_fnames))
        if mode == 'hierarchical':
            y_test = y_test_
    else:
        # Single-model path; multiclass output is collated the same way as in
        # the ensemble path.
        y_pred = _flat_model_proba(
            os.path.join(modeldir, model_fnames[0]), x_test, mode)

    # Save test results
    y_pred = [(users_test, ts_test, fnames_test, y_test, y_pred)]
    cv_save_classification_result(y_pred,
                                  states,
                                  os.path.join(
                                      outdir,
                                      mode + '_test_classification.csv'),
                                  method=method)
示例#8
0
def main(argv):
  """Train hierarchical random-forest classifiers with user-grouped outer CV.

  Reads the feature CSV, keeps rows whose label is one of the five sleep
  stages or 'Nonwear', and for each of 5 GroupKFold folds (grouped by user)
  tunes a scaler + HierarchicalClassifier pipeline with a randomized search
  over a stratified inner CV. Each fitted search object is dumped to
  ``<outdir>/models`` and the collected out-of-fold probabilities are saved to
  ``<outdir>/hierarchical_classification_results.csv``.

  Args:
    argv: ``[infile, dataset, outdir]``; ``dataset`` is read but not used.
  """
  infile, dataset, outdir = argv[0], argv[1], argv[2]

  resultdir = os.path.join(outdir, 'models')
  if not os.path.exists(resultdir):
    os.makedirs(resultdir)

  # Read data file and retain data only corresponding to the listed states
  object_columns = dict.fromkeys(['label', 'user', 'position', 'dataset'], object)
  df = pd.read_csv(infile, dtype=object_columns)
  states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']
  keep_rows = df['label'].isin(states)
  df = df[keep_rows].reset_index()

  n_samples = len(df)
  print('... Number of data samples: %d' % n_samples)
  label_counts = Counter(df['label'])
  for cls, count in label_counts.items():
    print('%s: %d (%0.2f%%)' % (cls, count, count*100.0/n_samples))

  # Same 12 statistics for each of the three signals, signal-major order
  stat_suffixes = ['mean', 'std', 'range', 'mad', 'entropy1', 'entropy2',
                   'prev30diff', 'next30diff', 'prev60diff', 'next60diff',
                   'prev120diff', 'next120diff']
  feat_cols = [signal + '_' + suffix
               for signal in ('ENMO', 'angz', 'LIDS')
               for suffix in stat_suffixes]

  ts = df['timestamp']
  X = df[feat_cols].values
  y = df['label']
  groups = df['user']
  fnames = df['filename']
  feat_len = X.shape[1]  # not used below

  # Class hierarchy for sleep stages
  class_hierarchy = {
    ROOT: {"Wear", "Nonwear"},
    "Wear": {"Wake", "Sleep"},
    "Sleep": {"NREM", "REM"},
    "NREM": {"Light", "NREM 3"},
    "Light": {"NREM 1", "NREM 2"},
  }

  graph = DiGraph(class_hierarchy)
  classes = [node for node in graph.nodes if node != ROOT]

  outer_cv_splits = 5
  inner_cv_splits = 5
  factor = 10.0  # not used below

  # Outer CV
  group_kfold = GroupKFold(n_splits=outer_cv_splits)
  hierarchical_pred = []
  outer_splits = group_kfold.split(X, y, groups)
  for out_fold, (train_indices, test_indices) in enumerate(outer_splits, start=1):
    print('Processing fold ' + str(out_fold))
    out_fold_X_train = X[train_indices, :]
    out_fold_X_test = X[test_indices, :]
    out_fold_y_train = y[train_indices]
    out_fold_y_test = y[test_indices]
    out_fold_users_test = groups[test_indices]
    out_fold_ts_test = ts[test_indices]
    out_fold_fnames_test = fnames[test_indices]

    # Create a pipeline with scaler and hierarchical classifier
    base_forest = RandomForestClassifier(random_state=0, n_estimators=100, n_jobs=-1)
    hier_clf = HierarchicalClassifier(
      base_estimator=base_forest,
      class_hierarchy=class_hierarchy,
      prediction_depth='mlnp',
      progress_wrapper=tqdm,
    )
    pipe = Pipeline([('scaler', StandardScaler()), ('clf', hier_clf)])

    # Inner CV: materialise the stratified (train, test) index pairs
    strat_kfold = StratifiedKFold(n_splits=inner_cv_splits,
                                  random_state=0, shuffle=True)
    custom_cv_indices = [
      (inner_train_idx, inner_test_idx)
      for inner_train_idx, inner_test_idx
      in strat_kfold.split(out_fold_X_train, out_fold_y_train)
    ]

    print('Training')
    search_params = {
      'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500, 700],
      'clf__base_estimator__max_depth': [5, 10, 15, None],
    }
    h_fbeta_scorer = make_scorer(custom_h_fbeta, graph=graph)
    cv_clf = RandomizedSearchCV(estimator=pipe,
                                param_distributions=search_params,
                                cv=custom_cv_indices,
                                scoring=h_fbeta_scorer,
                                n_iter=5,
                                n_jobs=-1,
                                verbose=1)
    cv_clf.fit(out_fold_X_train, out_fold_y_train)
    model_path = os.path.join(resultdir,
                              'fold'+str(out_fold)+'_hierarchical_RF.sav')
    joblib.dump(cv_clf, model_path)

    print('Predicting')
    out_fold_y_pred = cv_clf.predict(out_fold_X_test)
    out_fold_y_pred_prob = cv_clf.predict_proba(out_fold_X_test)

    best_clf = cv_clf.best_estimator_
    fitted_graph = best_clf.named_steps['clf'].graph_

    # Use the hierarchical metrics helpers via the multi-label wrapper
    with multi_labeled(out_fold_y_test, out_fold_y_pred, fitted_graph) \
                            as (y_test_, y_pred_, graph_, classes_):
      states = classes_
      y_test_ = fill_ancestors(y_test_, graph=graph_)
      y_pred_ = fill_ancestors(y_pred_, graph=graph_)
      # Reorder probability columns from `classes` order to `classes_` order
      y_pred_prob_ = np.zeros(out_fold_y_pred_prob.shape)
      for new_idx, label in enumerate(classes_):
        old_idx = classes.index(label)
        y_pred_prob_[:, new_idx] = out_fold_y_pred_prob[:, old_idx]

      fold_result = (out_fold_users_test, out_fold_ts_test, out_fold_fnames_test,
                     y_test_, y_pred_prob_)
      hierarchical_pred.append(fold_result)

  results_path = os.path.join(outdir, 'hierarchical_classification_results.csv')
  cv_save_classification_result(hierarchical_pred, states, results_path,
                                method='hierarchical')