예제 #1
0
def test_intermediate_node_training_data():
    r"""Check that labels attached to intermediate (non-leaf) nodes are honoured.

    The training set contains both leaf labels and a label that sits on an
    internal node of the hierarchy; after fitting, that internal label must
    appear among the classes of its parent's local classifier.

    """
    G, (X, y) = make_clothing_graph_and_data(root=ROOT)

    # Hanging "Pants" below "Bottoms" turns "Bottoms" into an intermediate
    # node, while the training data still labels samples as "Bottoms".
    G.add_edge("Bottoms", "Pants")

    # Sanity-check the fixture: nothing is labelled with the new leaf ...
    has_pants_label = any(label == "Pants" for label in y)
    assert_that(has_pants_label, is_(False))
    # ... but the (now intermediate) node does carry training data.
    has_bottoms_label = any(label == "Bottoms" for label in y)
    assert_that(has_bottoms_label, is_(True))

    base_estimator = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        max_iter=1000,
    )
    clf = HierarchicalClassifier(
        base_estimator,
        root=ROOT,
        algorithm="lcpn",
        class_hierarchy=G,
    )
    clf.fit(X, y)

    # The non-terminal node that has training data must be one of the classes
    # known to the classifier of its parent node ("Mens").
    mens_classifier = clf.graph_.nodes()["Mens"]["classifier"]
    assert_that(mens_classifier.classes_, has_item("Bottoms"))
예제 #2
0
def classify_digits():
    r"""Train a hierarchical classifier on a few handwritten digits and report on it.

    The digits are arranged in the following class hierarchy:

            <ROOT>
           /      \
          A        B
         / \       |  \
        1   7      C   9
                 /   \
                3     8

    The function prints the loaded data, a flat classification report for a
    held-out 20% split, and the hierarchical F-beta score.
    """
    # Load the digits restricted to the five leaf classes of the hierarchy.
    X, y = make_digits_dataset(targets=[1, 7, 3, 8, 9], as_str=False)
    print(type(X), X.shape, X)

    # Leaf labels become strings so every label in the hierarchy has one type.
    y = y.astype(str)
    print(type(y[0]), y.shape, y)

    # Each local classifier is an SVD projection followed by an RBF-kernel SVM.
    svc = svm.SVC(kernel="rbf", gamma=0.001, probability=True)
    base_estimator = make_pipeline(TruncatedSVD(n_components=24), svc)

    clf = HierarchicalClassifier(
        base_estimator=base_estimator,
        class_hierarchy={
            ROOT: ["A", "B"],
            "A": ["1", "7"],
            "B": ["C", "9"],
            "C": ["3", "8"],
        },
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("Classification Report:\n", classification_report(y_test, y_pred))

    # The hierarchical metrics work on multi-label matrices; the
    # ``multi_labeled`` context manager provides the converted inputs.
    with multi_labeled(y_test, y_pred, clf.graph_) as (y_test_, y_pred_, graph_):
        h_fbeta = h_fbeta_score(y_test_, y_pred_, graph_)
        print("h_fbeta_score: ", h_fbeta)
예제 #3
0
 def create_classifier(cls, category_hierarchy):
     """Build the hierarchical classifier used for ``category_hierarchy``.

     The base estimator comes from ``cls.create_base_classifier()``; the
     hierarchy is handled with the ``'lcpn'`` algorithm, ``'nmlnp'`` prediction
     depth and a stopping criterion of 0.5.

     Returns:
         An unfitted ``HierarchicalClassifier``.
     """
     base_classifier = cls.create_base_classifier()
     return HierarchicalClassifier(
         class_hierarchy=category_hierarchy,
         base_estimator=base_classifier,
         algorithm='lcpn',
         prediction_depth='nmlnp',
         stopping_criteria=0.5,
     )
예제 #4
0
def classify_digits():
    r"""Fit and evaluate a hierarchical classifier on a subset of the digits dataset.

    We build the following class hierarchy along with data from the handwritten digits dataset:

            <ROOT>
           /      \
          A        B
         / \      / \ \
        1   7    3   8  9

    The docstring is a raw string on purpose: the diagram contains backslashes
    which would otherwise be parsed as escape sequences / line continuations.

    Prints a classification report for a held-out 20% split.
    """
    # NOTE(review): leaf labels are ints while the intermediate labels are
    # strings - confirm the classifier copes with mixed label types.
    class_hierarchy = {
        ROOT: ["A", "B"],
        "A": [1, 7],
        "B": [3, 8, 9],
    }
    # Each local classifier is an SVD projection followed by an RBF-kernel SVM.
    base_estimator = make_pipeline(
        TruncatedSVD(n_components=24),
        svm.SVC(
            gamma=0.001,
            kernel="rbf",
            probability=True
        ),
    )
    clf = HierarchicalClassifier(
        base_estimator=base_estimator,
        class_hierarchy=class_hierarchy,
    )
    # Only the five digits that appear as leaves of the hierarchy are loaded.
    X, y = make_digits_dataset(
        targets=[1, 7, 3, 8, 9],
        as_str=False,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=RANDOM_STATE,
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred))
# Derive the class hierarchy from the flattened per-sample label lists.
hierarchy_f = build_hierarchy(
    [label for sample_labels in y_train_str_p for label in sample_labels])

# Re-key a literal "ROOT" entry under the library's ROOT constant.
if "ROOT" in hierarchy_f:
    hierarchy_f[ROOT] = hierarchy_f["ROOT"]
    del hierarchy_f["ROOT"]

class_hierarchy = extend_hierarchy(hierarchy_f, y_train_str)

# Every local classifier is: vectorizer -> one-vs-rest linear SVM.
bclf = OneVsRestClassifier(LinearSVC())
base_estimator = make_pipeline(vectorizer, bclf)

# ``preprocessing`` and ``use_decision_function`` are intentionally not
# passed here, so the classifier's defaults apply for both.
clf = HierarchicalClassifier(
    class_hierarchy=class_hierarchy,
    base_estimator=base_estimator,
    training_strategy="siblings",
    algorithm="lcn",
    mlb=mlb,
)

print("training classifier")
print(len(x_train_str), len(y_train))

# Hold out 20% of the training data (fixed seed).
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    x_train_str, y_train, test_size=0.2, random_state=42)

print('X_train_s.shape', len(X_train_s), len(X_train_s[0]))
# Dump the length of every training sample.
for x in X_train_s:
    print(len(x))
def test_estimator_inteface():
    """Run scikit-learn's estimator compatibility checks on a default-constructed classifier."""
    estimator = HierarchicalClassifier()
    check_estimator(estimator)
def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is closed on exit.

    Only meant for the locally produced cache files of this script - pickle
    must never be used on untrusted data.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle ``obj`` into ``path``; the file is flushed and closed on exit."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def main():
    """Train and evaluate hierarchical thick/thin/glue flake classifiers.

    Everything is driven by hard-coded paths for ``YoungJaeShinSamples/4``:

    1. Build the complete annotation database (once) by copying the
       incomplete one and merging the thick/thin annotations into it.
    2. Load - or create and cache - the train/val split and flake features.
    3. Standardise the features with the training mean/std and save these
       statistics to ``normfea.p``.
    4. For every ``C`` of a fixed grid, load or fit a
       ``HierarchicalClassifier`` whose base estimator is selected by
       ``hyperparams['clf_method']``, then print row-normalised confusion
       matrices and per-class average precision, and save a precision/recall
       plot.

    Raises:
        NotImplementedError: if a classifier has to be fitted and
            ``hyperparams['clf_method']`` is neither ``'hielinearsvm'`` nor
            ``'hierbfsvm'``.
    """
    import shutil

    from sklearn.metrics import (accuracy_score, average_precision_score,
                                 confusion_matrix, precision_recall_curve)
    from sklearn.pipeline import make_pipeline
    from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
    from sklearn_hierarchical_classification.constants import ROOT

    subexp_name = 'YoungJaeShinSamples/4'
    anno_incomplete_file = '../data/data_jan2019_anno/anno_all_incomplete_YoungJaeShinSamples_4_useryoungjae.db'
    anno_complete_file = '../data/data_jan2019_anno/anno_all_YoungJaeShinSamples_4_useryoungjae.db'
    thickthin_anno_file = '../data/data_jan2019_anno/anno_thickthin_v2_YoungJaeShinSamples_4_useryoungjae.db'

    data_path = os.path.join('../data/data_jan2019', subexp_name)
    result_path = os.path.join('../results/data_jan2019_script/mat',
                               subexp_name)
    clf_path = os.path.join(
        '../results/data_jan2019_script/thickthinglue_clf_complete',
        subexp_name)
    os.makedirs(clf_path, exist_ok=True)

    # Merge the annotations: start from a copy of the incomplete database and
    # write the thick/thin annotations into it. shutil.copy (instead of a
    # shelled-out ``cp``) is portable and raises if the copy fails.
    if not os.path.exists(anno_complete_file):
        shutil.copy(anno_incomplete_file, anno_complete_file)

        thickthin_oriname_newflakeids = readthickthindb(thickthin_anno_file)
        print(len(thickthin_oriname_newflakeids))

        updatedb(anno_complete_file, thickthin_oriname_newflakeids)

    # Train/val split: reuse the cached split when there is one so that
    # repeated runs evaluate on the same samples.
    split_name = os.path.join(clf_path, 'train_val_split.p')
    if os.path.exists(split_name):
        split = _load_pickle(split_name)
        train_names = split['train_names']
        train_labels = split['train_labels']
        val_names = split['val_names']
        val_labels = split['val_labels']
    else:
        itemname_labels = readdb(anno_complete_file)
        train_names, train_labels, val_names, val_labels = split_trainval(
            itemname_labels)
        _dump_pickle(
            {
                'train_names': train_names,
                'train_labels': train_labels,
                'val_names': val_names,
                'val_labels': val_labels,
            },
            split_name)

    # Flakes and their features: cached as well, because loading all images
    # is the expensive part.
    flake_save_name = os.path.join(clf_path, 'train_val_data.p')
    if os.path.exists(flake_save_name):
        cached = _load_pickle(flake_save_name)
        train_flakes = cached['train_flakes']
        train_feats = cached['train_feats']
        val_flakes = cached['val_flakes']
        val_feats = cached['val_feats']
    else:
        img_names = sorted(os.listdir(data_path))
        # ``[:-4]`` strips the image extension (assumed to be 4 characters,
        # e.g. ".tif") to find the matching result pickle.
        img_flakes = Parallel(n_jobs=8)(
            delayed(load_one_image)(
                os.path.join(data_path, img_name),
                os.path.join(result_path, img_name[:-4] + '.p'))
            for img_name in img_names)
        train_flakes, train_feats = locate_flakes(train_names, img_flakes,
                                                  img_names)
        val_flakes, val_feats = locate_flakes(val_names, img_flakes, img_names)
        _dump_pickle(
            {
                'train_flakes': train_flakes,
                'train_feats': train_feats,
                'val_flakes': val_flakes,
                'val_feats': val_feats,
            },
            flake_save_name)
    print('loading done')

    # Standardise with the *training* statistics only and persist them so
    # the same normalisation can be applied later.
    # NOTE(review): a constant feature has std == 0 and yields inf/nan here.
    mean_feat = np.mean(train_feats, axis=0, keepdims=True)
    std_feat = np.std(train_feats, axis=0, keepdims=True)
    _dump_pickle({'mean': mean_feat, 'std': std_feat},
                 os.path.join(clf_path, 'normfea.p'))
    train_feats = (train_feats - mean_feat) / std_feat
    val_feats = (val_feats - mean_feat) / std_feat

    method = hyperparams['clf_method']
    print(method)
    # Regularisation strengths swept below.
    Cs = [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5]
    # Kernel of the SVM base estimator for each supported method name.
    kernels = {'hielinearsvm': 'linear', 'hierbfsvm': 'rbf'}

    # Labels 0 and 1 are grouped under the intermediate node "A"; label 2
    # hangs directly below the root.
    class_hierarchy = {
        ROOT: ["A", "2"],
        "A": ["0", "1"],
    }
    # Display names of the integer labels 0, 1 and 2 (in that order).
    class_names = ['thick', 'thin', 'glue']

    for C in Cs:
        clf_save_path = os.path.join(
            clf_path, 'feanorm_weighted_classifier-%s-%f.p' % (method, C))
        if os.path.exists(clf_save_path):
            clf = _load_pickle(clf_save_path)
        else:
            if method not in kernels:
                raise NotImplementedError(
                    'unsupported clf_method: %s' % method)
            base_estimator = make_pipeline(
                SVC(gamma='auto', kernel=kernels[method], probability=True,
                    C=C))
            clf = HierarchicalClassifier(
                base_estimator=base_estimator,
                class_hierarchy=class_hierarchy,
            )
            # The hierarchy uses string labels, so train on strings too.
            clf.fit(train_feats, [str(label) for label in train_labels])
            _dump_pickle(clf, clf_save_path)

        # Predictions come back as strings; convert them to ints so they can
        # be compared with the integer ground truth.
        train_pred_cls = [int(pred) for pred in clf.predict(train_feats)]
        train_pred_scores = clf.predict_proba(train_feats)
        val_pred_cls = [int(pred) for pred in clf.predict(val_feats)]
        val_pred_scores = clf.predict_proba(val_feats)

        # NOTE(review): clf_path already ends with subexp_name, so it is
        # nested twice here - kept as is to preserve the output location.
        clf_vis_path = os.path.join(clf_path, subexp_name, 'vis',
                                    'feanorm_weighted_%s-%f' % (method, C))
        os.makedirs(clf_vis_path, exist_ok=True)

        train_acc = accuracy_score(train_labels, train_pred_cls)
        val_acc = accuracy_score(val_labels, val_pred_cls)
        # Row-normalised confusion matrices (each true-class row sums to 1).
        train_conf = confusion_matrix(train_labels, train_pred_cls)
        train_conf = train_conf / np.sum(train_conf, 1, keepdims=True)
        val_conf = confusion_matrix(val_labels, val_pred_cls)
        val_conf = val_conf / np.sum(val_conf, 1, keepdims=True)
        print(train_conf)
        print(val_conf)

        # Per-class average precision and precision/recall curves.
        # NOTE(review): column ``label`` of predict_proba is assumed to hold
        # the score of class ``label`` - verify against ``clf.classes_``.
        train_aps = []
        val_aps = []
        fig = plt.figure()
        ax = fig.add_subplot(111)
        for label, class_name in enumerate(class_names):
            l_train_labels = [truth == label for truth in train_labels]
            l_val_labels = [truth == label for truth in val_labels]
            train_aps.append(
                average_precision_score(l_train_labels,
                                        train_pred_scores[:, label]))
            val_aps.append(
                average_precision_score(l_val_labels,
                                        val_pred_scores[:, label]))
            precision_l, recall_l, _ = precision_recall_curve(
                np.array(l_val_labels, dtype=np.uint8),
                val_pred_scores[:, label])
            print(label, getPrecisionAtRecall(precision_l, recall_l, 0.90),
                  getPrecisionAtRecall(precision_l, recall_l, 0.95))
            ax.plot(recall_l, precision_l, label=class_name)

        ax.legend()
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_ylim([0.0, 1.05])
        ax.set_xlim([0.0, 1.0])

        fig.savefig(os.path.join(clf_path,
                                 'feanorm_weighted_%s-%f.png' % (method, C)),
                    dpi=300)
        plt.close(fig)

        print(train_aps)
        print(val_aps)
        print('%s-%f: train: %.4f, val: %.4f, ap train: %4f, ap val: %4f' %
              (method, C, train_acc, val_acc, np.mean(train_aps),
               np.mean(val_aps)))
예제 #8
0
def make_classifier(base_estimator=None, class_hierarchy=None, **kwargs):
    """Create a ``HierarchicalClassifier`` from the given settings.

    Args:
        base_estimator: forwarded as the classifier's ``base_estimator``.
        class_hierarchy: forwarded as the classifier's ``class_hierarchy``.
        **kwargs: any further keyword arguments, forwarded unchanged.

    Returns:
        The newly constructed ``HierarchicalClassifier``.
    """
    options = {
        "class_hierarchy": class_hierarchy,
        "base_estimator": base_estimator,
    }
    options.update(kwargs)
    return HierarchicalClassifier(**options)
예제 #9
0
def main(argv):
    """Run nested cross-validation of a hierarchical sleep-stage classifier.

    The input CSV is restricted to the five sleep states of the ``UPenn``
    dataset. An outer ``GroupKFold`` (grouped by user) estimates performance;
    inside each outer fold a randomized search over random-forest settings is
    run with a stratified inner CV. Hierarchical precision/recall/F-beta are
    collected overall and per hierarchy node, reported, and the per-user
    predictions are written to ``hierarchical_results.csv``.

    Args:
        argv: sequence whose first item is the input CSV path and whose second
            item is the output directory (created if missing).
    """
    infile = argv[0]
    outdir = argv[1]
    os.makedirs(outdir, exist_ok=True)

    # Read data file and retain data only corresponding to 5 sleep states
    df = pd.read_csv(infile,
                     dtype={
                         'label': object,
                         'user': object,
                         'position': object,
                         'dataset': object
                     })
    sleep_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    keep = df['label'].isin(sleep_states) & (df['dataset'] == 'UPenn')
    # drop=True keeps the original columns and gives a clean 0..n-1 index.
    df = df[keep].reset_index(drop=True)
    print('... Number of data samples: %d' % len(df))
    ctr = Counter(df['label'])
    for label, count in ctr.items():
        print('%s: %d (%0.2f%%)' % (label, count, count * 100.0 / len(df)))

    # Nine summary statistics for each of the three signals.
    signals = ['ENMO', 'angz', 'LIDS']
    stats = ['mean', 'std', 'min', 'max', 'mad', 'entropy1', 'entropy2',
             'prevdiff', 'nextdiff']
    feat_cols = ['%s_%s' % (signal, stat)
                 for signal in signals for stat in stats]

    X = df[feat_cols].values
    y = df['label']
    groups = df['user']

    # Class hierarchy for sleep stages
    class_hierarchy = {
        ROOT: {"Wake", "Sleep"},
        "Sleep": {"NREM", "REM"},
        "NREM": {"Light", "NREM 3"},
        "Light": {"NREM 1", "NREM 2"}
    }

    graph = DiGraph(class_hierarchy)

    outer_cv_splits = 5
    inner_cv_splits = 3

    # Hierarchy nodes for which per-node metrics are collected; 'Overall'
    # additionally holds the hierarchical scores of the whole prediction.
    node_names = ['Wake', 'Sleep', 'REM', 'NREM', 'NREM 3', 'Light',
                  'NREM 1', 'NREM 2']
    results = {
        name: {'precision': [], 'recall': [], 'fbeta': []}
        for name in node_names + ['Overall']
    }

    # Outer CV
    group_kfold = GroupKFold(n_splits=outer_cv_splits)
    hierarchical_pred = []
    for out_fold, (train_indices, test_indices) in enumerate(
            group_kfold.split(X, y, groups), start=1):
        print('Processing fold ' + str(out_fold))
        out_fold_X_train = X[train_indices, :]
        out_fold_X_test = X[test_indices, :]
        # The CV indices are positions, hence .iloc rather than label lookup.
        out_fold_y_train = y.iloc[train_indices]
        out_fold_y_test = y.iloc[test_indices]
        out_fold_users_test = groups.iloc[test_indices]

        # Create a pipeline with scaler and hierarchical classifier
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('clf',
             HierarchicalClassifier(
                 base_estimator=RandomForestClassifier(random_state=0,
                                                       n_estimators=100,
                                                       n_jobs=-1),
                 class_hierarchy=class_hierarchy,
                 prediction_depth='mlnp',
                 progress_wrapper=tqdm,
             )),
        ])

        # Inner CV: materialise the stratified splits so the search uses
        # exactly these index pairs.
        strat_kfold = StratifiedKFold(n_splits=inner_cv_splits,
                                      random_state=0,
                                      shuffle=True)
        custom_cv_indices = list(
            strat_kfold.split(out_fold_X_train, out_fold_y_train))

        print('Training')
        search_params = {
            'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500],
            'clf__base_estimator__max_depth': [5, 10, None],
        }
        cv_clf = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=search_params,
            cv=custom_cv_indices,
            scoring=make_scorer(custom_h_fbeta, graph=graph),
            n_iter=5,
            n_jobs=-1,
            verbose=1)
        cv_clf.fit(out_fold_X_train, out_fold_y_train)
        print('Predicting')
        out_fold_y_pred = cv_clf.predict(out_fold_X_test)

        best_clf = cv_clf.best_estimator_

        # The hierarchical metrics work on multi-label matrices; the
        # multi_labeled context manager provides the converted inputs.
        with multi_labeled(out_fold_y_test, out_fold_y_pred,
                           best_clf.named_steps['clf'].graph_) \
                as (y_test_, y_pred_, graph_, classes_):
            fold_h_prec, fold_h_rec, fold_h_fbeta = h_fbeta_score(
                y_test_, y_pred_, graph_)
            results['Overall']['precision'].append(fold_h_prec)
            results['Overall']['recall'].append(fold_h_rec)
            results['Overall']['fbeta'].append(fold_h_fbeta)
            print("Fold %d: precision: %0.4f, recall: %0.4f, fbeta: %0.4f" %
                  (out_fold, fold_h_prec, fold_h_rec, fold_h_fbeta))

            # Per-node metrics are computed after fill_ancestors has been
            # applied to both label matrices.
            y_test_ = fill_ancestors(y_test_, graph=graph_)
            y_pred_ = fill_ancestors(y_pred_, graph=graph_)

            hierarchical_pred.append(
                (out_fold_users_test, y_test_, y_pred_, classes_))

            for node in node_names:
                node_prec, node_rec, node_fbeta, _ = get_node_metrics(
                    y_test_, y_pred_, classes_, node)
                results[node]['precision'].append(node_prec)
                results[node]['recall'].append(node_rec)
                results[node]['fbeta'].append(node_fbeta)

    get_classification_report(results)
    save_user_report(hierarchical_pred,
                     os.path.join(outdir, 'hierarchical_results.csv'))
예제 #10
0
파일: hsvm.py 프로젝트: godkillok/xunfei
def svm_train():
    """Train a hierarchical linear-SVM text classifier and log its quality.

    A TF-IDF vectorizer is fitted on the train, test and prediction texts
    together and pickled to ``project_path + 'tfidf.pkl'``. A
    ``HierarchicalClassifier`` over the ``label1_label2`` hierarchy, with a
    probability-calibrated ``LinearSVC`` as base estimator, is then fitted on
    the training set. Train metrics, test metrics and a "top-2" test accuracy
    (a sample counts as correct when its true label is among the two most
    probable classes) are logged.
    """
    from sklearn.metrics import classification_report

    train_x, train_y, _, _ = get_data_set(train_path)
    test_x, _, _, test_y2 = get_data_set(test_path)
    pred_x, _, _, _ = get_data_set(pred_path)

    logging.info('train {} test{}'.format(len(train_x), len(test_x)))
    t = time.time()

    # The vocabulary is built from all available texts, including the test
    # and prediction sets.
    data_set = train_x + test_x + pred_x
    vec = TfidfVectorizer(ngram_range=(1, 3),
                          min_df=10,
                          max_df=0.9,
                          use_idf=True,
                          smooth_idf=True,
                          sublinear_tf=True)
    # Only the fitted state is needed here; the texts are transformed later.
    vec.fit(data_set)
    with open(project_path + 'tfidf.pkl', 'wb') as f:
        pickle.dump(vec, f)

    trn_term_doc = vec.transform(train_x)
    print(label1_label2)
    logging.info('time spend {}'.format(time.time() - t))

    logging.info('begin svm ')
    # LinearSVC has no predict_proba; the calibration wrapper adds it.
    lin_clf = CalibratedClassifierCV(svm.LinearSVC(C=1))

    clf = HierarchicalClassifier(
        base_estimator=lin_clf,
        class_hierarchy=label1_label2,
    )
    # NOTE(review): the model is fitted on ``train_y`` but evaluated against
    # ``test_y2`` below - confirm both hold the same kind of label.
    clf.fit(trn_term_doc, train_y)

    print(clf.classes_)

    logging.info('end  svm ')

    # --- training-set metrics ---
    train_preds = clf.predict(trn_term_doc)
    train_preds_prob = clf.predict_proba(trn_term_doc)
    print(len((clf.classes_)), train_preds_prob.shape)
    for reg, prob in zip(train_preds, train_preds_prob):
        print(reg, list(prob.argsort()[-1:][::-1]))

    logging.info('train {} accuracy_score {},  \n {}'.format(
        'train', accuracy_score(train_y, train_preds),
        classification_report(train_y, train_preds)))
    t2 = time.time()
    logging.info('time spend {}'.format(t2 - t))

    # --- test-set metrics ---
    test_term_doc = vec.transform(test_x)
    test_preds_1 = clf.predict_proba(test_term_doc)
    test_preds = clf.predict(test_term_doc)
    # Report the accuracy of the *test* predictions here (this line used to
    # repeat the training accuracy next to the test classification report).
    logging.info('train {} accuracy_score {},  \n {}'.format(
        'test', accuracy_score(test_y2, test_preds),
        classification_report(test_y2, test_preds)))

    # --- top-2 test accuracy ---
    # For every test sample: indices of the two most probable classes, most
    # probable first.
    top2_indices = [list(prob.argsort()[-2:][::-1]) for prob in test_preds_1]

    test_y_name = []
    test_preds_name = []
    for real, pred in zip(test_y2, top2_indices):
        # Default to the most probable class; switch to whichever top-2
        # candidate equals the true label, if any.
        prd = pred[0]
        print(real, pred)
        for pr in pred:
            if real == clf.classes_[pr]:
                prd = pr
        test_y_name.append(real)
        test_preds_name.append(clf.classes_[prd])

    logging.info('{} model on {} data accuracy_score {} top2 test\n {}'.format(
        "train", test_path, accuracy_score(test_y_name, test_preds_name),
        classification_report(test_y_name, test_preds_name)))
예제 #11
0
        1   7      C   9
                 /   \
                3     8
    """
    if "ROOT" in hierarchy:
        hierarchy[ROOT] = hierarchy["ROOT"]
        del hierarchy["ROOT"]
    class_hierarchy = hierarchy
    bclf = OneVsRestClassifier(LinearSVC())

    base_estimator = make_pipeline(vectorizer, bclf)

    clf = HierarchicalClassifier(base_estimator=base_estimator,
                                 class_hierarchy=class_hierarchy,
                                 algorithm="lcn",
                                 training_strategy="siblings",
                                 preprocessing=True,
                                 mlb=mlb,
                                 use_decision_function=True)

    print("training classifier")
    clf.fit(X_train_raw, y_train[:, :])
    print("predicting")

    y_pred_scores = clf.predict_proba(X_dev_raw)

    y_pred_scores[np.where(y_pred_scores == 0)] = -10
    y_pred = y_pred_scores > -0.25

    if train == 1:
        print('f1 micro:',
예제 #12
0
def main(argv):
  """Run nested cross-validation of a hierarchical wear/sleep-stage classifier.

  Rows whose label is one of the five sleep states or ``Nonwear`` are kept.
  An outer ``GroupKFold`` (grouped by user) is used; in each outer fold a
  randomized search over random-forest settings is run with a stratified
  inner CV, the fitted search object is dumped to ``<outdir>/models`` and the
  class probabilities of the test samples are collected. All folds are finally
  written to ``hierarchical_classification_results.csv``.

  Args:
    argv: sequence of ``(infile, dataset, outdir)`` - the input CSV path, a
      dataset name (currently unused) and the output directory.
  """
  infile = argv[0]
  dataset = argv[1]  # currently unused; kept to document the argv layout
  outdir = argv[2]

  resultdir = os.path.join(outdir, 'models')
  os.makedirs(resultdir, exist_ok=True)

  # Read data file and retain data only corresponding to 5 sleep states
  # (plus non-wear periods)
  df = pd.read_csv(infile, dtype={'label': object, 'user': object,
                                  'position': object, 'dataset': object})
  states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']
  # reset_index gives a 0..n-1 index after filtering.
  df = df[df['label'].isin(states)].reset_index()

  print('... Number of data samples: %d' % len(df))
  ctr = Counter(df['label'])
  for label, count in ctr.items():
    print('%s: %d (%0.2f%%)' % (label, count, count * 100.0 / len(df)))

  # Twelve summary statistics for each of the three signals.
  signals = ['ENMO', 'angz', 'LIDS']
  stats = ['mean', 'std', 'range', 'mad', 'entropy1', 'entropy2',
           'prev30diff', 'next30diff', 'prev60diff', 'next60diff',
           'prev120diff', 'next120diff']
  feat_cols = ['%s_%s' % (signal, stat)
               for signal in signals for stat in stats]

  ts = df['timestamp']
  X = df[feat_cols].values
  y = df['label']
  groups = df['user']
  fnames = df['filename']

  # Class hierarchy for sleep stages
  class_hierarchy = {
    ROOT: {"Wear", "Nonwear"},
    "Wear": {"Wake", "Sleep"},
    "Sleep": {"NREM", "REM"},
    "NREM": {"Light", "NREM 3"},
    "Light": {"NREM 1", "NREM 2"}
  }

  graph = DiGraph(class_hierarchy)
  classes = [node for node in graph.nodes if node != ROOT]
  # NOTE(review): the columns of predict_proba are assumed to follow the
  # order of ``classes`` - confirm against the fitted classifier's classes_.
  class_to_col = {label: idx for idx, label in enumerate(classes)}

  outer_cv_splits = 5
  inner_cv_splits = 5

  # Outer CV
  group_kfold = GroupKFold(n_splits=outer_cv_splits)
  hierarchical_pred = []
  for out_fold, (train_indices, test_indices) in enumerate(
      group_kfold.split(X, y, groups), start=1):
    print('Processing fold ' + str(out_fold))
    out_fold_X_train = X[train_indices, :]
    out_fold_X_test = X[test_indices, :]
    # The CV indices are positions, hence .iloc rather than label lookup.
    out_fold_y_train = y.iloc[train_indices]
    out_fold_y_test = y.iloc[test_indices]
    out_fold_users_test = groups.iloc[test_indices]
    out_fold_ts_test = ts.iloc[test_indices]
    out_fold_fnames_test = fnames.iloc[test_indices]

    # Create a pipeline with scaler and hierarchical classifier
    pipe = Pipeline([
      ('scaler', StandardScaler()),
      ('clf', HierarchicalClassifier(
        base_estimator=RandomForestClassifier(random_state=0,
                                              n_estimators=100, n_jobs=-1),
        class_hierarchy=class_hierarchy,
        prediction_depth='mlnp',
        progress_wrapper=tqdm,
      )),
    ])

    # Inner CV: materialise the stratified splits so the search uses
    # exactly these index pairs.
    strat_kfold = StratifiedKFold(n_splits=inner_cv_splits,
                                  random_state=0, shuffle=True)
    custom_cv_indices = list(
      strat_kfold.split(out_fold_X_train, out_fold_y_train))

    print('Training')
    search_params = {
      'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500, 700],
      'clf__base_estimator__max_depth': [5, 10, 15, None],
    }
    cv_clf = RandomizedSearchCV(
      estimator=pipe, param_distributions=search_params,
      cv=custom_cv_indices,
      scoring=make_scorer(custom_h_fbeta, graph=graph),
      n_iter=5, n_jobs=-1, verbose=1)
    cv_clf.fit(out_fold_X_train, out_fold_y_train)
    joblib.dump(cv_clf, os.path.join(
      resultdir, 'fold' + str(out_fold) + '_hierarchical_RF.sav'))
    print('Predicting')
    out_fold_y_pred = cv_clf.predict(out_fold_X_test)
    out_fold_y_pred_prob = cv_clf.predict_proba(out_fold_X_test)

    best_clf = cv_clf.best_estimator_

    # multi_labeled converts the labels to multi-label matrices and also
    # yields the class order (classes_) used for their columns.
    with multi_labeled(out_fold_y_test, out_fold_y_pred,
                       best_clf.named_steps['clf'].graph_) \
        as (y_test_, y_pred_, graph_, classes_):
      # The class order of the last fold is the one passed to the report.
      states = classes_
      y_test_ = fill_ancestors(y_test_, graph=graph_)
      # NOTE(review): y_pred_ is not consumed below (probabilities are saved
      # instead); the call is kept in case fill_ancestors has side effects.
      y_pred_ = fill_ancestors(y_pred_, graph=graph_)
      # Re-order the probability columns to the order given by classes_.
      y_pred_prob_ = np.zeros(out_fold_y_pred_prob.shape)
      for new_idx, label in enumerate(classes_):
        y_pred_prob_[:, new_idx] = out_fold_y_pred_prob[:, class_to_col[label]]

      hierarchical_pred.append(
        (out_fold_users_test, out_fold_ts_test, out_fold_fnames_test,
         y_test_, y_pred_prob_))

  cv_save_classification_result(
    hierarchical_pred, states,
    os.path.join(outdir, 'hierarchical_classification_results.csv'),
    method='hierarchical')