def test_intermediate_node_training_data():
    r"""Test that a training set which includes intermediate (non-leaf) nodes as labels,
    as well as leaf nodes, constructs a correct classifier hierarchy.

    """
    G, (X, y) = make_clothing_graph_and_data(root=ROOT)

    # Add a new node rendering "Bottoms" an intermediate node with training data
    G.add_edge("Bottoms", "Pants")
    assert_that(any(yi == "Pants" for yi in y), is_(False))
    assert_that(any(yi == "Bottoms" for yi in y), is_(True))

    base_estimator = LogisticRegression(
        solver="lbfgs",
        max_iter=1_000,
        multi_class="multinomial",
    )
    clf = HierarchicalClassifier(
        base_estimator,
        class_hierarchy=G,
        algorithm="lcpn",
        root=ROOT,
    )
    clf.fit(X, y)

    # Ensure the non-terminal node with training data is included in its parent classifier's classes
    assert_that(clf.graph_.nodes()["Mens"]["classifier"].classes_, has_item("Bottoms"))
def classify_digits(): r"""Test that a nontrivial hierarchy leaf classification behaves as expected. We build the following class hierarchy along with data from the handwritten digits dataset: <ROOT> / \ A B / \ | \ 1 7 C 9 / \ 3 8 """ class_hierarchy = { ROOT: ["A", "B"], "A": ["1", "7"], "B": ["C", "9"], "C": ["3", "8"], } base_estimator = make_pipeline( TruncatedSVD(n_components=24), svm.SVC( gamma=0.001, kernel="rbf", probability=True ), ) clf = HierarchicalClassifier( base_estimator=base_estimator, class_hierarchy=class_hierarchy, ) X, y = make_digits_dataset( targets=[1, 7, 3, 8, 9], as_str=False, ) print(type(X), X.shape, X) # cast the targets to strings so we have consistent typing of labels across hierarchy y = y.astype(str) print(type(y[0]), y.shape, y) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=RANDOM_STATE, ) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print("Classification Report:\n", classification_report(y_test, y_pred)) # Demonstrate using our hierarchical metrics module with MLB wrapper with multi_labeled(y_test, y_pred, clf.graph_) as (y_test_, y_pred_, graph_): h_fbeta = h_fbeta_score( y_test_, y_pred_, graph_, ) print("h_fbeta_score: ", h_fbeta)
def create_classifier(cls, category_hierarchy):
    return HierarchicalClassifier(
        base_estimator=cls.create_base_classifier(),
        class_hierarchy=category_hierarchy,
        prediction_depth='nmlnp',
        algorithm='lcpn',
        stopping_criteria=0.5,
    )
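# Hedged illustration (assumption, not from the original source): with prediction_depth='nmlnp'
# (non-mandatory leaf-node prediction) and stopping_criteria=0.5, predict() may stop at an
# intermediate node when no child clears the probability threshold. The hierarchy and base
# estimator below are made up purely for illustration.
from sklearn.linear_model import LogisticRegression
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT

example_nmlnp_clf = HierarchicalClassifier(
    base_estimator=LogisticRegression(),
    class_hierarchy={ROOT: ["A", "B"], "B": ["B1", "B2"]},
    prediction_depth="nmlnp",
    algorithm="lcpn",
    stopping_criteria=0.5,
)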
def classify_digits(): """Test that a nontrivial hierarchy leaf classification behaves as expected. We build the following class hierarchy along with data from the handwritten digits dataset: <ROOT> / \ A B / \ / \ \ 1 7 3 8 9 """ class_hierarchy = { ROOT: ["A", "B"], "A": [1, 7], "B": [3, 8, 9], } base_estimator = make_pipeline( TruncatedSVD(n_components=24), svm.SVC( gamma=0.001, kernel="rbf", probability=True ), ) clf = HierarchicalClassifier( base_estimator=base_estimator, class_hierarchy=class_hierarchy, ) X, y = make_digits_dataset( targets=[1, 7, 3, 8, 9], as_str=False, ) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=RANDOM_STATE, ) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print("Classification Report:\n", classification_report(y_test, y_pred))
hierarchy_f = build_hierarchy([tj for tk in y_train_str_p for tj in tk])

if "ROOT" in hierarchy_f:
    hierarchy_f[ROOT] = hierarchy_f["ROOT"]
    del hierarchy_f["ROOT"]

class_hierarchy = extend_hierarchy(hierarchy_f, y_train_str)

bclf = OneVsRestClassifier(LinearSVC())
base_estimator = make_pipeline(vectorizer, bclf)

clf = HierarchicalClassifier(
    base_estimator=base_estimator,
    class_hierarchy=class_hierarchy,
    algorithm="lcn",
    training_strategy="siblings",
    # preprocessing=True,
    mlb=mlb,
    # use_decision_function=True
)

print("training classifier")
print(len(x_train_str), len(y_train))

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    x_train_str, y_train, random_state=42, test_size=0.2)
print('X_train_s.shape', len(X_train_s), len(X_train_s[0]))
for x in X_train_s:
    print(len(x))
def test_estimator_interface():
    """Run the scikit-learn estimator compatibility test suite."""
    check_estimator(HierarchicalClassifier())
def main():
    subexp_name = 'YoungJaeShinSamples/4'
    anno_incomplete_file = '../data/data_jan2019_anno/anno_all_incomplete_YoungJaeShinSamples_4_useryoungjae.db'
    anno_complete_file = '../data/data_jan2019_anno/anno_all_YoungJaeShinSamples_4_useryoungjae.db'
    # anno_complete_file = '../data/data_jan2019_anno/anno_all_YoungJaeShinSamples_4_usertest0123.db'
    thickthin_anno_file = '../data/data_jan2019_anno/anno_thickthin_v2_YoungJaeShinSamples_4_useryoungjae.db'
    # thickthin_anno_file = '../data/data_jan2019_anno/anno_thickthin_v2_YoungJaeShinSamples_4_usertest0123.db'
    data_path = os.path.join('../data/data_jan2019', subexp_name)
    result_path = os.path.join('../results/data_jan2019_script/mat', subexp_name)
    clf_path = os.path.join('../results/data_jan2019_script/thickthinglue_clf_complete', subexp_name)
    if not os.path.exists(clf_path):
        os.makedirs(clf_path)

    # merge the annotation
    if not os.path.exists(anno_complete_file):
        os.system('cp %s %s' % (anno_incomplete_file, anno_complete_file))

    # update the annotation into the all annotation
    thickthin_oriname_newflakeids = readthickthindb(thickthin_anno_file)
    n_thickthin_flakes = len(thickthin_oriname_newflakeids)
    print(n_thickthin_flakes)
    updatedb(anno_complete_file, thickthin_oriname_newflakeids)

    # get the train/val split
    split_name = os.path.join(clf_path, 'train_val_split.p')
    if os.path.exists(split_name):
        to_load = pickle.load(open(split_name, 'rb'))
        train_names = to_load['train_names']
        train_labels = to_load['train_labels']
        val_names = to_load['val_names']
        val_labels = to_load['val_labels']
    else:
        itemname_labels = readdb(anno_complete_file)
        train_names, train_labels, val_names, val_labels = split_trainval(itemname_labels)
        to_save = dict()
        to_save['train_names'] = train_names
        to_save['train_labels'] = train_labels
        to_save['val_names'] = val_names
        to_save['val_labels'] = val_labels
        pickle.dump(to_save, open(split_name, 'wb'))

    # load flakes
    flake_save_name = os.path.join(clf_path, 'train_val_data.p')
    if os.path.exists(flake_save_name):
        to_load = pickle.load(open(flake_save_name, 'rb'))
        train_flakes = to_load['train_flakes']
        train_feats = to_load['train_feats']
        val_flakes = to_load['val_flakes']
        val_feats = to_load['val_feats']
    else:
        img_names = os.listdir(data_path)
        img_names.sort()
        img_flakes = Parallel(n_jobs=8)(delayed(load_one_image)(
            os.path.join(data_path, img_names[i]),
            os.path.join(result_path, img_names[i][:-4] + '.p')) for i in range(len(img_names)))
        # pickle.dump(img_flakes, open(flake_save_name, 'wb'))
        # load corresponding flakes
        train_flakes, train_feats = locate_flakes(train_names, img_flakes, img_names)
        val_flakes, val_feats = locate_flakes(val_names, img_flakes, img_names)
        to_save = dict()
        to_save['train_flakes'] = train_flakes
        to_save['train_feats'] = train_feats
        # to_save['train_names'] = train_names
        # to_save['train_labels'] = train_labels
        to_save['val_flakes'] = val_flakes
        to_save['val_feats'] = val_feats
        # to_save['val_names'] = val_names
        # to_save['val_labels'] = val_labels
        pickle.dump(to_save, open(flake_save_name, 'wb'))
    print('loading done')

    # normalize data
    mean_feat = np.mean(train_feats, axis=0, keepdims=True)
    std_feat = np.std(train_feats, axis=0, keepdims=True)
    norm_fea = {}
    norm_fea['mean'] = mean_feat
    norm_fea['std'] = std_feat
    pickle.dump(norm_fea, open(os.path.join(clf_path, 'normfea.p'), 'wb'))
    train_feats -= mean_feat
    train_feats = train_feats / std_feat
    # train_feats = train_feats / np.linalg.norm(train_feats, 2, axis=1, keepdims=True)
    val_feats -= mean_feat
    val_feats = val_feats / std_feat
    # val_feats = val_feats / np.linalg.norm(val_feats, 2, axis=1, keepdims=True)

    # run classifier
    # method = 'linearsvm'
    # method = 'ridge'
    # method = 'rbfkernelsvm'
    method = hyperparams['clf_method']
    print(method)
    C = hyperparams['C']
    # C = 10
    Cs = [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5]

    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline
    from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
    from sklearn_hierarchical_classification.constants import ROOT
    from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled
    from sklearn_hierarchical_classification.tests.fixtures import make_digits_dataset

    class_hierarchy = {
        ROOT: ["A", "2"],
        "A": ["0", "1"],
    }

    for C in Cs:
        clf_save_path = os.path.join(clf_path, 'feanorm_weighted_classifier-%s-%f.p' % (method, C))
        if os.path.exists(clf_save_path):
            clf = pickle.load(open(clf_save_path, 'rb'))
        else:
            if method == 'hielinearsvm':
                # clf = LinearSVC(random_state=0, tol=1e-5, C=C, max_iter=5e4, class_weight='balanced')
                # clf = LinearSVC(random_state=0, tol=1e-5, C=C, max_iter=9e4, class_weight={0: 1, 1: 5, 2: 1})  # , multi_class='crammer_singer')
                # clf.fit(train_feats, train_labels)
                base_estimator = make_pipeline(
                    # TruncatedSVD(n_components=24),
                    SVC(gamma='auto', kernel="linear", probability=True, C=C),
                )
            # elif method == 'ridge':
            #     clf = RidgeClassifier(random_state=0, alpha=C)
            #     clf.fit(train_feats, train_labels)
            elif method == 'hierbfsvm':
                # clf = SVC(kernel='rbf', C=C, tol=1e-5, gamma='auto')
                # clf.fit(train_feats, train_labels)
                base_estimator = make_pipeline(
                    # TruncatedSVD(n_components=24),
                    SVC(gamma='auto', kernel="rbf", probability=True, C=C),
                )
            else:
                raise NotImplementedError

            clf = HierarchicalClassifier(
                base_estimator=base_estimator,
                class_hierarchy=class_hierarchy,
            )
            train_labels_str = [str(_) for _ in train_labels]
            clf.fit(train_feats, train_labels_str)
            pickle.dump(clf, open(clf_save_path, 'wb'))

        train_pred_cls = clf.predict(train_feats)
        train_pred_cls = [int(_) for _ in train_pred_cls]
        train_pred_scores = clf.predict_proba(train_feats)
        val_pred_cls = clf.predict(val_feats)
        val_pred_cls = [int(_) for _ in val_pred_cls]
        val_pred_scores = clf.predict_proba(val_feats)

        clf_vis_path = os.path.join(clf_path, subexp_name, 'vis', 'feanorm_weighted_%s-%f' % (method, C))
        if not os.path.exists(clf_vis_path):
            os.makedirs(clf_vis_path)

        from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, precision_recall_curve
        train_acc = accuracy_score(train_labels, train_pred_cls)
        val_acc = accuracy_score(val_labels, val_pred_cls)
        train_conf = confusion_matrix(train_labels, train_pred_cls)
        train_conf = train_conf / np.sum(train_conf, 1, keepdims=True)
        val_conf = confusion_matrix(val_labels, val_pred_cls)
        val_conf = val_conf / np.sum(val_conf, 1, keepdims=True)
        # # val_acc = accuracy_score(, test_pred_cls)
        # print('train acc: %.4f' % (train_acc))
        print(train_conf)
        print(val_conf)
        # vis_error(val_pred_cls, val_pred_scores, val_labels, val_flakes, clf_vis_path, val_names, 'val')
        # vis_error(train_pred_cls, train_pred_scores, train_labels, train_flakes, clf_vis_path, train_names, 'train')

        # calculate map:
        uniquelabels = [0, 1, 2]
        train_aps = []
        val_aps = []
        fig = plt.figure()
        ax = fig.add_subplot(111)
        legends = ['thick', 'thin', 'glue']
        for l in uniquelabels:
            l_train_labels = [_ == l for _ in train_labels]
            l_val_labels = [_ == l for _ in val_labels]
            # if method == 'linearsvm':
            #     clf = LinearSVC(random_state=0, tol=1e-5, C=C)
            #     clf.fit(train_feats, l_train_labels)
            # elif method == 'ridge':
            #     clf = RidgeClassifier(random_state=0, alpha=C)
            #     clf.fit(train_feats, l_train_labels)
            # elif method == 'rbfkernelsvm':
            #     clf = SVC(kernel='rbf', C=C, tol=1e-5, gamma='auto')
            #     clf.fit(train_feats, l_train_labels)
            # else:
            #     raise NotImplementedError
            # l_train_pred_scores = clf.decision_function(train_feats)
            # l_val_pred_scores = clf.decision_function(val_feats)
            train_aps.append(average_precision_score(l_train_labels, train_pred_scores[:, l]))
            val_aps.append(average_precision_score(l_val_labels, val_pred_scores[:, l]))
            # print(val_pred_scores[:, l])
            # print(np.array(l_val_labels, dtype=np.uint8))
            precision_l, recall_l, _ = precision_recall_curve(
                np.array(l_val_labels, dtype=np.uint8), val_pred_scores[:, l])
            print(l, getPrecisionAtRecall(precision_l, recall_l, 0.90),
                  getPrecisionAtRecall(precision_l, recall_l, 0.95))
            ax.plot(recall_l, precision_l, label=legends[l])

        plt.legend()
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.savefig(os.path.join(clf_path, 'feanorm_weighted_%s-%f.png' % (method, C)), dpi=300)
        plt.close(fig)

        # train_labels = (np.array(train_labels) + 1) // 2
        # val_labels = (np.array(val_labels) + 1) // 2
        # train_ap = average_precision_score(train_labels, train_pred_scores)
        # val_ap = average_precision_score(val_labels, pred_scores)
        print(train_aps)
        print(val_aps)
        print('%s-%f: train: %.4f, val: %.4f, ap train: %4f, ap val: %4f' % (
            method, C, train_acc, val_acc, np.mean(train_aps), np.mean(val_aps)))
def make_classifier(base_estimator=None, class_hierarchy=None, **kwargs):
    return HierarchicalClassifier(
        class_hierarchy=class_hierarchy,
        base_estimator=base_estimator,
        **kwargs
    )
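# Hedged usage sketch (assumption, not from the original source): make_classifier simply forwards
# any extra keyword arguments to HierarchicalClassifier, so per-call options such as the algorithm
# can be supplied. The hierarchy and base estimator below are illustrative only.
from sklearn.linear_model import LogisticRegression
from sklearn_hierarchical_classification.constants import ROOT

example_clf = make_classifier(
    base_estimator=LogisticRegression(),
    class_hierarchy={ROOT: ["A", "B"], "A": ["A1", "A2"]},
    algorithm="lcpn",
)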
def main(argv):
    infile = argv[0]
    outdir = argv[1]

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Read data file and retain data only corresponding to 5 sleep states
    df = pd.read_csv(infile, dtype={'label': object, 'user': object,
                                    'position': object, 'dataset': object})
    orig_cols = df.columns
    sleep_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    df = df[df['label'].isin(sleep_states)].reset_index()
    df = df[df['dataset'] == 'UPenn'].reset_index()
    df = df[orig_cols]
    print('... Number of data samples: %d' % len(df))
    ctr = Counter(df['label'])
    for cls in ctr:
        print('%s: %d (%0.2f%%)' % (cls, ctr[cls], ctr[cls] * 100.0 / len(df)))

    feat_cols = ['ENMO_mean', 'ENMO_std', 'ENMO_min', 'ENMO_max', 'ENMO_mad',
                 'ENMO_entropy1', 'ENMO_entropy2', 'ENMO_prevdiff', 'ENMO_nextdiff',
                 'angz_mean', 'angz_std', 'angz_min', 'angz_max', 'angz_mad',
                 'angz_entropy1', 'angz_entropy2', 'angz_prevdiff', 'angz_nextdiff',
                 'LIDS_mean', 'LIDS_std', 'LIDS_min', 'LIDS_max', 'LIDS_mad',
                 'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prevdiff', 'LIDS_nextdiff']

    X = df[feat_cols].values
    y = df['label']
    groups = df['user']

    # Class hierarchy for sleep stages
    class_hierarchy = {
        ROOT: {"Wake", "Sleep"},
        "Sleep": {"NREM", "REM"},
        "NREM": {"Light", "NREM 3"},
        "Light": {"NREM 1", "NREM 2"},
    }
    graph = DiGraph(class_hierarchy)

    outer_cv_splits = 5
    inner_cv_splits = 3
    factor = 10.0

    results = {node: {'precision': [], 'recall': [], 'fbeta': []}
               for node in ['Wake', 'Sleep', 'REM', 'NREM', 'NREM 3',
                            'Light', 'NREM 1', 'NREM 2', 'Overall']}

    # Outer CV
    group_kfold = GroupKFold(n_splits=outer_cv_splits)
    out_fold = 0
    hierarchical_pred = []
    for train_indices, test_indices in group_kfold.split(X, y, groups):
        out_fold += 1
        print('Processing fold ' + str(out_fold))
        out_fold_X_train = X[train_indices, :]
        out_fold_X_test = X[test_indices, :]
        out_fold_y_train = y[train_indices]
        out_fold_y_test = y[test_indices]
        out_fold_users_test = groups[test_indices]

        # Create a pipeline with scaler and hierarchical classifier
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', HierarchicalClassifier(
                base_estimator=RandomForestClassifier(random_state=0, n_estimators=100, n_jobs=-1),
                class_hierarchy=class_hierarchy,
                prediction_depth='mlnp',
                progress_wrapper=tqdm,
                # stopping_criteria=0.7
            )),
        ])

        # Inner CV
        strat_kfold = StratifiedKFold(n_splits=inner_cv_splits, random_state=0, shuffle=True)
        custom_cv_indices = []
        for grp_train_idx, grp_test_idx in strat_kfold.split(out_fold_X_train, out_fold_y_train):
            custom_cv_indices.append((grp_train_idx, grp_test_idx))

        print('Training')
        search_params = {'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500],
                         'clf__base_estimator__max_depth': [5, 10, None]}
        cv_clf = RandomizedSearchCV(estimator=pipe, param_distributions=search_params,
                                    cv=custom_cv_indices, scoring=make_scorer(custom_h_fbeta, graph=graph),
                                    n_iter=5, n_jobs=-1, verbose=1)
        cv_clf.fit(out_fold_X_train, out_fold_y_train)

        print('Predicting')
        out_fold_y_pred = cv_clf.predict(out_fold_X_test)
        best_clf = cv_clf.best_estimator_

        # Demonstrate using our hierarchical metrics module with MLB wrapper
        with multi_labeled(out_fold_y_test, out_fold_y_pred,
                           best_clf.named_steps['clf'].graph_) as (y_test_, y_pred_, graph_, classes_):
            fold_h_prec, fold_h_rec, fold_h_fbeta = h_fbeta_score(y_test_, y_pred_, graph_)
            results['Overall']['precision'].append(fold_h_prec)
            results['Overall']['recall'].append(fold_h_rec)
            results['Overall']['fbeta'].append(fold_h_fbeta)
            print("Fold %d: precision: %0.4f, recall: %0.4f, fbeta: %0.4f"
                  % (out_fold, fold_h_prec, fold_h_rec, fold_h_fbeta))

            y_test_ = fill_ancestors(y_test_, graph=graph_)
            y_pred_ = fill_ancestors(y_pred_, graph=graph_)
            hierarchical_pred.append((out_fold_users_test, y_test_, y_pred_, classes_))

            # Per-node precision/recall/F-beta
            for node in ['Wake', 'Sleep', 'REM', 'NREM', 'NREM 3', 'Light', 'NREM 1', 'NREM 2']:
                node_prec, node_rec, node_fbeta, _ = get_node_metrics(y_test_, y_pred_, classes_, node)
                results[node]['precision'].append(node_prec)
                results[node]['recall'].append(node_rec)
                results[node]['fbeta'].append(node_fbeta)

    get_classification_report(results)
    save_user_report(hierarchical_pred, os.path.join(outdir, 'hierarchical_results.csv'))
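# Hedged sketch (assumption, not part of the original scripts): custom_h_fbeta, used above as the
# RandomizedSearchCV scoring function, is not defined in these snippets. A minimal version could
# wrap the library's multi_labeled/h_fbeta_score helpers as shown in classify_digits(); note the
# scripts above appear to rely on a variant whose helpers also return precision/recall and a
# classes_ list, so the unpacking may need to be adjusted to match the installed version.
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled


def custom_h_fbeta(y_true, y_pred, graph=None):
    with multi_labeled(y_true, y_pred, graph) as (y_true_, y_pred_, graph_):
        return h_fbeta_score(y_true_, y_pred_, graph_)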
def svm_train():
    train_x, train_y, apps, train_y2 = get_data_set(train_path)
    test_x, test_y, apps, test_y2 = get_data_set(test_path)
    pred_x, _, apps, _ = get_data_set(pred_path)
    # with open(CHANNEL_MODEL + 'svm_label.pkl', 'wb') as f:
    #     pickle.dump(label_dic, f)
    logging.info('train {} test {}'.format(len(train_x), len(test_x)))
    t = time.time()

    data_set = train_x + test_x + pred_x
    vec = TfidfVectorizer(ngram_range=(1, 3), min_df=10, max_df=0.9,
                          use_idf=1, smooth_idf=1, sublinear_tf=1)
    # vec = HashingVectorizer(ngram_range=(1, 3))
    vec.fit_transform(data_set)
    # with open(project_path + 'tfidf.pkl', 'wb') as f: pickle.dump(vec, f)
    # with open(CHANNEL_MODEL + 'tfidf.pkl', 'rb') as f:
    #     vec = pickle.load(f)
    trn_term_doc = vec.transform(train_x)
    print(label1_label2)
    time.sleep(20)
    tfidf_time = time.time()
    logging.info('time spend {}'.format(tfidf_time - t))

    logging.info('begin svm')
    lin_clf = svm.LinearSVC(C=1)
    lin_clf = CalibratedClassifierCV(lin_clf)
    clf = HierarchicalClassifier(
        base_estimator=lin_clf,
        class_hierarchy=label1_label2,
    )
    clf.fit(trn_term_doc, train_y)
    print(clf.classes_)
    logging.info('end svm')
    # with open(project_path + 'svm_model.pkl', 'wb') as f:
    #     pickle.dump(lin_clf, f)

    train_preds = clf.predict(trn_term_doc)
    train_preds_prob = clf.predict_proba(trn_term_doc)
    print(len(clf.classes_), train_preds_prob.shape)
    time.sleep(20)
    for reg, prob in zip(train_preds, train_preds_prob):
        print(reg, list(prob.argsort()[-1:][::-1]))
    time.sleep(20)

    from sklearn.metrics import classification_report
    logging.info('train {} accuracy_score {}, \n {}'.format(
        'train', accuracy_score(train_y, train_preds),
        classification_report(train_y, train_preds)))
    t2 = time.time()
    logging.info('time spend {}'.format(t2 - t))

    test_term_doc = vec.transform(test_x)
    test_preds_1 = clf.predict_proba(test_term_doc)
    test_preds = clf.predict(test_term_doc)
    logging.info('train {} accuracy_score {}, \n {}'.format(
        'train', accuracy_score(train_y, train_preds),
        classification_report(test_y2, test_preds)))

    dic_lab = {}
    for k, v in label_dic2.items():
        dic_lab[v] = k

    # collect the top-2 predicted class indices for each test sample
    test_preds = []
    for prob in test_preds_1:
        test_preds.append(list(prob.argsort()[-2:][::-1]))

    test_y_name = []
    test_preds_name = []
    for real, pred in zip(test_y2, test_preds):
        prd = pred[0]
        print(real, pred)
        for pr in pred:
            if real == clf.classes_[pr]:
                prd = pr
        test_y_name.append(real)
        test_preds_name.append(clf.classes_[prd])

    logging.info('{} model on {} data accuracy_score {} top2 test\n {}'.format(
        "train", test_path, accuracy_score(test_y_name, test_preds_name),
        classification_report(test_y_name, test_preds_name)))
        1   7      C   9
                  / \
                 3   8

    """
    if "ROOT" in hierarchy:
        hierarchy[ROOT] = hierarchy["ROOT"]
        del hierarchy["ROOT"]
    class_hierarchy = hierarchy

    bclf = OneVsRestClassifier(LinearSVC())
    base_estimator = make_pipeline(vectorizer, bclf)
    clf = HierarchicalClassifier(
        base_estimator=base_estimator,
        class_hierarchy=class_hierarchy,
        algorithm="lcn",
        training_strategy="siblings",
        preprocessing=True,
        mlb=mlb,
        use_decision_function=True,
    )

    print("training classifier")
    clf.fit(X_train_raw, y_train[:, :])
    print("predicting")
    y_pred_scores = clf.predict_proba(X_dev_raw)
    y_pred_scores[np.where(y_pred_scores == 0)] = -10
    y_pred = y_pred_scores > -0.25

    if train == 1:
        print('f1 micro:',
def main(argv):
    infile = argv[0]
    dataset = argv[1]
    outdir = argv[2]

    resultdir = os.path.join(outdir, 'models')
    if not os.path.exists(resultdir):
        os.makedirs(resultdir)

    # Read data file and retain data only corresponding to 5 sleep states
    df = pd.read_csv(infile, dtype={'label': object, 'user': object,
                                    'position': object, 'dataset': object})
    states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']
    df = df[df['label'].isin(states)].reset_index()
    print('... Number of data samples: %d' % len(df))
    ctr = Counter(df['label'])
    for cls in ctr:
        print('%s: %d (%0.2f%%)' % (cls, ctr[cls], ctr[cls] * 100.0 / len(df)))

    feat_cols = ['ENMO_mean', 'ENMO_std', 'ENMO_range', 'ENMO_mad',
                 'ENMO_entropy1', 'ENMO_entropy2', 'ENMO_prev30diff', 'ENMO_next30diff',
                 'ENMO_prev60diff', 'ENMO_next60diff', 'ENMO_prev120diff', 'ENMO_next120diff',
                 'angz_mean', 'angz_std', 'angz_range', 'angz_mad',
                 'angz_entropy1', 'angz_entropy2', 'angz_prev30diff', 'angz_next30diff',
                 'angz_prev60diff', 'angz_next60diff', 'angz_prev120diff', 'angz_next120diff',
                 'LIDS_mean', 'LIDS_std', 'LIDS_range', 'LIDS_mad',
                 'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prev30diff', 'LIDS_next30diff',
                 'LIDS_prev60diff', 'LIDS_next60diff', 'LIDS_prev120diff', 'LIDS_next120diff']

    ts = df['timestamp']
    X = df[feat_cols].values
    y = df['label']
    # y = np.array([states.index(i) for i in y])
    groups = df['user']
    fnames = df['filename']
    feat_len = X.shape[1]

    # Class hierarchy for sleep stages
    class_hierarchy = {
        ROOT: {"Wear", "Nonwear"},
        "Wear": {"Wake", "Sleep"},
        "Sleep": {"NREM", "REM"},
        "NREM": {"Light", "NREM 3"},
        "Light": {"NREM 1", "NREM 2"},
    }
    graph = DiGraph(class_hierarchy)
    classes = [node for node in graph.nodes if node != ROOT]

    outer_cv_splits = 5
    inner_cv_splits = 5
    factor = 10.0

    # Outer CV
    group_kfold = GroupKFold(n_splits=outer_cv_splits)
    out_fold = 0
    hierarchical_pred = []
    for train_indices, test_indices in group_kfold.split(X, y, groups):
        out_fold += 1
        print('Processing fold ' + str(out_fold))
        out_fold_X_train = X[train_indices, :]
        out_fold_X_test = X[test_indices, :]
        out_fold_y_train = y[train_indices]
        out_fold_y_test = y[test_indices]
        out_fold_users_test = groups[test_indices]
        out_fold_ts_test = ts[test_indices]
        out_fold_fnames_test = fnames[test_indices]

        # Create a pipeline with scaler and hierarchical classifier
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', HierarchicalClassifier(
                base_estimator=RandomForestClassifier(random_state=0, n_estimators=100, n_jobs=-1),
                class_hierarchy=class_hierarchy,
                prediction_depth='mlnp',
                progress_wrapper=tqdm,
                # stopping_criteria=0.7
            )),
        ])

        # Inner CV
        strat_kfold = StratifiedKFold(n_splits=inner_cv_splits, random_state=0, shuffle=True)
        custom_cv_indices = []
        for grp_train_idx, grp_test_idx in strat_kfold.split(out_fold_X_train, out_fold_y_train):
            custom_cv_indices.append((grp_train_idx, grp_test_idx))

        print('Training')
        search_params = {'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500, 700],
                         'clf__base_estimator__max_depth': [5, 10, 15, None]}
        cv_clf = RandomizedSearchCV(estimator=pipe, param_distributions=search_params,
                                    cv=custom_cv_indices, scoring=make_scorer(custom_h_fbeta, graph=graph),
                                    n_iter=5, n_jobs=-1, verbose=1)
        cv_clf.fit(out_fold_X_train, out_fold_y_train)
        joblib.dump(cv_clf, os.path.join(resultdir, 'fold' + str(out_fold) + '_hierarchical_RF.sav'))

        print('Predicting')
        out_fold_y_pred = cv_clf.predict(out_fold_X_test)
        out_fold_y_pred_prob = cv_clf.predict_proba(out_fold_X_test)
        best_clf = cv_clf.best_estimator_

        # Demonstrate using our hierarchical metrics module with MLB wrapper
        with multi_labeled(out_fold_y_test, out_fold_y_pred,
                           best_clf.named_steps['clf'].graph_) as (y_test_, y_pred_, graph_, classes_):
            states = classes_
            y_test_ = fill_ancestors(y_test_, graph=graph_)
            y_pred_ = fill_ancestors(y_pred_, graph=graph_)

            # Reorder predicted probabilities to match the multi-label class ordering
            y_pred_prob_ = np.zeros(out_fold_y_pred_prob.shape)
            for new_idx, label in enumerate(classes_):
                old_idx = classes.index(label)
                y_pred_prob_[:, new_idx] = out_fold_y_pred_prob[:, old_idx]

            hierarchical_pred.append((out_fold_users_test, out_fold_ts_test,
                                      out_fold_fnames_test, y_test_, y_pred_prob_))

    cv_save_classification_result(hierarchical_pred, states,
                                  os.path.join(outdir, 'hierarchical_classification_results.csv'),
                                  method='hierarchical')