print(len(x_train_str), len(y_train))

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(x_train_str,
                                                            y_train,
                                                            random_state=42,
                                                            test_size=0.2)
print('X_train_s.shape', len(X_train_s), len(X_train_s[0]))
print('y_train_s.shape', len(y_train_s), y_train_s[0].shape)

clf.fit(X_train_s, y_train_s)
print("predicting")

y_pred_scores_s = clf.predict_proba(X_test_s)

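# a score of exactly 0 marks labels the hierarchical classifier never scored;
# push them far below the threshold so the -0.25 binarization cannot pick them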
y_pred_scores_s[np.where(y_pred_scores_s == 0)] = -10
y_pred_s = y_pred_scores_s > -0.25

print(
    'f1 micro:',
    f1_score(y_true=y_test_s,
             y_pred=y_pred_s[:, :y_test_s.shape[1]],
             average='micro'))
print(
    'f1 macro:',
    f1_score(y_true=y_test_s,
             y_pred=y_pred_s[:, :y_test_s.shape[1]],
             average='macro'))
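
The slice y_pred_s[:, :y_test_s.shape[1]] guards against the classifier emitting scores for more labels than the binarized targets cover. A minimal self-contained sketch of the same threshold-then-score pattern on toy data (hypothetical shapes and labels):

import numpy as np
from sklearn.metrics import f1_score

scores = np.array([[0.9, 0.0, 0.3], [0.0, 0.8, 0.6]])  # per-label scores
scores[scores == 0] = -10    # unscored labels -> large negative
pred = scores > -0.25        # binarize against the margin threshold
truth = np.array([[1, 0, 0], [0, 1, 1]])
print(f1_score(truth, pred, average='micro'))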
Example #2
def svm_train():
    train_x, train_y, apps, train_y2 = get_data_set(train_path)
    test_x, test_y, apps, test_y2 = get_data_set(test_path)
    pred_x, _, apps, _ = get_data_set(pred_path)
    # with open(CHANNEL_MODEL + 'svm_label.pkl', 'wb') as f:
    #     pickle.dump(label_dic, f)

    logging.info('train {} test {}'.format(len(train_x), len(test_x)))
    t = time.time()
    data_set = train_x + test_x + pred_x
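    # fit the vocabulary on train + test + pred text so every split maps into one feature space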
    vec = TfidfVectorizer(ngram_range=(1, 3),
                          min_df=10,
                          max_df=0.9,
                          use_idf=1,
                          smooth_idf=1,
                          sublinear_tf=1)
    # vec = HashingVectorizer(ngram_range=(1, 3))
    vec.fit(data_set)
    with open(project_path + 'tfidf.pkl', 'wb') as f:
        pickle.dump(vec, f)
    # with open(CHANNEL_MODEL + 'tfidf.pkl', 'rb') as f:
    #     vec = pickle.load(f)

    trn_term_doc = vec.transform(train_x)
    tfidf_time = time.time()
    logging.info('time spend {}'.format(tfidf_time - t))

    logging.info('begin svm')
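    # LinearSVC has no predict_proba; wrapping it in CalibratedClassifierCV
    # provides the calibrated probabilities HierarchicalClassifier needs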
    lin_clf = svm.LinearSVC(C=1)
    lin_clf = CalibratedClassifierCV(lin_clf)

    clf = HierarchicalClassifier(
        base_estimator=lin_clf,
        class_hierarchy=label1_label2,
    )

    clf.fit(trn_term_doc, train_y)

    print(clf.classes_)

    logging.info('end svm')
    # with open(project_path + 'svm_model.pkl', 'wb') as f:
    #     pickle.dump(lin_clf, f)

    train_preds = clf.predict(trn_term_doc)
    train_preds_prob = clf.predict_proba(trn_term_doc)
    print(len(clf.classes_), train_preds_prob.shape)
    for reg, prob in zip(train_preds, train_preds_prob):
        # index of the highest-probability class for each sample
        print(reg, list(prob.argsort()[-1:][::-1]))
    from sklearn.metrics import classification_report

    logging.info('train accuracy_score {}\n{}'.format(
        accuracy_score(train_y, train_preds),
        classification_report(train_y, train_preds)))
    t2 = time.time()
    logging.info('time spend {}'.format(t2 - t))

    test_term_doc = vec.transform(test_x)
    test_preds_1 = clf.predict_proba(test_term_doc)
    test_preds = clf.predict(test_term_doc)
    logging.info('test accuracy_score {}\n{}'.format(
        accuracy_score(test_y2, test_preds),
        classification_report(test_y2, test_preds)))

    dic_lab = {}
    for k, v in label_dic2.items():
        dic_lab[v] = k
    test_preds = []
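    # keep indices of the two highest-probability classes per sample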
    for prob in test_preds_1:
        test_preds.append(list(prob.argsort()[-2:][::-1]))

    test_y_name = []
    test_preds_name = []
    for real, pred in zip(test_y2, test_preds):
        # default to the top-1 index; if the true label is in the top 2, count it as correct
        prd = pred[0]
        for pr in pred:
            if real == clf.classes_[pr]:
                prd = pr
        test_y_name.append(real)
        test_preds_name.append(clf.classes_[prd])

    logging.info('model on {} data top2 accuracy_score {}\n{}'.format(
        test_path, accuracy_score(test_y_name, test_preds_name),
        classification_report(test_y_name, test_preds_name)))
def main():
    subexp_name = 'YoungJaeShinSamples/4'
    anno_incomplete_file = '../data/data_jan2019_anno/anno_all_incomplete_YoungJaeShinSamples_4_useryoungjae.db'
    anno_complete_file = '../data/data_jan2019_anno/anno_all_YoungJaeShinSamples_4_useryoungjae.db'
    # anno_complete_file = '../data/data_jan2019_anno/anno_all_YoungJaeShinSamples_4_usertest0123.db'
    thickthin_anno_file = '../data/data_jan2019_anno/anno_thickthin_v2_YoungJaeShinSamples_4_useryoungjae.db'
    # thickthin_anno_file = '../data/data_jan2019_anno/anno_thickthin_v2_YoungJaeShinSamples_4_usertest0123.db'

    data_path = os.path.join('../data/data_jan2019', subexp_name)
    result_path = os.path.join('../results/data_jan2019_script/mat',
                               subexp_name)
    clf_path = os.path.join(
        '../results/data_jan2019_script/thickthinglue_clf_complete',
        subexp_name)
    if not os.path.exists(clf_path):
        os.makedirs(clf_path)

    # merge the annotation
    if not os.path.exists(anno_complete_file):
        os.system('cp %s %s' % (anno_incomplete_file, anno_complete_file))

        # update the annotation into the all annotation
        thickthin_oriname_newflakeids = readthickthindb(thickthin_anno_file)
        n_thickthin_flakes = len(thickthin_oriname_newflakeids)
        print(n_thickthin_flakes)

        updatedb(anno_complete_file, thickthin_oriname_newflakeids)

    # get the train/val split
    split_name = os.path.join(clf_path, 'train_val_split.p')
    if os.path.exists(split_name):
        to_load = pickle.load(open(split_name, 'rb'))
        train_names = to_load['train_names']
        train_labels = to_load['train_labels']
        val_names = to_load['val_names']
        val_labels = to_load['val_labels']
    else:
        itemname_labels = readdb(anno_complete_file)
        train_names, train_labels, val_names, val_labels = split_trainval(
            itemname_labels)
        to_save = dict()
        to_save['train_names'] = train_names
        to_save['train_labels'] = train_labels
        to_save['val_names'] = val_names
        to_save['val_labels'] = val_labels
        pickle.dump(to_save, open(split_name, 'wb'))

    # load flakes
    flake_save_name = os.path.join(clf_path, 'train_val_data.p')
    if os.path.exists(flake_save_name):
        to_load = pickle.load(open(flake_save_name, 'rb'))
        train_flakes = to_load['train_flakes']
        train_feats = to_load['train_feats']
        val_flakes = to_load['val_flakes']
        val_feats = to_load['val_feats']
    else:
        img_names = os.listdir(data_path)
        img_names.sort()
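        # parse each image and its matching result file in 8 worker processes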
        img_flakes = Parallel(n_jobs=8)(delayed(load_one_image)(
            os.path.join(data_path, img_names[i]),
            os.path.join(result_path, img_names[i][:-4] + '.p'))
                                        for i in range(len(img_names)))
        # pickle.dump(img_flakes, open(flake_save_name, 'wb'))
        # load corresponding flakes
        train_flakes, train_feats = locate_flakes(train_names, img_flakes,
                                                  img_names)
        val_flakes, val_feats = locate_flakes(val_names, img_flakes, img_names)
        to_save = dict()
        to_save['train_flakes'] = train_flakes
        to_save['train_feats'] = train_feats
        # to_save['train_names'] = train_names
        # to_save['train_labels'] = train_labels
        to_save['val_flakes'] = val_flakes
        to_save['val_feats'] = val_feats
        # to_save['val_names'] = val_names
        # to_save['val_labels'] = val_labels
        pickle.dump(to_save, open(flake_save_name, 'wb'))
    print('loading done')

    # normalize data
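    # standardize with train-set mean/std and save the statistics for inference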
    mean_feat = np.mean(train_feats, axis=0, keepdims=True)
    std_feat = np.std(train_feats, axis=0, keepdims=True)
    norm_fea = {}
    norm_fea['mean'] = mean_feat
    norm_fea['std'] = std_feat
    pickle.dump(norm_fea, open(os.path.join(clf_path, 'normfea.p'), 'wb'))
    train_feats -= mean_feat
    train_feats = train_feats / std_feat
    # train_feats = train_feats / np.linalg.norm(train_feats, 2, axis=1, keepdims=True)
    val_feats -= mean_feat
    val_feats = val_feats / std_feat
    # val_feats = val_feats / np.linalg.norm(val_feats, 2, axis=1, keepdims=True)

    # run classifier
    # method = 'linearsvm'
    # method = 'ridge'
    # method = 'rbfkernelsvm'
    method = hyperparams['clf_method']
    print(method)
    C = hyperparams['C']
    # C = 10
    Cs = [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5]

    from sklearn.decomposition import TruncatedSVD  # only used by the commented-out pipeline step
    from sklearn.pipeline import make_pipeline
    from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
    from sklearn_hierarchical_classification.constants import ROOT

    class_hierarchy = {
        ROOT: ["A", "2"],
        "A": ["0", "1"],
    }
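    # hierarchy as a dict of parent -> children: leaf classes "0" and "1"
    # share parent "A", while "2" hangs directly off ROOT; node labels are
    # strings, hence the str() conversion before fit() below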
    for C in Cs:
        clf_save_path = os.path.join(
            clf_path, 'feanorm_weighted_classifier-%s-%f.p' % (method, C))
        if os.path.exists(clf_save_path):
            clf = pickle.load(open(clf_save_path, 'rb'))
        else:
            if method == 'hielinearsvm':
                # clf = LinearSVC(random_state=0, tol=1e-5, C=C, max_iter=5e4, class_weight='balanced')
                # clf = LinearSVC(random_state=0, tol=1e-5, C=C, max_iter=9e4, class_weight={0:1, 1:5, 2:1})#, multi_class='crammer_singer')
                # clf.fit(train_feats, train_labels)

                base_estimator = make_pipeline(
                    # TruncatedSVD(n_components=24),
                    SVC(gamma='auto', kernel="linear", probability=True,
                        C=C), )

            # elif method == 'ridge':
            #     clf = RidgeClassifier(random_state=0, alpha=C)
            #     clf.fit(train_feats, train_labels)
            elif method == 'hierbfsvm':
                # clf = SVC(kernel='rbf', C=C, tol=1e-5, gamma='auto')
                # clf.fit(train_feats, train_labels)
                base_estimator = make_pipeline(
                    # TruncatedSVD(n_components=24),
                    SVC(gamma='auto', kernel="rbf", probability=True, C=C), )
            else:
                raise NotImplementedError

            clf = HierarchicalClassifier(
                base_estimator=base_estimator,
                class_hierarchy=class_hierarchy,
            )
            train_labels_str = [str(_) for _ in train_labels]
            clf.fit(train_feats, train_labels_str)
            pickle.dump(clf, open(clf_save_path, 'wb'))

        train_pred_cls = clf.predict(train_feats)
        train_pred_cls = [int(_) for _ in train_pred_cls]
        train_pred_scores = clf.predict_proba(train_feats)
        val_pred_cls = clf.predict(val_feats)
        val_pred_cls = [int(_) for _ in val_pred_cls]
        val_pred_scores = clf.predict_proba(val_feats)
        clf_vis_path = os.path.join(clf_path, subexp_name, 'vis',
                                    'feanorm_weighted_%s-%f' % (method, C))
        if not os.path.exists(clf_vis_path):
            os.makedirs(clf_vis_path)

        from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, precision_recall_curve
        train_acc = accuracy_score(train_labels, train_pred_cls)
        val_acc = accuracy_score(val_labels, val_pred_cls)
        train_conf = confusion_matrix(train_labels, train_pred_cls)
        train_conf = train_conf / np.sum(train_conf, 1, keepdims=True)
        val_conf = confusion_matrix(val_labels, val_pred_cls)
        val_conf = val_conf / np.sum(val_conf, 1, keepdims=True)
        # # val_acc = accuracy_score(, test_pred_cls)
        # print('train acc: %.4f' % (train_acc))
        print(train_conf)
        print(val_conf)
        # vis_error(val_pred_cls, val_pred_scores, val_labels, val_flakes, clf_vis_path, val_names, 'val')
        # vis_error(train_pred_cls, train_pred_scores, train_labels, train_flakes, clf_vis_path, train_names, 'train')

        # calculate map:
        uniquelabels = [0, 1, 2]
        train_aps = []
        val_aps = []
        fig = plt.figure()
        ax = fig.add_subplot(111)
        legends = ['thick', 'thin', 'glue']
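        # per-class one-vs-rest average precision and precision-recall curves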
        for l in uniquelabels:
            l_train_labels = [_ == l for _ in train_labels]
            l_val_labels = [_ == l for _ in val_labels]
            # if method == 'linearsvm':
            #     clf = LinearSVC(random_state=0, tol=1e-5, C=C)
            #     clf.fit(train_feats, l_train_labels)
            # elif method == 'ridge':
            #     clf = RidgeClassifier(random_state=0, alpha=C)
            #     clf.fit(train_feats, l_train_labels)
            # elif method == 'rbfkernelsvm':
            #     clf = SVC(kernel='rbf', C=C, tol=1e-5, gamma='auto')
            #     clf.fit(train_feats, l_train_labels)
            # else:
            #     raise NotImplementedError

            # l_train_pred_scores = clf.decision_function(train_feats)
            # l_val_pred_scores = clf.decision_function(val_feats)
            train_aps.append(
                average_precision_score(l_train_labels, train_pred_scores[:,
                                                                          l]))
            val_aps.append(
                average_precision_score(l_val_labels, val_pred_scores[:, l]))
            # print(val_pred_scores[:, l])
            # print(np.array(l_val_labels, dtype=np.uint8))
            precision_l, recall_l, _ = precision_recall_curve(
                np.array(l_val_labels, dtype=np.uint8), val_pred_scores[:, l])
            print(l, getPrecisionAtRecall(precision_l, recall_l, 0.90),
                  getPrecisionAtRecall(precision_l, recall_l, 0.95))
            ax.plot(recall_l, precision_l, label=legends[l])

        plt.legend()
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])

        plt.savefig(os.path.join(clf_path,
                                 'feanorm_weighted_%s-%f.png' % (method, C)),
                    dpi=300)
        plt.close(fig)

        # train_labels = (np.array(train_labels) + 1)//2
        # val_labels = (np.array(val_labels) + 1 ) // 2
        # train_ap = average_precision_score(train_labels, train_pred_scores)
        # val_ap = average_precision_score(val_labels, pred_scores)
        print(train_aps)
        print(val_aps)
        print('%s-%f: train: %.4f, val: %.4f, ap train: %.4f, ap val: %.4f' %
              (method, C, train_acc, val_acc, np.mean(train_aps),
               np.mean(val_aps)))
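
getPrecisionAtRecall is not defined in this snippet; a minimal sketch consistent with its call sites above, taking the outputs of sklearn's precision_recall_curve and a target recall level:

import numpy as np

def getPrecisionAtRecall(precision, recall, target):
    # highest precision achieved at recall >= target; 0.0 if never reached
    mask = recall >= target
    return float(np.max(precision[mask])) if mask.any() else 0.0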
Example #4
    base_estimator = make_pipeline(vectorizer, bclf)

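    # "lcn" trains a local classifier per node, using each node's siblings as
    # negatives; use_decision_function=True makes predict_proba return raw
    # SVM margins, which is why the scores below are thresholded at -0.25
    # rather than at a probability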
    clf = HierarchicalClassifier(base_estimator=base_estimator,
                                 class_hierarchy=class_hierarchy,
                                 algorithm="lcn",
                                 training_strategy="siblings",
                                 preprocessing=True,
                                 mlb=mlb,
                                 use_decision_function=True)

    print("training classifier")
    clf.fit(X_train_raw, y_train)
    print("predicting")

    y_pred_scores = clf.predict_proba(X_dev_raw)

    y_pred_scores[np.where(y_pred_scores == 0)] = -10
    y_pred = y_pred_scores > -0.25

    if train == 1:
        print('f1 micro:',
              f1_score(y_true=y_dev, y_pred=y_pred, average='micro'))
        print('f1 macro:',
              f1_score(y_true=y_dev, y_pred=y_pred, average='macro'))
        print(classification_report(y_true=y_dev, y_pred=y_pred))
    else:
        import networkx as nx
        graph = nx.DiGraph(hierarchy)
        print_results("submission_baseline.txt", hierarchy, y_pred, mlb, ids,
                      graph)