Example #1
def main():
    # ---------------
    # create corpus
    # ---------------
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)

    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    # -----------
    # evaluation
    # -----------
    file_list = [os.path.join(_RESULT_DIR, f) for f in os.listdir(_RESULT_DIR)]
    filenames = [os.path.basename(f) for f in file_list]
    dirnames = [os.path.join(_DST_DIR, split_fname_ext(f)) for f in filenames]
    print('dirnames')
    print(dirnames)
    # dirnames already include _DST_DIR, so create each directory as-is
    for d in dirnames:
        mkdir_if_not_exists(d)
    evaluation = [
        eval_run(_RESULT_DIR, f, word_to_id, matrix, d)
        for f, d in zip(file_list, dirnames)
    ]
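Example #1 calls two path helpers, split_fname_ext and mkdir_if_not_exists, whose definitions are not included in the listing. A minimal sketch of what they are assumed to do, inferred purely from the call sites (the bodies below are guesses, not the original implementations):

import os


def split_fname_ext(filename):
    # Assumed behavior: drop the extension, e.g. 'log01.csv' -> 'log01'.
    return os.path.splitext(filename)[0]


def mkdir_if_not_exists(path):
    # Assumed behavior: create the directory only if it is missing.
    os.makedirs(path, exist_ok=True)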
Example #2
def main():
    # -------------
    # load dataset
    # -------------
    for d in (_LIKELIHOOD_DIR, _TRAIN_DIR):
        os.makedirs(d, exist_ok=True)

    category_data = _CATEGORY_DATA
    rne_map = convert_txt_to_dict(category_data)
    print('rne_map')
    print(rne_map)

    train_data_list = os.listdir(_LOG_DIR)
    print('train_data')
    dst_filepath = os.path.join(_LOG_DIR, 'all.csv')

    all_df = pd.DataFrame({})
    for f in train_data_list:
        # if f == 'all.csv':
        #     print('already exist all.csv')
        #     sys.exit(1)
        print(f)
        read_filepath = os.path.join(_LOG_DIR, f)
        preprocess_df = data_preprocessing(read_filepath, _COLUMNS)
        df_dependency_tag = convert_id_to_rne(preprocess_df)
        print(df_dependency_tag)
        df_concat = pd.concat([preprocess_df, df_dependency_tag], axis=1)
        print(df_concat.tail())
        target_list = [
            'new_tag', 'new_word', 'dependency_tag', 'dependency_dst'
        ]
        target_df = df_concat[target_list]
        all_df = pd.concat([all_df, target_df], axis=0)
    all_df.to_csv(dst_filepath, index=False)

    df = all_df
    print('all_df')
    print(all_df)
    print('df')
    print(df.head())
    del all_df

    # ----------------------------
    # create corpus and co-matrix
    # ----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print(matrix)
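Every example builds its features with cm.create_co_matrix, but the cm module itself is not shown. The sketch below is a standard window-based co-occurrence count that would produce the vocab_size x vocab_size matrix these examples index into; note the real function also receives id_to_word (presumably for logging), which this sketch omits:

import numpy as np


def create_co_matrix(corpus, vocab_size, window_size=1):
    # Count, for every word id in the corpus, how often each other
    # word id appears within window_size positions of it.
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    for idx, word_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            left, right = idx - offset, idx + offset
            if left >= 0:
                co_matrix[word_id, corpus[left]] += 1
            if right < len(corpus):
                co_matrix[word_id, corpus[right]] += 1
    return co_matrix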
Example #3
def main():
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)

    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    df = pd.read_csv('lr_train_20190425.csv')
    print(df.head())

    # # ---------------------------
    # # adjust the number of data
    # # ---------------------------
    # df_0 = df[df['label'] == 0]
    # df_1 = df[df['label'] == 1]
    # print('0')
    # print(len(df_0))
    # print('1')
    # print(len(df_1))

    # X_0 = df_0[:4000]
    # X_1 = df_1

    # df = pd.concat([X_0, X_1])
    # print(len(df))
    # # ---------------------------

    # -------
    # train
    # -------
    X_org_word = df['org'].values
    X_dst_word = df['dst'].values
    y = df['label'].values

    # print('X_org_word')
    # print(X_org_word)
    # print('X_dst_word')
    # print(X_dst_word)
    # print('y')
    # print(y)

    X_org_to_id = np.array([word_to_id[x] for x in X_org_word])
    X_dst_to_id = np.array([word_to_id[x] for x in X_dst_word])

    # print('X_org_to_id')
    # print(X_org_to_id)
    # print('X_dst_to_id')
    # print(X_dst_to_id)

    X_org_feature = np.array([matrix[x] for x in X_org_to_id])
    X_dst_feature = np.array([matrix[x] for x in X_dst_to_id])

    # print('X_org_feature')
    # print(X_org_feature)
    # print('X_dst_feature')
    # print(X_dst_feature)

    # similarity feature: dot product of the org and dst co-occurrence rows
    X = np.array([np.dot(org, dst)
                  for org, dst in zip(X_org_feature, X_dst_feature)])
    X = X[:, np.newaxis]

    # print('X')
    # print(X)
    # print(len(X))

    scaler = MinMaxScaler()
    X_scaler = scaler.fit_transform(X)
    print('MinMaxScaler')
    print(X_scaler)

    X_train, X_test, y_train, y_test = train_test_split(X_scaler,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    clf = LogisticRegression(
        random_state=0,
        solver='liblinear',
    ).fit(X_train, y_train)

    joblib.dump(clf, 'lr.pkl')

    # ------
    # eval
    # ------
    print(clf.score(X_test, y_test))
    pred = clf.predict(X_test)
    print(accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))

    print(clf.predict_proba(X_test))
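Example #3 persists only the classifier (lr.pkl); the fitted MinMaxScaler is not saved, so reusing the model on new word pairs also requires the scaler and co-occurrence matrix from training. A hypothetical inference helper (predict_pair is not part of the original code) showing how the same dot-product feature would be rebuilt at prediction time:

import joblib
import numpy as np

clf = joblib.load('lr.pkl')


def predict_pair(org_word, dst_word, word_to_id, matrix, scaler):
    # Rebuild the training feature: the dot product of the two words'
    # co-occurrence rows, scaled the same way as the training data.
    feature = np.dot(matrix[word_to_id[org_word]],
                     matrix[word_to_id[dst_word]])
    X = scaler.transform(np.array([[feature]]))
    return clf.predict_proba(X)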
Example #4
def main():
    # -----------------------------
    # create corpus and co-matrix
    # -----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    # output corpus to txt
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    # ---------------------
    # generate label data
    # ---------------------
    label = np.array(list(id_to_word.values()))
    label = label[:, np.newaxis]
    print('label')
    print(label)

    # ------------------------
    # generate category data
    # ------------------------
    category_label_data = generate_arc_category_data(_ANNOTATION_DIR)
    unique_category = category_label_data['arclabel'].unique()
    print(category_label_data.head())
    print(category_label_data.tail())
    print(unique_category)

    # ----------------------------
    # generate feature and label
    # ----------------------------
    category_label_data['feature_org_idx'] = category_label_data['new_word']\
      .apply(lambda x: word_to_id[x])
    category_label_data['feature_dst_idx'] = category_label_data['dependency_dst']\
      .apply(lambda x: word_to_id[x])
    category_label_data['feature_org'] = category_label_data['feature_org_idx']\
      .apply(lambda x: matrix[x])
    category_label_data['feature_dst'] = category_label_data['feature_dst_idx']\
      .apply(lambda x: matrix[x])
    print('category_label_data')
    print(category_label_data)

    extend_feature = extend_columns(
        category_label_data['feature_org'], category_label_data['feature_dst']
    )
    print('extend_feature')
    print(extend_feature)
    print(extend_feature.shape)
    X = extend_feature

    category_map = category_mapping(unique_category)
    print('category_map')
    print(category_map)
    category_label = category_label_data['arclabel'].values
    category_label = category_label.flatten()
    print('category_label')
    print(category_label)
    y = convert_category_to_numerical(category_label, category_map)
    print('y')
    print(y)

    # ----------
    # training
    # ----------
    print('dataset size')
    print('X: {0} , y:{1}'.format(X.shape, y.shape))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    clf = SVC(kernel='linear', C=1)
    t0 = time.time()
    clf.fit(X_train, y_train)
    t1 = time.time()
    print('exec time : {}'.format(t1 - t0))
    joblib.dump(word_to_id, 'word_to_id.pkl')
    joblib.dump(matrix, 'matrix.pkl')
    joblib.dump(clf, 'svc.pkl')

    # ------------
    # validation
    # ------------
    prediction_map = {v: k for k, v in category_map.items()}
    joblib.dump(prediction_map, 'prediction_map.pkl')
    print(clf.score(X_test, y_test))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    print(classification_report(
        y_test,
        clf.predict(X_test),
        # target_names=category_map.values()
    ))

    # # tamanegi test
    # print('**************** tamanegi-surioro ****************')
    # onion_id = word_to_id['玉ねぎ']
    # print('onion_id')
    # print(onion_id)
    # suri_id = word_to_id['すりおろ']
    # print('suri_id')
    # print(suri_id)
    # onion_feature = matrix[onion_id]
    # suri_feature = matrix[suri_id]
    # sample_feature = np.hstack((onion_feature, suri_feature)).flatten()
    # print('sample_feature')
    # print(sample_feature)
    # print(clf.predict([sample_feature]))
    # pred = clf.predict([sample_feature])
    # print(prediction_map[pred[0]])

    # model load
    load_model = joblib.load('svc.pkl')
    print('load_model')
    print(load_model)
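Example #4 dumps four artifacts (word_to_id.pkl, matrix.pkl, svc.pkl, prediction_map.pkl), which is everything needed for standalone inference. A sketch that mirrors the commented-out tamanegi test with the pickles reloaded (predict_arc_label is a hypothetical helper, not part of the original code):

import joblib
import numpy as np

word_to_id = joblib.load('word_to_id.pkl')
matrix = joblib.load('matrix.pkl')
clf = joblib.load('svc.pkl')
prediction_map = joblib.load('prediction_map.pkl')


def predict_arc_label(org_word, dst_word):
    # Same layout as training: the two words' co-occurrence rows
    # side by side (cf. extend_columns in Example #4).
    feature = np.hstack((matrix[word_to_id[org_word]],
                         matrix[word_to_id[dst_word]])).flatten()
    return prediction_map[clf.predict([feature])[0]]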
Example #5
def main():
    # -------------
    # load dataset
    # -------------
    for d in (_LIKELIHOOD_DIR, _TRAIN_DIR):
        os.makedirs(d, exist_ok=True)

    category_data = _CATEGORY_DATA
    rne_map = convert_txt_to_dict(category_data)
    print('rne_map')
    print(rne_map)

    train_data_list = os.listdir(_LOG_DIR)
    print('train_data')
    dst_filepath = os.path.join(_LOG_DIR, 'all.csv')

    all_df = pd.DataFrame({})
    for f in train_data_list:
        # if f == 'all.csv':
        #     print('already exist all.csv')
        #     sys.exit(1)
        print(f)
        read_filepath = os.path.join(_LOG_DIR, f)
        preprocess_df = data_preprocessing(read_filepath, _COLUMNS)
        df_dependency_tag = convert_id_to_rne(preprocess_df)
        print(df_dependency_tag)
        df_concat = pd.concat([preprocess_df, df_dependency_tag], axis=1)
        print(df_concat.tail())
        target_list = [
            'new_tag', 'new_word', 'dependency_tag', 'dependency_dst'
        ]
        target_df = df_concat[target_list]
        all_df = pd.concat([all_df, target_df], axis=0)
    all_df.to_csv(dst_filepath, index=False)

    dst_file = os.path.join(_TRAIN_DIR, 'lr_train.csv')
    df = all_df
    print('all_df')
    print(all_df)
    print('df')
    print(df.head())
    del all_df

    # ----------------------------
    # create corpus and co-matrix
    # ----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print(matrix)

    # -------------------------
    # label to one-hot-encode
    # -------------------------
    enc = OneHotEncoder()
    label_data = df['new_tag'].values
    label_reshape = label_data[:, np.newaxis]
    print('label_data')
    print(label_data)
    enc.fit(label_reshape)
    onehotlabel = enc.transform(label_reshape).toarray()
    print('onehotlabel')
    print(onehotlabel)

    # ------------------------------------
    # join feature and one-hot-encode
    # ------------------------------------
    category_label_data = df
    category_label_data['feature_org_idx'] = category_label_data['new_word']\
      .apply(lambda x: word_to_id[x])
    category_label_data['feature_org'] = category_label_data['feature_org_idx']\
      .apply(lambda x: matrix[x])

    feature_matrix = category_label_data['feature_org'].values
    train_feature_matrix = np.array([x.flatten() for x in feature_matrix])
    print('train_feature_matrix')
    print(train_feature_matrix)
    print(train_feature_matrix.shape)

    print(onehotlabel.shape)
    train_data = np.hstack((train_feature_matrix, onehotlabel))
    print(train_data)
    print(train_data.shape)
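    # A sketch of how a downstream step could split the stacked array back
    # apart. Assumption (not in the original code): the last
    # onehotlabel.shape[1] columns are the one-hot block, exactly as the
    # np.hstack above arranges them.
    n_classes = onehotlabel.shape[1]
    X = train_data[:, :-n_classes]
    y_onehot = train_data[:, -n_classes:]
    # OneHotEncoder.inverse_transform recovers the original 'new_tag' strings.
    y = enc.inverse_transform(y_onehot).ravel()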