Example #1
def validate_model(validate_data, columns, mentioned_clf, clfs):
    logger.info("Begin validate")
    content_validate = validate_data.iloc[:, 1]
    validate_data_segs = seg_words(content_validate)
    logger.debug("seg validate data done")

    scores = dict()
    predict = mentioned_clf.predict(validate_data_segs)
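    # mentioned_clf presumably outputs 1 for "not mentioned"; scaling by -2
    # maps that to the dataset's -2 label, and rows left at 0 are re-predicted
    # with the per-column classifiers below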
    predict = predict * -2
    for column in columns:
        logger.debug("predict:%s", column)
        tmp_predict = predict.copy()
        for v_index, v_content_seg in enumerate(validate_data_segs):
            if tmp_predict[v_index] == 0:
                tmp_predict[v_index] = clfs[column].predict([v_content_seg])
        report(validate_data[column], tmp_predict)
        score = f1_score(validate_data[column], tmp_predict, average='macro')
        scores[column] = score

    str_score = "\n"
    score = np.mean(list(scores.values()))
    for column in columns:
        str_score = str_score + column + ":" + str(scores[column]) + "\n"

    logger.info("f1_scores: %s\n" % str_score)
    logger.info("f1_score: %s" % score)
    logger.info("complete validate model")
Example #2
def train_mentioned():
    logger.info("########################################")
    logger.info("start train mentioned models")
    logger.info("########################################")
    # load train data
    train_data_df = load_data_from_csv(config.train_data_path)
    validate_data_df = load_data_from_csv(config.validate_data_path)

    content_train = train_data_df.iloc[0:config.train_data_size, 1]
    logger.debug("start seg train data")
    train_content_segs = seg_words(content_train)

    logger.debug("start seg validate data")
    content_validate = validate_data_df.iloc[0:, 1]
    validate_segs = seg_words(content_validate)
    logger.debug("load vectorizer")
    vectorizer_tfidf = joblib.load(config.model_save_path + vec_name)

    for model in models:
        train_mentioned_model(train_data_df, train_content_segs,
                              validate_data_df, validate_segs,
                              vectorizer_tfidf, model)
Example #3
def vectorizer():
    logger.info("start to vectorizer content")
    train_data = load_data_from_csv(config.train_data_path)
    content_segs = seg_words(train_data.iloc[0:, 1])
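    # word n-grams up to length 5; drop terms seen in fewer than 2 documents
    # or in more than 40% of documents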
    tf_idf = TfidfVectorizer(ngram_range=(1, 5),
                             min_df=2,
                             norm="l2",
                             max_df=0.4,
                             stop_words=stopwords)
    tf_idf.fit(content_segs)
    if not os.path.exists(config.model_save_path):
        os.makedirs(config.model_save_path)
    joblib.dump(tf_idf, config.model_save_path + vec_name, compress=True)
    logger.info("succes to save vectorizer")
Example #4
def train_specific_model(train_data):
    columns = train_data.columns.values.tolist()
    logger.debug("begin to seg train content")
    content_segments = seg_words(
        train_data.content.iloc[0:config.train_data_size])
    logger.debug("seg train content done")
    vectorizer = joblib.load(config.model_save_path + vec_name)
    logger.debug("load vectorizer")
    validate_data_df = load_data_from_csv(config.validate_data_path)
    validate_segs = seg_words(validate_data_df.content)
    logger.debug("seg validate content")
    scores = dict()
    for model_name in columns[:-1]:
        logger.info("begin to train %s model", model_name)
        cw = [{
            -2: a,
            -1: b,
            0: w,
            1: x
        } for a in range(1, 3) for b in range(5, 8) for w in range(8, 12)
              for x in range(5, 8)]
        # cw = {0: 7, 1: 6, -1: 6, -2: 1}
        positive_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
        y_label = train_data[model_name].iloc[0:config.train_data_size]
        positive_clf.fit(content_segments, y_label)

        y_pre = positive_clf.predict(validate_segs)
        y_true = validate_data_df[model_name].iloc[0:]
        report(y_true, y_pre)
        score = f1_score(y_true, y_pre, average="macro")
        logger.info("score for model:%s is %s ", model_name, str(score))
        scores[model_name] = score
        joblib.dump(positive_clf,
                    config.model_save_path + model_name + ".pkl",
                    compress=True)
    score = np.mean(list(scores.values()))
    logger.info("f1_scores: %s" % score)
Example #5
    args = parser.parse_args()
    model_name = args.model_name
    if not model_name:
        model_name = "model_dict.pkl"

    # load data
    logger.info("start load data...")
    test_data_df = load_data_from_csv(config.test_data_path)

    # load model
    logger.info("start load model...")
    classifier_dict = joblib.load(config.model_save_path + model_name)

    columns = test_data_df.columns.tolist()
    # seg words
    logger.info("start seg test data...")
    content_test = test_data_df.iloc[:, 1]
    content_test = seg_words(content_test)
    logger.info("complete seg test data.")

    # model predict
    logger.info("start predict test data...")
    for column in columns[2:]:
        test_data_df[column] = classifier_dict[column].predict(content_test)
        logger.info("compete %s predict." % column)

    test_data_df.to_csv(config.test_data_predict_out_path,
                        encoding="utf_8_sig",
                        index=False)
    logger.info("compete predict test data.")
Example #6
                        help='the name of model')

    args = parser.parse_args()
    model_name = args.model_name
    if not model_name:
        model_name = "model_dict.pkl"

    # load train data
    logger.info("start load data")
    train_data_df = load_data_from_csv(config.train_data_path)
    validate_data_df = load_data_from_csv(config.validate_data_path)

    content_train = train_data_df.iloc[:, 1]

    logger.info("start seg train data")
    content_train = seg_words(content_train)
    logger.info("complete seg train data")

    columns = train_data_df.columns.values.tolist()

    logger.info("start train feature extraction")
    vectorizer_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=5, norm='l2')
    vectorizer_tfidf.fit(content_train)
    logger.info("complete train feature extraction models")
    logger.info("vocab shape: %s" % np.shape(vectorizer_tfidf.vocabulary_.keys()))

    # model train
    logger.info("start train model")
    classifier_dict = dict()
    for column in columns[2:]:
        label_train = train_data_df[column]
Example #7
    classifier_dict = dict()
    for column in columns[2:]:
        label_train = train_data_df[column]
        text_classifier = TextClassifier(vectorizer=vectorizer_tfidf)
        logger.info("start train %s model" % column)
        text_classifier.fit(content_train, label_train)
        logger.info("complete train %s model" % column)
        classifier_dict[column] = text_classifier

    logger.info("complete train model")

    # validate model
    content_validate = validate_data_df.iloc[:, 1]

    logger.info("start seg validate data")
    content_validate = seg_words(content_validate)
    logger.info("complete seg validate data")

    logger.info("start validate model")
    f1_score_dict = dict()
    for column in columns[2:]:
        label_validate = validate_data_df[column]
        text_classifier = classifier_dict[column]
        f1_score = text_classifier.get_f1_score(content_validate,
                                                label_validate)
        f1_score_dict[column] = f1_score

    f1_score = np.mean(list(f1_score_dict.values()))
    str_score = "\n"
    for column in columns[2:]:
        str_score = str_score + column + ":" + str(f1_score_dict[column]) + "\n"
Example #8
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from data_process import load_data_from_csv, seg_words
from sklearn.feature_extraction.text import TfidfVectorizer
import config
import numpy as np
import pickle

if __name__ == '__main__':

    # load train data
    train_data_df = load_data_from_csv(config.train_data_path)
    validate_data_df = load_data_from_csv(config.validate_data_path)

    content_train = train_data_df.iloc[:, 1]

    print("start seg train data")
    content_train = seg_words(content_train)
    print("complete seg train data")

    print("start train feature extraction")
    vectorizer_tfidf = TfidfVectorizer(analyzer='word',
                                       ngram_range=(1, 5),
                                       min_df=5,
                                       norm='l2')
    vectorizer_tfidf.fit(content_train)
    print("complete train feature extraction models")
    print("vocab shape: ")
    print(vectorizer_tfidf.vocabulary_.keys())

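    # persist the fitted vectorizer; pickle needs a binary-mode file handle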
    with open("../model/vectorizer_tfidf.pkl", "wb") as f:
        pickle.dump(vectorizer_tfidf, f)