Example #1
def train_model():
    logger.info("########################################")
    logger.info("start train models")
    logger.info("########################################")
    train_data_df = load_data_from_csv(config.train_data_path)
    for model in models:
        data_to_train = filter_data(train_data_df, model)
        train_specific_model(data_to_train)
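
filter_data and models are defined elsewhere in the project. Judging from Example #5 below, which reads the text from train_data.content and treats every column except the last as a training target, a plausible sketch of the filter (the exact column layout is an assumption):

def filter_data(data_df, model):
    # hypothetical sketch: keep this aspect's label column plus the review
    # content, with content last, so train_specific_model can treat every
    # column except the final one as a label to fit
    return data_df[[model, "content"]]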
Example #2
def validate():
    validate_data_df = load_data_from_csv(config.validate_data_path)
    mentioned_clf = joblib.load(config.model_save_path +
                                "traffic_mentioned.pkl")
    clfs = {}
    clfs['location_traffic_convenience'] = joblib.load(
        config.model_save_path + "location_traffic_convenience.pkl")
    validate_model(validate_data_df, ['location_traffic_convenience'],
                   mentioned_clf, clfs)
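
validate_model is also defined elsewhere. A hypothetical sketch of a two-stage scoring pass, assuming mentioned_clf first flags aspects that are not mentioned (label -2) before each per-column sentiment classifier runs:

import numpy as np
from sklearn.metrics import f1_score

def validate_model(validate_df, columns, mentioned_clf, clfs):
    # hypothetical sketch: decide mention first, then score each aspect
    # column with its dedicated classifier
    segs = seg_words(validate_df.iloc[:, 1])
    mentioned = mentioned_clf.predict(segs)
    for column in columns:
        y_pred = clfs[column].predict(segs)
        y_pred = np.where(mentioned == -2, -2, y_pred)  # "not mentioned" wins
        score = f1_score(validate_df[column], y_pred, average="macro")
        logger.info("validate f1 for %s: %s", column, score)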
Example #3
def train_mentioned():
    logger.info("########################################")
    logger.info("start train mentioned models")
    logger.info("########################################")
    # load train data
    train_data_df = load_data_from_csv(config.train_data_path)
    validate_data_df = load_data_from_csv(config.validate_data_path)

    content_train = train_data_df.iloc[0:config.train_data_size, 1]
    logger.debug("start seg train data")
    train_content_segs = seg_words(content_train)

    logger.debug("start seg validate data")
    content_validate = validate_data_df.iloc[0:, 1]
    validate_segs = seg_words(content_validate)
    logger.debug("load vectorizer")
    vectorizer_tfidf = joblib.load(config.model_save_path + vec_name)

    for model in models:
        train_mentioned_model(train_data_df, train_content_segs,
                              validate_data_df, validate_segs,
                              vectorizer_tfidf, model)
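
seg_words is the project's segmenter. A minimal sketch, assuming jieba is used to cut the Chinese reviews into space-separated tokens (Example #8 later splits documents on spaces):

import jieba

def seg_words(contents):
    # hypothetical sketch: tokenize each review and join with spaces so the
    # TF-IDF vectorizer and the vocabulary processor can split on whitespace
    return [" ".join(jieba.cut(str(text))) for text in contents]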
Example #4
def vectorizer():
    logger.info("start to vectorizer content")
    train_data = load_data_from_csv(config.train_data_path)
    content_segs = seg_words(train_data.iloc[0:, 1])
    tf_idf = TfidfVectorizer(ngram_range=(1, 5),
                             min_df=2,
                             norm="l2",
                             max_df=0.4,
                             stop_words=stopwords)
    tf_idf.fit(content_segs)
    if not os.path.exists(config.model_save_path):
        os.makedirs(config.model_save_path)
    joblib.dump(tf_idf, config.model_save_path + vec_name, compress=True)
    logger.info("succes to save vectorizer")
Example #5
def train_specific_model(train_data):
    columns = train_data.columns.values.tolist()
    logger.debug("begin to seg train content")
    content_segments = seg_words(
        train_data.content.iloc[0:config.train_data_size])
    logger.debug("seg train content done")
    vectorizer = joblib.load(config.model_save_path + vec_name)
    logger.debug("load vectorizer")
    validate_data_df = load_data_from_csv(config.validate_data_path)
    validate_segs = seg_words(validate_data_df.content)
    logger.debug("seg validate content")
    scores = dict()
    for model_name in columns[:-1]:
        logger.info("begin to train %s model", model_name)
        # grid of candidate class weights for the four labels
        # (-2 not mentioned, -1 negative, 0 neutral, 1 positive)
        cw = [{-2: a, -1: b, 0: w, 1: x}
              for a in range(1, 3)
              for b in range(5, 8)
              for w in range(8, 12)
              for x in range(5, 8)]
        # cw = {0: 7, 1: 6, -1: 6, -2: 1}
        positive_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
        y_label = train_data[model_name].iloc[0:config.train_data_size]
        positive_clf.fit(content_segments, y_label)

        y_pre = positive_clf.predict(validate_segs)
        y_true = validate_data_df[model_name].iloc[0:]
        report(y_true, y_pre)
        score = f1_score(y_true, y_pre, average="macro")
        logger.info("score for model:%s is %s ", model_name, str(score))
        scores[model_name] = score
        joblib.dump(positive_clf,
                    config.model_save_path + model_name + ".pkl",
                    compress=True)
    score = np.mean(list(scores.values()))
    logger.info("f1_scores: %s" % score)
Example #6
    parser = argparse.ArgumentParser()
    parser.add_argument('-mn',
                        '--model_name',
                        type=str,
                        nargs='?',
                        help='the name of model')

    args = parser.parse_args()
    model_name = args.model_name
    if not model_name:
        model_name = "model_dict.pkl"

    # load data
    logger.info("start load data...")
    test_data_df = load_data_from_csv(config.test_data_path)

    # load model
    logger.info("start load model...")
    classifier_dict = joblib.load(config.model_save_path + model_name)

    columns = test_data_df.columns.tolist()
    # seg words
    logger.info("start seg test data...")
    content_test = test_data_df.iloc[:, 1]
    content_test = seg_words(content_test)
    logger.info("complete seg test data.")

    # model predict
    logger.info("start predict test data...")
    for column in columns[2:]:
        # assumed completion, mirroring the per-column models saved in
        # Example #5: predict each aspect column with its own classifier
        test_data_df[column] = classifier_dict[column].predict(content_test)
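
After the loop fills in every aspect column, the predictions presumably get written back out; a hedged sketch of that final step (the output path attribute is an assumption):

    # hypothetical final step: persist the filled-in test frame
    test_data_df.to_csv(config.test_data_predict_output_path,
                        encoding="utf-8", index=False)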
Example #7
    # --test 0 turns test mode off; omitting the flag or passing any other
    # value leaves it on
    is_test = args.test is None or args.test != 0

    # --load_cache 0 disables the cached sequences; anything else enables them
    load_cache = args.load_cache is None or args.load_cache != 0

    # load data
    test_num = 100 if is_test else None
    logger.info("start load data, try read {0} records, test mode {1}".format(test_num, is_test))
    test_data_df = load_data_from_csv(config.test_data_path, nrow=test_num)

    # load embedding matrix
    embedding_matrix = load_data("emb.npy")

    # load vocab
    vocab = load_data("vocab.npy").tolist()

    # load all test columns
    columns = test_data_df.columns.tolist()

    # seg content words to sequence
    logger.info("start seg test data, let's look at some data")
    logger.info(test_data_df.iloc[1, :])
    content_test = test_data_df.iloc[:, 1]
    if not load_cache:
        # assumed completion, mirroring Example #9's training path: segment
        # the raw test content into word-index sequences
        max_len, word, vocab_test, sequences = sentences_to_indices(content_test)
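
save_data and load_data look like thin numpy persistence helpers (Example #9 saves the vocab and word list with them); a minimal sketch, with the cache directory as an assumption:

import os
import numpy as np

CACHE_DIR = "cache"  # hypothetical location

def save_data(obj, name):
    # persist an array, list, or dict as a .npy file
    os.makedirs(CACHE_DIR, exist_ok=True)
    np.save(os.path.join(CACHE_DIR, name), np.array(obj, dtype=object))

def load_data(name):
    # reload a saved object; allow_pickle is required for dicts and lists
    return np.load(os.path.join(CACHE_DIR, name), allow_pickle=True)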
Example #8
logger = logging.getLogger(__name__)

if __name__ == '__main__':

    # load train data
    logger.info("start load data")
    train_data_df = load_data_from_csv(config.train_data_path)
    validate_data_df = load_data_from_csv(config.validate_data_path)

    content_train = train_data_df.iloc[:, 1]
    content_validate = validate_data_df.iloc[:, 1]

    # longest document (in tokens); sets the vocab processor's padded length
    max_document_length = max([len(x.split(" ")) for x in content_train])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    content_train_numeric = np.array(list(vocab_processor.fit_transform(content_train)))
    # transform (not fit_transform): validate data must reuse the train vocab
    content_validate_numeric = np.array(list(vocab_processor.transform(content_validate)))

    logger.info("start seg train data")
    logger.info("complete seg train data")

    columns = train_data_df.columns.values.tolist()
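
For reference, VocabularyProcessor pads or truncates every document to max_document_length and maps each token to an integer id (0 is padding); a small usage sketch, assuming TensorFlow 1.x where tf.contrib.learn still exists:

import numpy as np
from tensorflow.contrib import learn

docs = ["服务 很 好", "环境 一般"]
processor = learn.preprocessing.VocabularyProcessor(max_document_length=4)
ids = np.array(list(processor.fit_transform(docs)))
print(ids.shape)  # (2, 4)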
Example #9

    # load train data and validate data
    logger.info("start load data")
    train_num = 10000 if is_test else None
    validate_num = 5000 if is_test else None
    train_data_df = load_data_from_csv(config.train_data_path, nrow=train_num)
    validate_data_df = load_data_from_csv(config.validate_data_path,
                                          nrow=validate_num)

    # get all train sentences
    content_train = train_data_df.iloc[:, 1]
    logger.info(content_train[0])
    logger.info(content_train[1])

    logger.info("start seg train sentences to vector")
    if not load_cache:
        max_len, word, vocab, sequences = sentences_to_indices(content_train)
        save_data(vocab, "all_vocab.npy")
        save_data(word, "word.npy")
        # save_data(sequences, "seq.npy")
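
sentences_to_indices is not shown in these examples; a hypothetical sketch of what it is assumed to return, i.e. the longest review length, the word list, a word-to-index vocab, and zero-padded index sequences:

import jieba
import numpy as np

def sentences_to_indices(contents):
    # hypothetical sketch: tokenize, build a vocab (index 0 reserved for
    # padding), and pad every sequence to the longest review
    tokenized = [list(jieba.cut(str(text))) for text in contents]
    words = sorted({w for tokens in tokenized for w in tokens})
    vocab = {w: i + 1 for i, w in enumerate(words)}
    max_len = max(len(tokens) for tokens in tokenized)
    sequences = np.zeros((len(tokenized), max_len), dtype=np.int32)
    for row, tokens in enumerate(tokenized):
        for col, w in enumerate(tokens):
            sequences[row, col] = vocab[w]
    return max_len, words, vocab, sequences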