def train(save_vocab_path='',
          train_path='',
          test_path='',
          train_seg_path='',
          test_seg_path='',
          model_save_dir='',
          vocab_max_size=5000,
          vocab_min_count=5,
          hidden_dim=512,
          batch_size=64,  # mini-batch size consumed by paddle.batch below
          use_cuda=False):

    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = train_model()
            optimizer = optimizer_func(hidden_dim)
            optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    seg_data(train_path, test_path)
    train_texts = build_dataset(train_seg_path)

    if os.path.exists(save_vocab_path):
        vocab = load_vocab(save_vocab_path)
    else:
        vocab, reverse_vocab = build_vocab(train_texts, min_count=vocab_min_count)
        write_vocab(vocab, save_vocab_path)
        vocab = load_vocab(save_vocab_path)

    train_set = read_data(train_seg_path)
    train_set_ids = transform_data(train_set, vocab)
    num_encoder_tokens = len(vocab)  # number of unique input tokens (vocabulary size)
    max_input_texts_len = max(len(text) for text in train_texts)
    print('num of samples:', len(train_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)
    # save_word_dict(vocab2id, save_vocab_path)

    train_reader = data_generator(train_set_ids)

    train_data = paddle.batch(paddle.reader.shuffle(train_reader, buf_size=10000), batch_size=batch_size)

    feeder = fluid.DataFeeder(feed_list=['question_word', 'dialogue_word', 'report_word', 'report_next_word'],
                              place=place,
                              program=train_prog)

    exe.run(startup_prog)

    EPOCH_NUM = 20
    for pass_id in six.moves.xrange(EPOCH_NUM):
        batch_id = 0
        for data in train_data():
            cost = exe.run(train_prog, feed=feeder.feed(data), fetch_list=[avg_cost])[0]
            print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id, cost))
            batch_id += 1
        fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
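# Example invocation (a minimal sketch: every path below is a placeholder, and the helpers
# train_model/optimizer_func/seg_data/build_dataset/etc. must be importable alongside this function).
train(save_vocab_path='output/vocab.txt',
      train_path='data/train.csv',
      test_path='data/test.csv',
      train_seg_path='data/train_seg.csv',
      test_seg_path='data/test_seg.csv',
      model_save_dir='output/model/',
      vocab_max_size=5000,
      vocab_min_count=5,
      hidden_dim=512,
      use_cuda=False)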
Example No. 2
def prepare_origin_data(config):
    # Load the training / test set
    gen_train, gen_test = read_dialog(config.data_dir)
    gen_data_list = [gen_train, gen_test]
    # Load the word vectors
    vectors = data_utils.build_word2vec(config.vector_dir)
    # Build the vocabulary
    gen_vocab_list = data_utils.build_vocab(gen_data_list[0],
                                            config.vocab_size)
    # Initialize the word embedding
    gen_embed = data_utils.initialize_word2vec(gen_vocab_list, vectors,
                                               config.embed_units)

    return gen_data_list, gen_vocab_list, gen_embed, vectors
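# Minimal usage sketch: SimpleNamespace stands in for the real config object, and the
# attribute values here (paths, sizes) are illustrative assumptions only.
from types import SimpleNamespace

demo_config = SimpleNamespace(data_dir='data/',
                              vector_dir='vectors/word2vec.txt',
                              vocab_size=40000,
                              embed_units=300)
gen_data_list, gen_vocab_list, gen_embed, vectors = prepare_origin_data(demo_config)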
Example No. 3
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('Unsupported feature_type; using tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show each category top features
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(),
                                        weight),
                                    key=lambda k: k[1],
                                    reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)

    # evaluate
    eval(model,
         X_val,
         y_val,
         num_classes=num_classes,
         pr_figure_path=pr_figure_path)
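# Example call (a sketch; every path below is a placeholder and depends on the project layout):
train_classic(model_type='logistic_regression',
              data_path='data/train.txt',
              model_save_path='output/lr_model.pkl',
              feature_vec_path='output/feature_vec.pkl',
              col_sep='\t',
              feature_type='tfidf_word',
              min_count=1,
              word_vocab_path='output/word_vocab.txt',
              label_vocab_path='output/label_vocab.txt',
              pr_figure_path='output/pr_curve.png')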
Example No. 4
def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=512,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of label tensor: %s' % str(data_label.shape))

    # init feature
    # han model need [doc sentence dim] feature(shape 3); others is [sentence dim] feature(shape 2)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          num_classses=num_classes,
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    cp = ModelCheckpoint(model_save_path,
                         monitor='val_acc',
                         verbose=1,
                         save_best_only=True)
    # fit and save model
    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=nb_epoch,
                        validation_data=(X_val, y_val),
                        callbacks=[cp])
    logger.info('save model:%s' % model_save_path)
    plt_history(history, model_name=model_type)
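# Example call (a sketch; the paths are placeholders, and model_save_path is an HDF5 file
# because the ModelCheckpoint callback above writes the best model to it):
train_deep_model(model_type='cnn',
                 data_path='data/train.txt',
                 model_save_path='output/cnn_model.h5',
                 word_vocab_path='output/word_vocab.txt',
                 label_vocab_path='output/label_vocab.txt',
                 max_len=300,
                 batch_size=128,
                 nb_epoch=10)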
Example No. 5
def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=2,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split(" "))

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of label tensor: %s' % str(data_label.shape))

    # init feature
    # han model need [doc sentence dim] feature(shape 3); others is [sentence dim] feature(shape 2)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'

    # keep only in-vocabulary tokens; word ids start at 1 so that 0 stays free (e.g. for padding)
    word_dic = {word: idx + 1 for idx, word in enumerate(word_vocab)}
    data_filter = []
    for line in data_content:
        line_filter = " ".join(w for w in line.split(" ") if w in word_dic)
        data_filter.append(line_filter)
    feature = Feature(data=data_filter,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = load_model(model_save_path)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    #loss,accuracy = model.evaluate(X_val,y_val)
    #print loss,accuracy
    pre_label = model.predict(X_val, batch_size=32, verbose=0, steps=None)
    print(y_val)
    print(type(y_val))
    with open("./output/result", "w") as f:
        for i in range(len(y_val)):
            f.write("%s\t%f\n" % (y_val[i][2], pre_label[i][2]))
    f.close()
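# Sketch: read the tab-separated dump back (true one-hot value and predicted probability
# for class index 2, as written above); purely illustrative.
pairs = []
with open("./output/result") as f:
    for line in f:
        true_val, pred_prob = line.rstrip("\n").split("\t")
        pairs.append((float(true_val), float(pred_prob)))
print("read %d prediction pairs" % len(pairs))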
Example No. 6
    def __init__(self,
                 input_file_path,
                 seg_input_file_path='',
                 word_vocab_path='',
                 label_vocab_path='',
                 feature_vec_path='',
                 model_save_path='',
                 pred_save_path='',
                 feature_type='tf_word',
                 model_type='logistic',
                 num_classes=2,
                 col_sep='\t',
                 min_count=1,
                 lower_thres=0.5,
                 upper_thres=0.85,
                 label_ratio=0.9,
                 label_min_size=200,
                 batch_size=10,
                 warmstart_size=0.02,
                 stop_words_path='data/stop_words.txt'):
        self.input_file_path = input_file_path
        self.seg_input_file_path = seg_input_file_path if seg_input_file_path else input_file_path + "_seg"
        self.stop_words_path = stop_words_path
        self.word_vocab_path = word_vocab_path if word_vocab_path else "word_vocab.txt"
        self.label_vocab_path = label_vocab_path if label_vocab_path else "label_vocab.txt"
        self.feature_vec_path = feature_vec_path if feature_vec_path else "feature_vec.pkl"
        self.model_save_path = model_save_path if model_save_path else "model.pkl"
        self.pred_save_path = pred_save_path if pred_save_path else "predict.txt"
        self.feature_type = feature_type
        self.num_classes = num_classes
        self.col_sep = col_sep
        self.min_count = min_count
        self.lower_thres = lower_thres
        self.upper_thres = upper_thres
        self.label_ratio = label_ratio

        # 1. load segment data
        if not os.path.exists(self.seg_input_file_path):
            start_time = time()
            seg_data(self.input_file_path,
                     self.seg_input_file_path,
                     col_sep=self.col_sep,
                     stop_words_path=self.stop_words_path)
            logger.info("spend time: %s s" % (time() - start_time))
        self.seg_contents, self.data_lbl = data_reader(
            self.seg_input_file_path, self.col_sep)

        # 2. load original data
        self.content, _ = data_reader(self.input_file_path, self.col_sep)

        # 3. load feature
        word_lst = []
        for i in self.seg_contents:
            word_lst.extend(i.split())
        # word vocab
        self.word_vocab = build_vocab(word_lst,
                                      min_count=self.min_count,
                                      sort=True,
                                      lower=True)
        # save word vocab
        write_vocab(self.word_vocab, self.word_vocab_path)
        # label
        label_vocab = build_vocab(self.data_lbl)
        # save label vocab
        write_vocab(label_vocab, self.label_vocab_path)
        label_id = load_vocab(self.label_vocab_path)
        print("label_id: %s" % label_id)
        self.set_label_id(label_id)
        self.id_label = {v: k for k, v in label_id.items()}
        print('num_classes:%d' % self.num_classes)
        self.data_feature = self._get_feature(self.word_vocab)

        # 4. assemble sample DataObject
        self.samples = self._get_samples(self.data_feature)
        # values <= 1 are treated as fractions of the sample count, larger values as absolute sizes
        self.batch_num = batch_size if batch_size > 1 else batch_size * len(self.samples)
        self.warmstart_num = warmstart_size if warmstart_size > 1 else warmstart_size * len(self.samples)
        self.label_min_num = label_min_size if label_min_size > 1 else label_min_size * len(self.samples)

        # 5. init model
        self.model = get_model(model_type)
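# Constructor usage sketch (commented out): the enclosing class name is not shown in this
# snippet, so `Labeler` below is a hypothetical stand-in and every path is a placeholder.
# labeler = Labeler(input_file_path='data/unlabeled.txt',
#                   feature_type='tf_word',
#                   model_type='logistic',
#                   num_classes=2,
#                   batch_size=10,
#                   warmstart_size=0.02)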
Example No. 7
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    logger.info("train classic model, model_type:{}, feature_type:{}".format(model_type, feature_type))
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    word_id = load_vocab(word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    print(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    logger.info('data size:%d' % len(data_content))
    logger.info('label size:%d' % len(data_lbl))

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.error('Unsupported feature_type; using tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab, is_infer=False)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        save_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight = {}
        word_dict_rev = sorted(word_id.items(), key=lambda x: x[1])
        for feature, index in word_dict_rev:
            feature_weight[feature] = list(map(float, model.coef_[:, index]))
        save_dict(feature_weight, config.lr_feature_weight_path)
Example No. 8
test_dataset_sequences = tokenizer.texts_to_sequences(df_test_dataset_seg)
padded_test_dataset_sequences = pad_sequences(test_dataset_sequences,
                                              maxlen=MAX_SEQUENCE_LENGTH)

# split into training and validation sets
print('split train_X,valid_X,train_y,valid_y')
train_X, valid_X, train_y, valid_y = train_test_split(
    padded_dataset_sequences[:df_train_dataset.shape[0]],
    df_all_dataset['COMMLEVEL'][:df_train_dataset.shape[0]],
    test_size=0.1)

# one-hot encode the labels
labels = df_all_dataset['COMMLEVEL'].dropna().map(int)  #.values.tolist()
labels = to_categorical(labels - 1)

vocab, vocab_freqs = build_vocab(df_all_dataset['COMMCONTENT_SEG'])
vocab_size = min(MAX_NB_WORDS, len(vocab_freqs)) + 2
word2index = {
    x[0]: i + 2
    for i, x in enumerate(vocab_freqs.most_common(MAX_NB_WORDS))
}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v: k for k, v in word2index.items()}
print('word2index size:', len(word2index))
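# Sketch: map a segmented sentence to ids with the word2index built above, falling back
# to the UNK index for out-of-vocabulary tokens (hypothetical helper, for illustration only).
def sentence_to_ids(segmented_text, w2i=word2index):
    return [w2i.get(token, w2i["UNK"]) for token in segmented_text.split()]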

# decompress the bz2 word-embedding archive
if not os.path.exists('./embeddings/sgns.weibo.word'):

    print('Start decompressing the bz2 embeddings file')