import os

import paddle
import paddle.fluid as fluid
import six

# Project-local helpers assumed in scope: train_model, optimizer_func, seg_data,
# build_dataset, build_vocab, write_vocab, load_vocab, read_data, transform_data,
# data_generator.


def train(save_vocab_path='', train_path='', test_path='', train_seg_path='',
          test_seg_path='', model_save_dir='', vocab_max_size=5000,
          vocab_min_count=5, hidden_dim=512, batch_size=64, use_cuda=False):
    # note: vocab_max_size is accepted but not used by build_vocab below
    # build the training program and its startup program
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = train_model()
            optimizer = optimizer_func(hidden_dim)
            optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # segment the raw corpus, then build (or load) the vocabulary
    seg_data(train_path, test_path)
    train_texts = build_dataset(train_seg_path)
    if os.path.exists(save_vocab_path):
        vocab = load_vocab(save_vocab_path)
    else:
        vocab, reverse_vocab = build_vocab(train_texts, min_count=vocab_min_count)
        write_vocab(vocab, save_vocab_path)
        vocab = load_vocab(save_vocab_path)

    train_set = read_data(train_seg_path)
    train_set_ids = transform_data(train_set, vocab)
    num_encoder_tokens = len(vocab)  # unique input tokens (vocab size), not sample count
    max_input_texts_len = max([len(text) for text in train_texts])
    print('num of samples:', len(train_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)

    train_reader = data_generator(train_set_ids)
    train_data = paddle.batch(
        paddle.reader.shuffle(train_reader, buf_size=10000),
        batch_size=batch_size)
    feeder = fluid.DataFeeder(
        feed_list=['question_word', 'dialogue_word', 'report_word', 'report_next_word'],
        place=place,
        program=train_prog)

    exe.run(startup_prog)

    EPOCH_NUM = 20
    for pass_id in six.moves.xrange(EPOCH_NUM):
        batch_id = 0
        for data in train_data():
            cost = exe.run(train_prog,
                           feed=feeder.feed(data),
                           fetch_list=[avg_cost])[0]
            print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id, cost))
            batch_id += 1
        # save parameters at the end of each pass
        fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
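# A minimal invocation sketch (not from the original source): all paths and
# hyper-parameter values below are assumptions chosen for illustration.
if __name__ == '__main__':
    train(save_vocab_path='output/vocab.txt',
          train_path='data/train.txt',
          test_path='data/test.txt',
          train_seg_path='data/train_seg.txt',
          test_seg_path='data/test_seg.txt',
          model_save_dir='output/model',
          vocab_min_count=5,
          hidden_dim=512,
          batch_size=64,
          use_cuda=False)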
# Project-local helpers assumed in scope: _load_data, build_dict, write_vocab.


def build_vocab(train_path, word_vocab_path, pos_vocab_path, label_vocab_path,
                col_sep='\t', min_count=5, word_vocab_start=2, pos_vocab_start=1):
    word_lst, pos_lst, label_lst = _load_data(train_path, col_sep=col_sep)
    # build and save word vocab
    word_vocab = build_dict(word_lst, start=word_vocab_start,
                            min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # build and save pos vocab
    pos_vocab = build_dict(pos_lst, start=pos_vocab_start, sort=True, lower=False)
    write_vocab(pos_vocab, pos_vocab_path)
    # build and save label vocab
    label_types = [str(i) for i in label_lst]
    label_vocab = build_dict(label_types)
    write_vocab(label_vocab, label_vocab_path)
    return word_vocab, pos_vocab, label_vocab
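# build_dict is a project helper whose source is not shown here. A plausible
# minimal sketch of what it does (an assumption, not the project's actual
# implementation): count item frequencies, drop rare items, and assign ids
# starting at `start`, so low ids (e.g. 0/1) stay free for special tokens
# such as PAD/UNK — which would explain word_vocab_start=2 above.
from collections import Counter


def build_dict_sketch(items, start=0, min_count=1, sort=True, lower=False):
    items = [i.lower() if lower else i for i in items]
    counts = Counter(items)
    if sort:
        # most frequent items get the smallest ids
        ordered = [w for w, c in counts.most_common() if c >= min_count]
    else:
        ordered = [w for w, c in counts.items() if c >= min_count]
    return {w: idx + start for idx, w in enumerate(ordered)}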
from sklearn.model_selection import train_test_split

# Project-local helpers assumed in scope: data_reader, build_vocab, write_vocab,
# load_vocab, Feature, XGBLR, get_model, dump_pkl, load_pkl, eval, logger, config.


def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # build and save word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # build and save label vocab
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    # init feature; deep-learning feature types are not valid for classic models
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type error, use tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model (XGBLR persists itself via model_save_path)
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analyze the logistic regression model: show top features of each category
    if model_type == "logistic_regression" and config.is_debug:
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(), weight),
                                    key=lambda k: k[1], reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)
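# The project's Feature wrapper is not shown above. For feature_type
# 'tfidf_word' it presumably wraps something like sklearn's TfidfVectorizer;
# a minimal stand-in sketch (an assumption about the wrapper, not its code):
from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf_word_feature_sketch(texts, vocab=None):
    # texts are whitespace-segmented sentences; vocab optionally restricts terms
    vectorizer = TfidfVectorizer(analyzer='word', vocabulary=vocab)
    matrix = vectorizer.fit_transform(texts)  # sparse [n_samples, n_terms]
    return matrix, vectorizer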
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Project-local helpers assumed in scope: data_reader, build_vocab, write_vocab,
# load_vocab, Feature, fasttext_model, cnn_model, rnn_model, han_model,
# plt_history, logger.


def train_deep_model(model_type='cnn', data_path='', model_save_path='',
                     word_vocab_path='', label_vocab_path='', min_count=1,
                     max_len=300, batch_size=128, nb_epoch=10, embedding_dim=128,
                     hidden_dim=128, col_sep='\t', num_filters=512,
                     filter_sizes='3,4,5', dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # build and save word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # build and save label vocab
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s' % str(data_label.shape))
    # init feature:
    # the han model needs a [doc, sentence, word] feature (rank 3);
    # the others take a [sentence, word] feature (rank 2)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          num_classes=num_classes,
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # keep only the best checkpoint by validation accuracy
    cp = ModelCheckpoint(model_save_path, monitor='val_acc', verbose=1,
                         save_best_only=True)
    # fit and save model
    history = model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=nb_epoch,
                        validation_data=(X_val, y_val),
                        callbacks=[cp])
    logger.info('save model:%s' % model_save_path)
    plt_history(history, model_name=model_type)
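# A minimal invocation sketch for the CNN variant (not from the original
# source; all paths below are assumptions chosen for illustration):
if __name__ == '__main__':
    train_deep_model(model_type='cnn',
                     data_path='data/train.txt',
                     model_save_path='output/cnn_model.h5',
                     word_vocab_path='output/word_vocab.txt',
                     label_vocab_path='output/label_vocab.txt',
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10)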
from keras.models import load_model
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Prediction/evaluation variant of train_deep_model: for 'cnn' it loads a
# previously trained model and writes validation predictions to a file.
# Project-local helpers assumed in scope: data_reader, build_vocab, write_vocab,
# load_vocab, Feature, fasttext_model, rnn_model, han_model, logger.


def train_deep_model(model_type='cnn', data_path='', model_save_path='',
                     word_vocab_path='', label_vocab_path='', min_count=1,
                     max_len=300, batch_size=128, nb_epoch=10, embedding_dim=128,
                     hidden_dim=128, col_sep='\t', num_filters=2,
                     filter_sizes='3,4,5', dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split(" "))
    # build and save word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # build and save label vocab
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s' % str(data_label.shape))
    # init feature:
    # the han model needs a [doc, sentence, word] feature (rank 3);
    # the others take a [sentence, word] feature (rank 2)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    # map each vocab word to an id (ids start at 1, leaving 0 free)
    word_dic = {}
    count = 1
    for word in word_vocab:
        word_dic[word] = count
        count += 1
    # drop out-of-vocabulary words from every line
    data_filter = []
    for line in data_content:
        line_filter = " ".join(
            list(filter(lambda x: x in word_dic, line.split(" "))))
        data_filter.append(line_filter)
    feature = Feature(data=data_filter, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        # this variant loads a previously trained CNN instead of building one
        model = load_model(model_save_path)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # loss, accuracy = model.evaluate(X_val, y_val)
    # print(loss, accuracy)
    pre_label = model.predict(X_val, batch_size=32, verbose=0, steps=None)
    print(y_val)
    print(type(y_val))
    # write the true and predicted scores of class index 2, tab-separated
    with open("./output/result", "w") as f:
        for i in range(len(y_val)):
            f.write("%s\t%f\n" % (y_val[i][2], pre_label[i][2]))
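# A small follow-up sketch (not part of the original code) that reads back the
# tab-separated "./output/result" file written above and reports the mean
# absolute error between the true and predicted scores of class index 2:
def summarize_result_sketch(path="./output/result"):
    errors = []
    with open(path) as f:
        for line in f:
            true_score, pred_score = line.rstrip("\n").split("\t")
            errors.append(abs(float(true_score) - float(pred_score)))
    if errors:
        print("samples: %d, mean abs error: %.4f"
              % (len(errors), sum(errors) / len(errors)))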
import os
from time import time

# Project-local helpers assumed in scope: seg_data, data_reader, build_vocab,
# write_vocab, load_vocab, get_model, logger.


def __init__(self, input_file_path, seg_input_file_path='', word_vocab_path='',
             label_vocab_path='', feature_vec_path='', model_save_path='',
             pred_save_path='', feature_type='tf_word', model_type='logistic',
             num_classes=2, col_sep='\t', min_count=1, lower_thres=0.5,
             upper_thres=0.85, label_ratio=0.9, label_min_size=200,
             batch_size=10, warmstart_size=0.02,
             stop_words_path='data/stop_words.txt'):
    self.input_file_path = input_file_path
    self.seg_input_file_path = seg_input_file_path if seg_input_file_path else input_file_path + "_seg"
    self.stop_words_path = stop_words_path
    self.word_vocab_path = word_vocab_path if word_vocab_path else "word_vocab.txt"
    self.label_vocab_path = label_vocab_path if label_vocab_path else "label_vocab.txt"
    self.feature_vec_path = feature_vec_path if feature_vec_path else "feature_vec.pkl"
    self.model_save_path = model_save_path if model_save_path else "model.pkl"
    self.pred_save_path = pred_save_path if pred_save_path else "predict.txt"
    self.feature_type = feature_type
    self.num_classes = num_classes
    self.col_sep = col_sep
    self.min_count = min_count
    self.lower_thres = lower_thres
    self.upper_thres = upper_thres
    self.label_ratio = label_ratio

    # 1. load segmented data, segmenting the input first if needed
    if not os.path.exists(self.seg_input_file_path):
        start_time = time()
        seg_data(self.input_file_path, self.seg_input_file_path,
                 col_sep=self.col_sep, stop_words_path=self.stop_words_path)
        logger.info("spend time: %s s" % (time() - start_time))
    self.seg_contents, self.data_lbl = data_reader(
        self.seg_input_file_path, self.col_sep)

    # 2. load original data
    self.content, _ = data_reader(self.input_file_path, self.col_sep)

    # 3. load feature
    word_lst = []
    for i in self.seg_contents:
        word_lst.extend(i.split())
    # build and save word vocab
    self.word_vocab = build_vocab(word_lst, min_count=self.min_count,
                                  sort=True, lower=True)
    write_vocab(self.word_vocab, self.word_vocab_path)
    # build and save label vocab
    label_vocab = build_vocab(self.data_lbl)
    write_vocab(label_vocab, self.label_vocab_path)
    label_id = load_vocab(self.label_vocab_path)
    print("label_id: %s" % label_id)
    self.set_label_id(label_id)
    self.id_label = {v: k for k, v in label_id.items()}
    print('num_classes:%d' % self.num_classes)
    self.data_feature = self._get_feature(self.word_vocab)

    # 4. assemble sample DataObject
    self.samples = self._get_samples(self.data_feature)
    # values > 1 are absolute counts; values <= 1 are fractions of the data set
    self.batch_num = batch_size if batch_size > 1 else batch_size * len(self.samples)
    self.warmstart_num = warmstart_size if warmstart_size > 1 else warmstart_size * len(self.samples)
    self.label_min_num = label_min_size if label_min_size > 1 else label_min_size * len(self.samples)

    # 5. init model
    self.model = get_model(model_type)
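# The enclosing class is not shown above. Assuming a hypothetical name such as
# `ActiveLearner`, a minimal instantiation sketch (all values are assumptions):
if __name__ == '__main__':
    learner = ActiveLearner(input_file_path='data/input.txt',
                            feature_type='tf_word',
                            model_type='logistic',
                            num_classes=2,
                            batch_size=10,
                            warmstart_size=0.02)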
from sklearn.model_selection import train_test_split

# Later variant of train_classic with extra logging and feature-weight export.
# Project-local helpers assumed in scope: data_reader, build_vocab, write_vocab,
# load_vocab, Feature, XGBLR, get_model, save_pkl, save_dict, eval, logger, config.


def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='',
                  pr_figure_path=''):
    logger.info("train classic model, model_type:{}, feature_type:{}".format(
        model_type, feature_type))
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # build and save word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    word_id = load_vocab(word_vocab_path)
    # build and save label vocab
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    print(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    logger.info('data size:%d' % len(data_content))
    logger.info('label size:%d' % len(data_lbl))
    # init feature; deep-learning feature types are not valid for classic models
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.error('feature type error, use tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab,
                      is_infer=False)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model (XGBLR persists itself via model_save_path)
    if model_type != 'xgboost_lr':
        save_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)
    # analyze the logistic regression model: dump per-feature weights
    if config.debug and model_type == "logistic_regression":
        feature_weight = {}
        word_dict_rev = sorted(word_id.items(), key=lambda x: x[1])
        for feature, index in word_dict_rev:
            feature_weight[feature] = list(map(float, model.coef_[:, index]))
        save_dict(feature_weight, config.lr_feature_weight_path)
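# A minimal invocation sketch for this variant (not from the original source;
# all paths below are assumptions chosen for illustration):
if __name__ == '__main__':
    train_classic(model_type='logistic_regression',
                  data_path='data/train.txt',
                  model_save_path='output/lr_model.pkl',
                  feature_vec_path='output/feature_vec.pkl',
                  feature_type='tfidf_word',
                  word_vocab_path='output/word_vocab.txt',
                  label_vocab_path='output/label_vocab.txt',
                  pr_figure_path='output/pr_figure.png')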