# shared imports for the snippets below; project-local helpers
# (data_reader, dump_pkl, save_pkl, load_pkl, eval, save, get_model, tfidf,
#  label_encoder, Feature, XGBLR, build_vocab, write_vocab, load_vocab,
#  save_dict, logger, config) come from the surrounding repo.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split


def _show_all_labels(self):
    # gather labels (prefer human label over machine label) and features for all samples
    output = []
    contents = []
    seg_contents = []
    features = []
    labels = []
    for i in self.samples:
        label = i.human_label if i.human_label else i.machine_label
        output.append(label + self.col_sep + str(i.prob))
        seg_contents.append(i.seg_text_word)
        contents.append(i.original_text)
        labels.append(label)
        features.append(i.feature.toarray().tolist()[0])
    # get data feature
    X_train, X_val, y_train, y_val = train_test_split(
        csr_matrix(np.array(features)), labels)
    # fit
    self.model.fit(X_train, y_train)
    # save model
    dump_pkl(self.model, self.model_save_path, overwrite=True)
    eval(self.model, X_val, y_val)
    # note: 'ture_labels' (sic) matches the project's save() helper signature
    save(output, ture_labels=None,
         pred_save_path=self.pred_save_path, data_set=contents)
def train_classic(model_type, data_path=None, pr_figure_path=None,
                  model_save_path=None, vectorizer_path=None, col_sep=',',
                  thresholds=0.5, num_classes=2):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # data feature
    data_tfidf, vocab = tfidf(data_content)
    # save data feature
    dump_pkl(vocab, vectorizer_path, overwrite=True)
    # label
    data_label = label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(
        data_tfidf, data_label, test_size=0.2)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, thresholds=thresholds, num_classes=num_classes,
         model_type=model_type, pr_figure_path=pr_figure_path)
def _train(self, labeled_sample_list, unlabeled_sample_list, batch_id):
    machine_samples_list = []
    # get data feature
    labeled_data_label = [i.human_label if i.human_label else i.machine_label
                          for i in labeled_sample_list]
    labeled_data_feature = [i.feature.toarray().tolist()[0]
                            for i in labeled_sample_list]
    X_train, X_val, y_train, y_val = train_test_split(
        csr_matrix(np.array(labeled_data_feature)), labeled_data_label)
    # fit
    self.model.fit(X_train, y_train)
    # save model
    dump_pkl(self.model, self.model_save_path, overwrite=True)
    eval(self.model, X_val, y_val)
    # nothing left to label: stop before building features for an empty set
    if not unlabeled_sample_list:
        return machine_samples_list
    # predict the unlabeled data set
    unlabeled_data_feature = [i.feature.toarray().tolist()[0]
                              for i in unlabeled_sample_list]
    pred_result = self.model.predict_proba(
        csr_matrix(np.array(unlabeled_data_feature)))
    pred_label_proba = [(self.id_label[prob.argmax()], prob.max())
                        for prob in pred_result]
    # save middle result
    pred_output = [self.id_label[prob.argmax()] + self.col_sep + str(prob.max())
                   for prob in pred_result]
    # assumes pred_save_path ends with a 4-char extension such as '.txt'
    pred_save_path = self.pred_save_path[:-4] + '_batch_' + str(batch_id) + '.txt'
    logger.debug("save infer label and prob result to: %s" % pred_save_path)
    unlabeled_data_text = [i.original_text for i in unlabeled_sample_list]
    save(pred_output, ture_labels=None, pred_save_path=pred_save_path,
         data_set=unlabeled_data_text)
    assert len(unlabeled_sample_list) == len(pred_label_proba)
    # write machine labels and probabilities back onto the sample pool
    for unlabeled_sample, label_prob in zip(unlabeled_sample_list, pred_label_proba):
        idx = unlabeled_sample.id
        self.samples[idx].machine_label = label_prob[0]
        self.samples[idx].prob = label_prob[1]
        machine_samples_list.append(unlabeled_sample)
    return machine_samples_list
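# A minimal sketch of how the returned machine-labeled samples might be
# consumed in a self-training loop: keep only predictions whose probability
# clears a confidence threshold and promote them to the labeled pool.
# `select_confident` and `min_prob` are hypothetical names, not project code.
def select_confident(machine_samples_list, min_prob=0.9):
    """Return samples whose predicted probability clears the threshold."""
    return [s for s in machine_samples_list if s.prob >= min_prob]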
def train_classic(model_type, data_path=None, pr_figure_path=None,
                  model_save_path=None, vectorizer_path=None, col_sep=',',
                  thresholds=0.5, num_classes=2, feature_type='tfidf_char'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, thresholds=thresholds, num_classes=num_classes,
         model_type=model_type, pr_figure_path=pr_figure_path)
def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='', pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('unsupported feature type, falling back to tfidf_word.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model (XGBLR persists itself through model_save_path)
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show top features of each category
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(), weight),
                                    key=lambda k: k[1], reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)
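# A minimal sketch of inspecting the per-category feature weights dumped
# above, assuming the 'output/lr_features.pkl' file exists and the project's
# load_pkl helper is importable; the variable names here are illustrative.
features = load_pkl('output/lr_features.pkl')
for category_id, feature_dict in features.items():
    top5 = sorted(feature_dict.items(), key=lambda kv: kv[1], reverse=True)[:5]
    print('category_%s top features: %s' % (category_id, top5))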
def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='', pr_figure_path=''):
    logger.info("train classic model, model_type:{}, feature_type:{}".format(
        model_type, feature_type))
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    word_id = load_vocab(word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    logger.info('data size:%d' % len(data_content))
    logger.info('label size:%d' % len(data_lbl))
    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.error('unsupported feature type, falling back to tfidf_word.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab,
                      is_infer=False)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model (XGBLR persists itself through model_save_path)
    if model_type != 'xgboost_lr':
        save_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)
    # analysis lr model: dump each feature's per-class weight vector
    if config.debug and model_type == "logistic_regression":
        feature_weight = {}
        word_dict_rev = sorted(word_id.items(), key=lambda x: x[1])
        for feature, index in word_dict_rev:
            feature_weight[feature] = list(map(float, model.coef_[:, index]))
        save_dict(feature_weight, config.lr_feature_weight_path)
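# A minimal usage sketch for the final version above, assuming a tab-separated
# training file with one "label<TAB>text" pair per line; all paths here are
# illustrative placeholders, not paths defined by the project.
train_classic(model_type='logistic_regression',
              data_path='data/train.txt',
              model_save_path='output/lr_model.pkl',
              feature_vec_path='output/tfidf_word_vec.pkl',
              col_sep='\t',
              feature_type='tfidf_word',
              min_count=1,
              word_vocab_path='output/word_vocab.txt',
              label_vocab_path='output/label_vocab.txt',
              pr_figure_path='output/pr_figure.png')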