Пример #1
0
    def _show_all_labels(self):
        # split labeled data and unlabeled data
        output = []
        contents = []
        seg_contents = []
        features = []
        labels = []
        for i in self.samples:
            label = i.human_label if i.human_label else i.machine_label
            output.append(label + self.col_sep + str(i.prob))
            seg_contents.append(i.seg_text_word)
            contents.append(i.original_text)
            labels.append(label)
            features.append(i.feature.toarray().tolist()[0])
        # get data feature
        X_train, X_val, y_train, y_val = train_test_split(
            csr_matrix(np.array(features)), labels)

        # fit
        self.model.fit(X_train, y_train)

        # save model
        dump_pkl(self.model, self.model_save_path, overwrite=True)
        eval(self.model, X_val, y_val)
        save(output,
             ture_labels=None,
             pred_save_path=self.pred_save_path,
             data_set=contents)
Пример #2
0
def train_classic(model_type,
                  data_path=None,
                  pr_figure_path=None,
                  model_save_path=None,
                  vectorizer_path=None,
                  col_sep=',',
                  thresholds=0.5,
                  num_classes=2):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # data feature
    data_tfidf, vocab = tfidf(data_content)
    # save data feature
    dump_pkl(vocab, vectorizer_path, overwrite=True)
    # label
    data_label = label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_tfidf,
                                                      data_label,
                                                      test_size=0.2)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model,
         X_val,
         y_val,
         thresholds=thresholds,
         num_classes=num_classes,
         model_type=model_type,
         pr_figure_path=pr_figure_path)
Пример #3
0
    def _train(self, labeled_sample_list, unlabeled_sample_list, batch_id):
        machine_samples_list = []
        # get data feature
        labeled_data_label = [
            i.human_label if i.human_label else i.machine_label
            for i in labeled_sample_list
        ]
        labeled_data_feature = [
            i.feature.toarray().tolist()[0] for i in labeled_sample_list
        ]
        X_train, X_val, y_train, y_val = train_test_split(
            csr_matrix(np.array(labeled_data_feature)), labeled_data_label)
        # fit
        self.model.fit(X_train, y_train)

        # save model
        dump_pkl(self.model, self.model_save_path, overwrite=True)
        eval(self.model, X_val, y_val)

        # 预测未标注数据集
        unlabeled_data_feature = [
            i.feature.toarray().tolist()[0] for i in unlabeled_sample_list
        ]
        if not unlabeled_sample_list:
            return machine_samples_list
        pred_result = self.model.predict_proba(
            csr_matrix(np.array(unlabeled_data_feature)))

        pred_label_proba = [(self.id_label[prob.argmax()], prob.max())
                            for prob in pred_result]

        # save middle result
        pred_output = [
            self.id_label[prob.argmax()] + self.col_sep + str(prob.max())
            for prob in pred_result
        ]
        pred_save_path = self.pred_save_path[:-4] + '_batch_' + str(
            batch_id) + '.txt'
        logger.debug("save infer label and prob result to: %s" %
                     pred_save_path)
        unlabeled_data_text = [i.original_text for i in unlabeled_sample_list]
        save(pred_output,
             ture_labels=None,
             pred_save_path=pred_save_path,
             data_set=unlabeled_data_text)

        assert len(unlabeled_sample_list) == len(pred_label_proba)
        for unlabeled_sample, label_prob in zip(unlabeled_sample_list,
                                                pred_label_proba):
            idx = unlabeled_sample.id
            self.samples[idx].machine_label = label_prob[0]
            self.samples[idx].prob = label_prob[1]
            machine_samples_list.append(unlabeled_sample)
        return machine_samples_list
Пример #4
0
def train_classic(model_type,
                  data_path=None,
                  pr_figure_path=None,
                  model_save_path=None,
                  vectorizer_path=None,
                  col_sep=',',
                  thresholds=0.5,
                  num_classes=2,
                  feature_type='tfidf_char'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model,
         X_val,
         y_val,
         thresholds=thresholds,
         num_classes=num_classes,
         model_type=model_type,
         pr_figure_path=pr_figure_path)
Пример #5
0
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type error. use tfidf_word replace.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show each category top features
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(),
                                        weight),
                                    key=lambda k: k[1],
                                    reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)

    # evaluate
    eval(model,
         X_val,
         y_val,
         num_classes=num_classes,
         pr_figure_path=pr_figure_path)
Пример #6
0
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    logger.info("train classic model, model_type:{}, feature_type:{}".format(model_type, feature_type))
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    word_id = load_vocab(word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    print(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    logger.info('data size:%d' % len(data_content))
    logger.info('label size:%d' % len(data_lbl))

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.error('feature type error. use tfidf_word replace.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab, is_infer=False)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        save_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight = {}
        word_dict_rev = sorted(word_id.items(), key=lambda x: x[1])
        for feature, index in word_dict_rev:
            feature_weight[feature] = list(map(float, model.coef_[:, index]))
        save_dict(feature_weight, config.lr_feature_weight_path)