Example #1
def infer_cnn(data_path,
              model_path,
              word_vocab_path,
              pos_vocab_path,
              label_vocab_path,
              word_emb_path,
              pos_emb_path,
              batch_size,
              pred_save_path=None):
    # init dict
    word_vocab, pos_vocab, label_vocab = load_vocab(
        word_vocab_path), load_vocab(pos_vocab_path), load_vocab(
            label_vocab_path)
    word_emb, pos_emb = load_pkl(word_emb_path), load_pkl(pos_emb_path)
    word_test, pos_test = test_reader(data_path, word_vocab, pos_vocab,
                                      label_vocab)
    # init model
    model = Model(config.max_len, word_emb, pos_emb, label_vocab=label_vocab)
    ckpt_path = get_ckpt_path(model_path)
    if ckpt_path:
        print("Read model parameters from %s" % ckpt_path)
        model.saver.restore(model.sess, ckpt_path)
    else:
        print("Can't find the checkpoint.going to stop")
        return
    label_pred = model.predict(word_test, pos_test, batch_size)
    save(label_pred, pred_save_path=pred_save_path)
    print("finish prediction.")
Example #2
    def tfidf_char_feature(self, data_set):
        """
        Get TFIDF feature by char
        :param data_set:
        :return:
        """
        data_set = get_char_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), sublinear_tf=True)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        logger.debug('Vocab list:')
        count = 0
        for k, v in self.vectorizer.vocabulary_.items():
            if count < 10:
                logger.debug("%s	%s" % (k, v))
                count += 1

        logger.info(data_feature.shape)
        if not self.is_infer:
            dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
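The is_infer branch exists so that inference reuses the exact vocabulary learned at training time. A self-contained sketch of that round trip with plain scikit-learn and pickle (toy data, hypothetical file path):

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ['a b a c', 'b c d']
test_docs = ['a d']

# Training: fit the vectorizer, then persist it.
vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), sublinear_tf=True)
train_features = vec.fit_transform(train_docs)
with open('vectorizer.pkl', 'wb') as f:  # hypothetical path
    pickle.dump(vec, f)

# Inference: reload and only transform, never re-fit.
with open('vectorizer.pkl', 'rb') as f:
    vec = pickle.load(f)
test_features = vec.transform(test_docs)
print(train_features.shape, test_features.shape)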
Example #3
    def tf_word_feature(self, data_set):
        """
        Get TF feature by word
        :param data_set:
        :return:
        """
        data_set = get_word_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = CountVectorizer(analyzer='word',
                                              encoding='utf-8',
                                              lowercase=True,
                                              vocabulary=self.word_vocab)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        logger.debug('Vocab list:')
        count = 0
        for k, v in self.vectorizer.vocabulary_.items():
            if count < 10:
                logger.debug("%s	%s" % (k, v))
                count += 1
        feature_names = self.vectorizer.get_feature_names()
        logger.info('feature_names:%s' % feature_names[:20])

        logger.info(data_feature.shape)
        if not self.is_infer:
            dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
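Note that passing vocabulary= pins the feature space: fit_transform learns no new terms, so train and inference features stay aligned. A small scikit-learn demonstration:

from sklearn.feature_extraction.text import CountVectorizer

vocab = {'apple': 0, 'banana': 1}
vec = CountVectorizer(analyzer='word', lowercase=True, vocabulary=vocab)
features = vec.fit_transform(['apple banana cherry', 'banana banana'])
print(features.toarray())  # 'cherry' is ignored; columns follow the fixed vocab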
Example #4
    def tfidf_word_feature(self, data_set):
        """
        Get TFIDF ngram feature by word
        :param data_set:
        :return:
        """
        data_set = get_word_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = TfidfVectorizer(analyzer='word',
                                              ngram_range=(1, 2),
                                              sublinear_tf=True)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        print('Vocab size:', len(vocab))
        print('Vocab list:')
        count = 0
        for k, v in self.vectorizer.vocabulary_.items():
            if count < 10:
                print(k, v)
                count += 1

        print('\nTFIDF term-frequency matrix:')
        print('data_feature shape:', data_feature.shape)
        print(data_feature.toarray())
        if not self.is_infer:
            dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
Example #5
def load_data(state):
    datafile = DataFile('/data/polyvore/processed/tuples',
                        '/data/polyvore/processed/image_list')
    image_list = datafile.image_list
    fashion_sets, fashion_items = load_pkl('/data/polyvore/processed/pickles')
    positive_tuple, negative_tuples = datafile.get_tuples(state, repeated=False)
    return image_list, positive_tuple, negative_tuples, fashion_items
Example #6
def infer_classic(model_save_path,
                  test_data_path,
                  thresholds=0.5,
                  pred_save_path=None,
                  vectorizer_path=None,
                  col_sep=',',
                  num_classes=2,
                  feature_type='tf'):
    # load model
    model = load_pkl(model_save_path)
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()

    if num_classes == 2:
        # binary classification
        label_pred_probas = model.predict_proba(data_feature)[:, 1]
        label_pred = label_pred_probas > thresholds
    else:
        label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
Example #7
def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data=data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save_predict_result(pred_output,
                        ture_labels=None,
                        pred_save_path=pred_save_path,
                        data_set=data_set)

    # evaluate
    if true_labels:
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
        except Exception:
            print("error. no true labels")

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight_dict = load_dict(config.lr_feature_weight_path)
        pred_labels = cal_multiclass_lr_predict(data_set, feature_weight_dict,
                                                id_label)
        print(pred_labels[:5])
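The id/label bookkeeping above is an inverted vocabulary plus a row-wise argmax over the probability matrix. A numpy-only sketch with a toy label map:

import numpy as np

label_id = {'negative': 0, 'positive': 1}       # as loaded by load_vocab
id_label = {v: k for k, v in label_id.items()}  # inverted map

pred_label_probs = np.array([[0.2, 0.8], [0.9, 0.1]])
pred_labels = [id_label[row.argmax()] for row in pred_label_probs]
pred_output = ['%s\t%.4f' % (id_label[row.argmax()], row.max())
               for row in pred_label_probs]
print(pred_labels)  # ['positive', 'negative']
print(pred_output)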
Example #8
def build_word_embedding(path,
                         overwrite=False,
                         sentence_w2v_path=None,
                         word_vocab_path=None,
                         word_vocab_start=2,
                         w2v_dim=256):
    if os.path.exists(path) and not overwrite:
        print("already has $s and use it." % path)
        return load_pkl(path)
    word_vocab = load_vocab(word_vocab_path)
    w2v_dict_full = load_pkl(sentence_w2v_path)
    word_vocab_count = len(w2v_dict_full) + word_vocab_start
    word_emb = np.zeros((word_vocab_count, w2v_dim), dtype='float32')
    for word in word_vocab:
        index = word_vocab[word]
        if word in w2v_dict_full:
            word_emb[index, :] = w2v_dict_full[word]
        else:
            random_vec = np.random.uniform(-0.25, 0.25,
                                           size=(w2v_dim, )).astype('float32')
            word_emb[index, :] = random_vec
    # save
    dump_pkl(word_emb, path, overwrite=True)
    return word_emb
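The construction above follows the standard pattern: copy pretrained vectors where available and fall back to small uniform-random vectors for out-of-vocabulary words. A minimal numpy sketch with toy inputs:

import numpy as np

w2v = {'cat': np.ones(4, dtype='float32')}  # toy pretrained vectors
vocab = {'cat': 2, 'dog': 3}                # word indices start at 2 (reserved rows)
emb = np.zeros((len(vocab) + 2, 4), dtype='float32')
for word, index in vocab.items():
    if word in w2v:
        emb[index] = w2v[word]
    else:  # OOV word: random init
        emb[index] = np.random.uniform(-0.25, 0.25, size=4).astype('float32')
print(emb)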
Example #9
def build_pos_embedding(path,
                        overwrite=False,
                        pos_vocab_path=None,
                        pos_vocab_start=1,
                        pos_dim=64):
    if os.path.exists(path) and not overwrite:
        print("already has $s and use it." % path)
        return load_pkl(path)
    pos_vocab = load_vocab(pos_vocab_path)
    pos_vocab_count = len(pos_vocab) + pos_vocab_start
    pos_emb = np.random.normal(size=(pos_vocab_count, pos_dim)).astype('float32')
    for i in range(pos_vocab_start):
        pos_emb[i, :] = 0.
    # save
    dump_pkl(pos_emb, path, overwrite=True)
    return pos_emb
Example #10
    def tfidf_char_feature(self, data_set):
        """
        Get TF-IDF features by character n-grams.
        :param data_set: iterable of text samples
        :return: sparse feature matrix
        """
        data_set = get_char_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), sublinear_tf=True)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        logger.info(data_feature.shape)
        if not self.is_infer:
            save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
Example #11
    def tf_word_feature(self, data_set):
        """
        Get term-frequency features by word.
        :param data_set: iterable of text samples
        :return: sparse feature matrix
        """
        data_set = get_word_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = CountVectorizer(vocabulary=self.word_vocab)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        feature_names = self.vectorizer.get_feature_names()
        logger.info('feature_names:%s' % feature_names[:20])
        logger.info(data_feature.shape)
        if not self.is_infer:
            save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
Example #12
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('Unsupported feature type; falling back to tfidf_word.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show each category top features
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(),
                                        weight),
                                    key=lambda k: k[1],
                                    reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)

    # evaluate
    eval(model,
         X_val,
         y_val,
         num_classes=num_classes,
         pr_figure_path=pr_figure_path)
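The debug branch ranks features by their logistic-regression weight per class. A self-contained sketch of that inspection on toy data; note that get_feature_names() used above is deprecated in recent scikit-learn in favor of get_feature_names_out():

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

docs = ['good great fine', 'bad awful poor', 'good nice', 'bad poor']
labels = [1, 0, 1, 0]

vec = TfidfVectorizer()
X = vec.fit_transform(docs)
clf = LogisticRegression().fit(X, labels)

names = vec.get_feature_names_out()  # modern scikit-learn API
for idx, weights in enumerate(clf.coef_):
    top = sorted(zip(names, weights), key=lambda k: k[1], reverse=True)
    print('class', idx, top[:3])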
Example #13
        # NOTE: this snippet begins partway through a Decoder.call method.
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        out = self.fc(output)

        return x, out, state


class Pointer(tf.keras.layers.Layer):

    def __init__(self):
        super(Pointer, self).__init__()
        self.w_s_reduce = tf.keras.layers.Dense(1)
        self.w_i_reduce = tf.keras.layers.Dense(1)
        self.w_c_reduce = tf.keras.layers.Dense(1)

    def call(self, context_vector, state, dec_inp):
        return tf.nn.sigmoid(self.w_s_reduce(state) + self.w_c_reduce(context_vector) + self.w_i_reduce(dec_inp))


if __name__ == '__main__':
    from utils.data_utils import load_pkl
    word2vec = load_pkl('../datasets/word2vec.txt')
    print(word2vec)
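The Pointer layer above computes the generation probability p_gen = sigmoid(w_c * c_t + w_s * s_t + w_i * x_t) from the pointer-generator architecture. A quick shape check with random tensors, assuming the Pointer class as defined above:

import tensorflow as tf

pointer = Pointer()
context_vector = tf.random.normal((8, 256))  # (batch, hidden)
state = tf.random.normal((8, 256))           # (batch, hidden)
dec_inp = tf.random.normal((8, 128))         # (batch, embedding_dim)
p_gen = pointer(context_vector, state, dec_inp)
print(p_gen.shape)  # (8, 1); values lie in (0, 1)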
Example #14
def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save(pred_output,
         ture_labels=None,
         pred_save_path=pred_save_path,
         data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
                logger.debug('=' * 43)
    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))