def infer_cnn(data_path, model_path, word_vocab_path, pos_vocab_path,
              label_vocab_path, word_emb_path, pos_emb_path, batch_size,
              pred_save_path=None):
    # init dicts
    word_vocab, pos_vocab, label_vocab = load_vocab(word_vocab_path), \
        load_vocab(pos_vocab_path), load_vocab(label_vocab_path)
    word_emb, pos_emb = load_pkl(word_emb_path), load_pkl(pos_emb_path)
    word_test, pos_test = test_reader(data_path, word_vocab, pos_vocab,
                                      label_vocab)
    # init model
    model = Model(config.max_len, word_emb, pos_emb, label_vocab=label_vocab)
    ckpt_path = get_ckpt_path(model_path)
    if ckpt_path:
        print("Read model parameters from %s" % ckpt_path)
        model.saver.restore(model.sess, ckpt_path)
    else:
        print("Can't find the checkpoint, stopping.")
        return
    label_pred = model.predict(word_test, pos_test, batch_size)
    save(label_pred, pred_save_path=pred_save_path)
    print("finish prediction.")
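# A minimal usage sketch for infer_cnn; all paths below are hypothetical
# placeholders for artifacts produced by a matching training run (vocab files,
# pickled embeddings, and a TensorFlow checkpoint directory):
# infer_cnn(data_path='data/test.txt',
#           model_path='output/cnn_model',
#           word_vocab_path='output/word_vocab.txt',
#           pos_vocab_path='output/pos_vocab.txt',
#           label_vocab_path='output/label_vocab.txt',
#           word_emb_path='output/word_emb.pkl',
#           pos_emb_path='output/pos_emb.pkl',
#           batch_size=64,
#           pred_save_path='output/pred.txt')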
def tfidf_char_feature(self, data_set):
    """
    Get TFIDF feature by char
    :param data_set:
    :return:
    """
    data_set = get_char_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = TfidfVectorizer(analyzer='char',
                                          ngram_range=(1, 2),
                                          sublinear_tf=True)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    logger.debug('Vocab list:')
    count = 0
    for k, v in self.vectorizer.vocabulary_.items():
        if count < 10:
            logger.debug("%s %s" % (k, v))
            count += 1
    logger.info(data_feature.shape)
    if not self.is_infer:
        dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
def tf_word_feature(self, data_set):
    """
    Get TF feature by word
    :param data_set:
    :return:
    """
    data_set = get_word_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = CountVectorizer(analyzer='word',
                                          encoding='utf-8',
                                          lowercase=True,
                                          vocabulary=self.word_vocab)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    logger.debug('Vocab list:')
    count = 0
    for k, v in self.vectorizer.vocabulary_.items():
        if count < 10:
            logger.debug("%s %s" % (k, v))
            count += 1
    feature_names = self.vectorizer.get_feature_names()
    logger.info('feature_names:%s' % feature_names[:20])
    logger.info(data_feature.shape)
    if not self.is_infer:
        dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
def tfidf_word_feature(self, data_set):
    """
    Get TFIDF ngram feature by word
    :param data_set:
    :return:
    """
    data_set = get_word_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = TfidfVectorizer(analyzer='word',
                                          ngram_range=(1, 2),
                                          sublinear_tf=True)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    print('Vocab size:', len(vocab))
    print('Vocab list:')
    count = 0
    for k, v in self.vectorizer.vocabulary_.items():
        if count < 10:
            print(k, v)
            count += 1
    print('\nTFIDF term-frequency matrix:')
    print('data_feature shape:', data_feature.shape)
    print(data_feature.toarray())
    # only persist the fitted vectorizer when training, so inference
    # never overwrites it (consistent with the sibling feature methods)
    if not self.is_infer:
        dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
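# Sketch of the fit/infer split the three feature methods above rely on,
# using the Feature wrapper as it is called elsewhere in this repo; the
# sample variable names (train_texts, test_texts) are made up:
# train_feat = Feature(data=train_texts, feature_type='tfidf_word',
#                      feature_vec_path='output/vec.pkl').get_feature()
# test_feat = Feature(data=test_texts, feature_type='tfidf_word',
#                     feature_vec_path='output/vec.pkl',
#                     is_infer=True).get_feature()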
def load_data(state):
    datafile = DataFile('/data/polyvore/processed/tuples',
                        '/data/polyvore/processed/image_list')
    image_list = datafile.image_list
    fashion_sets, fashion_items = load_pkl('/data/polyvore/processed/pickles')
    positive_tuple, negative_tuples = datafile.get_tuples(state,
                                                          repeated=False)
    return image_list, positive_tuple, negative_tuples, fashion_items
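# Hypothetical call; 'train' stands in for whatever split names
# DataFile.get_tuples expects:
# image_list, pos_tuples, neg_tuples, items = load_data('train')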
def infer_classic(model_save_path, test_data_path, thresholds=0.5,
                  pred_save_path=None, vectorizer_path=None, col_sep=',',
                  num_classes=2, feature_type='tf'):
    # load model
    model = load_pkl(model_save_path)
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=vectorizer_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    if num_classes == 2:
        # binary classification
        label_pred_probas = model.predict_proba(data_feature)[:, 1]
        label_pred = label_pred_probas > thresholds
    else:
        label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
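# Example invocation of the binary variant with the default 0.5 decision
# threshold; all paths are placeholders for artifacts saved during training:
# infer_classic(model_save_path='output/lr_model.pkl',
#               test_data_path='data/test.csv',
#               thresholds=0.5,
#               pred_save_path='output/pred.txt',
#               vectorizer_path='output/vec.pkl',
#               num_classes=2,
#               feature_type='tf')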
def infer_classic(model_type='xgboost_lr', model_save_path='',
                  label_vocab_path='', test_data_path='', pred_save_path='',
                  feature_vec_path='', col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data=data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    # predict
    pred_label_probs = model.predict_proba(data_feature)
    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save_predict_result(pred_output, true_labels=None,
                        pred_save_path=pred_save_path, data_set=data_set)
    # evaluate
    if true_labels:
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
        except Exception:
            print("error: no true labels")
    # analyze lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight_dict = load_dict(config.lr_feature_weight_path)
        pred_labels = cal_multiclass_lr_predict(data_set, feature_weight_dict,
                                                id_label)
        print(pred_labels[:5])
def build_word_embedding(path, overwrite=False, sentence_w2v_path=None,
                         word_vocab_path=None, word_vocab_start=2,
                         w2v_dim=256):
    if os.path.exists(path) and not overwrite:
        print("%s already exists, loading it." % path)
        return load_pkl(path)
    word_vocab = load_vocab(word_vocab_path)
    w2v_dict_full = load_pkl(sentence_w2v_path)
    word_vocab_count = len(w2v_dict_full) + word_vocab_start
    word_emb = np.zeros((word_vocab_count, w2v_dim), dtype='float32')
    for word in word_vocab:
        index = word_vocab[word]
        if word in w2v_dict_full:
            word_emb[index, :] = w2v_dict_full[word]
        else:
            # words missing from w2v get a small random vector
            random_vec = np.random.uniform(-0.25, 0.25,
                                           size=(w2v_dim,)).astype('float32')
            word_emb[index, :] = random_vec
    # save
    dump_pkl(word_emb, path, overwrite=True)
    return word_emb
def build_pos_embedding(path, overwrite=False, pos_vocab_path=None,
                        pos_vocab_start=1, pos_dim=64):
    if os.path.exists(path) and not overwrite:
        print("%s already exists, loading it." % path)
        return load_pkl(path)
    pos_vocab = load_vocab(pos_vocab_path)
    pos_vocab_count = len(pos_vocab) + pos_vocab_start
    pos_emb = np.random.normal(size=(pos_vocab_count, pos_dim)).astype('float32')
    # zero out the reserved leading rows (e.g. padding)
    for i in range(pos_vocab_start):
        pos_emb[i, :] = 0.
    # save
    dump_pkl(pos_emb, path, overwrite=True)
    return pos_emb
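# Sketch of building both embedding tables before training; the paths are
# placeholders, and the dimensions follow the defaults above:
# word_emb = build_word_embedding('output/word_emb.pkl',
#                                 sentence_w2v_path='output/w2v.pkl',
#                                 word_vocab_path='output/word_vocab.txt')
# pos_emb = build_pos_embedding('output/pos_emb.pkl',
#                               pos_vocab_path='output/pos_vocab.txt')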
def tfidf_char_feature(self, data_set):
    """
    Get TFIDF feature by char
    :param data_set:
    :return:
    """
    data_set = get_char_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = TfidfVectorizer(analyzer='char',
                                          ngram_range=(1, 2),
                                          sublinear_tf=True)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    logger.info(data_feature.shape)
    if not self.is_infer:
        save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
def tf_word_feature(self, data_set):
    """
    Get TF feature by word
    :param data_set:
    :return:
    """
    data_set = get_word_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = CountVectorizer(vocabulary=self.word_vocab)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    feature_names = self.vectorizer.get_feature_names()
    logger.info('feature_names:%s' % feature_names[:20])
    logger.info(data_feature.shape)
    if not self.is_infer:
        save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
def train_classic(model_type='logistic_regression', data_path='',
                  model_save_path='', feature_vec_path='', col_sep='\t',
                  feature_type='tfidf_word', min_count=1, word_vocab_path='',
                  label_vocab_path='', pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True,
                             lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label vocab
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type error, using tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analyze lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show top features of each category
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(),
                                        weight),
                                    key=lambda k: k[1], reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes,
         pr_figure_path=pr_figure_path)
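# End-to-end training call; every path below is a placeholder:
# train_classic(model_type='logistic_regression',
#               data_path='data/train.txt',
#               model_save_path='output/lr_model.pkl',
#               feature_vec_path='output/vec.pkl',
#               feature_type='tfidf_word',
#               word_vocab_path='output/word_vocab.txt',
#               label_vocab_path='output/label_vocab.txt',
#               pr_figure_path='output/pr_curve.png')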
        # (excerpt from the decoder's call(): x is the embedded decoder input,
        # context_vector comes from the attention layer)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        out = self.fc(output)

        return x, out, state


class Pointer(tf.keras.layers.Layer):
    def __init__(self):
        super(Pointer, self).__init__()
        self.w_s_reduce = tf.keras.layers.Dense(1)
        self.w_i_reduce = tf.keras.layers.Dense(1)
        self.w_c_reduce = tf.keras.layers.Dense(1)

    def call(self, context_vector, state, dec_inp):
        # p_gen = sigmoid(W_s * state + W_c * context + W_i * decoder input)
        return tf.nn.sigmoid(self.w_s_reduce(state) +
                             self.w_c_reduce(context_vector) +
                             self.w_i_reduce(dec_inp))


if __name__ == '__main__':
    from utils.data_utils import load_pkl
    word2vec = load_pkl('../datasets/word2vec.txt')
    print(word2vec)
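# A quick shape check for the Pointer layer above (assumes TensorFlow 2.x);
# the batch size and feature dimensions are arbitrary:
def _pointer_shape_check():
    pointer = Pointer()
    p_gen = pointer(tf.random.normal((4, 128)),   # decoder state
                    tf.random.normal((4, 128)),   # attention context vector
                    tf.random.normal((4, 256)))   # embedded decoder input
    # each Dense(1) maps (4, d) -> (4, 1), so p_gen is one probability
    # per batch item
    print(p_gen.shape)  # (4, 1)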
def infer_classic(model_type='xgboost_lr', model_save_path='',
                  label_vocab_path='', test_data_path='', pred_save_path='',
                  feature_vec_path='', col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    # predict
    pred_label_probs = model.predict_proba(data_feature)
    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save(pred_output, true_labels=None, pred_save_path=pred_save_path,
         data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        # show how each category's saved feature weights score the first lines
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
            logger.debug('=' * 43)
    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))