import numpy as np
from tqdm import tqdm

# module-level cache mapping a description's hash to its n-gram id list
cache = {}


def batch_generator(descs, batch_size, train_vocab, word_ngrams, sort_ngrams,
                    shuffle=False, show_progress=True):
    global cache
    inds = np.arange(len(descs))
    rem_inds, batch_inds = next_batch(inds, batch_size, shuffle)
    if show_progress:
        progress_bar = tqdm(total=int(np.ceil(len(descs) / batch_size)))
    while len(batch_inds) > 0:
        batch_descs = [descs[i] for i in batch_inds]
        desc_hashes = [hash(str(desc)) for desc in batch_descs]
        # reuse cached id lists where possible, otherwise map each n-gram to
        # its vocabulary id (the leading 0 is the padding index)
        batch = [[0] + [train_vocab[phrase]["id"]
                        for phrase in get_all(desc, word_ngrams, sort_ngrams)
                        if phrase in train_vocab]
                 if h not in cache else cache[h]
                 for desc, h in zip(batch_descs, desc_hashes)]
        for h, ids in zip(desc_hashes, batch):
            if h not in cache:
                cache[h] = ids
        # uniform 1/len weights turn the later weighted sum into a mean
        batch_weights = [[1 / len(i) for _ in range(len(i))] for i in batch]
        # pad ids and weights to the longest description in the batch
        cur_lens = np.array([len(i) for i in batch])
        mx_len = max(cur_lens)
        to_pad = mx_len - cur_lens
        batch = [i + [0] * pad for i, pad in zip(batch, to_pad)]
        batch_weights = [i + [0] * pad for i, pad in zip(batch_weights, to_pad)]
        rem_inds, batch_inds = next_batch(rem_inds, batch_size, shuffle)
        if show_progress:
            progress_bar.update()
        yield batch, np.expand_dims(batch_weights, axis=2)
    if show_progress:
        progress_bar.close()
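# The listings in this section call two helpers, next_batch and get_all, that
# are defined elsewhere in the project. The definitions below are minimal
# sketches inferred from how the helpers are used here, not the project's
# actual code.
import numpy as np


def next_batch(indices, batch_size, shuffle=False):
    # split one batch of indices off the front; returns (remaining, batch)
    if shuffle:
        indices = np.random.permutation(indices)
    return indices[batch_size:], indices[:batch_size]


def get_all(tokens, word_ngrams, sort_ngrams=False):
    # yield every 1..word_ngrams-gram of the token list; with sort_ngrams the
    # tokens inside each n-gram are sorted, so word order is ignored
    for n in range(1, word_ngrams + 1):
        for i in range(len(tokens) - n + 1):
            ngram = tokens[i:i + n]
            yield "_".join(sorted(ngram) if sort_ngrams else ngram)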
def batch_generator(descs, labels, batch_size, train_vocab, labels_lookup,
                    word_ngrams, shuffle=False):
    global cache
    inds = np.arange(len(descs))
    rem_inds, batch_inds = next_batch(inds, batch_size, shuffle)
    while len(batch_inds) > 0:
        batch_descs = [descs[i] for i in batch_inds]
        desc_hashes = [hash(str(desc)) for desc in batch_descs]
        batch = [[0] + [train_vocab[phrase]["id"]
                        for phrase in get_all(desc, word_ngrams)
                        if phrase in train_vocab]
                 if h not in cache else cache[h]
                 for desc, h in zip(batch_descs, desc_hashes)]
        for h, ids in zip(desc_hashes, batch):
            if h not in cache:
                cache[h] = ids
        batch_weights = [[1 / len(i) for _ in range(len(i))] for i in batch]
        # map raw labels to integer class ids for the loss
        batch_labels = [labels_lookup[labels[i]] for i in batch_inds]
        cur_lens = np.array([len(i) for i in batch])
        mx_len = max(cur_lens)
        to_pad = mx_len - cur_lens
        batch = [i + [0] * pad for i, pad in zip(batch, to_pad)]
        batch_weights = [i + [0] * pad for i, pad in zip(batch_weights, to_pad)]
        rem_inds, batch_inds = next_batch(rem_inds, batch_size, shuffle)
        yield batch, np.expand_dims(batch_weights, axis=2), batch_labels
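# Minimal usage sketch for the supervised generator above; the vocabulary,
# label lookup, and hyper-parameter values are made-up placeholders.
train_vocab = {"hello": {"id": 1}, "world": {"id": 2}, "hello_world": {"id": 3}}
labels_lookup = {"__label__greeting": 0}
descs = [["hello", "world"]]
labels = ["__label__greeting"]

for batch, batch_weights, batch_labels in batch_generator(
        descs, labels, batch_size=32, train_vocab=train_vocab,
        labels_lookup=labels_lookup, word_ngrams=2, shuffle=True):
    # batch: padded matrix of n-gram ids; batch_weights: (B, T, 1) mean-
    # pooling weights; batch_labels: integer class ids for the loss
    pass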
def _batch_generator(self, list_of_texts, batch_size):
    """
    Generate batch from list of texts
    :param list_of_texts: list/array
    :param batch_size: int
    :return: batch word indices, batch word weights
    """
    if self.preprocessing_function:
        list_of_texts = [self.preprocessing_function(str(t)) for t in list_of_texts]
    else:
        list_of_texts = [str(t) for t in list_of_texts]
    inds = np.arange(len(list_of_texts))
    rem_inds, batch_inds = next_batch(inds, batch_size)
    while len(batch_inds) > 0:
        batch, batch_weights = [], []
        descs_words = [list(get_all(list_of_texts[ind].split(),
                                    self.info["word_ngrams"],
                                    self.info["sort_ngrams"]))
                       for ind in batch_inds]
        num_max_words = max([len(desc_split) for desc_split in descs_words]) + 1
        for desc_words in descs_words:
            init_test_inds = [0] + [self.train_vocab[phrase]["id"]
                                    for phrase in desc_words
                                    if phrase in self.train_vocab]
            test_desc_inds = init_test_inds + \
                [0 for _ in range(num_max_words - len(init_test_inds))]
            test_desc_weights = np.zeros_like(test_desc_inds, dtype=float)
            test_desc_weights[:len(init_test_inds)] = 1. / len(init_test_inds)
            batch.append(test_desc_inds)
            batch_weights.append(test_desc_weights)
        rem_inds, batch_inds = next_batch(rem_inds, batch_size)
        batch_weights = np.expand_dims(batch_weights, 2)
        batch = np.array(batch)
        yield batch, batch_weights
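# The uniform 1/len weights built above implement mean pooling: an embedding
# lookup multiplied by these weights and summed over the time axis averages
# the real positions, while zero-weighted padding drops out. Tiny standalone
# numeric check (toy values, not the model's embeddings):
import numpy as np

emb = np.array([[0.0, 0.0], [1.0, 2.0], [3.0, 4.0]])  # row 0 is padding
ids = np.array([1, 2, 0])                # one description, padded to length 3
w = np.array([0.5, 0.5, 0.0])[:, None]   # 1/2 for the two real ids, 0 for pad
pooled = (emb[ids] * w).sum(axis=0)      # [2.0, 3.0] == mean of rows 1 and 2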
def _batch_generator(self, list_of_texts, batch_size, show_progress=False):
    """
    Generate batch from list of texts
    :param list_of_texts: list/array
    :param batch_size: int
    :param show_progress: bool, show progress bar
    :return: batch word indices, batch word weights
    """
    if self.preprocessing_function:
        list_of_texts = [self.preprocessing_function(str(text)) for text in list_of_texts]
    else:
        list_of_texts = [str(text) for text in list_of_texts]
    indices = np.arange(len(list_of_texts))
    remaining_indices, batch_indices = next_batch(indices, batch_size)
    if len(list_of_texts) <= batch_size:
        show_progress = False
    progress_bar = tqdm(total=int(np.ceil(len(list_of_texts) / batch_size)),
                        disable=not show_progress)
    while len(batch_indices) > 0:
        batch, batch_weights = [], []
        batch_descriptions = [list(get_all(list_of_texts[index].split(),
                                           self.info["word_ngrams"],
                                           self.info["sort_ngrams"]))
                              for index in batch_indices]
        num_max_words = max([len(description)
                             for description in batch_descriptions]) + 1
        for description in batch_descriptions:
            initial_indices = [0] + [self.word_dict[phrase]["id"]
                                     for phrase in description
                                     if phrase in self.word_dict]
            description_indices = np.array(
                initial_indices + [0] * (num_max_words - len(initial_indices)))
            description_weights = np.zeros_like(description_indices,
                                                dtype=np.float32)
            description_weights[:len(initial_indices)] = 1. / len(initial_indices)
            batch.append(description_indices)
            batch_weights.append(description_weights)
        remaining_indices, batch_indices = next_batch(remaining_indices,
                                                      batch_size)
        progress_bar.update()
        yield batch, batch_weights
    progress_bar.close()
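# Sketch of how the generator above might be consumed for inference. The
# tensor names are the ones used by the evaluation script below; `self.sess`
# (an open tf.Session with the frozen graph imported) and the method name are
# assumptions of this sketch, shown unbound but meant to live on the same
# model wrapper class.
import numpy as np


def predict_probabilities(self, list_of_texts, batch_size=1024):
    probabilities = []
    for batch, batch_weights in self._batch_generator(list_of_texts, batch_size):
        probabilities.append(self.sess.run(
            "prediction:0",
            feed_dict={"input:0": batch, "input_weights:0": batch_weights}))
    return np.concatenate(probabilities, axis=0)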
import argparse
import json
import os

import numpy as np
import tensorflow as tf
from tqdm import tqdm


def str2bool(value):
    # argparse's type=bool treats any non-empty string (including "False")
    # as True, so boolean flags are parsed explicitly
    return str(value).lower() in ("true", "1", "yes")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-md", "--model_dir", type=str,
                        help="path where model_best.pb and model_params.json are")
    parser.add_argument("-tp", "--test_path", type=str, help="path to test file")
    parser.add_argument("-lp", "--label_prefix", type=str, default="__label__",
                        help="label prefix")
    parser.add_argument("-bs", "--batch_size", type=int, default=1024,
                        help="batch size for inference")
    parser.add_argument("-k", "--top_k", type=int, default=1,
                        help="calculate accuracy on top k predictions")
    parser.add_argument("-hc", "--hand_check", type=str2bool, default=False,
                        help="test on manually inputted data")
    parser.add_argument("-gpu", "--use_gpu", type=str2bool, default=True,
                        help="use gpu for inference")
    parser.add_argument("-gpu_fr", "--gpu_fraction", type=float, default=0.4,
                        help="what fraction of gpu to allocate")
    args = parser.parse_args()

    model_dir = args.model_dir
    model_params_path = os.path.join(model_dir, "model_params.json")
    model_path = os.path.join(model_dir, "model_best.pb")
    label_prefix = args.label_prefix

    if args.use_gpu:
        device = "/gpu:0"
        config = tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=args.gpu_fraction,
                allow_growth=True))
    else:
        device = "/cpu:0"
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        config = tf.ConfigProto(allow_soft_placement=True)

    num_thrown_for_label = 0
    with open(model_params_path, "r") as infile:
        model_params = json.load(infile)

    # fall back to the copies stored next to the model if the recorded
    # paths no longer exist
    if os.path.isfile(model_params["label_dict_path"]):
        with open(model_params["label_dict_path"], "r") as infile:
            label_dict = json.load(infile)
    else:
        with open(os.path.join(model_dir, "label_dict.json"), "r") as infile:
            label_dict = json.load(infile)

    if os.path.isfile(model_params["word_dict_path"]):
        with open(model_params["word_dict_path"], "r") as infile:
            word_dict = json.load(infile)
    else:
        with open(os.path.join(model_dir, "word_dict.json"), "r") as infile:
            word_dict = json.load(infile)

    word_ngrams = model_params["word_ngrams"]
    sort_ngrams = model_params["sort_ngrams"]

    labels_dict_inverse = {label_info["id"]: label
                           for label, label_info in label_dict.items()}

    with tf.device(device):
        with tf.Session(config=config) as sess:
            run_arg = load_graph(model_path,
                                 ["input:0", "input_weights:0", "prediction:0"])
            if args.hand_check:
                while True:
                    query_description = input("Enter the description: ")
                    label_tokens = [token for token in query_description.split()
                                    if token.startswith(label_prefix)]
                    if not label_tokens:
                        print("No label provided")
                        continue
                    label = label_tokens[0].split(label_prefix)[-1]
                    if label not in label_dict:
                        print("New label")
                        continue
                    # drop the label token itself instead of stripping a fixed
                    # number of leading characters
                    query_description = " ".join(
                        token for token in query_description.split()
                        if not token.startswith(label_prefix))
                    test_description_indices = np.expand_dims(
                        [0] + [word_dict[phrase]["id"]
                               for phrase in get_all(query_description.split(),
                                                     word_ngrams, sort_ngrams)
                               if phrase in word_dict], axis=0)
                    test_desc_weights = np.zeros_like(test_description_indices,
                                                      dtype=np.float32)
                    test_desc_weights[0][:len(test_description_indices[0])] = \
                        1. / len(test_description_indices[0])
                    probabilities = np.squeeze(
                        sess.run(run_arg[-1],
                                 feed_dict={run_arg[0]: test_description_indices,
                                            run_arg[1]: test_desc_weights}))
                    max_index = np.argmax(probabilities)
                    max_prob = probabilities[max_index]
                    predicted_label = labels_dict_inverse[max_index]
                    print(predicted_label == label, predicted_label, max_prob)
            else:
                test_descriptions, test_labels = parse_txt(args.test_path)
                test_indices = np.arange(len(test_descriptions))
                print("The total number of test datapoints: {}".format(
                    len(test_descriptions)))

                progress_bar = tqdm(total=int(np.ceil(len(test_descriptions) /
                                                      args.batch_size)))
                remaining_indices, batch_indices = next_batch(test_indices,
                                                              args.batch_size)
                accuracy_top_1, accuracy_top_k = 0, 0
                while len(batch_indices) > 0:
                    batch_descriptions = [test_descriptions[i] for i in batch_indices]
                    batch_labels = [test_labels[i] for i in batch_indices]
                    batch, batch_weights, batch_label_ids = [], [], []

                    # upper bound on the number of n-grams any description in
                    # this batch can produce
                    max_words = max(len(description.split())
                                    for description in batch_descriptions)
                    num_max_words = 1
                    for ng in range(word_ngrams):
                        num_max_words += max_words - ng

                    for test_description, test_label in zip(batch_descriptions,
                                                            batch_labels):
                        if test_label not in label_dict:
                            num_thrown_for_label += 1
                            continue
                        initial_test_indices = [0] + [
                            word_dict[phrase]["id"]
                            for phrase in get_all(test_description.split(),
                                                  word_ngrams, sort_ngrams)
                            if phrase in word_dict]
                        test_description_indices = np.array(
                            initial_test_indices +
                            [0] * (num_max_words - len(initial_test_indices)))
                        test_description_weights = np.zeros_like(
                            test_description_indices, dtype=np.float32)
                        test_description_weights[:len(initial_test_indices)] = \
                            1. / len(initial_test_indices)
                        batch.append(test_description_indices)
                        batch_weights.append(test_description_weights)
                        batch_label_ids.append(label_dict[test_label]["id"])

                    probabilities = sess.run(run_arg[-1],
                                             feed_dict={run_arg[0]: batch,
                                                        run_arg[1]: batch_weights})
                    top_k = [np.argsort(p)[-args.top_k:] for p in probabilities]
                    accuracy_top_k += sum(label_id in preds for label_id, preds
                                          in zip(batch_label_ids, top_k))
                    accuracy_top_1 += sum(label_id == preds[-1] for label_id, preds
                                          in zip(batch_label_ids, top_k))
                    remaining_indices, batch_indices = next_batch(remaining_indices,
                                                                  args.batch_size)
                    progress_bar.update()
                progress_bar.close()

                print("{} datapoints thrown away because of unseen labels".format(
                    num_thrown_for_label))
                print("Number of test datapoints after cleaning: {}".format(
                    len(test_descriptions) - num_thrown_for_label))
                print("Number of unique labels in test after cleaning: {}".format(
                    len(set(test_labels))))
                # thrown datapoints stay in the denominator, i.e. count as errors
                print("Accuracy: {}".format(
                    round(100 * accuracy_top_1 / len(test_descriptions), 2)))
                print("Accuracy top {}: {}".format(
                    args.top_k,
                    round(100 * accuracy_top_k / len(test_descriptions), 2)))
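# The script above also relies on load_graph and parse_txt, which are defined
# elsewhere in the project. Below are hedged sketches inferred from the call
# sites (TF1-style frozen-graph loading and fastText-format parsing), not the
# project's exact code.
import tensorflow as tf


def load_graph(model_path, tensor_names):
    # import a frozen .pb graph and return the requested tensors by name
    with tf.gfile.GFile(model_path, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name="")
    graph = tf.get_default_graph()
    return [graph.get_tensor_by_name(name) for name in tensor_names]


def parse_txt(path, label_prefix="__label__"):
    # fastText format: each line holds a label token plus the description
    descriptions, labels = [], []
    with open(path, "r") as infile:
        for line in infile:
            tokens = line.strip().split()
            label = [t for t in tokens if t.startswith(label_prefix)][0]
            labels.append(label[len(label_prefix):])
            descriptions.append(" ".join(t for t in tokens
                                         if not t.startswith(label_prefix)))
    return descriptions, labels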