def scoring(self, type='pt', save_imgs=False, save_cams=False):
    """Evaluate the selected model on the test split and print a
    classification report.

    Args:
        type: 'pt' selects the pattern model; any other value selects
            the color model.  NOTE(review): shadows the builtin `type`,
            but renaming would break keyword callers — leave as is.
        save_imgs: if True, dump annotated prediction images per batch.
        save_cams: if True, dump class-activation maps per batch.

    Side effects: initializes and loads weights into the chosen model,
    optionally writes images/CAMs to disk, prints the report.
    """
    test_labels, pred_labels, total_batches = [], [], shutils.get_num_batches(
        self.num_test, cnt.BATCH_SIZE)
    if type == 'pt':
        # Pattern ("pt") model: its own encoder, output dirs and weights.
        encoder = shutils.load_data_pkl(cnt.PT_ENCODER_PATH)
        pred_out_dir = cnt.PT_PREDS_PATH
        cam_dir = cnt.PT_CAMS_PATH
        self.init_pt_model()
        model = self.pt_model
        model.load_weights(cnt.PT_MODEL_PATH)
    else:
        # Color model mirrors the pattern branch with its own artifacts.
        encoder = shutils.load_data_pkl(cnt.COLOR_ENCODER_PATH)
        pred_out_dir = cnt.COLOR_PREDS_PATH
        cam_dir = cnt.COLOR_CAMS_PATH
        self.init_color_model()
        model = self.color_model
        model.load_weights(cnt.COLOR_MODEL_PATH)
    num_batches, start = 0, 0
    # The generator is endless; we stop after exactly total_batches batches.
    for batch_data, batch_labels in self.data_generator(
            self.num_test, 'test', type):
        test_labels += batch_labels.tolist()
        predictions = self.predict(batch_data, type)
        pred_labels += predictions
        num_batches += 1
        # Global sample indices for this batch, used to name saved outputs.
        indices = [start + i for i in range(len(batch_labels))]
        if save_imgs:
            utils.save_imgs(batch_data, indices, np.array(batch_labels),
                            np.array(predictions), encoder, pred_out_dir)
        if save_cams:
            utils.cam(model, batch_data, indices, np.array(batch_labels),
                      np.array(predictions), encoder, cam_dir)
        start += len(batch_labels)
        if num_batches == total_batches:
            break
    # Keep only samples with at least one predicted label; inverse_transform
    # (and the report) would otherwise see all-zero multi-label rows.
    h = np.sum(np.array(pred_labels), axis=1)
    idx = np.nonzero(h > 0)[0]
    t_labels = encoder.inverse_transform(np.array(test_labels)[idx])
    p_labels = encoder.inverse_transform(np.array(pred_labels)[idx])
    print(classification_report(t_labels, p_labels))
def create_train_test():
    """Split the image array rows into train/test index sets and persist
    the fitted multi-label encoder plus the binarized labels.

    Side effects: writes pickles for the transformed labels, the fitted
    encoder, and the train/test index lists; prints the split sizes.
    """
    # Open OUTSIDE the try: in the original, a failure inside
    # tables.open_file left `img_arr_file` unbound and the finally block
    # raised NameError, masking the real error.
    img_arr_file = tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='r')
    try:
        img_arr = img_arr_file.root.data
        # 80/20 split over the row indices of the image array.
        train_indices, test_indices = train_test_split(
            range(img_arr.shape[0]), test_size=0.2)
        encoder = MultiLabelBinarizer()
        labels = shutils.load_data_pkl(cnt.LABELS_PATH)
        # Each entry is a '__'-delimited multi-label string.
        labels = [x.strip().split('__') for x in labels]
        transformed_labels = encoder.fit_transform(labels)
        shutils.save_data_pkl(transformed_labels, cnt.TRANSFORMED_LABELS_PATH)
        shutils.save_data_pkl(encoder, cnt.ENCODER_PATH)
        print(len(train_indices), len(test_indices))
        shutils.save_data_pkl(train_indices, cnt.TRAIN_INDICES_PATH)
        shutils.save_data_pkl(test_indices, cnt.TEST_INDICES_PATH)
    finally:
        img_arr_file.close()
def get_data_as_generator(num_data, prefix='train'):
    """Endlessly yield ([image_batch, text_batch], label_batch) for the
    requested split.

    Args:
        num_data: number of samples in the split; batches wrap around it.
        prefix: 'train' selects the train indices, anything else the test
            indices.

    Yields:
        ([img_array, txt_array], labels) per batch of size cnt.BATCH_SIZE
        (last batch may be smaller).
    """
    # Open OUTSIDE the try: in the original, a failure inside
    # tables.open_file left `img_arr_file` unbound and the finally block
    # raised NameError, masking the real error.  (As a generator, nothing
    # here runs until first iteration, same as before.)
    img_arr_file = tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='r')
    try:
        img_arr = img_arr_file.root.data
        txt_arr = np.array(shutils.load_data_pkl(cnt.INPUT_TENSOR_PATH))
        labels = np.array(shutils.load_data_pkl(cnt.TRANSFORMED_LABELS_PATH))
        # Fixed seed => reproducible shuffle order across runs.
        random.seed(42)
        if prefix == 'train':
            indices = shutils.load_data_pkl(cnt.TRAIN_INDICES_PATH)
        else:
            indices = shutils.load_data_pkl(cnt.TEST_INDICES_PATH)
        random.shuffle(indices)
        indices = np.array(indices)
        num_batches = int(math.ceil(float(num_data) / cnt.BATCH_SIZE))
        batch_num = 0
        while True:
            # Wrap around the split so the generator never exhausts.
            m = batch_num % num_batches
            start = m * cnt.BATCH_SIZE
            end = min((m + 1) * cnt.BATCH_SIZE, num_data)
            batch_indices = indices[start:end]
            # HDF5 rows must be gathered one-by-one (no fancy indexing here).
            out_img_arr = np.array([img_arr[x] for x in batch_indices])
            out_txt_arr = np.array([txt_arr[x] for x in batch_indices])
            batch_num += 1
            yield [out_img_arr, out_txt_arr], labels[batch_indices]
    finally:
        img_arr_file.close()
def get_data_as_generator(num_samples, type='train'):
    """Return a batched tf.data iterator over (source, target) tensor
    pairs for the requested split.

    Args:
        num_samples: kept for interface compatibility; not used here.
        type: 'train' for the training tensors, anything else for the
            validation tensors.

    Returns:
        An iterator over shuffled, batched (src, trg) pairs.
    """
    if type == 'train':
        src = shutils.load_data_pkl(cnt.SRC_TENSOR_TRAIN)
        trg = shutils.load_data_pkl(cnt.TRG_TENSOR_TRAIN)
    else:
        src = shutils.load_data_pkl(cnt.SRC_TENSOR_VALID)
        trg = shutils.load_data_pkl(cnt.TRG_TENSOR_VALID)
    # Shuffle buffer covers the whole split, then batch (partial last batch kept).
    dataset = tf.data.Dataset.from_tensor_slices((src, trg)).shuffle(len(src))
    dataset = dataset.batch(cnt.BATCH_SIZE, drop_remainder=False)
    return iter(dataset)
def get_data_as_generator(num_data, prefix='train'):
    """Endlessly yield paired word/char vector batches with labels.

    Args:
        num_data: number of pairs in the split; batches wrap around it.
        prefix: pickle-file prefix ('train' or 'test') for the data pairs.

    Yields:
        ([word_vecs_1, word_vecs_2, char_vecs_1, char_vecs_2], labels)
        where labels has a trailing singleton dimension.
    """
    # Fixed seed => reproducible shuffle order across runs.
    random.seed(42)
    word_model = utils.get_vector_model(cnt.VECTOR_MODEL, char_tokens=False)
    char_model = utils.get_vector_model(cnt.VECTOR_MODEL, char_tokens=True)
    pairs_path = os.path.join(cnt.PERSISTENCE_PATH, prefix + "_data_pairs.pkl")
    data_pairs = shutils.load_data_pkl(pairs_path)
    random.shuffle(data_pairs)
    total_batches = shutils.get_num_batches(num_data, cnt.BATCH_SIZE)
    step = 0
    while True:
        # Wrap around the split so the generator never exhausts.
        slot = step % total_batches
        lo = slot * cnt.BATCH_SIZE
        hi = min(lo + cnt.BATCH_SIZE, num_data)
        w_tokens1, w_tokens2, c_tokens1, c_tokens2, labels = zip(
            *data_pairs[lo:hi])
        labels = np.expand_dims(np.array(labels), -1)
        word_data_1 = shutils.get_vectors(word_model, w_tokens1,
                                          cnt.WORD_VECTOR_DIM)
        word_data_2 = shutils.get_vectors(word_model, w_tokens2,
                                          cnt.WORD_VECTOR_DIM)
        # Char vectors are built per sample, then stacked.
        char_data_1 = np.array([
            shutils.get_vectors(char_model, toks, cnt.CHAR_VECTOR_DIM)
            for toks in c_tokens1
        ])
        char_data_2 = np.array([
            shutils.get_vectors(char_model, toks, cnt.CHAR_VECTOR_DIM)
            for toks in c_tokens2
        ])
        step += 1
        yield [word_data_1, word_data_2, char_data_1, char_data_2], labels
def scoring(self):
    """Run the model over the test split and print a classification report
    with the encoder's class names."""
    total_batches = shutils.get_num_batches(self.num_test, cnt.BATCH_SIZE)
    encoder = shutils.load_data_pkl(cnt.ENCODER_PATH)
    true_rows, pred_rows = [], []
    # The generator is endless; stop after exactly total_batches batches.
    for seen, (batch_data, batch_labels) in enumerate(
            self.data_generator(self.num_test, 'test'), start=1):
        true_rows.extend(batch_labels.tolist())
        pred_rows.extend(self.predict(batch_data))
        if seen == total_batches:
            break
    t_labels = encoder.inverse_transform(np.array(true_rows))
    p_labels = encoder.inverse_transform(np.array(pred_rows))
    print(
        classification_report(t_labels, p_labels,
                              target_names=encoder.classes_))
# print("Reading input...") # utils.read_input_file() # print("Downloading images...") # utils.download_images() # print("Creating image data...") # utils.create_image_data() # print("Creating text data...") # utils.create_text_data() # print("Creating train test...") # utils.create_train_test() n = len(shutils.load_data_pkl(cnt.TRAIN_INDICES_PATH)) m = len(shutils.load_data_pkl(cnt.TEST_INDICES_PATH)) transf_labels = shutils.load_data_pkl(cnt.TRANSFORMED_LABELS_PATH) num_classes = transf_labels.shape[1] vocab_size = shutils.load_data_pkl(cnt.VOCAB_SIZE_PATH) print(n, m) # print("Training model...") # network = AttributeExtractionNetwork(dg.get_data_as_generator, n, m, num_classes, vocab_size) # network.fit() print("Scoring model...") network = AttributeExtractionNetwork(dg.get_data_as_generator, n, m,
# print("Getting train test tokens...") # train_indices, test_indices = train_test_split(range(len(items_train)), test_size=0.2) # shutils.save_data_pkl(train_indices, os.path.join(cnt.PERSISTENCE_PATH, "train_indices.pkl")) # shutils.save_data_pkl(test_indices, os.path.join(cnt.PERSISTENCE_PATH, "test_indices.pkl")) # train_data_pairs, test_data_pairs = utils.get_tokens_indices(items_train, train_indices), utils.get_tokens_indices(items_train, test_indices) # train_data_pairs, test_data_pairs = utils.get_tokens_indices(items_train, range(len(items_train))), utils.get_tokens_indices(items_test, range(len(items_test))) # shutils.save_data_pkl(train_data_pairs, os.path.join(cnt.PERSISTENCE_PATH, "train_data_pairs.pkl")) # shutils.save_data_pkl(test_data_pairs, os.path.join(cnt.PERSISTENCE_PATH, "test_data_pairs.pkl")) n = len( shutils.load_data_pkl( os.path.join(cnt.PERSISTENCE_PATH, "train_data_pairs.pkl"))) m = len( shutils.load_data_pkl( os.path.join(cnt.PERSISTENCE_PATH, "test_data_pairs.pkl"))) print("Training model...") network = DeepMatchingNetwork(dg.get_data_as_generator, n, m) network.fit() # print("Scoring model...") # network = DeepMatchingNetwork(dg.get_data_as_generator, n, m) # network.scoring() # network = DeepMatchingNetwork(dg.get_data_as_generator, n, m) # network.init_model() # network.load()
# src_tensor_train, src_tensor_valid, trg_tensor_train, trg_tensor_valid = train_test_split(src_tensor, trg_tensor, test_size=0.2) # shutils.save_data_pkl(src_tensor_train, cnt.SRC_TENSOR_TRAIN) # shutils.save_data_pkl(src_tensor_valid, cnt.SRC_TENSOR_VALID) # shutils.save_data_pkl(trg_tensor_train, cnt.TRG_TENSOR_TRAIN) # shutils.save_data_pkl(trg_tensor_valid, cnt.TRG_TENSOR_VALID) # shutils.save_data_pkl(src_tensor, cnt.SRC_TENSOR) # shutils.save_data_pkl(trg_tensor, cnt.TRG_TENSOR) # shutils.save_data_pkl(src_lang, cnt.SRC_LANG) # shutils.save_data_pkl(trg_lang, cnt.TRG_LANG) # print(len(src_tensor_train), len(trg_tensor_train), len(src_tensor_valid), len(trg_tensor_valid)) src_tensor_train = shutils.load_data_pkl(cnt.SRC_TENSOR_TRAIN) trg_tensor_train = shutils.load_data_pkl(cnt.TRG_TENSOR_TRAIN) src_tensor_valid = shutils.load_data_pkl(cnt.SRC_TENSOR_VALID) trg_tensor_valid = shutils.load_data_pkl(cnt.TRG_TENSOR_VALID) src_lang = shutils.load_data_pkl(cnt.SRC_LANG) trg_lang = shutils.load_data_pkl(cnt.TRG_LANG) src_tensor = shutils.load_data_pkl(cnt.SRC_TENSOR) trg_tensor = shutils.load_data_pkl(cnt.TRG_TENSOR) max_length_src = utils.max_length(src_tensor) max_length_trg = utils.max_length(trg_tensor) n, m = len(src_tensor_train), len(src_tensor_valid)