def __init__(self,
             experiment_root_directory,
             experiment_name,
             number_test_of_samples,
             batch_size=32,
             prefetch_size=32,
             dataset=None,
             iterator_name="Cifar10BasicIterator",
             preprocessed_data_path="preprocessed_data",
             train_data_path="train",
             validation_data_path="val",
             test_data_path="test"):
    '''
    Initialise the parent classes, record the constructor arguments, derive
    the experiment directory layout and load the training data.

    :param experiment_root_directory: root folder under which the experiment lives
    :param experiment_name: sub-folder of the root used for this experiment
    :param number_test_of_samples: stored as `_number_test_of_samples`
    :param batch_size: forwarded to `IIteratorBase` and stored
    :param prefetch_size: forwarded to `IIteratorBase` and stored
    :param dataset: forwarded to `IIteratorBase` and stored
    :param iterator_name: name of this iterator's output folder (`OUT_DIR`)
    :param preprocessed_data_path: folder (below the experiment folder) holding preprocessed data
    :param train_data_path: train sub-folder of the preprocessed data folder
    :param validation_data_path: forwarded to `IPreprocessor` and stored
    :param test_data_path: test sub-folder of the preprocessed data folder
    '''
    ImageFeature.__init__(self)
    IPreprocessor.__init__(
        self,
        experiment_name=experiment_name,
        preprocessed_data_path=preprocessed_data_path,
        experiment_root_directory=experiment_root_directory,
        train_data_path=train_data_path,
        validation_data_path=validation_data_path,
        test_data_path=test_data_path)
    IIteratorBase.__init__(
        self,
        experiment_root_directory=experiment_root_directory,
        experiment_name=experiment_name,
        batch_size=batch_size,
        prefetch_size=prefetch_size,
        dataset=dataset)

    # Keep a private copy of every constructor argument.
    self._experiment_root_directory = experiment_root_directory
    self._experiment_name = experiment_name
    self._batch_size = batch_size
    self._prefetch_size = prefetch_size
    self._dataset = dataset
    self._iterator_name = iterator_name
    self._preprocessed_data_path = preprocessed_data_path
    self._train_data_path = train_data_path
    self._validation_data_path = validation_data_path
    self._test_data_path = test_data_path

    # Directory layout:
    #   <root>/<experiment>/<preprocessed>/{train,test}  -> inputs
    #   <root>/<experiment>/<iterator_name>              -> outputs of this stage
    # TODO - make `EXPERIMENT_ROOT_DIR` as local variable
    experiment_dir = os.path.join(experiment_root_directory, experiment_name)
    preprocessed_dir = os.path.join(experiment_dir, preprocessed_data_path)

    self.EXPERIMENT_ROOT_DIR = experiment_dir
    self.PREPROCESSED_DATA_OUT_DIR = preprocessed_dir
    self.TRAIN_OUT_PATH = os.path.join(preprocessed_dir, train_data_path)
    self.TEST_OUT_PATH = os.path.join(preprocessed_dir, test_data_path)
    self.OUT_DIR = os.path.join(experiment_dir, iterator_name)

    # The fixed "train/" and "test/" names are assumed to be correct when the
    # previous stage is an IPreprocessor.
    self.TRAIN_FILES_IN_PATH = os.path.join(preprocessed_dir, "train/")
    self.TEST_FILES_IN_PATH = os.path.join(preprocessed_dir, "test/")

    check_n_makedirs(self.OUT_DIR)

    # Image geometry: square images with three colour channels (R, G, B).
    side = 32
    channels = 3
    self._img_size = side
    self._num_channels = channels
    # Length of one image once flattened to a 1-dim array.
    self._img_size_flat = side * side * channels

    # Number of classes.
    self._num_classes = 10

    # Training set: number of batch-files and images per batch-file. Their
    # product is the total image count, used to pre-allocate arrays.
    train_files = 5
    per_file = 10000
    self._num_files_train = train_files
    self._images_per_file = per_file
    self._num_images_train = train_files * per_file

    # Load the training set, then keep the images as float32.
    self.images, self.labels = self._load_training_data()
    self.images = self.images.astype("float32")

    self._number_test_of_samples = number_test_of_samples
def predict_on_test_files(self, predict_fn):
    '''
    Pair each prediction with a test file, write the tagged file to disk and
    collect the per-file results.

    For every (prediction, file) pair the file is read as a CSV, rows with
    missing values are dropped, and the predicted tag, its confidence and the
    first two of the top-3 alternatives are added as columns. The augmented
    frame is saved under `<OUT_DIR>/predictions/<file name>`.

    :param predict_fn: iterable of prediction dicts with the keys
        "confidence", "viterbi_seq", "top_3_indices" and "top_3_confidence"
    :return: list with one dict per processed file holding the tokens, the
        predicted tags, the confidences and the top-3 predictions
    '''
    predictions = list(predict_fn)

    # The output folder does not depend on the file, so create it once.
    out_dir = os.path.join(self.OUT_DIR, "predictions")
    check_n_makedirs(out_dir)

    results = []

    # Get the files from test folder and zip it with predictions
    for each_prediction, file in zip(predictions, self._test_files_path):
        df = pd.read_csv(file, sep=self._in_seperator, quoting=csv.QUOTE_NONE)
        # Defaults are axis=0 / how='any'. Passing `how` together with
        # `thresh` is rejected by recent pandas versions, so neither is spelt out.
        df.dropna(inplace=True)

        confidence = list(each_prediction["confidence"])
        predicted_id = [self.TAGS_2_ID[tag_id]
                        for tag_id in each_prediction["viterbi_seq"]]

        top_3_predicted_indices = each_prediction["top_3_indices"]
        top_3_predicted_confidence = each_prediction["top_3_confidence"]

        # One column per rank; indices are mapped through TAGS_2_ID.
        pred_1 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 0:1].flatten()]
        pred_2 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 1:2].flatten()]
        pred_3 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 2:].flatten()]

        # Flatten so the confidences are 1-D like the tag lists above.
        pred_1_confidence = top_3_predicted_confidence[:, 0:1].flatten()
        pred_2_confidence = top_3_predicted_confidence[:, 1:2].flatten()
        pred_3_confidence = top_3_predicted_confidence[:, 2:].flatten()

        results.append({
            "tokens": df[self._text_col].astype(str).values.tolist(),
            self._prediction_col: predicted_id,
            "confidence": confidence,
            "pred_1": pred_1,
            "pred_1_confidence": pred_1_confidence,
            "pred_2": pred_2,
            "pred_2_confidence": pred_2_confidence,
            "pred_3": pred_3,
            "pred_3_confidence": pred_3_confidence,
        })

        # Predictions may be longer than the file, so every column is cut
        # to the number of rows.
        n_rows = len(df)
        df[self._prediction_col] = predicted_id[:n_rows]
        df["confidence"] = confidence[:n_rows]
        df["pred_1"] = pred_1[:n_rows]
        df["pred_1_confidence"] = pred_1_confidence[:n_rows]
        df["pred_2"] = pred_2[:n_rows]
        df["pred_2_confidence"] = pred_2_confidence[:n_rows]
        # NOTE(review): pred_3 / pred_3_confidence are returned in `results`
        # but not written to the CSV - confirm whether that is intended.

        df.to_csv(os.path.join(out_dir, os.path.basename(file)), index=False)

    return results
def predict_on_test_files(self, executor: Executor):
    """
    Runs the prediction on list of file to be tagged

    Predictions are obtained from `executor.estimator.predict`, fed by
    `executor.data_iterator.test_input_fn()`, and paired with the test files.
    Each file is read as a CSV, rows with missing values are dropped, the
    predicted tag, its confidence and the first two of the top-3 alternatives
    are added as columns, and the result is saved under
    `<OUT_DIR>/predictions/<file name>`. A "text~tag" summary of consecutive
    tokens is built and printed per file. Finally `get_naive_metrics` is run
    over the predictions folder.

    :param executor: object exposing `estimator` and `data_iterator`
    :return: list with one dict per processed file holding the tokens, the
        predicted tags, the confidences and the top-3 predictions
    """
    estimator = executor.estimator
    data_iterator = executor.data_iterator
    predict_fn = estimator.predict(input_fn=lambda: data_iterator.test_input_fn())

    predictions = list(predict_fn)

    # Created up front: both folders are file independent, and `out_dir` is
    # needed after the loop even when there is nothing to iterate over.
    out_dir = os.path.join(self.OUT_DIR, "predictions")
    check_n_makedirs(out_dir)
    post_out_dir = os.path.join(self.OUT_DIR, "postprocessed")
    check_n_makedirs(post_out_dir)

    results = []

    # Get the files from test folder and zip it with predictions
    for each_prediction, file in zip(predictions, self._test_files_path):
        df = pd.read_csv(file, sep=self._in_seperator, quoting=csv.QUOTE_NONE)
        # Defaults are axis=0 / how='any'. Passing `how` together with
        # `thresh` is rejected by recent pandas versions, so neither is spelt out.
        df.dropna(inplace=True)

        confidence = list(each_prediction["confidence"])
        predicted_id = [self.TAGS_2_ID[tag_id]
                        for tag_id in each_prediction["viterbi_seq"]]

        top_3_predicted_indices = each_prediction["top_3_indices"]
        top_3_predicted_confidence = each_prediction["top_3_confidence"]

        # One column per rank; indices are mapped through TAGS_2_ID.
        pred_1 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 0:1].flatten()]
        pred_2 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 1:2].flatten()]
        pred_3 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 2:].flatten()]

        pred_1_confidence = top_3_predicted_confidence[:, 0:1].flatten()
        pred_2_confidence = top_3_predicted_confidence[:, 1:2].flatten()
        pred_3_confidence = top_3_predicted_confidence[:, 2:].flatten()

        results.append({
            "tokens": df[self._text_col].astype(str).values.tolist(),
            self._prediction_col: predicted_id,
            "confidence": confidence,
            "pred_1": pred_1,
            "pred_1_confidence": pred_1_confidence,
            "pred_2": pred_2,
            "pred_2_confidence": pred_2_confidence,
            "pred_3": pred_3,
            "pred_3_confidence": pred_3_confidence,
        })

        # Predictions may be longer than the file, so every column is cut
        # to the number of rows.
        n_rows = len(df)
        df[self._prediction_col] = predicted_id[:n_rows]
        df["confidence"] = confidence[:n_rows]
        df["pred_1"] = pred_1[:n_rows]
        df["pred_1_confidence"] = pred_1_confidence[:n_rows]
        df["pred_2"] = pred_2[:n_rows]
        df["pred_2_confidence"] = pred_2_confidence[:n_rows]

        df.to_csv(os.path.join(out_dir, os.path.basename(file)), index=False)

        # Merge consecutive tokens into "text~tag" lines.
        prev_tag = None
        doc_text = ""
        text = ""
        enter = False  # becomes True once the first non-"O" row was seen
        print(file)
        for index, row in df.iterrows():
            token = row[self._text_col]
            tag = row[self._prediction_col]
            print(token, tag)
            if tag != "O":
                if index == 0 or not enter:
                    # First tagged token: start a new span.
                    text = token
                    prev_tag = tag
                    enter = True
                else:  # second index onwards
                    if is_new_tag(prev_tag, tag):
                        # Close the running span and start another one.
                        doc_text = doc_text + text + "~" + strip_iob(prev_tag) + "\n"
                        text = token
                    else:
                        text = text + " " + token
                    prev_tag = tag
            else:
                # NOTE(review): this emits the running span text with the "O"
                # tag and does not reset `text` - confirm this is intended.
                doc_text = doc_text + text + "~" + strip_iob(tag) + "\n"
                prev_tag = tag

        # Flush whatever span is still open at the end of the file.
        doc_text = doc_text + text + "~" + strip_iob(prev_tag) + "\n"
        print(doc_text)

        # with open(os.path.join(post_out_dir, os.path.basename(file)), "w") as post_file:
        #     post_file.write("Item~Tag\n")
        #     post_file.write(doc_text)
    # for loop ends

    # Compute accuracy metric for the all the predictions
    get_naive_metrics(predicted_csvs_path=out_dir,
                      ner_tag_vocab_file=self.ENTITY_VOCAB_FILE,
                      entity_col_name=self._entity_col,
                      prediction_col_name=self._prediction_col,
                      out_dir=out_dir)

    return results
def __init__(self,
             experiment_root_directory,
             experiment_name,
             number_test_of_samples=4,
             batch_size=32,
             prefetch_size=32,
             iterator_name="Cifar10BasicIterator",
             preprocessed_data_path="preprocessed_data",
             train_data_path="train",
             validation_data_path="val",
             test_data_path="test",
             dataset=None,
             text_col=0,
             entity_col=3,
             prediction_col="prediction",
             in_seperator="~",
             out_seperator="~",
             quotechar="^",
             max_word_length=20,
             use_char_embd=False):
    '''
    Initialise the parent classes, record the constructor arguments, derive
    the experiment directory layout and vocabulary file locations, and
    trigger the vocabulary extraction.

    :param experiment_root_directory: root folder under which the experiment lives
    :param experiment_name: sub-folder of the root used for this experiment
    :param number_test_of_samples: stored as `_number_test_of_samples`
    :param batch_size: forwarded to `IIteratorBase` and stored
    :param prefetch_size: forwarded to `IIteratorBase` and stored
    :param iterator_name: name of this iterator's output folder (`OUT_DIR`)
    :param preprocessed_data_path: folder (below the experiment folder) holding preprocessed data
    :param train_data_path: train sub-folder of the preprocessed data folder
    :param validation_data_path: validation sub-folder of the preprocessed data folder
    :param test_data_path: test sub-folder of the preprocessed data folder
    :param dataset: forwarded to `IIteratorBase` and stored
    :param text_col: text column identifier; also prefixes the word/char vocab file names
    :param entity_col: entity column identifier; also prefixes the entity vocab file name
    :param prediction_col: stored as `_prediction_col`
    :param in_seperator: stored as `_in_seperator`
    :param out_seperator: stored as `_out_seperator`
    :param quotechar: stored as `_quotechar`
    :param max_word_length: stored as `_max_word_length`
    :param use_char_embd: stored as `_use_char_embd`
    '''
    IIteratorBase.__init__(self,
                           experiment_root_directory=experiment_root_directory,
                           experiment_name=experiment_name,
                           batch_size=batch_size,
                           prefetch_size=prefetch_size,
                           dataset=dataset)
    ITextFeature.__init__(self)

    # Keep a private copy of every constructor argument.
    self._experiment_root_directory = experiment_root_directory
    self._experiment_name = experiment_name
    self._prefetch_size = prefetch_size
    self._dataset = dataset
    # NOTE(review): the default iterator name mentions Cifar10 although this
    # constructor configures text columns - confirm the default is intended.
    self._iterator_name = iterator_name
    self._preprocessed_data_path = preprocessed_data_path
    self._train_data_path = train_data_path
    self._validation_data_path = validation_data_path
    self._test_data_path = test_data_path
    self._number_test_of_samples = number_test_of_samples
    self._text_col = text_col
    self._entity_col = entity_col
    self._prediction_col = prediction_col
    self._batch_size = batch_size
    self._in_seperator = in_seperator
    self._out_seperator = out_seperator
    self._quotechar = quotechar
    self._max_word_length = max_word_length
    self._use_char_embd = use_char_embd

    # Directory layout:
    #   <root>/<experiment>/<preprocessed>/{train,val,test}  -> inputs
    #   <root>/<experiment>/<iterator_name>                  -> outputs of this stage
    experiment_dir = os.path.join(experiment_root_directory, experiment_name)
    preprocessed_dir = os.path.join(experiment_dir, preprocessed_data_path)
    out_dir = os.path.join(experiment_dir, iterator_name)

    self.EXPERIMENT_ROOT_DIR = experiment_dir
    self.PREPROCESSED_DATA_OUT_DIR = preprocessed_dir
    self.TRAIN_OUT_PATH = os.path.join(preprocessed_dir, train_data_path)
    self.VAL_OUT_PATH = os.path.join(preprocessed_dir, validation_data_path)
    self.TEST_OUT_PATH = os.path.join(preprocessed_dir, test_data_path)
    self.OUT_DIR = out_dir

    # The fixed "train/", "val/" and "test/" names are assumed to be correct
    # when the previous stage is an IPreprocessor.
    self.TRAIN_FILES_IN_PATH = os.path.join(preprocessed_dir, "train/")
    self.VAL_FILES_IN_PATH = os.path.join(preprocessed_dir, "val/")
    self.TEST_FILES_IN_PATH = os.path.join(preprocessed_dir, "test/")

    # Vocabulary files live in the output folder, prefixed by their column.
    text_prefix = str(text_col) + "_"
    self.WORDS_VOCAB_FILE = os.path.join(out_dir, text_prefix + "vocab.tsv")
    self.CHARS_VOCAB_FILE = os.path.join(out_dir, text_prefix + "chars_vocab.tsv")
    self.ENTITY_VOCAB_FILE = os.path.join(out_dir, str(entity_col) + "_vocab.tsv")

    check_n_makedirs(self.OUT_DIR)

    unknown_length = tf.TensorShape([None])
    self.padded_shapes = (unknown_length,  # sentence of unknown size
                          tf.TensorShape([None]),
                          tf.TensorShape([None]))  # labels of unknown size

    self.padding_values = (SpecialTokens.PAD_WORD,  # sentence padded on the right with id_pad_word
                           SpecialTokens.PAD_CHAR,
                           SpecialTokens.PAD_TAG)  # labels padded on the right with id_pad_tag

    self._extract_vocab()