def __init__(self,
                 experiment_root_directory,
                 experiment_name,
                 number_test_of_samples,
                 batch_size=32,
                 prefetch_size=32,
                 dataset=None,
                 iterator_name="Cifar10BasicIterator",
                 preprocessed_data_path="preprocessed_data",
                 train_data_path="train",
                 validation_data_path="val",
                 test_data_path="test"):
        ImageFeature.__init__(self)
        IPreprocessor.__init__(
            self,
            experiment_name=experiment_name,
            preprocessed_data_path=preprocessed_data_path,
            experiment_root_directory=experiment_root_directory,
            train_data_path=train_data_path,
            validation_data_path=validation_data_path,
            test_data_path=test_data_path)
        IIteratorBase.__init__(
            self,
            experiment_root_directory=experiment_root_directory,
            experiment_name=experiment_name,
            batch_size=batch_size,
            prefetch_size=prefetch_size,
            dataset=dataset)

        self._experiment_root_directory = experiment_root_directory
        self._experiment_name = experiment_name
        self._batch_size = batch_size
        self._prefetch_size = prefetch_size
        self._dataset = dataset
        self._iterator_name = iterator_name
        self._preprocessed_data_path = preprocessed_data_path
        self._train_data_path = train_data_path
        self._validation_data_path = validation_data_path
        self._test_data_path = test_data_path

        # TODO - make `EXPERIMENT_ROOT_DIR` a local variable
        self.EXPERIMENT_ROOT_DIR = os.path.join(
            self._experiment_root_directory, self._experiment_name)
        self.PREPROCESSED_DATA_OUT_DIR = os.path.join(
            self.EXPERIMENT_ROOT_DIR, self._preprocessed_data_path)
        self.TRAIN_OUT_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR,
                                           self._train_data_path)
        self.TEST_OUT_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR,
                                          self._test_data_path)
        self.OUT_DIR = os.path.join(self.EXPERIMENT_ROOT_DIR,
                                    self._iterator_name)
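
        # Resulting layout (illustrative, assuming the default arguments):
        #   <experiment_root_directory>/<experiment_name>/
        #       preprocessed_data/{train,test}/   <- inputs written by the preprocessor
        #       Cifar10BasicIterator/             <- self.OUT_DIR, created below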

        # These input paths are assumed to hold if the previous stage was an IPreprocessor.
        self.TRAIN_FILES_IN_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR,
                                                "train/")
        self.TEST_FILES_IN_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR,
                                               "test/")

        check_n_makedirs(self.OUT_DIR)

        # Width and height of each image.
        self._img_size = 32

        # Number of channels in each image, 3 channels: Red, Green, Blue.
        self._num_channels = 3

        # Length of an image when flattened to a 1-dim array.
        self._img_size_flat = self._img_size * self._img_size * self._num_channels
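        # For CIFAR-10: 32 * 32 * 3 = 3072 values per flattened image.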

        # Number of classes.
        self._num_classes = 10

        # Number of files for the training-set.
        self._num_files_train = 5

        # Number of images for each batch-file in the training-set.
        self._images_per_file = 10000

        # Total number of images in the training-set.
        # This is used to pre-allocate arrays for efficiency.
        self._num_images_train = self._num_files_train * self._images_per_file

        self.images, self.labels = self._load_training_data()
        self.images = self.images.astype("float32")

        self._number_test_of_samples = number_test_of_samples
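
    # Usage sketch (a minimal, hypothetical example; the class name follows the
    # `iterator_name` default, and preprocessed CIFAR-10 batch files are assumed
    # to exist under <experiment_root_directory>/<experiment_name>/preprocessed_data):
    #
    #   iterator = Cifar10BasicIterator(
    #       experiment_root_directory="experiments",
    #       experiment_name="cifar10_demo",
    #       number_test_of_samples=4)
    #   iterator.images.shape  # (num_files_train * images_per_file, ...) after the eager load above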
    def predict_on_test_files(self, predict_fn):
        '''
        Iterates over the test files, pairing each one with its prediction.
        :param predict_fn: iterable of prediction dicts, e.g. the generator
            returned by `tf.estimator.Estimator.predict`
        :return: list of per-file result dicts; the tagged files are written
            to `self.OUT_DIR/predictions/`
        '''

        # Materialize the prediction generator so it can be zipped with the files.
        predictions = list(predict_fn)

        results = []

        # Pair each test file with its corresponding prediction.
        for each_prediction, file in zip(predictions, self._test_files_path):

            df = pd.read_csv(file, sep=self._in_seperator, quoting=csv.QUOTE_NONE)
            df.dropna(axis=0, how='any', inplace=True)

            # Per-token confidences and decoded tag names for the Viterbi sequence.
            confidence = list(each_prediction["confidence"])
            predicted_id = [self.TAGS_2_ID[tag_id]
                            for tag_id in each_prediction["viterbi_seq"]]
            top_3_predicted_indices = each_prediction["top_3_indices"]
            top_3_predicted_confidence = each_prediction["top_3_confidence"]

            # Column k of the [sequence_length, 3] arrays holds the (k+1)-th best
            # prediction per token.
            pred_1 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 0]]
            pred_2 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 1]]
            pred_3 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 2]]

            # Kept 1-D so they can be assigned as DataFrame columns below.
            pred_1_confidence = top_3_predicted_confidence[:, 0]
            pred_2_confidence = top_3_predicted_confidence[:, 1]
            pred_3_confidence = top_3_predicted_confidence[:, 2]

            results.append({
                "tokens": df[self._text_col].astype(str).values.tolist(),
                self._prediction_col: predicted_id,
                "confidence": confidence,
                "pred_1": pred_1,
                "pred_1_confidence": pred_1_confidence,
                "pred_2": pred_2,
                "pred_2_confidence": pred_2_confidence,
                "pred_3": pred_3,
                "pred_3_confidence": pred_3_confidence,

            })

            # Align predictions with the rows that survived dropna.
            df[self._prediction_col] = predicted_id[:len(df)]
            df["confidence"] = confidence[:len(df)]
            df["pred_1"] = pred_1[:len(df)]
            df["pred_1_confidence"] = pred_1_confidence[:len(df)]
            df["pred_2"] = pred_2[:len(df)]
            df["pred_2_confidence"] = pred_2_confidence[:len(df)]

            out_dir = os.path.join(self.OUT_DIR, "predictions")

            check_n_makedirs(out_dir)
            df.to_csv(os.path.join(out_dir, os.path.basename(file)), index=False)

        return results
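
    # Call-site sketch (hypothetical; `predict_fn` can be the generator returned
    # by tf.estimator.Estimator.predict, mirroring the executor-based variant below):
    #
    #   predict_fn = estimator.predict(input_fn=lambda: iterator.test_input_fn())
    #   results = iterator.predict_on_test_files(predict_fn)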
    def predict_on_test_files(self, executor: Executor):
        """
        Runs prediction on the list of test files to be tagged, writes the tagged
        files under `self.OUT_DIR/predictions/`, and computes naive metrics.
        :param executor: `Executor` carrying the trained estimator and its data iterator
        :return:
        """
        model = executor.model
        estimator = executor.estimator
        data_iterator = executor.data_iterator

        predict_fn = estimator.predict(input_fn=lambda: data_iterator.test_input_fn())

        # Materialize the prediction generator so it can be zipped with the files.
        predictions = list(predict_fn)

        results = []

        # `out_dir` is needed both inside the loop and for the metrics step after it.
        out_dir = os.path.join(self.OUT_DIR, "predictions")
        check_n_makedirs(out_dir)

        # Pair each test file with its corresponding prediction.
        for each_prediction, file in zip(predictions, self._test_files_path):

            df = pd.read_csv(file, sep=self._in_seperator, quoting=csv.QUOTE_NONE)
            df.dropna(axis=0, how='any', inplace=True)

            # Per-token confidences and decoded tag names for the Viterbi sequence.
            confidence = list(each_prediction["confidence"])
            predicted_id = [self.TAGS_2_ID[tag_id]
                            for tag_id in each_prediction["viterbi_seq"]]
            top_3_predicted_indices = each_prediction["top_3_indices"]
            top_3_predicted_confidence = each_prediction["top_3_confidence"]

            # Column k of the [sequence_length, 3] arrays holds the (k+1)-th best
            # prediction per token.
            pred_1 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 0]]
            pred_2 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 1]]
            pred_3 = [self.TAGS_2_ID[x] for x in top_3_predicted_indices[:, 2]]

            pred_1_confidence = top_3_predicted_confidence[:, 0]
            pred_2_confidence = top_3_predicted_confidence[:, 1]
            pred_3_confidence = top_3_predicted_confidence[:, 2]

            results.append({
                "tokens": df[self._text_col].astype(str).values.tolist(),
                self._prediction_col: predicted_id,
                "confidence": confidence,
                "pred_1": pred_1,
                "pred_1_confidence": pred_1_confidence,
                "pred_2": pred_2,
                "pred_2_confidence": pred_2_confidence,
                "pred_3": pred_3,
                "pred_3_confidence": pred_3_confidence,

            })

            # Align predictions with the rows that survived dropna.
            df[self._prediction_col] = predicted_id[:len(df)]
            df["confidence"] = confidence[:len(df)]
            df["pred_1"] = pred_1[:len(df)]
            df["pred_1_confidence"] = pred_1_confidence[:len(df)]
            df["pred_2"] = pred_2[:len(df)]
            df["pred_2_confidence"] = pred_2_confidence[:len(df)]

            df.to_csv(os.path.join(out_dir, os.path.basename(file)), index=False)

            # Stitch consecutive non-"O" tokens into entity spans and emit one
            # "text~tag" line per span.
            prev_tag = None
            doc_text = ""
            text = ""
            enter = False
            for index, row in df.iterrows():
                if row[self._prediction_col] != "O":
                    if index == 0 or not enter:
                        # First tagged token after a reset: start a new span.
                        text = row[self._text_col]
                        prev_tag = row[self._prediction_col]
                        enter = True
                    else:
                        if is_new_tag(prev_tag, row[self._prediction_col]):
                            # Tag changed: flush the previous span, start a new one.
                            doc_text = doc_text + text + "~" + strip_iob(prev_tag) + "\n"
                            text = row[self._text_col]
                        else:
                            # Same tag: extend the current span.
                            text = text + " " + row[self._text_col]
                        prev_tag = row[self._prediction_col]
                else:
                    doc_text = doc_text + text + "~" + strip_iob(row[self._prediction_col]) + "\n"
                    # Reset the accumulator so the same text is not re-emitted
                    # on subsequent "O" rows.
                    text = ""
                    enter = False
                    prev_tag = row[self._prediction_col]

            # Flush the final span, if any.
            doc_text = doc_text + text + "~" + strip_iob(prev_tag) + "\n"

            post_out_dir = os.path.join(self.OUT_DIR, "postprocessed")
            check_n_makedirs(post_out_dir)

            with open(os.path.join(post_out_dir, os.path.basename(file)), "w") as post_file:
                post_file.write("Item~Tag\n")
                post_file.write(doc_text)

        # Compute naive accuracy metrics over all the predicted files.
        get_naive_metrics(predicted_csvs_path=out_dir,
                          ner_tag_vocab_file=self.ENTITY_VOCAB_FILE,
                          entity_col_name=self._entity_col,
                          prediction_col_name=self._prediction_col,
                          out_dir=out_dir)
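
    # Run-level sketch (hypothetical wiring; `Executor` is assumed to expose
    # `model`, `estimator` and `data_iterator` attributes, as used above):
    #
    #   executor = Executor(model=model, estimator=estimator, data_iterator=iterator)
    #   iterator.predict_on_test_files(executor)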
    def __init__(self,
                 experiment_root_directory,
                 experiment_name,
                 number_test_of_samples=4,
                 batch_size=32,
                 prefetch_size=32,
                 iterator_name="Cifar10BasicIterator",
                 preprocessed_data_path="preprocessed_data",
                 train_data_path="train",
                 validation_data_path="val",
                 test_data_path="test",
                 dataset=None,
                 text_col=0,
                 entity_col=3,
                 prediction_col="prediction",
                 in_seperator="~",
                 out_seperator="~",
                 quotechar="^",
                 max_word_length=20,
                 use_char_embd=False):
        '''
        Text iterator for `~`-separated CSV files; `text_col` and `entity_col`
        select the token and entity-tag columns, and `dataset` is handed on to
        IIteratorBase.
        '''
        IIteratorBase.__init__(self,
                               experiment_root_directory=experiment_root_directory,
                               experiment_name=experiment_name,
                               batch_size=batch_size,
                               prefetch_size=prefetch_size,
                               dataset=dataset)
        ITextFeature.__init__(self)

        self._experiment_root_directory = experiment_root_directory
        self._experiment_name = experiment_name
        self._prefetch_size = prefetch_size
        self._dataset = dataset
        self._iterator_name = iterator_name
        self._preprocessed_data_path = preprocessed_data_path
        self._train_data_path = train_data_path
        self._validation_data_path = validation_data_path
        self._test_data_path = test_data_path
        self._number_test_of_samples = number_test_of_samples
        self._text_col = text_col
        self._entity_col = entity_col
        self._prediction_col = prediction_col
        self._batch_size = batch_size
        self._in_seperator = in_seperator
        self._out_seperator = out_seperator
        self._quotechar = quotechar
        self._max_word_length = max_word_length
        self._use_char_embd = use_char_embd

        self.EXPERIMENT_ROOT_DIR = os.path.join(self._experiment_root_directory,
                                                self._experiment_name)
        self.PREPROCESSED_DATA_OUT_DIR = os.path.join(self.EXPERIMENT_ROOT_DIR,
                                                      self._preprocessed_data_path)
        self.TRAIN_OUT_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR,
                                           self._train_data_path)
        self.VAL_OUT_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR,
                                         self._validation_data_path)
        self.TEST_OUT_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR,
                                          self._test_data_path)
        self.OUT_DIR = os.path.join(self.EXPERIMENT_ROOT_DIR,
                                    self._iterator_name)

        # These input paths are assumed to hold if the previous stage was an IPreprocessor.
        self.TRAIN_FILES_IN_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR, "train/")
        self.VAL_FILES_IN_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR, "val/")
        self.TEST_FILES_IN_PATH = os.path.join(self.PREPROCESSED_DATA_OUT_DIR, "test/")

        self.WORDS_VOCAB_FILE = os.path.join(self.OUT_DIR, str(self._text_col) + "_" + "vocab.tsv")
        self.CHARS_VOCAB_FILE = os.path.join(self.OUT_DIR, str(self._text_col) + "_" + "chars_vocab.tsv")
        self.ENTITY_VOCAB_FILE = os.path.join(self.OUT_DIR, str(self._entity_col) + "_vocab.tsv")
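
        # With the defaults (text_col=0, entity_col=3) these resolve to
        # 0_vocab.tsv, 0_chars_vocab.tsv and 3_vocab.tsv under self.OUT_DIR.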

        check_n_makedirs(self.OUT_DIR)

        self.padded_shapes = (tf.TensorShape([None]),  # word ids: sentence of unknown length
                              tf.TensorShape([None]),  # char ids
                              tf.TensorShape([None]))  # labels of unknown length

        self.padding_values = (SpecialTokens.PAD_WORD,  # sentences padded on the right with the pad word
                               SpecialTokens.PAD_CHAR,  # char sequences padded with the pad char
                               SpecialTokens.PAD_TAG)  # labels padded on the right with the pad tag
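
        # Consumed at batching time, e.g. (a sketch, assuming the dataset yields
        # (word_ids, char_ids, tag_ids) tuples):
        #   dataset = dataset.padded_batch(self._batch_size,
        #                                  padded_shapes=self.padded_shapes,
        #                                  padding_values=self.padding_values)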

        self._extract_vocab()
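
    # Construction sketch (hypothetical class name; assumes an upstream
    # preprocessor wrote `~`-separated train/val/test CSVs under the
    # experiment's preprocessed_data folder):
    #
    #   iterator = ConllCsvDataIterator(
    #       experiment_root_directory="experiments",
    #       experiment_name="ner_demo",
    #       text_col=0,
    #       entity_col=3,
    #       use_char_embd=True)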