Exemplo n.º 1
0
    def _produce_annotations(self, inputs: Inputs) -> Outputs:
        """Classify the semantic types of every column of the input dataframe.

        Loads a pretrained Simon model from the "simon_models_1" volume, runs it
        on an encoded copy of ``inputs`` and, when the
        "statistical_classification" hyperparameter is set, adds rule-based
        "categorical" / "ordinal" labels on top of the model's predictions.

        Side effects: clears the classifier session and removes
        'unencoded_chars.json' from the current working directory.

        Arguments:
            inputs {Inputs} -- D3M dataframe

        Returns:
            Outputs -- dataframe with two columns: "semantic types" and "probabilities".
                       Each row represents a column in the original dataframe. The column
                       "semantic types" contains all semantic type labels assigned to that
                       column and the column "probabilities" contains the model's
                       confidence in assigning each respective semantic type label
        """

        use_statistical = self.hyperparams["statistical_classification"]
        p_threshold = self.hyperparams["p_threshold"]

        # load model checkpoint
        models_root = self._volumes["simon_models_1"] + "/simon_models_1"
        checkpoint_dir = models_root + "/pretrained_models/"
        # NOTE(review): the base model is paired with the rule-based statistical
        # pass below; otherwise the "stat_geo" model is used -- presumably because
        # it predicts those classes itself. Confirm against the model files.
        if use_statistical:
            execution_config = "Base.pkl"
            category_list = "/Categories.txt"
        else:
            execution_config = "Base_stat_geo.pkl"
            category_list = "/Categories_base_stat_geo.txt"
        with open(models_root + category_list, "r") as f:
            categories = f.read().splitlines()

        # create model object
        classifier = Simon(encoder={})
        config = classifier.load_config(execution_config, checkpoint_dir)
        encoder = config["encoder"]
        checkpoint = config["checkpoint"]
        # NOTE(review): 20 is presumably the maximum cell length the model was
        # trained with -- confirm against Simon.generate_model.
        model = classifier.generate_model(20, self.hyperparams["max_rows"],
                                          len(categories))
        classifier.load_weights(checkpoint, None, model, checkpoint_dir)
        model.compile(loss="binary_crossentropy",
                      optimizer="adam",
                      metrics=["binary_accuracy"])

        # prepare data and make predictions (work on a copy of the input)
        frame = inputs.copy()
        prepped_data = encoder.encodeDataFrame(frame)
        preds = model.predict_on_batch(tf.constant(prepped_data))
        logger.debug('------------Reverse label encoding------------')
        # decoded_preds[0] holds the labels per column, decoded_preds[1] the
        # matching probabilities per column.
        decoded_preds = encoder.reverse_label_encode(preds, p_threshold)

        # apply statistical / ordinal classification if desired
        if use_statistical:
            logger.debug(
                "Beginning Guessing categorical/ordinal classifications...")
            raw_data = frame.values
            guesses = [
                guess(raw_data[:, col], for_types="category")
                for col in range(raw_data.shape[1])
            ]

            labels_per_column = decoded_preds[0]
            probs_per_column = decoded_preds[1]

            # probability of rule-based statistical / ordinal classifications = min probability of existing classifications
            for i, g in enumerate(guesses):
                if g[0] != "category":
                    continue
                if probs_per_column[i]:
                    guess_prob = min(probs_per_column[i])
                else:
                    # no model label passed the threshold: fall back to it
                    guess_prob = p_threshold
                labels_per_column[i] += ("categorical", )
                probs_per_column[i].append(guess_prob)
                # The ordinal decision depends on the predicted *labels*. The
                # previous code looked the type names up in the probability
                # list, which can never contain them, so "ordinal" was never
                # assigned.
                if any(label in labels_per_column[i]
                       for label in ("int", "float", "datetime")):
                    labels_per_column[i] += ("ordinal", )
                    probs_per_column[i].append(guess_prob)
            logger.debug("Done with statistical variable guessing")

        # clear tf session, remove unnecessary files
        classifier.clear_session()
        try:
            os.remove('unencoded_chars.json')
        except FileNotFoundError:
            # Nothing to clean up; do not discard finished predictions over it.
            logger.debug("unencoded_chars.json not found, skipping removal")

        out_df = pd.DataFrame.from_records(list(decoded_preds)).T
        out_df.columns = ["semantic types", "probabilities"]
        return out_df