Example #1
    def _produce_annotations(self, *, inputs: Inputs) -> Outputs:
        """ generates dataframe with semantic type classifications and classification probabilities 
            for each column of original dataframe
        
        Arguments:
            inputs {Inputs} -- D3M dataframe
        
        Returns:
            Outputs -- dataframe with two columns: "semantic type classifications" and "probabilities"
                       Each row represents a column in the original dataframe. The column "semantic type 
                       classifications" contains a list of all semantic type labels and the column
                       "probabilities" contains a list of the model's confidence in assigning each 
                       respective semantic type label 
        """

        # load model checkpoint
        checkpoint_dir = (self.volumes["simon_models_1"] +
                          "/simon_models_1/pretrained_models/")
        if self.hyperparams["statistical_classification"]:
            execution_config = "Base.pkl"
            category_list = "/Categories.txt"
        else:
            execution_config = "Base_stat_geo.pkl"
            category_list = "/Categories_base_stat_geo.txt"
        with open(
                self.volumes["simon_models_1"] + "/simon_models_1" +
                category_list, "r") as f:
            Categories = f.read().splitlines()

        # create model object
        Classifier = Simon(encoder={})
        config = Classifier.load_config(execution_config, checkpoint_dir)
        encoder = config["encoder"]
        checkpoint = config["checkpoint"]
        model = Classifier.generate_model(self.hyperparams["max_chars"],
                                          self.hyperparams["max_rows"],
                                          len(Categories))
        Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
        model.compile(loss="binary_crossentropy",
                      optimizer="adam",
                      metrics=["binary_accuracy"])

        # prepare data and make predictions
        frame = inputs.copy()
        prepped_data = encoder.encodeDataFrame(frame)
        preds = model.predict_on_batch(tf.constant(prepped_data))
        decoded_preds = encoder.reverse_label_encode(
            preds, self.hyperparams["p_threshold"])

        # apply statistical / ordinal classification if desired
        if self.hyperparams["statistical_classification"]:
            logger.debug(
                "Guessing categorical/ordinal classifications...")
            raw_data = frame.values
            guesses = [
                guess(raw_data[:, i], for_types="category")
                for i in np.arange(raw_data.shape[1])
            ]
            for i, g in enumerate(guesses):
                if g[0] == "category":
                    decoded_preds[0][i] += ("categorical", )
                    decoded_preds[1][i].append(1)
                    if (("int" in decoded_preds[1][i])
                            or ("float" in decoded_preds[1][i])
                            or ("datetime" in decoded_preds[1][i])):
                        decoded_preds[0][i] += ("ordinal", )
                        decoded_preds[1][i].append(1)
            logger.debug("Done with statistical variable guessing")

        # clear tf session
        Classifier.clear_session()

        out_df = pd.DataFrame.from_records(list(decoded_preds)).T
        out_df.columns = ["semantic types", "probabilities"]
        return out_df
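
For orientation, here is a minimal usage sketch of how the annotations returned above might be consumed. `primitive` and `input_df` are hypothetical stand-ins for a configured SIMON primitive instance and its D3M input dataframe.

# Hypothetical usage: `primitive` and `input_df` are assumed, not defined above.
annotations = primitive._produce_annotations(inputs=input_df)
for col_name, row in zip(input_df.columns, annotations.itertuples(index=False)):
    labels, probs = row  # one label tuple and one probability list per column
    for label, prob in zip(labels, probs):
        print(f"{col_name}: {label} (p={prob:.2f})")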
Example #2
    def _produce_annotations(self, *, inputs: Inputs) -> Outputs:
        """
        Parameters
        ----------
        inputs: Input pandas frame

        Returns
        -------
        Outputs
            The outputs is two lists of lists, each has length equal to number of columns in input pandas frame.
            Each entry of the first one is a list of strings corresponding to each column's multi-label classification.
            Each entry of the second one is a list of floats corresponding to prediction probabilities.
        """
        frame = inputs

        # setup model as you typically would in a Simon main file
        maxlen = 20
        max_cells = 500
        p_threshold = 0.5

        DEBUG = True  # boolean to specify whether or not to print DEBUG information
        checkpoint_dir = self.volumes["simon_models_1"] + "/pretrained_models/"
        
        if 'statistical_classification' in self.hyperparams.keys() and self.hyperparams['statistical_classification']:
            execution_config = "Base.pkl"
            category_list = "/Categories.txt"
        else:
            execution_config = "Base_stat_geo.pkl"
            category_list = "/Categories_base_stat_geo.txt"
        with open(self.volumes["simon_models_1"] + category_list, 'r') as f:
            Categories = f.read().splitlines()
        
        # orient the user a bit
        print("fixed categories are: ")
        Categories = sorted(Categories)
        print(Categories)
        category_count = len(Categories)

        # load specified execution configuration
        if execution_config is None:
            raise TypeError("execution_config must be specified")
        Classifier = Simon(encoder={}) # dummy text classifier
        config = Classifier.load_config(execution_config, checkpoint_dir)
        encoder = config['encoder']
        checkpoint = config['checkpoint']

        X = encoder.encodeDataFrame(frame)

        # build classifier model
        model = Classifier.generate_model(maxlen, max_cells, category_count)
        Classifier.load_weights(checkpoint, None, model, checkpoint_dir)

        model.compile(loss='binary_crossentropy',
                      optimizer='adam', metrics=['binary_accuracy'])
        y = model.predict(X)
        # discard empty column edge case
        y[np.all(frame.isnull(), axis=0)] = 0

        result = encoder.reverse_label_encode(y,p_threshold)

        
        ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
        category_count = 0
        ordinal_count = 0
        raw_data = frame.values  # .as_matrix() was removed in newer pandas
        statistical = ('statistical_classification' in self.hyperparams.keys()
                       and self.hyperparams['statistical_classification'])
        if statistical:
            print("Guessing categorical/ordinal classifications...")
        for i in np.arange(raw_data.shape[1]):
            if statistical:
                tmp = guess(raw_data[:, i], for_types='category')
                if tmp[0] == 'category':
                    category_count += 1
                    tmp2 = list(result[0][i])
                    tmp2.append('categorical')
                    result[0][i] = tmp2
                    result[1][i].append(1)
                    # labels live in result[0][i]; result[1][i] holds probabilities
                    if ('int' in result[0][i]) or ('float' in result[0][i]) \
                            or ('datetime' in result[0][i]):
                        ordinal_count += 1
                        tmp2 = list(result[0][i])
                        tmp2.append('ordinal')
                        result[0][i] = tmp2
                        result[1][i].append(1)
            result[0][i] = d3m_List(result[0][i])
            result[1][i] = d3m_List(result[1][i])
        if statistical:
            print("Done with statistical variable guessing")
        ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL
        Classifier.clear_session()

        out_df = pandas.DataFrame.from_records(list(result)).T
        out_df.columns = ['semantic types', 'probabilities']
        return out_df
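
The decoding step in both examples hinges on `encoder.reverse_label_encode`. As a rough sketch of the thresholding it implies (an assumption about the encoder's behavior, not Simon's actual implementation):

import numpy as np

def reverse_label_encode_sketch(probs, categories, p_threshold):
    # Keep, per input column, every category whose predicted probability
    # clears p_threshold; return the label tuples and their confidences.
    labels, confidences = [], []
    for row in probs:  # one row of category probabilities per column
        keep = np.where(row >= p_threshold)[0]
        labels.append(tuple(categories[j] for j in keep))
        confidences.append([float(row[j]) for j in keep])
    return labels, confidences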
Example #3
    def runModel(self, frame, p_threshold):

        # setup model as you typically would in a Simon main file;
        # note that p_threshold comes from the function argument rather than
        # being overwritten with a hard-coded value here
        maxlen = 20
        max_cells = 500

        DEBUG = True  # boolean to specify whether or not to print DEBUG information

        checkpoint_dir = "/clusterfiles/scripts/pretrained_models/"

        with open('/clusterfiles/scripts/Categories.txt', 'r') as f:
            Categories = f.read().splitlines()

        # orient the user a bit
        print("fixed categories are: ")
        Categories = sorted(Categories)
        print(Categories)
        category_count = len(Categories)

        # modelName is assumed to be defined in the enclosing scope
        # (e.g., set by the calling script)
        execution_config = modelName

        # load specified execution configuration
        if execution_config is None:
            raise TypeError("execution_config must be specified")
        Classifier = Simon(encoder={})  # dummy text classifier

        config = Classifier.load_config(execution_config, checkpoint_dir)
        encoder = config['encoder']
        checkpoint = config['checkpoint']

        X = encoder.encodeDataFrame(frame)

        # build classifier model
        model = Classifier.generate_model(maxlen, max_cells, category_count)
        Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['binary_accuracy'])
        y = model.predict(X)
        # discard empty column edge case
        y[np.all(frame.isnull(), axis=0)] = 0

        result = encoder.reverse_label_encode(y, p_threshold)

        ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
        print("Beginning Guessing categorical/ordinal classifications...")
        start_time_guess = time.time()
        category_count = 0
        ordinal_count = 0
        raw_data = frame.values  # .as_matrix() was removed in newer pandas
        for i in np.arange(raw_data.shape[1]):
            tmp = guess(raw_data[:, i], for_types='category')
            if tmp[0] == 'category':
                category_count += 1
                tmp2 = list(result[0][i])
                tmp2.append('categorical')
                result[0][i] = tuple(tmp2)
                result[1][i].append(1)
                # labels live in result[0][i]; result[1][i] holds probabilities
                if ('int' in result[0][i]) or ('float' in result[0][i]) \
                        or ('datetime' in result[0][i]):
                    ordinal_count += 1
                    tmp2 = list(result[0][i])
                    tmp2.append('ordinal')
                    result[0][i] = tuple(tmp2)
                    result[1][i].append(1)
        elapsed_time = time.time() - start_time_guess
        print("Total statistical variable guessing time is : %.2f sec" %
              elapsed_time)
        ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

        Classifier.clear_session()

        return self.encoder.encode(result)
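
All three examples lean on the external `guess` helper for the categorical test. The following self-contained stand-in illustrates the kind of uniqueness-ratio heuristic involved; `max_unique_ratio` is an assumed, tunable threshold, and this is not the actual type-guessing implementation.

def guess_sketch(column, for_types='category', max_unique_ratio=0.05):
    # Call a column 'category' when it has few distinct values relative
    # to its length; anything else falls through to 'other'.
    values = [v for v in column if v is not None and v == v]  # drop None/NaN
    if not values:
        return ('unknown',)
    unique_ratio = len(set(values)) / len(values)
    if for_types == 'category' and unique_ratio <= max_unique_ratio:
        return ('category',)
    return ('other',)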