def _produce_annotations(self, *, inputs: Inputs) -> Outputs:
    """Generate a dataframe with semantic type classifications and
    classification probabilities for each column of the original dataframe.

    Arguments:
        inputs {Inputs} -- D3M dataframe

    Returns:
        Outputs -- dataframe with two columns: "semantic types" and
            "probabilities". Each row represents a column in the original
            dataframe. "semantic types" contains a list of all semantic type
            labels, and "probabilities" contains the model's confidence in
            assigning each respective label.
    """
    # load model checkpoint
    checkpoint_dir = (
        self.volumes["simon_models_1"] + "/simon_models_1/pretrained_models/"
    )
    if self.hyperparams["statistical_classification"]:
        execution_config = "Base.pkl"
        category_list = "/Categories.txt"
    else:
        execution_config = "Base_stat_geo.pkl"
        category_list = "/Categories_base_stat_geo.txt"
    with open(
        self.volumes["simon_models_1"] + "/simon_models_1" + category_list, "r"
    ) as f:
        Categories = f.read().splitlines()

    # create model object
    Classifier = Simon(encoder={})
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config["encoder"]
    checkpoint = config["checkpoint"]
    model = Classifier.generate_model(
        self.hyperparams["max_chars"],
        self.hyperparams["max_rows"],
        len(Categories),
    )
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["binary_accuracy"],
    )

    # prepare data and make predictions
    frame = inputs.copy()
    prepped_data = encoder.encodeDataFrame(frame)
    preds = model.predict_on_batch(tf.constant(prepped_data))
    decoded_preds = encoder.reverse_label_encode(
        preds, self.hyperparams["p_threshold"]
    )

    # apply statistical / ordinal classification if desired
    if self.hyperparams["statistical_classification"]:
        logger.debug("Beginning guessing categorical/ordinal classifications...")
        raw_data = frame.values
        guesses = [
            guess(raw_data[:, i], for_types="category")
            for i in np.arange(raw_data.shape[1])
        ]
        for i, g in enumerate(guesses):
            if g[0] == "category":
                decoded_preds[0][i] += ("categorical",)
                decoded_preds[1][i].append(1)
                # check the predicted *labels* (index 0), not the
                # probabilities (index 1), for numeric/datetime types
                if (
                    ("int" in decoded_preds[0][i])
                    or ("float" in decoded_preds[0][i])
                    or ("datetime" in decoded_preds[0][i])
                ):
                    decoded_preds[0][i] += ("ordinal",)
                    decoded_preds[1][i].append(1)
        logger.debug("Done with statistical variable guessing")

    # clear tf session
    Classifier.clear_session()
    out_df = pd.DataFrame.from_records(list(decoded_preds)).T
    out_df.columns = ["semantic types", "probabilities"]
    return out_df
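
# A minimal sketch (not part of the original source; `_annotations_to_dict`,
# `annotations`, and `column_names` are hypothetical names) showing how the
# two-column frame returned by _produce_annotations can be flattened into a
# per-column mapping, e.g. for logging or downstream metadata updates.
def _annotations_to_dict(annotations, column_names):
    """Map each original column name to its (label, probability) pairs."""
    return {
        name: list(zip(types, probs))
        for name, types, probs in zip(
            column_names,
            annotations["semantic types"],
            annotations["probabilities"],
        )
    }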
def _produce_annotations(self, *, inputs: Inputs) -> Outputs:
    """
    Parameters
    ----------
    inputs:
        Input pandas frame

    Returns
    -------
    Outputs
        A dataframe with two columns, "semantic types" and "probabilities",
        with one row per column of the input frame. Each entry of the first
        column is a list of strings giving that column's multi-label
        classification; each entry of the second is a list of floats giving
        the corresponding prediction probabilities.
    """
    frame = inputs

    # set up the model as in a typical Simon main file
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    checkpoint_dir = self.volumes["simon_models_1"] + "/pretrained_models/"
    if self.hyperparams.get("statistical_classification"):
        execution_config = "Base.pkl"
        category_list = "/Categories.txt"
    else:
        execution_config = "Base_stat_geo.pkl"
        category_list = "/Categories_base_stat_geo.txt"
    with open(self.volumes["simon_models_1"] + category_list, "r") as f:
        Categories = f.read().splitlines()

    # orient the user a bit
    print("fixed categories are: ")
    Categories = sorted(Categories)
    print(Categories)
    category_count = len(Categories)

    # load the specified execution configuration
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config["encoder"]
    checkpoint = config["checkpoint"]

    X = encoder.encodeDataFrame(frame)

    # build classifier model
    model = Classifier.generate_model(maxlen, max_cells, category_count)
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["binary_accuracy"],
    )
    y = model.predict(X)
    # discard empty-column edge case: zero out predictions for all-null columns
    y[np.all(frame.isnull(), axis=0)] = 0
    result = encoder.reverse_label_encode(y, p_threshold)

    # label combined data as categorical/ordinal
    if self.hyperparams.get("statistical_classification"):
        print("Beginning guessing categorical/ordinal classifications...")
        raw_data = frame.values  # .as_matrix() was removed in pandas 1.0
        for i in np.arange(raw_data.shape[1]):
            tmp = guess(raw_data[:, i], for_types="category")
            if tmp[0] == "category":
                tmp2 = list(result[0][i])
                tmp2.append("categorical")
                result[0][i] = tmp2
                result[1][i].append(1)
                # check the predicted labels (index 0), not the probabilities
                if (
                    ("int" in result[0][i])
                    or ("float" in result[0][i])
                    or ("datetime" in result[0][i])
                ):
                    tmp2 = list(result[0][i])
                    tmp2.append("ordinal")
                    result[0][i] = tmp2
                    result[1][i].append(1)
        print("Done with statistical variable guessing")

    # wrap per-column results in d3m containers
    for i in np.arange(len(result[0])):
        result[0][i] = d3m_List(result[0][i])
        result[1][i] = d3m_List(result[1][i])

    Classifier.clear_session()
    out_df = pandas.DataFrame.from_records(list(result)).T
    out_df.columns = ["semantic types", "probabilities"]
    return out_df
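
# Illustrative sketch (a simplified stand-in, not the real
# encoder.reverse_label_encode) of the multi-label thresholding used above:
# every category whose score clears p_threshold is kept, so a single column
# can receive several semantic type labels at once.
import numpy as np

def _threshold_labels(scores, categories, p_threshold=0.5):
    """Return (labels, probabilities) for one column's score vector."""
    keep = np.asarray(scores) >= p_threshold
    labels = tuple(c for c, k in zip(categories, keep) if k)
    probs = [float(s) for s, k in zip(scores, keep) if k]
    return labels, probs

# e.g. _threshold_labels([0.9, 0.2, 0.7], ["text", "int", "datetime"])
# -> (("text", "datetime"), [0.9, 0.7])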
def runModel(self, frame, p_threshold):
    # set up the model as in a typical Simon main file
    maxlen = 20
    max_cells = 500

    checkpoint_dir = "/clusterfiles/scripts/pretrained_models/"
    with open("/clusterfiles/scripts/Categories.txt", "r") as f:
        Categories = f.read().splitlines()

    # orient the user a bit
    print("fixed categories are: ")
    Categories = sorted(Categories)
    print(Categories)
    category_count = len(Categories)

    execution_config = modelName  # assumed to be defined at module scope
    # load the specified execution configuration
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config["encoder"]
    checkpoint = config["checkpoint"]

    X = encoder.encodeDataFrame(frame)

    # build classifier model
    model = Classifier.generate_model(maxlen, max_cells, category_count)
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["binary_accuracy"],
    )
    y = model.predict(X)
    # discard empty-column edge case: zero out predictions for all-null columns
    y[np.all(frame.isnull(), axis=0)] = 0
    # use the caller-supplied p_threshold rather than overwriting it
    result = encoder.reverse_label_encode(y, p_threshold)

    # label combined data as categorical/ordinal
    print("Beginning guessing categorical/ordinal classifications...")
    start_time_guess = time.time()
    raw_data = frame.values  # .as_matrix() was removed in pandas 1.0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types="category")
        if tmp[0] == "category":
            tmp2 = list(result[0][i])
            tmp2.append("categorical")
            result[0][i] = tuple(tmp2)
            result[1][i].append(1)
            # check the predicted labels (index 0), not the probabilities
            if (
                ("int" in result[0][i])
                or ("float" in result[0][i])
                or ("datetime" in result[0][i])
            ):
                tmp2 = list(result[0][i])
                tmp2.append("ordinal")
                result[0][i] = tuple(tmp2)
                result[1][i].append(1)
    elapsed_time = time.time() - start_time_guess
    print("Total statistical variable guessing time is : %.2f sec" % elapsed_time)

    Classifier.clear_session()
    return self.encoder.encode(result)
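
# Minimal runnable sketch (illustration only, with made-up scores) of the
# empty-column edge case handled above: rows of the prediction matrix that
# correspond to all-null input columns are zeroed, so the threshold step
# assigns them no semantic types.
import numpy as np
import pandas as pd

demo = pd.DataFrame({"a": ["1", "2"], "b": [None, None]})
scores = np.array([[0.9, 0.1, 0.3],   # predictions for column "a"
                   [0.6, 0.4, 0.8]])  # predictions for column "b" (all null)
scores[np.all(demo.isnull(), axis=0).to_numpy()] = 0
print(scores)  # the row for "b" is now all zeros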