def one_click_transform(data): enc = LabelEncoder() label_encoder = enc.fit(data[0:]) float_class = label_encoder.transform(data[0:]).astype(float) print "[INFO] Transforming Success, Categories Generated " print "[INFO] MAPPING: ", enc.get_params(deep = True) return float_class
def get_binary_encoded_xy_split(sample_size):
    """ Return feature set encoded as binary sparse matrix i.e
    ╔═════╦═══════════════╦═════════════════╦═════════════════════╦═════╗
    ║ Row ║ IncidentType  ║                 ║                     ║ ... ║
    ╠═════╬═══════════════╬═════════════════╬═════════════════════╬═════╣
    ║     ║ Fire Incident ║ Failure(System) ║ Failure (Structure) ║ ... ║
    ║ 0   ║ 1             ║ 0               ║ 0                   ║ ... ║
    ║ 1   ║ 0             ║ 1               ║ 0                   ║ ... ║
    ╚═════╩═══════════════╩═════════════════╩═════════════════════╩═════╝
    :param sample_size number of rows to return
           (pseudorandomly selected with MySQL RAND function)
    :return: X_train, X_test, y_train, y_test, y_encoder
    """
    Xy = _get_log_useful_cols(sample_size)
    categorical = [
        "ControllerName", "Direction", "Location", "SubLocation", "Lane",
        "Source", "Description", "Reporter", "Responder"
    ]
    X = Xy[categorical]
    # Forward-fill then back-fill so leading NaNs are covered as well.
    X = X.fillna(method="ffill").fillna(method="bfill")
    # BUG FIX: the original called .fillna("bfill"), which filled remaining
    # NaNs with the literal string "bfill" instead of back-filling.
    label = Xy["Category"].fillna(method="ffill").fillna(method="bfill")
    y_encoder = LabelEncoder().fit(label)
    print("Encoder Params: ", y_encoder.get_params())
    y_enc = y_encoder.transform(label)
    # One binary (one-hot) column group per categorical feature.
    X_enc_ls = [LabelBinarizer().fit_transform(X[cn]) for cn in X.columns]
    # Convert the detection timedelta into seconds (float).
    Xy["detection"] = Xy["detection"].apply(
        lambda x: x / np.timedelta64(1, 's'))
    # np.hstack accepts a sequence of arrays, so no functools.reduce needed;
    # the numeric columns are appended after the one-hot groups.
    X_enc = np.hstack(X_enc_ls + [Xy[["detection", "duration"]].values])
    print(X_enc.shape)
    X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc,
                                                        test_size=0.3)
    return X_train, X_test, y_train, y_test, y_encoder
class FixedLabelEncoder(BaseEstimator, TransformerMixin):
    """Wrap sklearn's LabelEncoder so its signature fits the transformer standard.

    sklearn's LabelEncoder uses fit(y)/transform(y); this adapter exposes the
    conventional fit(X, y=None)/transform(X) interface expected of transformers.
    """

    def __init__(self):
        self.encoder = SklearnLabelEncoder()

    def fit(self, X, y=None):
        """Fit the LabelEncoder.

        Args:
            X: iterable of labels
            y (optional): ignored, present for transformer API compatibility

        Returns:
            self
        """
        self.encoder.fit(X)
        return self

    def transform(self, X):
        """Transform using the fit LabelEncoder.

        Args:
            X: iterable

        Returns:
            array-like
        """
        return self.encoder.transform(X)

    def fit_transform(self, X, y=None):
        """Fit and transform in one step.

        Args:
            X: iterable
            y (optional): ignored; FIX: added to match the transformer
                standard this class exists to satisfy (backward compatible)

        Returns:
            array-like: Transformed samples
        """
        return self.encoder.fit_transform(X)

    def inverse_transform(self, X):
        """Transform labels back to original encoding.

        Args:
            X: iterable

        Returns:
            iterable: Inverted transformed samples
        """
        return self.encoder.inverse_transform(X)

    def get_params(self, deep=True):
        """Get parameters for this transformer. See super.

        Args:
            deep: deep to super get_params

        Returns:
            Params for this transformer. See super.
        """
        params = super().get_params(deep=deep)
        if not deep:
            params["encoder"] = self.encoder
        else:
            params["encoder"] = self.encoder.get_params(deep=deep)
        return params

    def set_params(self, **params):
        """Set parameters for this transformer. See super.

        FIX: the original popped "encoder" unconditionally (KeyError whenever
        it was absent) and implicitly returned None; sklearn's convention is
        to return self for chaining.

        Args:
            **params: params to set on this transformer.

        Returns:
            self
        """
        if "encoder" in params:
            self.encoder = params.pop("encoder")
        return super().set_params(**params)
# Load dataset
dataframe = pandas.read_csv("./resources/iris.data", header=None)
dataframe = shuffle(dataframe)
dataset = dataframe.values
features = dataset[:, 0:4].astype(float)
labels = dataset[:, 4]
print(features)
print(labels)

# Encode class values as integers
encoder = LabelEncoder()
encoder.fit(labels)
# FIX: get_params() on a LabelEncoder is always empty; classes_ holds the
# learned label -> integer mapping (integer == index in the array).
print(encoder.classes_)
encoded_labels = encoder.transform(labels)

# Convert integers to dummy variables (one hot encoding)
one_hot_labels = np_utils.to_categorical(encoded_labels)
print(one_hot_labels)

# Actual model
model = Sequential()
model.add(Dense(4, input_dim=4, activation="relu"))
# FIX: with categorical_crossentropy the output layer must be a softmax over
# the 3 mutually exclusive classes, not independent sigmoids.
model.add(Dense(3, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])
class ONNXTransformer(object):
    """
    This is a transformer to convert X [pandas.Dataframe, dask.Dataframe,
    equivalent] and y [array like] data into Onnx readable dtypes and formats.
    It is Serializable, so it can be reloaded at another time.

    Usage:
    >>> from ads.common.model_export_util import ONNXTransformer
    >>> onnx_data_transformer = ONNXTransformer(task="classification")
    >>> train_transformed = onnx_data_transformer.fit_transform(train.X, train.y)
    >>> test_transformed = onnx_data_transformer.transform(test.X, test.y)

    Parameters
    ----------
    task: str
        Either "classification" or "regression". This determines if y should
        be label encoded
    """

    def __init__(self, task=None):
        self.task = task
        self.cat_impute_values = {}  # column -> most frequent category (imputation value)
        self.cat_unique_values = {}  # column -> categories observed at fit time
        self.label_encoder = None    # LabelEncoder for y (classification with y only)
        self.dtypes = None           # column dtypes captured at fit time
        self._fitted = False

    def _handle_dtypes(self, X):
        """Cast numeric columns to float32 and impute categorical columns,
        recording the imputation value and seen categories per column so
        transform() can reproduce them on test data."""
        # Data type cast could be expensive doing it in a for loop,
        # especially with wide datasets, so cast the numerical columns
        # first, without a loop, then impute the categorical columns.
        dict_astype = {}
        for k, v in zip(X.columns, X.dtypes):
            if v in ['int64', 'int32', 'int16', 'int8'] or 'float' in str(v):
                dict_astype[k] = 'float32'
        _X = X.astype(dict_astype)
        for k in _X.columns[_X.dtypes != 'float32']:
            # SimpleImputer is not available for strings in ONNX-ML
            # specifications; replace NaNs with the most frequent category.
            self.cat_impute_values[k] = _X[k].value_counts().idxmax()
            _X[k] = _X[k].fillna(self.cat_impute_values[k])
            # Sklearn's OrdinalEncoder and LabelEncoder don't support unseen
            # categories in test data; remember the seen categories so
            # transform() can detect (and impute) new ones.
            self.cat_unique_values[k] = _X[k].unique()
        return _X

    def fit(self, X, y=None):
        """Fit the transformer: capture dtypes/imputation state from X and,
        for classification with y supplied, fit the label encoder."""
        _X = self._handle_dtypes(X)
        self.dtypes = _X.dtypes
        if self.task == 'classification' and y is not None:
            # Label encoding is required for SVC's onnx converter.
            self.label_encoder = LabelEncoder()
            y = self.label_encoder.fit_transform(y)
        self._fitted = True
        return self

    def transform(self, X, y=None):
        """Apply the fitted dtype casts, category imputation and (if fitted)
        label encoding; returns the tuple (transformed_X, transformed_y)."""
        # FIX: was `assert self._fitted, ...` -- asserts are stripped under
        # `python -O`, so validate with an explicit exception instead.
        if not self._fitted:
            raise RuntimeError('Call fit_transform first!')
        # Cast the numerical columns in one vectorized call, then loop only
        # over the categorical columns for imputation.
        _X = X.astype(self.dtypes)
        for k in _X.columns[_X.dtypes != 'float32']:
            # Replace unseen categories with NaNs and impute them with the
            # most frequent category recorded at fit time (SimpleImputer is
            # not available for strings in ONNX-ML specifications).
            _X.loc[~_X[k].isin(self.cat_unique_values[k]), k] = np.nan
            _X[k] = _X[k].fillna(self.cat_impute_values[k])
        if self.label_encoder is not None and y is not None:
            y = self.label_encoder.transform(y)
        return _X, y

    def fit_transform(self, X, y=None):
        """Fit on (X, y) and immediately transform them."""
        return self.fit(X, y).transform(X, y)

    def save(self, filename, **kwargs):
        """Serialize the fitted state as JSON to `filename`."""
        export_dict = {
            "task": {
                "value": self.task,
                "dtype": str(type(self.task))
            },
            "cat_impute_values": {
                "value": self.cat_impute_values,
                "dtype": str(type(self.cat_impute_values))
            },
            "cat_unique_values": {
                "value": self.cat_unique_values,
                "dtype": str(type(self.cat_unique_values))
            },
            "label_encoder": {
                "value": {
                    "params": self.label_encoder.get_params()
                    if hasattr(self.label_encoder, "get_params") else {},
                    "classes_": self.label_encoder.classes_.tolist()
                    if hasattr(self.label_encoder, "classes_") else []
                },
                "dtype": str(type(self.label_encoder))
            },
            "dtypes": {
                "value": {
                    "index": list(self.dtypes.index),
                    "values": [str(val) for val in self.dtypes.values]
                } if self.dtypes is not None else {},
                "dtype": str(type(self.dtypes))
            },
            "_fitted": {
                "value": self._fitted,
                "dtype": str(type(self._fitted))
            }
        }
        with open(filename, 'w') as f:
            json.dump(export_dict, f, sort_keys=True, indent=4,
                      separators=(',', ': '))

    @staticmethod
    def load(filename, **kwargs):
        """Reload a transformer previously written by save().

        Make sure you have pandas, numpy, and sklearn imported.
        """
        with open(filename, 'r') as f:
            export_dict = json.load(f)
        try:
            onnx_transformer = ONNXTransformer(
                task=export_dict['task']['value'])
        except Exception as e:
            # FIX: the original message contained a garbled placeholder;
            # report the offending file path instead.
            print(f"No task set in ONNXTransformer at {filename}")
            raise e
        for key in export_dict.keys():
            if key not in ["task", "label_encoder", "dtypes"]:
                try:
                    setattr(onnx_transformer, key, export_dict[key]["value"])
                except Exception as e:
                    # FIX: garbled placeholder replaced with key and path.
                    print(
                        f"Warning: Failed to reload {key} from {filename} "
                        f"to ONNXTransformer.")
                    raise e
        onnx_transformer.dtypes = pd.Series(
            data=[
                np.dtype(val)
                for val in export_dict["dtypes"]["value"]["values"]
            ],
            index=export_dict["dtypes"]["value"]["index"])
        le = LabelEncoder()
        le.set_params(**export_dict["label_encoder"]["value"]["params"])
        le.classes_ = np.asarray(
            export_dict["label_encoder"]["value"]["classes_"])
        onnx_transformer.label_encoder = le
        return onnx_transformer
# NOTE(review): Python 2 script fragment (statement-form `print`). The names
# `fout`, `string_labels`, `string_features`, `input_fname`, `wanted_lines`
# and `read_file_and_get_labels` are defined earlier in the file, outside
# this view.

# Write the final re-mapped line and close the output file.
# NOTE(review): no separator is inserted between string_labels and the first
# feature -- presumably string_labels already ends with ";"; confirm upstream.
re_mapped_line = "0.0;" + string_labels + ";".join(string_features)
fout.write(re_mapped_line + "\n")
fout.close()

print "Phase1:",
# Read the two raw label columns back from the input file.
labels1, labels2 = read_file_and_get_labels(input_fname, wanted_lines)

# Create the label encoders (mapping from big integer labels to consecutive
# small integer labels).
L1_encoder = LabelEncoder()
L1_encoder.fit(labels1)
L2_encoder = LabelEncoder()
L2_encoder.fit(labels2)

# NOTE(review): get_params() on a LabelEncoder returns an empty dict; the
# learned mapping lives in the classes_ attributes used below.
print L1_encoder.get_params()
print "DONE \nLabel counts of", wanted_lines, "lines are: \n\t", len(
    L1_encoder.classes_), len(L2_encoder.classes_)
print "\tmax class label values: ", max(L1_encoder.classes_), max(
    L2_encoder.classes_)

# Optional dump of the encoder class lists to CSV, kept disabled:
# with open("labels_1.csv","w+") as l1f:
#     # convert to list of strings
#     label_strings = map(str, L1_encoder.classes_.tolist())
#     l1f.write(",".join(label_strings))
#
# with open("labels_2.csv","w+") as l1f:
#     label_strings = map(str, L2_encoder.classes_.tolist())
#     l1f.write(",".join(label_strings))
# NOTE(review): Python 2 script fragment, a near-duplicate of the preceding
# chunk. The names `fout`, `input_fname`, `wanted_lines` and
# `read_file_and_get_labels` are defined earlier in the file, outside this
# view.

# Close the output file written by the preceding phase.
fout.close()

print "Phase1:",
# Read the two raw label columns back from the input file.
labels1, labels2 = read_file_and_get_labels(input_fname, wanted_lines)

# Create the label encoders (mapping from big integer labels to consecutive
# small integer labels).
L1_encoder = LabelEncoder();L1_encoder.fit(labels1)
L2_encoder = LabelEncoder();L2_encoder.fit(labels2)

# NOTE(review): get_params() on a LabelEncoder returns an empty dict; the
# learned mapping lives in the classes_ attributes used below.
print L1_encoder.get_params()
print "DONE \nLabel counts of",wanted_lines,"lines are: \n\t" , len(L1_encoder.classes_), len(L2_encoder.classes_)
print "\tmax class label values: ", max(L1_encoder.classes_), max(L2_encoder.classes_)

# Optional dump of the encoder class lists to CSV, kept disabled:
# with open("labels_1.csv","w+") as l1f:
#     # convert to list of strings
#     label_strings = map(str, L1_encoder.classes_.tolist())
#     l1f.write(",".join(label_strings))
#
# with open("labels_2.csv","w+") as l1f:
#     label_strings = map(str, L2_encoder.classes_.tolist())
#     l1f.write(",".join(label_strings))