from functools import reduce

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder


def one_click_transform(data):

    enc = LabelEncoder()
    enc.fit(data)
    float_class = enc.transform(data).astype(float)

    print("[INFO] Transforming succeeded, categories generated")
    # LabelEncoder has no tunable parameters, so get_params() returns {};
    # the actual label-to-code mapping is enc.classes_ (code i -> classes_[i])
    print("[INFO] PARAMS: ", enc.get_params(deep=True))
    print("[INFO] MAPPING: ", dict(enumerate(enc.classes_)))

    return float_class
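
# A hypothetical call on toy data (labels are made up for illustration); the
# real category-to-code mapping lives in classes_, not in get_params()
print(one_click_transform(["cat", "dog", "cat"]))  # -> [0. 1. 0.]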
def get_binary_encoded_xy_split(sample_size):
    """
    Return feature set encoded as binary sparse matrix
    e.g.:
    ╔═════╦═══════════════╦═════════════════╦═════════════════════╦═════╗
    ║ Row ║ IncidentType  ║                 ║                     ║ ... ║
    ╠═════╬═══════════════╬═════════════════╬═════════════════════╬═════╣
    ║     ║ Fire Incident ║ Failure(System) ║ Failure (Structure) ║ ... ║
    ║   0 ║ 1             ║ 0               ║ 0                   ║ ... ║
    ║   1 ║ 0             ║ 1               ║ 0                   ║ ... ║
    ╚═════╩═══════════════╩═════════════════╩═════════════════════╩═════╝

    :param sample_size: number of rows to return (pseudorandomly selected with the MySQL RAND() function)
    :return: query result
    """
    Xy = _get_log_useful_cols(sample_size)

    categorical = [
        "ControllerName", "Direction", "Location", "SubLocation", "Lane",
        "Source", "Description", "Reporter", "Responder"
    ]
    # categorical = ["Direction"]
    X = Xy[categorical]
    X = X.fillna(method="ffill").fillna(method="bfill")
    label = Xy["Category"].fillna(method="ffill").fillna(method="bfill")

    y_encoder = LabelEncoder().fit(label)
    print("Encoder Params: ", y_encoder.get_params())
    y_enc = y_encoder.transform(label)
    X_enc_ls = [LabelBinarizer().fit_transform(X[cn]) for cn in X.columns]
    Xy["detection"] = Xy["detection"].apply(
        lambda x: x / np.timedelta64(1, 's'))

    # Horizontally stack the per-column binary matrices into one feature matrix
    X_enc = reduce(lambda a, b: np.hstack((a, b)), X_enc_ls)
    X_enc = np.hstack((X_enc, Xy[["detection", "duration"]].values))
    print(X_enc.shape)
    X_train, X_test, y_train, y_test = train_test_split(X_enc,
                                                        y_enc,
                                                        test_size=0.3)
    return X_train, X_test, y_train, y_test, y_encoder
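
# A standalone sketch (toy labels, not the MySQL-backed data above) of how a
# single LabelBinarizer call builds the one-column-per-category binary matrix
# illustrated in the docstring table; columns follow sorted class order
demo_lb = LabelBinarizer()
demo_matrix = demo_lb.fit_transform(
    ["Fire Incident", "Failure(System)", "Failure (Structure)"])
print(demo_lb.classes_)  # column order of the binary matrix
print(demo_matrix)       # exactly one 1 per row, in that row's category column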
Example #3
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder as SklearnLabelEncoder


class FixedLabelEncoder(BaseEstimator, TransformerMixin):
    """Fix LabelEncoder function signature to fit transformer standard."""

    def __init__(self):
        self.encoder = SklearnLabelEncoder()

    def fit(self, X, y=None):
        """Fit the LabelEncoder.

        Args:
            X: iterable
            y (optional): iterable

        Returns:
            self

        """
        self.encoder.fit(X)
        return self

    def transform(self, X):
        """Transform using the fit LabelEncoder.

        Args:
            X: iterable

        Returns:
            array-like

        """
        return self.encoder.transform(X)

    def fit_transform(self, X, y=None):
        """Fit and transform in one step.

        Args:
            X: iterable
            y (optional): ignored; accepted for transformer API compatibility

        Returns:
            array-like: Transformed samples

        """
        return self.encoder.fit_transform(X)

    def inverse_transform(self, X):
        """Transform labels back to original encoding.

        Args:
            X: iterable

        Returns:
            iterable: Inverted transformed samples

        """
        return self.encoder.inverse_transform(X)

    def get_params(self, deep=True):
        """Get parameters for this transformer. See super.

        Args:
            deep: deep to super get_params

        Returns:
            Params for this transformer. See super.

        """
        params = super().get_params(deep=deep)
        if not deep:
            params["encoder"] = self.encoder
        else:
            params["encoder"] = self.encoder.get_params(deep=deep)
        return params

    def set_params(self, **params):
        """Set parameters for this transformer. See super.

        Args:
            **params: params to set on this transformer.

        """
        # Guard the pop: without it, calling set_params without an "encoder"
        # key would raise KeyError
        if "encoder" in params:
            self.encoder = params.pop("encoder")
        super().set_params(**params)
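
# A usage sketch: the raw LabelEncoder's fit(y)/transform(y) signatures don't
# match the transformer convention, so it can't be composed like a normal
# step; the wrapper restores the fit(X, y)/transform(X) shape
fle = FixedLabelEncoder()
codes = fle.fit(["red", "green", "blue", "green"]).transform(["green", "red"])
print(codes)                           # consecutive integer codes
print(fle.inverse_transform(codes))    # back to the original string labels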
import pandas
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load dataset
dataframe = pandas.read_csv("./resources/iris.data", header=None)
dataframe = shuffle(dataframe)
dataset = dataframe.values

features = dataset[:, 0:4].astype(float)
labels = dataset[:, 4]

print(features)
print(labels)

# Encode class values as integers
encoder = LabelEncoder()
encoder.fit(labels)
print(encoder.get_params())  # {} -- LabelEncoder has no tunable parameters
encoded_labels = encoder.transform(labels)

# Convert integers to dummy variables (one hot encoding)
one_hot_labels = np_utils.to_categorical(encoded_labels)
print(one_hot_labels)

# Actual model
model = Sequential()
model.add(Dense(4, input_dim=4, activation="relu"))
model.add(Dense(3, activation="softmax"))  # softmax: one probability per class for categorical_crossentropy

model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])
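
# A minimal training sketch; the epoch count and batch size are illustrative
# choices, not taken from the original snippet
model.fit(features, one_hot_labels, epochs=100, batch_size=5, verbose=0)
loss, accuracy = model.evaluate(features, one_hot_labels, verbose=0)
print("train accuracy: %.3f" % accuracy)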
Example #5
import json

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


class ONNXTransformer(object):
    """
    This is a transformer to convert X [pandas.DataFrame, dask.DataFrame, or equivalent] and y [array-like] data into
    ONNX-readable dtypes and formats. It is serializable, so it can be reloaded at another time.

    Usage:
    >>> from ads.common.model_export_util import ONNXTransformer
    >>> onnx_data_transformer = ONNXTransformer(task="classification")
    >>> train_transformed = onnx_data_transformer.fit_transform(train.X, train.y)
    >>> test_transformed = onnx_data_transformer.transform(test.X, test.y)

    Parameters
    ----------
    task: str
        Either "classification" or "regression". This determines if y should be label encoded
    """
    def __init__(self, task=None):
        self.task = task
        self.cat_impute_values = {}
        self.cat_unique_values = {}
        self.label_encoder = None
        self.dtypes = None
        self._fitted = False

    def _handle_dtypes(self, X):
        # Data type cast could be expensive doing it in for loop
        # Especially with wide datasets
        # So cast the numerical columns first, without loop
        # Then impute categorical columns
        dict_astype = {}
        for k, v in zip(X.columns, X.dtypes):
            if v in ['int64', 'int32', 'int16', 'int8'] or 'float' in str(v):
                dict_astype[k] = 'float32'
        _X = X.astype(dict_astype)
        for k in _X.columns[_X.dtypes != 'float32']:
            # SimpleImputer is not available for strings in ONNX-ML specifications
            # Replace NaNs with the most frequent category
            self.cat_impute_values[k] = _X[k].value_counts().idxmax()
            _X[k] = _X[k].fillna(self.cat_impute_values[k])
            # Sklearn's OrdinalEncoder and LabelEncoder don't support unseen categories in test data
            # Record the categories seen at fit time so unseen ones can be flagged in transform()
            # (stored as a plain list so save() can JSON-serialize it)
            self.cat_unique_values[k] = _X[k].unique().tolist()
        return _X

    def fit(self, X, y=None):
        _X = self._handle_dtypes(X)
        self.dtypes = _X.dtypes
        if self.task == 'classification' and y is not None:
            # Label encoding is required for SVC's onnx converter
            self.label_encoder = LabelEncoder()
            y = self.label_encoder.fit_transform(y)

        self._fitted = True
        return self

    def transform(self, X, y=None):
        assert self._fitted, 'Call fit or fit_transform first!'
        # Data type cast could be expensive doing it in for loop
        # Especially with wide datasets
        # So cast the numerical columns first, without loop
        # Then impute categorical columns
        _X = X.astype(self.dtypes)
        for k in _X.columns[_X.dtypes != 'float32']:
            # Replace unseen categories with NaNs and impute them
            _X.loc[~_X[k].isin(self.cat_unique_values[k]), k] = np.nan
            # SimpleImputer is not available for strings in ONNX-ML specifications
            # Replace NaNs with the most frequent category
            _X[k] = _X[k].fillna(self.cat_impute_values[k])

        if self.label_encoder is not None and y is not None:
            y = self.label_encoder.transform(y)

        return _X, y

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X, y)

    def save(self, filename, **kwargs):
        export_dict = {
            "task": {
                "value": self.task,
                "dtype": str(type(self.task))
            },
            "cat_impute_values": {
                "value": self.cat_impute_values,
                "dtype": str(type(self.cat_impute_values))
            },
            "cat_unique_values": {
                "value": self.cat_unique_values,
                "dtype": str(type(self.cat_unique_values))
            },
            "label_encoder": {
                "value": {
                    "params":
                    self.label_encoder.get_params() if hasattr(
                        self.label_encoder, "get_params") else {},
                    "classes_":
                    self.label_encoder.classes_.tolist() if hasattr(
                        self.label_encoder, "classes_") else []
                },
                "dtype": str(type(self.label_encoder))
            },
            "dtypes": {
                "value": {
                    "index": list(self.dtypes.index),
                    "values": [str(val) for val in self.dtypes.values]
                } if self.dtypes is not None else {},
                "dtype": str(type(self.dtypes))
            },
            "_fitted": {
                "value": self._fitted,
                "dtype": str(type(self._fitted))
            }
        }
        with open(filename, 'w') as f:
            json.dump(export_dict,
                      f,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))

    @staticmethod
    def load(filename, **kwargs):
        # Make sure pandas, numpy, and sklearn are imported
        with open(filename, 'r') as f:
            export_dict = json.load(f)
        try:
            onnx_transformer = ONNXTransformer(
                task=export_dict['task']['value'])
        except Exception as e:
            print(f"No task set in ONNXTransformer at {filename}")
            raise e
        for key in export_dict.keys():
            if key not in ["task", "label_encoder", "dtypes"]:
                try:
                    setattr(onnx_transformer, key, export_dict[key]["value"])
                except Exception as e:
                    print(
                        f"Warning: Failed to reload from {filename} to ONNXTransformer."
                    )
                    raise e
        onnx_transformer.dtypes = pd.Series(
            data=[
                np.dtype(val)
                for val in export_dict["dtypes"]["value"]["values"]
            ],
            index=export_dict["dtypes"]["value"]["index"])
        le = LabelEncoder()
        le.set_params(**export_dict["label_encoder"]["value"]["params"])
        le.classes_ = np.asarray(
            export_dict["label_encoder"]["value"]["classes_"])
        onnx_transformer.label_encoder = le
        return onnx_transformer
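
# A minimal save/load round-trip sketch on toy data; the DataFrame contents
# and the "onnx_transformer.json" filename are illustrative, not part of the
# original class
toy_X = pd.DataFrame({"colour": ["red", "blue", None], "size": [1.0, 2.0, 3.0]})
toy_y = ["a", "b", "a"]

onnx_transformer = ONNXTransformer(task="classification")
X_t, y_t = onnx_transformer.fit_transform(toy_X, toy_y)
onnx_transformer.save("onnx_transformer.json")

restored = ONNXTransformer.load("onnx_transformer.json")
X_new, _ = restored.transform(
    pd.DataFrame({"colour": ["green", "red"], "size": [4.0, 5.0]}), None)
# "green" was unseen at fit time, so it is replaced with the most frequent
# fit-time category before being returned
print(X_new)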
Example #6
            re_mapped_line = "0.0;" + string_labels + ";".join(string_features)
            fout.write(re_mapped_line + "\n")
    fout.close()


print "Phase1:",

labels1, labels2 = read_file_and_get_labels(input_fname, wanted_lines)

# create the label encoders (mapping from big_integer_labels -> consecutive_small_integer_labels)
L1_encoder = LabelEncoder()
L1_encoder.fit(labels1)
L2_encoder = LabelEncoder()
L2_encoder.fit(labels2)

print(L1_encoder.get_params())

print "DONE \nLabel counts of", wanted_lines, "lines are: \n\t", len(
    L1_encoder.classes_), len(L2_encoder.classes_)
print "\tmax class label values: ", max(L1_encoder.classes_), max(
    L2_encoder.classes_)

# with open("labels_1.csv","w+") as l1f:
#     # convert to list of strings
#     label_strings = map(str, L1_encoder.classes_.tolist())
#     l1f.write(",".join(label_strings))
#
# with open("labels_2.csv","w+") as l1f:
#     label_strings = map(str, L2_encoder.classes_.tolist())
#     l1f.write(",".join(label_strings))
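
# A tiny illustration (made-up label values) of the mapping described above:
# LabelEncoder maps arbitrary integer labels to consecutive codes 0..n-1,
# ordered by sorted label value
demo_encoder = LabelEncoder()
demo_encoder.fit([900042, 900001, 900007, 900001])
print(demo_encoder.classes_)             # [900001 900007 900042]
print(demo_encoder.transform([900042]))  # [2]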