示例#1
0
    def get_metadata(self, *args):
        """
        Transform arguments into a list of dictionaries describing them (used to log status, etc).

        Every entry carries the "type" of the matching argument. Entries for pandas
        dataframes also carry "rows", "schema" and "samples", and the dataframe's
        column layout is written to the factory's debug log.

        Returns:
            A list with one dictionary per argument, empty when no arguments are given.
        """
        output = []
        for i, arg in enumerate(args):
            meta = {"type": str(type(arg))}
            if isinstance(arg, pd.DataFrame):
                df = arg
                meta["rows"] = len(df)
                meta["schema"] = generate_schema(df)
                samples = analitico.pandas.pd_sample(df, DATAFRAME_SAMPLES)
                meta["samples"] = pd_to_dict(samples)

                # debugging help
                self.factory.debug("output[%d]: pd.DataFrame", i)
                self.factory.debug("  rows: %d", len(df))
                self.factory.debug("  columns: %d", len(df.columns))
                # walk labels and dtypes side by side: df.dtypes is indexed by column
                # label, so subscripting it with the position j is not a reliable lookup
                for j, (column, dtype) in enumerate(zip(df.columns, df.dtypes)):
                    self.factory.debug(
                        "  %3d %s (%s/%s)", j, column, dtype,
                        pandas_to_analitico_type(dtype))
            else:
                self.factory.debug("output[%d]: %s", i, meta["type"])
            output.append(meta)
        return output
示例#2
0
 def validate_schema(self, train_df, test_df):
     """
     Check training and test dataframes to make sure they have matching schemas.

     Arguments:
         train_df: training dataframe, its schema is always generated
         test_df: test dataframe or None, in which case nothing is compared

     Returns:
         The schema generated from train_df.

     Raises:
         PluginError: the dataframes differ in number of columns, or a pair of
             columns in the same position differs in name or type.
     """
     train_schema = generate_schema(train_df)
     # a dataframe has no unambiguous truth value (bool(df) raises ValueError),
     # so the presence of a test set needs to be checked against None
     if test_df is not None:
         train_columns = train_schema["columns"]
         test_columns = generate_schema(test_df)["columns"]
         if len(train_columns) != len(test_columns):
             msg = "{} - training data has {} columns while test data has {} columns".format(
                 self.name, len(train_columns), len(test_columns))
             raise PluginError(msg)
         for i, (train_column, test_column) in enumerate(zip(train_columns, test_columns)):
             if train_column["name"] != test_column["name"]:
                 msg = "{} - column {} of train '{}' and test '{}' have different names".format(
                     self.name, i, train_column["name"], test_column["name"])
                 raise PluginError(msg)
             if train_column["type"] != test_column["type"]:
                 msg = "{} - column {} of train '{}' and test '{}' have different types".format(
                     self.name, i, train_column["type"], test_column["type"])
                 raise PluginError(msg)
     return train_schema
示例#3
0
    def test_dataset_csv4_applyschema_index(self):
        """ Test reading a table then making a column its index """
        df = self.read_dataframe_asset("ds_test_4.json")
        schema = generate_schema(df)

        # before applying anything: three columns expected and no named index
        self.assertEqual(len(schema["columns"]), 3)
        self.assertIsNone(df.index.name)

        # flag the first column as the index then apply the modified schema
        schema["columns"][0]["index"] = True
        df = apply_schema(df, schema)

        self.assertEqual(df.index.name, "First")
示例#4
0
    def test_dataset_csv4_applyschema_rename(self):
        """ Test reading a table then renaming a column """
        df = self.read_dataframe_asset("ds_test_4.json")
        schema = generate_schema(df)

        # before applying anything: three columns expected, the second named "Second"
        self.assertEqual(len(schema["columns"]), 3)
        self.assertEqual(df.columns[1], "Second")

        # request a rename of the second column then apply the modified schema
        schema["columns"][1]["rename"] = "Secondo"
        df = apply_schema(df, schema)

        self.assertEqual(df.columns[1], "Secondo")
示例#5
0
    def run(self, *args, action=None, **kwargs):
        """
        Run the parent pipeline, then store the dataframe it produced as artifacts.

        Writes data.csv plus a data.csv.info json file holding the generated schema
        into the factory's artifacts directory and returns the dataframe. When the
        pipeline result is not a pandas dataframe a warning is logged and None is
        returned without writing anything.
        """
        result = super().run(*args, action=action, **kwargs)

        if isinstance(result, pd.DataFrame):
            # data.csv: the index column is written only when it carries a name,
            # meaning it was created explicitly
            data_path = os.path.join(self.factory.get_artifacts_directory(), "data.csv")
            result.to_csv(data_path, index=bool(result.index.name))

            # data.csv.info: the schema describing the csv that was just written
            info = {"schema": generate_schema(result)}
            analitico.utilities.save_json(info, data_path + ".info")
            return result

        self.logger.warn(
            "DataframePipelinePlugin.run - pipeline didn't produce a valid dataframe"
        )
        return None
示例#6
0
    def test_dataset_csv7_autoschema(self):
        """ Test automatically generating an analitico schema from a pandas dataframe """
        df = self.read_dataframe_asset("ds_test_7_autoschema.json")
        columns = generate_schema(df)["columns"]

        # attributes that each generated column is expected to expose, in column order
        expected = [
            {"name": "name", "type": "string"},
            {"name": "slug", "type": "category"},
            {"name": "parent_id", "type": "float"},
            {"name": "depth", "type": "integer"},
            {"name": "priority", "type": "integer"},
            {"name": "max_weight", "type": "integer"},
            {"name": "frozen", "type": "boolean"},
            {"name": "rate", "type": "float"},
            {"name": "has_ingredients_book", "type": "boolean"},
            {"name": "indice", "type": "integer", "index": True},
            {"name": "updated_at", "type": "datetime"},
            {"name": "elapsed", "type": "timespan"},
        ]

        self.assertEqual(len(columns), len(expected))
        for column, attributes in zip(columns, expected):
            for key, value in attributes.items():
                self.assertEqual(column[key], value)
示例#7
0
    def train(self, train, test, results, *args, **kwargs):
        """
        Train with algorithm and given data to produce a trained model.

        Arguments:
            train: pandas dataframe with at least two columns, used for training
            test: dataframe used for testing or None, in which case a test set
                is split off the training data
            results: dictionary updated in place; its "data", "parameters",
                "performance" and "scores" entries are written to and are
                indexed without being created here

        Returns:
            The updated results dictionary. Any exception raised while training
            is passed to self.exception.
        """
        try:
            assert isinstance(train, pd.DataFrame) and len(train.columns) > 1
            train_df = train
            test_df = test

            # if not specified the prediction target will be the last column of the dataset
            label = self.get_attribute("data.label")
            if not label:
                label = train_df.columns[-1]
            results["data"]["label"] = label

            # choose between regression, binary classification and multiclass classification
            label_type = analitico.schema.get_column_type(train_df, label)
            self.info("label: %s", label)
            self.info("label_type: %s", label_type)
            if label_type == analitico.schema.ANALITICO_TYPE_CATEGORY:
                label_classes = list(train_df[label].cat.categories)
                results["data"]["classes"] = label_classes
                # NOTE(review): train_df is still the caller's dataframe here, so this
                # assignment modifies the object that was passed in. Category codes are
                # also produced before the dropna below, and a supplied test set is not
                # encoded the same way - confirm whether this is intended.
                train_df[label] = train_df[label].cat.codes
                results["algorithm"] = (
                    ALGORITHM_TYPE_BINARY_CLASSICATION if len(label_classes)
                    == 2 else ALGORITHM_TYPE_MULTICLASS_CLASSIFICATION)
                self.info("classes: %s", label_classes)
            else:
                results["algorithm"] = ALGORITHM_TYPE_REGRESSION
            self.info("algorithm: %s", results["algorithm"])

            # remove rows with missing label from training and test sets
            train_rows = len(train_df)
            train_df = train_df.dropna(subset=[label])
            if len(train_df) < train_rows:
                self.warning("Training data has %s rows without '%s' label",
                             train_rows - len(train_df), label)
            # a dataframe has no unambiguous truth value (bool(df) raises ValueError),
            # so the presence of a test set is checked against None
            if test_df is not None:
                test_rows = len(test_df)
                test_df = test_df.dropna(subset=[label])
                if len(test_df) < test_rows:
                    self.warning("Test data has %s rows without '%s' label",
                                 test_rows - len(test_df), label)

            # make sure schemas match
            train_schema = self.validate_schema(train_df, test_df)

            # shortened training was requested?
            tail = self.get_attribute("parameters.tail", 0)
            if tail > 0:
                self.info("Tail: %d, cutting training data", tail)
                train_df = train_df.tail(tail).copy()

            # create test set from training set if not provided
            if test_df is None:
                # decide how to create test set from settings variable
                # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
                chronological = self.get_attribute("data.chronological", False)
                test_size = self.get_attribute("parameters.test_size", 0.20)
                results["data"]["chronological"] = chronological
                results["parameters"]["test_size"] = test_size
                if chronological:
                    # test set is taken from the last rows (chronological order)
                    self.info("Test set split: chronological")
                    test_rows = int(len(train_df) * test_size)
                    # split on an explicit position: slicing with -test_rows would
                    # swap the two sets when test_rows is 0, since [-0:] selects
                    # every row and [:-0] selects none
                    split_at = max(len(train_df) - test_rows, 0)
                    test_df = train_df.iloc[split_at:]
                    train_df = train_df.iloc[:split_at]
                else:
                    # test set is taken from a random assortment of rows
                    self.info("Test set split: random")
                    train_df, test_df, = train_test_split(train_df,
                                                          test_size=test_size,
                                                          random_state=42)

            self.info("training: %d rows", len(train_df))
            self.info("testing: %d rows", len(test_df))

            # validate data types
            for column in train_schema["columns"]:
                if column["type"] not in ("integer", "float", "boolean",
                                          "category"):
                    self.warning(
                        "Column '%s' of type '%s' is incompatible and will be dropped",
                        column["name"], column["type"])
                    train_df = train_df.drop(column["name"], axis=1)
                    test_df = test_df.drop(column["name"], axis=1)

            # save schema after dropping unused columns
            results["data"]["schema"] = generate_schema(train_df)
            results["data"]["source_records"] = len(train)
            results["data"]["training_records"] = len(train_df)
            results["data"]["test_records"] = len(test_df)
            results["data"]["dropped_records"] = len(train) - len(
                train_df) - len(test_df)

            # save some training data for debugging
            artifacts_path = self.factory.get_artifacts_directory()
            self.info("artifacts_path: %s", artifacts_path)

            samples_df = analitico.pandas.pd_sample(train_df, 200)
            samples_path = os.path.join(artifacts_path,
                                        "training-samples.json")
            samples_df.to_json(samples_path, orient="records")
            self.info("saved: %s (%d bytes)", samples_path,
                      os.path.getsize(samples_path))
            samples_path = os.path.join(artifacts_path, "training-samples.csv")
            samples_df.to_csv(samples_path)
            self.info("saved: %s (%d bytes)", samples_path,
                      os.path.getsize(samples_path))

            # split data and labels
            train_labels = train_df[label]
            train_df = train_df.drop([label], axis=1)
            test_labels = test_df[label]
            test_df = test_df.drop([label], axis=1)

            # indexes of columns that should be considered categorical
            categorical_idx = self.get_categorical_idx(train_df)
            train_pool = catboost.Pool(train_df,
                                       train_labels,
                                       cat_features=categorical_idx)
            test_pool = catboost.Pool(test_df,
                                      test_labels,
                                      cat_features=categorical_idx)

            # create regressor or classifier then train
            training_on = time_ms()
            model = self.create_model(results)
            model.fit(train_pool, eval_set=test_pool)
            results["performance"]["training_ms"] = time_ms(training_on)

            # score test set, add related metrics to results
            self.score_training(model, test_df, test_pool, test_labels,
                                results)
            if results["algorithm"] == ALGORITHM_TYPE_REGRESSION:
                self.score_regressor_training(model, test_df, test_pool,
                                              test_labels, results)
            else:
                self.score_classifier_training(model, test_df, test_pool,
                                               test_labels, results)

            # save model file and training results
            model_path = os.path.join(artifacts_path, "model.cbm")
            model.save_model(model_path)
            results["scores"]["model_size"] = os.path.getsize(model_path)
            self.info("saved: %s (%d bytes)", model_path,
                      os.path.getsize(model_path))
            return results

        except Exception as exc:
            # NOTE(review): presumably self.exception logs and raises; if it returns
            # instead, this method falls through and returns None - confirm
            self.exception("CatBoostPlugin - error while training: %s",
                           str(exc),
                           exception=exc)