Example #1
    def _run_train(self, *args, **kwargs):
        """ 
        When an algorithm runs it always takes in a dataframe with training data,
        it may optionally have a dataframe of validation data and will return a dictionary
        with information on the trained model plus a number of artifacts.
        """
        assert isinstance(args[0], pd.DataFrame)
        started_on = time_ms()
        results = collections.OrderedDict({
            "type": "analitico/training",
            "plugins": {
                "training": self.Meta.name,  # plugin used to train model
                "prediction": self.Meta.
                name,  # plugin to be used for predictions (usually the same)
            },
            "data": {},  # number of records, etc
            "parameters": {},  # model parameters, hyperparameters
            "scores": {},  # training scores
            "performance":
            get_runtime_brief(),  # time elapsed, cpu, gpu, memory, disk, etc
        })

        train = args[0]
        test = args[1] if len(args) > 1 else None
        results = self.train(train, test, results, *args, **kwargs)

        # finalize results and save as metadata.json
        results["performance"]["total_ms"] = time_ms(started_on)
        artifacts_path = self.factory.get_artifacts_directory()
        results_path = os.path.join(artifacts_path, "metadata.json")
        save_json(results, results_path)
        self.info("saved %s (%d bytes)", results_path,
                  os.path.getsize(results_path))
        return results
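All of these examples share the same time_ms helper: called with no argument it returns the current time in milliseconds, called with an earlier timestamp it returns the milliseconds elapsed since then. The real helper lives in the analitico utilities; this minimal sketch of the assumed convention is for illustration only:

    import time

    def time_ms(started_on=None):
        # current time in ms, or ms elapsed since started_on (assumed semantics)
        now = int(round(time.time() * 1000))
        return now if started_on is None else now - started_on

    started_on = time_ms()
    # ... do some work ...
    elapsed_ms = time_ms(started_on)  # elapsed ms, as stored in results["performance"]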
Example #2
    def run(self, *args, action=None, **kwargs):
        """ Process plugins in sequence, return combined result """
        try:
            pipeline_on = time_ms()

            # logging is expensive so we don't track everything in prediction mode
            predicting = action and ACTION_PREDICT in action
            if not predicting:
                self.factory.status(self, status.STATUS_RUNNING)
            output = None  # last plugin's output; guards the log below when there are no plugins

            for p, plugin in enumerate(self.plugins):
                plugin_on = time_ms()
                if not predicting:
                    self.factory.status(plugin, status.STATUS_RUNNING)

                # a plugin can have one or more input parameters and one or more
                # output parameters. results from one call are passed as a tuple
                # to the next plugin in the chain. when we finally return, a
                # single result is unpacked, otherwise we return the tuple. this
                # allows a pipeline to chain plugins with a variable number of
                # parameters. each plugin is responsible for validating the type
                # of its input positional and named parameters.
                try:
                    args = plugin.run(*args, action=action, **kwargs)
                    if not isinstance(args, tuple):
                        args = (args, )
                except Exception as e:
                    self.factory.status(plugin,
                                        status.STATUS_FAILED,
                                        exception=e)
                    raise

                # log outputs of plugin
                # TODO skip when predicting
                if not predicting:
                    output = self.get_metadata(*args)
                    self.factory.status(plugin,
                                        status.STATUS_COMPLETED,
                                        elapsed_ms=time_ms(plugin_on),
                                        output=output)

            if not predicting:
                # log outputs of pipeline
                self.factory.status(self,
                                    status.STATUS_COMPLETED,
                                    elapsed_ms=time_ms(pipeline_on),
                                    output=output)
            return args if len(args) > 1 else args[0]

        except Exception as e:
            self.factory.status(self, status.STATUS_FAILED)
            self.factory.exception(self.Meta.name + " failed while processing",
                                   item=self,
                                   exception=e)
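The long comment in the loop describes the chaining convention: each plugin's output becomes the next plugin's positional input, single results are wrapped in a one-element tuple between steps, and the final result is unpacked again if it is a single value. A toy sketch of that convention with hypothetical stand-in plugins:

    class AddOne:
        def run(self, *args, action=None, **kwargs):
            return args[0] + 1  # single result

    class DoubleAndTriple:
        def run(self, *args, action=None, **kwargs):
            return args[0] * 2, args[0] * 3  # two results, returned as a tuple

    def run_pipeline(plugins, *args):
        for plugin in plugins:
            args = plugin.run(*args)
            if not isinstance(args, tuple):
                args = (args,)  # normalize so the next step can always unpack
        return args if len(args) > 1 else args[0]

    print(run_pipeline([AddOne(), DoubleAndTriple()], 1))  # (4, 6)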
Example #3
    def retrieve_df(self, *args, action=None, **kwargs):
        """ Retrieve dataframe from dataset with id set in plugin's configuration """
        try:
            dataset_id = self.get_attribute("dataset_id")
            if not dataset_id:
                dataset_id = self.get_attribute("source.dataset_id")
                if not dataset_id:
                    self.exception(
                        "DatasetSourcePlugin - must specify 'dataset_id'")

            info_url = "analitico://datasets/" + dataset_id + "/data/info"
            self.info("reading: %s", info_url)

            info = self.factory.get_url_json(info_url)
            schema = get_dict_dot(info, "data.schema", None)
            if not schema:
                self.warning(
                    "DatasetSourcePlugin - %s does not contain schema information",
                    info_url)

            # save the schema for the source so it can be used to enforce it on prediction
            self.set_attribute("source.schema", schema)

            # stream data from dataset endpoint or storage as csv
            csv_url = "analitico://datasets/" + dataset_id + "/data/csv"
            csv_stream = self.factory.get_url_stream(csv_url, binary=False)

            reading_on = time_ms()
            self.info("reading: %s", csv_url)
            df = analitico.pandas.pd_read_csv(csv_stream, schema)
            self.info("%d rows in %d ms", len(df), time_ms(reading_on))

            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
            sample = self.get_attribute("sample", 0)
            if sample > 0:
                rows_before = len(df)
                df = analitico.pandas.pd_sample(df, sample)
                self.info("sample: %f, rows before: %d, rows after: %d",
                          sample, rows_before, len(df))

            tail = self.get_attribute("tail", 0)
            if tail > 0:
                rows_before = len(df)
                df = df.tail(tail)
                self.info("tail: %d, rows before: %d, rows after: %d", tail,
                          rows_before, len(df))

            return df

        except Exception:
            raise  # re-raise preserving the original traceback
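The attribute names read above imply a plugin configuration with a dataset id plus optional sampling and tail settings. A hedged sketch of what that configuration might look like; the exact format is an assumption based on the get_attribute calls, and the sample value is assumed to be a fraction given the %f logging:

    settings = {
        "dataset_id": "ds_titanic",  # or nested under source.dataset_id
        "sample": 0.1,               # fraction passed to pd_sample, 0 disables sampling
        "tail": 1000,                # keep only the last 1000 rows, 0 disables
    }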
Example #4
    def drop_na_rows(self, df, column):
        """ Drops rows with null values in given column, logs action """
        started_on = time_ms()
        rows_before = len(df.index)
        if rows_before < 1:
            self.warning(
                "Can't drop null '%s' rows because dataframe is empty", column)
            return df
        df.dropna(subset=[column], inplace=True)
        rows_after = len(df.index)
        rows_dropped = rows_before - rows_after
        msg = "Dropped rows where '%s' is null, rows before: %d, after: %d, dropped: %d (%.2f%%) in %d ms"
        self.info(
            msg,
            column,
            rows_before,
            rows_after,
            rows_dropped,
            (100.0 * rows_dropped) / rows_before,
            time_ms(started_on),
        )
        return df
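A pure-pandas sketch of what the helper does, including the percentage it logs (computed against the row count before the drop):

    import pandas as pd

    df = pd.DataFrame({"price": [10.0, None, 30.0, None], "qty": [1, 2, 3, 4]})
    rows_before = len(df.index)
    df.dropna(subset=["price"], inplace=True)  # same call the helper makes
    rows_dropped = rows_before - len(df.index)
    print(rows_dropped, "%.2f%%" % (100.0 * rows_dropped / rows_before))  # 2 50.00%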
Example #5
    def _run_predict(self, *args, **kwargs):
        """ 
        When an algorithm runs it always takes in a dataframe with training data,
        it may optionally have a dataframe of validation data and will return a dictionary
        with information on the trained model plus a number of artifacts.
        """
        # assert isinstance(args[0], pandas.DataFrame) # custom models may take json as input
        data = args[0]

        artifacts_path = self.factory.get_artifacts_directory()
        training = read_json(os.path.join(artifacts_path, "metadata.json"))
        assert training

        started_on = time_ms()
        results = collections.OrderedDict({
            "type": "analitico/prediction",
            # "endpoint_id": None,
            # "model_id": None,
            # "job_id": None,
            # "records": None,  # processed (augmented) data will be added by IAlgorithm
            # "predictions": None,  # predictions
            # "probabilities": None,
            "performance":
            get_runtime_brief(),  # time elapsed, cpu, gpu, memory, disk, etc
        })

        # force schema like in training data
        if isinstance(data, pd.DataFrame):
            schema = training["data"]["schema"]
            data = apply_schema(data, schema)

        # load model, calculate predictions
        results = self.predict(data, training, results, *args, **kwargs)
        results["performance"]["total_ms"] = time_ms(started_on)

        results_path = os.path.join(artifacts_path, "results.json")
        save_json(results, results_path)

        return results
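_run_train and _run_predict share the artifacts directory as a contract: training saves metadata.json, prediction reads it back to recover the schema before scoring and saves results.json beside it. A minimal sketch of that round trip; the paths and payload shape are assumptions drawn from the two runners above:

    import json, os

    artifacts_path = "/tmp/artifacts"  # stand-in for factory.get_artifacts_directory()
    os.makedirs(artifacts_path, exist_ok=True)

    # training side: persist what prediction will need later
    metadata = {"type": "analitico/training", "data": {"schema": {"columns": []}}}
    with open(os.path.join(artifacts_path, "metadata.json"), "w") as f:
        json.dump(metadata, f)

    # prediction side: read it back and enforce the training schema
    with open(os.path.join(artifacts_path, "metadata.json")) as f:
        training = json.load(f)
    schema = training["data"]["schema"]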
Example #6
    def drop_selected_rows(self, df, df_dropped, message=None):
        """ Drops df_dropped rows from df in place, logs action """
        started_on = time_ms()
        rows_before = len(df.index)
        if rows_before < 1:
            self.warning(
                "Can't drop rows where '%s' because dataframe is empty",
                message)
            return df
        df.drop(df_dropped.index, inplace=True)
        if message:
            rows_after = len(df.index)
            rows_dropped = rows_before - rows_after
            msg = "Dropped rows where '%s', rows before: %d, after: %d, dropped: %d (%.2f%%) in %d ms"
            self.info(
                msg,
                message,
                rows_before,
                rows_after,
                rows_dropped,
                (100.0 * rows_dropped) / rows_before,
                time_ms(started_on),
            )
        return df
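Usage sketch: the caller selects the rows to remove first, then passes the selection in so the drop happens in place and is logged with a readable message. The pandas equivalent of the core call:

    import pandas as pd

    df = pd.DataFrame({"price": [5, 250, 12, 900]})
    df_dropped = df[df["price"] > 100]       # rows where 'price > 100'
    df.drop(df_dropped.index, inplace=True)  # same call the helper makes
    print(len(df))  # 2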
Example #7
    def predict(self, data, training, results, *args, **kwargs):
        """ Return predictions from trained model """

        # data should already come in as a pd.DataFrame; if it's just a dictionary we convert it
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame.from_dict(data, orient="columns")

        # the records we're predicting on are added to the results after
        # augmentation. if the endpoint or the jupyter notebook in charge of
        # communicating with the caller does not want to send this information
        # back, it can always take it out. in the future we may want to
        # optimize here and make this optional instead.
        results["records"] = analitico.pandas.pd_to_dict(data)

        # initialize data pool to be tested
        categorical_idx = self.get_categorical_idx(data)
        data_pool = catboost.Pool(data, cat_features=categorical_idx)

        # create model object from stored file
        loading_on = time_ms()
        model_path = os.path.join(self.factory.get_artifacts_directory(),
                                  "model.cbm")
        if not os.path.isfile(model_path):
            self.exception(
                "CatBoostPlugin.predict - cannot find saved model in %s",
                model_path)

        model = self.create_model(training)
        model.load_model(model_path)
        results["performance"]["loading_ms"] = time_ms(loading_on)

        algo = training.get("algorithm", ALGORITHM_TYPE_REGRESSION)
        if algo == ALGORITHM_TYPE_REGRESSION:
            y_predictions = model.predict(data_pool)
            y_predictions = np.around(y_predictions, decimals=3)
            results["predictions"] = list(y_predictions)

        else:
            # predict class and probabilities of each class
            y_predictions = model.predict(
                data_pool, prediction_type="Class"
            )  # array of arrays, each with one class index
            y_probabilities = model.predict(
                data_pool, prediction_type="Probability"
            )  # array of arrays of per-class probabilities
            y_classes = training["data"]["classes"]  # list of possible classes

            preds = results["predictions"] = []
            probs = results["probabilities"] = []

            # create predictions with assigned class and probabilities
            if algo == ALGORITHM_TYPE_MULTICLASS_CLASSIFICATION:
                for i in range(0, len(data)):
                    preds.append(y_classes[int(y_predictions[i][0])])
                    probs.append({
                        y_classes[j]: y_probabilities[i][j]
                        for j in range(0, len(y_classes))
                    })

            elif algo == ALGORITHM_TYPE_BINARY_CLASSICATION:
                for i in range(0, len(data)):
                    preds.append(y_classes[int(y_predictions[i])])
                    probs.append({
                        y_classes[0]: y_probabilities[i][0],
                        y_classes[1]: y_probabilities[i][1]
                    })

        return results
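For a classifier the loops above build parallel lists: one predicted class per input row plus a per-row dict mapping every class to its probability. The returned payload then looks roughly like this (class names and numbers are illustrative):

    results = {
        "type": "analitico/prediction",
        "predictions": ["churn", "stay"],
        "probabilities": [
            {"churn": 0.83, "stay": 0.17},
            {"churn": 0.08, "stay": 0.92},
        ],
    }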
Example #8
    def train(self, train, test, results, *args, **kwargs):
        """ Train with algorithm and given data to produce a trained model """
        try:
            assert isinstance(train, pd.DataFrame) and len(train.columns) > 1
            train_df = train
            test_df = test

            # if not specified the prediction target will be the last column of the dataset
            label = self.get_attribute("data.label")
            if not label:
                label = train_df.columns[-1]
            results["data"]["label"] = label

            # choose between regression, binary classification and multiclass classification
            label_type = analitico.schema.get_column_type(train_df, label)
            self.info("label: %s", label)
            self.info("label_type: %s", label_type)
            if label_type == analitico.schema.ANALITICO_TYPE_CATEGORY:
                label_classes = list(train_df[label].cat.categories)
                results["data"]["classes"] = label_classes
                train_df[label] = train_df[label].cat.codes
                results["algorithm"] = (
                    ALGORITHM_TYPE_BINARY_CLASSICATION if len(label_classes)
                    == 2 else ALGORITHM_TYPE_MULTICLASS_CLASSIFICATION)
                self.info("classes: %s", label_classes)
            else:
                results["algorithm"] = ALGORITHM_TYPE_REGRESSION
            self.info("algorithm: %s", results["algorithm"])

            # remove rows with missing label from training and test sets
            train_rows = len(train_df)
            train_df = train_df.dropna(subset=[label])
            if len(train_df) < train_rows:
                self.warning("Training data has %s rows without '%s' label",
                             train_rows - len(train_df), label)
            if test_df is not None:
                test_rows = len(test_df)
                test_df = test_df.dropna(subset=[label])
                if len(test_df) < test_rows:
                    self.warning("Test data has %s rows without '%s' label",
                                 test_rows - len(test_df), label)

            # make sure schemas match
            train_schema = self.validate_schema(train_df, test_df)

            # shortened training was requested?
            tail = self.get_attribute("parameters.tail", 0)
            if tail > 0:
                self.info("Tail: %d, cutting training data", tail)
                train_df = train_df.tail(tail).copy()

            # create test set from training set if not provided
            if test_df is None:
                # decide how to create test set from settings variable
                # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
                chronological = self.get_attribute("data.chronological", False)
                test_size = self.get_attribute("parameters.test_size", 0.20)
                results["data"]["chronological"] = chronological
                results["parameters"]["test_size"] = test_size
                if chronological:
                    # test set is from the last rows (chronological order)
                    self.info("Test set split: chronological")
                    test_rows = int(len(train_df) * test_size)
                    test_df = train_df[-test_rows:]
                    train_df = train_df[:-test_rows]
                else:
                    # test set is from a random assortment of rows
                    self.info("Test set split: random")
                    train_df, test_df = train_test_split(train_df,
                                                         test_size=test_size,
                                                         random_state=42)

            self.info("training: %d rows", len(train_df))
            self.info("testing: %d rows", len(test_df))

            # validate data types
            for column in train_schema["columns"]:
                if column["type"] not in ("integer", "float", "boolean",
                                          "category"):
                    self.warning(
                        "Column '%s' of type '%s' is incompatible and will be dropped",
                        column["name"], column["type"])
                    train_df = train_df.drop(column["name"], axis=1)
                    test_df = test_df.drop(column["name"], axis=1)

            # save schema after dropping unused columns
            results["data"]["schema"] = generate_schema(train_df)
            results["data"]["source_records"] = len(train)
            results["data"]["training_records"] = len(train_df)
            results["data"]["test_records"] = len(test_df)
            results["data"]["dropped_records"] = len(train) - len(
                train_df) - len(test_df)

            # save some training data for debugging
            artifacts_path = self.factory.get_artifacts_directory()
            self.info("artifacts_path: %s", artifacts_path)

            samples_df = analitico.pandas.pd_sample(train_df, 200)
            samples_path = os.path.join(artifacts_path,
                                        "training-samples.json")
            samples_df.to_json(samples_path, orient="records")
            self.info("saved: %s (%d bytes)", samples_path,
                      os.path.getsize(samples_path))
            samples_path = os.path.join(artifacts_path, "training-samples.csv")
            samples_df.to_csv(samples_path)
            self.info("saved: %s (%d bytes)", samples_path,
                      os.path.getsize(samples_path))

            # split data and labels
            train_labels = train_df[label]
            train_df = train_df.drop([label], axis=1)
            test_labels = test_df[label]
            test_df = test_df.drop([label], axis=1)

            # indexes of columns that should be considered categorical
            categorical_idx = self.get_categorical_idx(train_df)
            train_pool = catboost.Pool(train_df,
                                       train_labels,
                                       cat_features=categorical_idx)
            test_pool = catboost.Pool(test_df,
                                      test_labels,
                                      cat_features=categorical_idx)

            # create regressor or classifier then train
            training_on = time_ms()
            model = self.create_model(results)
            model.fit(train_pool, eval_set=test_pool)
            results["performance"]["training_ms"] = time_ms(training_on)

            # score test set, add related metrics to results
            self.score_training(model, test_df, test_pool, test_labels,
                                results)
            if results["algorithm"] == ALGORITHM_TYPE_REGRESSION:
                self.score_regressor_training(model, test_df, test_pool,
                                              test_labels, results)
            else:
                self.score_classifier_training(model, test_df, test_pool,
                                               test_labels, results)

            # save model file and training results
            model_path = os.path.join(artifacts_path, "model.cbm")
            model.save_model(model_path)
            results["scores"]["model_size"] = os.path.getsize(model_path)
            self.info("saved: %s (%d bytes)", model_path,
                      os.path.getsize(model_path))
            return results

        except Exception as exc:
            self.exception("CatBoostPlugin - error while training: %s",
                           str(exc),
                           exception=exc)
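The label handling above leans on pandas categoricals: cat.categories lists the class names saved into results["data"]["classes"], and cat.codes replaces each label with its integer index, which is what the model trains on and what predict() later maps back to names. A minimal pandas sketch:

    import pandas as pd

    labels = pd.Series(["cat", "dog", "dog", "bird"], dtype="category")
    classes = list(labels.cat.categories)  # ['bird', 'cat', 'dog']
    codes = labels.cat.codes               # 1, 2, 2, 0
    print(classes[codes[0]])               # 'cat': the index round-trips to the name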
Example #9
    def upload_random_rainbows(self, item: Item, size: int):
        """ Uploads random bytes to test upload limits, timeouts, etc. Size of upload is specified by caller. """
        try:
            # random directory to test subdirectory generation
            remotepath = f"tst_dir_{id_generator(12)}/abc/def/ghi/unicorns.data"
            logger.info(f"\nsdk upload {remotepath}")

            # random bytes to avoid compression, etc
            data1 = bytearray(os.urandom(size))

            # upload data directly to item's storage
            with tempfile.NamedTemporaryFile() as f1:
                f1.write(data1)
                started_ms = time_ms()
                item.upload(filepath=f1.name,
                            remotepath=remotepath,
                            direct=True)

                elapsed_ms = max(1, time_ms(started_ms))
                kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
                msg = f"sdk upload (direct): {size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s"
                logger.info(msg)

            # download (streaming)
            started_ms = time_ms()
            stream2 = item.download(remotepath, stream=True)
            with tempfile.NamedTemporaryFile() as f2:
                for chunk in stream2:
                    f2.write(chunk)

                elapsed_ms = max(1, time_ms(started_ms))
                kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
                msg = f"sdk download (streaming): {size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s"
                logger.info(msg)

                f2.seek(0)
                data2 = f2.file.read()
                self.assertEqual(data1, data2)

            # upload data to /files APIs
            with tempfile.NamedTemporaryFile() as f1:
                f1.write(data1)
                started_ms = time_ms()
                item.upload(filepath=f1.name,
                            remotepath=remotepath,
                            direct=False)

                elapsed_ms = max(1, time_ms(started_ms))
                kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
                msg = f"sdk upload (server): {size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s"
                logger.info(msg)

            # download data from item's storage
            with tempfile.NamedTemporaryFile() as f3:
                started_ms = time_ms()
                item.download(remotepath, f3.name)
                elapsed_ms = max(1, time_ms(started_ms))
                kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
                msg = f"sdk download (file): {size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s"
                logger.info(msg)

                data3 = f3.file.read()
                self.assertEqual(data1, data3)

        except Exception:
            raise  # let the test framework report the failure
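The throughput math is identical in every branch of this test: elapsed time is time_ms(started_ms) clamped to at least 1 ms, and KB/s is bytes over seconds. A standalone sketch of the calculation, assuming MB_SIZE is 1024 * 1024 as the name suggests:

    MB_SIZE = 1024 * 1024     # assumption: bytes per megabyte

    size = 8 * MB_SIZE        # 8 MB payload
    elapsed_ms = max(1, 400)  # stand-in for max(1, time_ms(started_ms))
    kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
    print(f"{size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s")  # 8.0 MB in 400 ms, 20480 KB/s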