def get_metadata(self, *args):
    """
    Transform list of arguments into a list of dictionaries describing them
    (used to log status, etc).

    Returns a list with one dictionary per positional argument, in order.
    Every dictionary has a "type" key holding str(type(arg)). For pandas
    dataframes it also carries "rows", "schema" and "samples" (a sample of
    DATAFRAME_SAMPLES rows converted with pd_to_dict). Details of each
    argument are also written to the factory debug log.
    """
    output = []
    for i, arg in enumerate(args):
        meta = {"type": str(type(arg))}
        if isinstance(arg, pd.DataFrame):
            df = arg
            meta["rows"] = len(df)
            meta["schema"] = generate_schema(df)
            samples = analitico.pandas.pd_sample(df, DATAFRAME_SAMPLES)
            meta["samples"] = pd_to_dict(samples)

            # debugging help
            self.factory.debug("output[%d]: pd.DataFrame", i)
            self.factory.debug(" rows: %d", len(df))
            self.factory.debug(" columns: %d", len(df.columns))
            # walk columns and dtypes side by side: df.dtypes is indexed by
            # column label, so looking it up with an integer position is
            # deprecated and ambiguous when the labels are integers themselves
            for j, (column, dtype) in enumerate(zip(df.columns, df.dtypes)):
                self.factory.debug(
                    " %3d %s (%s/%s)", j, column, dtype, pandas_to_analitico_type(dtype))
        else:
            self.factory.debug("output[%d]: %s", i, str(type(arg)))
        output.append(meta)
    return output
def validate_schema(self, train_df, test_df):
    """
    Checks training and test dataframes to make sure they have matching schemas.

    The schema of each dataframe is produced with generate_schema and the
    "columns" lists are compared position by position on "name" and "type".

    Parameters:
        train_df: training dataframe.
        test_df: test dataframe, or None to skip the comparison.

    Returns the schema generated for train_df.
    Raises PluginError if the number, names or types of the columns differ.
    """
    train_schema = generate_schema(train_df)

    # a DataFrame has no unambiguous truth value ("if test_df:" raises
    # ValueError), so the optional test set is compared against None
    if test_df is not None:
        test_schema = generate_schema(test_df)
        train_columns = train_schema["columns"]
        test_columns = test_schema["columns"]

        if len(train_columns) != len(test_columns):
            msg = "{} - training data has {} columns while test data has {} columns".format(
                self.name, len(train_columns), len(test_columns))
            raise PluginError(msg)

        for i, (train_column, test_column) in enumerate(zip(train_columns, test_columns)):
            if train_column["name"] != test_column["name"]:
                msg = "{} - column {} of train '{}' and test '{}' have different names".format(
                    self.name, i, train_column["name"], test_column["name"])
                raise PluginError(msg)
            if train_column["type"] != test_column["type"]:
                # this branch reports the two types, so the message says so
                msg = "{} - column {} of train '{}' and test '{}' have different types".format(
                    self.name, i, train_column["type"], test_column["type"])
                raise PluginError(msg)

    return train_schema
def test_dataset_csv4_applyschema_index(self):
    """ Load a table, flag its first column as index and check the index name """
    frame = self.read_dataframe_asset("ds_test_4.json")
    schema = generate_schema(frame)

    # freshly loaded table: three columns and an unnamed index
    self.assertEqual(len(schema["columns"]), 3)
    self.assertEqual(frame.index.name, None)

    # mark the first column as the index, then apply the edited schema
    schema["columns"][0]["index"] = True
    indexed = apply_schema(frame, schema)
    self.assertEqual(indexed.index.name, "First")
def test_dataset_csv4_applyschema_rename(self):
    """ Load a table, rename its second column through the schema and check the result """
    frame = self.read_dataframe_asset("ds_test_4.json")
    schema = generate_schema(frame)

    # freshly loaded table: three columns, the second one is called "Second"
    self.assertEqual(len(schema["columns"]), 3)
    self.assertEqual(frame.columns[1], "Second")

    # request the rename in the schema, then apply the edited schema
    schema["columns"][1]["rename"] = "Secondo"
    renamed = apply_schema(frame, schema)
    self.assertEqual(renamed.columns[1], "Secondo")
def run(self, *args, action=None, **kwargs):
    """ Run the parent pipeline, then store the dataframe it produced as data.csv plus its schema """
    result = super().run(*args, action=action, **kwargs)

    if isinstance(result, pd.DataFrame):
        # the dataframe is written as data.csv in the artifacts directory;
        # the index column is written only if it has a name, which means
        # it was created explicitly
        csv_path = os.path.join(self.factory.get_artifacts_directory(), "data.csv")
        result.to_csv(csv_path, index=bool(result.index.name))

        # the schema goes next to it as data.csv.info
        info = {"schema": generate_schema(result)}
        analitico.utilities.save_json(info, csv_path + ".info")
        return result

    self.logger.warn(
        "DataframePipelinePlugin.run - pipeline didn't produce a valid dataframe"
    )
    return None
def test_dataset_csv7_autoschema(self):
    """ Check the analitico schema that is generated automatically from a pandas dataframe """
    # expected attributes of each column, listed in the order they are verified
    expected = [
        {"name": "name", "type": "string"},
        {"name": "slug", "type": "category"},
        {"name": "parent_id", "type": "float"},
        {"name": "depth", "type": "integer"},
        {"name": "priority", "type": "integer"},
        {"name": "max_weight", "type": "integer"},
        {"name": "frozen", "type": "boolean"},
        {"name": "rate", "type": "float"},
        {"name": "has_ingredients_book", "type": "boolean"},
        {"name": "indice", "type": "integer", "index": True},
        {"name": "updated_at", "type": "datetime"},
        {"name": "elapsed", "type": "timespan"},
    ]

    frame = self.read_dataframe_asset("ds_test_7_autoschema.json")
    columns = generate_schema(frame)["columns"]
    self.assertEqual(len(columns), 12)

    for position, wanted in enumerate(expected):
        for key, value in wanted.items():
            self.assertEqual(columns[position][key], value)
def train(self, train, test, results, *args, **kwargs):
    """
    Train with algorithm and given data to produce a trained model.

    Parameters:
        train: training data, a pandas DataFrame with at least two columns.
        test: optional test DataFrame. When None, a test set is carved out of
            the training rows, either from the last rows ("data.chronological")
            or randomly, sized by "parameters.test_size".
        results: nested dictionary updated in place. Its "data", "parameters",
            "performance" and "scores" entries are indexed directly, so they
            must already exist.

    The label column is the "data.label" attribute or, if not set, the last
    column of the training data. A categorical label selects binary or
    multiclass classification, anything else selects regression.

    Side effects: writes training-samples.json, training-samples.csv and
    model.cbm to the factory artifacts directory.

    Returns results on success. Any exception is handed to self.exception.
    """
    try:
        assert isinstance(train, pd.DataFrame) and len(train.columns) > 1
        train_df = train
        test_df = test

        # if not specified the prediction target will be the last column of the dataset
        label = self.get_attribute("data.label")
        if not label:
            label = train_df.columns[-1]
        results["data"]["label"] = label

        # remove rows with missing label from training and test sets. This is
        # done before categorical labels are turned into codes: a missing
        # category is encoded as -1 and would no longer be seen as missing.
        train_rows = len(train_df)
        train_df = train_df.dropna(subset=[label])
        if len(train_df) < train_rows:
            self.warning("Training data has %s rows without '%s' label", train_rows - len(train_df), label)
        # a DataFrame has no unambiguous truth value, compare against None
        if test_df is not None:
            test_rows = len(test_df)
            test_df = test_df.dropna(subset=[label])
            if len(test_df) < test_rows:
                self.warning("Test data has %s rows without '%s' label", test_rows - len(test_df), label)

        # choose between regression, binary classification and multiclass classification
        label_type = analitico.schema.get_column_type(train_df, label)
        self.info("label: %s", label)
        self.info("label_type: %s", label_type)

        if label_type == analitico.schema.ANALITICO_TYPE_CATEGORY:
            label_classes = list(train_df[label].cat.categories)
            results["data"]["classes"] = label_classes
            # work on a copy so the codes are not written into a derived frame
            train_df = train_df.copy()
            train_df[label] = train_df[label].cat.codes
            if test_df is not None:
                # encode test labels with the training categories so that both
                # sets share the same codes; labels unknown to training get -1
                test_codes = pd.Categorical(test_df[label], categories=label_classes).codes
                known = test_codes >= 0
                if not known.all():
                    self.warning(
                        "Test data has %s rows with a '%s' label that is not in the training data",
                        int((~known).sum()), label)
                test_df = test_df[known].copy()
                test_df[label] = test_codes[known]
            results["algorithm"] = (
                ALGORITHM_TYPE_BINARY_CLASSICATION
                if len(label_classes) == 2 else
                ALGORITHM_TYPE_MULTICLASS_CLASSIFICATION)
            self.info("classes: %s", label_classes)
        else:
            results["algorithm"] = ALGORITHM_TYPE_REGRESSION
        self.info("algorithm: %s", results["algorithm"])

        # make sure schemas match
        train_schema = self.validate_schema(train_df, test_df)

        # shortened training was requested?
        tail = self.get_attribute("parameters.tail", 0)
        if tail > 0:
            self.info("Tail: %d, cutting training data", tail)
            train_df = train_df.tail(tail).copy()

        # create test set from training set if not provided
        if test_df is None:
            # decide how to create test set from settings variable
            # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
            chronological = self.get_attribute("data.chronological", False)
            test_size = self.get_attribute("parameters.test_size", 0.20)
            results["data"]["chronological"] = chronological
            results["parameters"]["test_size"] = test_size
            if chronological:
                # test set is taken from the last rows (chronological order)
                self.info("Test set split: chronological")
                test_rows = int(len(train_df) * test_size)
                # split on an absolute position: slicing with -test_rows
                # breaks when test_rows is 0 (-0 selects every row for
                # testing and leaves no row for training)
                split_at = max(len(train_df) - test_rows, 0)
                test_df = train_df[split_at:]
                train_df = train_df[:split_at]
            else:
                # test set is taken from a random assortment of rows
                self.info("Test set split: random")
                train_df, test_df = train_test_split(train_df, test_size=test_size, random_state=42)

        self.info("training: %d rows", len(train_df))
        self.info("testing: %d rows", len(test_df))

        # validate data types
        for column in train_schema["columns"]:
            if column["type"] not in ("integer", "float", "boolean", "category"):
                self.warning(
                    "Column '%s' of type '%s' is incompatible and will be dropped",
                    column["name"], column["type"])
                train_df = train_df.drop(column["name"], axis=1)
                test_df = test_df.drop(column["name"], axis=1)

        # save schema after dropping unused columns
        results["data"]["schema"] = generate_schema(train_df)
        results["data"]["source_records"] = len(train)
        results["data"]["training_records"] = len(train_df)
        results["data"]["test_records"] = len(test_df)
        results["data"]["dropped_records"] = len(train) - len(train_df) - len(test_df)

        # save some training data for debugging
        artifacts_path = self.factory.get_artifacts_directory()
        self.info("artifacts_path: %s", artifacts_path)
        samples_df = analitico.pandas.pd_sample(train_df, 200)
        samples_path = os.path.join(artifacts_path, "training-samples.json")
        samples_df.to_json(samples_path, orient="records")
        self.info("saved: %s (%d bytes)", samples_path, os.path.getsize(samples_path))
        samples_path = os.path.join(artifacts_path, "training-samples.csv")
        samples_df.to_csv(samples_path)
        self.info("saved: %s (%d bytes)", samples_path, os.path.getsize(samples_path))

        # split data and labels
        train_labels = train_df[label]
        train_df = train_df.drop([label], axis=1)
        test_labels = test_df[label]
        test_df = test_df.drop([label], axis=1)

        # indexes of columns that should be considered categorical
        categorical_idx = self.get_categorical_idx(train_df)
        train_pool = catboost.Pool(train_df, train_labels, cat_features=categorical_idx)
        test_pool = catboost.Pool(test_df, test_labels, cat_features=categorical_idx)

        # create regressor or classificator then train
        training_on = time_ms()
        model = self.create_model(results)
        model.fit(train_pool, eval_set=test_pool)
        results["performance"]["training_ms"] = time_ms(training_on)

        # score test set, add related metrics to results
        self.score_training(model, test_df, test_pool, test_labels, results)
        if results["algorithm"] == ALGORITHM_TYPE_REGRESSION:
            self.score_regressor_training(model, test_df, test_pool, test_labels, results)
        else:
            self.score_classifier_training(model, test_df, test_pool, test_labels, results)

        # save model file and training results
        model_path = os.path.join(artifacts_path, "model.cbm")
        model.save_model(model_path)
        results["scores"]["model_size"] = os.path.getsize(model_path)
        self.info("saved: %s (%d bytes)", model_path, os.path.getsize(model_path))
        return results

    except Exception as exc:
        self.exception("CatBoostPlugin - error while training: %s", str(exc), exception=exc)