def add_scoring_columns(tree, df, output_probabilities, is_evaluation=False, check_prediction=False):
    """Write the scoring columns of every leaf of ``tree`` into ``df`` in place.

    Rows are attributed to a leaf through ``tree.get_filtered_df(leaf, df)``.
    A leaf that carries a prediction fills ``prediction`` and ``label`` and,
    on request, the ``proba_<class>`` and ``prediction_correct`` columns.
    A leaf without a prediction only fills ``label``, and only when the leaf
    has one.

    Args:
        tree: object exposing ``leaves``, ``get_node``, ``get_filtered_df``,
            ``target`` and ``target_values``.
        df: DataFrame to score; modified in place.
        output_probabilities: when true, one ``proba_<class>`` column is
            written per value of ``tree.target_values``.
        is_evaluation: when true, prediction and probability columns are only
            written for rows whose target value is in ``tree.target_values``.
            The label is still written for every row of the leaf.
        check_prediction: when true, ``prediction_correct`` records whether
            the target column equals the leaf prediction.
    """
    for leaf_id in tree.leaves:
        leaf = tree.get_node(leaf_id)

        # Leaves without a prediction contribute at most their label.
        if leaf.prediction is None:
            if leaf.label is not None:
                leaf_rows = tree.get_filtered_df(leaf, df)
                df.loc[leaf_rows.index, "label"] = leaf.label
            continue

        leaf_rows = tree.get_filtered_df(leaf, df)
        # Remember every row of the leaf before the evaluation filter is
        # applied: the label goes to all of them.
        all_leaf_indices = leaf_rows.index
        if is_evaluation:
            has_known_target = leaf_rows[tree.target].isin(tree.target_values)
            leaf_rows = leaf_rows[has_known_target]
        scored_indices = leaf_rows.index

        if output_probabilities:
            classes_without_proba = set(tree.target_values)
            for class_name, probability in leaf.probabilities:
                df.loc[scored_indices, "proba_" + safe_str(class_name)] = probability
                classes_without_proba.remove(class_name)
            # Classes the leaf does not list get an explicit probability of 0.
            for class_name in classes_without_proba:
                df.loc[scored_indices, "proba_" + safe_str(class_name)] = 0

        df.loc[scored_indices, "prediction"] = leaf.prediction
        if check_prediction:
            is_correct = leaf_rows[tree.target] == leaf.prediction
            df.loc[scored_indices, "prediction_correct"] = is_correct
        df.loc[all_leaf_indices, "label"] = leaf.label
def parse_nodes(self, nodes):
    """Rebuild ``self.nodes`` and register leaves from a serialized node mapping.

    The traversal starts at the root (key ``"0"``) and follows
    ``children_ids`` in FIFO order. Every node without children is added to
    ``self.leaves`` (which must already exist on the instance).

    Args:
        nodes: mapping from stringified node id to a dict describing the
            node. A non-root node is read as categorical when its
            ``"values"`` entry is present and not ``None``, and as numerical
            otherwise. Whatever keys remain after the constructor arguments
            have been extracted are forwarded to ``node.rebuild`` as keyword
            arguments.

    The ``nodes`` mapping is left untouched: constructor arguments are popped
    from a shallow copy of each entry, so the same mapping can be parsed
    again or reused by the caller.
    """
    self.nodes, pending_ids = {}, deque()

    root_dict = nodes["0"]
    node = Node(0, -1, set(root_dict["treated_as_numerical"]))
    node.rebuild(root_dict["children_ids"],
                 root_dict["prediction"],
                 root_dict["samples"],
                 root_dict["probabilities"],
                 root_dict["label"])
    self.nodes[0] = node
    pending_ids += node.children_ids
    if not pending_ids:
        # A tree reduced to its root: the root is the only leaf.
        self.leaves.add(node.id)

    while pending_ids:
        # Shallow copy: the pops below must not strip keys from the caller's
        # data, otherwise a second parse of the same mapping would fail.
        dict_node = dict(nodes[safe_str(pending_ids.popleft())])
        if dict_node.get("values") is not None:
            node = CategoricalNode(
                dict_node.pop("id"),
                dict_node.pop("parent_id"),
                set(dict_node.pop("treated_as_numerical")),
                dict_node.pop("feature"),
                dict_node.pop("values"),
                others=dict_node.pop("others"))
        else:
            node = NumericalNode(
                dict_node.pop("id"),
                dict_node.pop("parent_id"),
                set(dict_node.pop("treated_as_numerical")),
                dict_node.pop("feature"),
                beginning=dict_node.pop("beginning", None),
                end=dict_node.pop("end", None))
        # Everything not consumed by the constructor is node state.
        node.rebuild(**dict_node)
        if not node.children_ids:
            self.leaves.add(node.id)
        self.nodes[node.id] = node
        pending_ids += node.children_ids
def get_scored_df_schema(tree, schema, columns, output_probabilities, is_evaluation=False, check_prediction=False):
    """Return the schema of the scored dataset.

    The input schema is first validated with ``check_input_schema``. When
    ``columns`` is given, the schema is replaced by the result of
    ``update_input_schema(schema, columns)``. The scoring columns are then
    appended: the optional ``proba_<class>`` doubles, ``prediction``, the
    optional ``prediction_correct`` boolean, and ``label``.

    Args:
        tree: object exposing ``target_values``; also passed to
            ``check_input_schema``.
        schema: list of ``{"name": ..., "type": ...}`` column descriptors.
        columns: optional list of column names; when not ``None`` it is
            extended in place with the names of the added columns.
        output_probabilities: add one ``proba_<class>`` column per target
            value.
        is_evaluation: forwarded to ``check_input_schema``.
        check_prediction: add the ``prediction_correct`` column.

    Returns:
        The schema list with the scoring columns appended.
    """
    input_column_names = set(column["name"] for column in schema)
    check_input_schema(tree, input_column_names, is_evaluation)

    track_column_names = columns is not None
    if track_column_names:
        schema = update_input_schema(schema, columns)

    # Collect the added columns as (name, type), in output order.
    added_columns = []
    if output_probabilities:
        for value in tree.target_values:
            added_columns.append(("proba_" + safe_str(value), 'double'))
    added_columns.append(("prediction", 'string'))
    if check_prediction:
        added_columns.append(("prediction_correct", 'boolean'))
    added_columns.append(("label", 'string'))

    for column_name, column_type in added_columns:
        schema.append({'type': column_type, 'name': column_name})
        if track_column_names:
            columns.append(column_name)
    return schema
def score_chunk(tree, df, check_prediction):
    """Score one chunk and return one DataFrame per leaf of ``tree``.

    For each leaf, the rows returned by ``tree.get_filtered_df(leaf, df)``
    receive one ``proba_<class>`` column per entry of ``leaf.probabilities``,
    plus the ``prediction`` and ``label`` columns. The columns are written
    directly onto the frame returned by ``get_filtered_df``.

    Args:
        tree: object exposing ``leaves``, ``get_node``, ``get_filtered_df``
            and ``target``.
        df: chunk to score.
        check_prediction: when true, leaves that have a prediction also get a
            ``prediction_correct`` column comparing the target column to it.

    Returns:
        list of per-leaf DataFrames, in the iteration order of
        ``tree.leaves``.
    """
    per_leaf_frames = []
    for leaf_id in tree.leaves:
        leaf = tree.get_node(leaf_id)
        leaf_rows = tree.get_filtered_df(leaf, df)

        for entry in leaf.probabilities:
            class_name, probability = entry[0], entry[1]
            leaf_rows["proba_" + safe_str(class_name)] = probability

        leaf_rows["prediction"] = leaf.prediction
        has_prediction = leaf.prediction is not None
        if check_prediction and has_prediction:
            leaf_rows["prediction_correct"] = leaf_rows[tree.target] == leaf.prediction
        leaf_rows["label"] = leaf.label

        per_leaf_frames.append(leaf_rows)
    return per_leaf_frames
def get_stats_numerical_node(self, column, target_column, mean):
    """Summarize a numerical feature and its target distribution per bin.

    Args:
        column: Series holding the feature values of the rows of interest.
        target_column: Series holding the target values of the same rows.
        mean: value used to fill missing feature values before binning.

    Returns:
        ``{"no_values": True}`` when ``column`` is empty. Otherwise a dict
        with the column ``mean``, ``max`` and ``min`` and a ``bins`` list.
        Each bin gives the interval as a string (``value``), its midpoint
        (``mid``), the number of non-null targets it holds (``count``) and
        the per-class target counts (``target_distrib``, empty for a bin
        with no rows).
    """
    if column.empty:
        return {"no_values": True}

    stats = {"bins": [], "mean": column.mean(), "max": column.max(), "min": column.min()}

    # One bin per distinct value, capped at 10. A column that only holds
    # missing values has nunique() == 0 and pd.cut rejects bins=0, so fall
    # back to a single bin around the fill value.
    bin_count = max(1, min(10, column.nunique()))
    bins = pd.cut(column.fillna(mean), bins=bin_count, include_lowest=True, right=False)

    target_grouped = target_column.groupby(bins)
    target_distrib = target_grouped.apply(lambda x: x.value_counts())
    col_distrib = target_grouped.count()
    for interval, count in col_distrib.items():
        stats["bins"].append({
            "value": safe_str(interval),
            # Empty bins have no entry in target_distrib to look up.
            "target_distrib": target_distrib[interval].to_dict() if count > 0 else {},
            "mid": interval.mid,
            "count": count,
        })
    return stats
def score(tree, input_dataset, chunk_size_param, check_prediction):
    """Score ``input_dataset`` chunk by chunk and return a single DataFrame.

    The first chunk is validated with ``check`` before any scoring takes
    place. Every chunk is scored with ``score_chunk``, the per-leaf frames
    are concatenated and sorted by index, and missing values in the
    ``proba_<class>`` columns are replaced by 0.

    Args:
        tree: object exposing ``target_values``; also passed to ``check``
            and ``score_chunk``.
        input_dataset: object exposing ``iter_dataframes(chunksize=...)``.
        chunk_size_param: chunk size forwarded to ``iter_dataframes``.
        check_prediction: forwarded to ``check`` and ``score_chunk``.

    Returns:
        The scored DataFrame, sorted by index.
    """
    scored_parts = []
    chunks = input_dataset.iter_dataframes(chunksize=chunk_size_param)
    for chunk_number, chunk in enumerate(chunks):
        if chunk_number == 0:
            # Only the first chunk is validated.
            check(chunk, tree, check_prediction)
        scored_parts.extend(score_chunk(tree, chunk, check_prediction))

    combined = pd.concat(scored_parts)
    full_df = combined.sort_index()

    proba_columns = ["proba_" + safe_str(target_value) for target_value in tree.target_values]
    filled_probabilities = full_df[proba_columns].fillna(0)
    full_df[proba_columns] = filled_probabilities
    return full_df
def write_with_schema(tree, input_dataset, scored_dataset, scored_df, output_probabilities, check_prediction):
    """Write ``scored_df`` to ``scored_dataset`` after declaring its schema.

    The output schema is the input dataset schema followed by the scoring
    columns: the optional ``proba_<class>`` doubles (one per value of
    ``tree.target_values``), ``prediction``, the optional
    ``prediction_correct`` boolean, and ``label``.

    Args:
        tree: object exposing ``target_values``; only read when
            ``output_probabilities`` is true.
        input_dataset: object exposing ``read_schema()``.
        scored_dataset: object exposing ``write_schema`` and ``get_writer``.
        scored_df: DataFrame handed to the writer.
        output_probabilities: declare the ``proba_<class>`` columns.
        check_prediction: declare the ``prediction_correct`` column.
    """
    schema = input_dataset.read_schema()

    # Added columns as (name, type), in output order.
    scoring_columns = []
    if output_probabilities:
        for value in tree.target_values:
            scoring_columns.append(("proba_" + safe_str(value), 'double'))
    scoring_columns.append(('prediction', 'string'))
    if check_prediction:
        scoring_columns.append(('prediction_correct', 'boolean'))
    scoring_columns.append(('label', 'string'))

    for column_name, column_type in scoring_columns:
        schema.append({'type': column_type, 'name': column_name})

    scored_dataset.write_schema(schema)
    with scored_dataset.get_writer() as writer:
        writer.write_dataframe(scored_df)
def get_stats_categorical_node(self, column, target_column, unfiltered_col):
    """Summarize a categorical feature and its target distribution per value.

    Args:
        column: Series holding the feature values of the rows of interest.
        target_column: Series holding the target values of the same rows.
        unfiltered_col: Series whose distinct values are all reported; those
            not found among the grouped values of ``column`` are listed with
            a count of 0.

    Returns:
        dict with a ``bins`` list. Observed values come first, by decreasing
        count, each with ``value``, ``target_distrib`` and ``count``; the
        unobserved values follow with ``count`` 0. ``same_target_distrib`` is
        present (and true) only when every bin has the same target
        distribution as the first one. ``no_values`` is set when ``column``
        is empty.
    """
    bins = []
    stats = {"bins": bins}
    unobserved_values = set(unfiltered_col.unique())

    if column.empty:
        stats["no_values"] = True
    else:
        # Missing feature values are grouped under the "No values" key.
        group_keys = column.fillna("No values").apply(safe_str)
        grouped_target = target_column.groupby(group_keys)
        distrib_by_value = grouped_target.value_counts(dropna=False)
        count_by_value = grouped_target.count().sort_values(ascending=False)
        unobserved_values -= set(count_by_value.index)

        matches_first_bin = True
        for value in count_by_value.index:
            bins.append({
                "value": value,
                "target_distrib": distrib_by_value[value].to_dict(),
                "count": count_by_value[value],
            })
            # Each new bin is compared to the first one; a single mismatch
            # clears the flag for good.
            if matches_first_bin and bins[0]["target_distrib"] != bins[-1]["target_distrib"]:
                matches_first_bin = False
        if matches_first_bin:
            stats["same_target_distrib"] = True

    for value in unobserved_values:
        bins.append({"value": safe_str(value), "count": 0})
    return stats
def parse_nodes(self, nodes, rebuild_nodes=False, numerical_features=None):
    """Recreate the tree nodes from a serialized node mapping.

    The traversal starts at the root (key ``"0"``) and follows
    ``children_ids`` in FIFO order; every node is registered through
    ``self.add_node``.

    Args:
        nodes: mapping from stringified node id to the dict describing the
            node. A non-root node is built as categorical when its
            ``"values"`` entry is present and not ``None``, and as numerical
            otherwise.
        rebuild_nodes: when true, each non-root node is also rebuilt from its
            stored ``prediction``, ``samples`` and ``probabilities``.
        numerical_features: optional iterable of feature names; when given,
            each node's ``treated_as_numerical`` set is restricted to it.
    """
    restrict_numerical = numerical_features is not None
    self.nodes = {}
    pending_ids = deque()

    root_dict = nodes["0"]
    root_numerical = set(root_dict["treated_as_numerical"])
    if restrict_numerical:
        root_numerical.intersection_update(numerical_features)
    root = Node(0, -1, root_numerical)
    root.label = root_dict["label"]
    self.add_node(root)
    pending_ids.extend(root_dict["children_ids"])

    while pending_ids:
        node_dict = nodes[safe_str(pending_ids.popleft())]
        node_numerical = set(node_dict["treated_as_numerical"])
        feature = node_dict["feature"]
        if restrict_numerical:
            node_numerical.intersection_update(numerical_features)

        if node_dict.get("values") is None:
            # No value list: the split is a numerical interval.
            node = NumericalNode(node_dict["id"],
                                 node_dict["parent_id"],
                                 node_numerical,
                                 feature,
                                 beginning=node_dict.get("beginning"),
                                 end=node_dict.get("end"))
        else:
            node = CategoricalNode(node_dict["id"],
                                   node_dict["parent_id"],
                                   node_numerical,
                                   feature,
                                   node_dict["values"],
                                   others=node_dict["others"])

        node.label = node_dict["label"]
        self.add_node(node)
        if rebuild_nodes:
            node.rebuild(node_dict["prediction"],
                         node_dict["samples"],
                         node_dict["probabilities"])
        pending_ids.extend(node_dict["children_ids"])
get_output_names_for_role("metrics_dataset")[0]) folder = dataiku.Folder(get_input_names_for_role("folder")[0]) chunk_size_param = get_recipe_config()["chunk_size"] try: tree = folder.read_json(get_recipe_config()["tree_file"]) except ValueError: raise Exception("No tree file named " + get_recipe_config()["tree_file"]) tree["df"] = input_dataset.get_dataframe() tree = Tree(**tree) scored_df = score(tree, input_dataset, chunk_size_param, True) target_mapping = { safe_str(label): index for index, label in enumerate(tree.target_values) } scored_df_nona = scored_df.dropna(subset=["prediction"]) y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction y_actual = y_actual.map(lambda t: int(target_mapping[safe_str(t)])) y_pred = y_pred.map(lambda t: int(target_mapping[safe_str(t)])) if len(tree.target_values) > 2: compute_metrics = compute_multiclass_metrics metrics = [ "precision", "recall", "accuracy", "mrocAUC", "logLoss", "hammingLoss", "mcalibrationLoss" ] else: compute_metrics = compute_binary_classification_metrics
def get_stats(self, i, col):
    """Return the distribution statistics of feature ``col`` inside node ``i``.

    The rows of the node are obtained with
    ``self.get_filtered_df(node, self.df)``.

    Args:
        i: identifier passed to ``self.get_node``.
        col: name of the feature column to summarize.

    Returns:
        A dict whose content depends on how the node treats ``col``.

        When ``col`` is in ``node.treated_as_numerical``: the ``mean``,
        ``max`` and ``min`` of the column and a ``bins`` list (interval as
        string, ``target_distrib``, ``mid``, ``count``), or
        ``{"no_values": True}`` when the node holds no row.

        Otherwise: a ``bins`` list with one entry per observed value (by
        decreasing count), followed by the values of ``self.df[col]`` that
        were not observed, with a count of 0. ``same_target_distrib`` is
        present only when every bin has the same target distribution as the
        first one, and ``no_values`` is set when the node holds no row.
    """
    node = self.get_node(i)
    filtered_df = self.get_filtered_df(node, self.df)
    column = filtered_df[col]
    target_column = filtered_df[self.target]

    if col in node.treated_as_numerical:
        if column.empty:
            return {"no_values": True}

        stats = {
            "mean": column.mean(),
            "max": column.max(),
            "min": column.min()
        }
        # One bin per distinct value, capped at 10. A column that only holds
        # missing values has nunique() == 0 and pd.cut rejects bins=0, so
        # fall back to a single bin around the fill value.
        bin_count = max(1, min(10, column.nunique()))
        intervals = pd.cut(column.fillna(self.features[col]["mean"]),
                           bins=bin_count,
                           include_lowest=True,
                           right=False)
        target_grouped = target_column.groupby(intervals)
        target_distrib = target_grouped.apply(lambda x: x.value_counts())
        col_distrib = target_grouped.count()
        stats["bins"] = []
        for interval, count in col_distrib.items():
            stats["bins"].append({
                "value": safe_str(interval),
                # Empty bins have no entry in target_distrib to look up.
                "target_distrib": target_distrib[interval].to_dict() if count > 0 else {},
                "mid": interval.mid,
                "count": count
            })
        return stats

    stats = {"bins": []}
    # Every value of the whole dataset is reported; the ones observed in
    # this node are removed from the set below.
    empty_values = set(self.df[col].dropna().apply(safe_str).unique())
    if column.empty:
        stats["no_values"] = True
    else:
        # Missing feature values are grouped under the "No values" key.
        target_grouped = target_column.groupby(
            column.fillna("No values").apply(safe_str))
        target_distrib = target_grouped.value_counts(dropna=False)
        col_distrib = target_grouped.count().sort_values(ascending=False)
        empty_values -= set(col_distrib.index)
        stats["same_target_distrib"] = True
        for value in col_distrib.index:
            stats["bins"].append({
                "value": value,
                "target_distrib": target_distrib[value].to_dict(),
                "count": col_distrib[value]
            })
            # Each new bin is compared to the first one; a single mismatch
            # removes the flag for good.
            if (stats.get("same_target_distrib")
                    and stats["bins"][0]["target_distrib"] != stats["bins"][-1]["target_distrib"]):
                del stats["same_target_distrib"]

    for value in empty_values:
        stats["bins"].append({"value": safe_str(value), "count": 0})
    return stats