Example #1
0
def _load_attention_data(
    run_id: str, local_mlflow_dir: str
) -> Optional[
    Tuple[
        Dict[str, Dict[str, float]],
        Dict[str, List[Tuple[str, float]]],
        Dict[str, float],
        Dict[str, List[Tuple[str, float]]],
    ]
]:
    """Load a run's attention weights and derive importance statistics.

    Returns ``(attention_weights, attention_importances,
    shared_attention_weights, shared_attention_importances)``, or ``None``
    when the run has no attention-weights artifact.
    """
    weights = load_attention_weights(run_id, local_mlflow_dir)
    if weights is None:
        return None

    importances = calculate_attention_importances(weights)
    # A hidden feature attended to by more than one input feature is "shared".
    shared_importances = {
        hidden: sources for hidden, sources in importances.items() if len(sources) > 1
    }
    shared_weights = calculate_shared_attention_weights(weights, shared_importances)

    print("Number of features", len(weights))
    print("Total number of hidden features", len(importances))
    print("Number of shared hidden features", len(shared_importances))
    print(
        "Number of features with >0.5 shared embedding",
        len([k for k, v in shared_weights.items() if float(v) > 0.5]),
    )
    return (weights, importances, shared_weights, shared_importances)
Example #2
0
def create_graph_visualization_reference(
    run_id: str,
    reference_run_id: str,
    local_mlflow_dir: str,
    threshold: float,
    run_name: str,
    use_node_mapping: bool = True,
    colored_connections_color: str = "red",
):
    """Create a graph visualization for a run, coloring the connections
    that also occur in ``reference_run_id``.

    Returns ``None`` when the run has no attention-weights artifact.
    """
    weights = load_attention_weights(run_id, local_mlflow_dir)
    if weights is None:
        return None

    node_mapping = convert_to_node_mapping(list(weights), use_node_mapping)
    highlighted = gather_colored_connections(
        reference_run_id=reference_run_id,
        local_mlflow_dir=local_mlflow_dir,
        attention_weights=weights,
        feature_node_mapping=node_mapping,
    )

    return _create_graph_visualization(
        weights,
        threshold=threshold,
        run_name=run_name,
        node_mapping=node_mapping,
        colored_connections=highlighted,
        colored_connections_color=colored_connections_color,
    )
Example #3
0
def create_graph_visualization(
    run_id: str,
    local_mlflow_dir: str,
    threshold: float,
    run_name: str,
    use_node_mapping: bool = True,
) -> Optional[Dict[str, str]]:
    """Create a graph visualization for a single run (no reference coloring).

    Returns ``None`` when the run has no attention-weights artifact.
    """
    weights = load_attention_weights(run_id, local_mlflow_dir)
    if weights is None:
        return None

    node_mapping = convert_to_node_mapping(list(weights), use_node_mapping)
    return _create_graph_visualization(
        weights,
        threshold=threshold,
        run_name=run_name,
        node_mapping=node_mapping,
        # No reference run, hence nothing to highlight.
        colored_connections=set(),
    )
Example #4
0
def gather_colored_connections(
    reference_run_id: Optional[str],
    local_mlflow_dir: str,
    attention_weights: Dict[str, Dict[str, float]],
    feature_node_mapping: Dict[str, str],
) -> Set[Tuple[str, str]]:
    """Collect the connections of a reference run to be highlighted.

    Loads the attention weights of ``reference_run_id``, derives its set of
    ``(child, parent)`` edges, and maps them onto the current run via
    ``calculate_colored_connections``.

    Returns an empty set when no reference run is given (``None``) or when
    the reference run has no attention-weights artifact.
    """
    # FIX: annotation was `str`, but None is an explicitly supported value.
    if reference_run_id is None:
        return set()

    reference_attention_weights = load_attention_weights(
        reference_run_id, local_mlflow_dir
    )
    if reference_attention_weights is None:
        return set()
    # Every (child, parent) edge present in the reference run.
    reference_connections = {
        (child, parent)
        for child, parents in reference_attention_weights.items()
        for parent in parents
    }
    return calculate_colored_connections(
        reference_connections, attention_weights, feature_node_mapping
    )
def load_prediction_df(
    run_id: str,
    local_mlflow_dir: str,
    num_percentiles: int = 10,
    convert_df: bool = True,
    feature_replacements: Optional[Dict[str, str]] = None,
    cluster_threshold: float = 0.9,
) -> Optional[pd.DataFrame]:
    """Load the prediction output of an MlFlow run as a DataFrame.

    Reads ``<local_mlflow_dir><run_id>/artifacts/prediction_output.csv`` and,
    when ``convert_df`` is set, post-processes it via ``convert_prediction_df``
    using the run's input-frequency, output-percentile, and attention-weight
    artifacts (each of which is optional; a missing artifact is reported but
    does not abort loading).

    Args:
        run_id: MlFlow run id.
        local_mlflow_dir: Local MlFlow directory prefix (concatenated with
            ``run_id``, so it is expected to end with a separator).
        num_percentiles: Number of percentiles forwarded to the converter.
        convert_df: Whether to run ``convert_prediction_df`` on the raw CSV.
        feature_replacements: Optional child->parent feature mapping; the
            child's input frequencies are folded into the parent's.
        cluster_threshold: Threshold forwarded to the converter.

    Returns:
        The (possibly converted) prediction DataFrame, or ``None`` when the
        prediction output CSV does not exist.
    """
    # FIX: the default used to be a shared mutable `{}`; `None` avoids the
    # mutable-default-argument pitfall while keeping call sites unchanged.
    if feature_replacements is None:
        feature_replacements = {}

    run_mlflow_dir = Path(local_mlflow_dir + run_id)
    if not run_mlflow_dir.is_dir():
        print("Run {} is not in local MlFlow dir".format(run_id))

    input_frequency_dict = load_input_frequency_dict(run_id, local_mlflow_dir)
    if input_frequency_dict is None:
        print("No frequency file for run {} in local MlFlow dir".format(run_id))
    elif len(feature_replacements) > 0:
        # Fold each replaced (child) feature's frequency counts into its
        # parent feature. NOTE(review): "absolue_frequency" looks like a
        # legacy typo of "absolute_frequency" that older artifacts may still
        # contain; both keys are aggregated for compatibility — TODO confirm.
        frequency_keys = (
            "absolute_frequency",
            "absolue_frequency",
            "relative_frequency",
        )
        for child, parent in feature_replacements.items():
            parent_entry = input_frequency_dict.setdefault(parent, {})
            child_entry = input_frequency_dict.get(child, {})
            for key in frequency_keys:
                parent_entry[key] = parent_entry.get(key, 0) + child_entry.get(key, 0)

    output_percentile_dict = load_output_percentile_mapping_dict(
        run_id, local_mlflow_dir
    )
    if output_percentile_dict is None:
        print("No output percentile file for run {} in local MlFlow dir".format(run_id))

    attention_weights = load_attention_weights(run_id, local_mlflow_dir)
    if attention_weights is None:
        print("No attention file for run {} in local MlFlow dir".format(run_id))
        attention_weights = {}

    run_prediction_output_path = Path(
        local_mlflow_dir + run_id + "/artifacts/prediction_output.csv"
    )
    if not run_prediction_output_path.exists():
        print("No prediction output file for run {} in local MlFlow dir".format(run_id))
        return None
    prediction_output_df = pd.read_csv(run_prediction_output_path)

    if convert_df:
        prediction_output_df = convert_prediction_df(
            prediction_df=prediction_output_df,
            input_frequency_dict=input_frequency_dict,
            output_percentile_dict=output_percentile_dict,
            num_percentiles=num_percentiles,
            feature_replacements=feature_replacements,
            attention_weights=attention_weights,
            cluster_threshold=cluster_threshold,
        )

    return prediction_output_df