예제 #1
0
def create_pairwise_metric_table(row_list: Set[Text], column_list: Set[Text],
                                 name_value_map: Dict[Text, float],
                                 same_match_value) -> Text:
    """Construct table for pair-wise computed metrics, e.g.,
  PEARSON_CORRELATION, ANOVA, CHI_SQUARE, INFORMATION_GAIN

  Examples:
  ​|tips|tolls|trip_total
  :-----:|:-----:|:-----:|:-----:
  tips|1|0.0001942405360750854|0.1952170878648758
  tolls|0.0001942405360750854|1|0.22858665883541107
  trip_total|0.1952170878648758|0.22858665883541107|1

  Args:
      row_list: (List[str]), list of attribute names for table header
      column_list: (List[str]), list of attribute names for table row name
      name_value_map: (Dict[str, float]), map of name -> value
      same_match_value: value if the column and row name are the same. This
      could be either float or 'NA' depends on whether the computation of
      A-v.s.-A makes sense

  Returns:
      string
  """
    table_template = template.TABLE_TEMPLATE

    headers = ['​'] + list(column_list)
    header_string = "|".join(headers)
    header_separator = "|".join([":-----:" for i in range(len(headers))])

    table_content = []

    for row_name in row_list:
        # row header is in BOLD
        row_values = [template.BOLD.format(content=row_name.strip())]
        for col_name in column_list:
            # same_match_value is used when row_name == column_name
            if row_name == col_name:
                value = same_match_value
            else:
                value = name_value_map[row_name + '-' + col_name]

            # if the same_match_value is string, simply append it
            if isinstance(value, str):
                row_values.append(same_match_value)
            else:
                row_values.append(formatting.numeric_formatting(value))

        table_content.append("|".join(row_values))

    table_content_string = "\n".join(table_content)

    return table_template.format(header=header_string,
                                 header_separator=header_separator,
                                 table_content=table_content_string)
예제 #2
0
def create_table_descriptive_row_from_analysis(attribute_name: Text,
                                               base_analysis: Analysis,
                                               additional_analysis: Analysis,
                                               figure_base_path: Text) -> Text:
    # pylint: disable-msg=too-many-locals
    """Create makrdown formatted descriptive analysis result

  Args:
      attribute_name: (string), name of the attribute
      base_analysis: (analysis_entity_pb2.Analysis), analysis holding
      all the metrics
      additional_analysis: (analysis_entity_pb2.Analysis), histogram for
      numerical attribute, value_counts for categorical attributes
      figure_base_path: (string), the folder for holding figures

  Returns:
      string, markdown formatted content
  """
    row_template = template.TABLE_DESCRIPTIVE_ROW_TEMPLATE
    stats_template = template.TABLE_DESCRIPTIVE_STATS_TEMPLATE

    metrics = base_analysis.smetrics
    attribute_type = base_analysis.features[0].type

    # Make sure the display order of each attribute is consistent
    common_order = query_constants.COMMON_ORDER
    if attribute_type == Attribute.NUMERICAL:
        detail_order = query_constants.NUMERICAL_ORDER
    else:
        detail_order = query_constants.CATEGORICAL_ORDER
    # Use a OrderedDict to store the result
    result_holder = OrderedDict([(item, 0)
                                 for item in common_order + detail_order])
    for item in metrics:
        name = ScalarMetric.Name.Name(item.name)
        value = formatting.numeric_formatting(item.value)
        result_holder[name] = value

    # Construct the markdown formatted row
    row_stats_contents = []
    for item in result_holder:
        row_stats_contents.append(
            stats_template.format(metric=item, value=result_holder[item]))

    figure_path = visualization.plot_bar_chart(additional_analysis,
                                               figure_base_path)
    return row_template.format(
        name=attribute_name,
        type=Attribute.Type.Name(attribute_type),
        stats=' <br/> '.join(row_stats_contents),
        url=figure_path,
        alt_text=attribute_name,
    )
예제 #3
0
def create_table_from_table_metric(table_metric: TableMetric) -> Text:
    """Create a table for a TableMetric object. Currently, this function is
  used for Contingency_Table and TABLE_DESCRIPTIVE

  Examples:
  &#x200B;|Cash|Credit Card|No Charge|Unknown|Mobile|Prcard
  :-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:
  frequency|108114952.0|74475448.0|797730.0|369844.0|255082.0|192063.0

  Args:
      table_metric: (analysis_entity_pb2.TableMetric)

  Returns:
      string
  """

    supported_metric = {
        TableMetric.CONTINGENCY_TABLE, TableMetric.TABLE_DESCRIPTIVE
    }

    assert table_metric.name in supported_metric

    table_template = template.TABLE_TEMPLATE

    headers = ['&#x200B;'] + list(table_metric.column_indexes)
    header_string = "|".join(headers)
    header_separator = "|".join([":-----:" for i in range(len(headers))])

    table_content = []

    for row in table_metric.rows:
        # row header is in BOLD
        row_header = template.BOLD.format(content=str(row.row_index).strip())
        row_values = [row_header] + [
            formatting.numeric_formatting(item.value) for item in row.cells
        ]
        table_content.append("|".join(row_values))

    table_content_string = "\n".join(table_content)

    return table_template.format(header=header_string,
                                 header_separator=header_separator,
                                 table_content=table_content_string)
예제 #4
0
def check_p_value(analysis: Analysis) -> Union[None, Text]:
    """Check whether the p-value of statistical tests
  exceed the predefined threshold

  Args:
      analysis: (analysis_entity_pb2.Analysis), analysis that contain the result
      of statistical test

  Returns:
    Union[None, string]
  """
    metric = analysis.smetrics[0]
    name_list = [att.name for att in analysis.features]
    p_value = metric.value

    if p_value < P_VALUE_THRESHOLD:
        return template.LOW_P_VALUE.format(
            name_one=name_list[0],
            name_two=name_list[1],
            metric='p-value',
            value=formatting.numeric_formatting(p_value))

    return None
예제 #5
0
def create_target_metrics_highlight(
        target_name: Text, metric_name_list: List[Text],
        metric_analysis_list: List[List[Analysis]]) -> Text:
    # pylint: disable-msg=too-many-locals
    """Create the content for highlight section regarding a target attribute

  Args:
      target_name: (string)
      metric_name_list: (List(string)
      metric_analysis_list: (List[List[analysis_entity_pb2.Analysis]])

  Returns:

  """

    assert len(metric_name_list) == len(metric_analysis_list)
    # Every metric should have the same length, i.e., target v.s. remaining
    assert len({len(item) for item in metric_analysis_list}) == 1

    name_enrich = {
        'ANOVA': 'ANOVA P-value',
        'CHI_SQUARE': 'Chi-square P-value',
        'INFORMATION_GAIN': 'Information Gain',
        'PEARSON_CORRELATION': 'Correlation Coefficient'
    }

    table_template = template.TARGET_METRIC_HIGHLIGHT_TEMPLATE
    row_template = template.TARGET_METRIC_HIGHLIGHT_ROW_TEMPLATE

    num_metrics = len(metric_name_list)

    enrich_name_list = [
        name_enrich[item] if item in name_enrich else item
        for item in metric_name_list
    ]
    metric_names_str = '|'.join(enrich_name_list)

    separator_str = ':-----:|' * num_metrics

    attribute_set = set()
    metric_holders = {metric: {} for metric in metric_name_list}
    for i in range(num_metrics):
        for analysis in metric_analysis_list[i]:
            metric_name = Analysis.Name.Name(analysis.name)
            attribute_name = [
                att.name for att in analysis.features
                if att.name != target_name
            ][0]
            attribute_set.add(attribute_name)
            metric_value = analysis.smetrics[0].value
            metric_holders[metric_name][attribute_name] = metric_value

    row_content_list = []
    for attribute in attribute_set:
        values_str = '|'.join([
            formatting.numeric_formatting(metric_holders[metric][attribute])
            for metric in metric_name_list
        ])
        row_content_list.append(
            row_template.format(name=attribute, values=values_str))

    return table_template.format(target_column=target_name,
                                 metric_names=metric_names_str,
                                 seperators=separator_str,
                                 row_content='\n'.join(row_content_list))