Example #1
# Imports needed by this excerpt (module paths as used in TFDV; the exact
# location of table_util varies by version, e.g. tfx_bsl.arrow.table_util):
import copy
from typing import Any, List

from pandas import DataFrame
from tensorflow_data_validation.statistics import stats_impl
from tensorflow_data_validation.statistics import stats_options as options
from tensorflow_data_validation.statistics.generators import stats_generator
from tensorflow_metadata.proto.v0 import schema_pb2
from tfx_bsl.arrow import table_util


def _generate_partial_statistics_from_df(
    dataframe: DataFrame, stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generates accumulators containing partial statistics for the DataFrame.

    Columns with no Arrow-compatible dtype (and columns outside the optional
    feature whitelist) are dropped, bool columns are tracked as categorical INT
    features, and the remaining columns are converted to an Arrow RecordBatch
    and fed to the in-memory combiner stats generators.
    """
    feature_whitelist = set()
    if stats_options.feature_whitelist:
        feature_whitelist.update(stats_options.feature_whitelist)
    # Create a copy of the stats options so that we don't modify the input object.
    stats_options_modified = copy.copy(stats_options)
    # Remove feature_whitelist option as it is no longer needed.
    stats_options_modified.feature_whitelist = None
    schema = schema_pb2.Schema()
    drop_columns = []
    for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
        # Drop columns whose numpy dtype has no Arrow equivalent, as well as
        # columns excluded by the whitelist (when one is set).
        if (not table_util.NumpyKindToArrowType(col_type.kind)
                or (feature_whitelist and col_name not in feature_whitelist)):
            drop_columns.append(col_name)
        elif col_type.kind == 'b':
            # Track bool type features as categorical (INT with a BoolDomain).
            schema.feature.add(name=col_name,
                               type=schema_pb2.INT,
                               bool_domain=schema_pb2.BoolDomain())
    dataframe = dataframe.drop(columns=drop_columns)
    if schema.feature:
        stats_options_modified.schema = schema
    # Convert the DataFrame to a RecordBatch of primitive arrays, then
    # canonicalize it so that every column is a list array, the layout expected
    # by generate_partial_statistics_in_memory.
    record_batch_with_primitive_arrays = table_util.DataFrameToRecordBatch(
        dataframe)
    record_batch_with_list_arrays = table_util.CanonicalizeRecordBatch(
        record_batch_with_primitive_arrays)
    return stats_impl.generate_partial_statistics_in_memory(
        record_batch_with_list_arrays, stats_options_modified,
        stats_generators)
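
For context, a minimal sketch of how this code path is typically reached through TFDV's public API. This assumes a TFDV release whose StatsOptions still exposes the feature_whitelist option (newer releases renamed it feature_allowlist); the DataFrame contents are purely illustrative.

import pandas as pd
import tensorflow_data_validation as tfdv

df = pd.DataFrame({
    "age": [17, 30],
    "language": ["english", "spanish"],
    "label": [False, True],           # bool column: tracked as a categorical INT feature
    "complex_var": [2 + 3j, 2 + 3j],  # no Arrow equivalent: dropped before stats generation
})

# Restrict statistics to a subset of columns via the whitelist option.
stats_options = tfdv.StatsOptions(feature_whitelist=["age", "language", "label"])

# Returns a DatasetFeatureStatisticsList proto with the computed statistics.
stats = tfdv.generate_statistics_from_dataframe(df, stats_options=stats_options)
print(stats.datasets[0].num_examples)  # 2
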
Example #2
  # Note: this test method sits inside an absltest.TestCase subclass (not shown
  # in this excerpt) and relies on collections, pandas as pd, pyarrow as pa, and
  # the table_util module under test (e.g. tfx_bsl.arrow.table_util) being imported.
  def testDataFrameToRecordBatch(self):

    df_data = pd.DataFrame([{
        "age": 17,
        "language": "english",
        "prediction": False,
        "label": False,
        "complex_var": 2 + 3j
    }, {
        "age": 30,
        "language": "spanish",
        "prediction": True,
        "label": True,
        "complex_var": 2 + 3j
    }])

    # "complex_var" (complex128) has no Arrow equivalent, so it should not
    # appear in the converted record batch.
    expected_fields = {"age", "language", "prediction", "label"}
    # Per-column row-value tuples; "prediction" and "label" both yield (0, 1),
    # hence that key counts 2.
    expected_row_counts = collections.Counter({
        (17, 30): 1,
        (0, 1): 2,
        (b"english", b"spanish"): 1
    })

    rb_data = table_util.DataFrameToRecordBatch(df_data)
    self.assertSetEqual(set(rb_data.schema.names), expected_fields)

    actual_row_counts = collections.Counter()
    for col in rb_data.columns:
      # Before canonicalization each column is a flat primitive array, so its
      # Python view is just the per-row values, e.g. (17, 30).
      row = tuple(col.to_pylist())
      actual_row_counts[row] += 1
    self.assertDictEqual(actual_row_counts, expected_row_counts)

    canonicalized_rb_data = table_util.CanonicalizeRecordBatch(rb_data)
    self.assertSetEqual(
        set(canonicalized_rb_data.schema.names), expected_fields)

    actual_row_counts = collections.Counter()
    for col in canonicalized_rb_data.columns:
      # After canonicalization each column is a list array of single-element
      # lists, e.g. [[17], [30]], so unwrap the inner lists before counting.
      col_values = col.to_pylist()
      row = (col_values[0][0], col_values[1][0])
      actual_row_counts[row] += 1
    self.assertDictEqual(actual_row_counts, expected_row_counts)

    expected_age_column = pa.array([[17], [30]], type=pa.list_(pa.int64()))
    expected_language_column = pa.array([["english"], ["spanish"]],
                                        type=pa.list_(pa.binary()))
    # The bool prediction/label columns are expected to surface as 0/1 int8
    # values after conversion.
    expected_prediction_column = pa.array([[0], [1]], type=pa.list_(pa.int8()))
    expected_label_column = pa.array([[0], [1]], type=pa.list_(pa.int8()))
    self.assertTrue(
        canonicalized_rb_data.column(
            canonicalized_rb_data.schema.get_field_index("age")).equals(
                expected_age_column))
    self.assertTrue(
        canonicalized_rb_data.column(
            canonicalized_rb_data.schema.get_field_index("language")).equals(
                expected_language_column))
    self.assertTrue(
        canonicalized_rb_data.column(
            canonicalized_rb_data.schema.get_field_index("prediction")).equals(
                expected_prediction_column))
    self.assertTrue(
        canonicalized_rb_data.column(
            canonicalized_rb_data.schema.get_field_index("label")).equals(
                expected_label_column))
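
To round out Example #2, here is a minimal standalone sketch of the two conversion helpers exercised by the test, assuming table_util resolves to the Arrow helper module shipped with tfx_bsl (tfx_bsl.arrow.table_util); only the two calls already shown above are used.

import pandas as pd
from tfx_bsl.arrow import table_util  # assumed location of the module under test

df = pd.DataFrame({"age": [17, 30], "language": ["english", "spanish"]})

# Step 1: each DataFrame column becomes an Arrow primitive array (int64, binary, ...).
record_batch = table_util.DataFrameToRecordBatch(df)
print(record_batch.column(0).to_pylist())  # [17, 30]

# Step 2: canonicalization wraps every value in a single-element list, i.e. each
# column becomes a list array, matching the expected columns in the test above.
canonical = table_util.CanonicalizeRecordBatch(record_batch)
print(canonical.column(0).to_pylist())  # [[17], [30]]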