Example #1
def _generate_partial_statistics_from_df(
    dataframe: DataFrame, stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generate accumulators containing partial stats."""
    feature_whitelist = set()
    if stats_options.feature_whitelist:
        feature_whitelist.update(stats_options.feature_whitelist)
    # Create a copy of the stats options so that we don't modify the input object.
    stats_options_modified = copy.copy(stats_options)
    # Remove feature_whitelist option as it is no longer needed.
    stats_options_modified.feature_whitelist = None
    schema = schema_pb2.Schema()
    drop_columns = []
    for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
        if (not table_util.NumpyKindToArrowType(col_type.kind)
                or (feature_whitelist and col_name not in feature_whitelist)):
            drop_columns.append(col_name)
        elif col_type.kind == 'b':
            # Track bool type feature as categorical.
            schema.feature.add(name=col_name,
                               type=schema_pb2.INT,
                               bool_domain=schema_pb2.BoolDomain())
    dataframe = dataframe.drop(columns=drop_columns)
    if schema.feature:
        stats_options_modified.schema = schema
    record_batch_with_primitive_arrays = table_util.DataFrameToRecordBatch(
        dataframe)
    record_batch_with_list_arrays = table_util.CanonicalizeRecordBatch(
        record_batch_with_primitive_arrays)
    return stats_impl.generate_partial_statistics_in_memory(
        record_batch_with_list_arrays, stats_options_modified,
        stats_generators)
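
A hedged usage sketch (not part of the original listing) showing how the helper above might be driven. The BasicStatsGenerator module path and its no-argument construction are assumptions that may differ between TFDV releases; only tfdv.StatsOptions is public API.

# Sketch only: assumes the internal BasicStatsGenerator import path below.
import pandas as pd
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.statistics.generators import basic_stats_generator

df = pd.DataFrame({'age': [29, 41, 35], 'active': [True, False, True]})
stats_options = tfdv.StatsOptions()
generators = [basic_stats_generator.BasicStatsGenerator()]
# One partial-stats accumulator is returned per generator, in the same order.
accumulators = _generate_partial_statistics_from_df(df, stats_options, generators)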
Example #2
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame, stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generate accumulators containing partial stats."""
    inmemory_dicts = [{} for _ in range(len(dataframe))]
    isnull = pd.isnull
    # Initialize decoding fn based on column type.
    int_fn = lambda x: np.array([x], dtype=np.integer)
    float_fn = lambda x: None if isnull(x) else np.array([x],
                                                         dtype=np.floating)
    str_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.object)
    decode_fn = {
        # int type.
        'i': int_fn,
        'u': int_fn,
        # float type.
        'f': float_fn,
        # bool type.
        'b': int_fn,
        # string type.
        'S': str_fn,
        'O': str_fn,
        'U': str_fn,
    }

    feature_whitelist = set()
    if stats_options.feature_whitelist:
        feature_whitelist.update(stats_options.feature_whitelist)
    # Create a copy of the stats options so that we don't modify the input object.
    stats_options_modified = copy.copy(stats_options)
    # Remove feature_whitelist option as it is no longer needed.
    stats_options_modified.feature_whitelist = None
    schema = schema_pb2.Schema()
    for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
        kind = col_type.kind
        if (kind not in decode_fn
                or (feature_whitelist and col_name not in feature_whitelist)):
            logging.warning('Ignoring feature %s of type %s', col_name,
                            col_type)
            continue
        if kind == 'b':
            # Track bool type feature as categorical.
            schema.feature.add(name=col_name,
                               type=schema_pb2.INT,
                               bool_domain=schema_pb2.BoolDomain())

        # Get decoding fn based on column type.
        fn = decode_fn[kind]
        # Iterate over the column and apply the decoding fn.
        j = 0
        for val in dataframe[col_name]:
            inmemory_dicts[j][col_name] = fn(val)
            j += 1
    if schema.feature:
        stats_options_modified.schema = schema
    record_batch = decoded_examples_to_arrow.DecodedExamplesToRecordBatch(
        inmemory_dicts)
    return stats_impl.generate_partial_statistics_in_memory(
        record_batch, stats_options_modified, stats_generators)
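
A small, self-contained sketch (not from the source) of the per-row decoding used above: each DataFrame row becomes a dict mapping column names to single-element numpy arrays, with nulls mapped to None. It uses np.float64 and object in place of the deprecated abstract np.floating / np.object aliases that appear in the listing.

import numpy as np
import pandas as pd

df = pd.DataFrame({'f': [1.5, np.nan], 's': ['a', None]})
decode_fn = {
    'f': lambda x: None if pd.isnull(x) else np.array([x], dtype=np.float64),
    'O': lambda x: None if pd.isnull(x) else np.array([x], dtype=object),
}
rows = [{} for _ in range(len(df))]
for col_name, col_type in zip(df.columns, df.dtypes):
    fn = decode_fn[col_type.kind]
    for j, val in enumerate(df[col_name]):
        rows[j][col_name] = fn(val)
# rows[0] == {'f': array([1.5]), 's': array(['a'], dtype=object)}
# rows[1] == {'f': None, 's': None}
print(rows)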
Example #3
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame, stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generate accumulators containing partial stats."""
    inmemory_dicts = [{} for _ in range(len(dataframe))]
    isnull = pd.isnull
    # Initialize decoding fn based on column type.
    int_fn = lambda x: np.array([x], dtype=np.integer)
    float_fn = lambda x: None if isnull(x) else np.array([x],
                                                         dtype=np.floating)
    str_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.object)
    decode_fn = {
        # int type.
        'i': int_fn,
        'u': int_fn,
        # float type.
        'f': float_fn,
        # bool type.
        'b': int_fn,
        # string type.
        'S': str_fn,
        'O': str_fn,
        'U': str_fn,
    }

    schema = schema_pb2.Schema()
    for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
        kind = col_type.kind
        if kind not in decode_fn:
            logging.warning('Ignoring feature %s of type %s', col_name,
                            col_type)
            continue
        if kind == 'b':
            # Track bool type feature as categorical.
            schema.feature.add(name=col_name,
                               type=schema_pb2.INT,
                               bool_domain=schema_pb2.BoolDomain())

        # Get decoding fn based on column type.
        fn = decode_fn[kind]
        # Iterate over the column and apply the decoding fn.
        j = 0
        for val in dataframe[col_name]:
            inmemory_dicts[j][col_name] = fn(val)
            j += 1
    if schema.feature:
        stats_options.schema = schema
    return stats_impl.generate_partial_statistics_in_memory(
        decoded_examples_to_arrow.DecodedExamplesToTable(inmemory_dicts),
        stats_options, stats_generators)
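
The bool-to-categorical bookkeeping shared by all of the examples can be exercised on its own. A minimal sketch, assuming only pandas and the TensorFlow Metadata schema_pb2 protos:

import pandas as pd
from tensorflow_metadata.proto.v0 import schema_pb2

df = pd.DataFrame({'clicked': [True, False], 'age': [10, 20]})
schema = schema_pb2.Schema()
for col_name, col_type in zip(df.columns, df.dtypes):
    if col_type.kind == 'b':
        # Boolean columns become INT features with a BoolDomain so the stats
        # generators treat them as categorical rather than numeric.
        schema.feature.add(name=col_name,
                           type=schema_pb2.INT,
                           bool_domain=schema_pb2.BoolDomain())
print(schema)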
Example #4
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame, stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generate accumulators containing partial stats."""
    feature_whitelist = set()
    if stats_options.feature_whitelist:
        feature_whitelist.update(stats_options.feature_whitelist)
    # Create a copy of the stats options so that we don't modify the input object.
    stats_options_modified = copy.copy(stats_options)
    # Remove feature_whitelist option as it is no longer needed.
    stats_options_modified.feature_whitelist = None
    schema = schema_pb2.Schema()

    arrow_fields = []
    for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
        kind = col_type.kind
        if (kind not in _NUMPY_KIND_TO_ARROW_TYPE
                or (feature_whitelist and col_name not in feature_whitelist)):
            logging.warning('Ignoring feature %s of type %s', col_name,
                            col_type)
            continue
        if kind == 'b':
            # Track bool type feature as categorical.
            schema.feature.add(name=col_name,
                               type=schema_pb2.INT,
                               bool_domain=schema_pb2.BoolDomain())
        arrow_fields.append(pa.field(col_name,
                                     _NUMPY_KIND_TO_ARROW_TYPE[kind]))
    if schema.feature:
        stats_options_modified.schema = schema
    record_batch_with_primitive_arrays = pa.RecordBatch.from_pandas(
        dataframe, schema=pa.schema(arrow_fields))
    arrays = []
    for column_array in record_batch_with_primitive_arrays.columns:
        arrays.append(array_util.ToSingletonListArray(column_array))
    # TODO(pachristopher): Consider using a list of record batches instead of a
    # single record batch to avoid having list arrays larger than 2^31 elements.
    record_batch_with_list_arrays = pa.RecordBatch.from_arrays(
        arrays, record_batch_with_primitive_arrays.schema.names)
    return stats_impl.generate_partial_statistics_in_memory(
        record_batch_with_list_arrays, stats_options_modified,
        stats_generators)
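
A plain-pyarrow sketch of the canonicalization step in the example above: each primitive column is wrapped into length-1 lists. The real array_util.ToSingletonListArray from tfx_bsl also turns null values into null lists, which this simplified version does not attempt.

import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'age': [29, 41], 'name': ['a', 'b']})
primitive_batch = pa.RecordBatch.from_pandas(df, preserve_index=False)

list_arrays = []
for column in primitive_batch.columns:
    # Offsets 0..n turn element i of the column into the singleton list [column[i]].
    offsets = pa.array(np.arange(len(column) + 1, dtype=np.int32))
    list_arrays.append(pa.ListArray.from_arrays(offsets, column))

list_batch = pa.RecordBatch.from_arrays(list_arrays, primitive_batch.schema.names)
print(list_batch.to_pandas())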
Example #5
    'feature_name': 'x',
    'domain': schema_pb2.StringDomain(value=['a', 'b']),
    'output_schema_proto_text': '''
          feature { name: 'x' string_domain { value: 'a' value: 'b' } }'''
}, {
    'testcase_name': 'bool_domain',
    'input_schema_proto_text': '''feature { name: 'x' }''',
    'feature_name': 'x',
    'domain': schema_pb2.BoolDomain(true_value='T', false_value='F'),
    'output_schema_proto_text': '''
          feature { name: 'x' bool_domain { true_value: 'T' false_value: 'F' } }
        '''
}, {
    'testcase_name': 'global_domain',
    'input_schema_proto_text': '''
          string_domain { name: 'global_domain' value: 'a' value: 'b' }
          feature { name: 'x' }''',
    'feature_name': 'x',
    'domain': 'global_domain',