def _recursion_helper(
    feature_path: types.FeaturePath, array: pa.Array,
    all_weights: Dict[types.FeatureName, np.ndarray],
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
  """Recursion helper."""
  array_type = array.type
  innermost_nested_type = get_innermost_nested_type(array_type)
  if pa.types.is_struct(innermost_nested_type):
    if not enumerate_leaves_only:
      weights = all_weights.get(example_weight_map.get(feature_path))
      # special handling for a flat struct array -- wrap it in a ListArray
      # whose elements are singleton lists. This way downstream can keep
      # assuming the enumerated arrays are list<*>.
      to_yield = array
      if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
        to_yield = array_util.ToSingletonListArray(array)
      yield (feature_path, to_yield, weights)
    flat_struct_array, parent_indices = flatten_nested(
        array, bool(all_weights))
    # Potential optimization:
    # Only flatten weights that we know will be used in the recursion.
    flat_all_weights = {
        weight_feature_name: w[parent_indices]
        for weight_feature_name, w in all_weights.items()
    }
    for field in flat_struct_array.type:
      field_name = field.name
      yield from _recursion_helper(
          feature_path.child(field_name),
          flat_struct_array.field(field_name), flat_all_weights)
  else:
    weights = all_weights.get(example_weight_map.get(feature_path))
    yield (feature_path, array, weights)
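# Illustrative usage sketch, not part of the original module: assuming the
# helper above is the recursion body of arrow_util.enumerate_arrays, and that
# pyarrow (pa) and arrow_util are imported as in the tests below, enumeration
# yields one (FeaturePath, Array, weights) triple per column and recurses into
# list<struct<...>> columns. The record batch contents and the function name
# here are made up for illustration.
def _example_enumerate_arrays_usage():
  record_batch = pa.RecordBatch.from_arrays(
      [pa.array([[1.0], [2.0, 3.0]]),
       pa.array([[{"sf1": [7, 8]}], [{"sf1": [9]}]])],
      ["f1", "f2"])
  # Positional args as in testEnumerateArrays: record batch, example weight
  # map (None -> no weights), enumerate_leaves_only.
  for path, array, weights in arrow_util.enumerate_arrays(
      record_batch, None, True):
    # With enumerate_leaves_only=True only the leaf paths (f1) and (f2, sf1)
    # are produced; weights is None because no weight map was given.
    print(path, array.type, weights)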
def _recursion_helper(
    query_path: types.FeaturePath, array: pa.Array,
    example_indices: Optional[np.ndarray]
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Recursion helper."""
  array_type = array.type
  if not query_path:
    if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
      array = array_util.ToSingletonListArray(array)
    return array, example_indices
  if not pa.types.is_struct(get_innermost_nested_type(array_type)):
    raise KeyError('Cannot process query_path "{}" inside an array of type '
                   '{}. Expecting a struct<...> or '
                   '(large_)list...<struct<...>>.'.format(
                       query_path, array_type))
  flat_struct_array, parent_indices = flatten_nested(
      array, example_indices is not None)
  flat_indices = None
  if example_indices is not None:
    flat_indices = example_indices[parent_indices]
  step = query_path.steps()[0]
  try:
    child_array = flat_struct_array.field(step)
  except KeyError:
    raise KeyError('query_path step "{}" not in struct.'.format(step))
  relative_path = types.FeaturePath(query_path.steps()[1:])
  return _recursion_helper(relative_path, child_array, flat_indices)
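# Illustrative usage sketch, not part of the original module: assuming the
# helper above is the recursion body of arrow_util.get_array, querying a
# nested path drills through list<struct<...>> levels and can also return the
# example (row) index of each returned value. The record batch and the helper
# name below are made up; the call mirrors testGetArrayWrapFlatStructArray.
def _example_get_array_usage():
  record_batch = pa.RecordBatch.from_arrays(
      [pa.array([[{"sf1": [1, 2]}], [{"sf1": [3]}]])], ["f2"])
  array, example_indices = arrow_util.get_array(
      record_batch, types.FeaturePath(["f2", "sf1"]),
      return_example_indices=True)
  # array is list<int64> [[1, 2], [3]]; example_indices maps each flattened
  # struct back to its originating row: [0, 1].
  print(array, example_indices)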
def _recursion_helper(
    feature_path: types.FeaturePath, array: pa.Array,
    weights: Optional[np.ndarray]
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
  """Recursion helper."""
  array_type = array.type
  innermost_nested_type = get_innermost_nested_type(array_type)
  if pa.types.is_struct(innermost_nested_type):
    if not enumerate_leaves_only:
      # special handling for a flat struct array -- wrap it in a ListArray
      # whose elements are singleton lists. This way downstream can keep
      # assuming the enumerated arrays are list<*>.
      to_yield = array
      if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
        to_yield = array_util.ToSingletonListArray(array)
      yield (feature_path, to_yield, weights)
    flat_struct_array, parent_indices = flatten_nested(
        array, weights is not None)
    flat_weights = None if weights is None else weights[parent_indices]
    for field in flat_struct_array.type:
      field_name = field.name
      yield from _recursion_helper(
          feature_path.child(field_name),
          flat_struct_array.field(field_name), flat_weights)
  else:
    yield (feature_path, array, weights)
def testGetArrayWrapFlatStructArray(self, feature, expected):
  actual_arr, actual_indices = arrow_util.get_array(
      _INPUT_RECORD_BATCH, feature, return_example_indices=True,
      wrap_flat_struct_in_list=True)
  expected_arr, expected_indices = expected
  if pa.types.is_struct(expected_arr.type):
    expected_arr = array_util.ToSingletonListArray(expected_arr)
  self.assertTrue(
      actual_arr.equals(expected_arr),
      "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
          feature, expected_arr, actual_arr))
  np.testing.assert_array_equal(expected_indices, actual_indices)
def testEnumerateArrays(self):
  for leaves_only, has_weights, wrap_flat_struct_in_list in (
      itertools.product([True, False], [True, False], [True, False])):
    actual_results = {}
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
        _INPUT_RECORD_BATCH, _EXAMPLE_WEIGHT_MAP if has_weights else None,
        leaves_only, wrap_flat_struct_in_list):
      actual_results[feature_path] = (feature_array, weights)

    expected_results = {}
    # leaf fields
    for p in [["f1"], ["w"], ["w_override1"], ["w_override2"],
              ["f2", "sf1"], ["f2", "sf2", "ssf1"],
              ["f3", "sf1"], ["f3", "sf2"]]:
      feature_path = types.FeaturePath(p)
      expected_results[feature_path] = (
          _FEATURES_TO_ARRAYS[feature_path].array,
          _FEATURES_TO_ARRAYS[feature_path].weights if has_weights else None)
    if not leaves_only:
      for p in [["f2"], ["f2", "sf2"], ["f3"]]:
        feature_path = types.FeaturePath(p)
        expected_array = _FEATURES_TO_ARRAYS[feature_path][0]
        if wrap_flat_struct_in_list and pa.types.is_struct(
            expected_array.type):
          expected_array = array_util.ToSingletonListArray(expected_array)
        expected_results[feature_path] = (
            expected_array,
            _FEATURES_TO_ARRAYS[feature_path].weights
            if has_weights else None)

    self.assertLen(actual_results, len(expected_results))
    for k, v in six.iteritems(expected_results):
      self.assertIn(k, actual_results)
      actual = actual_results[k]
      self.assertTrue(
          actual[0].equals(v[0]),
          "leaves_only={}; has_weights={}; "
          "wrap_flat_struct_in_list={} feature={}; expected: {}; actual: {}"
          .format(leaves_only, has_weights, wrap_flat_struct_in_list, k, v,
                  actual))
      np.testing.assert_array_equal(actual[1], v[1])
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame,
    stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generate accumulators containing partial stats."""
  feature_whitelist = set()
  if stats_options.feature_whitelist:
    feature_whitelist.update(stats_options.feature_whitelist)
  # Create a copy of the stats options so that we don't modify the input
  # object.
  stats_options_modified = copy.copy(stats_options)
  # Remove feature_whitelist option as it is no longer needed.
  stats_options_modified.feature_whitelist = None
  schema = schema_pb2.Schema()
  arrow_fields = []
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    kind = col_type.kind
    if (kind not in _NUMPY_KIND_TO_ARROW_TYPE or
        (feature_whitelist and col_name not in feature_whitelist)):
      logging.warning('Ignoring feature %s of type %s', col_name, col_type)
      continue
    if kind == 'b':
      # Track bool type feature as categorical.
      schema.feature.add(
          name=col_name, type=schema_pb2.INT,
          bool_domain=schema_pb2.BoolDomain())
    arrow_fields.append(pa.field(col_name, _NUMPY_KIND_TO_ARROW_TYPE[kind]))
  if schema.feature:
    stats_options_modified.schema = schema
  record_batch_with_primitive_arrays = pa.RecordBatch.from_pandas(
      dataframe, schema=pa.schema(arrow_fields))
  arrays = []
  for column_array in record_batch_with_primitive_arrays.columns:
    arrays.append(array_util.ToSingletonListArray(column_array))
  # TODO(pachristopher): Consider using a list of record batches instead of a
  # single record batch to avoid having list arrays larger than 2^31 elements.
  record_batch_with_list_arrays = pa.RecordBatch.from_arrays(
      arrays, record_batch_with_primitive_arrays.schema.names)
  return stats_impl.generate_partial_statistics_in_memory(
      record_batch_with_list_arrays, stats_options_modified, stats_generators)
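# Illustrative sketch, not part of the original module, of the core conversion
# performed above: turn a DataFrame into a record batch of primitive arrays
# and wrap every column into singleton lists so downstream stats generators
# see list<*> arrays. The helper name is made up; it assumes pandas (pd),
# pyarrow (pa), and array_util are imported as in the surrounding code.
def _example_df_to_list_record_batch(
    dataframe: pd.DataFrame) -> pa.RecordBatch:
  primitive_batch = pa.RecordBatch.from_pandas(
      dataframe, preserve_index=False)
  list_arrays = [
      array_util.ToSingletonListArray(column)
      for column in primitive_batch.columns
  ]
  return pa.RecordBatch.from_arrays(list_arrays, primitive_batch.schema.names)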
def CanonicalizeRecordBatch(
    record_batch_with_primitive_arrays: pa.RecordBatch,) -> pa.RecordBatch:
  """Converts primitive arrays in a pyarrow.RecordBatch to SingletonListArrays.

  Args:
    record_batch_with_primitive_arrays: A pyarrow.RecordBatch where values are
      stored in primitive arrays or singleton list arrays.

  Returns:
    A pyarrow.RecordBatch in SingletonListArray format.
  """
  arrays = []
  for column_array in record_batch_with_primitive_arrays.columns:
    arr_type = column_array.type
    if not (pa.types.is_list(arr_type) or pa.types.is_large_list(arr_type)):
      arrays.append(array_util.ToSingletonListArray(column_array))
    else:
      arrays.append(column_array)
  # TODO(pachristopher): Consider using a list of record batches instead of a
  # single record batch to avoid having list arrays larger than 2^31 elements.
  return pa.RecordBatch.from_arrays(
      arrays, record_batch_with_primitive_arrays.schema.names)
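# Illustrative usage sketch, not part of the original module: the batch below
# mixes a primitive column with an already list-valued column; only the
# primitive column gets wrapped into singleton lists. The data and the
# function name are made up for illustration.
def _example_canonicalize_record_batch():
  batch = pa.RecordBatch.from_arrays(
      [pa.array([1, 2, 3]), pa.array([[1.0], [2.0], [3.0]])], ["a", "b"])
  canonical = CanonicalizeRecordBatch(batch)
  # "a" becomes list<int64> of singleton lists [[1], [2], [3]], while "b" is
  # already list<double> and is passed through unchanged.
  print(canonical.schema)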
def testToSingletonListArray(self, array, expected_result):
  result = array_util.ToSingletonListArray(array)
  result.validate()
  self.assertTrue(
      result.equals(expected_result),
      "expected: {}; got: {}".format(expected_result, result))