def _normalize_feature(feature: schema_pb2.Feature,
                       schema: schema_pb2.Schema) -> None:
  """Inline any globally-defined domain that `feature` refers to by name.

  Child features under `struct_domain` are processed first (recursively).
  Then, if `feature.domain` is set, the schema's string, int and float
  domains are searched, in that order, for one whose name matches; the first
  match is copied into the corresponding local domain field of the feature.

  The copied domain keeps its `name`, so the global domain it came from can
  still be identified afterwards.

  Args:
    feature: feature to modify in place.
    schema: schema holding the global domains.

  Raises:
    ValueError: if `feature.domain` is set but no string, int or float domain
      with that name exists in `schema`.
  """
  if feature.HasField("struct_domain"):
    for child in feature.struct_domain.feature:
      _normalize_feature(child, schema)

  if not feature.HasField("domain"):
    return

  wanted_name = feature.domain
  # Search order matters when names collide across kinds: string, int, float.
  for domain_field in ("string_domain", "int_domain", "float_domain"):
    for candidate in getattr(schema, domain_field):
      if candidate.name == wanted_name:
        getattr(feature, domain_field).CopyFrom(candidate)
        return

  raise ValueError("Did not find domain {} in schema {}".format(
      wanted_name, schema))
def is_categorical_feature(feature: schema_pb2.Feature):
  """Returns whether `feature` is categorical.

  A BYTES feature is always categorical. An INT feature is categorical when
  it has an `int_domain` flagged `is_categorical`, or when it has a
  `bool_domain`. Every other feature type is not categorical.
  """
  feature_type = feature.type
  if feature_type == schema_pb2.BYTES:
    return True
  if feature_type != schema_pb2.INT:
    return False

  # INT feature: categorical only through its domain.
  if feature.HasField('int_domain') and feature.int_domain.is_categorical:
    return True
  return feature.HasField('bool_domain')
def is_categorical_feature(feature: schema_pb2.Feature):
  """Returns whether `feature` is categorical.

  A BYTES feature is always categorical. An INT feature is categorical when
  it has an `int_domain` flagged `is_categorical`, or when the field set in
  its `domain_info` oneof is `bool_domain` or `natural_language_domain`.
  Every other feature type is not categorical.
  """
  if feature.type == schema_pb2.BYTES:
    return True

  if feature.type == schema_pb2.INT:
    has_categorical_int_domain = (
        feature.HasField('int_domain') and feature.int_domain.is_categorical)
    if has_categorical_int_domain:
      return True
    active_domain = feature.WhichOneof('domain_info')
    return active_domain in ['bool_domain', 'natural_language_domain']

  return False
def _apply_feature(original_child: expression.Expression,
                   feature: schema_pb2.Feature):
  """Wraps `original_child` in a `_SchemaExpression` built from `feature`.

  `feature` should be "unclean": its `struct_domain` children, if any, are
  read here and handed to the new expression alongside the result of
  `_clean_feature(feature)`.

  Args:
    original_child: the expression to attach the feature to.
    feature: the (unclean) feature describing `original_child`.

  Returns:
    A `_SchemaExpression` over `original_child`.
  """
  # Snapshot the child features before the feature is cleaned.
  if feature.HasField("struct_domain"):
    child_features = list(feature.struct_domain.feature)
  else:
    child_features = []

  cleaned_feature = _clean_feature(feature)
  return _SchemaExpression(original_child, child_features, cleaned_feature)
def _copy_domain_info(origin: schema_pb2.Feature, dest: schema_pb2.Feature):
  """Copies whichever `domain_info` field is set on `origin` into `dest`.

  Does nothing when `origin` has no `domain_info` field set. A field whose
  descriptor is missing or has no message type is assigned directly; a
  message-typed field is copied with `CopyFrom`.

  Args:
    origin: feature to read the domain info from.
    dest: feature to modify in place.
  """
  active_field = origin.WhichOneof("domain_info")
  if active_field is None:
    return

  source_value = getattr(origin, active_field)
  descriptor = origin.DESCRIPTOR.fields_by_name.get(active_field)
  is_message_field = (
      descriptor is not None and descriptor.message_type is not None)

  if is_message_field:
    # Message fields cannot be assigned with setattr; copy their contents.
    getattr(dest, active_field).CopyFrom(source_value)
  else:
    setattr(dest, active_field, source_value)
def _infer_feature_shape(feature: schema_pb2.Feature):
  """Adds shape dimensions to `feature` (in place) when they can be inferred.

  Child features under `struct_domain` are processed first (recursively).
  A shape is inferred only when `feature.presence.min_fraction` equals 1:

  * If `value_count` is set with a non-zero `min` equal to `max`, a single
    dimension of that size is appended.
  * Otherwise, if `value_counts` is set and every nestedness level has a
    non-zero `min` equal to its `max`, one dimension per level is appended.

  In every other case the feature's shape is left untouched.

  Args:
    feature: feature to modify in place.
  """
  if feature.HasField('struct_domain'):
    for child in feature.struct_domain.feature:
      _infer_feature_shape(child)

  # Currently we infer shape only for required features.
  if feature.presence.min_fraction != 1:
    return

  has_fixed_value_count = (
      feature.HasField('value_count') and
      feature.value_count.min != 0 and
      feature.value_count.min == feature.value_count.max)
  if has_fixed_value_count:
    feature.shape.dim.add().size = feature.value_count.min
    return

  if not feature.HasField('value_counts'):
    return

  # Infer shape for a feature that has a nestedness level > 1 if and only if
  # the min value count equals the max value count at each nestedness level.
  per_level_counts = feature.value_counts.value_count
  if any(level.min == 0 or level.min != level.max
         for level in per_level_counts):
    return

  # Collect every size up front so that value_counts has been fully read
  # before the shape is mutated.
  dimension_sizes = [level.min for level in per_level_counts]
  for size in dimension_sizes:
    feature.shape.dim.add().size = size