Exemplo n.º 1
0
    def generate_slices(
        self, accessor: slice_accessor.SliceAccessor
    ) -> Generator[SliceKeyType, None, None]:
        """Yields every slice key of this specification found in the data.

        Intended for use within this file only.

        The examples below assume `self._value_matches` holds the same
        (key, value) pairs as `features`; that attribute is populated
        outside of this method.

        Examples:
          - columns = [], features = []
            accessor has age=[5], gender=['f'], interest=['knitting']
            yields ()
          - columns = ['age'], features = [('gender', 'f')]
            accessor has age=[5], gender=['f'], interest=['knitting']
            yields (('age', 5), ('gender', 'f'))
          - columns = ['interest'], features = [('gender', 'f')]
            accessor has age=[5], gender=['f'],
            interest=['knitting', 'games']
            yields (('gender', 'f'), ('interest', 'knitting')) and then
                   (('gender', 'f'), ('interest', 'games'))

        Args:
          accessor: object queried through `has_key(key)` and `get(key)` for
            the values present under each feature key.

        Yields:
          One tuple of (key, value) pairs per matching slice. Nothing is
          yielded when a required feature value or a sliced column is absent
          from the accessor. The pairs inside each tuple are sorted ascending
          (by key, then by value).
        """
        # Feature constraints: each (key, value) pair must be present in the
        # accessor, otherwise this specification produces no slices at all.
        for feature_key, required_value in self._features:
            if not (accessor.has_key(feature_key)
                    and required_value in accessor.get(feature_key)):
                return

        # Column constraints: for every sliced column collect all of its
        # (column, value) pairs, giving one list of pairs per column, e.g.
        # [[('gender', 'f'), ('gender', 'm')], [('age', 4), ('age', 5)]].
        pairs_per_column = []
        for column_name in self._columns:
            # A column missing from the accessor means no slice can apply.
            if not accessor.has_key(column_name):
                return

            column_pairs = []
            for column_value in accessor.get(column_name):
                column_pairs.append((column_name, column_value))
            pairs_per_column.append(column_pairs)

        # Each element of the Cartesian product picks one pair per column;
        # merged with the fixed value matches and sorted, it forms one slice
        # key.
        for combination in itertools.product(*pairs_per_column):
            unsorted_key = self._value_matches + list(combination)
            yield tuple(sorted(unsorted_key))
Exemplo n.º 2
0
    def generate_slices(
        self, accessor: slice_accessor.SliceAccessor
    ) -> Generator[SliceKeyType, None, None]:
        """Yields every slice key of this specification found in the data.

        Intended for use within this file only.

        The examples below assume `self._value_matches` holds the same
        (key, value) pairs as `features`; that attribute is populated
        outside of this method.

        Examples:
          - columns = [], features = [] (the overall slice case)
            accessor has age=[5], gender=['f'], interest=['knitting']
            yields ()
          - columns = ['age'], features = [('gender', 'f')]
            accessor has age=[5], gender=['f'], interest=['knitting']
            yields (('age', 5), ('gender', 'f'))
          - columns = ['interest'], features = [('gender', 'f')]
            accessor has age=[5], gender=['f'],
            interest=['knitting', 'games']
            yields (('gender', 'f'), ('interest', 'knitting')) and then
                   (('gender', 'f'), ('interest', 'games'))

        Args:
          accessor: object queried through `has_key(key)` and `get(key)` for
            the values present under each feature key.

        Yields:
          One tuple of (key, value) pairs per matching slice. Nothing is
          yielded when a required feature value or a sliced column is absent
          from the accessor. The pairs inside each tuple are sorted ascending
          (by key, then by value).

        Raises:
          ValueError: if a bytes value of a sliced column cannot be decoded
            to text (the underlying UnicodeDecodeError is chained).
        """
        # Feature constraints: each (key, value) pair must be present in the
        # accessor, otherwise this specification produces no slices at all.
        for feature_key, wanted in self._features:
            if not accessor.has_key(feature_key):
                return

            present_values = accessor.get(feature_key)
            if wanted in present_values:
                continue

            # No direct match: retry with an alternate representation. A str
            # is retried as its encoded bytes (for Python 3); anything else
            # (e.g. int/float) is retried as its string form.
            if isinstance(wanted, str):
                alternate = wanted.encode()
            else:
                alternate = str(wanted)
            if alternate not in present_values:
                return

        # Column constraints: for every sliced column collect all of its
        # (column, value) pairs, giving one list of pairs per column, e.g.
        # [[('gender', 'f'), ('gender', 'm')], [('age', 4), ('age', 5)]].
        pairs_per_column = []
        for column_name in self._columns:
            # A column missing from the accessor means no slice can apply.
            if not accessor.has_key(column_name):
                return

            column_pairs = []
            for raw_value in accessor.get(column_name):
                if not isinstance(raw_value, bytes):
                    column_pairs.append((column_name, raw_value))
                    continue

                # Bytes values are converted to text before being put into
                # the slice key.
                try:
                    text_value = tf.compat.as_text(raw_value)
                except UnicodeDecodeError as e:
                    raise ValueError('Found non-UTF8 feature value {} in '
                                     'column "{}"'.format(raw_value,
                                                          column_name)) from e
                column_pairs.append((column_name, text_value))
            pairs_per_column.append(column_pairs)

        # Each element of the Cartesian product picks one pair per column;
        # merged with the fixed value matches and sorted, it forms one slice
        # key. With no columns (the overall slice case) the product yields a
        # single empty tuple, so exactly one key is produced.
        for combination in itertools.product(*pairs_per_column):
            unsorted_key = self._value_matches + list(combination)
            yield tuple(sorted(unsorted_key))