Example #1
   def assert_on_two_protos_with_different_num_examples(self):
       expected = text_format.Parse(
           """
 num_examples: 1
 features {
   path {
     step: 'fa'
   }
   type: STRING
   string_stats {
     unique: 4
   }
 }
 """, statistics_pb2.DatasetFeatureStatistics())
       actual = text_format.Parse(
           """
 num_examples: 2
 features {
   path {
     step: 'fa'
   }
   type: STRING
   string_stats {
     unique: 4
   }
 }""", statistics_pb2.DatasetFeatureStatistics())
       test_util.assert_dataset_feature_stats_proto_equal(
           self, actual, expected)
Example #2
 def assert_on_two_protos_with_different_numbers_of_features(self):
     expected = text_format.Parse(
         """
 features {
   name: 'fa'
   type: STRING
   string_stats {
     unique: 4
   }
 }
 features {
   name: 'fb'
   type: STRING
   string_stats {
     unique: 5
   }
 }""", statistics_pb2.DatasetFeatureStatistics())
     actual = text_format.Parse(
         """
 features {
   name: 'fa'
   type: STRING
   string_stats {
     unique: 4
   }
 }""", statistics_pb2.DatasetFeatureStatistics())
     test_util.assert_dataset_feature_stats_proto_equal(
         self, actual, expected)
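The two examples above construct deliberately mismatched protos (different num_examples in the first, a missing feature in the second), so assert_dataset_feature_stats_proto_equal is expected to fail on them. A minimal sketch of how such a helper might be exercised from the enclosing test class; the wrapper name below is hypothetical and not part of the excerpt:

  def test_mismatched_protos_raise(self):
      # The helper builds two differing protos and runs the assertion,
      # so the expected outcome here is an AssertionError.
      with self.assertRaises(AssertionError):
          self.assert_on_two_protos_with_different_num_examples()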
Example #3
    def test_make_dataset_feature_stats_proto_topk_single(self):
        expected_result = text_format.Parse(
            """
        features {
          string_stats {
            top_values {
              value: "e"
              frequency: 20.0
            }
            top_values {
              value: "d"
              frequency: 20.0
            }
            top_values {
              value: "a"
              frequency: 15.0
            }
            rank_histogram {
              buckets {
                label: "e"
                sample_count: 20.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "d"
                sample_count: 20.0
              }
            }
          }
          path {
            step: "fa"
          }
        }""", statistics_pb2.DatasetFeatureStatistics())

        value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10), ('b', 5)]
        value_count_list = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in value_counts
        ]
        result = (top_k_uniques_stats_util.
                  make_dataset_feature_stats_proto_topk_single(
                      types.FeaturePath(['fa']).steps(),
                      value_count_list=value_count_list,
                      categorical_features=frozenset([
                          types.FeaturePath(['fa']),
                          types.FeaturePath(['fb'])
                      ]),
                      is_weighted_stats=False,
                      num_top_values=3,
                      frequency_threshold=1,
                      num_rank_histogram_buckets=2))
        test_util.assert_dataset_feature_stats_proto_equal(
            self, result, expected_result)
Example #4
    def test_stats_gen_with_dataframe(self):
        records, _, expected_result = self._get_csv_test(delimiter=',',
                                                         with_header=True)
        input_data_path = self._write_records_to_csv(records,
                                                     self._get_temp_dir(),
                                                     'input_data.csv')

        dataframe = pd.read_csv(input_data_path)
        result = stats_gen_lib.generate_statistics_from_dataframe(
            dataframe=dataframe, stats_options=self._default_stats_options)
        self.assertLen(result.datasets, 1)
        test_util.assert_dataset_feature_stats_proto_equal(
            self, result.datasets[0], expected_result.datasets[0])
Example #5
 def test_generate_statistics_in_memory(
     self, examples, options, expected_result_proto_text, schema=None):
   expected_result = text_format.Parse(
       expected_result_proto_text,
       statistics_pb2.DatasetFeatureStatisticsList())
   if schema is not None:
     options.schema = schema
   result = stats_impl.generate_statistics_in_memory(
       examples, options)
   # generate_statistics_in_memory does not deterministically
   # order multiple features within a DatasetFeatureStatistics proto. So, we
   # cannot use compare.assertProtoEqual (which requires the same ordering of
   # repeated fields) here.
   test_util.assert_dataset_feature_stats_proto_equal(
       self, result.datasets[0], expected_result.datasets[0])
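The comment in this example captures the rationale for the helper: generate_statistics_in_memory does not order features deterministically, so an order-sensitive comparison such as compare.assertProtoEqual would be flaky. An order-insensitive check along the following lines is what the helper provides; this is an illustrative sketch only, not the actual test_util implementation, and _features_by_path is a made-up name:

  def _features_by_path(dataset_stats):
      # Key each feature by its path steps so the order of the repeated
      # `features` field does not affect the comparison.
      return {tuple(f.path.step): f for f in dataset_stats.features}

  def assert_stats_equal_ignoring_feature_order(test, actual, expected):
      # Compare scalar fields directly, then compare features keyed by path.
      test.assertEqual(actual.num_examples, expected.num_examples)
      test.assertEqual(_features_by_path(actual), _features_by_path(expected))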
Example #6
  def test_write_stats_to_tfrecord(self):
     stats = text_format.Parse(
         """
     datasets {
       name: 'x'
       num_examples: 100
     }
     """, statistics_pb2.DatasetFeatureStatisticsList())
     output_path = os.path.join(self._get_temp_dir(), 'stats')
     with beam.Pipeline() as p:
         _ = (p | beam.Create([stats])
              | stats_api.WriteStatisticsToTFRecord(output_path))
     stats_from_file = stats_util.load_statistics(output_path)
     self.assertLen(stats_from_file.datasets, 1)
     test_util.assert_dataset_feature_stats_proto_equal(
         self, stats_from_file.datasets[0], stats.datasets[0])
Example #7
  def test_stats_gen_with_dataframe_feature_allowlist(self):
    records, _, expected_result = self._get_csv_test(delimiter=',',
                                                     with_header=True)
    input_data_path = self._write_records_to_csv(records, self._get_temp_dir(),
                                                 'input_data.csv')

    dataframe = pd.read_csv(input_data_path)
    stats_options_allowlist = self._default_stats_options
    stats_options_allowlist.feature_allowlist = list(dataframe.columns)
    dataframe['to_be_removed_column'] = [
        [1, 2], [], None, [1], None, [3, 4], [], None]
    result = stats_gen_lib.generate_statistics_from_dataframe(
        dataframe=dataframe, stats_options=stats_options_allowlist, n_jobs=1)
    self.assertLen(result.datasets, 1)
    test_util.assert_dataset_feature_stats_proto_equal(
        self, result.datasets[0], expected_result.datasets[0])
Example #8
    def test_get_combined_statistics_allowlist_features(self):
        statistics = text_format.Parse(
            """
    datasets {
      name: 'test'
      features {
        path { step: 'a' }
        type: FLOAT
      }
      features {
        path { step: 'c' }
        type: INT
      }
      features {
        path { step: 'b' }
        type: STRING
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        expected_output = text_format.Parse(
            """
    datasets {
      name: 'test'
      features {
        path { step: 'a' }
        type: FLOAT
      }
      features {
        path { step: 'b' }
        type: STRING
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        actual_output = display_util._get_combined_statistics(
            statistics,
            allowlist_features=[
                types.FeaturePath(['a']),
                types.FeaturePath(['b'])
            ])
        self.assertLen(actual_output.datasets, 1)
        test_util.assert_dataset_feature_stats_proto_equal(
            self, actual_output.datasets[0], expected_output.datasets[0])
Example #9
   def assert_on_two_protos_with_same_features_in_same_order(self):
       expected = text_format.Parse(
           """
 features {
   path {
     step: 'fa'
   }
   type: STRING
   string_stats {
     unique: 4
   }
 }
 features {
   path {
     step: 'fb'
   }
   type: STRING
   string_stats {
     unique: 5
   }
 }
 """, statistics_pb2.DatasetFeatureStatistics())
       actual = text_format.Parse(
           """
 features {
   path {
     step: 'fa'
   }
   type: STRING
   string_stats {
     unique: 4
   }
 }
 features {
   path {
     step: 'fb'
   }
   type: STRING
   string_stats {
     unique: 5
   }
 }""", statistics_pb2.DatasetFeatureStatistics())
       test_util.assert_dataset_feature_stats_proto_equal(
           self, actual, expected)
Example #10
 def test_write_stats_to_binary_file(self):
     stats = text_format.Parse(
         """
     datasets {
       name: 'x'
       num_examples: 100
     }
     """, statistics_pb2.DatasetFeatureStatisticsList())
     output_path = os.path.join(self._get_temp_dir(), 'stats')
     with beam.Pipeline() as p:
         _ = (p | beam.Create([stats])
              | stats_api.WriteStatisticsToBinaryFile(output_path))
     stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
     serialized_stats = io_util.read_file_to_string(output_path,
                                                    binary_mode=True)
     stats_from_file.ParseFromString(serialized_stats)
     self.assertLen(stats_from_file.datasets, 1)
     test_util.assert_dataset_feature_stats_proto_equal(
         self, stats_from_file.datasets[0], stats.datasets[0])
Example #11
    def test_write_stats_to_tfrecord_and_binary(self):
        stats1 = text_format.Parse(
            """
        datasets {
          name: 'x'
          num_examples: 100
          features: {
             path: {
                step: "f1"
             }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
        stats2 = text_format.Parse(
            """
        datasets {
          name: 'x'
          num_examples: 100
          features: {
             path: {
                step: "f2"
             }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        stats_combined = text_format.Parse(
            """
        datasets {
          name: 'x'
          num_examples: 100
          features: {
             path: {
                step: "f1"
             }
          }
          features: {
             path: {
                step: "f2"
             }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        output_path_binary = os.path.join(self._get_temp_dir(), 'stats.pb')
        output_path_prefix = os.path.join(self._get_temp_dir(), 'stats_shards')
        with beam.Pipeline() as p:
            _ = (p | beam.Create([stats1, stats2])
                 | stats_api.WriteStatisticsToRecordsAndBinaryFile(
                     output_path_binary, output_path_prefix))

        stats_from_pb = statistics_pb2.DatasetFeatureStatisticsList()
        serialized_stats = io_util.read_file_to_string(output_path_binary,
                                                       binary_mode=True)
        stats_from_pb.ParseFromString(serialized_stats)
        self.assertLen(stats_from_pb.datasets, 1)
        test_util.assert_dataset_feature_stats_proto_equal(
            self, stats_from_pb.datasets[0], stats_combined.datasets[0])

        stats_from_shards = stats_util.load_sharded_statistics(
            output_path_prefix + '*').proto()
        self.assertLen(stats_from_shards.datasets, 1)
        test_util.assert_dataset_feature_stats_proto_equal(
            self, stats_from_shards.datasets[0], stats_combined.datasets[0])