Пример #1
0
  def test_numerics(self):
    """Analyze a numeric CSV stored on GCS and verify the computed stats."""
    test_folder = os.path.join(self._bucket_root, 'test_numerics')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    # 100 rows: col1 = i, col2 = 10 * i + 0.5.
    rows = ['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]
    file_io.write_string_to_file(input_file_path, '\n'.join(rows))

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        inverted_features=analyze.invert_features(features))

    stats_path = os.path.join(output_folder, analyze.constant.STATS_FILE)
    stats = json.loads(file_io.read_file_to_string(stats_path).decode())

    self.assertEqual(stats['num_examples'], 100)
    # Expected per-column (max, min, mean) for the generated data.
    expected = {'col1': (99.0, 0.0, 49.5),
                'col2': (990.5, 0.5, 495.5)}
    for name in ('col1', 'col2'):
      col = stats['column_stats'][name]
      col_max, col_min, col_mean = expected[name]
      self.assertAlmostEqual(col['max'], col_max)
      self.assertAlmostEqual(col['min'], col_min)
      self.assertAlmostEqual(col['mean'], col_mean)
Пример #2
0
  def test_numerics(self):
    """Run local analysis over numeric CSV columns and verify the stats."""
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      # 100 rows: col1 = i, col2 = 10 * i + 0.5.
      rows = ['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]
      file_io.write_string_to_file(input_file_path, '\n'.join(rows))

      schema = [{'name': 'col1', 'type': 'INTEGER'},
                {'name': 'col2', 'type': 'FLOAT'}]
      features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                  'col2': {'transform': 'identity', 'source_column': 'col2'}}
      analyze.run_local_analysis(
          output_folder, [input_file_path], schema,
          analyze.invert_features(features))

      stats_path = os.path.join(output_folder, analyze.constant.STATS_FILE)
      stats = json.loads(file_io.read_file_to_string(stats_path).decode())

      self.assertEqual(stats['num_examples'], 100)
      expected = {'col1': (99.0, 0.0, 49.5),
                  'col2': (990.5, 0.5, 495.5)}
      for name, (col_max, col_min, col_mean) in sorted(expected.items()):
        col = stats['column_stats'][name]
        self.assertAlmostEqual(col['max'], col_max)
        self.assertAlmostEqual(col['min'], col_min)
        self.assertAlmostEqual(col['mean'], col_mean)
    finally:
      shutil.rmtree(output_folder)
Пример #3
0
    def test_text(self):
        """Run local analysis over text columns and check the vocab files.

        Local analysis orders each vocab file by count only, so entries
        that share a count may appear in any order.  Runs of tied entries
        are therefore compared as unordered collections instead of
        positionally (asserting an exact order here makes the test flaky).
        """
        output_folder = tempfile.mkdtemp()
        input_file_path = tempfile.mkstemp(dir=output_folder)[1]
        try:
            csv_file = [
                'the quick brown fox,raining in kir',
                'quick   brown brown chicken,raining in pdx'
            ]
            file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

            schema = [{
                'name': 'col1',
                'type': 'STRING'
            }, {
                'name': 'col2',
                'type': 'STRING'
            }]
            features = {
                'col1': {
                    'transform': 'bag_of_words',
                    'source_column': 'col1'
                },
                'col2': {
                    'transform': 'tfidf',
                    'source_column': 'col2'
                }
            }
            analyze.run_local_analysis(output_folder,
                                       [input_file_path], schema,
                                       analyze.invert_features(features))

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze.constant.STATS_FILE)).decode())
            self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
            self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

            vocab_str = file_io.read_file_to_string(
                os.path.join(output_folder,
                             analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
            vocab = pd.read_csv(six.StringIO(vocab_str),
                                header=None,
                                names=['col1', 'count'])
            # 'quick' and 'brown' both appear twice; the remaining three
            # words tie at one occurrence, so compare each run unordered.
            col1_vocab = vocab['col1'].tolist()
            self.assertItemsEqual(col1_vocab[:2], ['brown', 'quick'])
            self.assertItemsEqual(col1_vocab[2:], ['chicken', 'fox', 'the'])
            self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

            vocab_str = file_io.read_file_to_string(
                os.path.join(output_folder,
                             analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
            vocab = pd.read_csv(six.StringIO(vocab_str),
                                header=None,
                                names=['col2', 'count'])
            # 'raining' and 'in' tie at two; 'kir' and 'pdx' tie at one.
            col2_vocab = vocab['col2'].tolist()
            self.assertItemsEqual(col2_vocab[:2], ['in', 'raining'])
            self.assertItemsEqual(col2_vocab[2:], ['kir', 'pdx'])
            self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
        finally:
            shutil.rmtree(output_folder)
Пример #4
0
  def test_categorical(self):
    """Analyze categorical CSV columns on GCS and check the vocab files."""
    test_folder = os.path.join(self._bucket_root, 'test_categorical')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    rows = ['red,car', 'red,truck', 'red,van', 'blue,bike', 'blue,train',
            'green,airplane']
    file_io.write_string_to_file(input_file_path, '\n'.join(rows))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'}]
    features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                'transport': {'transform': 'embedding', 'source_column': 'transport'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        inverted_features=analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
    self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

    def load_vocab(column_name):
      # Vocab files are headerless (value, count) CSVs.
      raw = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % column_name))
      return pd.read_csv(six.StringIO(raw),
                         header=None,
                         names=[column_name, 'count'])

    # Color column.
    vocab = load_vocab('color')
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(vocab, expected_vocab)

    # transport column.
    vocab = load_vocab('transport')
    self.assertEqual(vocab['count'].tolist(), [1] * 6)
    self.assertEqual(vocab['transport'].tolist(),
                     ['airplane', 'bike', 'car', 'train', 'truck', 'van'])
Пример #5
0
    def test_numerics(self):
        """Analyze a numeric CSV stored on GCS and verify the stats file."""
        test_folder = os.path.join(self._bucket_root, 'test_numerics')
        input_file_path = os.path.join(test_folder, 'input.csv')
        output_folder = os.path.join(test_folder, 'test_output')
        file_io.recursive_create_dir(output_folder)

        # 100 rows: col1 = i, col2 = 10 * i + 0.5.
        lines = ['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]
        file_io.write_string_to_file(input_file_path, '\n'.join(lines))

        schema = [{
            'name': 'col1',
            'type': 'INTEGER'
        }, {
            'name': 'col2',
            'type': 'FLOAT'
        }]
        features = {
            'col1': {
                'transform': 'scale',
                'source_column': 'col1'
            },
            'col2': {
                'transform': 'identity',
                'source_column': 'col2'
            }
        }
        analyze.run_cloud_analysis(
            output_dir=output_folder,
            csv_file_pattern=input_file_path,
            bigquery_table=None,
            schema=schema,
            inverted_features=analyze.invert_features(features))

        stats_file = os.path.join(output_folder, analyze.constant.STATS_FILE)
        stats = json.loads(file_io.read_file_to_string(stats_file).decode())

        self.assertEqual(stats['num_examples'], 100)
        expected = {'col1': {'max': 99.0, 'min': 0.0, 'mean': 49.5},
                    'col2': {'max': 990.5, 'min': 0.5, 'mean': 495.5}}
        for name in ('col1', 'col2'):
            col = stats['column_stats'][name]
            for stat_name, stat_value in sorted(expected[name].items()):
                self.assertAlmostEqual(col[stat_name], stat_value)
Пример #6
0
  def test_text(self):
    """Run local analysis on text columns and verify stats and vocabs."""
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      rows = ['the quick brown fox,raining in kir',
              'quick   brown brown chicken,raining in pdx']
      file_io.write_string_to_file(input_file_path, '\n'.join(rows))

      schema = [{'name': 'col1', 'type': 'STRING'}, {'name': 'col2', 'type': 'STRING'}]
      features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                  'col2': {'transform': 'tfidf', 'source_column': 'col2'}}
      analyze.run_local_analysis(
        output_folder, [input_file_path], schema, analyze.invert_features(features))

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
      self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

      def load_vocab(column_name):
        # Vocab files are headerless (token, count) CSVs.
        raw = file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.VOCAB_ANALYSIS_FILE % column_name))
        return pd.read_csv(six.StringIO(raw),
                           header=None,
                           names=[column_name, 'count'])

      # vocabs are sorted by count only
      vocab = load_vocab('col1')
      col1_vocab = vocab['col1'].tolist()
      self.assertItemsEqual(col1_vocab[:2], ['brown', 'quick'])
      self.assertItemsEqual(col1_vocab[2:], ['chicken', 'fox', 'the'])
      self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

      # vocabs are sorted by count only
      vocab = load_vocab('col2')
      col2_vocab = vocab['col2'].tolist()
      self.assertItemsEqual(col2_vocab[:2], ['in', 'raining'])
      self.assertItemsEqual(col2_vocab[2:], ['kir', 'pdx'])
      self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
    finally:
      shutil.rmtree(output_folder)
Пример #7
0
  def test_numerics(self):
    """Build a BQ table, and then call analyze on it.

    Creates a throwaway dataset/table, runs cloud analysis against it,
    checks the stats output, and deletes everything it created.
    """
    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    project_id = dl.Context.default().project_id
    dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
    table_name = 'temp_table'
    full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

    output_folder = tempfile.mkdtemp()

    # The dataset is created inside the try block; None means there is
    # nothing to clean up.  (Referencing an unbound `db` in the finally
    # clause would raise NameError and mask the original error.)
    db = None
    try:
      # Make a dataset, a table, and insert data.
      db = bq.Dataset((project_id, dataset_name))
      db.create()

      table = bq.Table(full_table_name)
      table.create(schema=bq.Schema(schema), overwrite=True)

      # 100 rows: col1 = i, col2 = 10 * i + 0.5.
      data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
      table.insert(data)

      features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                  'col2': {'transform': 'identity', 'source_column': 'col2'}}
      analyze.run_cloud_analysis(
          output_dir=output_folder,
          csv_file_pattern=None,
          bigquery_table=full_table_name,
          schema=schema,
          inverted_features=analyze.invert_features(features))

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

      self.assertEqual(stats['num_examples'], 100)
      col = stats['column_stats']['col1']
      self.assertAlmostEqual(col['max'], 99.0)
      self.assertAlmostEqual(col['min'], 0.0)
      self.assertAlmostEqual(col['mean'], 49.5)

      col = stats['column_stats']['col2']
      self.assertAlmostEqual(col['max'], 990.5)
      self.assertAlmostEqual(col['min'], 0.5)
      self.assertAlmostEqual(col['mean'], 495.5)
    finally:
      shutil.rmtree(output_folder)
      if db is not None:
        db.delete(delete_contents=True)
Пример #8
0
  def test_text(self):
    """Analyze text CSV columns on GCS and verify stats and vocab files."""
    test_folder = os.path.join(self._bucket_root, 'test_text')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    rows = ['the quick brown fox,raining in kir',
            'quick   brown brown chicken,raining in pdx']
    file_io.write_string_to_file(input_file_path, '\n'.join(rows))

    schema = [{'name': 'col1', 'type': 'STRING'},
              {'name': 'col2', 'type': 'STRING'}]
    features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                'col2': {'transform': 'tfidf', 'source_column': 'col2'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        inverted_features=analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
    self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

    def load_vocab(column_name):
      # Vocab files are headerless (token, count) CSVs.
      raw = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % column_name))
      return pd.read_csv(six.StringIO(raw),
                         header=None,
                         names=[column_name, 'count'])

    vocab = load_vocab('col1')
    self.assertEqual(vocab['col1'].tolist(),
                     ['brown', 'quick', 'chicken', 'fox', 'the', ])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

    vocab = load_vocab('col2')
    self.assertEqual(vocab['col2'].tolist(), ['in', 'raining', 'kir', 'pdx'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
Пример #9
0
  def test_categorical(self):
    """Run local analysis on categorical columns and check the vocabs."""
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      rows = ['red,car', 'red,truck', 'red,van', 'blue,bike', 'blue,train',
              'green,airplane']
      file_io.write_string_to_file(input_file_path, '\n'.join(rows))

      schema = [{'name': 'color', 'type': 'STRING'},
                {'name': 'transport', 'type': 'STRING'}]
      features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                  'transport': {'transform': 'embedding', 'source_column': 'transport'}}
      analyze.run_local_analysis(
        output_folder, [input_file_path], schema, analyze.invert_features(features))

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
      self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

      def load_vocab(column_name):
        # Vocab files are headerless (value, count) CSVs.
        raw = file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.VOCAB_ANALYSIS_FILE % column_name))
        return pd.read_csv(six.StringIO(raw),
                           header=None,
                           names=[column_name, 'count'])

      # Color column: the counts differ, so the order is fully determined.
      vocab = load_vocab('color')
      expected_vocab = pd.DataFrame(
          {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
          columns=['color', 'count'])
      pd.util.testing.assert_frame_equal(vocab, expected_vocab)

      # transport column. As each vocab has the same count, order in file is
      # not known.
      vocab = load_vocab('transport')
      self.assertEqual(vocab['count'].tolist(), [1] * 6)
      self.assertItemsEqual(vocab['transport'].tolist(),
                            ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
    finally:
      shutil.rmtree(output_folder)
Пример #10
0
    def test_numerics(self):
        """Run local analysis on numeric columns and verify the stats."""
        output_folder = tempfile.mkdtemp()
        input_file_path = tempfile.mkstemp(dir=output_folder)[1]
        try:
            # 100 rows: col1 = i, col2 = 10 * i + 0.5.
            rows = ['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]
            file_io.write_string_to_file(input_file_path, '\n'.join(rows))

            schema = [{
                'name': 'col1',
                'type': 'INTEGER'
            }, {
                'name': 'col2',
                'type': 'FLOAT'
            }]
            features = {
                'col1': {
                    'transform': 'scale',
                    'source_column': 'col1'
                },
                'col2': {
                    'transform': 'identity',
                    'source_column': 'col2'
                }
            }
            analyze.run_local_analysis(output_folder,
                                       [input_file_path], schema,
                                       analyze.invert_features(features))

            stats_file = os.path.join(output_folder,
                                      analyze.constant.STATS_FILE)
            stats = json.loads(
                file_io.read_file_to_string(stats_file).decode())

            self.assertEqual(stats['num_examples'], 100)
            expected = {'col1': (99.0, 0.0, 49.5),
                        'col2': (990.5, 0.5, 495.5)}
            for name, (col_max, col_min, col_mean) in sorted(
                    expected.items()):
                col = stats['column_stats'][name]
                self.assertAlmostEqual(col['max'], col_max)
                self.assertAlmostEqual(col['min'], col_min)
                self.assertAlmostEqual(col['mean'], col_mean)
        finally:
            shutil.rmtree(output_folder)
Пример #11
0
  def test_check_schema_transforms_match(self):
    """Every schema/transform mismatch below must raise ValueError."""
    bad_cases = [
        # Numeric column with a categorical transform.
        ([{'name': 'col1', 'type': 'INTEGER'}],
         {'col1': {'transform': 'one_hot', 'source_column': 'col1'}}),
        ([{'name': 'col1', 'type': 'FLOAT'}],
         {'col1': {'transform': 'embedding', 'source_column': 'col1'}}),
        # String column with a numeric transform.
        ([{'name': 'col1', 'type': 'STRING'}],
         {'col1': {'transform': 'scale', 'source_column': 'col1'}}),
        # Unknown schema type.
        ([{'name': 'col1', 'type': 'xxx'}],
         {'col1': {'transform': 'scale', 'source_column': 'col1'}}),
        # Unknown transform.
        ([{'name': 'col1', 'type': 'INTEGER'}],
         {'col1': {'transform': 'xxx', 'source_column': 'col1'}}),
        # scale and one_hot different transform family on one source column.
        ([{'name': 'col1', 'type': 'INTEGER'}],
         {'col1': {'transform': 'scale', 'source_column': 'col1'},
          'col2': {'transform': 'one_hot', 'source_column': 'col1'},
          'col3': {'transform': 'key', 'source_column': 'col1'}}),
        # Unknown transform.
        ([{'name': 'col1', 'type': 'INTEGER'}],
         {'col1': {'transform': 'x', 'source_column': 'col1'}}),
    ]
    for schema, features in bad_cases:
      with self.assertRaises(ValueError):
        analyze.check_schema_transforms_match(
            schema, analyze.invert_features(features))
Пример #12
0
    def test_text(self):
        """Analyze text CSV columns on GCS and check stats and vocabs."""
        test_folder = os.path.join(self._bucket_root, 'test_text')
        input_file_path = os.path.join(test_folder, 'input.csv')
        output_folder = os.path.join(test_folder, 'test_output')
        file_io.recursive_create_dir(output_folder)

        rows = [
            'the quick brown fox,raining in kir',
            'quick   brown brown chicken,raining in pdx'
        ]
        file_io.write_string_to_file(input_file_path, '\n'.join(rows))

        schema = [{
            'name': 'col1',
            'type': 'STRING'
        }, {
            'name': 'col2',
            'type': 'STRING'
        }]
        features = {
            'col1': {
                'transform': 'bag_of_words',
                'source_column': 'col1'
            },
            'col2': {
                'transform': 'tfidf',
                'source_column': 'col2'
            }
        }
        analyze.run_cloud_analysis(
            output_dir=output_folder,
            csv_file_pattern=input_file_path,
            bigquery_table=None,
            schema=schema,
            inverted_features=analyze.invert_features(features))

        stats = json.loads(
            file_io.read_file_to_string(
                os.path.join(output_folder,
                             analyze.constant.STATS_FILE)).decode())
        self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
        self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

        def load_vocab(column_name):
            # Vocab files are headerless (token, count) CSVs.
            raw = file_io.read_file_to_string(
                os.path.join(
                    output_folder,
                    analyze.constant.VOCAB_ANALYSIS_FILE % column_name))
            return pd.read_csv(six.StringIO(raw),
                               header=None,
                               names=[column_name, 'count'])

        vocab = load_vocab('col1')
        self.assertEqual(vocab['col1'].tolist(), [
            'brown',
            'quick',
            'chicken',
            'fox',
            'the',
        ])
        self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

        vocab = load_vocab('col2')
        self.assertEqual(vocab['col2'].tolist(),
                         ['in', 'raining', 'kir', 'pdx'])
        self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
Пример #13
0
    def test_categorical(self):
        """Analyze categorical CSV columns on GCS and check the vocabs."""
        test_folder = os.path.join(self._bucket_root, 'test_categorical')
        input_file_path = os.path.join(test_folder, 'input.csv')
        output_folder = os.path.join(test_folder, 'test_output')
        file_io.recursive_create_dir(output_folder)

        rows = [
            'red,car', 'red,truck', 'red,van', 'blue,bike', 'blue,train',
            'green,airplane'
        ]
        file_io.write_string_to_file(input_file_path, '\n'.join(rows))

        schema = [{
            'name': 'color',
            'type': 'STRING'
        }, {
            'name': 'transport',
            'type': 'STRING'
        }]
        features = {
            'color': {
                'transform': 'one_hot',
                'source_column': 'color'
            },
            'transport': {
                'transform': 'embedding',
                'source_column': 'transport'
            }
        }
        analyze.run_cloud_analysis(
            output_dir=output_folder,
            csv_file_pattern=input_file_path,
            bigquery_table=None,
            schema=schema,
            inverted_features=analyze.invert_features(features))

        stats = json.loads(
            file_io.read_file_to_string(
                os.path.join(output_folder,
                             analyze.constant.STATS_FILE)).decode())
        self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
        self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

        def load_vocab(column_name):
            # Vocab files are headerless (value, count) CSVs.
            raw = file_io.read_file_to_string(
                os.path.join(
                    output_folder,
                    analyze.constant.VOCAB_ANALYSIS_FILE % column_name))
            return pd.read_csv(six.StringIO(raw),
                               header=None,
                               names=[column_name, 'count'])

        # Color column.
        vocab = load_vocab('color')
        expected_vocab = pd.DataFrame(
            {
                'color': ['red', 'blue', 'green'],
                'count': [3, 2, 1]
            },
            columns=['color', 'count'])
        pd.util.testing.assert_frame_equal(vocab, expected_vocab)

        # transport column.
        vocab = load_vocab('transport')
        self.assertEqual(vocab['count'].tolist(), [1] * 6)
        self.assertEqual(vocab['transport'].tolist(),
                         ['airplane', 'bike', 'car', 'train', 'truck', 'van'])
Пример #14
0
    def test_numerics(self):
        """Build a BQ table, and then call analyze on it.

        Creates a throwaway dataset/table, runs cloud analysis against
        it, checks the stats output, and deletes everything it created.
        """
        schema = [{
            'name': 'col1',
            'type': 'INTEGER'
        }, {
            'name': 'col2',
            'type': 'FLOAT'
        }]
        project_id = dl.Context.default().project_id
        dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
        table_name = 'temp_table'
        full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

        output_folder = tempfile.mkdtemp()

        # The dataset is created inside the try block; None means there
        # is nothing to clean up.  (Referencing an unbound `db` in the
        # finally clause would raise NameError and mask the real error.)
        db = None
        try:
            # Make a dataset, a table, and insert data.
            db = bq.Dataset((project_id, dataset_name))
            db.create()

            table = bq.Table(full_table_name)
            table.create(schema=bq.Schema(schema), overwrite=True)

            # 100 rows: col1 = i, col2 = 10 * i + 0.5.
            data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
            table.insert(data)

            features = {
                'col1': {
                    'transform': 'scale',
                    'source_column': 'col1'
                },
                'col2': {
                    'transform': 'identity',
                    'source_column': 'col2'
                }
            }
            analyze.run_cloud_analysis(
                output_dir=output_folder,
                csv_file_pattern=None,
                bigquery_table=full_table_name,
                schema=schema,
                inverted_features=analyze.invert_features(features))

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze.constant.STATS_FILE)).decode())

            self.assertEqual(stats['num_examples'], 100)
            col = stats['column_stats']['col1']
            self.assertAlmostEqual(col['max'], 99.0)
            self.assertAlmostEqual(col['min'], 0.0)
            self.assertAlmostEqual(col['mean'], 49.5)

            col = stats['column_stats']['col2']
            self.assertAlmostEqual(col['max'], 990.5)
            self.assertAlmostEqual(col['min'], 0.5)
            self.assertAlmostEqual(col['mean'], 495.5)
        finally:
            shutil.rmtree(output_folder)
            if db is not None:
                db.delete(delete_contents=True)
Пример #15
0
    def test_categorical(self):
        """Run local analysis on categorical columns and check the vocabs."""
        output_folder = tempfile.mkdtemp()
        input_file_path = tempfile.mkstemp(dir=output_folder)[1]
        try:
            rows = [
                'red,car', 'red,truck', 'red,van', 'blue,bike', 'blue,train',
                'green,airplane'
            ]
            file_io.write_string_to_file(input_file_path, '\n'.join(rows))

            schema = [{
                'name': 'color',
                'type': 'STRING'
            }, {
                'name': 'transport',
                'type': 'STRING'
            }]
            features = {
                'color': {
                    'transform': 'one_hot',
                    'source_column': 'color'
                },
                'transport': {
                    'transform': 'embedding',
                    'source_column': 'transport'
                }
            }
            analyze.run_local_analysis(output_folder,
                                       [input_file_path], schema,
                                       analyze.invert_features(features))

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze.constant.STATS_FILE)).decode())
            self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
            self.assertEqual(stats['column_stats']['transport']['vocab_size'],
                             6)

            def load_vocab(column_name):
                # Vocab files are headerless (value, count) CSVs.
                raw = file_io.read_file_to_string(
                    os.path.join(
                        output_folder,
                        analyze.constant.VOCAB_ANALYSIS_FILE % column_name))
                return pd.read_csv(six.StringIO(raw),
                                   header=None,
                                   names=[column_name, 'count'])

            # Color column: the counts differ, so order is determined.
            vocab = load_vocab('color')
            expected_vocab = pd.DataFrame(
                {
                    'color': ['red', 'blue', 'green'],
                    'count': [3, 2, 1]
                },
                columns=['color', 'count'])
            pd.util.testing.assert_frame_equal(vocab, expected_vocab)

            # transport column. As each vocab has the same count, order in file is
            # not known.
            vocab = load_vocab('transport')
            self.assertEqual(vocab['count'].tolist(), [1] * 6)
            self.assertItemsEqual(
                vocab['transport'].tolist(),
                ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
        finally:
            shutil.rmtree(output_folder)
Пример #16
0
    def test_check_schema_transforms_match(self):
        """check_schema_transforms_match raises ValueError on bad pairings."""
        # Each entry is (column type, features dict) that must be rejected.
        # Cases appear in the same order as the original individual checks.
        bad_cases = [
            # Categorical transform applied to a numeric column.
            ('INTEGER',
             {'col1': {'transform': 'one_hot', 'source_column': 'col1'}}),
            # Embedding transform applied to a float column.
            ('FLOAT',
             {'col1': {'transform': 'embedding', 'source_column': 'col1'}}),
            # Numeric transform applied to a string column.
            ('STRING',
             {'col1': {'transform': 'scale', 'source_column': 'col1'}}),
            # Unknown schema type.
            ('xxx',
             {'col1': {'transform': 'scale', 'source_column': 'col1'}}),
            # Unknown transform name.
            ('INTEGER',
             {'col1': {'transform': 'xxx', 'source_column': 'col1'}}),
            # scale and one_hot different transform family sharing one
            # source column.
            ('INTEGER',
             {'col1': {'transform': 'scale', 'source_column': 'col1'},
              'col2': {'transform': 'one_hot', 'source_column': 'col1'},
              'col3': {'transform': 'key', 'source_column': 'col1'}}),
            # Unknown transform
            ('INTEGER',
             {'col1': {'transform': 'x', 'source_column': 'col1'}}),
        ]
        for col_type, features in bad_cases:
            schema = [{'name': 'col1', 'type': col_type}]
            with self.assertRaises(ValueError):
                analyze.check_schema_transforms_match(
                    schema, analyze.invert_features(features))