Example #1
  def test_transform_csv(self, popen_mock, run_and_monitor_mock):
    mlmagic.ml(
        line='transform --prefix my_prefix --shuffle --cloud',
        cell="""\
            output: my_out_dir
            analysis: my_analyze_dir
            batch_size: 123
            training_data:
              csv: file*.csv
            cloud_config:
              project_id: my_id
              num_workers: 987
              worker_machine_type: BLUE
              job_name: RED""")
    cmd_list = run_and_monitor_mock.call_args[0][0]
    # cmd_list = [u'python', u'transform.py', u'--output', 'path/my_out_dir',
    #   u'--analysis', 'path/my_analyze_dir', u'--prefix', 'my_prefix',
    #   u'--shuffle', u'--batch-size', '123', u'--csv=/path/file*.csv'
    #   ...
    self.assertEqual('python', cmd_list[0])
    self.assertEqual('transform.py', cmd_list[1])
    self.assertIn('--shuffle', cmd_list)

    self.assertTrue(find_key_endswith(cmd_list, '--output', 'my_out_dir'))
    self.assertTrue(find_key_endswith(cmd_list, '--analysis', 'my_analyze_dir'))
    self.assertTrue(find_key_value(cmd_list, '--prefix', 'my_prefix'))
    self.assertTrue(find_key_value(cmd_list, '--batch-size', '123'))
    self.assertTrue(find_startswith_endswith(cmd_list, '--csv=', 'file*.csv'))
    self.assertTrue(find_key_value(cmd_list, '--project-id', 'my_id'))
    self.assertTrue(find_key_value(cmd_list, '--num-workers', '987'))
    self.assertTrue(find_key_value(cmd_list, '--worker-machine-type', 'BLUE'))
    self.assertTrue(find_key_value(cmd_list, '--job-name', 'RED'))
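
The assertions above (and throughout the examples below) use three small list-matching helpers that these excerpts assume are defined elsewhere in the test module. Below is a minimal sketch consistent with how they are called; the exact implementations are an assumption, not copied from the original source.

def find_key_value(cmd_list, key, value):
  """True if `key` appears in cmd_list immediately followed by `value`."""
  return any(cmd_list[i] == key and cmd_list[i + 1] == value
             for i in range(len(cmd_list) - 1))


def find_key_endswith(cmd_list, key, suffix):
  """True if `key` is immediately followed by an argument ending with `suffix`."""
  return any(cmd_list[i] == key and cmd_list[i + 1].endswith(suffix)
             for i in range(len(cmd_list) - 1))


def find_startswith_endswith(cmd_list, prefix, suffix):
  """True if any single argument starts with `prefix` and ends with `suffix`."""
  return any(arg.startswith(prefix) and arg.endswith(suffix) for arg in cmd_list)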
Example #2
  def test_train_csv(self, popen_mock, submit_training_mock,
                     package_and_copy_mock, _show_job_link_mock):
    mlmagic.ml(
        line='train --cloud',
        cell="""\
            output: gs://my_out_dir
            analysis: my_analyze_dir
            training_data:
              csv: file*.csv
            evaluation_data:
              transformed: file*.tfrecords.gz
            model_args:
              key: value
            cloud_config:
              job_name: job1
              project_id: id""")
    job_request = submit_training_mock.call_args[0][0]

    cmd_list = job_request['args']

    self.assertEqual(job_request['project_id'], 'id')
    self.assertEqual(job_request['job_dir'], 'gs://my_out_dir')
    self.assertEqual(job_request['python_module'], 'trainer.task')
    self.assertEqual(job_request['package_uris'], ['gs://my_out_dir/staging/trainer.tar.gz'])

    self.assertTrue(find_key_value(cmd_list, '--job-dir', 'gs://my_out_dir'))
    self.assertTrue(find_key_endswith(cmd_list, '--analysis', 'my_analyze_dir'))
    self.assertTrue(find_startswith_endswith(cmd_list, '--train=', 'file*.csv'))
    self.assertIn('--transform', cmd_list)
    self.assertTrue(find_startswith_endswith(cmd_list, '--eval=', '*.tfrecords.gz'))
    self.assertTrue(find_key_value(cmd_list, '--key', 'value'))
Example #3
  def test_analyze_csv_local(self, popen_mock, run_and_monitor_mock):
    mlmagic.ml(
      line='dataset create',
      cell="""\
          format: csv
          train: ./taxi/train.csv
          eval: ./taxi/eval.csv
          name: taxi_data
          schema:
              - name: unique_key
                type: STRING
              - name: fare
                type: FLOAT"""
    )
    mlmagic.ml(
        line='analyze',
        cell="""\
            output: my_out_dir
            data: taxi_data
            features: dummy_features""")
    cmd_list = run_and_monitor_mock.call_args[0][0]
    # cmd_list = [u'python', u'analyze.py', u'--output', 'path/my_out_dir',
    #   u'--csv=path/train.csv', u'--schema', u'/path/schema.json',
    #   u'--features', u'path/features.json']

    self.assertEqual('python', cmd_list[0])
    self.assertEqual('analyze.py', cmd_list[1])
    self.assertIn('--schema', cmd_list)
    self.assertIn('--features', cmd_list)
    self.assertTrue(find_key_endswith(cmd_list, '--output', 'my_out_dir'))
    self.assertTrue(find_startswith_endswith(cmd_list, '--csv=', 'train.csv'))
Example #4
  def test_train_csv(self, popen_mock, submit_training_mock,
                     package_and_copy_mock, _show_job_link_mock):
    mlmagic.ml(
      line='dataset create',
      cell="""\
          format: transformed
          train: ./taxi/train_tfrecord.tar.gz
          eval: ./taxi/eval_tfrecord.tar.gz
          name: taxi_data_transformed"""
    )
    mlmagic.ml(
        line='train --cloud',
        cell="""\
            output: gs://my_out_dir
            analysis: my_analyze_dir
            data: $taxi_data_transformed
            model_args:
              key: value
            cloud_config:
              job_name: job1
              project_id: id""")
    job_request = submit_training_mock.call_args[0][0]

    cmd_list = job_request['args']

    self.assertEqual(job_request['project_id'], 'id')
    self.assertEqual(job_request['job_dir'], 'gs://my_out_dir')
    self.assertEqual(job_request['python_module'], 'trainer.task')
    self.assertEqual(job_request['package_uris'], ['gs://my_out_dir/staging/trainer.tar.gz'])

    self.assertTrue(find_key_value(cmd_list, '--job-dir', 'gs://my_out_dir'))
    self.assertTrue(find_key_endswith(cmd_list, '--analysis', 'my_analyze_dir'))
    self.assertTrue(find_startswith_endswith(cmd_list, '--train=', 'train_tfrecord.tar.gz'))
    self.assertTrue(find_startswith_endswith(cmd_list, '--eval=', 'eval_tfrecord.tar.gz'))
    self.assertTrue(find_key_value(cmd_list, '--key', 'value'))
Example #5
  def test_predict_csv(self, popen_mock, get_prediction_results_mock, display_mock):

    # Don't run prediction against a real graph; just return a single row.
    df = pd.DataFrame({'col1': ['key1'], 'col2': ['value1'], 'col3': ['value2']})
    get_prediction_results_mock.return_value = df

    mlmagic.ml(
        line='predict --cloud',
        cell="""\
            model: model.version
            headers: col1,col2,col3
            image_columns: col3
            prediction_data:
              - key1,value1,value2""")

    # Check that the prediction result was displayed exactly once.
    self.assertEqual(1, display_mock.call_count)
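
Each test above receives its mock arguments (popen_mock, run_and_monitor_mock, submit_training_mock, display_mock, and so on) from stacked mock.patch decorators that these excerpts omit; the real patch targets are functions inside the mlmagic / ML Workbench modules. What matters when reading the signatures is the ordering rule: patches apply bottom-up, so the decorator closest to the function supplies the first mock argument after self. A minimal, self-contained sketch of that rule, using stand-in targets rather than the ones the original tests patch:

import subprocess
import unittest
from unittest import mock


class PatchOrderingDemo(unittest.TestCase):
  # Stacked patches apply bottom-up: the decorator nearest the function
  # supplies the first mock argument after `self`.
  @mock.patch('os.path.isdir')     # -> isdir_mock (third mock argument)
  @mock.patch('os.path.exists')    # -> exists_mock (second mock argument)
  @mock.patch('subprocess.Popen')  # -> popen_mock (first mock argument)
  def test_ordering(self, popen_mock, exists_mock, isdir_mock):
    subprocess.Popen(['echo'])     # hits the mock; no real process is started
    self.assertTrue(popen_mock.called)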
Example #6
    def _create_text_test_data(self):
        """Create text model."""

        test_data = """1,sour green round,lime
    2,melon green long,cucumber
    3,sweet round red,apple"""
        train_csv = os.path.join(self._test_dir, 'train.csv')
        with open(train_csv, 'w') as f:
            f.write(test_data)

        analyze_dir = os.path.join(self._test_dir, 'analysistxt')
        train_dir = os.path.join(self._test_dir, 'traintxt')

        mlmagic.ml(line='analyze',
                   cell="""\
            output: %s
            training_data:
              csv: %s
              schema:
                - name: key
                  type: INTEGER
                - name: text
                  type: STRING
                - name: target
                  type: STRING
            features:
              key:
                transform: key
              text:
                transform: bag_of_words
              target:
                transform: target""" % (analyze_dir, train_csv))

        mlmagic.ml(line='train',
                   cell="""\
            output: %s
            analysis: %s
            training_data:
              csv: %s
            evaluation_data:
              csv: %s
            model_args:
              model: linear_classification
              top-n: 0
              max-steps: 300""" %
                   (train_dir, analyze_dir, train_csv, train_csv))
Example #7
  def test_transform_csv(self, popen_mock, run_and_monitor_mock):
    mlmagic.ml(
      line='dataset create',
      cell="""\
          format: csv
          train: ./taxi/train.csv
          eval: ./taxi/eval.csv
          name: taxi_data
          schema:
              - name: unique_key
                type: STRING
              - name: fare
                type: FLOAT"""
    )
    mlmagic.ml(
        line='transform --shuffle --cloud',
        cell="""\
            output: my_out_dir
            analysis: my_analyze_dir
            batch_size: 123
            data: taxi_data
            cloud_config:
              project_id: my_id
              num_workers: 987
              worker_machine_type: BLUE
              job_name: RED""")
    cmd_list = run_and_monitor_mock.call_args[0][0]
    # cmd_list = [u'python', u'transform.py', u'--output', 'path/my_out_dir',
    #   u'--analysis', 'path/my_analyze_dir', u'--prefix', 'train',
    #   u'--shuffle', u'--batch-size', '123', u'--csv=/path/train.csv'
    #   ...
    self.assertEqual('python', cmd_list[0])
    self.assertEqual('transform.py', cmd_list[1])
    self.assertIn('--shuffle', cmd_list)

    self.assertTrue(find_key_endswith(cmd_list, '--output', 'my_out_dir'))
    self.assertTrue(find_key_endswith(cmd_list, '--analysis', 'my_analyze_dir'))
    self.assertTrue(find_key_value(cmd_list, '--prefix', 'train') or
                    find_key_value(cmd_list, '--prefix', 'eval'))
    self.assertTrue(find_key_value(cmd_list, '--batch-size', '123'))
    self.assertTrue(find_startswith_endswith(cmd_list, '--csv=', 'train.csv') or
                    find_startswith_endswith(cmd_list, '--csv=', 'eval.csv'))
    self.assertTrue(find_key_value(cmd_list, '--project-id', 'my_id'))
    self.assertTrue(find_key_value(cmd_list, '--num-workers', '987'))
    self.assertTrue(find_key_value(cmd_list, '--worker-machine-type', 'BLUE'))
    self.assertTrue(find_key_value(cmd_list, '--job-name', 'RED'))
Example #8
  def test_analyze_csv_local(self, popen_mock, run_and_monitor_mock):
    mlmagic.ml(
        line='analyze',
        cell="""\
            output: my_out_dir
            training_data:
              csv: file*.csv
              schema: dummy_schema
            features: dummy_features""")
    cmd_list = run_and_monitor_mock.call_args[0][0]
    # cmd_list = [u'python', u'analyze.py', u'--output', 'path/my_out_dir',
    #   u'--csv=path/file*.csv', u'--schema', u'/path/schema.json',
    #   u'--features', u'path/features.json']

    self.assertEqual('python', cmd_list[0])
    self.assertEqual('analyze.py', cmd_list[1])
    self.assertIn('--schema', cmd_list)
    self.assertIn('--features', cmd_list)
    self.assertTrue(find_key_endswith(cmd_list, '--output', 'my_out_dir'))
    self.assertTrue(find_startswith_endswith(cmd_list, '--csv=', 'file*.csv'))
Example #9
    def _create_tabular_test_data(self):
        """Create tabular model with text."""

        test_data = """1,5.0,monday,word1 word2 word3,true
    2,3.2,tuesday,word1 word3,true
    3,-1.1,friday,word1,false"""
        train_csv = os.path.join(self._test_dir, 'train.csv')
        with open(train_csv, 'w') as f:
            f.write(test_data)

        df = pd.read_csv(train_csv,
                         names=['key', 'num', 'weekday', 'garbage', 'target'])
        analyze_dir = os.path.join(self._test_dir, 'analysistab')
        train_dir = os.path.join(self._test_dir, 'traintab')

        mlmagic.ml(line='dataset create',
                   cell="""\
            format: csv
            name: mytabular
            schema:
                - name: key
                  type: INTEGER
                - name: num
                  type: FLOAT
                - name: weekday
                  type: STRING
                - name: garbage
                  type: STRING
                - name: target
                  type: STRING
            train: %s
            eval: %s""" % (train_csv, train_csv))

        mlmagic.ml(line='analyze',
                   cell="""\
            output: %s
            data: mytabular
            features:
              key:
                transform: key
              num:
                transform: scale
              weekday:
                transform: one_hot
              garbage:
                transform: bag_of_words
              target:
                transform: target""" % (analyze_dir))

        mlmagic.ml(line='train',
                   cell="""\
            output: %s
            analysis: %s
            data: mytabular
            notb: true
            model_args:
              model: linear_classification
              top-n: 0
              max-steps: 300""" % (train_dir, analyze_dir))
        return df
Example #10
  def test_batch_predict_csv(self, popen_mock, submit_batch_prediction_mock,
                             default_mock, _show_job_link_mock):
    default_mock.return_value = mock.Mock(project_id='my_project_id')

    mlmagic.ml(
        line='batch_predict --cloud',
        cell="""\
            model: my_model.my_version
            output: gs://output
            format: json
            batch_size: 10
            prediction_data:
              csv: %s""" % os.path.abspath(__file__))

    job_args = submit_batch_prediction_mock.call_args[0][0]

    self.assertEqual(job_args['input_paths'], [os.path.abspath(__file__)])
    self.assertEqual(
        job_args['version_name'],
        'projects/my_project_id/models/my_model/versions/my_version')
    self.assertEqual(job_args['output_path'], 'gs://output')
    self.assertEqual(job_args['data_format'], 'TEXT')
Example #11
  def _create_text_test_data(self):
    """Create text model."""

    test_data = """1,sour green round,lime
    2,melon green long,cucumber
    3,sweet round red,apple"""
    train_csv = os.path.join(self._test_dir, 'train.csv')
    with open(train_csv, 'w') as f:
      f.write(test_data)

    analyze_dir = os.path.join(self._test_dir, 'analysistxt')
    train_dir = os.path.join(self._test_dir, 'traintxt')

    mlmagic.ml(
        line='dataset create',
        cell="""\
            format: csv
            name: mytext
            schema:
                - name: key
                  type: INTEGER
                - name: text
                  type: STRING
                - name: target
                  type: STRING
            train: %s
            eval: %s""" % (train_csv, train_csv))

    mlmagic.ml(
        line='analyze',
        cell="""\
            output: %s
            data: mytext
            features:
              key:
                transform: key
              text:
                transform: bag_of_words
              target:
                transform: target""" % (analyze_dir))

    mlmagic.ml(
        line='train',
        cell="""\
            output: %s
            analysis: %s
            data: mytext
            notb: true
            model_args:
              model: linear_classification
              top-n: 0
              max-steps: 300""" % (train_dir, analyze_dir))
Example #12
    def _create_image_test_data(self):
        image_path1 = os.path.join(self._test_dir, 'img1.jpg')
        image_path2 = os.path.join(self._test_dir, 'img2.jpg')
        image_path3 = os.path.join(self._test_dir, 'img3.jpg')
        Image.new('RGB', size=(128, 128),
                  color=(155, 211, 64)).save(image_path1, "JPEG")
        Image.new('RGB', size=(64, 64),
                  color=(111, 21, 86)).save(image_path2, "JPEG")
        Image.new('RGB', size=(16, 16),
                  color=(255, 21, 1)).save(image_path3, "JPEG")
        test_data = """1,1.2,word1 word2,%s,true
2,3.2,word2 word3,%s,false
5,-2.1,word3 word4,%s,true""" % (image_path1, image_path2, image_path3)

        train_csv = os.path.join(self._test_dir, 'train.csv')
        with open(train_csv, 'w') as f:
            f.write(test_data)

        analyze_dir = os.path.join(self._test_dir, 'analysisimg')
        transform_dir = os.path.join(self._test_dir, 'transformimg')
        train_dir = os.path.join(self._test_dir, 'trainimg')

        # Download the Inception checkpoint. Note that a gs:// URL doesn't work
        # because we may not be signed in to gcloud when running the test.
        url = ('https://storage.googleapis.com/cloud-ml-data/img/' +
               'flower_photos/inception_v3_2016_08_28.ckpt')
        checkpoint_path = os.path.join(self._test_dir, "checkpoint")
        response = urlopen(url)
        with open(checkpoint_path, 'wb') as f:
            f.write(response.read())

        mlmagic.ml(line='dataset create',
                   cell="""\
            format: csv
            name: myds
            schema:
              - name: key
                type: INTEGER
              - name: num
                type: FLOAT
              - name: text
                type: STRING
              - name: img_url
                type: STRING
              - name: target
                type: STRING
            train: %s
            eval: %s""" % (train_csv, train_csv))

        mlmagic.ml(line='analyze',
                   cell="""\
            output: %s
            data: myds
            features:
              key:
                transform: key
              num:
                transform: scale
              text:
                transform: bag_of_words
              img_url:
                transform: image_to_vec
                checkpoint: %s
              target:
                transform: target""" % (analyze_dir, checkpoint_path))

        mlmagic.ml(line='transform',
                   cell="""\
            output: %s
            analysis: %s
            data: myds""" % (transform_dir, analyze_dir))

        mlmagic.ml(line='dataset create',
                   cell="""\
            format: transformed
            name: transformed_ds
            train: %s/train-*
            eval: %s/eval-*""" % (transform_dir, transform_dir))

        mlmagic.ml(line='train',
                   cell="""\
            output: %s
            analysis: %s
            data: transformed_ds
            notb: true
            model_args:
              model: linear_classification
              top-n: 0
              max-steps: 200""" % (train_dir, analyze_dir))
Example #13
  def _create_tabular_test_data(self):
    """Create tabular model with text."""

    test_data = """1,5.0,monday,word1 word2 word3,true
    2,3.2,tuesday,word1 word3,true
    3,-1.1,friday,word1,false"""
    train_csv = os.path.join(self._test_dir, 'train.csv')
    with open(train_csv, 'w') as f:
      f.write(test_data)

    df = pd.read_csv(train_csv, names=['key', 'num', 'weekday', 'garbage', 'target'])
    analyze_dir = os.path.join(self._test_dir, 'analysistab')
    train_dir = os.path.join(self._test_dir, 'traintab')

    mlmagic.ml(
        line='dataset create',
        cell="""\
            format: csv
            name: mytabular
            schema:
                - name: key
                  type: INTEGER
                - name: num
                  type: FLOAT
                - name: weekday
                  type: STRING
                - name: garbage
                  type: STRING
                - name: target
                  type: STRING
            train: %s
            eval: %s""" % (train_csv, train_csv))

    mlmagic.ml(
        line='analyze',
        cell="""\
            output: %s
            data: mytabular
            features:
              key:
                transform: key
              num:
                transform: scale
              weekday:
                transform: one_hot
              garbage:
                transform: bag_of_words
              target:
                transform: target""" % (analyze_dir))

    mlmagic.ml(
        line='train',
        cell="""\
            output: %s
            analysis: %s
            data: mytabular
            notb: true
            model_args:
              model: linear_classification
              top-n: 0
              max-steps: 300""" % (train_dir, analyze_dir))
    return df
Example #14
  def _create_image_test_data(self):
    image_path1 = os.path.join(self._test_dir, 'img1.jpg')
    image_path2 = os.path.join(self._test_dir, 'img2.jpg')
    image_path3 = os.path.join(self._test_dir, 'img3.jpg')
    Image.new('RGB', size=(128, 128), color=(155, 211, 64)).save(image_path1, "JPEG")
    Image.new('RGB', size=(64, 64), color=(111, 21, 86)).save(image_path2, "JPEG")
    Image.new('RGB', size=(16, 16), color=(255, 21, 1)).save(image_path3, "JPEG")
    test_data = """1,1.2,word1 word2,%s,true
2,3.2,word2 word3,%s,false
5,-2.1,word3 word4,%s,true""" % (image_path1, image_path2, image_path3)

    train_csv = os.path.join(self._test_dir, 'train.csv')
    with open(train_csv, 'w') as f:
      f.write(test_data)

    analyze_dir = os.path.join(self._test_dir, 'analysisimg')
    transform_dir = os.path.join(self._test_dir, 'transformimg')
    train_dir = os.path.join(self._test_dir, 'trainimg')

    # Download the Inception checkpoint. Note that a gs:// URL doesn't work
    # because we may not be signed in to gcloud when running the test.
    url = ('https://storage.googleapis.com/cloud-ml-data/img/' +
           'flower_photos/inception_v3_2016_08_28.ckpt')
    checkpoint_path = os.path.join(self._test_dir, "checkpoint")
    response = urlopen(url)
    with open(checkpoint_path, 'wb') as f:
      f.write(response.read())

    mlmagic.ml(
        line='dataset create',
        cell="""\
            format: csv
            name: myds
            schema:
              - name: key
                type: INTEGER
              - name: num
                type: FLOAT
              - name: text
                type: STRING
              - name: img_url
                type: STRING
              - name: target
                type: STRING
            train: %s
            eval: %s""" % (train_csv, train_csv))

    mlmagic.ml(
        line='analyze',
        cell="""\
            output: %s
            data: myds
            features:
              key:
                transform: key
              num:
                transform: scale
              text:
                transform: bag_of_words
              img_url:
                transform: image_to_vec
                checkpoint: %s
              target:
                transform: target""" % (analyze_dir, checkpoint_path))

    mlmagic.ml(
        line='transform',
        cell="""\
            output: %s
            analysis: %s
            data: myds""" % (transform_dir, analyze_dir))

    mlmagic.ml(
        line='dataset create',
        cell="""\
            format: transformed
            name: transformed_ds
            train: %s/train-*
            eval: %s/eval-*""" % (transform_dir, transform_dir))

    mlmagic.ml(
        line='train',
        cell="""\
            output: %s
            analysis: %s
            data: transformed_ds
            notb: true
            model_args:
              model: linear_classification
              top-n: 0
              max-steps: 200""" % (train_dir, analyze_dir))
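
For reference, the snippets above assume roughly the following module-level setup. The exact import paths (in particular the one bound to the name mlmagic) are assumptions and vary across releases, so treat this as a sketch rather than the original file's header.

import os
from unittest import mock             # the original tests may use the standalone `mock` package
from urllib.request import urlopen    # the original may route this through six.moves for Python 2

import pandas as pd
from PIL import Image                 # used only by the image test-data helpers

# The ML Workbench cell magic under test; this import path is an assumption.
import google.datalab.contrib.mlworkbench.commands._ml as mlmagic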