Example No. 1
def local_analysis(args):
  if args.analysis:
    # Already analyzed.
    return

  if not args.schema or not args.features:
    raise ValueError('Either --analysis, or both --schema and --features must be provided.')

  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_spec = tf_config.get('cluster', {})
  if len(cluster_spec.get('worker', [])) > 0:
    raise ValueError('If "schema" and "features" are provided, local analysis will run and ' +
                     'only BASIC scale-tier (no workers node) is supported.')

  if cluster_spec and not (args.schema.startswith('gs://') and args.features.startswith('gs://')):
    raise ValueError('Cloud trainer requires GCS paths for --schema and --features.')

  print('Running analysis.')
  schema = json.loads(file_io.read_file_to_string(args.schema).decode())
  features = json.loads(file_io.read_file_to_string(args.features).decode())
  args.analysis = os.path.join(args.job_dir, 'analysis')
  args.transform = True
  file_io.recursive_create_dir(args.analysis)
  feature_analysis.run_local_analysis(args.analysis, args.train, schema, features)
  print('Analysis done.')
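For context, a minimal sketch (with hypothetical host names) of the TF_CONFIG value that the worker check above parses: a BASIC scale-tier job has no 'worker' entries, so cluster_spec.get('worker', []) is empty and local analysis is allowed to run.

import json
import os

# Hypothetical single-node (BASIC scale tier) TF_CONFIG: there is no 'worker' key,
# so the len(cluster_spec.get('worker', [])) > 0 check above does not raise.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {'master': ['master-host:2222']},
    'task': {'type': 'master', 'index': 0},
})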
Example No. 2
  def test_categorical(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      csv_file = ['red,apple', 'red,pepper', 'red,apple', 'blue,grape',
                  'blue,apple', 'green,pepper']
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

      schema = [{'name': 'color', 'type': 'STRING'},
                {'name': 'type', 'type': 'STRING'}]
      features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                  'type': {'transform': 'target'}}
      feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)

      # Color column.
      vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['color', 'count'])
      expected_vocab = pd.DataFrame(
          {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
          columns=['color', 'count'])
      pd.util.testing.assert_frame_equal(vocab, expected_vocab)

    finally:
      shutil.rmtree(output_folder)
Example No. 3
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.schema:
    schema = json.loads(
        file_io.read_file_to_string(args.schema).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features).decode())

  file_io.recursive_create_dir(args.output)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        bigquery_table=args.bigquery,
        schema=schema,
        features=features)
  else:
    feature_analysis.run_local_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        schema=schema,
        features=features)
Example No. 4
  def test_text(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      csv_file = ['the quick brown fox,raining in kir,cat1|cat2,true',
                  'quick   brown brown chicken,raining in pdx,cat2|cat3|cat4,false']
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

      schema = [{'name': 'col1', 'type': 'STRING'},
                {'name': 'col2', 'type': 'STRING'},
                {'name': 'col3', 'type': 'STRING'},
                {'name': 'col4', 'type': 'STRING'}]
      features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                  'col2': {'transform': 'tfidf', 'source_column': 'col2'},
                  'col3': {'transform': 'multi_hot', 'source_column': 'col3', 'separator': '|'},
                  'col4': {'transform': 'target'}}
      feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
      self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)
      self.assertEqual(stats['column_stats']['col3']['vocab_size'], 4)

      vocab_str = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['col1', 'count'])

      # vocabs are sorted by count only
      col1_vocab = vocab['col1'].tolist()
      self.assertItemsEqual(col1_vocab[:2], ['brown', 'quick'])
      self.assertItemsEqual(col1_vocab[2:], ['chicken', 'fox', 'the'])
      self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

      vocab_str = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['col2', 'count'])

      # vocabs are sorted by count only
      col2_vocab = vocab['col2'].tolist()
      self.assertItemsEqual(col2_vocab[:2], ['in', 'raining'])
      self.assertItemsEqual(col2_vocab[2:], ['kir', 'pdx'])
      self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
    finally:
      shutil.rmtree(output_folder)
Example No. 5
 def testAtomicWriteStringToFileOverwriteFalse(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.atomic_write_string_to_file(file_path, "old", overwrite=False)
   with self.assertRaises(errors.AlreadyExistsError):
     file_io.atomic_write_string_to_file(file_path, "new", overwrite=False)
   file_contents = file_io.read_file_to_string(file_path)
   self.assertEqual("old", file_contents)
   file_io.delete_file(file_path)
   file_io.atomic_write_string_to_file(file_path, "new", overwrite=False)
   file_contents = file_io.read_file_to_string(file_path)
   self.assertEqual("new", file_contents)
Example No. 6
  def __init__(self, *args, **kwargs):
    super(ApiCompatibilityTest, self).__init__(*args, **kwargs)

    golden_update_warning_filename = os.path.join(
        resource_loader.get_root_dir_with_all_resources(), _UPDATE_WARNING_FILE)
    self._update_golden_warning = file_io.read_file_to_string(
        golden_update_warning_filename)

    test_readme_filename = os.path.join(
        resource_loader.get_root_dir_with_all_resources(), _TEST_README_FILE)
    self._test_readme_message = file_io.read_file_to_string(
        test_readme_filename)
Example No. 7
  def test_text(self):
    test_folder = os.path.join(self._bucket_root, 'test_text')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    csv_file = ['the quick brown fox,raining in kir,cat1|cat2,true',
                'quick   brown brown chicken,raining in pdx,cat2|cat3|cat4,false']
    file_io.write_string_to_file(
      input_file_path,
      '\n'.join(csv_file))

    schema = [{'name': 'col1', 'type': 'STRING'},
              {'name': 'col2', 'type': 'STRING'},
              {'name': 'col3', 'type': 'STRING'},
              {'name': 'col4', 'type': 'STRING'}]
    features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                'col2': {'transform': 'tfidf', 'source_column': 'col2'},
                'col3': {'transform': 'multi_hot', 'source_column': 'col3', 'separator': '|'},
                'col4': {'transform': 'target'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        features=features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
    self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)
    self.assertEqual(stats['column_stats']['col3']['vocab_size'], 4)

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col1', 'count'])
    self.assertEqual(vocab['col1'].tolist(),
                     ['brown', 'quick', 'chicken', 'fox', 'the', ])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col2', 'count'])
    self.assertEqual(vocab['col2'].tolist(), ['in', 'raining', 'kir', 'pdx'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
Example No. 8
  def test_categorical(self):
    test_folder = os.path.join(self._bucket_root, 'test_categorical')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    csv_file = ['red,car,apple', 'red,truck,pepper', 'red,van,apple', 'blue,bike,grape',
                'blue,train,apple', 'green,airplane,pepper']
    file_io.write_string_to_file(
      input_file_path,
      '\n'.join(csv_file))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'},
              {'name': 'type', 'type': 'STRING'}]
    features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                'transport': {'transform': 'embedding', 'source_column': 'transport'},
                'type': {'transform': 'target'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        features=features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
    self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

    # Color column.
    vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['color', 'count'])
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(vocab, expected_vocab)

    # transport column.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['transport', 'count'])
    self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
    self.assertEqual(vocab['transport'].tolist(),
                     ['airplane', 'bike', 'car', 'train', 'truck', 'van'])
Example No. 9
def get_model_schema_and_features(model_dir):
  """Get a local model's schema and features config.

  Args:
    model_dir: local or GCS path of a model.
  Returns:
    A tuple of schema (list) and features config (dict).
  """
  schema_file = os.path.join(model_dir, 'assets.extra', 'schema.json')
  schema = json.loads(file_io.read_file_to_string(schema_file))
  features_file = os.path.join(model_dir, 'assets.extra', 'features.json')
  features_config = json.loads(file_io.read_file_to_string(features_file))
  return schema, features_config
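A minimal usage sketch for the helper above, assuming a hypothetical model directory that was exported with assets.extra/schema.json and assets.extra/features.json:

# Hypothetical model directory; schema is a list of column dicts and
# features_config is a dict keyed by feature name.
schema, features_config = get_model_schema_and_features('gs://my-bucket/models/census/model')
print([col['name'] for col in schema])
print(sorted(features_config.keys()))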
Example No. 10
  def test_categorical(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      csv_file = ['red,car,apple', 'red,truck,pepper', 'red,van,apple', 'blue,bike,grape',
                  'blue,train,apple', 'green,airplane,pepper']
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

      schema = [{'name': 'color', 'type': 'STRING'},
                {'name': 'transport', 'type': 'STRING'},
                {'name': 'type', 'type': 'STRING'}]
      features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                  'transport': {'transform': 'embedding', 'source_column': 'transport'},
                  'type': {'transform': 'target'}}
      feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
      self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

      # Color column.
      vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['color', 'count'])
      expected_vocab = pd.DataFrame(
          {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
          columns=['color', 'count'])
      pd.util.testing.assert_frame_equal(vocab, expected_vocab)

      # transport column. As each vocab has the same count, order in file is
      # not known.
      vocab_str = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['transport', 'count'])
      self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
      self.assertItemsEqual(vocab['transport'].tolist(),
                            ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
    finally:
      shutil.rmtree(output_folder)
Example No. 11
def load_model(saved_model_path):
  """Load a keras.Model from SavedModel.

  load_model reinstantiates model state by:
  1) loading model topology from json (this will eventually come
     from metagraph).
  2) loading model weights from checkpoint.

  Args:
    saved_model_path: a string specifying the path to an existing SavedModel.

  Returns:
    a keras.Model instance.
  """
  # restore model topology from json string
  model_json_filepath = os.path.join(
      compat.as_bytes(saved_model_path),
      compat.as_bytes(constants.ASSETS_DIRECTORY),
      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
  model_json = file_io.read_file_to_string(model_json_filepath)
  model = model_from_json(model_json)

  # restore model weights
  checkpoint_prefix = os.path.join(
      compat.as_text(saved_model_path),
      compat.as_text(constants.VARIABLES_DIRECTORY),
      compat.as_text(constants.VARIABLES_FILENAME))
  model.load_weights(checkpoint_prefix)
  return model
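A short usage sketch, assuming a SavedModel directory previously written by the matching Keras save/export utility (the path is hypothetical):

# Hypothetical path to an existing Keras SavedModel.
model = load_model('/tmp/my_keras_saved_model')
model.summary()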
Example No. 12
def run_analysis(args):
  """Builds an analysis file for training.

  Uses BigQuery tables to do the analysis.

  Args:
    args: command line args

  Raises:
    ValueError: If the schema contains unsupported column types.
  """
  import google.datalab.bigquery as bq
  if args.bigquery_table:
    table = bq.Table(args.bigquery_table)
    schema_list = table.schema._bq_schema
  else:
    schema_list = json.loads(
        file_io.read_file_to_string(args.schema_file).decode())
    table = bq.ExternalDataSource(
        source=args.input_file_pattern,
        schema=bq.Schema(schema_list))

  # Check the schema is supported.
  for col_schema in schema_list:
    col_type = col_schema['type'].lower()
    if col_type != 'string' and col_type != 'integer' and col_type != 'float':
      raise ValueError('Schema contains an unsupported type %s.' % col_type)

  run_numerical_analysis(table, schema_list, args)
  run_categorical_analysis(table, schema_list, args)

  # Save a copy of the schema to the output location.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, SCHEMA_FILE),
      json.dumps(schema_list, indent=2, separators=(',', ': ')))
Example No. 13
 def testMultipleWrites(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   with file_io.FileIO(file_path, mode="w") as f:
     f.write("line1\n")
     f.write("line2")
   file_contents = file_io.read_file_to_string(file_path)
   self.assertEqual("line1\nline2", file_contents)
Example No. 14
  def _read_latest_config_files(self, run_path_pairs):
    """Reads and returns the projector config files in every run directory."""
    configs = {}
    config_fpaths = {}
    for run_name, assets_dir in run_path_pairs:
      config = projector_config_pb2.ProjectorConfig()
      config_fpath = os.path.join(assets_dir, PROJECTOR_FILENAME)
      if file_io.file_exists(config_fpath):
        file_content = file_io.read_file_to_string(config_fpath)
        text_format.Merge(file_content, config)
      has_tensor_files = False
      for embedding in config.embeddings:
        if embedding.tensor_path:
          has_tensor_files = True
          break

      if not config.model_checkpoint_path:
        # See if you can find a checkpoint file in the logdir.
        logdir = _assets_dir_to_logdir(assets_dir)
        ckpt_path = _find_latest_checkpoint(logdir)
        if not ckpt_path and not has_tensor_files:
          continue
        if ckpt_path:
          config.model_checkpoint_path = ckpt_path

      # Sanity check for the checkpoint file.
      if (config.model_checkpoint_path and
          not checkpoint_exists(config.model_checkpoint_path)):
        logging.warning('Checkpoint file "%s" not found',
                        config.model_checkpoint_path)
        continue
      configs[run_name] = config
      config_fpaths[run_name] = config_fpath
    return configs, config_fpaths
Example No. 15
 def testCopy(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.FileIO(file_path, mode="w").write("testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.copy(file_path, copy_path)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
Example No. 16
  def test_numerics(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(['%s,%s,%s' % (i, 10 * i + 0.5, i + 0.5) for i in range(100)]))

      schema = [{'name': 'col1', 'type': 'INTEGER'},
                {'name': 'col2', 'type': 'FLOAT'},
                {'name': 'col3', 'type': 'FLOAT'}]
      features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                  'col2': {'transform': 'identity', 'source_column': 'col2'},
                  'col3': {'transform': 'target'}}
      feature_analysis.run_local_analysis(
          output_folder, [input_file_path], schema, features)

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

      self.assertEqual(stats['num_examples'], 100)
      col = stats['column_stats']['col1']
      self.assertAlmostEqual(col['max'], 99.0)
      self.assertAlmostEqual(col['min'], 0.0)
      self.assertAlmostEqual(col['mean'], 49.5)

      col = stats['column_stats']['col2']
      self.assertAlmostEqual(col['max'], 990.5)
      self.assertAlmostEqual(col['min'], 0.5)
      self.assertAlmostEqual(col['mean'], 495.5)
    finally:
      shutil.rmtree(output_folder)
Example No. 17
  def testUpdateCheckpointStateSaveRelativePaths(self):
    save_dir = self._get_test_dir("update_checkpoint_state")
    os.chdir(save_dir)
    abs_path2 = os.path.join(save_dir, "model-2")
    rel_path2 = "model-2"
    abs_path0 = os.path.join(save_dir, "model-0")
    rel_path0 = "model-0"
    checkpoint_management.update_checkpoint_state_internal(
        save_dir=save_dir,
        model_checkpoint_path=abs_path2,
        all_model_checkpoint_paths=[rel_path0, abs_path2],
        save_relative_paths=True)

    # File should contain relative paths.
    file_content = file_io.read_file_to_string(
        os.path.join(save_dir, "checkpoint"))
    ckpt = CheckpointState()
    text_format.Merge(file_content, ckpt)
    self.assertEqual(ckpt.model_checkpoint_path, rel_path2)
    self.assertEqual(len(ckpt.all_model_checkpoint_paths), 2)
    self.assertEqual(ckpt.all_model_checkpoint_paths[-1], rel_path2)
    self.assertEqual(ckpt.all_model_checkpoint_paths[0], rel_path0)

    # get_checkpoint_state should return absolute paths.
    ckpt = checkpoint_management.get_checkpoint_state(save_dir)
    self.assertEqual(ckpt.model_checkpoint_path, abs_path2)
    self.assertEqual(len(ckpt.all_model_checkpoint_paths), 2)
    self.assertEqual(ckpt.all_model_checkpoint_paths[-1], abs_path2)
    self.assertEqual(ckpt.all_model_checkpoint_paths[0], abs_path0)
Example No. 18
  def test_numerics(self):
    test_folder = os.path.join(self._bucket_root, 'test_numerics')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    file_io.write_string_to_file(
      input_file_path,
      '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        inverted_features=analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
Example No. 19
def _read_file(filename):
  """Reads a file containing `GraphDef` and returns the protocol buffer.

  Args:
    filename: `graph_def` filename including the path.

  Returns:
    A `GraphDef` protocol buffer.

  Raises:
    IOError: If the file doesn't exist, or cannot be successfully parsed.
  """
  graph_def = graph_pb2.GraphDef()
  if not file_io.file_exists(filename):
    raise IOError("File %s does not exist." % filename)
  # First try to read it as a binary file.
  file_content = file_io.read_file_to_string(filename)
  try:
    graph_def.ParseFromString(file_content)
    return graph_def
  except Exception:  # pylint: disable=broad-except
    pass

  # Next try to read it as a text file.
  try:
    text_format.Merge(file_content.decode("utf-8"), graph_def)
  except text_format.ParseError as e:
    raise IOError("Cannot parse file %s: %s." % (filename, str(e)))

  return graph_def
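A usage sketch for _read_file above, assuming a hypothetical GraphDef file that may be in either binary or text format:

# Hypothetical frozen graph path; _read_file tries binary parsing first, then text.
graph_def = _read_file('/tmp/frozen_graph.pb')
print('Loaded %d nodes.' % len(graph_def.node))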
Example No. 20
  def _GetBaseApiMap(self):
    """Get a map from graph op name to its base ApiDef.

    Returns:
      Dictionary mapping graph op name to corresponding ApiDef.
    """
    # Convert base ApiDef in Multiline format to Proto format.
    converted_base_api_dir = os.path.join(
        test.get_temp_dir(), 'temp_base_api_defs')
    subprocess.check_call(
        [os.path.join(resource_loader.get_root_dir_with_all_resources(),
                      _CONVERT_FROM_MULTILINE_SCRIPT),
         _BASE_API_DIR, converted_base_api_dir])

    name_to_base_api_def = {}
    base_api_files = file_io.get_matching_files(
        os.path.join(converted_base_api_dir, 'api_def_*.pbtxt'))
    for base_api_file in base_api_files:
      if file_io.file_exists(base_api_file):
        api_defs = api_def_pb2.ApiDefs()
        text_format.Merge(
            file_io.read_file_to_string(base_api_file), api_defs)
        for api_def in api_defs.op:
          name_to_base_api_def[api_def.graph_op_name] = api_def
    return name_to_base_api_def
Example No. 21
 def testFileWrite(self):
   file_path = os.path.join(self.get_temp_dir(), "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   self.assertTrue(file_io.file_exists(file_path))
   file_contents = file_io.read_file_to_string(file_path)
   self.assertEqual(b"testing", file_contents)
   file_io.delete_file(file_path)
Example No. 22
  def _read_config_files(self, run_paths):
    configs = {}
    config_fpaths = {}
    for run_name, logdir in run_paths.items():
      config_fpath = os.path.join(logdir, PROJECTOR_FILENAME)
      if not file_io.file_exists(config_fpath):
        # Skip runs that have no config file.
        continue
      # Read the config file.
      file_content = file_io.read_file_to_string(config_fpath).decode('utf-8')
      config = ProjectorConfig()
      text_format.Merge(file_content, config)

      if not config.model_checkpoint_path:
        # See if you can find a checkpoint file in the logdir.
        ckpt_path = latest_checkpoint(logdir)
        if not ckpt_path:
          # Or in the parent of logdir.
          ckpt_path = latest_checkpoint(os.path.join('../', logdir))
          if not ckpt_path:
            logging.warning('Cannot find model checkpoint in %s', logdir)
            continue
        config.model_checkpoint_path = ckpt_path

      # Sanity check for the checkpoint file.
      if not file_io.file_exists(config.model_checkpoint_path):
        logging.warning('Checkpoint file %s not found',
                        config.model_checkpoint_path)
        continue
      configs[run_name] = config
      config_fpaths[run_name] = config_fpath
    return configs, config_fpaths
Example No. 23
 def testCopyOverwrite(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.write_string_to_file(copy_path, "copy")
   file_io.copy(file_path, copy_path, overwrite=True)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
Example No. 24
 def testCopy(self):
   file_path = os.path.join(self.get_temp_dir(), "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   copy_path = os.path.join(self.get_temp_dir(), "copy_file")
   file_io.copy(file_path, copy_path)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
   file_io.delete_file(file_path)
   file_io.delete_file(copy_path)
Example No. 25
def _parse_saved_model(export_dir):
  """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`.

  Args:
    export_dir: Directory containing the SavedModel file.

  Returns:
    A `SavedModel` protocol buffer.

  Raises:
    IOError: If the file does not exist, or cannot be successfully parsed.
  """
  # Build the path to the SavedModel in pbtxt format.
  path_to_pbtxt = os.path.join(
      compat.as_bytes(export_dir),
      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT))
  # Build the path to the SavedModel in pb format.
  path_to_pb = os.path.join(
      compat.as_bytes(export_dir),
      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))

  # Ensure that the SavedModel exists at either path.
  if not file_io.file_exists(path_to_pbtxt) and not file_io.file_exists(
      path_to_pb):
    raise IOError("SavedModel file does not exist at: %s" % export_dir)

  saved_model = saved_model_pb2.SavedModel()

  # Parse the SavedModel protocol buffer.
  try:
    file_content = file_io.read_file_to_string(path_to_pb)
    saved_model.ParseFromString(file_content)
    return saved_model
  except Exception:  # pylint: disable=broad-except
    # Pass for exceptions in order to try reading the file in text format.
    pass

  try:
    file_content = file_io.read_file_to_string(path_to_pbtxt)
    text_format.Merge(file_content.decode("utf-8"), saved_model)
  except text_format.ParseError as e:
    raise IOError("Cannot parse file %s: %s." % (path_to_pbtxt, str(e)))
  return saved_model
Example No. 26
def get_checkpoint_state(checkpoint_dir, latest_filename=None):
  """Returns CheckpointState proto from the "checkpoint" file.

  If the "checkpoint" file contains a valid CheckpointState
  proto, returns it.

  Args:
    checkpoint_dir: The directory of checkpoints.
    latest_filename: Optional name of the checkpoint file.  Default to
      'checkpoint'.

  Returns:
    A CheckpointState if the state was available, None
    otherwise.

  Raises:
    ValueError: if the checkpoint read doesn't have model_checkpoint_path set.
  """
  ckpt = None
  coord_checkpoint_filename = _GetCheckpointFilename(checkpoint_dir,
                                                     latest_filename)
  f = None
  try:
    # Check that the file exists before opening it to avoid
    # many lines of errors from colossus in the logs.
    if file_io.file_exists(coord_checkpoint_filename):
      file_content = file_io.read_file_to_string(
          coord_checkpoint_filename)
      ckpt = CheckpointState()
      text_format.Merge(file_content, ckpt)
      if not ckpt.model_checkpoint_path:
        raise ValueError("Invalid checkpoint state loaded from "
                         + checkpoint_dir)
      # For relative model_checkpoint_path and all_model_checkpoint_paths,
      # prepend checkpoint_dir.
      if not os.path.isabs(ckpt.model_checkpoint_path):
        ckpt.model_checkpoint_path = os.path.join(checkpoint_dir,
                                                  ckpt.model_checkpoint_path)
      for i in range(len(ckpt.all_model_checkpoint_paths)):
        p = ckpt.all_model_checkpoint_paths[i]
        if not os.path.isabs(p):
          ckpt.all_model_checkpoint_paths[i] = os.path.join(checkpoint_dir, p)
  except errors.OpError as e:
    # It's ok if the file cannot be read
    logging.warning("%s: %s", type(e).__name__, e)
    logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
    return None
  except text_format.ParseError as e:
    logging.warning("%s: %s", type(e).__name__, e)
    logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
    return None
  finally:
    if f:
      f.close()
  return ckpt
Example No. 27
  def testAssets(self):
    export_dir = os.path.join(
        compat.as_bytes(tf.test.get_temp_dir()), compat.as_bytes("with-assets"))
    builder = saved_model_builder.SavedModelBuilder(export_dir)

    with self.test_session(graph=tf.Graph()) as sess:
      v = tf.Variable(42, name="v")
      sess.run(tf.initialize_all_variables())
      self.assertEqual(42, v.eval())

      # Build an asset collection.
      asset_filepath = os.path.join(
          compat.as_bytes(tf.test.get_temp_dir()),
          compat.as_bytes("hello42.txt"))
      file_io.write_string_to_file(asset_filepath, "foo bar baz")
      asset_file_tensor = tf.constant(asset_filepath, name="asset_file_tensor")
      tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_file_tensor)

      ignored_filepath = os.path.join(
          compat.as_bytes(tf.test.get_temp_dir()),
          compat.as_bytes("ignored.txt"))
      file_io.write_string_to_file(ignored_filepath, "will be ignored")

      asset_collection = tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)

      builder.add_meta_graph_and_variables(
          sess, ["foo"], assets_collection=asset_collection)

    # Save the SavedModel to disk.
    builder.save()

    with self.test_session(graph=tf.Graph()) as sess:
      foo_graph = loader.load(sess, ["foo"], export_dir)

      # Validate the assets.
      collection_def = foo_graph.collection_def
      assets_any = collection_def[constants.ASSETS_KEY].any_list.value
      self.assertEqual(len(assets_any), 1)
      asset = manifest_pb2.AssetFile()
      assets_any[0].Unpack(asset)
      assets_path = os.path.join(
          compat.as_bytes(export_dir),
          compat.as_bytes(constants.ASSETS_DIRECTORY),
          compat.as_bytes("hello42.txt"))
      asset_contents = file_io.read_file_to_string(assets_path)
      self.assertEqual("foo bar baz", compat.as_text(asset_contents))
      self.assertEqual("hello42.txt", asset.filename)
      self.assertEqual("asset_file_tensor:0", asset.tensor_binding.tensor_name)
      ignored_asset_path = os.path.join(
          compat.as_bytes(export_dir),
          compat.as_bytes(constants.ASSETS_DIRECTORY),
          compat.as_bytes("ignored.txt"))
      self.assertFalse(file_io.file_exists(ignored_asset_path))
Example No. 28
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.schema:
    schema = json.loads(
        file_io.read_file_to_string(args.schema).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features).decode())

  expand_defaults(schema, features)  # features are updated.
  inverted_features = invert_features(features)
  check_schema_transforms_match(schema, inverted_features)

  file_io.recursive_create_dir(args.output)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        bigquery_table=args.bigquery,
        schema=schema,
        inverted_features=inverted_features)
  else:
    run_local_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        schema=schema,
        inverted_features=inverted_features)

  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
    os.path.join(args.output, constant.SCHEMA_FILE),
    json.dumps(schema, indent=2))

  file_io.write_string_to_file(
    os.path.join(args.output, constant.FEATURES_FILE),
    json.dumps(features, indent=2))
Example No. 29
def read_schema(path):
  """Reads a schema from the provided location.

  Args:
    path: The location of the file holding a serialized Schema proto.

  Returns:
    An instance of Schema or None if the input argument is None
  """
  result = schema_pb2.Schema()
  contents = file_io.read_file_to_string(path)
  text_format.Parse(contents, result)
  return result
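A usage sketch for read_schema, assuming a hypothetical text-format Schema proto file (for example, one produced by TensorFlow Data Validation):

# Hypothetical schema path; the file must contain a text-format schema_pb2.Schema.
schema = read_schema('/tmp/schema.pbtxt')
for feature in schema.feature:
  print(feature.name, feature.type)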
Example No. 30
def load_schema_text(input_path: bytes) -> schema_pb2.Schema:
  """Loads the schema stored in text format in the input path.

  Args:
    input_path: File path to load the schema from.

  Returns:
    A Schema protocol buffer.
  """
  schema = schema_pb2.Schema()
  schema_text = file_io.read_file_to_string(input_path)
  text_format.Parse(schema_text, schema)
  return schema
Example No. 31
File: taxi.py  Project: zwcdp/tfx
def read_schema(path):
    """Reads a schema from the provided location.

  Args:
    path: The location of the file holding a serialized Schema proto.

  Returns:
    An instance of Schema or None if the input argument is None
  """
    result = schema_pb2.Schema()
    contents = file_io.read_file_to_string(path)
    text_format.Parse(contents, result)
    return result
Example No. 32
    def testGeneratedFileMatchesHead(self):
        expected_contents = gradient_input_output_exclusions.get_contents()
        filename = os.path.join(
            resource_loader.get_root_dir_with_all_resources(),
            resource_loader.get_path_to_datafile(
                "pywrap_gradient_exclusions.cc"))
        actual_contents = file_io.read_file_to_string(filename)
        self.assertEqual(
            actual_contents, expected_contents, """
pywrap_gradient_exclusions.cc needs to be updated.
Please regenerate using:
bazel run tensorflow/python/eager:gradient_input_output_exclusions -- $PWD/tensorflow/python/eager/pywrap_gradient_exclusions.cc"""
        )
Example No. 33
def get_data(name, dir):
    if name not in ['train', 'test', 'unlabeled']:
        raise ValueError('{} is not in the dataset!'.format(name))

    data = np.load(
        BytesIO(file_io.read_file_to_string('{}/{}.npz'.format(dir, name))))
    seqs = data['data']
    labels = None

    if name != 'unlabeled':
        labels = data['label']

    return seqs, labels
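A usage sketch with a hypothetical GCS prefix; the directory is expected to contain train.npz, test.npz, and unlabeled.npz. (On Python 3, np.load would need the raw bytes, e.g. read_file_to_string(..., binary_mode=True).)

# Hypothetical data location readable by file_io (local path or GCS prefix).
train_seqs, train_labels = get_data('train', 'gs://my-bucket/stl10')
unlabeled_seqs, _ = get_data('unlabeled', 'gs://my-bucket/stl10')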
Example No. 34
def load_batch(fpath):
    object = file_io.read_file_to_string(fpath)
    #origin_bytes = bytes(object, encoding='latin1')
    # with open(fpath, 'rb') as f:
    if sys.version_info > (3, 0):
        # Python3
        d = pickle.loads(object, encoding='latin1')
    else:
        # Python2
        d = pickle.loads(object)
    data = d["data"]
    labels = d["labels"]
    return data, labels
Example No. 35
 def _fetch_embedding(self, emb_filepath):
     try:
         embedding = np.frombuffer(
             file_io.read_file_to_string(emb_filepath), dtype=np.float32)
         embedding = embedding.reshape(self.SHAPE)
     except ValueError as e:
         logging.warn('Could not load an embedding file from %s: %s',
                      emb_filepath, str(e))
         error_count.inc()
         if e.message.startswith('cannot reshape array of size 0 into'):
             file_io.delete_file(emb_filepath)
             return
         raise e
Example No. 36
def run_analysis(args):
  """Builds an analysis files for training."""

  # Read the schema and input feature types
  schema_list = json.loads(
      file_io.read_file_to_string(args.schema_file))

  run_numerical_categorical_analysis(args, schema_list)

  # Also save a copy of the schema in the output folder.
  file_io.copy(args.schema_file,
               os.path.join(args.output_dir, SCHEMA_FILE),
               overwrite=True)
Example No. 37
def read_schema(file_path):
    """Reads a schema file from specified location.

  Args:
    file_path: The location of the file holding a serialized Schema proto.

  Returns:
    An instance of Schema object.
  """
    result = schema_pb2.Schema()
    contents = file_io.read_file_to_string(file_path)
    text_format.Parse(contents, result)
    return result
Example No. 38
def get_checkpoint_state(checkpoint_dir, latest_filename=None):
    """Returns CheckpointState proto from the "checkpoint" file.
    If the "checkpoint" file contains a valid CheckpointState
    proto, returns it.
    Args:
    checkpoint_dir: The directory of checkpoints.
    latest_filename: Optional name of the checkpoint file.  Default to
      'checkpoint'.
    Returns:
    A CheckpointState if the state was available, None
    otherwise.
    Raises:
    ValueError: if the checkpoint read doesn't have model_checkpoint_path set.
    """
    ckpt = None
    coord_checkpoint_filename = _GetCheckpointFilename(checkpoint_dir,
                                                       latest_filename)
    f = None
    try:
        # Check that the file exists before opening it to avoid
        # many lines of errors from colossus in the logs.
        if file_io.file_exists(coord_checkpoint_filename):
            file_content = file_io.read_file_to_string(
                coord_checkpoint_filename)
            ckpt = CheckpointState()
            text_format.Merge(file_content, ckpt)
            if not ckpt.model_checkpoint_path:
                raise ValueError("Invalid checkpoint state loaded from " +
                                 checkpoint_dir)
            # For relative model_checkpoint_path and all_model_checkpoint_paths,
            # prepend checkpoint_dir.
            if not os.path.isabs(ckpt.model_checkpoint_path):
                ckpt.model_checkpoint_path = os.path.join(
                    checkpoint_dir, ckpt.model_checkpoint_path)
            for i, p in enumerate(ckpt.all_model_checkpoint_paths):
                if not os.path.isabs(p):
                    ckpt.all_model_checkpoint_paths[i] = os.path.join(
                        checkpoint_dir, p)
    except errors.OpError as e:
        # It's ok if the file cannot be read
        logging.warning("%s: %s", type(e).__name__, e)
        logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
        return None
    except text_format.ParseError as e:
        logging.warning("%s: %s", type(e).__name__, e)
        logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
        return None
    finally:
        if f:
            f.close()
    return ckpt
Example No. 39
def load_from_saved_model(saved_model_path, custom_objects=None):
    """Loads a keras Model from a SavedModel created by `export_saved_model()`.

  This function reinstantiates model state by:
  1) loading model topology from json (this will eventually come
     from metagraph).
  2) loading model weights from checkpoint.

  Example:

  ```python
  import tensorflow as tf

  # Create a tf.keras model.
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
  model.summary()

  # Save the tf.keras model in the SavedModel format.
  path = '/tmp/simple_keras_model'
  tf.keras.experimental.export_saved_model(model, path)

  # Load the saved keras model back.
  new_model = tf.keras.experimental.load_from_saved_model(path)
  new_model.summary()
  ```

  Args:
    saved_model_path: a string specifying the path to an existing SavedModel.
    custom_objects: Optional dictionary mapping names
        (strings) to custom classes or functions to be
        considered during deserialization.

  Returns:
    a keras.Model instance.
  """
    # restore model topology from json string
    model_json_filepath = os.path.join(
        compat.as_bytes(saved_model_path),
        compat.as_bytes(constants.ASSETS_DIRECTORY),
        compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
    model_json = file_io.read_file_to_string(model_json_filepath)
    model = model_from_json(model_json, custom_objects=custom_objects)

    # restore model weights
    checkpoint_prefix = os.path.join(
        compat.as_text(saved_model_path),
        compat.as_text(constants.VARIABLES_DIRECTORY),
        compat.as_text(constants.VARIABLES_FILENAME))
    model.load_weights(checkpoint_prefix)
    return model
Example No. 40
def main(argv=None):
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.csv_schema_file:
        schema = json.loads(
            file_io.read_file_to_string(args.csv_schema_file).decode())
    else:
        import google.datalab.bigquery as bq
        schema = bq.Table(args.bigquery_table).schema._bq_schema
    features = json.loads(
        file_io.read_file_to_string(args.features_file).decode())

    expand_defaults(schema, features)  # features are updated.
    inverted_features = invert_features(features)
    check_schema_transforms_match(schema, inverted_features)

    file_io.recursive_create_dir(args.output_dir)

    if args.cloud:
        run_cloud_analysis(output_dir=args.output_dir,
                           csv_file_pattern=args.csv_file_pattern,
                           bigquery_table=args.bigquery_table,
                           schema=schema,
                           inverted_features=inverted_features)
    else:
        run_local_analysis(output_dir=args.output_dir,
                           csv_file_pattern=args.csv_file_pattern,
                           schema=schema,
                           inverted_features=inverted_features)

    # Save a copy of the schema and features in the output folder.
    file_io.write_string_to_file(
        os.path.join(args.output_dir, constant.SCHEMA_FILE),
        json.dumps(schema, indent=2))

    file_io.write_string_to_file(
        os.path.join(args.output_dir, constant.FEATURES_FILE),
        json.dumps(features, indent=2))
Example No. 41
def main(train_file,
         test_file,
         output_file,
         num_words,
         batch_size,
         epoch):

    num_features = 300
    model = word_level_cnn(num_words, num_features)

    print('=========== Loading word2vec ===========')
    
    wordModel = KeyedVectors.load_word2vec_format('../google_300_model.bin', binary=True)

    train_input = StringIO(file_io.read_file_to_string(train_file))
    train = read_data(train_input)
    
    X = encode_data_by_word(train['review'], num_words, num_features, wordModel, True)
    y = keras.utils.to_categorical(train['sentiment'])

    #model = word_level_lstm(num_words, num_features)
    X = np.reshape(X, [-1, num_words, 1, num_features])
    model.fit(X, y, batch_size=batch_size, epochs=epoch, shuffle=True)

    test_input = StringIO(file_io.read_file_to_string(test_file))
    test = read_data(test_input)
    test_X = encode_data_by_word(test['review'], num_words, num_features, wordModel, True)
    test_X = np.reshape(test_X, [-1, num_words, 1, num_features])

    print('========== do prediction ===============')
    pred = model.predict(test_X)
    pred = np.squeeze(pred)
    pred = np.argmax(pred, axis=1)

    output = pd.DataFrame(data={"id":test["id"], "sentiment":pred})
    result_file = file_io.FileIO(output_file, 'w')
    output.to_csv(result_file, index=False, quoting=3)
    result_file.close()
Example No. 42
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

        # load dictionary with top num_words to index
        tmp = StringIO(file_io.read_file_to_string(self.dict_file))
        self.dict_top_words_to_index = np.load(tmp).item()

        # create dictionary index to word
        self.dict_index_to_top_words = {v: k for k, v in self.dict_top_words_to_index.iteritems()}

        self.num_words = len(self.dict_top_words_to_index)

        tmp = StringIO(file_io.read_file_to_string(self.bias_file))
        self.word_bias_init = np.load(tmp)

        # load word embeddings
        self.tf_initialize_word_embeddings()
        print('word embeddings randomly initialized')

        # training set: initialize .tfrecords reader and tf batch variables
        print('initialize .tfrecords from: '+self.train_file)
        self.image_train, self.label_train, self.id_train = read_and_decode_example(
            self.train_file, self.max_sequence_len, self.num_epochs,
            )

        self.image_batch_train, self.label_batch_train = tf.train.shuffle_batch(
            [self.image_train, self.label_train],
            batch_size=self.batch_size,
            capacity=self.min_after_dequeue + 3 * self.batch_size,
            min_after_dequeue=self.min_after_dequeue,
            )

        # load pre-trained CNN weights
        tmp = StringIO(file_io.read_file_to_string(self.weights_file))
        self.weights_dict = np.load(tmp, encoding='latin1').item()

        # run training
        self.train()
Example No. 43
  def test_numerics(self):
    """Build a BQ table, and then call analyze on it."""
    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'},
              {'name': 'col3', 'type': 'FLOAT'}]
    project_id = dl.Context.default().project_id
    dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
    table_name = 'temp_table'
    full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

    output_folder = tempfile.mkdtemp()

    try:
      # Make a dataset, a table, and insert data.
      db = bq.Dataset((project_id, dataset_name))
      db.create()

      table = bq.Table(full_table_name)
      table.create(schema=bq.Schema(schema), overwrite=True)

      data = [{'col1': i, 'col2': 10 * i + 0.5, 'col3': i + 0.5} for i in range(100)]
      table.insert(data)

      features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                  'col2': {'transform': 'identity', 'source_column': 'col2'},
                  'col3': {'transform': 'target'}}
      analyze.run_cloud_analysis(
          output_dir=output_folder,
          csv_file_pattern=None,
          bigquery_table=full_table_name,
          schema=schema,
          features=features)

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

      self.assertEqual(stats['num_examples'], 100)
      col = stats['column_stats']['col1']
      self.assertAlmostEqual(col['max'], 99.0)
      self.assertAlmostEqual(col['min'], 0.0)
      self.assertAlmostEqual(col['mean'], 49.5)

      col = stats['column_stats']['col2']
      self.assertAlmostEqual(col['max'], 990.5)
      self.assertAlmostEqual(col['min'], 0.5)
      self.assertAlmostEqual(col['mean'], 495.5)
    finally:
      shutil.rmtree(output_folder)
      db.delete(delete_contents=True)
Example No. 44
def main(argv=None):
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.schema:
        schema = json.loads(file_io.read_file_to_string(args.schema).decode())
    else:
        import google.datalab.bigquery as bq
        schema = bq.Table(args.bigquery).schema._bq_schema
    features = json.loads(file_io.read_file_to_string(args.features).decode())

    file_io.recursive_create_dir(args.output)

    if args.cloud:
        run_cloud_analysis(output_dir=args.output,
                           csv_file_pattern=args.csv,
                           bigquery_table=args.bigquery,
                           schema=schema,
                           features=features)
    else:
        feature_analysis.run_local_analysis(output_dir=args.output,
                                            csv_file_pattern=args.csv,
                                            schema=schema,
                                            features=features)
Example No. 45
def main(job_dir, **args):
    # # local load
    # X = np.load('x_mood_dataset_samples_22k_off43_dur59049.npy')
    # y = np.load('y_mood_dataset_samples_22k_off43_dur59049.npy')

    # cloud load (PYTHON 2 ONLY - py2.7 runs on CloudML by default)
    from StringIO import StringIO
    f = StringIO(file_io.read_file_to_string(job_dir + 'x_mood_dataset_samples_22k_off43_dur59049.npy'))
    X = np.load(f)
    f1 = StringIO(file_io.read_file_to_string(job_dir + 'y_mood_dataset_samples_22k_off43_dur59049.npy'))
    y = np.load(f1)

    print('shape of training data: ', X.shape)
    print('shape of labels: ', y.shape)

    y = to_categorical(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
    with tf.device('/cpu:0'):
        model = build_model(input_shape=X_train[0].shape)

    model = tf.keras.utils.multi_gpu_model(model, gpus=NUM_GPUS)

    train(model, job_dir, X_train, y_train, X_test, y_test)
Example No. 46
def load_stats_text(input_path):
    """Loads the specified DatasetFeatureStatisticsList proto stored in text format.

  Args:
    input_path: File path from which to load the DatasetFeatureStatisticsList
      proto.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    stats_proto = statistics_pb2.DatasetFeatureStatisticsList()
    stats_text = file_io.read_file_to_string(input_path)
    text_format.Parse(stats_text, stats_proto)
    return stats_proto
Example No. 47
  def read(self, schema_path: Text) -> schema_pb2.Schema:
    """Gets a tf.metadata schema.

    Args:
      schema_path: Path to schema file.

    Returns:
      A tf.metadata schema.
    """

    result = schema_pb2.Schema()
    contents = file_io.read_file_to_string(schema_path)
    text_format.Parse(contents, result)
    return result
Example No. 48
def main(train_file,
         test_file,
         output_file,
         num_chars,
         batch_size,
         epoch):

    char_to_idx = char_idx_map()
    vocab_size = len(char_to_idx)

    cnn = char_level_cnn(num_chars, vocab_size)

    train_input = StringIO(file_io.read_file_to_string(train_file))
    train = read_data(train_input)

    test_input = StringIO(file_io.read_file_to_string(test_file))
    test = read_data(test_input)

    X = encode_data_by_char(train['review'], num_chars, char_to_idx, False, one_hot=True)
    X = np.reshape(X, [-1, num_chars, 1, vocab_size])
    y = keras.utils.to_categorical(train['sentiment'])
    
    cnn.fit(X, y, batch_size=batch_size, epochs=epoch, shuffle=True)
    
    test_X = encode_data_by_char(test['review'], num_chars, char_to_idx, False, one_hot=True)
    test_X = np.reshape(test_X, [-1, num_chars, 1, vocab_size])

    print('========== do prediction ===============')

    pred = cnn.predict(test_X)
    pred = np.squeeze(pred)
    pred = np.argmax(pred, axis=1)
    
    output = pd.DataFrame(data={"id":test["id"], "sentiment":pred})
    result_file = file_io.FileIO(output_file, 'w')
    output.to_csv(result_file, index=False, quoting=3)
    result_file.close()
Example No. 49
  def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
                 output_dict: Dict[Text, List[types.Artifact]],
                 exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
    # Load and deserialize custom config from execution properties.
    # Note that in the component interface the default serialization of custom
    # config is 'null' instead of '{}'. Therefore we need to default the
    # json_utils.loads to 'null' then populate it with an empty dict when
    # needed.
    custom_config = json_utils.loads(
        exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
    if not isinstance(custom_config, dict):
      raise ValueError('custom_config in execution properties needs to be a '
                       'dict. Got %s instead.' % type(custom_config))

    # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
    if input_dict.get(constants.BASE_MODEL_KEY):
      base_model = path_utils.serving_model_path(
          artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
    else:
      base_model = None

    if input_dict.get(constants.HYPERPARAMETERS_KEY):
      hyperparameters_file = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(
              input_dict[constants.HYPERPARAMETERS_KEY]))
      hyperparameters_config = json.loads(
          file_io.read_file_to_string(hyperparameters_file))
    else:
      hyperparameters_config = None

    output_path = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_KEY])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    model_run_dir = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_RUN_KEY])

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
    result.transform_output = result.transform_graph_path
    result.serving_model_dir = serving_model_dir
    result.eval_model_dir = eval_model_dir
    result.model_run_dir = model_run_dir
    result.schema_file = result.schema_path
    result.base_model = base_model
    result.hyperparameters = hyperparameters_config
    result.custom_config = custom_config
    return result
Example No. 50
    def test_numerics(self):
        test_folder = os.path.join(self._bucket_root, 'test_numerics')
        input_file_path = os.path.join(test_folder, 'input.csv')
        output_folder = os.path.join(test_folder, 'test_output')
        file_io.recursive_create_dir(output_folder)

        file_io.write_string_to_file(
            input_file_path,
            '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

        schema = [{
            'name': 'col1',
            'type': 'INTEGER'
        }, {
            'name': 'col2',
            'type': 'FLOAT'
        }]
        features = {
            'col1': {
                'transform': 'scale',
                'source_column': 'col1'
            },
            'col2': {
                'transform': 'identity',
                'source_column': 'col2'
            }
        }
        analyze.run_cloud_analysis(
            output_dir=output_folder,
            csv_file_pattern=input_file_path,
            bigquery_table=None,
            schema=schema,
            inverted_features=analyze.invert_features(features))

        stats = json.loads(
            file_io.read_file_to_string(
                os.path.join(output_folder,
                             analyze.constant.STATS_FILE)).decode())

        self.assertEqual(stats['num_examples'], 100)
        col = stats['column_stats']['col1']
        self.assertAlmostEqual(col['max'], 99.0)
        self.assertAlmostEqual(col['min'], 0.0)
        self.assertAlmostEqual(col['mean'], 49.5)

        col = stats['column_stats']['col2']
        self.assertAlmostEqual(col['max'], 990.5)
        self.assertAlmostEqual(col['min'], 0.5)
        self.assertAlmostEqual(col['mean'], 495.5)
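The asserted statistics follow directly from the generated rows, where `col1` is `i` and `col2` is `10 * i + 0.5` for `i` in 0..99; a quick standalone check:

```python
import numpy as np

i = np.arange(100)
col2 = 10 * i + 0.5

assert i.min() == 0 and i.max() == 99 and i.mean() == 49.5
# mean(10 * i + 0.5) = 10 * mean(i) + 0.5 = 10 * 49.5 + 0.5 = 495.5
assert col2.min() == 0.5 and col2.max() == 990.5 and col2.mean() == 495.5
```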
Exemplo n.º 51
0
  def _validate_asset_collection(self, export_dir, graph_collection_def,
                                 expected_asset_file_name,
                                 expected_asset_file_contents,
                                 expected_asset_tensor_name):
    assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value
    asset = meta_graph_pb2.AssetFileDef()
    assets_any[0].Unpack(asset)
    assets_path = os.path.join(compat.as_bytes(export_dir),
                               compat.as_bytes(constants.ASSETS_DIRECTORY),
                               compat.as_bytes(expected_asset_file_name))
    actual_asset_contents = file_io.read_file_to_string(assets_path)
    self.assertEqual(expected_asset_file_contents,
                     compat.as_text(actual_asset_contents))
    self.assertEqual(expected_asset_file_name, asset.filename)
    self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
Exemplo n.º 52
0
def load_from_saved_model(saved_model_path):
    """Loads a keras.Model from a SavedModel created by keras export().

  This function reinstantiates model state by:
  1) loading model topology from json (this will eventually come
     from metagraph).
  2) loading model weights from checkpoint.

  Example:

  ```python
  import tensorflow as tf

  # Create a tf.keras model.
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
  model.summary()

  # Save the tf.keras model in the SavedModel format.
  saved_to_path = tf.keras.experimental.export(
        model, '/tmp/my_simple_tf_keras_saved_model')

  # Load the saved keras model back.
  model_prime = tf.keras.experimental.load_from_saved_model(saved_to_path)
  model_prime.summary()
  ```

  Args:
    saved_model_path: a string specifying the path to an existing SavedModel.

  Returns:
    a keras.Model instance.
  """
    # restore model topology from json string
    model_json_filepath = os.path.join(
        compat.as_bytes(saved_model_path),
        compat.as_bytes(constants.ASSETS_DIRECTORY),
        compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
    model_json = file_io.read_file_to_string(model_json_filepath)
    model = model_from_json(model_json)

    # restore model weights
    checkpoint_prefix = os.path.join(
        compat.as_text(saved_model_path),
        compat.as_text(constants.VARIABLES_DIRECTORY),
        compat.as_text(constants.VARIABLES_FILENAME))
    model.load_weights(checkpoint_prefix)
    return model
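The docstring describes restoring state in two steps: topology from JSON, then weights from a checkpoint. The same idea can be reproduced with plain Keras APIs outside the SavedModel layout; a minimal sketch (the `/tmp` checkpoint prefix is a placeholder assumption):

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=[10])])

# 1) Serialize the topology to JSON, 2) save the weights as a checkpoint.
topology_json = model.to_json()
model.save_weights('/tmp/demo_weights')  # placeholder checkpoint prefix

# Restore: rebuild the architecture from JSON, then load the weights into it.
restored = tf.keras.models.model_from_json(topology_json)
restored.load_weights('/tmp/demo_weights')
restored.summary()
```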
Exemplo n.º 53
0
    def testRun(self, mock_publisher):
        mock_publisher.return_value.publish_execution.return_value = {}

        test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()
        metadata_connection = metadata.Metadata(connection_config)

        pipeline_root = os.path.join(test_dir, 'Test')
        input_path = os.path.join(test_dir, 'input')
        fileio.makedirs(os.path.dirname(input_path))
        file_io.write_string_to_file(input_path, 'test')

        input_artifact = test_utils._InputArtifact()
        input_artifact.uri = input_path

        component = test_utils._FakeComponent(
            name='FakeComponent',
            input_channel=channel_utils.as_channel([input_artifact]))

        pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                                pipeline_root=pipeline_root,
                                                run_id='123')

        driver_args = data_types.DriverArgs(enable_cache=True)

        # We use InProcessComponentLauncher to test BaseComponentLauncher logics.
        launcher = in_process_component_launcher.InProcessComponentLauncher.create(
            component=component,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection=metadata_connection,
            beam_pipeline_args=[],
            additional_pipeline_args={})
        self.assertEqual(
            launcher._component_info.component_type, '.'.join([
                test_utils._FakeComponent.__module__,
                test_utils._FakeComponent.__name__
            ]))
        launcher.launch()

        output_path = component.outputs['output'].get()[0].uri
        self.assertTrue(fileio.exists(output_path))
        contents = file_io.read_file_to_string(output_path)
        self.assertEqual('test', contents)
Exemplo n.º 54
0
def main():
  tf.logging.set_verbosity(tf.logging.INFO)
  args = parse_arguments()
  # Flatten comma-separated column groups into a single list of column names.
  args.slice_columns = [
    column
    for column_group in args.slice_columns
    for column in column_group.split(',')
  ]
  schema = json.loads(file_io.read_file_to_string(args.schema))
  eval_model_parent_dir = os.path.join(args.model, 'tfma_eval_model_dir')
  model_export_dir = os.path.join(eval_model_parent_dir, file_io.list_directory(eval_model_parent_dir)[0])
  run_analysis(args.output, model_export_dir, args.eval, schema,
               args.project, args.mode, args.slice_columns)
  generate_static_html_output(args.output, args.slice_columns)
  with open('/output.txt', 'w') as f:
    f.write(args.output)
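The comprehension above flattens comma-separated column groups into a flat list of column names; a standalone illustration with made-up column names:

```python
slice_columns = ['trip_start_hour,trip_start_day', 'company']  # illustrative input

flattened = [
    column
    for column_group in slice_columns          # iterate the groups first
    for column in column_group.split(',')      # then split each group
]
assert flattened == ['trip_start_hour', 'trip_start_day', 'company']
```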
Exemplo n.º 55
0
  def _update_execution_proto(
      self,
      execution: metadata_store_pb2.Execution,
      pipeline_info: Optional[data_types.PipelineInfo] = None,
      component_info: Optional[data_types.ComponentInfo] = None,
      state: Optional[Text] = None,
      exec_properties: Optional[Dict[Text, Any]] = None,
  ) -> metadata_store_pb2.Execution:
    """Updates the execution proto with given type and state."""
    if state is not None:
      execution.properties[
          _EXECUTION_TYPE_KEY_STATE].string_value = tf.compat.as_text(state)
    # Forward-compatible change to leverage built-in schema to track states.
    if state == EXECUTION_STATE_CACHED:
      execution.last_known_state = metadata_store_pb2.Execution.CACHED
    elif state == EXECUTION_STATE_COMPLETE:
      execution.last_known_state = metadata_store_pb2.Execution.COMPLETE
    elif state == EXECUTION_STATE_NEW:
      execution.last_known_state = metadata_store_pb2.Execution.RUNNING

    exec_properties = exec_properties or {}
    # TODO(ruoyu): Enforce a formal rule for execution schema change.
    for k, v in exec_properties.items():
      # We always convert execution properties to unicode.
      execution.properties[k].string_value = tf.compat.as_text(
          tf.compat.as_str_any(v))
    # We also need to checksum the UDF file to identify which binary is being
    # used. Is there a better way to checksum a file than hashlib.md5?
    # TODO(ruoyu): Find a better place / solution to the checksum logic.
    # TODO(ruoyu): SHA instead of MD5.
    if 'module_file' in exec_properties and exec_properties[
        'module_file'] and fileio.exists(exec_properties['module_file']):
      contents = file_io.read_file_to_string(exec_properties['module_file'])
      execution.properties['checksum_md5'].string_value = tf.compat.as_text(
          tf.compat.as_str_any(
              hashlib.md5(tf.compat.as_bytes(contents)).hexdigest()))
    if pipeline_info:
      execution.properties[
          'pipeline_name'].string_value = pipeline_info.pipeline_name
      execution.properties[
          'pipeline_root'].string_value = pipeline_info.pipeline_root
      if pipeline_info.run_id:
        execution.properties['run_id'].string_value = pipeline_info.run_id
    if component_info:
      execution.properties[
          'component_id'].string_value = component_info.component_id
    return execution
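The TODO above suggests moving from MD5 to a SHA digest. A minimal sketch of that checksum step using SHA-256 and the same `file_io` read, assuming only that `module_file` points at a readable text file:

```python
import hashlib

from tensorflow.python.lib.io import file_io


def module_file_checksum(module_file):
    # Hash the UDF module's contents so different binaries can be told apart.
    contents = file_io.read_file_to_string(module_file)
    return hashlib.sha256(contents.encode('utf-8')).hexdigest()
```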
Exemplo n.º 56
0
    def test_run(self, mock_publisher):
        mock_publisher.return_value.publish_execution.return_value = {}

        test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()

        pipeline_root = os.path.join(test_dir, 'Test')
        input_path = os.path.join(test_dir, 'input')
        tf.gfile.MakeDirs(os.path.dirname(input_path))
        file_io.write_string_to_file(input_path, 'test')

        input_artifact = types.TfxArtifact(type_name='InputPath')
        input_artifact.uri = input_path

        component = _FakeComponent(name='FakeComponent',
                                   input_channel=channel.as_channel(
                                       [input_artifact]))

        pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                                pipeline_root=pipeline_root,
                                                run_id='123')

        driver_args = data_types.DriverArgs(worker_name=component.component_id,
                                            base_output_dir=os.path.join(
                                                pipeline_root,
                                                component.component_id),
                                            enable_cache=True)

        launcher = component_launcher.ComponentLauncher(
            component=component,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection_config=connection_config,
            additional_pipeline_args={})
        self.assertEqual(
            launcher._component_info.component_type,
            '.'.join([_FakeComponent.__module__, _FakeComponent.__name__]))
        launcher.launch()

        output_path = os.path.join(pipeline_root, 'output')
        self.assertTrue(tf.gfile.Exists(output_path))
        contents = file_io.read_file_to_string(output_path)
        self.assertEqual('test', contents)
Exemplo n.º 57
0
    def test_numerics(self):
        output_folder = tempfile.mkdtemp()
        input_file_path = tempfile.mkstemp(dir=output_folder)[1]
        try:
            file_io.write_string_to_file(
                input_file_path,
                '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

            schema = [{
                'name': 'col1',
                'type': 'INTEGER'
            }, {
                'name': 'col2',
                'type': 'FLOAT'
            }]
            features = {
                'col1': {
                    'transform': 'scale',
                    'source_column': 'col1'
                },
                'col2': {
                    'transform': 'identity',
                    'source_column': 'col2'
                }
            }
            analyze.run_local_analysis(output_folder,
                                       [input_file_path], schema,
                                       analyze.invert_features(features))

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze.constant.STATS_FILE)).decode())

            self.assertEqual(stats['num_examples'], 100)
            col = stats['column_stats']['col1']
            self.assertAlmostEqual(col['max'], 99.0)
            self.assertAlmostEqual(col['min'], 0.0)
            self.assertAlmostEqual(col['mean'], 49.5)

            col = stats['column_stats']['col2']
            self.assertAlmostEqual(col['max'], 990.5)
            self.assertAlmostEqual(col['min'], 0.5)
            self.assertAlmostEqual(col['mean'], 495.5)
        finally:
            shutil.rmtree(output_folder)
Exemplo n.º 58
0
def run_fn(fn_args: TrainerFnArgs):
    hparams = fn_args.hyperparameters
    # Hyperparameters may arrive wrapped as {'values': {...}}; unwrap if so.
    if isinstance(hparams, dict) and 'values' in hparams:
        hparams = hparams['values']

    schema = schema_pb2.Schema()
    schema_text = file_io.read_file_to_string(fn_args.schema_file)
    text_format.Parse(schema_text, schema)
    feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec

    tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

    train_dataset = _input_fn(fn_args.train_files, tf_transform_output)
    eval_dataset = _input_fn(fn_args.eval_files, tf_transform_output)

    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        model = _build_keras_model(hparams=hparams)
    try:
        log_dir = fn_args.model_run_dir
    except KeyError:
        log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir),
                               'logs')

    # Write TensorBoard logs under log_dir.
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          update_freq='batch')

    model.fit(train_dataset,
              steps_per_epoch=fn_args.train_steps,
              validation_data=eval_dataset,
              validation_steps=fn_args.eval_steps,
              callbacks=[tensorboard_callback])

    signatures = {
        'serving_default':
        _get_serve_tf_examples_fn(model,
                                  tf_transform_output).get_concrete_function(
                                      tf.TensorSpec(shape=[None],
                                                    dtype=tf.string,
                                                    name='examples'))
    }
    model.save(fn_args.serving_model_dir,
               save_format='tf',
               signatures=signatures)
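`run_fn` above relies on `_input_fn` and `_build_keras_model` helpers that are not shown. A minimal sketch of what such an `_input_fn` could look like, assuming gzipped transformed TFRecords and a transformed label named `'label'` (both assumptions, not taken from the original):

```python
import tensorflow as tf


def _input_fn(file_pattern, tf_transform_output, batch_size=64):
    # Parse transformed examples with the feature spec emitted by Transform.
    feature_spec = tf_transform_output.transformed_feature_spec().copy()
    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=feature_spec,
        reader=lambda filenames: tf.data.TFRecordDataset(
            filenames, compression_type='GZIP'),  # assumes gzipped output
        label_key='label')  # assumed transformed label key
```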
Exemplo n.º 59
0
  def testGeneratedFileMatchesHead(self):
    expected_contents = gradient_input_output_exclusions.get_contents()
    filename = os.path.join(
        resource_loader.get_root_dir_with_all_resources(),
        resource_loader.get_path_to_datafile("pywrap_gradient_exclusions.cc"))
    actual_contents = file_io.read_file_to_string(filename)

    # On windows, one or both of these strings may have CRLF line endings.
    # To make sure, sanitize both:
    sanitized_actual_contents = actual_contents.replace("\r", "")
    sanitized_expected_contents = expected_contents.replace("\r", "")

    self.assertEqual(
        sanitized_actual_contents, sanitized_expected_contents, """
pywrap_gradient_exclusions.cc needs to be updated.
Please regenerate using:
bazel run tensorflow/python/eager:gradient_input_output_exclusions -- $PWD/tensorflow/python/eager/pywrap_gradient_exclusions.cc"""
    )
Exemplo n.º 60
0
def get_data(imgtype):
    """Collects 512x512 grayscale images for `imgtype` from ./Class1 .. ./Class6."""
    train_images = []
    for x in range(1, 7):
        path = "./Class" + str(x) + "/"
        print(path)
        # Read the label file as a string (pandas re-reads the same file below).
        labels_path = path + imgtype + "/Label/Labels.txt"
        read_file = file_io.read_file_to_string(labels_path)
        df = pd.read_fwf(labels_path)
        for i in range(len(df)):
            # Keep only rows whose flag column (index 1) equals 1.
            if int(df.iloc[i][1]) == 1:
                fname = path + imgtype + "/" + str(df.iloc[i][2])
                print(fname)
                img = cv2.imread(fname, cv2.IMREAD_GRAYSCALE)
                img = cv2.resize(img, (512, 512))
                train_images.append([np.array(img)])
    return train_images