Example #1
  def testTwoTimeSeriesFeatures(self):
    # Build config.
    feature_spec = {
        "time_feature_1": {
            "length": 20,
            "is_time_series": True,
        },
        "time_feature_2": {
            "length": 5,
            "is_time_series": True,
        },
        "aux_feature_1": {
            "length": 1,
            "is_time_series": False,
        },
    }
    hidden_spec = {
        "time_feature_1": {
            "cnn_num_blocks": 2,
            "cnn_block_size": 2,
            "cnn_initial_num_filters": 4,
            "cnn_block_filter_factor": 1.5,
            "cnn_kernel_size": 3,
            "convolution_padding": "same",
            "pool_size": 2,
            "pool_strides": 2,
        },
        "time_feature_2": {
            "cnn_num_blocks": 1,
            "cnn_block_size": 1,
            "cnn_initial_num_filters": 5,
            "cnn_block_filter_factor": 1,
            "cnn_kernel_size": 2,
            "convolution_padding": "same",
            "pool_size": 0,
            "pool_strides": 0,
        }
    }
    config = configurations.base()
    config["inputs"]["features"] = feature_spec
    config["hparams"]["time_series_hidden"] = hidden_spec
    config = configdict.ConfigDict(config)

    # Build model.
    features = input_ops.build_feature_placeholders(config.inputs.features)
    labels = input_ops.build_labels_placeholder()
    model = astro_cnn_model.AstroCNNModel(features, labels, config.hparams,
                                          tf.estimator.ModeKeys.TRAIN)
    model.build()

    # Validate Tensor shapes.
    feature_1_block_1_conv_1 = testing.get_variable_by_name(
        "time_feature_1_hidden/block_1/conv_1/kernel")
    self.assertShapeEquals((3, 1, 4), feature_1_block_1_conv_1)

    feature_1_block_1_conv_2 = testing.get_variable_by_name(
        "time_feature_1_hidden/block_1/conv_2/kernel")
    self.assertShapeEquals((3, 4, 4), feature_1_block_1_conv_2)

    feature_1_block_2_conv_1 = testing.get_variable_by_name(
        "time_feature_1_hidden/block_2/conv_1/kernel")
    self.assertShapeEquals((3, 4, 6), feature_1_block_2_conv_1)

    feature_1_block_2_conv_2 = testing.get_variable_by_name(
        "time_feature_1_hidden/block_2/conv_2/kernel")
    self.assertShapeEquals((3, 6, 6), feature_1_block_2_conv_2)

    feature_2_block_1_conv_1 = testing.get_variable_by_name(
        "time_feature_2_hidden/block_1/conv_1/kernel")
    self.assertShapeEquals((2, 1, 5), feature_2_block_1_conv_1)

    self.assertItemsEqual(["time_feature_1", "time_feature_2"],
                          model.time_series_hidden_layers.keys())
    self.assertShapeEquals((None, 30),
                           model.time_series_hidden_layers["time_feature_1"])
    self.assertShapeEquals((None, 25),
                           model.time_series_hidden_layers["time_feature_2"])
    self.assertItemsEqual(["aux_feature_1"], model.aux_hidden_layers.keys())
    self.assertIs(model.aux_features["aux_feature_1"],
                  model.aux_hidden_layers["aux_feature_1"])
    self.assertShapeEquals((None, 56), model.pre_logits_concat)

    # Execute the TensorFlow graph.
    scaffold = tf.train.Scaffold()
    scaffold.finalize()
    with self.test_session() as sess:
      sess.run([scaffold.init_op, scaffold.local_init_op])
      step = sess.run(model.global_step)
      self.assertEqual(0, step)

      # Fetch predictions.
      features = testing.fake_features(feature_spec, batch_size=16)
      labels = testing.fake_labels(config.hparams.output_dim, batch_size=16)
      feed_dict = input_ops.prepare_feed_dict(model, features, labels)
      predictions = sess.run(model.predictions, feed_dict=feed_dict)
      self.assertShapeEquals((16, 1), predictions)
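A quick worked check of the shapes asserted above (plain arithmetic, not part of the test): each max pooling with pool_size=2 and pool_strides=2 halves the series, and the block filter counts grow by cnn_block_filter_factor.

# Worked shape check for the hidden_spec above (not part of the test).
feature_1_length = 20 // 2 // 2                        # 5: two blocks, each followed by pooling.
feature_1_filters = int(4 * 1.5)                       # 6: filters in the second block.
feature_1_out = feature_1_length * feature_1_filters   # 30 -> (None, 30)
feature_2_out = 5 * 5                                  # 25: length 5, 5 filters, pool_size=0 (no pooling) -> (None, 25)
aux_out = 1                                            # aux_feature_1 is passed through unchanged.
print(feature_1_out + feature_2_out + aux_out)         # 56 -> pre_logits_concat is (None, 56)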
def main(argv):
  del argv  # Unused.

  config = configdict.ConfigDict(configurations.get_config(FLAGS.config_name))
  config_overrides = json.loads(FLAGS.config_overrides)
  for key in config_overrides:
    if key not in ["dataset", "hparams"]:
      raise ValueError("Unrecognized config override: {}".format(key))
  config.hparams.update(config_overrides.get("hparams", {}))

  # Log configs.
  configs_json = [
      ("config_overrides", config_util.to_json(config_overrides)),
      ("config", config_util.to_json(config)),
  ]
  for config_name, config_json in configs_json:
    tf.logging.info("%s: %s", config_name, config_json)

  # Create the estimator.
  run_config = _create_run_config()
  estimator = estimator_util.create_estimator(
      astrowavenet_model.AstroWaveNet, config.hparams, run_config,
      FLAGS.model_dir, FLAGS.eval_batch_size)

  if FLAGS.schedule in ["train", "train_and_eval"]:
    # Save configs.
    tf.gfile.MakeDirs(FLAGS.model_dir)
    for config_name, config_json in configs_json:
      filename = os.path.join(FLAGS.model_dir, "{}.json".format(config_name))
      with tf.gfile.Open(filename, "w") as f:
        f.write(config_json)

    train_input_fn = _create_input_fn(tf.estimator.ModeKeys.TRAIN,
                                      config_overrides.get("dataset"))

    train_hooks = []
    if FLAGS.schedule == "train":
      estimator.train(
          train_input_fn, hooks=train_hooks, max_steps=FLAGS.train_steps)
    else:
      assert FLAGS.schedule == "train_and_eval"

      eval_args = _create_eval_args(config_overrides.get("dataset"))
      for _ in estimator_runner.continuous_train_and_eval(
          estimator=estimator,
          train_input_fn=train_input_fn,
          eval_args=eval_args,
          local_eval_frequency=FLAGS.local_eval_frequency,
          train_hooks=train_hooks,
          train_steps=FLAGS.train_steps):
        # continuous_train_and_eval() yields evaluation metrics after each
        # FLAGS.local_eval_frequency. It also saves and logs them, so we don't
        # do anything here.
        pass

  else:
    assert FLAGS.schedule == "continuous_eval"

    eval_args = _create_eval_args(config_overrides.get("dataset"))
    for _ in estimator_runner.continuous_eval(
        estimator=estimator, eval_args=eval_args,
        train_steps=FLAGS.train_steps):
      # continuous_eval() yields evaluation metrics after each checkpoint. It
      # also saves and logs them, so we don't do anything here.
      pass
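For reference, FLAGS.config_overrides parsed in main() above is a JSON string whose only accepted top-level keys are "dataset" and "hparams"; a hypothetical value (the hparam name below is a placeholder, not taken from this example) could look like:

import json

# Hypothetical --config_overrides value; only "dataset" and "hparams" keys pass the check above.
config_overrides = json.loads('{"hparams": {"batch_size": 64}}')
assert set(config_overrides) <= {"dataset", "hparams"}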
Example #3
    def testBuildFeaturePlaceholders(self):
        # One time series feature.
        config = configdict.ConfigDict(
            {"time_feature_1": {
                "length": 14,
                "is_time_series": True,
            }})
        expected_shapes = {
            "time_series_features": {
                "time_feature_1": [None, 14],
            },
            "aux_features": {}
        }
        features = input_ops.build_feature_placeholders(config)
        self.assertFeatureShapesEqual(expected_shapes, features)

        # Two time series features.
        config = configdict.ConfigDict({
            "time_feature_1": {
                "length": 14,
                "is_time_series": True,
            },
            "time_feature_2": {
                "length": 5,
                "is_time_series": True,
            }
        })
        expected_shapes = {
            "time_series_features": {
                "time_feature_1": [None, 14],
                "time_feature_2": [None, 5],
            },
            "aux_features": {}
        }
        features = input_ops.build_feature_placeholders(config)
        self.assertFeatureShapesEqual(expected_shapes, features)

        # One aux feature.
        config = configdict.ConfigDict({
            "time_feature_1": {
                "length": 14,
                "is_time_series": True,
            },
            "aux_feature_1": {
                "length": 1,
                "is_time_series": False,
            }
        })
        expected_shapes = {
            "time_series_features": {
                "time_feature_1": [None, 14],
            },
            "aux_features": {
                "aux_feature_1": [None, 1]
            }
        }
        features = input_ops.build_feature_placeholders(config)
        self.assertFeatureShapesEqual(expected_shapes, features)

        # Two aux features.
        config = configdict.ConfigDict({
            "time_feature_1": {
                "length": 14,
                "is_time_series": True,
            },
            "aux_feature_1": {
                "length": 1,
                "is_time_series": False,
            },
            "aux_feature_2": {
                "length": 6,
                "is_time_series": False,
            },
        })
        expected_shapes = {
            "time_series_features": {
                "time_feature_1": [None, 14],
            },
            "aux_features": {
                "aux_feature_1": [None, 1],
                "aux_feature_2": [None, 6]
            }
        }
        features = input_ops.build_feature_placeholders(config)
        self.assertFeatureShapesEqual(expected_shapes, features)
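For context, a minimal sketch of the contract the test above exercises (an assumption, not the library's implementation of input_ops.build_feature_placeholders): one float32 placeholder of shape [None, length] per feature, grouped into time-series and auxiliary dicts.

import tensorflow as tf

def build_feature_placeholders_sketch(config):
    """Sketch of the behavior asserted by the test above (assumed, simplified)."""
    features = {"time_series_features": {}, "aux_features": {}}
    for name, spec in config.items():
        placeholder = tf.placeholder(
            dtype=tf.float32, shape=[None, spec["length"]], name=name)
        group = ("time_series_features" if spec["is_time_series"]
                 else "aux_features")
        features[group][name] = placeholder
    return features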
Example #4
def main(argv):
    del argv  # Unused.
    logging.set_verbosity(logging.INFO)

    config = configdict.ConfigDict({
        "input_kepid_file": FLAGS.input_kepid_file,
        "kepler_data_dir": FLAGS.kepler_data_dir,
        "flux_column": FLAGS.flux_column,
        "injected_group": FLAGS.injected_group,
        "scramble_type": FLAGS.scramble_type,
        "invert_light_curves": FLAGS.invert_light_curves,
        "upward_outlier_clipping": FLAGS.upward_outlier_clipping,
        "downward_outlier_clipping": FLAGS.downward_outlier_clipping,
        "clip_lowest_n_values": FLAGS.clip_lowest_n_values,
        "normalize_stddev": FLAGS.normalize_stddev,
    })

    def pipeline(root):
        """Beam pipeline for preprocessing Kepler events."""
        if not FLAGS.input_kepid_file:
            raise ValueError("--input_kepid_file is required")
        if not FLAGS.kepler_data_dir:
            raise ValueError("--kepler_data_dir is required")
        if not FLAGS.output_dir:
            raise ValueError("--output_dir is required")

        # Write the config.
        config_json = json.dumps(config, indent=2)
        (root
         | beam.Create([config_json])
         | "write_config" >> beam.io.WriteToText(
             os.path.join(FLAGS.output_dir, "config.json"),
             num_shards=1,
             shard_name_template=""))

        # Read input Kepler ids.
        with tf.gfile.Open(config.input_kepid_file) as f:
            kep_ids = [int(line.strip()) for line in f]
        logging.info("Read %d Kepler ids from %s", len(kep_ids),
                     config.input_kepid_file)

        # Initialize DoFns.
        process_fn = process_light_curve.ProcessLightCurveDoFn(
            config.kepler_data_dir,
            flux_column=config.flux_column,
            injected_group=config.injected_group,
            scramble_type=config.scramble_type,
            invert_light_curves=config.invert_light_curves,
            upward_outlier_clipping=config.upward_outlier_clipping,
            downward_outlier_clipping=config.downward_outlier_clipping,
            clip_lowest_n_values=config.clip_lowest_n_values,
            normalize_stddev=config.normalize_stddev)
        partition_fn = utils.TrainValTestPartitionFn(key_name="kepler_id",
                                                     partitions={
                                                         "train": 0.8,
                                                         "val": 0.1,
                                                         "test": 0.1,
                                                     },
                                                     keys=kep_ids)

        # Create pipeline.
        inputs = [{"kepler_id": kep_id} for kep_id in kep_ids]
        results = (root
                   | "create_pcollection" >> beam.Create(inputs)
                   | "process_light_curves" >> beam.ParDo(process_fn)
                   | "reshuffle" >> beam.Reshuffle()
                   | "partition_results" >> beam.Partition(
                       partition_fn, partition_fn.num_partitions))

        # Write the outputs in TFRecord format.
        for name, subset in zip(partition_fn.partition_names, results):
            if name == "train":
                num_shards = FLAGS.num_shards_train
            elif name == "val":
                num_shards = FLAGS.num_shards_val
            elif name == "test":
                num_shards = FLAGS.num_shards_test
            else:
                raise ValueError("Unrecognized subset name: {}".format(name))

            utils.write_to_tfrecord(subset,
                                    key="example",
                                    output_dir=FLAGS.output_dir,
                                    output_name=name,
                                    coder=beam.coders.ProtoCoder(
                                        tf.train.Example),
                                    num_shards=num_shards)

    pipeline.run()
    logging.info("Preprocessing complete.")
def main(argv):
  del argv  # Unused.
  logging.set_verbosity(logging.INFO)

  config = configdict.ConfigDict({
      "input_event_csv_file": FLAGS.input_event_csv_file,
      "kepler_data_dir": FLAGS.kepler_data_dir,
      "injected_group": FLAGS.injected_group,
      "invert_light_curves": FLAGS.invert_light_curves,
      "scramble_type": FLAGS.light_curve_scramble_type,
      "gap_width": 0.75,
      "normalize_method": "spline",
      "normalize_args": {
          "bkspace_min": 0.5,
          "bkspace_max": 20,
          "bkspace_num": 20,
          "penalty_coeff": 1.0,
      },
      "remove_event_for_spline": False,
      "remove_events_width_factor": 1.5,
      "upward_outlier_sigma_cut": None,
      "column_value_whitelists": {
          _LABEL_COLUMN: ["PC", "AFP", "NTP", "INV", "INJ1", "SCR1"]
      },
  })

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Write the config.
    config_json = json.dumps(config, indent=2)
    root | beam.Create([config_json]) | "write_config" >> beam.io.WriteToText(
        os.path.join(FLAGS.output_dir, "config.json"),
        num_shards=1,
        shard_name_template="")

    # Read input events table.
    events = _read_events(config)

    # Initialize DoFns.
    read_light_curve = light_curve_fns.ReadLightCurveDoFn(
        config.kepler_data_dir,
        injected_group=config.injected_group,
        scramble_type=config.scramble_type,
        invert=config.invert_light_curves)
    process_light_curve = light_curve_fns.ProcessLightCurveDoFn(
        gap_width=config.gap_width,
        normalize_method=config.normalize_method,
        normalize_args=config.normalize_args,
        upward_outlier_sigma_cut=config.upward_outlier_sigma_cut,
        remove_events_width_factor=config.remove_events_width_factor)
    generate_example = GenerateExampleDoFn()
    partition_fn = utils.TrainValTestPartitionFn(
        key_name="tce_id",
        partitions={
            "train": 0.8,
            "val": 0.1,
            "test": 0.1,
        },
        keys=events.tce_id.values)

    # Create pipeline.
    pipeline_inputs = _prepare_pipeline_inputs(events, config)
    results = (
        root
        | "create_pcollection" >> beam.Create(pipeline_inputs)
        | "read_light_curves" >> beam.ParDo(read_light_curve)
        | "process_light_curves" >> beam.ParDo(process_light_curve)
        | "generate_examples" >> beam.ParDo(generate_example)
        | "reshuffle" >> beam.Reshuffle()
        | "partition_results" >> beam.Partition(partition_fn,
                                                partition_fn.num_partitions))

    for name, subset in zip(partition_fn.partition_names, results):
      if name == "train":
        num_shards = FLAGS.num_shards_train
      elif name == "val":
        num_shards = FLAGS.num_shards_val
      elif name == "test":
        num_shards = FLAGS.num_shards_test
      else:
        raise ValueError("Unrecognized subset name: %s" % name)

      # Write the tf.Examples in TFRecord format.
      utils.write_to_tfrecord(
          subset,
          output_dir=FLAGS.output_dir,
          output_name=name,
          value_name="example",
          value_coder=beam.coders.ProtoCoder(tf.train.Example),
          num_shards=num_shards)

  pipeline.run()
  logging.info("Preprocessing complete.")
    def test_output_weighted(self):
        time_series_length = 6
        input_num_features = 2
        context_num_features = 7

        input_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, input_num_features],
            name="input")
        weights_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, input_num_features],
            name="weights")
        context_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, context_num_features],
            name="context")
        features = {
            "autoregressive_input": input_placeholder,
            "weights": weights_placeholder,
            "conditioning_stack": context_placeholder
        }
        mode = tf.estimator.ModeKeys.TRAIN
        hparams = configdict.ConfigDict({
            "dilation_kernel_width": 2,
            "skip_output_dim": 6,
            "preprocess_output_size": 3,
            "preprocess_kernel_width": 5,
            "num_residual_blocks": 2,
            "dilation_rates": [1, 2, 4],
            "output_distribution": {
                "type": "normal",
                "min_scale": 0,
            }
        })

        model = astrowavenet_model.AstroWaveNet(features, hparams, mode)
        model.build()

        scaffold = tf.train.Scaffold()
        scaffold.finalize()
        with self.cached_session() as sess:
            sess.run([scaffold.init_op, scaffold.local_init_op])
            step = sess.run(model.global_step)
            self.assertEqual(0, step)

            feed_dict = {
                input_placeholder: [
                    [[1, 9], [1, 9], [1, 9], [1, 9], [1, 9], [1, 9]],
                    [[2, 8], [2, 8], [2, 8], [2, 8], [2, 8], [2, 8]],
                    [[3, 7], [3, 7], [3, 7], [3, 7], [3, 7], [3, 7]],
                ],
                weights_placeholder: [
                    [[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1]],
                    [[1, 0], [1, 1], [1, 1], [0, 1], [0, 1], [0, 0]],
                    [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                ],
                # Context is not needed since we explicitly feed the dist params.
                model.dist_params["loc"]: [
                    [[1, 8], [1, 8], [1, 8], [1, 8], [1, 8], [1, 8]],
                    [[2, 9], [2, 9], [2, 9], [2, 9], [2, 9], [2, 9]],
                    [[3, 6], [3, 6], [3, 6], [3, 6], [3, 6], [3, 6]],
                ],
                model.dist_params["scale"]: [
                    [[0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [1, 1], [2, 2],
                     [5, 5]],
                    [[0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [1, 1], [2, 2],
                     [5, 5]],
                    [[0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [1, 1], [2, 2],
                     [5, 5]],
                ],
            }
            batch_losses, per_example_loss, num_examples, total_loss = sess.run(
                [
                    model.batch_losses, model.per_example_loss,
                    model.num_nonzero_weight_examples, model.total_loss
                ],
                feed_dict=feed_dict)
            np.testing.assert_array_almost_equal(
                [[[-1.38364656, 48.61635344], [-0.69049938, 11.80950062],
                  [0.22579135, 2.22579135], [0.91893853, 1.41893853],
                  [1.61208571, 1.73708571], [2.52837645, 2.54837645]],
                 [[-1.38364656, 0], [-0.69049938, 11.80950062],
                  [0.22579135, 2.22579135], [0, 1.41893853], [0, 1.73708571],
                  [0, 0]], [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
                batch_losses)
            np.testing.assert_array_almost_equal([5.96392435, 2.19185166, 0],
                                                 per_example_loss)
            np.testing.assert_almost_equal(2, num_examples)
            np.testing.assert_almost_equal(4.07788801, total_loss)
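The batch losses asserted above are negative log-likelihoods of a Normal distribution at the target values; a quick check of the first two entries using scipy (an assumption here; the test itself computes the losses through the TensorFlow model):

from scipy import stats

# batch_losses[0][0] corresponds to targets (1, 9) with loc (1, 8) and scale (0.1, 0.1).
print(-stats.norm(loc=1.0, scale=0.1).logpdf(1.0))  # ~ -1.38364656
print(-stats.norm(loc=8.0, scale=0.1).logpdf(9.0))  # ~ 48.61635344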
    def test_causality(self):
        time_series_length = 7
        input_num_features = 1
        context_num_features = 1

        input_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, input_num_features],
            name="input")
        context_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, context_num_features],
            name="context")
        features = {
            "autoregressive_input": input_placeholder,
            "conditioning_stack": context_placeholder
        }
        mode = tf.estimator.ModeKeys.TRAIN
        hparams = configdict.ConfigDict({
            "dilation_kernel_width": 1,
            "skip_output_dim": 1,
            "preprocess_output_size": 1,
            "preprocess_kernel_width": 1,
            "num_residual_blocks": 1,
            "dilation_rates": [1],
            "output_distribution": {
                "type": "normal",
                "min_scale": 0.001,
            }
        })

        model = astrowavenet_model.AstroWaveNet(features, hparams, mode)
        model.build()

        scaffold = tf.train.Scaffold()
        scaffold.finalize()
        with self.cached_session() as sess:
            sess.run([scaffold.init_op, scaffold.local_init_op])
            step = sess.run(model.global_step)
            self.assertEqual(0, step)

            feed_dict = {
                input_placeholder: [
                    [[0], [0], [0], [0], [0], [0], [0]],
                    [[1], [0], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [1], [0], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [1]],
                    [[0], [0], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [0]],
                ],
                context_placeholder: [
                    [[0], [0], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [0]],
                    [[1], [0], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [1], [0], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [1]],
                ],
            }
            network_output = sess.run(model.network_output,
                                      feed_dict=feed_dict)
            np.testing.assert_array_equal(
                [
                    [[0], [0], [0], [0], [0], [0], [0]],
                    # Input elements are used to predict the next timestamp.
                    [[0], [1], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [0], [1], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [0]],
                    # Context elements are used to predict the current timestamp.
                    [[1], [0], [0], [0], [0], [0], [0]],
                    [[0], [0], [0], [1], [0], [0], [0]],
                    [[0], [0], [0], [0], [0], [0], [1]],
                ],
                np.greater(np.abs(network_output), 0))
    def test_output_categorical(self):
        time_series_length = 3
        input_num_features = 1
        context_num_features = 7
        num_classes = 4  # For quantized categorical output predictions.

        input_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, input_num_features],
            name="input")
        context_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, context_num_features],
            name="context")
        features = {
            "autoregressive_input": input_placeholder,
            "conditioning_stack": context_placeholder
        }
        mode = tf.estimator.ModeKeys.TRAIN
        hparams = configdict.ConfigDict({
            "dilation_kernel_width": 2,
            "skip_output_dim": 6,
            "preprocess_output_size": 3,
            "preprocess_kernel_width": 5,
            "num_residual_blocks": 2,
            "dilation_rates": [1, 2, 4],
            "output_distribution": {
                "type": "categorical",
                "min_scale": 0,
                "num_classes": num_classes,
                "min_quantization_value": 0,
                "max_quantization_value": 1
            }
        })

        model = astrowavenet_model.AstroWaveNet(features, hparams, mode)
        model.build()

        self.assertItemsEqual(["logits"], model.dist_params.keys())
        self.assertShapeEquals(
            (None, time_series_length, input_num_features, num_classes),
            model.dist_params["logits"])

        scaffold = tf.train.Scaffold()
        scaffold.finalize()
        with self.cached_session() as sess:
            sess.run([scaffold.init_op, scaffold.local_init_op])
            step = sess.run(model.global_step)
            self.assertEqual(0, step)

            feed_dict = {
                input_placeholder: [
                    [[0], [0], [0]],  # min_quantization_value
                    [[0.2], [0.2], [0.2]],  # Within bucket.
                    [[0.25], [0.25], [0.25]],  # On bucket boundary.
                    [[0.5], [0.5], [0.5]],  # On bucket boundary.
                    [[0.8], [0.8], [0.8]],  # Within bucket.
                    [[1], [1], [1]],  # max_quantization_value
                    [[-0.1], [1.5], [200]],  # Outside range: will be clipped.
                ],
                # Context is not needed since we explicitly feed the dist params.
                model.dist_params["logits"]: [
                    [[[1, 0, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 0, 1]]],
                    [[[1, 0, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 0, 1]]],
                    [[[0, 1, 0, 0]], [[1, 0, 0, 0]], [[0, 0, 1, 0]]],
                    [[[0, 0, 1, 0]], [[0, 1, 0, 0]], [[0, 0, 0, 1]]],
                    [[[0, 0, 0, 1]], [[1, 0, 0, 0]], [[1, 0, 0, 0]]],
                    [[[0, 0, 0, 1]], [[0, 1, 0, 0]], [[0, 0, 1, 0]]],
                    [[[1, 0, 0, 0]], [[0, 0, 1, 0]], [[0, 1, 0, 0]]],
                ],
            }
            (target, batch_losses, per_example_loss, num_examples,
             total_loss) = sess.run([
                 model.autoregressive_target, model.batch_losses,
                 model.per_example_loss, model.num_nonzero_weight_examples,
                 model.total_loss
             ],
                                    feed_dict=feed_dict)
            np.testing.assert_array_almost_equal([
                [[0], [0], [0]],
                [[0], [0], [0]],
                [[1], [1], [1]],
                [[2], [2], [2]],
                [[3], [3], [3]],
                [[3], [3], [3]],
                [[0], [3], [3]],
            ], target)
            np.testing.assert_array_almost_equal([
                [[0.74366838], [1.74366838], [1.74366838]],
                [[0.74366838], [1.74366838], [1.74366838]],
                [[0.74366838], [1.74366838], [1.74366838]],
                [[0.74366838], [1.74366838], [1.74366838]],
                [[0.74366838], [1.74366838], [1.74366838]],
                [[0.74366838], [1.74366838], [1.74366838]],
                [[0.74366838], [1.74366838], [1.74366838]],
            ], batch_losses)
            np.testing.assert_array_almost_equal([
                1.41033504, 1.41033504, 1.41033504, 1.41033504, 1.41033504,
                1.41033504, 1.41033504
            ], per_example_loss)
            np.testing.assert_almost_equal(7, num_examples)
            np.testing.assert_almost_equal(1.41033504, total_loss)
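The categorical batch losses asserted above are softmax cross-entropies of the fed logits against the quantized targets; a quick numpy check (an assumption here; the test computes the loss through the model):

import numpy as np

def softmax_xent(logits, target_class):
    logits = np.asarray(logits, dtype=np.float64)
    return -np.log(np.exp(logits[target_class]) / np.sum(np.exp(logits)))

print(softmax_xent([1, 0, 0, 0], 0))  # ~ 0.74366838: the target class carries the logit of 1.
print(softmax_xent([0, 1, 0, 0], 0))  # ~ 1.74366838: the target class carries a logit of 0.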
    def test_build_model(self):
        time_series_length = 9
        input_num_features = 8
        context_num_features = 7

        input_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, input_num_features],
            name="input")
        context_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, context_num_features],
            name="context")
        features = {
            "autoregressive_input": input_placeholder,
            "conditioning_stack": context_placeholder
        }
        mode = tf.estimator.ModeKeys.TRAIN
        hparams = configdict.ConfigDict({
            "dilation_kernel_width": 2,
            "skip_output_dim": 6,
            "preprocess_output_size": 3,
            "preprocess_kernel_width": 5,
            "num_residual_blocks": 2,
            "dilation_rates": [1, 2, 4],
            "output_distribution": {
                "type": "normal",
                "min_scale": 0.001,
            }
        })

        model = astrowavenet_model.AstroWaveNet(features, hparams, mode)
        model.build()

        variables = {v.op.name: v for v in tf.trainable_variables()}

        # Verify variable shapes in two residual blocks.

        var = variables["preprocess/causal_conv/kernel"]
        self.assertShapeEquals((5, 8, 3), var)
        var = variables["preprocess/causal_conv/bias"]
        self.assertShapeEquals((3, ), var)

        var = variables["block_0/dilation_1/filter/causal_conv/kernel"]
        self.assertShapeEquals((2, 3, 3), var)
        var = variables["block_0/dilation_1/filter/causal_conv/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_0/dilation_1/filter/conv1x1/kernel"]
        self.assertShapeEquals((1, 7, 3), var)
        var = variables["block_0/dilation_1/filter/conv1x1/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_0/dilation_1/gate/causal_conv/kernel"]
        self.assertShapeEquals((2, 3, 3), var)
        var = variables["block_0/dilation_1/gate/causal_conv/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_0/dilation_1/gate/conv1x1/kernel"]
        self.assertShapeEquals((1, 7, 3), var)
        var = variables["block_0/dilation_1/gate/conv1x1/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_0/dilation_1/residual/conv1x1/kernel"]
        self.assertShapeEquals((1, 3, 3), var)
        var = variables["block_0/dilation_1/residual/conv1x1/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_0/dilation_1/skip/conv1x1/kernel"]
        self.assertShapeEquals((1, 3, 6), var)
        var = variables["block_0/dilation_1/skip/conv1x1/bias"]
        self.assertShapeEquals((6, ), var)

        var = variables["block_1/dilation_4/filter/causal_conv/kernel"]
        self.assertShapeEquals((2, 3, 3), var)
        var = variables["block_1/dilation_4/filter/causal_conv/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_1/dilation_4/filter/conv1x1/kernel"]
        self.assertShapeEquals((1, 7, 3), var)
        var = variables["block_1/dilation_4/filter/conv1x1/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_1/dilation_4/gate/causal_conv/kernel"]
        self.assertShapeEquals((2, 3, 3), var)
        var = variables["block_1/dilation_4/gate/causal_conv/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_1/dilation_4/gate/conv1x1/kernel"]
        self.assertShapeEquals((1, 7, 3), var)
        var = variables["block_1/dilation_4/gate/conv1x1/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_1/dilation_4/residual/conv1x1/kernel"]
        self.assertShapeEquals((1, 3, 3), var)
        var = variables["block_1/dilation_4/residual/conv1x1/bias"]
        self.assertShapeEquals((3, ), var)
        var = variables["block_1/dilation_4/skip/conv1x1/kernel"]
        self.assertShapeEquals((1, 3, 6), var)
        var = variables["block_1/dilation_4/skip/conv1x1/bias"]
        self.assertShapeEquals((6, ), var)

        var = variables["postprocess/conv1x1/kernel"]
        self.assertShapeEquals((1, 6, 6), var)
        var = variables["postprocess/conv1x1/bias"]
        self.assertShapeEquals((6, ), var)
        var = variables["dist_params/conv1x1/kernel"]
        self.assertShapeEquals((1, 6, 16), var)
        var = variables["dist_params/conv1x1/bias"]
        self.assertShapeEquals((16, ), var)

        # Verify total number of trainable parameters.

        num_preprocess_params = (
            hparams.preprocess_kernel_width * input_num_features *
            hparams.preprocess_output_size + hparams.preprocess_output_size)

        num_gated_params = (
            hparams.dilation_kernel_width * hparams.preprocess_output_size *
            hparams.preprocess_output_size + hparams.preprocess_output_size +
            1 * context_num_features * hparams.preprocess_output_size +
            hparams.preprocess_output_size) * 2
        num_residual_params = (1 * hparams.preprocess_output_size *
                               hparams.preprocess_output_size +
                               hparams.preprocess_output_size)
        num_skip_params = (
            1 * hparams.preprocess_output_size * hparams.skip_output_dim +
            hparams.skip_output_dim)
        num_block_params = (
            num_gated_params + num_residual_params + num_skip_params) * len(
                hparams.dilation_rates) * hparams.num_residual_blocks

        num_postprocess_params = (
            1 * hparams.skip_output_dim * hparams.skip_output_dim +
            hparams.skip_output_dim)

        num_dist_params = (
            1 * hparams.skip_output_dim * 2 * input_num_features +
            2 * input_num_features)

        total_params = (num_preprocess_params + num_block_params +
                        num_postprocess_params + num_dist_params)

        total_retrieved_params = 0
        for v in tf.trainable_variables():
            total_retrieved_params += np.prod(v.shape)

        self.assertEqual(total_params, total_retrieved_params)

        # Verify model runs and outputs losses of correct shape.

        scaffold = tf.train.Scaffold()
        scaffold.finalize()
        with self.cached_session() as sess:
            sess.run([scaffold.init_op, scaffold.local_init_op])
            step = sess.run(model.global_step)
            self.assertEqual(0, step)

            batch_size = 11
            feed_dict = {
                input_placeholder:
                np.random.random(
                    (batch_size, time_series_length, input_num_features)),
                context_placeholder:
                np.random.random(
                    (batch_size, time_series_length, context_num_features))
            }
            batch_losses, per_example_loss, total_loss = sess.run(
                [model.batch_losses, model.per_example_loss, model.total_loss],
                feed_dict=feed_dict)
            self.assertShapeEquals(
                (batch_size, time_series_length, input_num_features),
                batch_losses)
            self.assertShapeEquals((batch_size, ), per_example_loss)
            self.assertShapeEquals((), total_loss)
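Plugging the test's hparams into the parameter-count formulas above gives the expected total (a worked check, not part of the test):

num_preprocess = 5 * 8 * 3 + 3                              # 123
num_gated = (2 * 3 * 3 + 3 + 1 * 7 * 3 + 3) * 2             # 90 (filter and gate branches)
num_residual = 1 * 3 * 3 + 3                                # 12
num_skip = 1 * 3 * 6 + 6                                    # 24
num_block = (num_gated + num_residual + num_skip) * 3 * 2   # 756 (3 dilation rates x 2 blocks)
num_postprocess = 1 * 6 * 6 + 6                             # 42
num_dist = 1 * 6 * 2 * 8 + 2 * 8                            # 112 (loc and scale for 8 features)
print(num_preprocess + num_block + num_postprocess + num_dist)  # 1033 trainable parameters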
  def testOneTimeSeriesFeature(self):
    # Build config.
    feature_spec = {
        "time_feature_1": {
            "length": 20,
            "is_time_series": True,
        }
    }
    hidden_spec = {
        "time_feature_1": {
            "cnn_num_blocks": 2,
            "cnn_block_size": 2,
            "cnn_initial_num_filters": 4,
            "cnn_block_filter_factor": 1.5,
            "cnn_kernel_size": 3,
            "convolution_padding": "same",
            "pool_size": 2,
            "pool_strides": 2,
        }
    }
    config = configurations.base()
    config["inputs"]["features"] = feature_spec
    config["hparams"]["time_series_hidden"] = hidden_spec
    config = configdict.ConfigDict(config)

    # Build model.
    features = input_ops.build_feature_placeholders(config.inputs.features)
    labels = input_ops.build_labels_placeholder()
    model = astro_cnn_model.AstroCNNModel(features, labels, config.hparams,
                                          tf.estimator.ModeKeys.TRAIN)
    model.build()

    # TODO(shallue): TensorFlow 2.0 doesn't have global variable collections.
    # If we want to keep testing variable shapes in 2.0, we must keep track of
    # the individual Keras Layer objects in the model class.
    variables = {v.op.name: v for v in tf.global_variables()}

    # Validate Tensor shapes.
    block_1_conv_1 = variables["time_feature_1_hidden/block_1/conv_1/kernel"]
    self.assertShapeEquals((3, 1, 4), block_1_conv_1)

    block_1_conv_2 = variables["time_feature_1_hidden/block_1/conv_2/kernel"]
    self.assertShapeEquals((3, 4, 4), block_1_conv_2)

    block_2_conv_1 = variables["time_feature_1_hidden/block_2/conv_1/kernel"]
    self.assertShapeEquals((3, 4, 6), block_2_conv_1)

    block_2_conv_2 = variables["time_feature_1_hidden/block_2/conv_2/kernel"]
    self.assertShapeEquals((3, 6, 6), block_2_conv_2)

    self.assertItemsEqual(["time_feature_1"],
                          model.time_series_hidden_layers.keys())
    self.assertShapeEquals((None, 30),
                           model.time_series_hidden_layers["time_feature_1"])
    self.assertEqual(len(model.aux_hidden_layers), 0)
    self.assertIs(model.time_series_hidden_layers["time_feature_1"],
                  model.pre_logits_concat)

    # Execute the TensorFlow graph.
    scaffold = tf.train.Scaffold()
    scaffold.finalize()
    with self.session() as sess:
      sess.run([scaffold.init_op, scaffold.local_init_op])
      step = sess.run(model.global_step)
      self.assertEqual(0, step)

      # Fetch predictions.
      features = testing.fake_features(feature_spec, batch_size=16)
      labels = testing.fake_labels(config.hparams.output_dim, batch_size=16)
      feed_dict = input_ops.prepare_feed_dict(model, features, labels)
      predictions = sess.run(model.predictions, feed_dict=feed_dict)
      self.assertShapeEquals((16, 1), predictions)
    def test_build_model_categorical(self):
        time_series_length = 9
        input_num_features = 8
        context_num_features = 7

        input_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, input_num_features],
            name="input")
        context_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, context_num_features],
            name="context")
        features = {
            "autoregressive_input": input_placeholder,
            "conditioning_stack": context_placeholder
        }
        mode = tf.estimator.ModeKeys.TRAIN
        hparams = configdict.ConfigDict({
            "dilation_kernel_width": 2,
            "skip_output_dim": 6,
            "preprocess_output_size": 3,
            "preprocess_kernel_width": 5,
            "num_residual_blocks": 2,
            "dilation_rates": [1, 2, 4],
            "output_distribution": {
                "type": "categorical",
                "num_classes": 256,
                "min_quantization_value": -1,
                "max_quantization_value": 1
            }
        })

        model = astrowavenet_model.AstroWaveNet(features, hparams, mode)
        model.build()

        variables = {v.op.name: v for v in tf.trainable_variables()}

        var = variables["dist_params/conv1x1/kernel"]
        self.assertShapeEquals(
            (1, hparams.skip_output_dim,
             hparams.output_distribution.num_classes * input_num_features),
            var)
        var = variables["dist_params/conv1x1/bias"]
        self.assertShapeEquals(
            (hparams.output_distribution.num_classes * input_num_features, ),
            var)

        # Verify model runs and outputs losses of correct shape.

        scaffold = tf.train.Scaffold()
        scaffold.finalize()
        with self.cached_session() as sess:
            sess.run([scaffold.init_op, scaffold.local_init_op])
            step = sess.run(model.global_step)
            self.assertEqual(0, step)

            batch_size = 11
            feed_dict = {
                input_placeholder:
                np.random.random(
                    (batch_size, time_series_length, input_num_features)),
                context_placeholder:
                np.random.random(
                    (batch_size, time_series_length, context_num_features))
            }
            batch_losses, per_example_loss, total_loss = sess.run(
                [model.batch_losses, model.per_example_loss, model.total_loss],
                feed_dict=feed_dict)
            self.assertShapeEquals(
                (batch_size, time_series_length, input_num_features),
                batch_losses)
            self.assertShapeEquals((batch_size, ), per_example_loss)
            self.assertShapeEquals((), total_loss)
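The dist_params shapes asserted above follow directly from the categorical hparams, since the final 1x1 convolution emits num_classes logits per input feature (a worked check, not part of the test):

num_classes = 256
input_num_features = 8
skip_output_dim = 6
kernel_shape = (1, skip_output_dim, num_classes * input_num_features)  # (1, 6, 2048)
bias_shape = (num_classes * input_num_features,)                       # (2048,)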
  def testTwoTimeSeriesFeatures(self):
    # Build config.
    feature_spec = {
        "time_feature_1": {
            "length": 20,
            "is_time_series": True,
        },
        "time_feature_2": {
            "length": 5,
            "is_time_series": True,
        },
        "aux_feature_1": {
            "length": 1,
            "is_time_series": False,
        },
    }
    hidden_spec = {
        "time_feature_1": {
            "num_local_layers": 1,
            "local_layer_size": 20,
            "translation_delta": 1,
            "pooling_type": "max",
            "dropout_rate": 0.5,
        },
        "time_feature_2": {
            "num_local_layers": 2,
            "local_layer_size": 7,
            "translation_delta": 0,
            "dropout_rate": 0,
        }
    }
    config = configurations.base()
    config["inputs"]["features"] = feature_spec
    config["hparams"]["time_series_hidden"] = hidden_spec
    config = configdict.ConfigDict(config)

    # Build model.
    features = input_ops.build_feature_placeholders(config.inputs.features)
    labels = input_ops.build_labels_placeholder()
    model = astro_fc_model.AstroFCModel(features, labels, config.hparams,
                                        tf.estimator.ModeKeys.TRAIN)
    model.build()

    # Validate Tensor shapes.
    conv = testing.get_variable_by_name("time_feature_1_hidden/conv1d/kernel")
    self.assertShapeEquals((18, 1, 20), conv)

    fc_1 = testing.get_variable_by_name(
        "time_feature_2_hidden/fully_connected_1/weights")
    self.assertShapeEquals((5, 7), fc_1)

    fc_2 = testing.get_variable_by_name(
        "time_feature_2_hidden/fully_connected_2/weights")
    self.assertShapeEquals((7, 7), fc_2)

    self.assertItemsEqual(["time_feature_1", "time_feature_2"],
                          model.time_series_hidden_layers.keys())
    self.assertShapeEquals((None, 20),
                           model.time_series_hidden_layers["time_feature_1"])
    self.assertShapeEquals((None, 7),
                           model.time_series_hidden_layers["time_feature_2"])
    self.assertItemsEqual(["aux_feature_1"], model.aux_hidden_layers.keys())
    self.assertIs(model.aux_features["aux_feature_1"],
                  model.aux_hidden_layers["aux_feature_1"])
    self.assertShapeEquals((None, 28), model.pre_logits_concat)

    # Execute the TensorFlow graph.
    scaffold = tf.train.Scaffold()
    scaffold.finalize()
    with self.test_session() as sess:
      sess.run([scaffold.init_op, scaffold.local_init_op])
      step = sess.run(model.global_step)
      self.assertEqual(0, step)

      # Fetch predictions.
      features = testing.fake_features(feature_spec, batch_size=16)
      labels = testing.fake_labels(config.hparams.output_dim, batch_size=16)
      feed_dict = input_ops.prepare_feed_dict(model, features, labels)
      predictions = sess.run(model.predictions, feed_dict=feed_dict)
      self.assertShapeEquals((16, 1), predictions)
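A quick worked check of the shapes asserted above (plain arithmetic, not part of the test), consistent with the hidden_spec values:

# time_feature_1: translation_delta=1 implies a kernel of width 20 - 2*1 = 18 with 20
# filters -> kernel shape (18, 1, 20); max pooling over the 3 translated positions
# leaves 20 hidden units.
# time_feature_2: two fully connected layers of size 7 -> weights (5, 7) and (7, 7),
# leaving 7 hidden units.
print(20 + 7 + 1)  # 28 -> pre_logits_concat is (None, 28), including aux_feature_1.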
Example #13
def _parse_configs():
    """Parses the configuration files from FLAGS.config_json."""
    configs = []
    for config_filename in FLAGS.config_json.split(","):
        if tf.gfile.Exists(config_filename):
            with tf.gfile.Open(config_filename) as f:
                config = json.load(f)
        else:
            raise ValueError(
                "Cannot find config file: {}".format(config_filename))
        expected_keys = {
            "name", "kepler_data_dir", "input_event_csv_file", "tce_id_dir",
            "astrowavenet_file_pattern", "injected_group", "scramble_type",
            "invert_light_curves"
        }
        if set(config.keys()) != expected_keys:
            raise ValueError("Expected config keys to be {}, got {}".format(
                expected_keys, config.keys()))

        # Add config settings common to all TCE types.
        config = configdict.ConfigDict(config)
        config.update({
            "gap_width": 0.75,
            "normalize_method": "spline",
            "normalize_args": {
                "bkspace_min": 0.5,
                "bkspace_max": 20,
                "bkspace_num": 20,
                "penalty_coeff": 1.0,
            },
            "model_dir": FLAGS.model_dir,
            "checkpoint_filename": FLAGS.checkpoint_filename,
            "column_value_whitelists": {
                _LABEL_COLUMN:
                ["PC", "AFP", "NTP", "INV", "INJ1", "INJ2", "SCR1"]
            },
            "emb_views": {
                "global_view_nbins":
                _GLOBAL_VIEW_NUM_BINS,
                "global_view_bin_width_factor":
                1 / _GLOBAL_VIEW_NUM_BINS,
                "local_view_nbins":
                _LOCAL_VIEW_NUM_BINS,
                "local_view_bin_width_factor":
                (_LOCAL_VIEW_NUM_DURATIONS / _LOCAL_VIEW_NUM_BINS),
                "local_view_num_durations":
                _LOCAL_VIEW_NUM_DURATIONS,
                "aggr_fn":
                "sum",
            },
            "flux_views": {
                "global_view_nbins": _GLOBAL_VIEW_NUM_BINS,
                "global_view_bin_width_factor": 1 / _GLOBAL_VIEW_NUM_BINS,
                "local_view_nbins": _LOCAL_VIEW_NUM_BINS,
                "local_view_bin_width_factor": 0.16,
                "local_view_num_durations": _LOCAL_VIEW_NUM_DURATIONS,
                "aggr_fn": "median",
            },
            "apply_relu_to_embeddings": False,
            "align_to_predictions": False,
            "interpolate_missing_time": True,
        })
        configs.append(config)
    return configs
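Each file listed in FLAGS.config_json must contain exactly the keys checked above; a hypothetical example (all values are placeholders):

example_config = {
    "name": "example-run",
    "kepler_data_dir": "/path/to/kepler/fits",
    "input_event_csv_file": "/path/to/events.csv",
    "tce_id_dir": "/path/to/tce_ids",
    "astrowavenet_file_pattern": "/path/to/astrowavenet/predictions*",
    "injected_group": None,
    "scramble_type": None,
    "invert_light_curves": False,
}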
Example #14
    def pipeline(root):
        """Beam pipeline that generates predictions from an AstroWavenet model."""
        # Read filenames of all checkpoints.
        checkpoint_state = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if not checkpoint_state:
            raise ValueError("Failed to load checkpoint state from {}".format(
                FLAGS.model_dir))
        checkpoint_paths = [
            os.path.join(FLAGS.model_dir, base_name)
            for base_name in checkpoint_state.all_model_checkpoint_paths
        ]
        logging.info("Found %d checkpoints in %s", len(checkpoint_paths),
                     FLAGS.model_dir)

        # Read filenames of all input files.
        input_files = []
        for file_pattern in FLAGS.input_files.split(","):
            matches = tf.gfile.Glob(file_pattern)
            if not matches:
                raise ValueError(
                    "Found no files matching {}".format(file_pattern))
            logging.info("Reading from %d files matching %s", len(matches),
                         file_pattern)
            input_files.extend(matches)

        # Parse model configs.
        config = configdict.ConfigDict(
            configurations.get_config(FLAGS.config_name))
        config_overrides = json.loads(FLAGS.config_overrides)
        for key in config_overrides:
            if key not in ["dataset", "hparams"]:
                raise ValueError(
                    "Unrecognized config override: {}".format(key))
        config.hparams.update(config_overrides.get("hparams", {}))

        # Create output directory.
        if not tf.gfile.Exists(FLAGS.output_dir):
            tf.gfile.MakeDirs(FLAGS.output_dir)

        # Initialize DoFns.
        make_predictions = prediction_fns.MakePredictionsDoFn(
            config.hparams, config_overrides.get("dataset"))

        # Create pipeline.
        predictions = (root
                       | beam.Create(
                           itertools.product(checkpoint_paths, input_files))
                       | "make_predictions" >> beam.ParDo(make_predictions))
        predictions_per_example = (
            predictions
            | "key_by_example_id" >> beam.Map(key_by("example_id"))
            | "group_by_example_id" >> beam.GroupByKey())
        predictions_per_step = (
            predictions
            | "key_by_global_step" >> beam.Map(key_by("global_step"))
            | "group_by_global_step" >> beam.GroupByKey())

        # pylint: disable=expression-not-assigned
        if FLAGS.save_losses_per_step:
            save_losses = prediction_fns.SaveLossesDoFn(
                os.path.join(FLAGS.output_dir, "losses"))
            predictions_per_step | "save_losses" >> beam.ParDo(save_losses)
        if FLAGS.save_plots_per_step:
            make_plots = visualize_fns.MakePredictionPlotDoFn(
                os.path.join(FLAGS.output_dir, "prediction_plots"))
            predictions | "make_plots" >> beam.ParDo(make_plots)
        if FLAGS.save_all_predictions:
            save_predictions = prediction_fns.SavePredictionsDoFn(
                os.path.join(FLAGS.output_dir, "predictions"))
            (predictions_per_example
             | "save_predictions" >> beam.ParDo(save_predictions))
        if FLAGS.save_animations:
            make_animations = visualize_fns.MakeAnimationDoFn(
                os.path.join(FLAGS.output_dir, "animations"))
            (predictions_per_example
             | "make_animations" >> beam.ParDo(make_animations))
Example #15
  def pipeline(root):
    """Beam pipeline for running transit searches with Box Least Squares."""
    # Parse config.
    config = configdict.ConfigDict(config_util.parse_json(FLAGS.config_json))

    # Choose periods.
    period_min = config.period_min
    period_max = config.period_max
    period_sampling_args = config.period_sampling_args or {}
    if config.period_sampling_method == "andrew":
      choose_periods = _choose_periods_andrew
    elif config.period_sampling_method == "uniform_frequency":
      choose_periods = _choose_periods_uniform_freq
    elif config.period_sampling_method == "logarithmic":
      choose_periods = np.geomspace
    elif config.period_sampling_method == "uniform_period":
      choose_periods = np.linspace
    else:
      raise ValueError("Unrecognized period_sampling_method: {}".format(
          config.period_sampling_method))

    all_periods = choose_periods(period_min, period_max, **period_sampling_args)

    # Choose nbins.
    nbins_args = config.nbins_args or {}
    all_nbins = []
    for period in all_periods:
      if config.nbins_method == "andrew":
        all_nbins.append(_choose_nbins_andrew(period, **nbins_args))
      elif config.nbins_method == "constant":
        all_nbins.append(nbins_args["num"])
      else:
        raise ValueError("Unrecognized nbins_method: {}".format(
            config.nbins_method))

    # Write the config.
    config_json = config.to_json(indent=2)
    root | beam.Create([config_json]) | "write_config" >> beam.io.WriteToText(
        os.path.join(FLAGS.output_dir, "config.json"),
        num_shards=1,
        shard_name_template="")

    # Initialize DoFns.
    # TODO(shallue): I think I can pass these as kwargs into ParDo.
    read_light_curve = light_curve_fns.ReadLightCurveDoFn(
        FLAGS.kepler_data_dir,
        injected_group=config.injected_group,
        scramble_type=config.scramble_type,
        invert_light_curves=config.invert_light_curves)

    # process_light_curve_for_astronet = light_curve_fns.ProcessLightCurveDoFn(
    #     gap_width=config.predictions.gap_width,
    #     normalize_method=config.predictions.normalize_method,
    #     normalize_args=config.predictions.normalize_args,
    #     upward_outlier_sigma_cut=config.predictions.upward_outlier_sigma_cut,
    #     output_name="light_curve_for_predictions")

    generate_periodogram = bls_fns.GeneratePeriodogramDoFn(
        all_periods, all_nbins, config.weight_min_factor,
        config.duration_density_min, config.duration_min_days,
        config.duration_density_max, config.duration_min_fraction)

    compute_top_results = bls_fns.TopResultsDoFn(config.score_methods,
                                                 config.ignore_negative_depth)

    get_top_result = bls_fns.GetTopResultDoFn(config.top_detection_score_method)

    fit_transit_params = transit_fns.FitTransitParametersDoFn()

    count_transits = transit_fns.CountTransitsDoFn(
        config.complete_transit_fraction)

    # make_predictions = prediction_fns.MakePredictionsDoFn(
    #     FLAGS.astronet_model, FLAGS.astronet_config_name,
    #     FLAGS.astronet_config_json, FLAGS.astronet_model_dir)

    postprocess_for_next_detection = bls_fns.PostProcessForNextDetectionDoFn(
        score_threshold=config.top_detection_score_threshold)

    # Read Kepler IDs.
    # Output: PCollection({"kepler_id"})
    kep_ids = (
        root
        | "read_kep_ids" >> beam.io.textio.ReadFromText(
            FLAGS.input_path, coder=kepler_id.KeplerIdCoder())
        | "create_input_dicts" >>
        beam.Map(lambda kep_id: {"kepler_id": kep_id.value}))

    # Read light curves.
    # Input: PCollection({"kepler_id"})
    # Output: PCollection({"kepler_id", "raw_light_curve"})
    raw_light_curves = (
        kep_ids
        | "read_light_curves" >> beam.ParDo(read_light_curve))
    # | "process_light_curve_for_astronet" >>
    # beam.ParDo(process_light_curve_for_astronet))

    if FLAGS.save_intermediate_output:
      _write_output(
          raw_light_curves,
          output_name="raw-light-curves",
          value_name="raw_light_curve",
          value_coder=beam.coders.ProtoCoder(light_curve_pb2.RawLightCurve))

    # csv_lines = []
    for planet_num in range(config.max_detections):
      if planet_num > config.clip_downward_outliers_after_planet_num:
        downward_outlier_sigma_cut = config.downward_outlier_sigma_cut
      else:
        downward_outlier_sigma_cut = None

      process_light_curve = light_curve_fns.ProcessLightCurveDoFn(
          gap_width=config.gap_width,
          normalize_method=config.normalize_method,
          normalize_args=config.normalize_args,
          upward_outlier_sigma_cut=config.upward_outlier_sigma_cut,
          downward_outlier_sigma_cut=downward_outlier_sigma_cut,
          remove_events_width_factor=config.remove_events_width_factor)

      # Process light curves.
      # Input: PCollection({
      #   "kepler_id",
      #   "raw_light_curve",
      #   "events_to_remove",  (optional)
      #  })
      # Output: PCollection({
      #   "kepler_id",
      #   "raw_light_curve",
      #   "light_curve",
      # })
      light_curves = (
          raw_light_curves | "process_light_curves-%d" % planet_num >>
          beam.ParDo(process_light_curve))

      # Generate periodograms.
      # Input: PCollection({
      #   "kepler_id",
      #   "raw_light_curve",
      #   "light_curve",
      #  })
      # Output: PCollection({
      #   "kepler_id",
      #   "raw_light_curve",
      #   "light_curve",
      #   "periodogram",
      # })
      periodograms = (
          light_curves | "generate_periodogram-%d" % planet_num >>
          beam.ParDo(generate_periodogram))

      # Compute top results.
      # Input: PCollection({
      #   "kepler_id",
      #   "raw_light_curve",
      #   "light_curve",
      #   "periodogram",
      # })
      # Output: PCollection({
      #   "kepler_id",
      #   "raw_light_curve",
      #   "light_curve",
      #   "periodogram",
      #   "top_results",
      #   "top_result",
      # })
      top_results = (
          periodograms
          | "compute_top_results-%d" % planet_num >>
          beam.ParDo(compute_top_results)
          | "get_top_result-%d" % planet_num >> beam.ParDo(get_top_result)
          | "count_transits-%d" % planet_num >> beam.ParDo(count_transits)
          | "fit_transit_params-%d" % planet_num >>
          beam.ParDo(fit_transit_params))
      # | "make_predictions-%d" % planet_num >> beam.ParDo(make_predictions))

      # csv_lines.append(top_results
      #                 | "extract_csv_%d" % planet_num >> beam.ParDo(
      #                     prediction_fns.ToCsvDoFn(planet_num=planet_num)))

      # Write the full list of top results for this detection.
      _write_output(
          top_results,
          output_name="top-results-%d" % planet_num,
          value_name="top_results",
          value_coder=beam.coders.ProtoCoder(bls_pb2.TopResults))
      # Write the single top result with its fitted transit parameters.
      _write_output(
          top_results,
          output_name="scored-result-with-transit-fit-%d" % planet_num,
          value_name="top_result",
          value_coder=beam.coders.ProtoCoder(bls_pb2.ScoredResult))
      if FLAGS.save_intermediate_output:
        _write_output(
            light_curves,
            output_name="light-curves-%d" % planet_num,
            value_name="light_curve",
            value_coder=beam.coders.ProtoCoder(light_curve_pb2.LightCurve))
        _write_output(
            periodograms,
            output_name="periodograms-%d" % planet_num,
            value_name="periodogram",
            value_coder=beam.coders.ProtoCoder(bls_pb2.Periodogram))

      # Process light curves for the next round.
      if planet_num < config.max_detections - 1:
        # Extract detected events.
        # Input: PCollection({
        #   "kepler_id",
        #   "raw_light_curve",
        #   "light_curve",
        #   "periodogram",
        #   "top_results",
        # })
        # Output: PCollection({
        #   "kepler_id",
        #   "raw_light_curve",
        #   "events_to_remove",
        # })
        raw_light_curves = (
            top_results
            | "postprocess-%d" % planet_num >>
            beam.ParDo(postprocess_for_next_detection))
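
The loop above reassigns raw_light_curves on each pass, so every subsequent detection round runs on light curves with the previously detected events flagged for removal. The _write_output helper used throughout is defined elsewhere in the pipeline module and is not shown in this excerpt; the sketch below is only an illustration of what such a helper might look like, assuming it extracts one value per element and writes serialized protos to TFRecord files (FLAGS.output_dir is an assumed flag, not taken from the original):

import os

import apache_beam as beam


def _write_output(pcollection, output_name, value_name, value_coder):
  # Sketch only: pull `value_name` out of each element dict and write the
  # resulting protos as sharded TFRecord files under an assumed output dir.
  return (
      pcollection
      | "extract_%s" % output_name >> beam.Map(
          lambda inputs: inputs[value_name])
      | "write_%s" % output_name >> beam.io.tfrecordio.WriteToTFRecord(
          os.path.join(FLAGS.output_dir, output_name), coder=value_coder))
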
Example #16
0
    def test_output_normal_mixture(self):
        time_series_length = 6
        input_num_features = 2
        context_num_features = 7

        input_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, input_num_features],
            name="input")
        context_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, time_series_length, context_num_features],
            name="context")
        features = {
            "autoregressive_input": input_placeholder,
            "conditioning_stack": context_placeholder
        }
        mode = tf.estimator.ModeKeys.TRAIN
        hparams = configdict.ConfigDict({
            "dilation_kernel_width": 2,
            "skip_output_dim": 6,
            "preprocess_output_size": 3,
            "preprocess_kernel_width": 5,
            "num_residual_blocks": 2,
            "dilation_rates": [1, 2, 4],
            "output_distribution": {
                "type": "normal",
                "min_scale": 0,
                "predict_outlier_distribution": True
            }
        })

        model = astrowavenet_model.AstroWaveNet(features, hparams, mode)
        model.build()

        # Model predicts the loc and scale of the outlier and non-outlier Gaussian
        # distributions, and the probability of being an outlier.
        self.assertItemsEqual(
            ["loc", "scale", "outlier_prob", "outlier_loc", "outlier_scale"],
            model.dist_params.keys())
        self.assertShapeEquals((None, time_series_length, input_num_features),
                               model.dist_params["loc"])
        self.assertShapeEquals((None, time_series_length, input_num_features),
                               model.dist_params["scale"])
        self.assertShapeEquals((2, ), model.dist_params["outlier_prob"])
        self.assertShapeEquals((2, ), model.dist_params["outlier_loc"])
        self.assertShapeEquals((2, ), model.dist_params["outlier_scale"])

        scaffold = tf.train.Scaffold()
        scaffold.finalize()
        with self.cached_session() as sess:
            sess.run([scaffold.init_op, scaffold.local_init_op])
            step = sess.run(model.global_step)
            self.assertEqual(0, step)

            feed_dict = {
                input_placeholder: [
                    [[1, 9], [1, 9], [1, 9], [1, 9], [1, 9], [1, 9]],
                    [[2, 8], [2, 8], [2, 8], [2, 8], [2, 8], [2, 8]],
                ],
                # Context is not needed since we explicitly feed the dist params.
                model.dist_params["loc"]: [
                    [[1, 8], [1, 8], [1, 8], [1, 8], [1, 8], [1, 8]],
                    [[2, 9], [2, 9], [2, 9], [2, 9], [2, 9], [2, 9]],
                ],
                model.dist_params["scale"]: [
                    [[0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [1, 1], [2, 2],
                     [5, 5]],
                    [[0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [1, 1], [2, 2],
                     [5, 5]],
                ],
                model.dist_params["outlier_prob"]: [0, 0],
                model.dist_params["outlier_loc"]: [1, 8],
                model.dist_params["outlier_scale"]: [1, 0.1],
            }
            batch_losses, per_example_loss, num_examples, total_loss = sess.run(
                [
                    model.batch_losses, model.per_example_loss,
                    model.num_nonzero_weight_examples, model.total_loss
                ],
                feed_dict=feed_dict)

            # Outlier probability is 0.0, so predictions are from the non-outlier
            # distribution.
            np.testing.assert_array_almost_equal(
                [[[-1.38364656, 48.61635344], [-0.69049938, 11.80950062],
                  [0.22579135, 2.22579135], [0.91893853, 1.41893853],
                  [1.61208571, 1.73708571], [2.52837645, 2.54837645]],
                 [[-1.38364656, 48.61635344], [-0.69049938, 11.80950062],
                  [0.22579135, 2.22579135], [0.91893853, 1.41893853],
                  [1.61208571, 1.73708571], [2.52837645, 2.54837645]]],
                batch_losses)
            np.testing.assert_array_almost_equal([5.96392435, 5.96392435],
                                                 per_example_loss)
            np.testing.assert_almost_equal(2, num_examples)
            np.testing.assert_almost_equal(5.96392435, total_loss)

            # Outlier probability is 1.0, so predictions are from the outlier
            # distribution.
            feed_dict[model.dist_params["outlier_prob"]] = [1, 1]
            batch_losses, per_example_loss, num_examples, total_loss = sess.run(
                [
                    model.batch_losses, model.per_example_loss,
                    model.num_nonzero_weight_examples, model.total_loss
                ],
                feed_dict=feed_dict)
            np.testing.assert_array_almost_equal(
                [[[0.918939, 48.616352]] * 6, [[1.418939, -1.383647]] * 6],
                batch_losses)
            np.testing.assert_array_almost_equal([24.7676468, 0.017645916],
                                                 per_example_loss, 5)
            np.testing.assert_almost_equal(2, num_examples)
            np.testing.assert_almost_equal(12.392646358, total_loss, decimal=6)

            # Predictions are a weighted mixture of the non-outlier and
            # outlier distributions.
            feed_dict[model.dist_params["outlier_prob"]] = [0.3, 0.5]
            batch_losses, per_example_loss, num_examples, total_loss = sess.run(
                [
                    model.batch_losses, model.per_example_loss,
                    model.num_nonzero_weight_examples, model.total_loss
                ],
                feed_dict=feed_dict)
            np.testing.assert_array_almost_equal(
                [[[-1.06893575, 48.61635208], [-0.41606259, 12.5026474],
                  [0.38831028, 2.91893864], [0.91893858, 2.11208582],
                  [1.34972155, 2.430233], [1.73991919, 3.24152374]],
                 [[-1.05263364, -0.69049942], [-0.38450652, -0.69050133],
                  [0.46027452, -0.71720666], [1.04454803, -0.74938428],
                  [1.55012715, -0.73367846], [2.05226898, -0.70991373]]],
                batch_losses)
            np.testing.assert_array_almost_equal([6.227806, -0.051759],
                                                 per_example_loss)
            np.testing.assert_almost_equal(2, num_examples)
            np.testing.assert_almost_equal(3.0880234, total_loss, decimal=6)
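
The expected values in this test can be checked by hand. Assuming the per-element loss is the negative log-likelihood of the outlier-mixed Gaussian, -log(p_outlier * N(x; outlier_loc, outlier_scale) + (1 - p_outlier) * N(x; loc, scale)), the standalone snippet below (not part of the original test) reproduces the first-time-step values asserted above:

import numpy as np
from scipy import stats


def mixture_nll(x, loc, scale, p_out, out_loc, out_scale):
    # Negative log-likelihood of the assumed two-component Gaussian mixture.
    pdf_in = stats.norm.pdf(x, loc=loc, scale=scale)
    pdf_out = stats.norm.pdf(x, loc=out_loc, scale=out_scale)
    return -np.log(p_out * pdf_out + (1.0 - p_out) * pdf_in)


# outlier_prob = 0: loss comes from the non-outlier distribution only.
print(mixture_nll(1, loc=1, scale=0.1, p_out=0.0, out_loc=1, out_scale=1))
# ~ -1.3836466, matching batch_losses[0][0][0].
print(mixture_nll(9, loc=8, scale=0.1, p_out=0.0, out_loc=8, out_scale=0.1))
# ~ 48.6163534, matching batch_losses[0][0][1].

# outlier_prob = 1: loss comes from the outlier distribution only.
print(mixture_nll(1, loc=1, scale=0.1, p_out=1.0, out_loc=1, out_scale=1))
# ~ 0.9189385, matching the [0.918939, ...] entries.

# outlier_prob = 0.3: weighted mixture of the two densities.
print(mixture_nll(1, loc=1, scale=0.1, p_out=0.3, out_loc=1, out_scale=1))
# ~ -1.0689, matching -1.06893575.
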