Example #1
def _write_examples(pcollection):
  """Convenience function for writing serialized TensorFlow examples."""
  return utils.write_to_tfrecord(
      pcollection,
      output_dir=FLAGS.output_dir,
      output_name="examples",
      value_name="serialized_example")
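
All of the examples on this page call `utils.write_to_tfrecord`, a project-specific helper whose source is not shown here. Below is a minimal sketch of what it plausibly does, built on Beam's real `beam.io.WriteToTFRecord` transform; the signature, stage naming, and defaults are assumptions inferred from the call sites (note that Example #5 uses a variant with `key=` and `coder=` parameters):

import os

import apache_beam as beam


def write_to_tfrecord(pcollection,
                      output_dir,
                      output_name,
                      value_name,
                      value_coder=beam.coders.BytesCoder(),
                      num_shards=0,
                      stage_name_suffix=""):
  """Sketch: extracts `value_name` from each element and writes TFRecords."""
  stage_name = "write_{}".format(output_name)
  if stage_name_suffix:
    stage_name += "-" + stage_name_suffix
  # num_shards=0 lets the runner choose the number of output shards.
  return (pcollection
          | stage_name + "-extract" >> beam.Map(lambda x: x[value_name])
          | stage_name >> beam.io.WriteToTFRecord(
              os.path.join(output_dir, output_name),
              coder=value_coder,
              num_shards=num_shards))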
Example #2
def _write_output(pcollection, output_name, value_name, value_coder):
  """Convenience function for writing the output."""
  return utils.write_to_tfrecord(
      pcollection,
      output_dir=FLAGS.output_dir,
      output_name=output_name,
      value_name=value_name,
      value_coder=value_coder,
      num_shards=FLAGS.num_output_shards)
Example #3
def _write_subset(dataset_name, name, values):
    """Writes the tf.Examples in a subset to TFRecord files."""
    if name == "train":
        num_shards = FLAGS.num_shards_train
    elif name == "val":
        num_shards = FLAGS.num_shards_val
    elif name == "test":
        num_shards = FLAGS.num_shards_test
    else:
        raise ValueError("Unrecognized subset name: {}".format(name))

    # Write the tf.Examples in TFRecord format.
    utils.write_to_tfrecord(
        values,
        output_dir=os.path.join(FLAGS.output_dir, dataset_name),
        output_name=name,
        value_name="example",
        value_coder=beam.coders.ProtoCoder(tf.train.Example),
        num_shards=num_shards,
        stage_name_suffix=dataset_name)
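
For reference, the records written by this example can be read back and parsed as `tf.train.Example` protos. A minimal sketch, assuming `beam.io.WriteToTFRecord`'s default shard name template and an eight-shard train subset (both assumptions; substitute your actual output path and shard count):

import os

import tensorflow as tf

# Hypothetical shard path; "-SSSSS-of-NNNNN" is the default shard name
# template of beam.io.WriteToTFRecord. Replace "output_dir" and
# "dataset_name" with your actual values.
path = os.path.join("output_dir", "dataset_name", "train-00000-of-00008")
for record in tf.compat.v1.io.tf_record_iterator(path):
    example = tf.train.Example.FromString(record)
    print(sorted(example.features.feature.keys()))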
Example #4
  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Write the config.
    config_json = json.dumps(config, indent=2)
    root | beam.Create([config_json]) | "write_config" >> beam.io.WriteToText(
        os.path.join(FLAGS.output_dir, "config.json"),
        num_shards=1,
        shard_name_template="")

    # Read input events table.
    events = _read_events(config)

    # Initialize DoFns.
    read_light_curve = light_curve_fns.ReadLightCurveDoFn(
        config.kepler_data_dir,
        injected_group=config.injected_group,
        scramble_type=config.scramble_type,
        invert=config.invert_light_curves)
    process_light_curve = light_curve_fns.ProcessLightCurveDoFn(
        gap_width=config.gap_width,
        normalize_method=config.normalize_method,
        normalize_args=config.normalize_args,
        upward_outlier_sigma_cut=config.upward_outlier_sigma_cut,
        remove_events_width_factor=config.remove_events_width_factor)
    generate_example = GenerateExampleDoFn()
    partition_fn = utils.TrainValTestPartitionFn(
        key_name="tce_id",
        partitions={
            "train": 0.8,
            "val": 0.1,
            "test": 0.1,
        },
        keys=events.tce_id.values)

    # Create pipeline.
    pipeline_inputs = _prepare_pipeline_inputs(events, config)
    results = (
        root
        | "create_pcollection" >> beam.Create(pipeline_inputs)
        | "read_light_curves" >> beam.ParDo(read_light_curve)
        | "process_light_curves" >> beam.ParDo(process_light_curve)
        | "generate_examples" >> beam.ParDo(generate_example)
        | "reshuffle" >> beam.Reshuffle()
        | "partition_results" >> beam.Partition(partition_fn,
                                                partition_fn.num_partitions))

    for name, subset in zip(partition_fn.partition_names, results):
      if name == "train":
        num_shards = FLAGS.num_shards_train
      elif name == "val":
        num_shards = FLAGS.num_shards_val
      elif name == "test":
        num_shards = FLAGS.num_shards_test
      else:
        raise ValueError("Unrecognized subset name: %s" % name)

      # Write the tf.Examples in TFRecord format.
      utils.write_to_tfrecord(
          subset,
          output_dir=FLAGS.output_dir,
          output_name=name,
          value_name="example",
          value_coder=beam.coders.ProtoCoder(tf.train.Example),
          num_shards=num_shards)
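
`utils.TrainValTestPartitionFn` is likewise project-specific. Judging from the call sites, it maps each key to a fixed subset so that an element's train/val/test assignment is deterministic. A minimal sketch using Beam's real `PartitionFn` base class; the slicing logic is an assumption (the actual class may, for example, shuffle the keys before slicing):

import apache_beam as beam


class TrainValTestPartitionFn(beam.PartitionFn):
  """Sketch: deterministically assigns each key to train, val, or test."""

  def __init__(self, key_name, partitions, keys):
    self.key_name = key_name
    self.partition_names = list(partitions)
    self.num_partitions = len(self.partition_names)
    # Slice the ordered keys into contiguous blocks of the requested
    # fractions; the final block absorbs any remainder from rounding.
    self._partition_of = {}
    start = 0
    for i, name in enumerate(self.partition_names):
      end = len(keys) if i == self.num_partitions - 1 else start + int(
          partitions[name] * len(keys))
      for key in keys[start:end]:
        self._partition_of[key] = i
      start = end

  def partition_for(self, element, num_partitions):
    return self._partition_of[element[self.key_name]]

Because the key-to-subset mapping is computed up front from the full key list, every worker makes the same assignment for a given element, so the split is stable across runs and retries.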
Example #5
    def pipeline(root):
        """Beam pipeline for preprocessing Kepler events."""
        if not FLAGS.input_kepid_file:
            raise ValueError("--input_kepid_file is required")
        if not FLAGS.kepler_data_dir:
            raise ValueError("--kepler_data_dir is required")
        if not FLAGS.output_dir:
            raise ValueError("--output_dir is required")

        # Write the config.
        config_json = json.dumps(config, indent=2)
        (root
         | beam.Create([config_json])
         | "write_config" >> beam.io.WriteToText(
             os.path.join(FLAGS.output_dir, "config.json"),
             num_shards=1,
             shard_name_template=""))

        # Read input Kepler ids.
        with tf.gfile.Open(config.input_kepid_file) as f:
            kep_ids = [int(line.strip()) for line in f]
        logging.info("Read %d Kepler ids from %s", len(kep_ids),
                     config.input_kepid_file)

        # Initialize DoFns.
        process_fn = process_light_curve.ProcessLightCurveDoFn(
            config.kepler_data_dir,
            flux_column=config.flux_column,
            injected_group=config.injected_group,
            scramble_type=config.scramble_type,
            invert_light_curves=config.invert_light_curves,
            upward_outlier_clipping=config.upward_outlier_clipping,
            downward_outlier_clipping=config.downward_outlier_clipping,
            clip_lowest_n_values=config.clip_lowest_n_values,
            normalize_stddev=config.normalize_stddev)
        partition_fn = utils.TrainValTestPartitionFn(
            key_name="kepler_id",
            partitions={
                "train": 0.8,
                "val": 0.1,
                "test": 0.1,
            },
            keys=kep_ids)

        # Create pipeline.
        inputs = [{"kepler_id": kep_id} for kep_id in kep_ids]
        results = (root
                   | "create_pcollection" >> beam.Create(inputs)
                   | "process_light_curves" >> beam.ParDo(process_fn)
                   | "reshuffle" >> beam.Reshuffle()
                   | "partition_results" >> beam.Partition(
                       partition_fn, partition_fn.num_partitions))

        # Write the outputs in TFRecord format.
        for name, subset in zip(partition_fn.partition_names, results):
            if name == "train":
                num_shards = FLAGS.num_shards_train
            elif name == "val":
                num_shards = FLAGS.num_shards_val
            elif name == "test":
                num_shards = FLAGS.num_shards_test
            else:
                raise ValueError("Unrecognized subset name: {}".format(name))

            utils.write_to_tfrecord(
                subset,
                key="example",
                output_dir=FLAGS.output_dir,
                output_name=name,
                coder=beam.coders.ProtoCoder(tf.train.Example),
                num_shards=num_shards)
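
None of the examples show how `pipeline` is invoked. Typically such a function is applied to the root of a Beam pipeline inside a `with` block; a minimal sketch (the runner and options below are assumptions, not taken from the examples):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def main(argv):
  del argv  # Unused.
  # Exiting the context runs the pipeline and waits for it to finish.
  with beam.Pipeline(options=PipelineOptions()) as root:
    pipeline(root)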