def _write_examples(pcollection):
  """Convenience function for writing serialized TensorFlow examples."""
  return utils.write_to_tfrecord(
      pcollection,
      output_dir=FLAGS.output_dir,
      output_name="examples",
      value_name="serialized_example")

def _write_output(pcollection, output_name, value_name, value_coder):
  """Convenience function for writing the output."""
  return utils.write_to_tfrecord(
      pcollection,
      output_dir=FLAGS.output_dir,
      output_name=output_name,
      value_name=value_name,
      value_coder=value_coder,
      num_shards=FLAGS.num_output_shards)

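# Usage sketch for the helpers above (illustrative, not from this module:
# the upstream PCollection and the dict layout expected by
# utils.write_to_tfrecord are assumptions inferred from the value_name
# arguments):
#
#   serialized = examples | "serialize" >> beam.Map(
#       lambda ex: {"serialized_example": ex.SerializeToString()})
#   _write_examples(serialized)
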
def _write_subset(dataset_name, name, values):
  """Writes the tf.Examples in a subset to TFRecord files."""
  if name == "train":
    num_shards = FLAGS.num_shards_train
  elif name == "val":
    num_shards = FLAGS.num_shards_val
  elif name == "test":
    num_shards = FLAGS.num_shards_test
  else:
    raise ValueError("Unrecognized subset name: {}".format(name))

  # Write the tf.Examples in TFRecord format.
  utils.write_to_tfrecord(
      values,
      output_dir=os.path.join(FLAGS.output_dir, dataset_name),
      output_name=name,
      value_name="example",
      value_coder=beam.coders.ProtoCoder(tf.train.Example),
      num_shards=num_shards,
      stage_name_suffix=dataset_name)

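# Read-back sketch for the TFRecord files written above (illustrative; the
# "<name>-*" glob assumes Beam's default "-SSSSS-of-NNNNN" shard suffix):
#
#   filenames = tf.gfile.Glob(
#       os.path.join(FLAGS.output_dir, dataset_name, name + "-*"))
#   for record in tf.python_io.tf_record_iterator(filenames[0]):
#     example = tf.train.Example.FromString(record)
#     break  # Inspect the first example only.
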
def pipeline(root):
  """Beam pipeline for preprocessing Kepler events."""
  # Write the config.
  config_json = json.dumps(config, indent=2)
  root | beam.Create([config_json]) | "write_config" >> beam.io.WriteToText(
      os.path.join(FLAGS.output_dir, "config.json"),
      num_shards=1,
      shard_name_template="")

  # Read input events table.
  events = _read_events(config)

  # Initialize DoFns.
  read_light_curve = light_curve_fns.ReadLightCurveDoFn(
      config.kepler_data_dir,
      injected_group=config.injected_group,
      scramble_type=config.scramble_type,
      invert=config.invert_light_curves)
  process_light_curve = light_curve_fns.ProcessLightCurveDoFn(
      gap_width=config.gap_width,
      normalize_method=config.normalize_method,
      normalize_args=config.normalize_args,
      upward_outlier_sigma_cut=config.upward_outlier_sigma_cut,
      remove_events_width_factor=config.remove_events_width_factor)
  generate_example = GenerateExampleDoFn()
  partition_fn = utils.TrainValTestPartitionFn(
      key_name="tce_id",
      partitions={
          "train": 0.8,
          "val": 0.1,
          "test": 0.1,
      },
      keys=events.tce_id.values)

  # Create pipeline.
  pipeline_inputs = _prepare_pipeline_inputs(events, config)
  results = (
      root
      | "create_pcollection" >> beam.Create(pipeline_inputs)
      | "read_light_curves" >> beam.ParDo(read_light_curve)
      | "process_light_curves" >> beam.ParDo(process_light_curve)
      | "generate_examples" >> beam.ParDo(generate_example)
      | "reshuffle" >> beam.Reshuffle()
      | "partition_results" >> beam.Partition(partition_fn,
                                              partition_fn.num_partitions))

  for name, subset in zip(partition_fn.partition_names, results):
    if name == "train":
      num_shards = FLAGS.num_shards_train
    elif name == "val":
      num_shards = FLAGS.num_shards_val
    elif name == "test":
      num_shards = FLAGS.num_shards_test
    else:
      raise ValueError("Unrecognized subset name: %s" % name)

    # Write the tf.Examples in TFRecord format.
    utils.write_to_tfrecord(
        subset,
        output_dir=FLAGS.output_dir,
        output_name=name,
        value_name="example",
        value_coder=beam.coders.ProtoCoder(tf.train.Example),
        num_shards=num_shards)

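# A minimal launch sketch for the pipeline above (an assumption, not this
# module's actual entry point; flag parsing and runner options are omitted).
# beam.Pipeline() defaults to the local DirectRunner, and exiting the
# context manager runs the pipeline.
def _run_pipeline_example():
  """Runs the preprocessing pipeline locally for smoke testing."""
  with beam.Pipeline() as root:
    pipeline(root)
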
def pipeline(root):
  """Beam pipeline for preprocessing Kepler events."""
  if not FLAGS.input_kepid_file:
    raise ValueError("--input_kepid_file is required")
  if not FLAGS.kepler_data_dir:
    raise ValueError("--kepler_data_dir is required")
  if not FLAGS.output_dir:
    raise ValueError("--output_dir is required")

  # Write the config.
  config_json = json.dumps(config, indent=2)
  root | beam.Create([config_json]) | "write_config" >> beam.io.WriteToText(
      os.path.join(FLAGS.output_dir, "config.json"),
      num_shards=1,
      shard_name_template="")

  # Read input Kepler ids.
  with tf.gfile.Open(config.input_kepid_file) as f:
    kep_ids = [int(line.strip()) for line in f]
  logging.info("Read %d Kepler ids from %s", len(kep_ids),
               config.input_kepid_file)

  # Initialize DoFns.
  process_fn = process_light_curve.ProcessLightCurveDoFn(
      config.kepler_data_dir,
      flux_column=config.flux_column,
      injected_group=config.injected_group,
      scramble_type=config.scramble_type,
      invert_light_curves=config.invert_light_curves,
      upward_outlier_clipping=config.upward_outlier_clipping,
      downward_outlier_clipping=config.downward_outlier_clipping,
      clip_lowest_n_values=config.clip_lowest_n_values,
      normalize_stddev=config.normalize_stddev)
  partition_fn = utils.TrainValTestPartitionFn(
      key_name="kepler_id",
      partitions={
          "train": 0.8,
          "val": 0.1,
          "test": 0.1,
      },
      keys=kep_ids)

  # Create pipeline.
  inputs = [{"kepler_id": kep_id} for kep_id in kep_ids]
  results = (
      root
      | "create_pcollection" >> beam.Create(inputs)
      | "process_light_curves" >> beam.ParDo(process_fn)
      | "reshuffle" >> beam.Reshuffle()
      | "partition_results" >> beam.Partition(partition_fn,
                                              partition_fn.num_partitions))

  # Write the outputs in TFRecord format.
  for name, subset in zip(partition_fn.partition_names, results):
    if name == "train":
      num_shards = FLAGS.num_shards_train
    elif name == "val":
      num_shards = FLAGS.num_shards_val
    elif name == "test":
      num_shards = FLAGS.num_shards_test
    else:
      raise ValueError("Unrecognized subset name: {}".format(name))

    utils.write_to_tfrecord(
        subset,
        key="example",
        output_dir=FLAGS.output_dir,
        output_name=name,
        coder=beam.coders.ProtoCoder(tf.train.Example),
        num_shards=num_shards)

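# Illustrative stand-in for utils.TrainValTestPartitionFn (a sketch of the
# idea only; the real implementation may differ). Each key is pre-assigned
# to a named partition according to the requested fractions, so every
# worker routes the same element to the same subset.
import random  # Needed only by the sketch below.


class _ExampleTrainValTestPartitionFn(beam.PartitionFn):
  """Maps elements to train/val/test partitions by a key field."""

  def __init__(self, key_name, partitions, keys):
    self.key_name = key_name
    self.partition_names = list(partitions.keys())
    self.num_partitions = len(self.partition_names)
    keys = list(keys)
    random.Random(42).shuffle(keys)  # Fixed seed for reproducibility.
    self._key_to_partition = {}
    start = 0
    for i, name in enumerate(self.partition_names):
      # The last partition absorbs any rounding remainder.
      end = (len(keys) if i == self.num_partitions - 1 else
             start + int(partitions[name] * len(keys)))
      for key in keys[start:end]:
        self._key_to_partition[key] = i
      start = end

  def partition_for(self, element, num_partitions):
    return self._key_to_partition[element[self.key_name]]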