Example #1
    def generate_datasets(self) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
        """
        Generates the training and validation datasets.

        :return: The training and validation datasets.
        """
        self.obtain_meta_data_frame_for_available_lightcurves()
        positive_example_paths = self.meta_data_frame['lightcurve_path'].tolist()
        # Remove duplicates from multi-planet targets.
        positive_example_paths = list(set(positive_example_paths))
        print(f'{len(positive_example_paths)} positive examples.')
        all_lightcurve_paths = list(
            map(str, self.lightcurve_directory.glob('**/*lc.fits')))
        negative_example_paths = list(
            set(all_lightcurve_paths) -
            set(self.meta_data_frame['lightcurve_path'].tolist()))
        print(f'{len(negative_example_paths)} negative examples.')
        positive_datasets = self.get_training_and_validation_datasets_for_file_paths(
            positive_example_paths)
        positive_training_dataset, positive_validation_dataset = positive_datasets
        negative_datasets = self.get_training_and_validation_datasets_for_file_paths(
            negative_example_paths)
        negative_training_dataset, negative_validation_dataset = negative_datasets
        training_dataset = self.get_ratio_enforced_dataset(
            positive_training_dataset,
            negative_training_dataset,
            positive_to_negative_data_ratio=1)
        validation_dataset = positive_validation_dataset.concatenate(
            negative_validation_dataset)
        if self.trial_directory is not None:
            self.log_dataset_file_names(training_dataset,
                                        dataset_name='training')
            self.log_dataset_file_names(validation_dataset,
                                        dataset_name='validation')
        # Shuffle with a buffer the size of the full dataset (note this iterates
        # the dataset once just to count its elements).
        training_dataset = training_dataset.shuffle(
            buffer_size=len(list(training_dataset)))
        training_dataset = map_py_function_to_dataset(
            training_dataset,
            self.training_preprocessing,
            number_of_parallel_calls=16,
            output_types=(tf.float32, tf.float32))
        training_dataset = training_dataset.padded_batch(
            self.batch_size,
            padded_shapes=([None, 2], [None])).prefetch(
                buffer_size=tf.data.experimental.AUTOTUNE)
        validation_dataset = map_py_function_to_dataset(
            validation_dataset,
            self.evaluation_preprocessing,
            number_of_parallel_calls=4,
            output_types=(tf.float32, tf.float32))
        validation_dataset = validation_dataset.padded_batch(
            1, padded_shapes=([None, 2], [None])).prefetch(
                buffer_size=tf.data.experimental.AUTOTUNE)
        return training_dataset, validation_dataset
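
For context, a minimal sketch of how the returned pair would be consumed, assuming a hypothetical `database` instance and Keras `model` (neither is part of the example above):

training_dataset, validation_dataset = database.generate_datasets()
model.fit(training_dataset, validation_data=validation_dataset, epochs=10)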
Example #2
    def generate_datasets(self) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
        """
        Generates the training and validation datasets for the database.

        :return: The training and validation datasets.
        """
        synthetic_signal_paths_dataset = self.paths_dataset_from_list_or_generator_factory(
            self.get_all_synthetic_signal_paths)
        lightcurve_paths_datasets = self.get_training_and_validation_datasets_for_file_paths(
            self.get_all_lightcurve_paths)
        training_lightcurve_paths_dataset, validation_lightcurve_paths_dataset = lightcurve_paths_datasets
        explicit_negative_lightcurve_paths = self.paths_dataset_from_list_or_generator_factory(
            self.get_explicit_negative_lightcurve_paths)
        shuffled_training_lightcurve_paths_dataset = training_lightcurve_paths_dataset.repeat().shuffle(
            buffer_size=self.shuffle_buffer_size)
        shuffled_synthetic_signal_paths_dataset = synthetic_signal_paths_dataset.repeat().shuffle(
            buffer_size=self.shuffle_buffer_size)
        shuffled_explicit_negative_lightcurve_paths_dataset = explicit_negative_lightcurve_paths.repeat().shuffle(
            buffer_size=self.shuffle_buffer_size)
        zipped_training_paths_dataset = tf.data.Dataset.zip(
            (shuffled_training_lightcurve_paths_dataset,
             shuffled_synthetic_signal_paths_dataset,
             shuffled_explicit_negative_lightcurve_paths_dataset))
        output_types = (tf.float32, tf.float32)
        output_shapes = [(self.time_steps_per_example, 1), (1, )]
        lightcurve_training_dataset = map_py_function_to_dataset(
            zipped_training_paths_dataset,
            self.positive_injection_negative_and_explicit_negative_preprocessing,
            self.number_of_parallel_processes_per_map,
            output_types=output_types,
            output_shapes=output_shapes,
            flat_map=True)
        batched_training_dataset = lightcurve_training_dataset.batch(
            self.batch_size)
        prefetch_training_dataset = batched_training_dataset.prefetch(
            tf.data.experimental.AUTOTUNE)
        shuffled_validation_lightcurve_paths_dataset = validation_lightcurve_paths_dataset.repeat().shuffle(
            buffer_size=self.shuffle_buffer_size)
        zipped_validation_paths_dataset = tf.data.Dataset.zip(
            (shuffled_validation_lightcurve_paths_dataset,
             shuffled_synthetic_signal_paths_dataset,
             shuffled_explicit_negative_lightcurve_paths_dataset))
        lightcurve_validation_dataset = map_py_function_to_dataset(
            zipped_validation_paths_dataset,
            self.positive_injection_negative_and_explicit_negative_preprocessing,
            self.number_of_parallel_processes_per_map,
            output_types=output_types,
            output_shapes=output_shapes,
            flat_map=True)
        batched_validation_dataset = lightcurve_validation_dataset.batch(
            self.batch_size)
        prefetch_validation_dataset = batched_validation_dataset.prefetch(
            tf.data.experimental.AUTOTUNE)
        return prefetch_training_dataset, prefetch_validation_dataset
Example #3
 def test_py_map_returns_specified_shape_when_shape_passed_in_wrapper(self):
     dataset = tf.data.Dataset.from_tensor_slices([[0, 0, 0], [1, 1, 1]])
     map_dataset = map_py_function_to_dataset(dataset=dataset,
                                              map_function=add_one,
                                              number_of_parallel_calls=4,
                                              output_shapes=(3, ))
     assert map_dataset.element_spec.shape == (3, )
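
The tests in this listing rely on an `add_one` map function that is not shown. A minimal definition consistent with the asserted outputs (an assumption, not the original helper) would be:

def add_one(element):
    """Toy map function for the tests: adds one to every value of the element."""
    return element + 1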
Example #4
    def generate_infer_path_and_light_curve_dataset(
            self, paths_dataset: tf.data.Dataset,
            load_times_fluxes_and_flux_errors_from_path_function: Callable[
                [Path], Tuple[np.ndarray, np.ndarray, Union[np.ndarray, None]]],
            load_auxiliary_information_for_path_function: Callable[[Path], np.ndarray]):
        """
        Generates a path and light curve dataset from a paths dataset using a passed function defining
        how to load the values from the light curve file.

        :param paths_dataset: The dataset of paths to use.
        :param load_times_fluxes_and_flux_errors_from_path_function: The function defining how to load the times and
                                                                     fluxes of a light curve from a path.
        :param load_auxiliary_information_for_path_function: The function defining how to load the auxiliary
                                                             information values for a light curve from a path.
        :return: The resulting path and light curve dataset.
        """
        preprocess_map_function = partial(self.preprocess_infer_light_curve,
                                          load_times_fluxes_and_flux_errors_from_path_function,
                                          load_auxiliary_information_for_path_function)
        if self.number_of_auxiliary_values == 0:
            output_types = (tf.string, tf.float32)
            output_shapes = [(), (self.time_steps_per_example, self.number_of_input_channels)]
        else:
            output_types = (tf.string, tf.float32, tf.float32)
            output_shapes = [(), (self.time_steps_per_example, self.number_of_input_channels),
                             (self.number_of_auxiliary_values,)]
        example_and_label_dataset = map_py_function_to_dataset(paths_dataset,
                                                               preprocess_map_function,
                                                               self.number_of_parallel_processes_per_map,
                                                               output_types=output_types,
                                                               output_shapes=output_shapes)
        return example_and_label_dataset
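
A sketch of consuming the resulting dataset when `number_of_auxiliary_values` is 0, in which case each element pairs the path string with the preprocessed light curve (the loop below is an illustration, not library code):

for path_tensor, light_curve_tensor in example_and_label_dataset:
    light_curve_path = path_tensor.numpy().decode('utf-8')
    # `light_curve_tensor` has shape (time_steps_per_example, number_of_input_channels).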
Example #5
 def test_single_function_wrapper(self, dataset):
     mapped_dataset = map_py_function_to_dataset(dataset=dataset,
                                                 map_function=add_one,
                                                 number_of_parallel_calls=4,
                                                 output_types=tf.float32)
     batch_dataset = mapped_dataset.batch(batch_size=4)
     batch = next(iter(batch_dataset))
     batch_array = batch.numpy()
     assert np.array_equal(batch_array, np.array([1, 11, 21, 31]))
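
The `dataset` fixture used by this test is not shown. A definition consistent with the asserted batch (an assumption, not the original fixture) would be:

 @pytest.fixture
 def dataset(self) -> tf.data.Dataset:
     return tf.data.Dataset.from_tensor_slices([0, 10, 20, 30])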
Example #6
 def test_flat_map_can_be_used_from_single_function_wrapper(self):
     dataset = tf.data.Dataset.from_tensor_slices([[[0, 0], [10, 10]],
                                                   [[20, 20], [30, 30]]])
     mapped_dataset = map_py_function_to_dataset(dataset=dataset,
                                                 map_function=add_one,
                                                 number_of_parallel_calls=4,
                                                 output_types=tf.float32,
                                                 flat_map=True)
     batch_dataset = mapped_dataset.batch(batch_size=4)
     batch = next(iter(batch_dataset))
     batch_array = batch.numpy()
     assert np.array_equal(batch_array,
                           np.array([[1, 1], [11, 11], [21, 21], [31, 31]]))
Example #7
    def generate_injected_light_curve_and_label_dataset(
            self, injectee_paths_dataset: tf.data.Dataset,
            injectee_load_times_fluxes_and_flux_errors_from_path_function: Callable[
                [Path], Tuple[np.ndarray, np.ndarray, Union[np.ndarray, None]]],
            load_auxiliary_information_for_path_function: Callable[[Path], np.ndarray],
            injectable_paths_dataset: tf.data.Dataset,
            injectable_load_times_magnifications_and_magnification_errors_from_path_function: Callable[
                [Path], Tuple[np.ndarray, np.ndarray, Union[np.ndarray, None]]],
            load_label_from_path_function: Callable[[Path], Union[float, np.ndarray]], evaluation_mode: bool = False,
            name: Optional[str] = None):
        """
        Generates a light curve and label dataset from an injectee and injectable paths dataset, using passed functions
        defining how to load the values from the light curve files for each and the label value to use.

        :param injectee_paths_dataset: The dataset of paths to use for the injectee light curves.
        :param injectee_load_times_fluxes_and_flux_errors_from_path_function: The function defining how to load the
            times and fluxes of an injectee light curve from a path.
        :param load_auxiliary_information_for_path_function: The function defining how to load the auxiliary
            information values for an injectee light curve from a path.
        :param injectable_paths_dataset: The dataset of paths to use for the injectable light curves.
        :param injectable_load_times_magnifications_and_magnification_errors_from_path_function: The function defining
            how to load the times and magnifications of an injectable signal from a path.
        :param load_label_from_path_function: The function to load the label to use for the light curves in this dataset.
        :param evaluation_mode: Whether or not the preprocessing should occur in evaluation mode (for repeatability).
        :param name: The name of the dataset.
        :return: The resulting light curve example and label dataset.
        """
        preprocess_map_function = partial(
            self.preprocess_injected_light_curve,
            injectee_load_times_fluxes_and_flux_errors_from_path_function,
            load_auxiliary_information_for_path_function,
            injectable_load_times_magnifications_and_magnification_errors_from_path_function,
            load_label_from_path_function,
            evaluation_mode=evaluation_mode)
        preprocess_map_function = self.add_logging_queues_to_map_function(preprocess_map_function, name)
        if self.number_of_auxiliary_values == 0:
            output_types = (tf.float32, tf.float32)
            output_shapes = [(self.time_steps_per_example, self.number_of_input_channels),
                             (self.number_of_label_values,)]
        else:
            output_types = (tf.float32, tf.float32, tf.float32)
            output_shapes = [
                (self.time_steps_per_example, self.number_of_input_channels), (self.number_of_auxiliary_values,),
                (self.number_of_label_values,)]
        zipped_paths_dataset = tf.data.Dataset.zip((injectee_paths_dataset, injectable_paths_dataset))
        example_and_label_dataset = map_py_function_to_dataset(zipped_paths_dataset,
                                                               preprocess_map_function,
                                                               self.number_of_parallel_processes_per_map,
                                                               output_types=output_types,
                                                               output_shapes=output_shapes)
        return example_and_label_dataset
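
To illustrate the `tf.data.Dataset.zip` step above with toy values (the paths are made up for demonstration), each element of the zipped dataset pairs one injectee path with one injectable path:

injectee_paths = tf.data.Dataset.from_tensor_slices(['injectee_0.fits', 'injectee_1.fits'])
injectable_paths = tf.data.Dataset.from_tensor_slices(['injectable_0.fits', 'injectable_1.fits'])
zipped_paths = tf.data.Dataset.zip((injectee_paths, injectable_paths))
for injectee_path, injectable_path in zipped_paths:
    print(injectee_path.numpy(), injectable_path.numpy())
# Prints b'injectee_0.fits' b'injectable_0.fits', then b'injectee_1.fits' b'injectable_1.fits'.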
Example #8
 def test_flat_map_with_output_shapes_are_applied_in_the_correct_order(
         self):
     dataset = tf.data.Dataset.from_tensor_slices([[[0, 0], [10, 10]],
                                                   [[20, 20], [30, 30]]])
     mapped_dataset = map_py_function_to_dataset(dataset=dataset,
                                                 map_function=add_one,
                                                 number_of_parallel_calls=4,
                                                 output_types=tf.float32,
                                                 flat_map=True,
                                                 output_shapes=(2, ))
     batch_dataset = mapped_dataset.batch(batch_size=4)
     batch = next(iter(batch_dataset))
     batch_array = batch.numpy()
     assert np.array_equal(batch_array,
                           np.array([[1, 1], [11, 11], [21, 21], [31, 31]]))
Example #9
import datetime
import io
from pathlib import Path

import pandas as pd
import tensorflow as tf

# `get_latest_log_directory`, `SimpleLightcurveCnn`, and `map_py_function_to_dataset`
# are also imported from the ramjet package (exact module paths not shown in this excerpt).
from ramjet.photometric_database.toi_database import ToiDatabase

log_name = get_latest_log_directory(
    logs_directory='logs')  # Uses the latest model in the log directory.
# log_name = 'baseline YYYY-MM-DD-hh-mm-ss'  # Specify the path to the model to use.
saved_log_directory = Path(f'{log_name}')
datetime_string = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

print('Setting up dataset...', flush=True)
database = ToiDatabase()
example_paths = database.get_all_lightcurve_paths()
example_paths_dataset = database.paths_dataset_from_list_or_generator_factory(
    example_paths)
mapped_dataset = map_py_function_to_dataset(
    example_paths_dataset,
    database.infer_preprocessing,
    number_of_parallel_calls=database.number_of_parallel_processes_per_map,
    output_types=(tf.string, tf.float32))
batch_dataset = mapped_dataset.batch(database.batch_size).prefetch(5)

print('Loading model...', flush=True)
model = SimpleLightcurveCnn()
model.load_weights(str(
    saved_log_directory.joinpath('model.ckpt'))).expect_partial()

print('Inferring...', flush=True)
columns = ['Lightcurve path', 'Prediction']
dtypes = [str, int]
predictions_data_frame = pd.read_csv(io.StringIO(''),
                                     names=columns,
                                     dtype=dict(zip(columns, dtypes)))
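
The script is truncated here. A sketch of the inference loop that would plausibly follow, filling the empty data frame batch by batch (an assumption, not the original code):

for paths, examples in batch_dataset:
    predictions = model.predict(examples, verbose=0)
    batch_data_frame = pd.DataFrame({'Lightcurve path': [path.decode('utf-8') for path in paths.numpy()],
                                     'Prediction': predictions.flatten()})
    predictions_data_frame = pd.concat([predictions_data_frame, batch_data_frame],
                                       ignore_index=True)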