Example #1
def prepare_dataloader(
    dataframe: pd.DataFrame,
    target_datetimes: typing.List[datetime.datetime],
    stations: typing.Dict[typing.AnyStr, typing.Tuple[float, float, float]],
    target_time_offsets: typing.List[datetime.timedelta],
    config: typing.Dict[typing.AnyStr, typing.Any],
) -> tf.data.Dataset:
    """This function should be modified in order to prepare & return your own data loader.
    Note that you can use either the netCDF or HDF5 data. Each iteration over your data loader should return a
    2-element tuple containing the tensor that should be provided to the model as input, and the target values. In
    this specific case, you will not be able to provide the latter since the dataframe contains no GHI, and we are
    only interested in predictions, not training. Therefore, you must return a placeholder (or ``None``) as the second
    tuple element.
    Reminder: the dataframe contains imagery paths for every possible timestamp requested in ``target_datetimes``.
    However, we expect that you will use some of the "past" imagery (i.e. imagery at T<=0) for any T in
    ``target_datetimes``, but you should NEVER rely on "future" imagery to generate predictions (for T>0). We
    will be inspecting data loader implementations to ensure this is the case, and those who "cheat" will be
    dramatically penalized.
    See https://github.com/mila-iqia/ift6759/tree/master/projects/project1/evaluation.md for more information.
    Args:
        dataframe: a pandas dataframe that provides the netCDF file path (or HDF5 file path and offset) for all
            relevant timestamp values over the test period.
        target_datetimes: a list of timestamps that your data loader should use to provide imagery for your model.
            The ordering of this list is important, as each element corresponds to a sequence of GHI values
            to predict. By definition, the GHI values must be provided for the offsets given by ``target_time_offsets``
            which are added to each timestamp (T=0) in this datetimes list.
        stations: a map of station names of interest paired with their coordinates (latitude, longitude, elevation).
        target_time_offsets: the list of timedeltas to predict GHIs for (by definition: [T=0, T+1h, T+3h, T+6h]).
        config: configuration dictionary holding any extra parameters that might be required by the user. These
            parameters are loaded automatically if the user provided a JSON file in their submission. Submitting
            such a JSON file is completely optional, and this argument can be ignored if not needed.
    Returns:
        A ``tf.data.Dataset`` object that can be used to produce input tensors for your model. One tensor
        must correspond to one sequence of past imagery data. The tensors must be generated in the order given
    by ``target_datetimes``.
    """
    ################################## MODIFY BELOW ##################################
    # WE ARE PROVIDING YOU WITH A DUMMY DATA GENERATOR FOR DEMONSTRATION PURPOSES.
    # MODIFY EVERYTHING IN THIS BLOCK AS YOU SEE FIT

    from libs import helpers

    helpers.validate_user_config(config)

    data_loader = helpers.get_online_data_loader(
        user_config_dict=config,
        dataframe=dataframe,
        target_datetimes=target_datetimes,
        stations=stations,
        target_time_offsets=target_time_offsets,
        preprocessed_data_path=config['data_loader']['hyper_params']
        ['preprocessed_data_source']['test'])

    ################################### MODIFY ABOVE ##################################

    return data_loader
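
For reference, the contract in the docstring above (each iteration yields a 2-element tuple, with a placeholder target since the dataframe contains no GHI) can be satisfied by a minimal generator-based dataset. This is only a sketch: the frame count and image dimensions are made-up placeholders, not the real dataset's shapes.

import datetime
import typing

import numpy as np
import tensorflow as tf


def dummy_test_dataloader(
    target_datetimes: typing.List[datetime.datetime],
    target_time_offsets: typing.List[datetime.timedelta],
) -> tf.data.Dataset:
    """Minimal sketch: yields (past_imagery, placeholder_target) pairs."""

    def generator():
        for _ in target_datetimes:
            # Placeholder input: 5 past frames of 64x64 single-channel imagery (T<=0 only).
            past_imagery = np.zeros((5, 64, 64, 1), dtype=np.float32)
            # No GHI is available at test time, so the target is a zero placeholder.
            placeholder_target = np.zeros((len(target_time_offsets),), dtype=np.float32)
            yield past_imagery, placeholder_target

    return tf.data.Dataset.from_generator(
        generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=((5, 64, 64, 1), (len(target_time_offsets),)),
    )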
Example #2
def main(config_path: typing.AnyStr,
         tensorboard_tracking_folder: typing.AnyStr):
    """
    Train a model

    :param config_path: path to the JSON config file that follows configs/user/schema.json
    :param tensorboard_tracking_folder: path where to store TensorBoard data and save trained model
    """
    user_config_dict = helpers.load_dict(config_path)
    helpers.validate_user_config(user_config_dict)

    if tensorboard_tracking_folder is not None:
        tensorboard_tracking_folder = Path(tensorboard_tracking_folder)
        tensorboard_tracking_folder.mkdir(parents=True, exist_ok=True)

    train_models(config=user_config_dict,
                 tensorboard_tracking_folder=tensorboard_tracking_folder)
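
The config file passed here must validate against configs/user/schema.json, which is not reproduced in these examples. The keys actually read by the surrounding code suggest a dictionary shaped roughly as follows; the dotted module path and file paths are illustrative placeholders only.

user_config_dict = {
    "model": {
        "definition": {
            # Dotted import path of the model implementation (read in Example #6).
            "module": "libs.models.my_model",
        },
    },
    "data_loader": {
        "hyper_params": {
            # Read in Example #4 to decide whether to run the netCDF preloader.
            "should_preprocess_data": True,
            # Read in Examples #1, #3, and #4 to locate preprocessed data.
            "preprocessed_data_source": {
                "training": "/path/to/preprocessed/train",
                "validation": "/path/to/preprocessed/valid",
                "test": "/path/to/preprocessed/test",
            },
        },
    },
}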
Example #3
def main(training_config_path: typing.AnyStr,
         validation_config_path: typing.AnyStr,
         user_config_path: typing.AnyStr,
         tensorboard_tracking_folder: typing.AnyStr):
    """
    Train a model

    :param training_config_path: path to the JSON config file used to store training set parameters
    :param validation_config_path: path to the JSON config file used to store validation set parameters
    :param user_config_path: path to the JSON config file used to store user model, dataloader and trainer parameters
    :param tensorboard_tracking_folder: path where to store TensorBoard data and save trained model
    """
    training_config_dict = helpers.load_dict(training_config_path)
    validation_config_dict = helpers.load_dict(validation_config_path)
    user_config_dict = helpers.load_dict(user_config_path)

    helpers.validate_admin_config(training_config_dict)
    helpers.validate_admin_config(validation_config_dict)
    helpers.validate_user_config(user_config_dict)

    training_source = user_config_dict['data_loader']['hyper_params'][
        'preprocessed_data_source']['training']
    validation_source = user_config_dict['data_loader']['hyper_params'][
        'preprocessed_data_source']['validation']

    training_data_loader = helpers.get_online_data_loader(
        user_config_dict,
        training_config_dict,
        preprocessed_data_path=training_source)
    validation_data_loader = helpers.get_online_data_loader(
        user_config_dict,
        validation_config_dict,
        preprocessed_data_path=validation_source)

    print("Eager mode", tf.executing_eagerly())

    mirrored_strategy = helpers.get_mirrored_strategy()

    train_models(user_config_dict=user_config_dict,
                 training_config_dict=training_config_dict,
                 training_data_loader=training_data_loader,
                 validation_data_loader=validation_data_loader,
                 tensorboard_tracking_folder=tensorboard_tracking_folder,
                 mirrored_strategy=mirrored_strategy)
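
helpers.get_mirrored_strategy is not shown in these examples. Based on how its result is consumed (callers check it for None and for num_replicas_in_sync > 1, as in Examples #5 and #6), a plausible sketch is:

import tensorflow as tf


def get_mirrored_strategy():
    """Plausible sketch only: distribute across GPUs when more than one is visible."""
    gpus = tf.config.list_physical_devices('GPU')
    if len(gpus) > 1:
        # MirroredStrategy replicates the model on every visible GPU and
        # aggregates gradients across them.
        return tf.distribute.MirroredStrategy()
    # Single-GPU or CPU-only runs skip the distribution machinery entirely.
    return None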
Example #4
def generate_all_predictions(
    target_stations: typing.Dict[typing.AnyStr, typing.Tuple[float, float,
                                                             float]],
    target_datetimes: typing.List[datetime.datetime],
    target_time_offsets: typing.List[datetime.timedelta],
    dataframe: pd.DataFrame,
    user_config: typing.Dict[typing.AnyStr, typing.Any],
) -> np.ndarray:
    """Generates and returns model predictions g<iven the data prepared by a data loader."""
    # we will create one data loader per station to make sure we avoid mixups in predictions
    predictions = []
    ################################### TEAM 3's REQUIRED EDIT ##################################
    # Justification:
    # We need to preprocess all stations up front to satisfy the 30-minute evaluation restriction
    from tools.netcdf_crop import netcdf_preloader
    from libs import helpers

    helpers.validate_user_config(user_config)
    if user_config['data_loader']['hyper_params']['should_preprocess_data']:
        netcdf_preloader(dataframe=dataframe,
                         target_datetimes=target_datetimes,
                         stations=target_stations,
                         path_output=user_config['data_loader']['hyper_params']
                         ['preprocessed_data_source']['test'])
    ################################### TEAM 3's REQUIRED EDIT ##################################
    for station_idx, station_name in enumerate(target_stations):
        # usually, we would create a single data loader for all stations, but we just want to avoid trouble...
        stations = {station_name: target_stations[station_name]}
        print(
            f"preparing data loader & model for station '{station_name}' ({station_idx + 1}/{len(target_stations)})"
        )
        data_loader = prepare_dataloader(dataframe, target_datetimes, stations,
                                         target_time_offsets, user_config)
        model = prepare_model(stations, target_time_offsets, user_config)
        station_preds = generate_predictions(data_loader,
                                             model,
                                             pred_count=len(target_datetimes))
        assert len(station_preds) == len(
            target_datetimes
        ), "number of predictions mismatch with requested datetimes"
        predictions.append(station_preds)
    return np.concatenate(predictions, axis=0)
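
Note the output layout implied by np.concatenate(..., axis=0): station blocks are stacked in iteration order, so the first len(target_datetimes) rows belong to the first station, the next block to the second, and so on. A tiny shape check with made-up sizes:

import numpy as np

# Hypothetical sizes: 2 stations, 3 target datetimes, 4 target time offsets.
station_a_preds = np.zeros((3, 4))
station_b_preds = np.ones((3, 4))

stacked = np.concatenate([station_a_preds, station_b_preds], axis=0)
assert stacked.shape == (6, 4)  # rows 0-2: station A, rows 3-5: station B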
Example #5
def prepare_model(
    stations: typing.Dict[typing.AnyStr, typing.Tuple[float, float, float]],
    target_time_offsets: typing.List[datetime.timedelta],
    config: typing.Dict[typing.AnyStr, typing.Any],
) -> tf.keras.Model:
    """This function should be modified in order to prepare & return your own prediction model.
    See https://github.com/mila-iqia/ift6759/tree/master/projects/project1/evaluation.md for more information.
    Args:
        stations: a map of station names of interest paired with their coordinates (latitude, longitude, elevation).
        target_time_offsets: the list of timedeltas to predict GHIs for (by definition: [T=0, T+1h, T+3h, T+6h]).
        config: configuration dictionary holding any extra parameters that might be required by the user. These
            parameters are loaded automatically if the user provided a JSON file in their submission. Submitting
            such a JSON file is completely optional, and this argument can be ignored if not needed.
    Returns:
        A ``tf.keras.Model`` object that can be used to generate new GHI predictions given imagery tensors.
    """

    ################################### MODIFY BELOW ##################################

    from libs import helpers

    helpers.validate_user_config(config)

    mirrored_strategy = helpers.get_mirrored_strategy()

    if mirrored_strategy is not None and mirrored_strategy.num_replicas_in_sync > 1:
        with mirrored_strategy.scope():
            model = helpers.prepare_model(
                user_config_dict=config,
                stations=stations,
                target_time_offsets=target_time_offsets)
    else:
        model = helpers.prepare_model(user_config_dict=config,
                                      stations=stations,
                                      target_time_offsets=target_time_offsets)

    ################################### MODIFY ABOVE ##################################

    return model
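
helpers.prepare_model itself is not part of these examples. Since Example #6 branches on config['model']['definition']['module'], one guess (and only a guess) is that it resolves the model from the config dynamically, along these lines; the 'name' key and the factory call signature are assumptions:

import importlib
import typing


def prepare_model_sketch(
        user_config_dict: typing.Dict[typing.AnyStr, typing.Any],
        **model_kwargs) -> typing.Any:
    """Speculative sketch: instantiate the model class or factory named in the config."""
    definition = user_config_dict['model']['definition']
    module = importlib.import_module(definition['module'])
    # 'name' is an assumed key identifying the class/factory inside the module.
    factory = getattr(module, definition['name'])
    return factory(**model_kwargs)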
Example #6
def generate_predictions(input_file_path: str, pred_file_path: str):
    """Generates predictions for the machine translation task (EN->FR).
    You are allowed to modify this function as needed, but once again, you cannot
    modify any other part of this file. We will be importing only this function
    in our final evaluation script. Since you will most definitely need to import
    modules for your code, you must import these inside the function itself.
    Args:
        input_file_path: the file path that contains the input data.
        pred_file_path: the file path where to store the predictions.
    Returns: None
    """

    ##### MODIFY BELOW #####
    import tensorflow as tf

    from libs import helpers
    from libs.data_loaders.abstract_dataloader import AbstractDataloader
    from libs.models import transformer

    import tqdm

    import logging
    from libs.data_loaders.abstract_dataloader import create_masks_fm
    from libs.data_loaders.dataloader_bilingual_huggingface import BilingualTranslationHFSubword
    from libs.data_loaders.dataloader_bilingual_tensorflow import BilingualTranslationTFSubword
    from libs.data_loaders.mass_subword import MassSubwordDataLoader
    from libs.models.transformer import Encoder, Decoder

    logger = tf.get_logger()
    logger.setLevel(logging.DEBUG)

    import numpy as np
    import random
    from libs.seeds import TENSOR_FLOW_SEED, NUMPY_SEED, RANDOM_SEED

    tf.random.set_seed(TENSOR_FLOW_SEED)
    np.random.seed(NUMPY_SEED)
    random.seed(RANDOM_SEED)

    best_config_file = '/project/cq-training-1/project2/teams/team03/models/transformer_mass_v1_translation_with_pretraining_eval.json'
    # best_config_file = 'configs/user/transformers-fm/TFM_TINY_BBPE_eval.json'
    logger.info(f"Using best config file: {best_config_file}")
    best_config = helpers.load_dict(best_config_file)
    helpers.validate_user_config(best_config)

    # TODO: Edit our AbstractDataloader to support a raw_english_test_set_file_path. Currently it only supports
    #   preprocessed data defined directly in best_config.
    data_loader: AbstractDataloader = helpers.get_online_data_loader(
        config=best_config, raw_english_test_set_file_path=input_file_path)

    if best_config["model"]["definition"][
            "module"] == 'libs.models.transformerv2':
        model = transformer.load_transformer(best_config)
    else:
        mirrored_strategy = helpers.get_mirrored_strategy()
        if mirrored_strategy is not None and mirrored_strategy.num_replicas_in_sync > 1:
            with mirrored_strategy.scope():
                model: tf.keras.Model = helpers.prepare_model(
                    config=best_config)
        else:
            model: tf.keras.Model = helpers.prepare_model(config=best_config)

    # batch_size = 32  # 32 is the max for 6 GB of GPU memory
    batch_size = 128
    data_loader.build(batch_size=batch_size)
    test_dataset = data_loader.test_dataset

    all_predictions = []
    if isinstance(data_loader, MassSubwordDataLoader):
        all_predictions = transformer.inference(data_loader.tokenizer, model,
                                                test_dataset)
    else:
        if isinstance(data_loader, BilingualTranslationTFSubword) or \
                isinstance(data_loader, BilingualTranslationHFSubword):
            sample_to_display = 10

            encoder: Encoder = model.get_layer("encoder")
            decoder: Decoder = model.get_layer("decoder")
            final_layer: tf.keras.layers.Dense = model.layers[-1]

            for inputs, mask in tqdm.tqdm(test_dataset,
                                          total=data_loader.test_steps):

                mini_batch_size = inputs.shape[0]
                dec_inp = tf.Variable(
                    tf.zeros(
                        (mini_batch_size, data_loader.get_seq_length() + 1),
                        dtype=tf.int32))

                bos_tensor = tf.convert_to_tensor(data_loader.bos)
                bos_tensor = tf.reshape(bos_tensor, [1, 1])
                bos_tensor = tf.tile(bos_tensor,
                                     multiples=[mini_batch_size, 1])

                dec_inp[:, 0].assign(bos_tensor[:, 0])  # BOS token

                # WARNING: IF THE MODEL USED WAS FROM A TF FILE, A LOT OF WARNINGS WILL APPEAR
                #  Workaround: Use the hdf5 format to load the final model
                # https://github.com/tensorflow/tensorflow/issues/35146
                def get_preds(encoder, decoder, final_layer, dec_inp, inputs,
                              mask, max_seq):
                    enc_output: tf.Tensor = encoder(
                        inputs=inputs, mask=mask, training=False)

                    for timestep in range(max_seq):
                        _, combined_mask, dec_padding_mask = create_masks_fm(
                            inp=inputs, tar=dec_inp[:, :-1])

                        dec_output, attention_weights = decoder(
                            inputs=dec_inp[:, :-1],
                            enc_output=enc_output,
                            look_ahead_mask=combined_mask,
                            padding_mask=dec_padding_mask)

                        outputs = final_layer(
                            inputs=dec_output
                        )  # (batch_size, seq_length, vocab_size)
                        pred = tf.argmax(outputs[:, timestep, :], axis=-1)
                        pred = tf.cast(pred, dtype=tf.int32)
                        dec_inp[:, timestep + 1].assign(pred)
                    return dec_inp

                predictions = get_preds(
                    encoder=encoder,
                    decoder=decoder,
                    final_layer=final_layer,
                    dec_inp=dec_inp,
                    inputs=inputs,
                    mask=mask,
                    # TODO: decision to be made; a max sequence length of 100 doesn't seem to hurt performance
                    max_seq=100)  # data_loader.get_seq_length())
                for prediction in predictions.numpy():
                    if sample_to_display > 0:
                        logger.info(
                            f"Example of generated translation: {data_loader.decode(prediction)}"
                        )
                        sample_to_display -= 1
                    all_predictions += [data_loader.decode(prediction)]

        else:
            raise NotImplementedError(
                f"No method to generate for class {data_loader.__class__.__name__}"
            )

    with open(pred_file_path, 'w+') as file_handler:
        for prediction in all_predictions:
            file_handler.write(f'{prediction}\n')
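
create_masks_fm is imported from libs.data_loaders.abstract_dataloader above, but its body is not shown. Judging from the call site in get_preds (three return values, of which only the combined look-ahead mask and the decoder padding mask are used), it presumably follows the standard Transformer masking recipe; a sketch under that assumption, with pad id 0:

import tensorflow as tf


def create_masks_fm_sketch(inp: tf.Tensor, tar: tf.Tensor):
    """Assumed behaviour: standard Transformer padding and look-ahead masks (pad id = 0)."""

    def padding_mask(seq: tf.Tensor) -> tf.Tensor:
        # 1.0 where the token is padding; shaped to broadcast over attention logits.
        return tf.cast(tf.math.equal(seq, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]

    enc_padding_mask = padding_mask(inp)
    # The decoder's second attention block attends over encoder output,
    # so it masks padding positions of the *input* sequence.
    dec_padding_mask = padding_mask(inp)

    # Look-ahead mask: position t may only attend to positions <= t.
    seq_len = tf.shape(tar)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    combined_mask = tf.maximum(padding_mask(tar), look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask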