def __init__(self, config: Config):
    """Copy the experiment settings and build the GFS sequence keys.

    Fields of ``config.experiment`` are mirrored onto the instance, the
    three values returned by ``prepare_synop_dataset`` are stored as
    ``labels``, ``label_mean`` and ``label_std``, and ``self.IDs`` receives
    the result of ``initialize_GFS_date_keys_for_sequence``.

    Args:
        config: Project configuration; only ``config.experiment`` is read.
    """
    super().__init__()
    experiment = config.experiment
    self.config = config

    # Loader-related settings.
    self.val_split = experiment.val_split
    self.batch_size = experiment.batch_size
    self.shuffle = experiment.shuffle

    # Ellipsis placeholders; this constructor never replaces them.
    self.dataset_train = ...
    self.dataset_val = ...
    self.dataset_test = ...

    self.train_parameters = process_config(
        experiment.train_parameters_config_file)
    self.prediction_offset = experiment.prediction_offset
    self.synop_file = experiment.synop_file
    self.target_param = experiment.target_parameter
    self.sequence_length = experiment.sequence_length

    synop_result = prepare_synop_dataset(
        self.synop_file,
        [self.target_param],
        dataset_dir=SYNOP_DATASETS_DIRECTORY,
        from_year=experiment.synop_from_year,
        to_year=experiment.synop_to_year,
    )
    self.labels, self.label_mean, self.label_std = synop_result

    # Candidate keys first, then the subset/selection used for sequences.
    candidate_keys = get_available_gfs_date_keys(
        self.train_parameters, self.prediction_offset, self.sequence_length)
    self.IDs = initialize_GFS_date_keys_for_sequence(
        candidate_keys,
        self.labels,
        self.train_parameters,
        self.target_param,
        self.sequence_length,
    )
Пример #2
0
    def __init__(self, config: Config):
        """Load synop targets, pair them with GFS data and scale the GFS input.

        The GFS data and targets come from ``match_gfs_with_synop_sequence``.
        Targets are reshaped to one column, the GFS data is scaled along
        axis 0 (mean/std for ``NormalizationType.STANDARD``, min/max
        otherwise) and ``self.data`` holds the zipped (gfs, target) pairs.

        Args:
            config: Project configuration; only ``config.experiment`` is read.
        """
        experiment = config.experiment
        self.train_parameters = process_config(
            experiment.train_parameters_config_file)
        self.target_param = experiment.target_parameter
        self.synop_file = experiment.synop_file
        self.prediction_offset = experiment.prediction_offset
        self.target_coords = experiment.target_coords

        synop_data, synop_mean, synop_std = prepare_synop_dataset(
            self.synop_file,
            [self.target_param],
            dataset_dir=SYNOP_DATASETS_DIRECTORY,
            from_year=experiment.synop_from_year,
            to_year=experiment.synop_to_year,
        )

        # Nested list with one [date, target value] entry per synop row.
        date_and_target = pd.concat(
            [synop_data['date'], synop_data[self.target_param]], axis=1)
        labels = date_and_target.to_numpy().tolist()

        # The same label list is deliberately passed for both leading arguments.
        _, gfs_data, targets = match_gfs_with_synop_sequence(
            labels,
            labels,
            self.target_coords[0],
            self.target_coords[1],
            self.prediction_offset,
            self.train_parameters,
        )
        targets = targets.reshape((len(targets), 1))

        if experiment.normalization_type == NormalizationType.STANDARD:
            center = np.mean(gfs_data, axis=0)
            spread = np.std(gfs_data, axis=0)
            gfs_data = (gfs_data - center) / spread
        else:
            lowest = np.min(gfs_data, axis=0)
            highest = np.max(gfs_data, axis=0)
            gfs_data = (gfs_data - lowest) / (highest - lowest)

        assert len(gfs_data) == len(targets)
        self.gfs_data = gfs_data
        self.targets = targets
        self.data = list(zip(gfs_data, targets))
        print(synop_mean)
        print(synop_std)
    def __init__(self, config: Config, list_IDs, train=True, normalize=True):
        """Set up the ID split and, optionally, the normalization statistics.

        Args:
            config: Project configuration; only ``config.experiment`` is read.
            list_IDs: Sliceable collection of sample IDs.
            train: If truthy use the first 80% of ``list_IDs``, otherwise
                the remaining part.
            normalize: If truthy compute mean/std (STANDARD normalization)
                or min/max (any other type) over the full ``list_IDs``.
        """
        experiment = config.experiment
        self.list_IDs = list_IDs
        self.train_parameters = process_config(
            experiment.train_parameters_config_file)
        self.target_param = experiment.target_parameter
        self.synop_file = experiment.synop_file

        synop_result = prepare_synop_dataset(
            self.synop_file,
            [self.target_param],
            dataset_dir=SYNOP_DATASETS_DIRECTORY,
        )
        self.labels, self.label_mean, self.label_std = synop_result

        self.dim = experiment.cnn_input_size
        self.channels = len(self.train_parameters)
        self.normalization_type = experiment.normalization_type

        # Fixed 80/20 split by position in the ID list.
        split_at = int(len(self.list_IDs) * 0.8)
        self.data = (self.list_IDs[:split_at]
                     if train else self.list_IDs[split_at:])

        self.mean, self.std = [], []
        self.normalize = normalize
        if not normalize:
            return

        if experiment.normalization_type == NormalizationType.STANDARD:
            self.mean, self.std = initialize_mean_and_std(
                self.list_IDs, self.train_parameters, self.dim)
        else:
            self.min, self.max = initialize_min_max(self.list_IDs,
                                                    self.train_parameters)
Пример #4
0
    def prepare_data(self, *args, **kwargs):
        """Load the raw synop data, pick valid sequence starts and normalize.

        Sets ``self.synop_data``, ``self.synop_data_indices``,
        ``self.synop_mean`` and ``self.synop_std`` (the latter two taken at
        ``self.target_param_index``) and prints the target mean and std.
        ``*args`` and ``**kwargs`` are accepted but not used.
        """
        feature_columns = list(zip(*self.train_params))
        self.synop_data = prepare_synop_dataset(
            self.synop_file,
            list(feature_columns[1]),
            dataset_dir=SYNOP_DATASETS_DIRECTORY,
            from_year=self.synop_from_year,
            to_year=self.synop_to_year,
            norm=False,
        )

        sequence_start_dates = get_correct_dates_for_sequence(
            self.synop_data,
            self.sequence_length,
            self.future_sequence_length,
            self.prediction_offset,
        )

        self.synop_data = self.synop_data.reset_index()

        # Keep the row indices whose date begins an unbroken sequence.
        starts_sequence = self.synop_data["date"].isin(sequence_start_dates)
        self.synop_data_indices = self.synop_data[starts_sequence].index

        # The data was loaded un-normalized, so mean and std are computed
        # here over the frames that will actually be used.
        frames_per_sample = (self.sequence_length + self.prediction_offset +
                             self.future_sequence_length)
        self.synop_data, synop_mean, synop_std = normalize_synop_data(
            self.synop_data,
            self.synop_data_indices,
            self.feature_names,
            frames_per_sample,
            self.normalization_type,
        )

        self.synop_mean = synop_mean[self.target_param_index]
        self.synop_std = synop_std[self.target_param_index]
        print(f"Synop mean: {synop_mean[self.target_param_index]}")
        print(f"Synop std: {synop_std[self.target_param_index]}")
Пример #5
0
def prepare_data(gfs_dir: str, synop_dir: str, start_seq: int, end_seq: int,
                 gfs_past: int, gfs_future: int, synop_length: int,
                 train_split: int, column_name_label: str):
    """Build aligned synop-input, GFS-input and label arrays.

    For every GFS key and every hour in ``range(gfs_hour_diff)`` a sample is
    assembled from a window of synop rows, the normalized GFS values and a
    slice of ``column_name_label`` values. Samples that cannot be assembled
    are skipped (best effort).

    Args:
        gfs_dir: Forwarded to ``prepare_gfs_sequence_dataset``.
        synop_dir: Forwarded to ``prepare_synop_dataset`` as the synop file.
        start_seq: Offset of the first label row relative to the matched
            synop row (plus the hour offset).
        end_seq: Offset of the last label row, same reference.
        gfs_past: Forwarded to ``prepare_gfs_sequence_dataset``.
        gfs_future: Forwarded to ``prepare_gfs_sequence_dataset``.
        synop_length: Length used for the synop input window and for the
            date cut-off of the synop data.
        train_split: Unused by this function; kept for interface
            compatibility.
        column_name_label: Column of the synop data used as label.

    Returns:
        Tuple ``(synop_inputs, gfs_inputs, labels)`` of numpy arrays built
        from the collected per-sample lists.
    """
    features = [
        consts.DIRECTION_COLUMN, consts.VELOCITY_COLUMN, consts.GUST_COLUMN,
        consts.TEMPERATURE, consts.PRESSURE, consts.CURRENT_WEATHER
    ]
    synop_dataset, _, _ = prepare_synop_dataset(
        synop_dir,
        list(list(zip(*features))[1]),
        dataset_dir=SYNOP_DATASETS_DIRECTORY)
    gfs_dataset, gfs_hour_diff = prepare_gfs_sequence_dataset(
        gfs_dir, start_seq, end_seq, gfs_past, gfs_future)

    least_recent_date = get_oldest_gfs_date(gfs_dataset)
    recent_synop_date = synop_dataset["date"].max()

    # Drop synop rows older than the first GFS date minus the input window.
    synop_dataset = synop_dataset[
        synop_dataset["date"] >= pd.to_datetime(least_recent_date) -
        timedelta(hours=synop_length)]
    gfs_dataset = filter_gfs(gfs_dataset, recent_synop_date, gfs_hour_diff,
                             start_seq + end_seq)
    synop_dataset_input = []
    gfs_dataset_input = []
    dataset_label = []

    print("Creating data set")
    for key in tqdm.tqdm(gfs_dataset.keys()):
        gfs_creation_date = datetime.strptime(key, '%Y-%m-%d-%HZ')
        single_gfs = gfs_dataset[key]
        single_gfs = convert_wind(single_gfs, "U-wind, m/s", "V-wind, m/s")

        # Both values are identical for every hour of this key, so compute
        # them once. ``except Exception`` keeps the best-effort skipping but,
        # unlike a bare ``except``, lets KeyboardInterrupt/SystemExit through.
        try:
            synop_index = synop_dataset.index[synop_dataset['date'] ==
                                              gfs_creation_date][0]
            gfs_input = normalize(single_gfs.drop(['date'], axis=1).values)
        except Exception:
            # No matching synop row (or unusable GFS frame): skip this key.
            continue

        for hour in range(gfs_hour_diff):
            try:
                # NOTE(review): ``.loc`` label slices include both end
                # points, so this selects synop_length + 1 rows when the
                # index is contiguous - confirm this is intended.
                synop_input_index = hour + synop_index - synop_length + 1
                synop_input = synop_dataset.loc[
                    synop_input_index:synop_input_index + synop_length]

                label = synop_dataset.loc[hour + synop_index + start_seq:hour +
                                          synop_index + end_seq]

                synop_input = synop_input.drop(
                    ['date', 'year', 'day', 'month'], axis=1).values
                label = label[column_name_label].values
            except Exception:
                # Skip only this hour; the three lists stay aligned because
                # nothing has been appended yet.
                continue

            synop_dataset_input.append(synop_input)
            gfs_dataset_input.append(gfs_input)
            dataset_label.append(label)

    return np.array(synop_dataset_input), np.array(
        gfs_dataset_input), np.array(dataset_label)
Пример #6
0
def explore_synop(localisation_name: str, code_fallback: int):
    """Run the synop correlation and pattern explorations for a localisation.

    The CSV named ``{localisation_name}_{code_fallback}_data.csv`` is created
    with ``prepare_synop_csv`` when it does not yet exist in
    ``SYNOP_DATASETS_DIRECTORY``. Features whose second element is one of
    'year', 'month', 'day' or 'hour' are left out of the exploration.

    Args:
        localisation_name: Name used in the CSV file name and passed on to
            the exploration helpers.
        code_fallback: Code used in the CSV file name and passed to
            ``prepare_synop_csv``.
    """
    date_parts = ('year', 'month', 'day', 'hour')
    relevant_features = [
        feature for feature in SYNOP_FEATURES if feature[1] not in date_parts
    ]

    synop_file = f"{localisation_name}_{code_fallback}_data.csv"
    csv_path = os.path.join(SYNOP_DATASETS_DIRECTORY, synop_file)
    if not os.path.exists(csv_path):
        prepare_synop_csv(localisation_name, code_fallback, SYNOP_FEATURES)

    # Second element of every feature entry is what the loader is given.
    feature_columns = list(zip(*relevant_features))
    data = prepare_synop_dataset(
        synop_file,
        list(feature_columns[1]),
        norm=False,
        dataset_dir=SYNOP_DATASETS_DIRECTORY,
    )

    explore_synop_correlations(data, relevant_features, localisation_name)
    explore_synop_patterns(data, relevant_features, localisation_name)
Пример #7
0
def main(station: str, target_param: str, prediction_offset: int,
         sequence_length: int, from_year: int):
    """Compare GFS data with synop observations for one station.

    Args:
        station: Key into ``STATION_META``; its entry supplies
            'synop_file', 'lat' and 'lon'.
        target_param: Synop column to load and compare.
        prediction_offset: Forwarded to the GFS key lookup and comparison.
        sequence_length: Forwarded to the GFS key lookup and comparison.
        from_year: First year passed to the synop loader and GFS key lookup.
    """
    station_meta = STATION_META[station]

    synop_data = prepare_synop_dataset(
        station_meta['synop_file'],
        [target_param],
        norm=False,
        dataset_dir=SYNOP_DATASETS_DIRECTORY,
        from_year=from_year,
    )
    labels = synop_data[['date', target_param]]

    gfs_parameter = target_param_to_gfs_name_level(target_param)
    available_gfs_date_keys = get_available_gfs_date_keys(
        gfs_parameter, prediction_offset, sequence_length, from_year)

    compare_gfs_with_synop(
        labels,
        available_gfs_date_keys,
        target_param,
        station_meta['lat'],
        station_meta['lon'],
        prediction_offset,
        sequence_length,
    )
Пример #8
0
def prepare_data(past_len=12, future_offset=12, train_split_factor=0.75,
                 synop_file="preprocess/synop_data/135_data.csv",
                 gfs_dir="preprocess/wind_and_temp"):
    """Create train/validation inputs and labels from synop and GFS data.

    Args:
        past_len: Hours subtracted from the oldest GFS date when trimming
            the synop data; also forwarded to ``create_sequence``.
        future_offset: Hours added to the oldest GFS date when trimming the
            GFS data; also forwarded to the GFS loader and
            ``create_sequence``.
        train_split_factor: Fraction of the sequences (from the start) that
            goes to the training part; the rest is used for validation.
        synop_file: Synop CSV given to ``prepare_synop_dataset``. Defaults
            to the previously hard-coded path.
        gfs_dir: Directory given to
            ``prepare_gfs_dataset_for_single_point_time``. Defaults to the
            previously hard-coded path.

    Returns:
        ``(x_train, y_train, x_valid, y_valid)`` where each ``x_*`` is a
        two-element list ``[synop_input, gfs_input]``.
    """
    dataset_train, _, _ = prepare_synop_dataset(synop_file, "")
    gfs_dataset = prepare_gfs_dataset_for_single_point_time(gfs_dir, 'date', future_offset)

    # Trim both data sets relative to the oldest GFS date.
    least_recent_date = gfs_dataset["date"].min()
    dataset_train = dataset_train[dataset_train["date"] >= least_recent_date - timedelta(hours=past_len)]
    gfs_dataset = gfs_dataset[gfs_dataset["date"] > least_recent_date + timedelta(hours=future_offset)]
    gfs_dataset = gfs_dataset.sort_values(by=["date"])

    train_synop_input, gfs_input, train_synop_label = create_sequence(dataset_train, gfs_dataset, past_len,
                                                                      future_offset)
    # Positional split: no shuffling, the first part is the training set.
    train_size = int(train_synop_input.shape[0] * train_split_factor)

    x_train = [train_synop_input[:train_size], gfs_input[:train_size]]
    y_train = train_synop_label[:train_size]
    x_valid = [train_synop_input[train_size:], gfs_input[train_size:]]
    y_valid = train_synop_label[train_size:]

    return x_train, y_train, x_valid, y_valid
    def __init__(self, config: Config):
        """Copy the experiment settings and load the un-normalized synop data.

        ``self.labels`` holds the result of ``prepare_synop_dataset`` called
        with ``norm=False``; ``self.dates`` holds the result of
        ``get_correct_dates_for_sequence`` for the configured sequence
        length, a fixed value of 1 and the configured prediction offset.

        Args:
            config: Project configuration; only ``config.experiment`` is read.
        """
        super().__init__()
        experiment = config.experiment
        self.config = config

        # Loader-related settings.
        self.val_split = experiment.val_split
        self.batch_size = experiment.batch_size
        self.shuffle = experiment.shuffle

        # Ellipsis placeholders; this constructor never replaces them.
        self.dataset_train = ...
        self.dataset_val = ...
        self.dataset_test = ...

        self.synop_file = experiment.synop_file
        self.train_params = experiment.synop_train_features
        self.target_param = experiment.target_parameter
        self.sequence_length = experiment.sequence_length

        # Second element of every training feature entry goes to the loader.
        feature_columns = list(zip(*self.train_params))
        self.labels = prepare_synop_dataset(
            self.synop_file,
            list(feature_columns[1]),
            dataset_dir=SYNOP_DATASETS_DIRECTORY,
            from_year=experiment.synop_from_year,
            to_year=experiment.synop_to_year,
            norm=False,
        )

        self.dates = get_correct_dates_for_sequence(
            self.labels,
            self.sequence_length,
            1,
            experiment.prediction_offset,
        )
def prepare_target_attribute_dataset(synop_data_file, target_attribute, init_date, end_date):
    """Load one synop attribute and restrict it to a date range.

    Args:
        synop_data_file: Synop file passed to ``prepare_synop_dataset``.
        target_attribute: Single attribute to load.
        init_date: First bound passed to ``filter_for_dates``.
        end_date: Second bound passed to ``filter_for_dates``.

    Returns:
        Whatever ``filter_for_dates`` returns for the loaded dataset.
    """
    # Pass the directory by keyword so it cannot be bound to a different
    # positional parameter of the loader.
    # NOTE(review): assumes the loader accepts ``dataset_dir`` - confirm
    # against its signature.
    dataset, _, _ = prepare_synop_dataset(
        synop_data_file, [target_attribute],
        dataset_dir=SYNOP_DATASETS_DIRECTORY)
    return filter_for_dates(dataset, init_date, end_date)