def test_shuffle(self):
    """Shuffling reorders sequences across epochs but is reproducible per seed."""
    data = np.arange(10)
    targets = data * 2

    def build_dataset():
        # Identical arguments (including the seed) are used for both builds.
        return timeseries.timeseries_dataset_from_array(
            data,
            targets,
            sequence_length=5,
            batch_size=1,
            shuffle=True,
            seed=123,
        )

    dataset = build_dataset()
    reference = None
    for sequence, label in dataset.take(1):
        # A shuffled dataset should not start with the first window.
        self.assertNotAllClose(sequence, np.arange(0, 5))
        self.assertAllClose(sequence[:, 0] * 2, label)
        reference = sequence

    # Iterating the same dataset again must yield a different first element.
    for sequence, _ in dataset.take(1):
        self.assertNotAllClose(sequence, reference)

    # Check determinism: a fresh dataset with the same seed repeats the
    # first element seen above.
    for sequence, _ in build_dataset().take(1):
        self.assertAllClose(sequence, reference)
def predict(
    frame: DataFrame,
    split_fraction: float = PredictorUtils.SPLIT_FRACTION,
    past: int = PredictorUtils.PAST,
    batch_size: int = PredictorUtils.BATCH_SIZE,
    step: int = PredictorUtils.STEP,
    future: int = PredictorUtils.FUTURE,
    show_visualization: bool = False,
) -> Optional[PredictionDTO]:
    """Run the saved model on the most recent ``past`` rows of ``frame``.

    Args:
        frame: Source data; must contain at least ``past`` rows.
        split_fraction: Fraction of ``frame`` passed to
            ``PredictorUtils.create_features`` as the train split size.
        past: Number of trailing feature rows fed to the model.
        batch_size: Batch size of the generated dataset.
        step: Sampling rate inside each sequence; the sequence length is
            ``int(past / step)``.
        future: Only used to position the prediction marker in the optional
            plot (``int(future / step)``).
        show_visualization: When true, plot each batch's input against its
            prediction.

    Returns:
        A ``PredictionDTO`` wrapping the difference between the first
        predicted value and the ``CLOSE_COLUMN`` entry of the last input row,
        or ``None`` when the frame is too short, the model file is missing,
        or the dataset produced no batch.
    """
    if len(frame) < past or not isfile(PredictorUtils.PATH_MODEL_FILE):
        return None

    train_split: int = int(split_fraction * int(frame.shape[0]))
    features: DataFrame = PredictorUtils.create_features(train_split, frame, False)
    feature_columns = list(range(len(PredictorUtils.SELECTED)))
    x_val: ndarray = features.iloc[-past:][feature_columns].values
    # Targets are placeholders; only the inputs matter for inference.
    y_val: MaskedArray = zeros(past)
    sequence_length: int = int(past / step)
    dataset_val: BatchDataset = timeseries_dataset_from_array(
        x_val,
        y_val,
        sequence_length=sequence_length,
        sampling_rate=step,
        batch_size=batch_size,
    )
    model: Functional = PredictorUtils.load_model()

    # Stays None when the dataset yields no batch, so the return below
    # cannot hit an unbound name.
    predictions: Optional[ndarray] = None
    for x, _ in dataset_val.take(5):
        predictions = model.predict(x)[0]
        print(predictions)
        if show_visualization:
            PredictorUtils.show_plot(
                [x[0][:, 1].numpy(), predictions],
                int(future / step),
                'Single Step Prediction',
            )

    if predictions is None:
        return None
    # NOTE(review): the result is taken from the last processed batch;
    # confirm this matches the intended placement of the return.
    return PredictionDTO(predictions[0] - x_val[-1:][0][PredictorUtils.CLOSE_COLUMN])
def plot_prediction(
    frame: DataFrame,
    split_fraction: float = PredictorUtils.SPLIT_FRACTION,
    past: int = PredictorUtils.PAST,
    batch_size: int = PredictorUtils.BATCH_SIZE,
    step: int = PredictorUtils.STEP,
    future: int = PredictorUtils.FUTURE,
) -> None:
    """Plot the saved model's prediction against the known close values.

    The model input is the ``past`` rows that end ``future`` rows before the
    end of the feature table; the plotted history is the ``CLOSE_COLUMN`` of
    the last ``past + future`` rows, sampled every ``step`` rows. Nothing
    happens when ``frame`` has fewer than ``past`` rows or the model file
    does not exist.
    """
    if len(frame) < past or not isfile(PredictorUtils.PATH_MODEL_FILE):
        return

    train_split: int = int(split_fraction * int(frame.shape[0]))
    features: DataFrame = PredictorUtils.create_features(train_split, frame)

    window: int = past + future
    feature_columns = list(range(len(PredictorUtils.SELECTED)))
    model_input_rows: DataFrame = features.iloc[-window:-future]
    x_val: ndarray = model_input_rows[feature_columns].values
    # Placeholder targets; the dataset API requires an array of labels here.
    y_val: MaskedArray = zeros(past)

    close_history: DataFrame = features.iloc[-window:][[PredictorUtils.CLOSE_COLUMN]]
    full_range: ndarray = close_history.values[::step]

    dataset_val: BatchDataset = timeseries_dataset_from_array(
        x_val,
        y_val,
        sequence_length=int(past / step),
        sampling_rate=step,
        batch_size=batch_size,
    )
    model: Functional = PredictorUtils.load_model()

    marker_offset: int = int(future / step)
    for batch_inputs, _ in dataset_val.take(5):
        predictions: ndarray = model.predict(batch_inputs)[0]
        print(predictions)
        PredictorUtils.show_plot(
            [full_range, predictions],
            marker_offset,
            'Single Step Prediction',
            True,
        )
def test_errors(self):
    """Invalid index, rate and stride arguments raise ``ValueError``."""
    # Each entry: (expected message prefix, offending keyword argument).
    invalid_cases = (
        ('start_index must be ', {'start_index': -1}),  # bad start index
        ('start_index must be ', {'start_index': 11}),
        ('end_index must be ', {'end_index': -1}),  # bad end index
        ('end_index must be ', {'end_index': 11}),
        ('sampling_rate must be ', {'sampling_rate': 0}),  # bad sampling_rate
        ('sequence_stride must be ', {'sequence_stride': 0}),  # bad sequence stride
    )
    for message, bad_kwargs in invalid_cases:
        with self.assertRaisesRegex(ValueError, message):
            _ = timeseries.timeseries_dataset_from_array(
                np.arange(10), None, 3, **bad_kwargs)
def test_start_and_end_index(self):
    """Values drawn from the data stay inside [start_index, end_index)."""
    series = np.arange(100)
    windowed = timeseries.timeseries_dataset_from_array(
        series,
        None,
        sequence_length=9,
        batch_size=5,
        sequence_stride=3,
        sampling_rate=2,
        start_index=10,
        end_index=90,
    )
    for batch in windowed:
        # Only the first entry of each batch is inspected.
        leading = batch[0]
        self.assertAllLess(leading, 90)
        self.assertAllGreater(leading, 9)
def test_timeseries_regression(self):
    """Simple regression setup: each window is paired with the next value."""
    data = np.arange(10)
    offset = 3
    targets = data[offset:]
    dataset = timeseries.timeseries_dataset_from_array(
        data, targets, sequence_length=offset, batch_size=1)

    batches_seen = 0
    for batch in dataset:
        self.assertLen(batch, 2)
        window, label = batch
        self.assertEqual(window.shape, (1, 3))
        # The label is the element right after the window.
        self.assertAllClose(label[0], data[offset + batches_seen])
        self.assertAllClose(window[0], data[batches_seen:batches_seen + offset])
        batches_seen += 1

    # Expect 7 batches
    self.assertEqual(batches_seen, 7)
def test_no_targets(self):
    """Without targets the dataset yields plain input batches."""
    data = np.arange(50)
    dataset = timeseries.timeseries_dataset_from_array(
        data, None, sequence_length=10, batch_size=5)

    # Expect 9 batches: eight full ones and a final batch of one sequence.
    last_index = None
    for last_index, batch in enumerate(dataset):
        if last_index == 8:
            self.assertEqual(batch.shape, (1, 10))
        elif last_index < 8:
            self.assertEqual(batch.shape, (5, 10))
        # Check each sample in the batch
        first_start = last_index * 5
        for row in range(min(5, len(batch))):
            start = first_start + row
            self.assertAllClose(batch[row], np.arange(start, start + 10))
    self.assertEqual(last_index, 8)
def test_basics(self):
    """Check ordering, targets, sequence length and batch size."""
    data = np.arange(100)
    targets = data * 2
    dataset = timeseries.timeseries_dataset_from_array(
        data, targets, sequence_length=9, batch_size=5)

    # Expect 19 batches
    for batch_index, batch in enumerate(dataset):
        self.assertLen(batch, 2)
        windows, labels = batch
        if batch_index == 18:
            # Last batch: size 2
            self.assertEqual(windows.shape, (2, 9))
        elif batch_index < 18:
            self.assertEqual(windows.shape, (5, 9))
        # Check target values
        self.assertAllClose(labels, windows[:, 0] * 2)
        # Check each sample in the batch
        first_start = batch_index * 5
        for row in range(min(5, len(windows))):
            start = first_start + row
            self.assertAllClose(windows[row], np.arange(start, start + 9))
def test_sequence_stride(self):
    """Consecutive sequences start ``sequence_stride`` elements apart."""
    data = np.arange(100)
    targets = data * 2
    dataset = timeseries.timeseries_dataset_from_array(
        data, targets, sequence_length=9, batch_size=5, sequence_stride=3)

    for batch_index, batch in enumerate(dataset):
        self.assertLen(batch, 2)
        windows, labels = batch
        if batch_index == 6:
            # Last batch: size 1
            self.assertEqual(windows.shape, (1, 9))
        elif batch_index < 6:
            self.assertEqual(windows.shape, (5, 9))
        # Check target values
        self.assertAllClose(windows[:, 0] * 2, labels)
        # Check each sample in the batch
        for row in range(min(5, len(windows))):
            first = batch_index * 5 * 3 + row * 3
            self.assertAllClose(windows[row], np.arange(first, first + 9))
def fit(
    cls,
    frame: DataFrame,
    split_fraction: float = PredictorUtils.SPLIT_FRACTION,
    step: int = PredictorUtils.STEP,
    past: int = PredictorUtils.PAST,
    future: int = PredictorUtils.FUTURE,
    batch_size: int = PredictorUtils.BATCH_SIZE,
    sufficient_data: int = PredictorUtils.SUFFICIENT_DATA,
    show_visualization: bool = False,
) -> None:
    """Train an LSTM regressor on ``frame`` and save it to disk.

    Returns immediately when ``frame`` has fewer than ``sufficient_data``
    rows. Otherwise the features are split into a training and a validation
    part at ``split_fraction``, windowed into sequences of
    ``int(past / step)`` samples, and used to fit an ``LSTM(32)`` followed by
    ``Dense(1)`` with MSE loss, early stopping and best-weights
    checkpointing. The fitted model is written to
    ``PredictorUtils.PATH_MODEL_FILE``.

    There are between 29 and 32 data records per day.
    """
    if len(frame) < sufficient_data:
        return

    # Raw data visualization
    if show_visualization:
        PredictorUtils.show_raw_visualization(frame)
        PredictorUtils.show_heatmap(frame)

    # Data preprocessing
    selected_titles = [PredictorUtils.TITLES[i] for i in PredictorUtils.SELECTED]
    print('The selected parameters are:', ', '.join(selected_titles))

    train_split: int = int(split_fraction * int(frame.shape[0]))
    features: DataFrame = PredictorUtils.create_features(train_split, frame)
    train_data: DataFrame = features.loc[0: train_split - 1]
    val_data: DataFrame = features.loc[train_split:]

    feature_columns = list(range(len(PredictorUtils.SELECTED)))
    sequence_length: int = int(past / step)

    # Training dataset: labels are shifted forward by past + future rows.
    label_offset: int = past + future
    label_end: int = label_offset + train_split
    x_train: ndarray = train_data[feature_columns].values
    y_train: DataFrame = features.iloc[label_offset:label_end][[PredictorUtils.CLOSE_COLUMN]]
    dataset_train: BatchDataset = timeseries_dataset_from_array(
        x_train,
        y_train,
        sequence_length=sequence_length,
        sampling_rate=step,
        batch_size=batch_size,
    )

    # Validation dataset
    x_end: int = len(val_data) - past - future
    label_start: int = train_split + past + future
    x_val: ndarray = val_data.iloc[:x_end][feature_columns].values
    y_val: DataFrame = features.iloc[label_start:][[PredictorUtils.CLOSE_COLUMN]]
    dataset_val: BatchDataset = timeseries_dataset_from_array(
        x_val,
        y_val,
        sequence_length=sequence_length,
        sampling_rate=step,
        batch_size=batch_size,
    )

    # Pull one batch to discover the input shape for the model.
    sample_inputs: Optional[EagerTensor] = None
    sample_targets: Optional[EagerTensor] = None
    for sample_batch in dataset_train.take(1):
        sample_inputs, sample_targets = sample_batch
    print('Input shape:', sample_inputs.numpy().shape)
    print('Target shape:', sample_targets.numpy().shape)

    # Training
    model_input: KerasTensor = Input(shape=(sample_inputs.shape[1], sample_inputs.shape[2]))
    lstm_out: KerasTensor = LSTM(32)(model_input)
    outputs: KerasTensor = Dense(1)(lstm_out)
    model: Model = Model(inputs=model_input, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=cls.LEARNING_RATE), loss='mse')
    model.summary()

    es_callback: EarlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=5)
    checkpoint_callback: ModelCheckpoint = ModelCheckpoint(
        monitor='val_loss',
        filepath=PredictorUtils.PATH_CHECKPOINT_FILE,
        verbose=1,
        save_weights_only=True,
        save_best_only=True,
    )
    Utils.create_dir(PredictorUtils.PATH_MODEL_DIR)

    history: History = model.fit(
        dataset_train,
        epochs=cls.EPOCHS,
        validation_data=dataset_val,
        callbacks=[es_callback, checkpoint_callback],
    )
    model.save(PredictorUtils.PATH_MODEL_FILE)

    if show_visualization:
        PredictorUtils.visualize_loss(history, 'Training and Validation Loss')