def test_order_by():
    """order_by with descending=True sorts rows by clinic_id high-to-low."""
    rows = [(clinic_id, 1, 2) for clinic_id in range(5)]
    dataset = DataSet(rows)
    dataset.order_by('clinic_id', descending=True)
    for position, expected_id in enumerate(reversed(range(5))):
        assert dataset[position][0] == expected_id
def test_aggregate_on():
    """Aggregating rows that share one clinic_id groups them all under it."""
    rows = [(1, 2, value) for value in range(5)]
    dataset = DataSet(rows)
    expected_rows = dataset.data
    grouped = dataset.aggregate_on('clinic_id', key=lambda x: x)
    assert grouped == {1: expected_rows}
def test_aggregate_on_dates(test_data):
    """Rows group by calendar-date string with the expected per-day counts."""
    dataset = DataSet(test_data)
    grouped = dataset.aggregate_on('date_received', key=lambda d: str(d.date()))
    expected_counts = {
        '2020-01-01': 1,
        '2020-01-03': 2,
        '2020-01-04': 1,
        '2020-01-05': 1,
        '2020-01-07': 4,
        '2020-01-11': 2,
    }
    for day, count in expected_counts.items():
        assert len(grouped[day]) == count
def run(years=1, year_target=5, epochs=100, epochs_target=500):
    """Benchmark data generation plus a short training run, then extrapolate
    how long `year_target` years of data over `epochs_target` epochs would take.

    :param years: Years of synthetic data to generate for the benchmark.
    :param year_target: Years of data in the hypothetical full run.
    :param epochs: Epochs to actually train for in the benchmark.
    :param epochs_target: Epochs in the hypothetical full run.
    """
    print(f'Generating {years} year(s) of data...')
    data_start = time.time()
    dataset = DataSet(create_data(datetime(2020 - years, 1, 1),
                                  datetime(2020, 1, 1)))
    print('Data generation done')
    print(f'Took {time.time() - data_start}s\n')

    model = RadiusVariance(seq_size=30, radius=15)
    ml_dataset = model.create_ml_dataset(dataset)

    print(f'Training for {epochs} epochs...')
    train_start = time.time()
    model.get_model().compile(optimizer=Adam(lr=0.001), loss='mse')
    model.get_model().fit(x=ml_dataset.inputs, y=ml_dataset.outputs,
                          epochs=epochs)
    print('Training done')
    train_time = int(time.time() - train_start)
    print(f'Took {train_time}s\n')

    # Linear extrapolation: training time scales with both the data span
    # and the epoch count.
    total_time = train_time * (year_target / years) * (epochs_target / epochs)
    print(
        f'It would take this machine an estimated {int(total_time // 60)}m{int(total_time % 60)}s ' +
        f'to train {year_target} years of data for {epochs_target} epochs.')
def test_create_ml_dataset_empty():
    """An empty DataSet yields empty input/output arrays of the right width."""
    model = RadiusVariance(seq_size=3, radius=2)
    ml_dataset = model.create_ml_dataset(DataSet([]))
    assert np.array_equal(ml_dataset.inputs[0], np.array([]))
    assert np.array_equal(ml_dataset.inputs[1], np.empty((0, 43)))
    assert np.array_equal(ml_dataset.outputs[0], np.empty((0, 2)))
def visualize_dataset_arrivals(dataset: DataSet, output_file: str):
    """Plot the number of arrivals per day across the dataset's date span.

    :param dataset: The DataSet to visualize; sorted in place by date_received.
    :param output_file: Path the rendered plot image is written to.
    """
    dataset.order_by('date_received')
    first = dataset[0].date_received
    last = dataset[-1].date_received
    # Group rows by calendar date so days with no arrivals can be zero-filled.
    agg = dataset.aggregate_on('date_received', lambda dr: str(dr.date()))
    x = list(_date_range(first, last))
    # Days absent from the aggregation had zero arrivals.
    # (The original also built an unused `x2` index list; removed.)
    y = [len(agg.get(str(day.date()), [])) for day in x]
    plt.plot_date(x, y, markersize=2)
    plt.gcf().autofmt_xdate()
    plt.savefig(output_file)
def create_ml_dataset(self, dataset: DataSet) -> MLDataSet:
    """
    Build a MLDataSet compatible with RadiusVariance using the provided DataSet.

    :param dataset: The DataSet to construct the MLDataSet from.
    :return: The MLDataSet
    """
    if len(dataset):
        dataset.order_by('date_received')
        # Trim `radius` days off each end so every sample has a full window.
        min_date = dataset[0].date_received + self._timedelta(self.radius)
        max_date = dataset[-1].date_received - self._timedelta(self.radius)
        # NOTE(review): both arms of the original
        # `if self.time_interval == TimeInterval.WEEK` conditional were
        # byte-identical, so the branch was collapsed; behavior is unchanged.
        date_aggregation = dataset.aggregate_on('date_received',
                                                lambda dr: self._datestr(dr))
        date_range = list(self._date_range(min_date, max_date))
    else:
        date_range = []
    # inputs: [daily arrival count, one-hot (month, day-of-month)];
    # outputs: [count, variance over the surrounding radius window].
    x = [np.zeros((len(date_range), 1)), np.zeros((len(date_range), 12 + 31))]
    y = [np.zeros((len(date_range), 2))]
    for i, date in enumerate(date_range):
        date_str = self._datestr(date)
        val = len(date_aggregation[date_str]) if date_str in date_aggregation else 0
        # Arrival counts for every day within `radius` days of `date`.
        radius_vals = []
        for j in range(-self.radius, self.radius + 1):
            j_date_str = self._datestr(date + self._timedelta(j))
            radius_vals.append(len(date_aggregation[j_date_str])
                               if j_date_str in date_aggregation else 0)
        variance = np.var(radius_vals)
        one_hot_date = np.zeros(12 + 31)
        # month is 1-12 and day is 1-31: months get slots 0-11 and days get
        # slots 12-42. The original indexed month without the -1 offset, so
        # slot 0 was never set and December (12) collided with day 1 (11+1).
        one_hot_date[date.date().month - 1] = 1
        one_hot_date[11 + date.date().day] = 1
        x[0][i] = val
        x[1][i] = one_hot_date
        y[0][i] = [val, variance]
    return MLDataSet(x[0:1], x[1:], y, self.seq_size)
def test_dataset():
    """Build a DataSet from the comma-separated rows in tests/test_data.txt."""
    rows = []
    with open('tests/test_data.txt') as source:
        for raw_line in source:
            fields = raw_line.rstrip().split(',')
            # Each row is `int,int,YYYY-MM-DD`; presumably (clinic_id,
            # severity, date_received) — verify against the DataSet schema.
            rows.append((
                int(fields[0]),
                int(fields[1]),
                datetime.strptime(fields[2], '%Y-%m-%d'),
            ))
    return DataSet(rows)
def _load_dataset_from_file(file_name: Text) -> DataSet:
    """Read comma-separated rows from a file into a DataSet.

    :param file_name: Path to a file with one `int,int,YYYY-MM-DD` row per line.
    :return: A DataSet built from the parsed rows.
    """
    parsed_rows = []
    with open(file_name) as source:
        for raw_line in source:
            fields = raw_line.rstrip().split(',')
            parsed_rows.append((
                int(fields[0]),
                int(fields[1]),
                datetime.strptime(fields[2], '%Y-%m-%d'),
            ))
    return DataSet(parsed_rows)
def cyclic(start_date: datetime, end_date: datetime, random_multiple=0) -> DataSet:
    """Generate a DataSet whose daily arrival counts follow a yearly sine
    cycle with a slow upward trend and optional uniform noise.

    :param start_date: First day of the generated range.
    :param end_date: End of the generated range (interpreted by date_range).
    :param random_multiple: Scale of uniform random noise added to each
        day's count; 0 keeps the count deterministic.
    :return: A DataSet with one (1, severity, date) row per arrival.
    """
    # Named function instead of a lambda bound to a name (PEP 8 E731);
    # the original also kept an `i` counter that was never read — removed.
    def arrivals_on(day_index: int) -> int:
        # Baseline 20, linear growth of 1/300 per day, a sine wave with a
        # 365-day period, plus optional noise; clamped at zero.
        return max(round(
            20
            + day_index / 300
            + 2 * math.sin(2 * math.pi * day_index / 365)
            + random_multiple * random.random()
        ), 0)

    data = []
    for day_index, date in enumerate(date_range(start_date, end_date)):
        for _ in range(arrivals_on(day_index)):
            data.append((1, get_random_severity(), date))
    return DataSet(data)
def test_create_ml_dataset_radius_affects_length(test_dataset):
    """A radius of 3 trims the usable sample range down to 2 entries."""
    model = RadiusVariance(seq_size=3, radius=3)
    ml_dataset = model.create_ml_dataset(DataSet(test_dataset))
    assert ml_dataset.inputs[0].shape[0] == 2
def test_create_ml_dataset_correct_length(test_dataset):
    """With radius 1, the test fixture yields 8 usable samples."""
    model = RadiusVariance(seq_size=1, radius=1)
    ml_dataset = model.create_ml_dataset(DataSet(test_dataset))
    assert ml_dataset.inputs[0].shape[0] == 8
def test_dataset_index():
    """Indexing a DataSet returns the row stored at that position."""
    dataset = DataSet([('a', 'b', 'c'), (1, 2, 3)])
    assert dataset[0][0] == 'a'
    assert dataset[1][0] == 1
def test_dataset_len():
    """len() of a DataSet reports the number of rows it holds."""
    dataset = DataSet([('a', 'b', 'c')] * 5)
    assert len(dataset) == 5