# Imports required by the helpers below.
import bisect
import os
import pickle
import random

import numpy as np
import pandas as pd
import torch
from gluonts.dataset.common import ListDataset
from gluonts.dataset.repository.datasets import get_dataset


def get_group_data_by_duplicate(name, num_duplicates, num_groups):
    """Duplicate every series ``num_duplicates`` times and deal the copies
    round-robin into ``num_groups`` groups."""
    dataset = get_dataset(name)
    dataset_group = [[] for _ in range(num_groups)]
    whole_data_list = []
    no_duplicate_whole_data_list = []
    ret = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    for i in range(num_ts):
        train_entry = next(it)
        no_duplicate_whole_data_list.append({
            "target": train_entry["target"],
            "start": train_entry["start"],
        })
        # Duplicate each series; copies go round-robin into the groups.
        for _ in range(num_duplicates):
            dataset_group[i % num_groups].append({
                "target": train_entry["target"],
                "start": train_entry["start"],
            })
            whole_data_list.append({
                "target": train_entry["target"],
                "start": train_entry["start"],
            })
    random.shuffle(whole_data_list)
    random.shuffle(no_duplicate_whole_data_list)
    ret.append(ListDataset(no_duplicate_whole_data_list, freq=dataset.metadata.freq))
    ret.append(ListDataset(whole_data_list, freq=dataset.metadata.freq))
    for group in dataset_group:
        random.shuffle(group)
        ret.append(ListDataset(group, freq=dataset.metadata.freq))
    return ret, dataset.metadata.freq
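# The list returned by get_group_data_by_duplicate is positional: index 0 is
# the deduplicated whole data, index 1 the duplicated whole data, and indices
# 2 onwards the per-group datasets. A minimal consumption sketch (the dataset
# name and counts here are illustrative, not from the original):
def _demo_duplicate_groups():
    datasets, freq = get_group_data_by_duplicate(
        "electricity", num_duplicates=2, num_groups=4)
    no_dup, dup, groups = datasets[0], datasets[1], datasets[2:]
    entry = next(iter(no_dup))  # GluonTS entries are dicts with "target"/"start"
    print(freq, entry["start"], len(entry["target"]), len(groups))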
def get_whole_data(name):
    dataset = get_dataset(name)
    dataset_group = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    for _ in range(num_ts):
        train_entry = next(it)
        dataset_group.append({
            "target": train_entry["target"],
            "start": train_entry["start"],
        })
    return ListDataset(dataset_group, freq=dataset.metadata.freq)
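# Entries yielded by a ListDataset are plain dicts, so the loader above can be
# inspected directly. A small sketch ("traffic" is an illustrative choice):
def _demo_inspect_whole_data():
    data = get_whole_data("traffic")
    entry = next(iter(data))
    print(entry["start"], entry["target"][:5])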
def get_group_data_by_var(name, num_groups, len_sample=9):
    """Split every series into fixed-length samples and group the samples by
    the magnitude of their variance (boundaries in ``group_boundary``)."""
    dataset = get_dataset(name)
    dataset_group = [[] for _ in range(num_groups)]
    whole_data = []
    ret = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    # Variance boundaries separating the groups, in increasing order.
    group_boundary = [1e3, 5e3, 1e4, 5e4, 1e5, 5e5]
    for i in range(num_ts):
        train_entry = next(it)
        unsplit_ts = train_entry["target"][0:800]
        unsplit_start = train_entry["start"]
        whole_data.append({"target": unsplit_ts, "start": unsplit_start})
        for ts_sample_start in range(len(unsplit_ts) - len_sample):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            # Bucket the sample by its variance. The original only printed the
            # variance and skipped the append via a stray ``continue`` (plus a
            # pdb breakpoint); the bisect-based bucketing below reconstructs
            # the apparent intent.
            var = torch.var(torch.FloatTensor(ts_slice)).item()
            group_id = min(bisect.bisect(group_boundary, var), num_groups - 1)
            dataset_group[group_id].append({
                "target": ts_slice,
                "start": unsplit_start,
            })
            unsplit_start += pd.Timedelta(hours=1)
    random.shuffle(whole_data)
    # The whole data is stored twice so the pickle layout matches the
    # [no-duplicate, duplicated, groups...] convention used above.
    ret.append(ListDataset(whole_data, freq=dataset.metadata.freq))
    ret.append(ListDataset(whole_data, freq=dataset.metadata.freq))
    for group in dataset_group:
        random.shuffle(group)
        ret.append(ListDataset(group, freq=dataset.metadata.freq))
    print("write whole data")
    with open("synthetic_traffic_time_whole_data.csv", "wb") as output:
        pickle.dump(ret[0:2], output)
    print("write group data")
    with open("synthetic_traffic_time_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
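# The variance bucketing reconstructed above can be sanity-checked in
# isolation; a sketch with made-up sample values:
def _demo_variance_bucket(num_groups=7):
    group_boundary = [1e3, 5e3, 1e4, 5e4, 1e5, 5e5]
    var = torch.var(torch.FloatTensor([10.0, 2000.0, 150.0, 800.0])).item()
    group_id = min(bisect.bisect(group_boundary, var), num_groups - 1)
    print(var, group_id)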
def group_electricity_cv(
    num_ts=10,
    num_groups=14,
    context_length=72,
    prediction_length=12,
    file_name="default",
):
    """Slice the electricity dataset into (context + prediction) windows and
    group them by a rotating day-of-week index and a quarter "season" id;
    20% of the test split is kept as validation data."""
    dataset = get_dataset("electricity", regenerate=True)
    len_sample = context_length + prediction_length
    dataset_group = [[] for _ in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # Quarter boundaries used as season checkpoints.
    date_checkpoint = [
        "2012-03-01", "2012-06-01", "2012-09-01", "2012-12-01",
        "2013-03-01", "2013-06-01", "2013-09-01", "2013-12-01",
        "2014-03-01",
    ]
    # Prepare the training data.
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        t = unsplit_start  # every slice keeps the original series start time
        start_date = 4
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample, prediction_length):
            # Season id: index of the first checkpoint after the current time.
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start >= pd.Timestamp(date_checkpoint[-1]):
                    # ">=" also covers a start falling exactly on the last
                    # checkpoint, which the original ">" left unassigned.
                    sid = len(date_checkpoint)
                    break
            # Group id combines the rotating day-of-week index with the season.
            gid = ((start_date + 1) % 7) + sid * 7
            start_date += 1
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": np.array([gid]),
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": np.array([gid]),
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)
    # Prepare the test data (20% of the series).
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample, prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })
    print(
        "Generating the electricity training data, the total number of training examples:",
        len(train_full_data),
    )
    ret["group_ratio"] = [len(g) / len(train_full_data) for g in dataset_group]
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data, freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished pre-processing of the electricity dataset")
    return True


def group_traffic_cv(
    num_ts=10,
    num_groups=14,
    context_length=72,
    prediction_length=12,
    file_name="default",
):
    """Traffic counterpart of group_electricity_cv. In the original file this
    block sat unreachably after the ``return True`` above; it is split out
    here under an assumed name."""
    dataset = get_dataset("traffic")
    len_sample = context_length + prediction_length
    dataset_group = [[] for _ in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    date_checkpoint = ["2016-01-01"]
    # Prepare the training data.
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        t = unsplit_start
        start_date = 4
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample, prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start >= pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = ((start_date + 1) % 7) + sid * 7
            start_date += 1
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)
    # Prepare the test data (20% of the series).
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample, prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })
    print("total number of training examples: ", len(train_full_data))
    ret["group_ratio"] = [len(g) / len(train_full_data) for g in dataset_group]
    print("ratio for each group: ", ret["group_ratio"])
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data, freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    return True
def group_exchangerate_cv(
    num_ts=10,
    num_groups=14,
    context_length=15,
    prediction_length=10,
    file_name="default",
):
    """Slice the exchange-rate dataset into windows and group them by series
    and time period (three checkpoints give four periods per series)."""
    dataset = get_dataset("exchange_rate", regenerate=True)
    len_sample = context_length + prediction_length
    dataset_group = [[] for _ in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    date_checkpoint = ["1994-01-01", "1998-01-01", "2002-01-01"]
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample, prediction_length):
            # Period id: index of the first checkpoint after the current time.
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start >= pd.Timestamp(date_checkpoint[-1]):
                    # ">=" also covers a start falling exactly on the last
                    # checkpoint, which the original ">" left unassigned.
                    sid = len(date_checkpoint)
                    break
            # Group id: four time periods per series.
            gid = i * 4 + sid
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta("1D") * prediction_length
    # Prepare the test data (20% of the series).
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample, prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })
    print(
        "Generating the exchange rate training data, the total number of training examples:",
        len(train_full_data),
    )
    ret["group_ratio"] = [len(g) / len(train_full_data) for g in dataset_group]
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data, freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished pre-processing the exchange rate dataset")
    return True
def get_m4_by_freq(
    context_length=72,
    prediction_length=24,
    len_per_ts=200,
    num_ts=50,
    num_groups=6,
    file_name="m4_freq",
):
    """Build one group per M4 frequency (hourly through yearly). Every slice
    is normalised by one plus its mean so scales are comparable across
    frequencies."""
    dataset_group = [[] for _ in range(num_groups)]
    whole_data = []
    ret = dict()
    datasets_name = [
        "m4_hourly",
        "m4_daily",
        "m4_weekly",
        "m4_monthly",
        "m4_quarterly",
        "m4_yearly",
    ]
    # Rough hour-equivalents of one step at each frequency; currently unused,
    # kept for the commented-out timestamp increment below.
    hours_factor = [
        1,
        24,
        24 * 7,
        24 * 7 * 30,
        24 * 7 * 30 * 3,
        24 * 7 * 30 * 3 * 4,
    ]
    for i in range(num_groups):
        dataset = get_dataset(datasets_name[i])
        len_sample = context_length + prediction_length
        it = iter(dataset.train)
        for j in range(num_ts):
            train_entry = next(it)
            unsplit_ts = train_entry["target"]
            # All series are re-anchored to a common, artificial start date.
            unsplit_start = pd.Timestamp("1990-01-01")
            for ts_sample_start in range(0, len_per_ts - len_sample, prediction_length):
                if len_sample > len(unsplit_ts):
                    continue
                ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
                if len(ts_slice) < len_sample:
                    continue
                # Normalise the slice by one plus its mean.
                nu = 1 + sum(ts_slice) / len_sample
                ts_slice = [x / nu for x in ts_slice]
                whole_data.append({
                    "target": ts_slice,
                    "start": unsplit_start,
                    "feat_static_cat": train_entry["feat_static_cat"],
                })
                dataset_group[i].append({
                    "target": ts_slice,
                    "start": unsplit_start,
                    "feat_static_cat": train_entry["feat_static_cat"],
                })
                # unsplit_start += pd.Timedelta(hours=prediction_length * hours_factor[i])
                unsplit_start += pd.Timedelta(hours=prediction_length)
    print(len(whole_data))
    ret["group_ratio"] = [len(g) / len(whole_data) for g in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    # Note: the freq of the last M4 dataset processed is used for all entries.
    ret["whole_data"] = ListDataset(whole_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
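# The per-slice normalisation used above, extracted as a standalone helper for
# clarity (the helper name is introduced here, not from the original):
def _normalize_slice(ts_slice):
    # Divide by one plus the slice mean; the +1 guards against all-zero slices.
    nu = 1 + sum(ts_slice) / len(ts_slice)
    return [x / nu for x in ts_slice]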
def KMeans_inside_dataset(
    num_ts_=1,
    num_groups=16,
    context_length=72,
    prediction_length=24,
    file_name="default",
):
    """Group traffic windows by k-means clustering over simple per-slice
    features (mean, variance, and two coarse position indices)."""
    dataset = get_dataset("traffic")
    dataset_group = [[] for _ in range(num_groups)]
    whole_data = []
    ret = dict()
    it = iter(dataset.train)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    num_ts = num_ts_
    len_sample = context_length + prediction_length
    index = 0
    feature = torch.Tensor([])
    # First pass: build a 4-dimensional feature vector per slice.
    for i in range(num_ts):
        train_entry = next(it)
        target = train_entry["target"]
        for ts_sample_start in range(0, len(target) - len_sample, prediction_length):
            ts_slice = target[ts_sample_start:ts_sample_start + len_sample]
            feature = torch.cat((
                feature,
                torch.Tensor([
                    ts_slice.mean(),
                    ts_slice.var(),
                    index % 7,    # day-of-week-style index
                    index // 90,  # coarse position bucket
                ]),
            ))
            index += 1
    feature = feature.reshape(index, 4)
    feature = _get_pre_features(feature).contiguous()
    # Cluster the features; cl holds one cluster label per slice.
    cl, c = KMeans(feature, num_groups)
    # Second pass: assign each slice to its cluster's group.
    it = iter(dataset.train)
    sample_id = 0
    for i in range(num_ts):
        train_entry = next(it)
        target = train_entry["target"]
        unsplit_start = train_entry["start"]
        for ts_sample_start in range(0, len(target) - len_sample, prediction_length):
            ts_slice = target[ts_sample_start:ts_sample_start + len_sample]
            gid = cl[sample_id]
            dataset_group[gid].append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            whole_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)
            sample_id += 1
    print(len(whole_data))
    ret["group_ratio"] = [len(g) / len(whole_data) for g in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
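# KMeans and _get_pre_features are helpers defined elsewhere in this codebase;
# KMeans is assumed to return (per-sample cluster labels, centroids). A hedged
# usage sketch of the clustering-based grouping (parameters illustrative):
def _demo_kmeans_groups():
    KMeans_inside_dataset(
        num_ts_=5,
        num_groups=16,
        context_length=72,
        prediction_length=24,
        file_name="traffic_kmeans",
    )
    with open("synthetic_traffic_kmeans_group_data.csv", "rb") as f:
        ret = pickle.load(f)
    print(ret["group_ratio"])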