def transfor_meta_info(meta_infos, inverse=False):
    """One-hot encode the meta-info values, ordered by key.

    :param meta_infos: bidirectional mapping exposing ``.d`` (key -> value)
        and ``.inverse`` (value -> list of keys) dict attributes
    :param inverse: if True, encode the first key of each inverse entry
        instead of the forward values
    :return: tuple ``(meta_one_hot, (label_encoder, one_hot_encoder))``,
        mirroring the encoder-pair convention used by ``convert_attribute``
    """
    if inverse:
        # .inverse maps value -> [keys]; take the first key of each entry
        data = [value[0] for key, value in
                sorted(meta_infos.inverse.items(), key=lambda x: x[0])]
    else:
        data = [value for key, value in
                sorted(meta_infos.d.items(), key=lambda x: x[0])]
    # BUG FIX: one_hot_conversion returns a 3-tuple
    # (encoded, label_encoder, one_hot_encoder) — see its other call sites —
    # but the original unpacked it into two names, raising ValueError.
    meta_one_hot, meta_label_encoder, meta_onehot_encoder = one_hot_conversion(data)
    return meta_one_hot, (meta_label_encoder, meta_onehot_encoder)
def generate_one_hot_encoding(values):
    """Fit label and one-hot encoders on *values* (used for time features).

    :param values: iterable of categorical values to encode
    :return: tuple ``(encoded_values, label_encoder, one_hot_encoder)``
    """
    # Plain pass-through to the shared conversion helper; kept as a named
    # wrapper so call sites read as "encode time feature".
    encoded, label_enc, onehot_enc = one_hot_conversion(values)
    return encoded, label_enc, onehot_enc
def convert_attribute(sites_attribute):
    """Convert per-site attributes into numeric feature vectors.

    Each site becomes ``[log(sq_ft), lat, lng] + sector_onehot + tz_onehot``.

    :param sites_attribute: mapping of site key -> attribute object with
        ``sector``, ``time_zone``, ``sq_ft``, ``lat``, ``lng``, ``site_id``
    :return: tuple of (OrderedDict site_id -> feature list,
        (sector label/one-hot encoders), (time-zone label/one-hot encoders))
    """
    converted = OrderedDict()

    # collect the categorical columns across all sites
    attrs = list(sites_attribute.values())
    sector_values = [attr.sector for attr in attrs]
    tz_values = [attr.time_zone for attr in attrs]

    # fit one-hot encodings for both categorical attributes
    sectors_one_hot, sector_label_encoder, sector_onehot_encoder = \
        one_hot_conversion(sector_values)
    tzs_one_hot, tz_label_encoder, tz_onehot_encoder = \
        one_hot_conversion(tz_values)

    # assemble one feature row per site, in the mapping's iteration order
    rows = zip(sites_attribute.items(), sectors_one_hot, tzs_one_hot)
    for (_, attr), sector_oh, tz_oh in rows:
        feature_row = [log(attr.sq_ft), attr.lat, attr.lng]
        feature_row.extend(sector_oh.tolist())
        feature_row.extend(tz_oh.tolist())
        converted[attr.site_id] = feature_row

    return (converted,
            (sector_label_encoder, sector_onehot_encoder),
            (tz_label_encoder, tz_onehot_encoder))
def load_data(site_infos, resample_interval="180T", norm_type="softplus"):
    """Load per-site energy data into one pandas DataFrame.

    1) read each site's csv
    2) remove anomalous readings
    3) resample the timeseries at ``resample_interval``
    4) normalize consumption by square footage
    5) concatenate sites and truncate rows with NaN
    6) apply the normalization selected by ``norm_type``
    7) build one-hot day-of-week features
    8) build one-hot hour-of-day features (minute-based resampling only)

    :param site_infos: mapping site_id -> site attributes (needs ``.sq_ft``)
    :param resample_interval: pandas resample rule; last char "T" = minutes
        (hour feature built), "D" = days (no hour feature)
    :param norm_type: "softplus" | "softlog" | "start_dif" | anything else
        (no normalization applied)
    :return: depending on ``norm_type`` and interval:
        ``(df[, start_values], days_onehot[, times_onehot])``;
        ``times_onehot`` is ``None`` when no minute-based resampling was used
    """
    sites_dataframe = OrderedDict()
    norm_fn = setup_norm(norm_type)
    for site_id in site_infos.keys():
        site_df = pd.read_csv(
            path.join(BASE_DIR, "utility", "csvs", "{}.csv".format(site_id)))
        site_df = site_df.set_index(pd.DatetimeIndex(site_df['dttm_utc']))
        # check presence of anomalies; in case, drop the flagged rows
        anomaly_idx = site_df["anomaly"].notnull()
        if anomaly_idx.any():
            site_df = site_df[~anomaly_idx]
        site_df = site_df.drop("anomaly", axis=1)
        # resample
        site_df = site_df[["value", "estimated"]].resample(resample_interval).sum()
        # normalize consumption by the site's square footage
        # BUG FIX: original referenced undefined name ``sites_info``
        site_df["value"] /= site_infos[site_id].sq_ft
        sites_dataframe[site_id] = site_df
    sites_dataframe = pd.concat(sites_dataframe.values(), axis=1,
                                keys=sites_dataframe.keys())
    # remove nan timeseries (rows where any site is missing data)
    sites_dataframe = sites_dataframe.dropna()
    # normalize df
    if norm_type == "start_dif":
        sites_normalized_dataframe, start_values = norm_fn(sites_dataframe)
    elif norm_type == "softlog" or norm_type == "softplus":
        sites_normalized_dataframe = norm_fn(sites_dataframe)
    else:
        sites_normalized_dataframe = sites_dataframe
    # extract day and hours from index
    idx = sites_normalized_dataframe.index
    days, time = (idx.strftime("%A"), idx.strftime("%H"))
    # convert to categorical
    days_onehot, days_label_encoder, days_onehot_encoder = one_hot_conversion(
        days)
    days_onehot = pd.DataFrame(days_onehot, index=idx,
                               columns=days_label_encoder.classes_)
    # BUG FIX: times_onehot was undefined for day-based intervals, making the
    # final ``else`` return raise NameError (e.g. norm_type="softplus" + "1D")
    times_onehot = None
    if resample_interval[-1] == "T":
        times_onehot, times_label_encoder, times_onehot_encoder = \
            one_hot_conversion(time)
        times_onehot = pd.DataFrame(times_onehot, index=idx,
                                    columns=times_label_encoder.classes_)
    if norm_type == "start_dif" and resample_interval[-1] == "T":
        return sites_normalized_dataframe, start_values, days_onehot, times_onehot
    elif norm_type == "softlog" and resample_interval[-1] == "T":
        return sites_normalized_dataframe, days_onehot, times_onehot
    elif norm_type == "start_dif" and resample_interval[-1] == "D":
        return sites_normalized_dataframe, start_values, days_onehot
    elif norm_type == "softlog" and resample_interval[-1] == "D":
        return sites_normalized_dataframe, days_onehot
    else:
        return sites_normalized_dataframe, days_onehot, times_onehot
def generate_embedding(stations, G, top_k=6):
    """Build input/target/neighbor embedding tensors for every station in G.

    For each station node and each day, an example consists of the station's
    readings plus day/hour/minute one-hot calendar features; targets are the
    readings shifted 6 timesteps ahead.

    :param stations: DataFrame indexed by station id with a "Lanes" column
    :param G: graph whose int nodes are station ids; ``G.neighbors(node)``
        yields ``(neighbor_id, distance)`` pairs — NOTE(review): non-standard
        for networkx, presumably a project graph wrapper; confirm
    :param top_k: maximum number of neighbors stored per node
    :return: (input_embeddings, target_embeddings, neighbor_embeddings,
        edge_type, neigh_mask, station_id_to_idx, station_id_to_exp_idx)
    """
    station_id_to_idx = bidict()
    station_id_to_exp_idx = MyBidict()

    # reference station used only to fit the calendar one-hot encoders
    # station_id = 408134
    station_id = 400000
    station_data = read_station_data(station_id, stations.loc[station_id, "Lanes"])
    days_groups = get_days_datapoints(station_data)

    # one hot encoders, fitted on the reference station's datetime index
    _, day_label_encoder, day_one_hot_encoder = generate_one_hot_encoding(station_data.index.day)
    _, hour_label_encoder, hour_one_hot_encoder = generate_one_hot_encoding(station_data.index.hour)
    _, minutes_label_encoder, minutes_one_hot_encoder = generate_one_hot_encoding(station_data.index.minute)

    nodes = list(filter(lambda x: type(x) == int, G.nodes))
    num_exp = day_one_hot_encoder.active_features_.size   # one example per distinct day
    seq_len = days_groups[0].size - 6                     # 6-step prediction horizon
    features_len = (4
                    + day_one_hot_encoder.active_features_.size
                    + hour_one_hot_encoder.active_features_.size
                    + minutes_one_hot_encoder.active_features_.size)

    input_embeddings = torch.FloatTensor(num_exp * len(nodes), seq_len, features_len).zero_()
    target_embeddings = torch.FloatTensor(num_exp * len(nodes), seq_len, 1).zero_()
    neighbor_embeddings = torch.FloatTensor(num_exp * len(nodes), top_k, seq_len, features_len).zero_()
    edge_type = torch.ones(num_exp * len(nodes), top_k, seq_len, 1)
    neigh_mask = torch.zeros(num_exp * len(nodes), top_k).byte()

    nodes_data = {}
    for node_idx, node in enumerate(nodes):
        if node in nodes_data:
            node_data = nodes_data[node]
        else:
            # BUG FIX: original passed the *reference* station's lane count
            # (stations.loc[station_id, "Lanes"]) for every node
            node_data = read_station_data(node, stations.loc[node, "Lanes"])
            assert not np.isnan(node_data.values).any()
            nodes_data[node] = node_data

        neighbors_data = []
        for neighbor_id, distance in G.neighbors(node):
            if neighbor_id in nodes_data:
                neighbor_data = nodes_data[neighbor_id]
            else:
                # BUG FIX: same reference-station lane lookup as above
                neighbor_data = read_station_data(neighbor_id, stations.loc[neighbor_id, "Lanes"])
                assert not np.isnan(neighbor_data.values).any()
                nodes_data[neighbor_id] = neighbor_data
            neighbors_data.append((neighbor_id, neighbor_data))

        station_id_to_idx[node] = node_idx

        # node embedding
        for day_idx, day_timestep in enumerate(days_groups):
            day_one_hot, _, _ = one_hot_conversion(day_timestep.day, day_label_encoder, day_one_hot_encoder)
            hour_one_hot, _, _ = one_hot_conversion(day_timestep.hour, hour_label_encoder, hour_one_hot_encoder)
            minute_one_hot, _, _ = one_hot_conversion(day_timestep.minute, minutes_label_encoder, minutes_one_hot_encoder)
            node_data_value = np.concatenate([node_data.loc[day_timestep].values,
                                              day_one_hot, hour_one_hot, minute_one_hot], axis=1)
            exp_idx = (node_idx * num_exp) + day_idx
            input_embeddings[exp_idx:exp_idx + 1] = torch.from_numpy(node_data_value[:-6])
            # BUG FIX: keep a trailing singleton dim ([6:, 0:1] -> (seq_len, 1))
            # so the source broadcasts into the (1, seq_len, 1) slice; the
            # original 1-D (seq_len,) slice is not broadcastable to it
            target_embeddings[exp_idx:exp_idx + 1] = torch.from_numpy(node_data_value[6:, 0:1])
            # neighbor embedding
            for neighbor_idx, (neighbor_id, neighbor_data) in enumerate(neighbors_data):
                try:
                    neighbor_data_value = np.concatenate([neighbor_data.loc[day_timestep].values,
                                                          day_one_hot, hour_one_hot, minute_one_hot], axis=1)
                    neighbor_embeddings[exp_idx, neighbor_idx] = torch.from_numpy(neighbor_data_value[:-6])
                except Exception as e:
                    print(neighbor_idx, neighbor_id, day_idx)
                    print(e)
                    raise e
        station_id_to_exp_idx[node] = list(range(node_idx * num_exp, (node_idx + 1) * num_exp))
        if node_idx % 10 == 0:
            print(node_idx)
    return input_embeddings, target_embeddings, neighbor_embeddings, edge_type, neigh_mask, station_id_to_idx, station_id_to_exp_idx