def transform_meta_info(meta_infos, inverse=False):
    if inverse:
        data = [value[0] for key, value in sorted(meta_infos.inverse.items(), key=lambda x: x[0])]
    else:
        data = [value for key, value in sorted(meta_infos.d.items(), key=lambda x: x[0])]
    # one_hot_conversion returns (encoded values, label encoder, one-hot encoder)
    meta_one_hot, meta_label_encoder, meta_onehot_encoder = one_hot_conversion(data)
    return meta_one_hot, meta_label_encoder, meta_onehot_encoder
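
# transform_meta_info expects a MyBidict-like object exposing .d (key -> value)
# and .inverse (value -> list of keys). MyBidict is defined elsewhere in the
# repo; this minimal sketch of the assumed interface is illustrative only.
class _MyBidictSketch:
    def __init__(self):
        self.d = {}        # forward mapping: key -> value
        self.inverse = {}  # reverse mapping: value -> [keys]

    def __setitem__(self, key, value):
        self.d[key] = value
        self.inverse.setdefault(value, []).append(key)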
def generate_one_hot_encoding(values):
    """
    generate the label and one_hot encoder for time features
    :param values: 
    :return: 
    """
    values_encoded, label_encoder, one_hot_encoder = one_hot_conversion(values)
    return values_encoded, label_encoder, one_hot_encoder
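
# one_hot_conversion is defined elsewhere in this module. From its call sites
# here (raw values in, optionally pre-fitted encoders, three return values), a
# minimal sketch using scikit-learn could look like the following; the body is
# an assumption about the helper, not the repo's actual implementation.
def _one_hot_conversion_sketch(values, label_encoder=None, one_hot_encoder=None):
    import numpy as np
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    values = np.asarray(values)
    if label_encoder is None:
        label_encoder = LabelEncoder().fit(values)
    labels = label_encoder.transform(values).reshape(-1, 1)
    if one_hot_encoder is None:
        # sparse=False yields the dense arrays the callers expect (older
        # scikit-learn API, consistent with the active_features_ usage below)
        one_hot_encoder = OneHotEncoder(sparse=False).fit(labels)
    encoded = one_hot_encoder.transform(labels)
    return encoded, label_encoder, one_hot_encoder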
def convert_attribute(sites_attribute):
    ret = OrderedDict()

    # extract values
    sector_values = [value.sector for value in sites_attribute.values()]
    time_zone_values = [value.time_zone for value in sites_attribute.values()]

    # one-hot conversion
    sectors_one_hot, sector_label_encoder, sector_onehot_encoder = one_hot_conversion(
        sector_values)
    tzs_one_hot, tz_label_encoder, tz_onehot_encoder = one_hot_conversion(
        time_zone_values)

    for (key,
         value), sector_one_hot, tz_one_hot in zip(sites_attribute.items(),
                                                   sectors_one_hot,
                                                   tzs_one_hot):
        example = [log(value.sq_ft), value.lat, value.lng]
        example.extend(sector_one_hot.tolist())
        example.extend(tz_one_hot.tolist())
        ret[value.site_id] = example

    return ret, (sector_label_encoder,
                 sector_onehot_encoder), (tz_label_encoder, tz_onehot_encoder)
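
# Illustrative usage of convert_attribute. The SiteAttribute record and its
# field values are assumptions for demonstration, not data from the repo.
def _convert_attribute_example():
    from collections import namedtuple
    SiteAttribute = namedtuple(
        "SiteAttribute", ["site_id", "sector", "time_zone", "sq_ft", "lat", "lng"])
    attrs = OrderedDict([
        (6, SiteAttribute(6, "Commercial", "US/Eastern", 25000.0, 40.7, -74.0)),
        (8, SiteAttribute(8, "Education", "US/Pacific", 12000.0, 37.8, -122.4)),
    ])
    examples, sector_encoders, tz_encoders = convert_attribute(attrs)
    # each example is [log(sq_ft), lat, lng] + sector one-hot + time-zone one-hot
    return examples, sector_encoders, tz_encoders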
def load_data(site_infos, resample_interval="180T", norm_type="softplus"):
    """
    load the data in a pandas datafra
    1) read the attribute
    2) remove anomalies
    3) resample the timeseries in interval of 3 hours
    4) normalize the price for sqrt meter
    5) concatenate and trucante nan
    6) apply some sort of normalization according specification
    7) create day in week feature
    8) create hour in day features
    :param site_infos: info of each site
    :param resample_interval: resample interval
    :param norm_type: type of normalization
    :return:
    """
    sites_dataframe = OrderedDict()
    norm_fn = setup_norm(norm_type)

    for site_id in site_infos.keys():
        site_df = pd.read_csv(
            path.join(BASE_DIR, "utility", "csvs", "{}.csv".format(site_id)))
        site_df = site_df.set_index(pd.DatetimeIndex(site_df['dttm_utc']))

        # drop rows flagged as anomalous, then remove the flag column
        anomaly_idx = site_df["anomaly"].notnull()
        if anomaly_idx.any():
            site_df = site_df[~anomaly_idx]
        site_df = site_df.drop("anomaly", axis=1)
        # resample
        site_df = site_df[["value",
                           "estimated"]].resample(resample_interval).sum()
        # normalize consumption by the site's square footage
        site_df["value"] /= site_infos[site_id].sq_ft
        sites_dataframe[site_id] = site_df

    sites_dataframe = pd.concat(sites_dataframe.values(),
                                axis=1,
                                keys=sites_dataframe.keys())
    # drop timestamps where any site has a missing value
    sites_dataframe = sites_dataframe.dropna()

    # normalize df
    if norm_type == "start_dif":
        sites_normalized_dataframe, start_values = norm_fn(sites_dataframe)
    elif norm_type == "softlog" or norm_type == "softplus":
        sites_normalized_dataframe = norm_fn(sites_dataframe)
    else:
        sites_normalized_dataframe = sites_dataframe

    # extract day-of-week and hour-of-day from the index
    idx = sites_normalized_dataframe.index
    days, times = (idx.strftime("%A"), idx.strftime("%H"))

    # convert to categorical one-hot features
    days_onehot, days_label_encoder, days_onehot_encoder = one_hot_conversion(
        days)
    days_onehot = pd.DataFrame(days_onehot,
                               index=idx,
                               columns=days_label_encoder.classes_)

    if resample_interval[-1] == "T":
        times_onehot, times_label_encoder, times_onehot_encoder = one_hot_conversion(
            time)
        times_onehot = pd.DataFrame(times_onehot,
                                    index=idx,
                                    columns=times_label_encoder.classes_)

    if norm_type == "start_dif" and resample_interval[-1] == "T":
        return sites_normalized_dataframe, start_values, days_onehot, times_onehot
    elif norm_type == "softlog" and resample_interval[-1] == "T":
        return sites_normalized_dataframe, days_onehot, times_onehot
    elif norm_type == "start_dif" and resample_interval[-1] == "D":
        return sites_normalized_dataframe, start_values, days_onehot
    elif norm_type == "softlog" and resample_interval[-1] == "D":
        return sites_normalized_dataframe, days_onehot
    else:
        return sites_normalized_dataframe, days_onehot, times_onehot
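
# setup_norm lives elsewhere in the repo. Judging from the branching above
# (softlog/softplus return just a frame, start_dif also returns the start
# values), a plausible sketch could be the following; the exact formulas are
# assumptions, not the repo's implementation.
def _setup_norm_sketch(norm_type):
    import numpy as np

    def softlog(df):
        # compress the dynamic range; log1p keeps zeros at zero
        return np.log1p(df)

    def softplus(df):
        return np.log1p(np.exp(df))

    def start_dif(df):
        # difference each series against its first value, keep it for inversion
        start_values = df.iloc[0]
        return df - start_values, start_values

    return {"softlog": softlog, "softplus": softplus, "start_dif": start_dif}[norm_type]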
def generate_embedding(stations, G, top_k=6):
    station_id_to_idx = bidict()
    station_id_to_exp_idx = MyBidict()

    # sample station used to derive the time encoders and per-day groupings
    # station_id = 408134
    station_id = 400000
    station_data = read_station_data(station_id, stations.loc[station_id, "Lanes"])
    days_groups = get_days_datapoints(station_data)

    # one hot encoders
    _, day_label_encoder, day_one_hot_encoder = generate_one_hot_encoding(station_data.index.day)
    _, hour_label_encoder, hour_one_hot_encoder = generate_one_hot_encoding(station_data.index.hour)
    _, minutes_label_encoder, minutes_one_hot_encoder = generate_one_hot_encoding(station_data.index.minute)

    # keep only integer station ids among the graph nodes
    nodes = list(filter(lambda x: isinstance(x, int), G.nodes))
    # one experiment per distinct day seen in the sample station's data
    num_exp = day_one_hot_encoder.active_features_.size
    # sequences are shifted 6 steps for the prediction target, so lose 6 points
    seq_len = days_groups[0].size - 6
    features_len = 4 + day_one_hot_encoder.active_features_.size + hour_one_hot_encoder.active_features_.size + minutes_one_hot_encoder.active_features_.size

    input_embeddings = torch.FloatTensor(num_exp * len(nodes), seq_len, features_len).zero_()
    target_embeddings = torch.FloatTensor(num_exp * len(nodes), seq_len, 1).zero_()
    neighbor_embeddings = torch.FloatTensor(num_exp * len(nodes), top_k, seq_len, features_len).zero_()
    edge_type = torch.ones(num_exp * len(nodes), top_k, seq_len, 1)
    neigh_mask = torch.zeros(num_exp * len(nodes), top_k).byte()

    nodes_data = {}
    for node_idx, node in enumerate(nodes):
        if node in nodes_data:
            node_data = nodes_data[node]
        else:
            node_data = read_station_data(node, stations.loc[node, "Lanes"])
            assert not np.isnan(node_data.values).any()
            nodes_data[node] = node_data

        neighbors_data = []
        # G.neighbors here yields (neighbor_id, distance) pairs
        for neighbor_id, distance in G.neighbors(node):
            if neighbor_id in nodes_data:
                neighbor_data = nodes_data[neighbor_id]
            else:
                neighbor_data = read_station_data(neighbor_id, stations.loc[neighbor_id, "Lanes"])
                assert not np.isnan(neighbor_data.values).any()
                nodes_data[neighbor_id] = neighbor_data
            neighbors_data.append((neighbor_id, neighbor_data))

        station_id_to_idx[node] = node_idx

        # node embedding
        for day_idx, day_timestep in enumerate(days_groups):
            day_one_hot, _, _ = one_hot_conversion(day_timestep.day, day_label_encoder, day_one_hot_encoder)
            hour_one_hot, _, _ = one_hot_conversion(day_timestep.hour, hour_label_encoder, hour_one_hot_encoder)
            minute_one_hot, _, _ = one_hot_conversion(day_timestep.minute, minutes_label_encoder, minutes_one_hot_encoder)

            node_data_value = np.concatenate([node_data.loc[day_timestep].values, day_one_hot, hour_one_hot, minute_one_hot], axis=1)
            exp_idx = (node_idx * num_exp) + day_idx
            # inputs: the first seq_len steps; targets: the series 6 steps ahead
            input_embeddings[exp_idx] = torch.from_numpy(node_data_value[:-6])
            target_embeddings[exp_idx] = torch.from_numpy(node_data_value[6:, 0]).unsqueeze(1)

            # neighbor embedding
            for neighbor_idx, (neighbor_id, neighbor_data) in enumerate(neighbors_data):
                try:
                    neighbor_data_value = np.concatenate([neighbor_data.loc[day_timestep].values, day_one_hot, hour_one_hot, minute_one_hot], axis=1)
                    neighbor_embeddings[exp_idx, neighbor_idx] = torch.from_numpy(neighbor_data_value[:-6])
                except Exception as e:
                    print(neighbor_idx, neighbor_id, day_idx)
                    print(e)
                    raise e

        station_id_to_exp_idx[node] = list(range(node_idx * num_exp, (node_idx + 1) * num_exp))

        if node_idx % 10 == 0:
            print(node_idx)

    return input_embeddings, target_embeddings, neighbor_embeddings, edge_type, neigh_mask, station_id_to_idx, station_id_to_exp_idx
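
# The flat experiment axis groups rows per node: row (node_idx * num_exp +
# day_idx) holds one node-day. This illustrative helper (not part of the
# original API) recovers the block of per-day experiments for one node.
def _node_experiment_block(embeddings, station_id_to_exp_idx, node_id):
    rows = station_id_to_exp_idx[node_id]  # range(node_idx * num_exp, (node_idx + 1) * num_exp)
    return embeddings[rows[0]:rows[-1] + 1]  # shape: (num_exp, seq_len, ...)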