Example No. 1
    def forward(self, bg: dgl.DGLGraph, spatial_features: torch.Tensor,
                temporal_features: torch.Tensor,
                external_features: torch.Tensor):
        """
        get predictions
        :param bg: batched graphs,
             with the total number of nodes is `node_num`,
             including `batch_size` disconnected subgraphs
        :param spatial_features: shape [node_num, F_1]
        :param temporal_features: shape [node_num, F_2, T]
        :param external_features: shape [batch_size, F_3]
        :return: a tensor, shape [batch_size], with the prediction results for each graphs
        """

        if get_attribute("use_SBlock"):
            # s_out shape: [node_num, 10]
            s_out = self.spatial_gcn(bg,
                                     self.spatial_embedding(spatial_features))

        else:
            # remove spatial layer
            s_out = self.replace_spatial_gcn(spatial_features)

        if get_attribute("use_STBlock"):
            # temporal_embeddings of shape [node_num, 20, T_in]
            temporal_embeddings = self.temporal_embedding(
                bg, temporal_features)
        else:
            # remove temporal layer
            temporal_embeddings = torch.transpose(
                self.replace_temporal_layer(
                    torch.transpose(temporal_features, -1, -2)), -1, -2)

        # t_out of shape [1, node_num, 10]
        # _, (t_out, _) = self.temporal_agg(torch.transpose(temporal_embeddings, -1, -2))
        t_out = self.temporal_agg(temporal_embeddings)
        t_out.squeeze_()

        if get_attribute("use_Embedding"):
            e_out = self.external_embedding(external_features)
        else:
            # remove external embedding layer
            e_out = external_features

        # `batch_num_nodes` is a method in newer DGL versions and a property in older ones
        try:
            nums_nodes, id = bg.batch_num_nodes(), 0
        except TypeError:
            nums_nodes, id = bg.batch_num_nodes, 0
        s_features, t_features = list(), list()
        for num_nodes in nums_nodes:
            s_features.append(s_out[id])
            t_features.append(t_out[id])
            id += num_nodes

        s_features = torch.stack(s_features)
        t_features = torch.stack(t_features)

        # concatenate along the last dim, e.g. x: [2, 3], y: [2, 5] -> result: [2, 8]
        return self.output_layer(torch.cat((s_features, t_features, e_out),
                                           -1))
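A minimal shape sketch for calling this forward pass, assuming a DGL version that provides dgl.rand_graph, two subgraphs of 3 nodes each, and illustrative feature sizes f_1 = 22, f_2 = 1, T = 24, f_3 = 25 (the real values come from the repository config):

import dgl
import torch

# batched graph: node_num = 6 nodes across batch_size = 2 random subgraphs
bg = dgl.batch([dgl.rand_graph(3, 4), dgl.rand_graph(3, 4)])
spatial_features = torch.randn(6, 22)       # [node_num, F_1]
temporal_features = torch.randn(6, 1, 24)   # [node_num, F_2, T]
external_features = torch.randn(2, 25)      # [batch_size, F_3]
# prediction = model(bg, spatial_features, temporal_features, external_features)  # shape [2]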
Example No. 2
def create_model():

    print(f"{get_attribute('data')}/{get_attribute('save_model_folder')}")

    model = temporal_set_prediction(
        items_total=get_attribute('items_total'),
        item_embedding_dim=get_attribute('item_embed_dim'))

    return model
Example No. 3
def get_class_weights(data_path):
    with open(data_path, 'r') as file:
        data_dict = json.load(file)
    train_data = data_dict['train']
    item_frequency = torch.ones(get_attribute('items_total'))
    num_baskets = 0
    for user, baskets in train_data.items():
        for basket in baskets:
            num_baskets += 1
            for item in basket:
                item_frequency[item] += 1
    item_frequency /= num_baskets
    max_item_frequency = torch.ones(
        get_attribute('items_total')) * torch.max(item_frequency)

    weights = max_item_frequency / item_frequency

    weights = weights / torch.max(weights)

    return weights
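A toy sketch of the inverse-frequency weighting above, assuming 3 items that appear 8, 4 and 2 times across 4 baskets (illustrative numbers only):

import torch

item_frequency = torch.tensor([8.0, 4.0, 2.0]) / 4   # per-basket frequency of each item
weights = torch.max(item_frequency) / item_frequency  # rarer items get larger weights
weights = weights / torch.max(weights)                # tensor([0.2500, 0.5000, 1.0000])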
Example No. 4
    def __init__(self, f_1: int, f_2: int, f_3: int):
        """
        :param f_1: the number of static features each node, default 22
        :param f_2: the number of dynamic features each node, default 1
        :param f_3: the number of features overall
        """
        super(DSTGCN, self).__init__()

        if get_attribute("use_SBlock"):
            self.spatial_embedding = fully_connected_layer(f_1, [20], 15)
            self.spatial_gcn = StackedSBlocks([
                GCN(15, [15, 15, 15], 15),
                GCN(15, [15, 15, 15], 15),
                GCN(15, [14, 13, 12, 11], 10)
            ])
        else:
            # replace spatial layer
            self.replace_spatial_gcn = fully_connected_layer(f_1, [20, 15], 10)

        if get_attribute("use_STBlock"):
            self.temporal_embedding = StackedSTBlocks(
                [STBlock(f_2, 4),
                 STBlock(5, 5),
                 STBlock(10, 10)])
        else:
            # replace spatial-temporal layer
            self.replace_temporal_layer = fully_connected_layer(
                f_2, [5, 10], 20)

        # self.temporal_agg = nn.AvgPool1d(24)
        self.temporal_agg = nn.AvgPool1d(1)

        self.external_embedding = fully_connected_layer(
            f_3, [(f_3 * (4 - i) + 10 * i) // 4 for i in (1, 4)], 10)

        if get_attribute("use_Embedding"):
            self.output_layer = nn.Sequential(nn.ReLU(), nn.Linear(40, 1),
                                              nn.Sigmoid())
        else:
            self.output_layer = nn.Sequential(nn.ReLU(), nn.Linear(73, 1),
                                              nn.Sigmoid())
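A worked example of the external-embedding hidden sizes above, assuming f_3 = 25 (illustrative only; the real value comes from the config):

f_3 = 25
hidden = [(f_3 * (4 - i) + 10 * i) // 4 for i in (1, 4)]
print(hidden)  # [21, 10]: sizes interpolate linearly from f_3 down to the 10-d output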
Example No. 5
def create_loss(loss_type):
    if loss_type == 'bpr_loss':
        loss_func = BPRLoss()
    elif loss_type == 'mse_loss':
        loss_func = WeightMSELoss()
    elif loss_type == 'weight_mse_loss':
        loss_func = WeightMSELoss(
            weights=get_class_weights(get_attribute('data_path')))
    elif loss_type == "multi_label_soft_loss":
        loss_func = nn.MultiLabelSoftMarginLoss(reduction="mean")
    else:
        raise ValueError("Unknown loss function.")
    return loss_func
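A minimal usage sketch, assuming the loss type is read from the config as elsewhere in the repository:

# supported types: 'bpr_loss', 'mse_loss', 'weight_mse_loss', 'multi_label_soft_loss';
# 'weight_mse_loss' additionally derives class weights from the training data file
loss_func = create_loss(loss_type=get_attribute('loss_function'))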
Example No. 6
def get_truth_data(truth_data):
    """
    Args:
        truth_data: list, shape (baskets_num, items_num)
    Returns:
        truth: tensor, shape (baskets_num, items_total)
    """
    truth_list = []
    for basket in truth_data:
        one_hot_items = F.one_hot(basket,
                                  num_classes=get_attribute('items_total'))
        one_hot_basket, _ = torch.max(one_hot_items, dim=0)
        truth_list.append(one_hot_basket)
    truth = torch.stack(truth_list)

    return truth
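A toy sketch of the one-hot/max step above, assuming items_total = 5 and a basket containing items 1 and 3:

import torch
import torch.nn.functional as F

basket = torch.tensor([1, 3])
one_hot_items = F.one_hot(basket, num_classes=5)  # shape (2, 5)
multi_hot, _ = torch.max(one_hot_items, dim=0)    # tensor([0, 1, 0, 1, 0])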
Example No. 7
def get_data_loaders(k_order, batch_size):
    """
    Args:
        k_order: int
        batch_size: int

    Returns:
        data_loader: DataLoader
    """
    network_path = r'../data/beijing_roadnet.gpickle'
    node_attr_path = r'../data/edges_data.h5'
    accident_path = r'../data/accident.h5'
    weather_path = "../data/weather.h5"
    speed_path = "../data/all_grids_speed.h5"

    sf_mean, sf_std = np.array(
        get_attribute('spatial_features_mean')), np.array(
            get_attribute('spatial_features_std'))
    tf_mean, tf_std = np.array(
        get_attribute('temporal_features_mean')), np.array(
            get_attribute('temporal_features_std'))
    ef_mean, ef_std = np.array(
        get_attribute('external_features_mean')), np.array(
            get_attribute('external_features_std'))

    network = nx.read_gpickle(network_path)
    # XCoord  YCoord LENGTH  NUM_NODE
    nodes = pd.read_hdf(node_attr_path)
    # 'valid_time', 'temp', 'dewPt', 'rh', 'pressure', 'wspd', 'feels_like',  ......
    weather = pd.read_hdf(weather_path)

    speed = fill_speed(pd.read_hdf(speed_path))

    dls = dict()
    for key in ['train', 'validate', 'test']:
        # longitude   latitude  time  node_id  accident
        accident = pd.read_hdf(accident_path, key=key)
        dataset = AccidentDataset(k_order,
                                  network,
                                  nodes,
                                  accident,
                                  weather,
                                  speed,
                                  sf_scaler=(sf_mean, sf_std),
                                  tf_scaler=(tf_mean, tf_std),
                                  ef_scaler=(ef_mean, ef_std))
        dls[key] = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=False,
                              collate_fn=collate_fn,
                              num_workers=16)
    return dls
Example No. 8
        metric_list = [metric[key] for metric in test_metrics]
        mean_value = np.mean(metric_list)
        if key in ["MSE", "RMSE", "MAE"]:
            best_value = np.min(metric_list)
        else:
            best_value = np.max(metric_list)
        std_value = np.std(metric_list, ddof=1)
        metrics[f"mean_{key}"] = float(mean_value)
        metrics[f"best_{key}"] = float(best_value)
        metrics[f"std_{key}"] = float(std_value)

    scores = sorted(metrics.items(), key=lambda item: item[0], reverse=False)
    scores = {item[0]: item[1] for item in scores}

    scores_str = json.dumps(scores, indent=4)

    results_folder = f"../results/{get_attribute('data')}"
    if not os.path.exists(results_folder):
        os.makedirs(results_folder, exist_ok=True)

    save_path = f"{results_folder}/{get_attribute('model_name')}_result.json"
    with open(save_path, 'w') as file:
        file.write(scores_str)
    print(f'save path is {save_path}')
    print(f"metric -> {scores_str}")


if __name__ == '__main__':
    main(train_repeat_times=get_attribute("train_repeat_times"))
    sys.exit()
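A toy sketch of the per-metric aggregation in the snippet above, using made-up RMSE values from three repeated runs:

import numpy as np

metric_list = [0.30, 0.25, 0.35]                  # e.g. RMSE of three runs
mean_value = np.mean(metric_list)
best_value = np.min(metric_list)                  # "best" is the minimum for error metrics
std_value = np.std(metric_list, ddof=1)           # sample standard deviation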
Example No. 9
    for key in ['train', 'validate', 'test']:
        # longitude   latitude  time  node_id  accident
        accident = pd.read_hdf(accident_path, key=key)
        dataset = AccidentDataset(k_order,
                                  network,
                                  nodes,
                                  accident,
                                  weather,
                                  speed,
                                  sf_scaler=(sf_mean, sf_std),
                                  tf_scaler=(tf_mean, tf_std),
                                  ef_scaler=(ef_mean, ef_std))
        dls[key] = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=False,
                              collate_fn=collate_fn,
                              num_workers=16)
    return dls


if __name__ == "__main__":
    dls = get_data_loaders(get_attribute("K_hop"), get_attribute('batch_size'))
    for key in ["train", "validate", "test"]:
        for step, (g, spatial_features, temporal_features, external_features,
                   y) in tqdm(enumerate(dls[key])):
            # input_data, truth_data
            # if step == 0:
            #     print(g, spatial_features.shape, temporal_features.shape, external_features.shape, y.shape)
            pass
Example No. 10
def get_data_loaders(k_order, batch_size):
    """
    Args:
        k_order: int
        batch_size: int

    Returns:
        data_loader: DataLoader
    """
    network_path = r'../data/newyork_roadnet.gpickle'
    node_attr_path = r'../data/edges_data.h5'
    accident_path = r'../data/accident_10_2016.h5'
    # weather_path = '../data/weather.h5'
    weather_path = '../data/weather_2016.csv'
    # speed_path = '../data/all_grids_speed.h5'
    speed_path = '../data/all_grids_speed_data.json'

    sf_mean, sf_std = np.array(
        get_attribute('spatial_features_mean')), np.array(
            get_attribute('spatial_features_std'))
    tf_mean, tf_std = np.array(
        get_attribute('temporal_features_mean')), np.array(
            get_attribute('temporal_features_std'))
    ef_mean, ef_std = np.array(
        get_attribute('external_features_mean')), np.array(
            get_attribute('external_features_std'))

    network = nx.read_gpickle(network_path)
    # XCoord  YCoord LENGTH  NUM_NODE
    # nodes = pd.read_hdf(node_attr_path)
    nodes = pd.read_csv(node_attr_path)
    nodes['spatial_features'] = nodes.apply(
        lambda x: json.loads(x['spatial_features']), axis=1)
    # nodes.drop('spatial_features')
    # nodes['spatial_features'] = nodes['spatial_features_tmp']
    # nodes.drop('spatial_features_tmp')
    # 'valid_time', 'temp', 'dewPt', 'rh', 'pressure', 'wspd', 'feels_like',  ......
    # weather = pd.read_hdf(weather_path)
    weather = pd.read_csv(weather_path, index_col=0, parse_dates=True)

    # speed = fill_speed(pd.read_hdf(speed_path))
    # speed_data = pd.read_csv(speed_path, index_col=0, parse_dates=True)
    # speed = fill_speed_csv(speed_data)

    with open(speed_path) as speed_file:
        speed_data = json.loads(speed_file.readline())
    speed = fill_speed_json(speed_data)

    dls = dict()
    for key in ['train', 'validate', 'test']:
        # longitude   latitude  time  node_id  accident
        accident = pd.read_hdf(accident_path, key=key)
        dataset = AccidentDataset(k_order,
                                  network,
                                  nodes,
                                  accident,
                                  weather,
                                  speed,
                                  sf_scaler=(sf_mean, sf_std),
                                  tf_scaler=(tf_mean, tf_std),
                                  ef_scaler=(ef_mean, ef_std))
        dls[key] = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              drop_last=False,
                              collate_fn=collate_fn,
                              num_workers=0)
    return dls
Example No. 11
def train():
    model = create_model()
    # create the data loaders
    train_data_loader = get_data_loader(
        data_path=get_attribute('data_path'),
        data_type='train',
        batch_size=get_attribute('batch_size'),
        item_embedding_matrix=model.item_embedding)
    valid_data_loader = get_data_loader(
        data_path=get_attribute('data_path'),
        data_type='validate',
        batch_size=get_attribute('batch_size'),
        item_embedding_matrix=model.item_embedding)
    loss_func = create_loss(loss_type=get_attribute('loss_function'))

    # training
    model_folder = f"../save_model_folder/{get_attribute('data')}/{get_attribute('save_model_folder')}"
    tensorboard_folder = f"../runs/{get_attribute('data')}/{get_attribute('save_model_folder')}"

    shutil.rmtree(model_folder, ignore_errors=True)
    os.makedirs(model_folder, exist_ok=True)
    shutil.rmtree(tensorboard_folder, ignore_errors=True)
    os.makedirs(tensorboard_folder, exist_ok=True)

    if get_attribute("optim") == "Adam":
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=get_attribute("learning_rate"),
            weight_decay=get_attribute("weight_decay"))
    elif get_attribute("optim") == "SGD":
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=get_attribute("learning_rate"),
                                    momentum=0.9)
    else:
        raise NotImplementedError()

    train_model(model=model,
                train_data_loader=train_data_loader,
                valid_data_loader=valid_data_loader,
                loss_func=loss_func,
                epochs=get_attribute('epochs'),
                optimizer=optimizer,
                model_folder=model_folder,
                tensorboard_folder=tensorboard_folder)
Example No. 12
from utils.util import convert_to_gpu
from train.train_main import create_model
from utils.util import load_model

if __name__ == '__main__':
    model_path = f"../save_model_folder/{get_attribute('data')}/{get_attribute('save_model_folder')}" \
        f"/model_epoch_19.pkl"
    print(f'model path -> {model_path}')

    model = create_model()
    model = load_model(model, model_path)
    model = convert_to_gpu(model)
    print(model)

    test_data_loader = get_data_loader(
        data_path=get_attribute('data_path'),
        data_type='test',
        batch_size=get_attribute('batch_size'),
        item_embedding_matrix=model.item_embedding)

    print('===== Test predict result =====')
    scores = evaluate(model, test_data_loader)

    scores = sorted(scores.items(), key=lambda item: item[0], reverse=False)
    scores = {item[0]: item[1] for item in scores}

    scores_str = json.dumps(scores, indent=4)
    print(f'scores -> {scores_str}')

    model_folder = f"../results/{get_attribute('data')}"
    if not os.path.exists(model_folder):
Example No. 13
def convert_to_gpu(data):
    if get_attribute('cuda') != -1 and torch.cuda.is_available():
        data = data.cuda(get_attribute('cuda'))
    return data
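A minimal usage sketch: the helper works for both modules and tensors, since both expose `.cuda()`:

import torch

model = convert_to_gpu(torch.nn.Linear(4, 1))  # a module
batch = convert_to_gpu(torch.randn(8, 4))      # a tensor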
Example No. 14
from scipy import spatial
import pandas as pd
import networkx as nx
import random
from collections import defaultdict
from tqdm import tqdm

from utils.load_config import get_attribute
from transform_coord.coord_converter import convert_by_type

K = get_attribute("K_hop")

# positive-to-negative sample ratio
pos_neg_rate = 1
trainDataPercent = 0.7
validationDataPercent = 0.1

longitudeMin = 116.09608
longitudeMax = 116.71040
latitudeMin = 39.69086
latitudeMax = 40.17647

# coordinate conversion
longitudeMin, latitudeMin = convert_by_type(lng=longitudeMin,
                                            lat=latitudeMin,
                                            type="g2w")
longitudeMax, latitudeMax = convert_by_type(lng=longitudeMax,
                                            lat=latitudeMax,
                                            type="g2w")

accident_path = "/home/yule/桌面/traffic_accident_data/accident.csv"
Example No. 15
import torch
from utils.data_container import get_data_loaders
from tqdm import tqdm
from utils.util import convert_train_truth_to_gpu
from utils.util import convert_to_gpu

if __name__ == '__main__':
    model_path = f"../saves/spatial_temporal_external/DSTGCN/model_0.pkl"
    print(f'model path -> {model_path}')
    model = create_model()
    model.load_state_dict(torch.load(model_path)["model_state_dict"])
    print(f'model epoch -> {torch.load(model_path)["epoch"]}')
    model = convert_to_gpu(model)
    print(model)

    data_loaders = get_data_loaders(get_attribute('K_hop'),
                                    get_attribute('batch_size'))
    phase = "test"
    tqdm_loader = tqdm(enumerate(data_loaders[phase]))
    predictions, targets = list(), list()
    for step, (g, spatial_features, temporal_features, external_features,
               truth_data) in tqdm_loader:
        torch.zero_(external_features)

        features, truth_data = convert_train_truth_to_gpu(
            [spatial_features, temporal_features, external_features],
            truth_data)
        outputs = model(g, *features)
        outputs = torch.squeeze(
            outputs)  # squeeze [batch-size, 1] to [batch-size]
Example No. 16
def main(train_repeat_times):
    # create the data loaders
    data_loaders = get_data_loaders(get_attribute('K_hop'), get_attribute('batch_size'))

    test_metrics = []

    for train_time in range(train_repeat_times):

        print(f"train DSTGCN model for the {train_time}-th time ...")

        model = create_model()
        loss_func = create_loss(loss_type=get_attribute('loss_function'))

        # training
        model_folder = f"../saves/{get_attribute('data')}/{get_attribute('model_name')}"
        tensorboard_folder = f"../runs/{get_attribute('data')}/{get_attribute('model_name')}"

        shutil.rmtree(model_folder, ignore_errors=True)
        os.makedirs(model_folder, exist_ok=True)
        shutil.rmtree(tensorboard_folder, ignore_errors=True)
        os.makedirs(tensorboard_folder, exist_ok=True)

        if get_attribute("optim") == "Adam":
            optimizer = optim.Adam(model.parameters(),
                                   lr=get_attribute("learning_rate"),
                                   weight_decay=get_attribute("weight_decay"))
        elif get_attribute("optim") == "SGD":
            optimizer = optim.SGD(model.parameters(),
                                  lr=get_attribute("learning_rate"),
                                  momentum=0.9)
        else:
            raise NotImplementedError()

        num_processes = 10
        model.share_memory()
        processes = []
        for i in range(num_processes):
            process = mp.Process(target=train_model, args=(model, data_loaders, loss_func, optimizer, model_folder, tensorboard_folder, i))
            process.start()
            processes.append(process)
        for process in processes:
            process.join()
        # test_metric = train_model(model=model,
        #                           data_loaders=data_loaders,
        #                           loss_func=loss_func,
        #                           optimizer=optimizer,
        #                           model_folder=model_folder,
        #                           tensorboard_folder=tensorboard_folder,
        #                           pid=1)

        # test_metrics.append(test_metric)

    # MSE, RMSE, MAE, PCC, P-VALUE, PRECISION, RECALL, F1-SCORE, AUC
    metrics = {}
    for key in test_metrics[0].keys():
        metric_list = [metric[key] for metric in test_metrics]
        mean_value = np.mean(metric_list)
        if key in ["MSE", "RMSE", "MAE"]:
            best_value = np.min(metric_list)
        else:
            best_value = np.max(metric_list)
        std_value = np.std(metric_list, ddof=1)
        metrics[f"mean_{key}"] = float(mean_value)
        metrics[f"best_{key}"] = float(best_value)
        metrics[f"std_{key}"] = float(std_value)

    scores = sorted(metrics.items(), key=lambda item: item[0], reverse=False)
    scores = {item[0]: item[1] for item in scores}

    scores_str = json.dumps(scores, indent=4)

    results_folder = f"../results/{get_attribute('data')}"
    if not os.path.exists(results_folder):
        os.makedirs(results_folder, exist_ok=True)

    save_path = f"{results_folder}/{get_attribute('model_name')}_result.json"
    with open(save_path, 'w') as file:
        file.write(scores_str)
    print(f'save path is {save_path}')
    print(f"metric -> {scores_str}")
Example No. 17
def train_model(model: nn.Module,
                data_loaders: Dict[str, DataLoader],
                loss_func: callable,
                optimizer,
                model_folder: str,
                tensorboard_folder: str,
                pid: int):

    phases = ['train', 'validate', 'test']

    writer = SummaryWriter(tensorboard_folder)
    num_epochs = get_attribute('epochs')

    since = time.perf_counter()

    model = convert_to_gpu(model)
    loss_func = convert_to_gpu(loss_func)

    save_dict, best_f1_score = {'model_state_dict': copy.deepcopy(model.state_dict()), 'epoch': 0}, 0

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.5, patience=2, threshold=1e-3, min_lr=1e-6)
    test_metric = None
    try:
        for epoch in range(num_epochs):

            running_loss, running_metrics = {phase: 0.0 for phase in phases}, {phase: dict() for phase in phases}
            save_validate_this_epoch = False
            for phase in phases:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()

                steps, predictions, targets = 0, list(), list()
                tqdm_loader = tqdm(enumerate(data_loaders[phase]))
                for step, (g, spatial_features, temporal_features, external_features, truth_data) in tqdm_loader:

                    if list(external_features.size())[0] != get_attribute("batch_size"):
                        continue

                    if not get_attribute("use_spatial_features"):
                        torch.zero_(spatial_features)
                    if not get_attribute("use_temporal_features"):
                        torch.zero_(temporal_features)
                    if not get_attribute("use_external_features"):
                        torch.zero_(external_features)

                    features, truth_data = convert_train_truth_to_gpu(
                        [spatial_features, temporal_features, external_features], truth_data)

                    with torch.set_grad_enabled(phase == 'train'):
                        _outputs = model(g, *features)
                        outputs = torch.squeeze(_outputs)  # squeeze [batch-size, 1] to [batch-size]
                        loss = loss_func(truth=truth_data, predict=outputs)
                        if phase == 'train':
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()

                    targets.append(truth_data.cpu().numpy())
                    with torch.no_grad():
                        predictions.append(outputs.cpu().detach().numpy())

                    running_loss[phase] += loss.item() * truth_data.size(0)  # .item() avoids retaining the graph
                    steps += truth_data.size(0)

                    tqdm_loader.set_description(
                        f'{pid:2} pid: {phase:8} epoch: {epoch:3}, {phase:8} loss: {running_loss[phase] / steps:3.6}')

                    # For the issue that the CPU memory increases while training. DO NOT know why, but it works.
                    torch.cuda.empty_cache()

                print(f'{phase} metric ...')
                _cp = np.concatenate(predictions)
                _ct = np.concatenate(targets)
                scores = evaluate(_cp, _ct)
                running_metrics[phase] = scores
                print(scores)

                if phase == 'validate' and scores['F1-SCORE'] > best_f1_score:
                    best_f1_score = scores['F1-SCORE']
                    save_validate_this_epoch = True
                    save_dict.update(model_state_dict=copy.deepcopy(model.state_dict()),
                                     epoch=epoch,
                                     optimizer_state_dict=copy.deepcopy(optimizer.state_dict()))
                    print(f"save model as {model_folder}/model_{epoch}.pkl")
                    save_model(f"{model_folder}/model_{epoch}.pkl", **save_dict)

            scheduler.step(running_loss['train'])

            if save_validate_this_epoch:
                test_metric = running_metrics["test"].copy()

            for metric in running_metrics['train'].keys():
                writer.add_scalars(metric, {
                    f'{phase} {metric}': running_metrics[phase][metric] for phase in phases},
                                   global_step=epoch)
            writer.add_scalars('Loss', {
                f'{phase} loss': running_loss[phase] / len(data_loaders[phase].dataset) for phase in phases},
                               global_step=epoch)
    finally:

        time_elapsed = time.perf_counter() - since
        print(f"cost {time_elapsed} seconds")

        save_model(f"{model_folder}/best_model.pkl", **save_dict)

    return test_metric
Example No. 18
def collate_set_across_user(batch_data):
    """
    Args:
        batch_data: list, shape (batch_size, XXX)

    Returns:
        graph:
        train_data: list, shape (batch_size, baskets_num - 1, items_num)
        truth_data: list of tensors, shape (batch_size, items_total) or (batch_size, baskets_num - 1, items_total)
    """
    # g, nodes_feature, edges_weight, nodes, user_data
    # zip * -> unpack
    ret = list()
    for idx, item in enumerate(zip(*batch_data)):
        # assert type(item) == tuple
        if isinstance(item[0], dgl.DGLGraph):
            ret.append(dgl.batch(item))
        elif isinstance(item[0], torch.Tensor):
            if idx == 2:
                # pad edges_weight sequence in time dimension batch, (T, N*N)
                # (T_max, N*N)
                max_length = max([data.shape[0] for data in item])
                edges_weight, lengths = list(), list()
                for data in item:
                    if max_length != data.shape[0]:
                        edges_weight.append(
                            torch.cat(
                                (data,
                                 torch.stack([
                                     torch.eye(int(data.shape[1]**
                                                   0.5)).flatten()
                                     for _ in range(max_length - data.shape[0])
                                 ],
                                             dim=0)),
                                dim=0))
                    else:
                        edges_weight.append(data)
                    lengths.append(data.shape[0])
                # (T_max, N_1*N_1 + N_2*N_2 + ... + N_b*N_b)
                ret.append(torch.cat(edges_weight, dim=1))
                # (batch, )
                ret.append(torch.tensor(lengths))
            else:
                # nodes_feature -> (N_1 + N_2 + ... + N_b, item_embedding) or nodes -> (N_1 + N_2 + ... + N_b, )
                ret.append(torch.cat(item, dim=0))
        elif isinstance(item[0], list):
            data_list = item
        else:
            raise ValueError(
                f'batch must contain graphs, tensors or lists; found {type(item[0])}')

    truth_data = get_truth_data([dt[-1] for dt in data_list])
    ret.append(truth_data)

    # tensor (batch, items_total), for frequency calculation
    users_frequency = np.zeros([len(batch_data), get_attribute('items_total')])
    for idx, baskets in enumerate(data_list):
        for basket in baskets:
            for item in basket:
                users_frequency[idx, item] = users_frequency[idx, item] + 1
    users_frequency = normalize(users_frequency, axis=1, norm='max')
    ret.append(torch.Tensor(users_frequency))

    # (g, nodes_feature, edges_weight, lengths, nodes, truth_data, individual_frequency)
    return tuple(ret)
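A toy sketch of the edges_weight padding branch above, assuming N = 2 nodes per graph and a user with only T = 1 snapshot being padded up to T_max = 3 with flattened identity matrices:

import torch

short = torch.arange(4.0).reshape(1, 4)                        # (T=1, N*N=4)
pad = torch.stack([torch.eye(2).flatten() for _ in range(2)],  # two identity fillers
                  dim=0)                                       # (2, 4)
padded = torch.cat((short, pad), dim=0)                        # (T_max=3, N*N=4)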