Пример #1
0
def ctgcn_structural_embedding(dataset, learning_type='unsupervise'):
    """Run CTGCN-S (structural) embedding training over a dataset.

    All paths are resolved from the current working directory:

    * ``../data/<dataset>/CTGCN/ctgcn_cores`` -- its entry count defines the
      number of timesteps (``max_time_num``) and it is handed to
      ``DataLoader.get_core_adj_list``.
    * ``../data/<dataset>/1.format`` -- handed to
      ``DataLoader.get_degree_feature_list``.
    * ``../data/<dataset>/nodes_set/nodes.csv`` -- header-less node list.
    * ``../data/<dataset>/nodes_set/trans_label.csv`` -- tab-separated file
      with a ``label`` column; read only in supervised mode.

    The timesteps are processed in windows of ``duration`` (fixed to 1 here);
    a fresh ``CTGCN`` model and trainer are built for every window.

    Args:
        dataset: Name of the dataset directory under ``../data``.
        learning_type: ``'unsupervise'`` or ``'supervise'``.

    Raises:
        AttributeError: If ``learning_type`` is not one of the two supported
            values. Raised before any file access or data loading.
    """
    # Fail fast: reject a bad mode before touching the file system.
    if learning_type not in ('unsupervise', 'supervise'):
        raise AttributeError('Unsupported learning type!')
    supervised = learning_type == 'supervise'

    base_path = os.path.abspath(
        os.path.join(os.getcwd(), '../data/' + dataset + '/CTGCN'))
    origin_folder = os.path.join('..', '1.format')
    origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    embedding_folder = os.path.join('..', '2.embedding/CTGCN_S')
    core_base_path = os.path.abspath(os.path.join(base_path, 'ctgcn_cores'))
    node_path = os.path.abspath(
        os.path.join(base_path, os.path.join('..', 'nodes_set/nodes.csv')))

    duration = 1

    max_time_num = len(os.listdir(core_base_path))
    # The node list never changes between windows, so build it only once.
    node_list = pd.read_csv(node_path, names=['node'])['node'].tolist()

    data_loader = DataLoader(node_list, max_time_num)
    print('max time num: ', max_time_num)

    t1 = time.time()
    print('start CTGCN_S embedding on ' + dataset)

    if supervised:
        label_path = os.path.abspath(
            os.path.join(base_path,
                         os.path.join('..', 'nodes_set/trans_label.csv')))
        label_list = pd.read_csv(label_path, sep='\t')['label'].values
        # Labels are used as class indices, hence max + 1 output units.
        class_num = label_list.max() + 1

    # Training options shared by both learning modes.
    train_options = dict(single_output=False,
                         epoch=20,
                         batch_size=4096 * 8,
                         lr=0.001,
                         weight_decay=5e-4,
                         model_file='ctgcn_s',
                         embedding_type='structure',
                         export=True)

    for idx in range(0, max_time_num, duration):
        print('idx = ', idx)
        # The last window may hold fewer than ``duration`` timesteps.
        time_num = min(duration, max_time_num - idx)
        adj_list = data_loader.get_core_adj_list(core_base_path,
                                                 start_idx=idx,
                                                 duration=time_num)
        x_list, max_degree, _ = data_loader.get_degree_feature_list(
            origin_base_path, start_idx=idx, duration=time_num)

        ctgcn_model = CTGCN(input_dim=max_degree,
                            hidden_dim=500,
                            output_dim=128,
                            trans_num=3,
                            diffusion_num=1,
                            duration=time_num,
                            bias=True,
                            rnn_type='GRU',
                            version='S',
                            trans_version='N')

        if supervised:
            ctgcn_loss = SupervisedLoss()
            ctgcn_classifier = MLPClassifier(128,
                                             64,
                                             class_num,
                                             layer_num=1,
                                             duration=time_num,
                                             bias=True,
                                             trans_version='L')
            ctgcn = SupervisedEmbedding(base_path=base_path,
                                        origin_folder=origin_folder,
                                        embedding_folder=embedding_folder,
                                        node_list=node_list,
                                        model=ctgcn_model,
                                        loss=ctgcn_loss,
                                        classifier=ctgcn_classifier,
                                        max_time_num=max_time_num)
            ctgcn.learn_embedding(adj_list,
                                  x_list,
                                  label_list,
                                  start_idx=idx,
                                  classifier_file='ctgcn_s_cls',
                                  **train_options)
        else:
            ctgcn_loss = UnsupervisedLoss()
            ctgcn = UnsupervisedEmbedding(base_path=base_path,
                                          origin_folder=origin_folder,
                                          embedding_folder=embedding_folder,
                                          node_list=node_list,
                                          model=ctgcn_model,
                                          loss=ctgcn_loss,
                                          max_time_num=max_time_num)
            ctgcn.learn_embedding(adj_list,
                                  x_list,
                                  start_idx=idx,
                                  **train_options)

    t2 = time.time()
    print('finish CTGCN_S embedding! cost time: ', t2 - t1, ' seconds!')
Пример #2
0
def evolvegcn_embedding(dataset, learning_type='unsupervise'):
    """Run EvolveGCN (``egcn_type='EGCNH'``) embedding training on a dataset.

    All paths are resolved from the current working directory:

    * ``../data/<dataset>/1.format`` -- its entry count defines the number of
      timesteps (``max_time_num``); it is handed to
      ``DataLoader.get_date_adj_list`` and ``get_degree_feature_list``.
    * ``../data/<dataset>/nodes_set/nodes.csv`` -- header-less node list.
    * ``../data/<dataset>/CTGCN/evolvegcn_walk_pairs`` and
      ``../data/<dataset>/CTGCN/evolvegcn_node_freq`` -- used only in
      unsupervised mode to build the inputs of ``UnsupervisedLoss``.
    * ``../data/<dataset>/nodes_set/trans_label.csv`` -- tab-separated file
      with a ``label`` column; read only in supervised mode.

    Timesteps are processed in windows of up to 15 snapshots; a fresh
    ``EvolveGCN`` model and trainer are built for every window.

    Args:
        dataset: Name of the dataset directory under ``../data``.
        learning_type: ``'unsupervise'`` or ``'supervise'``.

    Raises:
        AttributeError: If ``learning_type`` is not one of the two supported
            values. Raised before any file access or data loading.
    """
    # Fail fast: reject a bad mode before touching the file system.
    if learning_type not in ('unsupervise', 'supervise'):
        raise AttributeError('Unsupported learning type!')
    supervised = learning_type == 'supervise'

    base_path = os.path.abspath(
        os.path.join(os.getcwd(), '../data/' + dataset + '/CTGCN'))
    origin_folder = os.path.join('..', '1.format')
    origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    embedding_folder = os.path.join('..', '2.embedding/EvolveGCNH')
    node_path = os.path.abspath(
        os.path.join(base_path, os.path.join('..', 'nodes_set/nodes.csv')))
    duration = 15

    max_time_num = len(os.listdir(origin_base_path))
    # The node list never changes between windows, so build it only once.
    node_list = pd.read_csv(node_path, names=['node'])['node'].tolist()

    data_loader = DataLoader(node_list, max_time_num)

    t1 = time.time()
    print('start EvolveGCN embedding!')

    if supervised:
        label_path = os.path.abspath(
            os.path.join(base_path,
                         os.path.join('..', 'nodes_set/trans_label.csv')))
        label_list = pd.read_csv(label_path, sep='\t')['label'].values
        # Labels are used as class indices, hence max + 1 output units.
        class_num = label_list.max() + 1
    else:
        walk_pair_base_path = os.path.abspath(
            os.path.join(base_path, 'evolvegcn_walk_pairs'))
        node_freq_base_path = os.path.abspath(
            os.path.join(base_path, 'evolvegcn_node_freq'))

    # Training options shared by both learning modes.
    train_options = dict(batch_size=4096 * 8,
                         lr=0.001,
                         weight_decay=5e-4,
                         model_file='evolvegcnh',
                         export=True)

    for idx in range(0, max_time_num, duration):
        print('idx = ', idx)
        # Clamp the window to the snapshots that actually remain, so the
        # loaders, model and classifier all agree on the window length
        # (same handling as ctgcn_structural_embedding).
        time_num = min(duration, max_time_num - idx)
        adj_list = data_loader.get_date_adj_list(origin_base_path,
                                                 start_idx=idx,
                                                 duration=time_num)
        x_list, max_degree, _ = data_loader.get_degree_feature_list(
            origin_base_path, start_idx=idx, duration=time_num)
        if not supervised:
            node_pair_list = data_loader.get_node_pair_list(
                walk_pair_base_path, start_idx=idx, duration=time_num)
            neg_freq_list = data_loader.get_neg_freq_list(
                node_freq_base_path, start_idx=idx, duration=time_num)

        evolvegcn_model = EvolveGCN(input_dim=max_degree,
                                    hidden_dim=128,
                                    output_dim=128,
                                    duration=time_num,
                                    egcn_type='EGCNH')

        if supervised:
            evolvegcn_loss = SupervisedLoss()
            evolvegcn_classifier = MLPClassifier(128,
                                                 64,
                                                 class_num,
                                                 layer_num=1,
                                                 duration=time_num,
                                                 bias=True,
                                                 trans_version='L')
            evolvegcn = SupervisedEmbedding(
                base_path=base_path,
                origin_folder=origin_folder,
                embedding_folder=embedding_folder,
                node_list=node_list,
                model=evolvegcn_model,
                loss=evolvegcn_loss,
                classifier=evolvegcn_classifier,
                max_time_num=max_time_num)
            evolvegcn.learn_embedding(adj_list,
                                      x_list,
                                      label_list,
                                      epoch=50,
                                      start_idx=idx,
                                      classifier_file='evolvegcnh_cls',
                                      **train_options)
        else:
            evolvegcn_loss = UnsupervisedLoss(neg_num=20,
                                              Q=20,
                                              node_pair_list=node_pair_list,
                                              neg_freq_list=neg_freq_list)
            evolvegcn = UnsupervisedEmbedding(
                base_path=base_path,
                origin_folder=origin_folder,
                embedding_folder=embedding_folder,
                node_list=node_list,
                model=evolvegcn_model,
                loss=evolvegcn_loss,
                max_time_num=max_time_num)
            evolvegcn.learn_embedding(adj_list,
                                      x_list,
                                      epoch=5,
                                      start_idx=idx,
                                      **train_options)
            # NOTE(review): unsupervised mode stops after the first window,
            # as in the original code; the supervised branch has no such
            # stop. Looks like a leftover experiment limit -- confirm whether
            # the remaining windows should be trained as well.
            break

    t2 = time.time()
    print('finish EvolveGCN embedding! cost time: ', t2 - t1, ' seconds!')