示例#1
0
def train(args):
    """Train ``Net`` on the MovieLens rating graph with MXNet/Gluon.

    The dataset named by ``args.data_name`` is loaded, the network is built
    and initialized on ``args.ctx``, and full-graph training runs for
    iterations ``1 .. args.train_max_iter - 1``.  Every
    ``args.train_log_interval`` iterations the running training metrics are
    logged; every ``args.train_valid_interval`` iterations the model is
    evaluated on the validation segment and, when that improves, on the test
    segment too.  After more than ``args.train_decay_patience`` non-improving
    validations the learning rate is decayed (never below
    ``args.train_min_lr``); training stops early once
    ``args.train_early_stopping_patience`` is exceeded at the minimum rate.

    Args:
        args: Configuration namespace.  It is mutated in place: ``src_key``,
            ``dst_key``, ``src_in_units``, ``dst_in_units`` and ``nratings``
            are filled in from the loaded dataset before ``Net`` is built.

    Returns:
        None.  Metrics are passed to ``MetricLogger`` instances created with
        CSV paths under ``args.save_dir`` and a summary line is printed.
    """
    dataset = MovieLens(args.data_name,
                        args.ctx,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm)
    print("Loading data finished ...\n")

    args.src_key = dataset.name_user
    args.dst_key = dataset.name_movie
    args.src_in_units = dataset.user_feature.shape[1]
    args.dst_in_units = dataset.movie_feature.shape[1]
    args.nratings = dataset.possible_rating_values.size

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    if args.gen_r_use_classification:
        nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                                ctx=args.ctx,
                                                dtype=np.float32)
        rating_loss_net = gluon.loss.SoftmaxCELoss()
    else:
        # Regression targets are standardized with the training statistics.
        rating_mean = dataset.train_rating_values.mean()
        rating_std = dataset.train_rating_values.std()
        rating_loss_net = gluon.loss.L2Loss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_rating_pairs = mx.nd.array(dataset.train_rating_pairs,
                                     ctx=args.ctx,
                                     dtype=np.int64)
    train_gt_ratings = mx.nd.array(dataset.train_rating_values,
                                   ctx=args.ctx,
                                   dtype=np.float32)
    if args.gen_r_use_classification:
        # Index of each training rating within possible_rating_values.  The
        # inputs do not change during training, so this is built once instead
        # of once per iteration.
        train_gt_label = mx.nd.array(np.searchsorted(
            dataset.possible_rating_values, dataset.train_rating_values),
                                     ctx=args.ctx,
                                     dtype=np.int32)

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(
                                         args.save_dir,
                                         'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(
                                        args.save_dir,
                                        'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    # Defined up front so the final summary cannot hit an unbound name when no
    # validation round ever ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    avg_gnorm = 0
    count_rmse = 0
    count_num = 0
    count_loss = 0

    print("Start training ...")
    for iter_idx in range(1, args.train_max_iter):
        # Rebuilt every iteration so a validation round that does not coincide
        # with a log round neither fails on an unbound name nor appends to a
        # stale message.
        logging_str = ""

        with mx.autograd.record():
            pred_ratings = net(dataset.train_graph, train_rating_pairs)
            if args.gen_r_use_classification:
                loss = rating_loss_net(pred_ratings, train_gt_label).mean()
            else:
                loss = rating_loss_net(
                    mx.nd.reshape(pred_ratings, shape=(-1, )),
                    (train_gt_ratings - rating_mean) / rating_std).mean()
            loss.backward()

        count_loss += loss.asscalar()
        gnorm = params_clip_global_norm(net.collect_params(),
                                        args.train_grad_clip, args.ctx)
        avg_gnorm += gnorm
        trainer.step(1.0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(
                gluon_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))

        # Accumulate the squared error of the training predictions, mapped
        # back to the rating scale.
        if args.gen_r_use_classification:
            # Expected rating under the predicted class distribution.
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                 nd_possible_rating_values.reshape(
                                     (1, -1))).sum(axis=1)
            rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
        else:
            rmse = mx.nd.square(
                pred_ratings.reshape((-1, )) * rating_std + rating_mean -
                train_gt_ratings).sum()
        count_rmse += rmse.asscalar()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # count_loss is never reset, so it spans exactly iter_idx
            # iterations; the CSV and the console now use the same divisor.
            avg_loss = count_loss / iter_idx
            # NOTE(review): this is a mean of squared errors with no square
            # root, although it is reported under the name "rmse" -- confirm
            # whether a sqrt is wanted before comparing with evaluate().
            train_rmse = count_rmse / count_num
            train_loss_logger.log(iter=iter_idx,
                                  loss=avg_loss,
                                  rmse=train_rmse)
            logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(
                iter_idx, avg_gnorm / args.train_log_interval, avg_loss,
                train_rmse)
            avg_gnorm = 0
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            if not logging_str:
                # Validation-only iteration: still identify the iteration.
                logging_str = "Iter={}".format(iter_idx)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and trainer.learning_rate <= args.train_min_lr:
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(
                        trainer.learning_rate * args.train_lr_decay_factor,
                        args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0
        if logging_str:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
示例#2
0
            'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
            format(best_epoch, best_valid_rmse, best_test_rmse))


if __name__ == '__main__':
    # Script entry point: read the configuration, load the dataset on the
    # CPU, publish the dataset-derived sizes on ``args`` and start one run.
    args = config()

    # ``args.gpu`` is a comma-separated list of integer device ids.
    devices = [int(device_id) for device_id in args.gpu.split(',')]
    n_gpus = len(devices)

    # For GCMC based on sampling, we require node has its own features.
    # Otherwise (node_id is the feature), the model can not scale
    dataset = MovieLens(
        args.data_name,
        'cpu',
        mix_cpu_gpu=args.mix_cpu_gpu,
        use_one_hot_fea=args.use_one_hot_fea,
        symm=args.gcn_agg_norm_symm,
        test_ratio=args.data_test_ratio,
        valid_ratio=args.data_valid_ratio,
    )
    print("Loading data finished ...\n")

    # Input widths of the user / movie features and the set of rating values.
    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    run_on_cpu = devices[0] == -1
    if run_on_cpu:
        # A leading device id of -1 selects the CPU path.
        run(0, 0, args, ['cpu'], dataset)
    elif n_gpus == 1:
        # Exactly one GPU id was given.
        run(0, n_gpus, args, devices, dataset)
def train(args):
    """Train ``Net`` on the MovieLens encoder/decoder graphs with MXNet/Gluon.

    The dataset named by ``args.data_name`` is loaded, the network is built
    and initialized on ``args.ctx``, and full-graph training with a softmax
    cross-entropy loss runs for iterations ``1 .. args.train_max_iter - 1``.
    Every ``args.train_log_interval`` iterations the running training metrics
    are logged; every ``args.train_valid_interval`` iterations the model is
    evaluated on the validation segment and, when that improves, the
    parameters are saved and the test segment is evaluated.  After more than
    ``args.train_decay_patience`` non-improving validations the learning rate
    is decayed (never below ``args.train_min_lr``); training stops early once
    ``args.train_early_stopping_patience`` is exceeded at the minimum rate.

    Args:
        args: Configuration namespace.  It is mutated in place:
            ``src_in_units``, ``dst_in_units`` and ``rating_vals`` are filled
            in from the loaded dataset before ``Net`` is built.

    Returns:
        None.  Metrics are passed to ``MetricLogger`` instances created with
        CSV paths under ``args.save_dir`` and a summary line is printed.
    """
    print(args)
    dataset = MovieLens(
        args.data_name,
        args.ctx,
        use_one_hot_fea=args.use_one_hot_fea,
        symm=args.gcn_agg_norm_symm,
        test_ratio=args.data_test_ratio,
        valid_ratio=args.data_valid_ratio,
    )
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type="in"), ctx=args.ctx)
    net.hybridize()
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                            ctx=args.ctx,
                                            dtype=np.float32)
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {"learning_rate": args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ["iter", "loss", "rmse"],
        ["%d", "%.4f", "%.4f"],
        os.path.join(args.save_dir, "train_loss%d.csv" % args.save_id),
    )
    valid_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "valid_loss%d.csv" % args.save_id),
    )
    test_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "test_loss%d.csv" % args.save_id),
    )

    ### declare the loss information
    best_valid_rmse = np.inf
    # Defined up front so the final summary cannot hit an unbound name when no
    # validation round ever ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    avg_gnorm = 0
    count_rmse = 0
    count_num = 0
    count_loss = 0

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        # Rebuilt every iteration so a validation round that does not coincide
        # with a log round neither fails on an unbound name nor appends to a
        # stale message.
        logging_str = ""

        # The first three iterations are excluded from the timing statistics.
        if iter_idx > 3:
            t0 = time.time()
        with mx.autograd.record():
            pred_ratings = net(
                dataset.train_enc_graph,
                dataset.train_dec_graph,
                dataset.user_feature,
                dataset.movie_feature,
            )
            loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
            loss.backward()

        count_loss += loss.asscalar()
        gnorm = params_clip_global_norm(net.collect_params(),
                                        args.train_grad_clip, args.ctx)
        avg_gnorm += gnorm
        trainer.step(1.0)
        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(
                gluon_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, "net%d.txt" % args.save_id)))

        # Expected rating under the predicted class distribution, then the
        # summed squared error against the ground-truth ratings.
        real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                             nd_possible_rating_values.reshape(
                                 (1, -1))).sum(axis=1)
        rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
        count_rmse += rmse.asscalar()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # count_loss is never reset, so it spans exactly iter_idx
            # iterations; the CSV and the console now use the same divisor.
            avg_loss = count_loss / iter_idx
            # NOTE(review): this is a mean of squared errors with no square
            # root, although it is reported under the name "rmse" -- confirm
            # whether a sqrt is wanted before comparing with evaluate().
            train_rmse = count_rmse / count_num
            # No timing sample exists yet during the warm-up iterations;
            # report nan without triggering numpy's empty-slice warning.
            avg_time = np.average(dur) if dur else float("nan")
            train_loss_logger.log(iter=iter_idx,
                                  loss=avg_loss,
                                  rmse=train_rmse)
            logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx,
                avg_gnorm / args.train_log_interval,
                avg_loss,
                train_rmse,
                avg_time,
            )
            avg_gnorm = 0
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment="valid")
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            if not logging_str:
                # Validation-only iteration: still identify the iteration.
                logging_str = "Iter={}".format(iter_idx)
            logging_str += ",\tVal RMSE={:.4f}".format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                net.save_parameters(filename=os.path.join(
                    args.save_dir, "best_valid_net{}.params".format(
                        args.save_id)))
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment="test")
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ", Test RMSE={:.4f}".format(test_rmse)
            else:
                no_better_valid += 1
                if (no_better_valid > args.train_early_stopping_patience
                        and trainer.learning_rate <= args.train_min_lr):
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(
                        trainer.learning_rate * args.train_lr_decay_factor,
                        args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0
        if logging_str:
            print(logging_str)
    print("Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}".
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
示例#4
0
def train(args):
    """Train ``Net`` on the MovieLens encoder/decoder graphs with PyTorch.

    The dataset named by ``args.data_name`` is loaded, the network is moved
    to ``args.device``, and full-graph training with a cross-entropy loss
    runs for iterations ``1 .. args.train_max_iter - 1``.  Every
    ``args.train_log_interval`` iterations the running training metrics are
    logged; every ``args.train_valid_interval`` iterations the model is
    evaluated on the validation segment and, when that improves, on the test
    segment too.  After more than ``args.train_decay_patience`` non-improving
    validations the learning rate is decayed (never below
    ``args.train_min_lr``); training stops early once
    ``args.train_early_stopping_patience`` is exceeded at the minimum rate.

    Args:
        args: Configuration namespace.  It is mutated in place:
            ``src_in_units``, ``dst_in_units`` and ``rating_vals`` are filled
            in from the loaded dataset before ``Net`` is built.

    Returns:
        None.  Metrics are passed to ``MetricLogger`` instances created with
        CSV paths under ``args.save_dir`` and a summary line is printed.
    """
    print(args)
    dataset = MovieLens(args.data_name,
                        args.device,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio,
                        valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(
                                         args.save_dir,
                                         'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(
                                        args.save_dir,
                                        'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    # Defined up front so the final summary cannot hit an unbound name when no
    # validation round ever ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        # Rebuilt every iteration so a validation round that does not coincide
        # with a log round neither fails on an unbound name nor appends to a
        # stale message.
        logging_str = ""

        # The first three iterations are excluded from the timing statistics.
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
                           dataset.user_feature, dataset.movie_feature)
        loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(
                torch_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))

        # Metric bookkeeping only: no gradient is needed, so avoid building an
        # autograd graph on top of pred_ratings.
        with th.no_grad():
            # Expected rating under the predicted class distribution.
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(dim=1)
            rmse = ((real_pred_ratings - train_gt_ratings)**2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # count_loss is never reset, so it spans exactly iter_idx
            # iterations; the CSV and the console now use the same divisor.
            avg_loss = count_loss / iter_idx
            # NOTE(review): this is a mean of squared errors with no square
            # root, although it is reported under the name "rmse" -- confirm
            # whether a sqrt is wanted before comparing with evaluate().
            train_rmse = count_rmse / count_num
            # No timing sample exists yet during the warm-up iterations;
            # report nan without triggering numpy's empty-slice warning.
            avg_time = np.average(dur) if dur else float('nan')
            train_loss_logger.log(iter=iter_idx,
                                  loss=avg_loss,
                                  rmse=train_rmse)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, avg_loss, train_rmse, avg_time)
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            if not logging_str:
                # Validation-only iteration: still identify the iteration.
                logging_str = "Iter={}".format(iter_idx)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        # Push the decayed rate into every parameter group.
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if logging_str:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
示例#5
0
文件: train.py 项目: ghk829/dgl
def train(args):
    """Train ``Net`` with a positive-vs-sampled-negative ranking objective.

    Loads either ``JukeboxDataset`` (when ``args.data_name == 'jukebox'``) or
    ``MovieLens``, moves the network and graphs to ``args.device`` and runs
    iterations ``1 .. args.train_max_iter - 1``.  In the first iteration one
    negative item is sampled for every training interaction; every iteration
    then walks over those (user, positive, negative) triples in mini-batches
    of ``2**14`` and optimizes the ranking loss plus a small norm penalty.
    Every ``args.train_valid_interval`` iterations the model is evaluated on
    the validation segment and, when that improves, on the test segment.
    Learning-rate decay and early stopping follow ``args.train_decay_patience``,
    ``args.train_lr_decay_factor``, ``args.train_min_lr`` and
    ``args.train_early_stopping_patience``.

    Args:
        args: Configuration namespace.  It is mutated in place:
            ``src_in_units``, ``dst_in_units`` and ``rating_vals`` are filled
            in from the loaded dataset before ``Net`` is built.

    Returns:
        None.  Metrics are passed to ``MetricLogger`` instances created with
        CSV paths under ``args.save_dir`` and a summary line is printed.
    """
    # Function-local import (as in the original code), now executed once per
    # call instead of once per training iteration.
    from tqdm import tqdm

    print(args)
    if args.data_name == 'jukebox':
        dataset = JukeboxDataset('dataset/listen_count.txt')
    else:
        dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare the logger
    # Each logger gets exactly one format per column; the surplus formats the
    # original passed have been dropped (the leading ones are unchanged).
    train_loss_logger = MetricLogger(['iter', 'loss'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'ndcg', 'precision', 'recall', 'fscore', 'support'],
                                     ['%d', '%.4f', '%.4f', '%s', '%s', '%s'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter'], ['%d'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    # Defined up front so the final summary cannot hit an unbound name when no
    # validation round ever ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    # Placeholders: no training RMSE is accumulated in this loop, so the
    # "rmse" field of the console line always shows 1.0.
    count_rmse = 1
    count_num = 1
    count_loss = 0
    count_step = 0

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    # Validation reuses the training encoder graph.
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    def batch(iterable, n=1):
        """Yield consecutive lists of at most ``n`` items from ``iterable``."""
        current_batch = []
        for item in iterable:
            current_batch.append(item)
            if len(current_batch) == n:
                yield current_batch
                current_batch = []
        if current_batch:
            yield current_batch

    # Weight of the norm penalty added to the ranking loss.
    lmbd = 1e-5
    batches = []
    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        # Rebuilt every iteration so a validation round that does not coincide
        # with a log round neither fails on an unbound name nor appends to a
        # stale message.
        logging_str = ""

        # The first three iterations are excluded from the timing statistics.
        if iter_idx > 3:
            t0 = time.time()
        net.train()

        ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                   dataset.user_feature, dataset.movie_feature)
        if iter_idx == 1:
            # Negatives are sampled once and reused by every later iteration.
            batches = _sample_negative_triples(dataset)

        for bt in tqdm(list(batch(batches, 2**14))):
            uidfeat = ufeat[[e[0] for e in bt]]
            posfeat = ifeat[[e[1] for e in bt]]
            negfeat = ifeat[[e[2] for e in bt]]

            # NOTE(review): if the features are 2-D, ``u @ Q @ item.T`` is a
            # (batch x batch) matrix scoring every user against every item of
            # the batch rather than only the matched pairs -- confirm this is
            # intended.
            pos_scores = uidfeat @ net.decoder.Q @ posfeat.T
            neg_scores = uidfeat @ net.decoder.Q @ negfeat.T

            # Same value as the original ``-1 * (-BCE + logsigmoid.mean())``.
            mf_loss = nn.BCELoss()(th.sigmoid(pos_scores), th.ones_like(pos_scores)) \
                - nn.LogSigmoid()(pos_scores - neg_scores).mean()

            regularizer = (th.norm(uidfeat, dim=1)**2).mean() + (th.norm(posfeat, dim=1)**2).mean() \
                + (th.norm(negfeat, dim=1)**2).mean() + (th.norm(net.decoder.Q))
            emb_loss = lmbd * regularizer
            print('mf_loss', mf_loss)
            print('emb_loss', emb_loss)
            optimizer.zero_grad()
            loss = mf_loss + emb_loss
            count_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()
            # Re-encode with the updated parameters for the next mini-batch.
            ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                       dataset.user_feature, dataset.movie_feature)
            count_step += 1

        print('train done')

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        if iter_idx % args.train_log_interval == 0:
            # count_loss holds exactly count_step summands; guard against a
            # run in which no optimization step happened at all.
            avg_loss = count_loss / max(count_step, 1)
            # No timing sample exists yet during the warm-up iterations;
            # report nan without triggering numpy's empty-slice warning.
            avg_time = np.average(dur) if dur else float('nan')
            train_loss_logger.log(iter=iter_idx, loss=avg_loss)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, avg_loss, count_rmse/count_num, avg_time)
            count_rmse = 1
            count_num = 1

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            precision, recall, fscore, support = evaluate_others(args=args, net=net, dataset=dataset, segment='valid')
            ndcg = evaluate_ndcg(args=args, net=net, dataset=dataset, segment='valid')
            print('ndcg', ndcg, 'precision', precision, 'recall', recall, 'fscore', fscore, 'support', support)
            valid_loss_logger.log(iter=iter_idx, ndcg=ndcg, precision=precision, recall=recall, fscore=fscore,
                                  support=support)
            if not logging_str:
                # Validation-only iteration: still identify the iteration.
                logging_str = "Iter={}".format(iter_idx)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        # Push the decayed rate into every parameter group.
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if logging_str:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()


def _sample_negative_triples(dataset):
    """Return one (user, positive item, negative item) triple per train row.

    For every row of ``dataset.train`` a random item the row's user has not
    interacted with is drawn; ids are translated through
    ``dataset.global_user_id_map`` / ``dataset.global_item_id_map``.
    Assumes ``dataset.train`` is a pandas DataFrame with ``user_id``,
    ``item_id`` and ``rating`` columns -- TODO confirm against the dataset
    classes.
    """
    from tqdm import tqdm

    unique_item_list = dataset.train['item_id'].unique().tolist()
    # Group once instead of filtering the whole frame for every row; sets make
    # the membership test O(1).
    observed_by_user = {
        user: set(items.unique().tolist())
        for user, items in dataset.train.groupby('user_id')['item_id']
    }

    triples = []
    for row in tqdm(list(dataset.train.itertuples())):
        user, item = row.user_id, row.item_id
        observed = observed_by_user.get(user, ())
        if len(observed) >= len(unique_item_list):
            # The user has seen every item: no negative exists and rejection
            # sampling would never terminate, so the row is skipped.
            continue
        while True:
            sample = random.choice(unique_item_list)
            if sample not in observed:
                break
        triples.append((dataset.global_user_id_map[user],
                        dataset.global_item_id_map[item],
                        dataset.global_item_id_map[sample]))
    return triples
示例#6
0
def train(args):
    """Train the rating-prediction network on MovieLens using edge minibatches.

    Loads the dataset described by ``args``, builds and initializes ``Net``,
    then for every iteration shuffles all training edges and processes them in
    minibatches of ``args.minibatch_size``. Every ``args.train_valid_interval``
    iterations the validation split is evaluated; whenever it improves, the
    test split is evaluated too. When validation stops improving the learning
    rate is decayed down to ``args.train_min_lr`` and training finally stops.

    Args:
        args: Run configuration object. The attributes ``src_in_units``,
            ``dst_in_units`` and ``rating_vals`` are written onto it before
            the network is constructed.

    Returns:
        None. Progress is printed and written through the three
        ``MetricLogger`` instances created under ``args.save_dir``.
    """
    print(args)
    dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'idx', 'loss', 'rmse'], ['%d', '%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    # Pre-bound so the final summary cannot fail when validation never ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1

    enc_graph = dataset.train_enc_graph
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32)
    # Scratch lookup tables: global node id -> row position inside the
    # current minibatch subgraph (overwritten for every minibatch).
    g_user_fea = mx.nd.zeros((dataset.num_user,))
    g_movie_fea = mx.nd.zeros((dataset.num_movie,))
    train_truths = dataset.train_truths
    train_labels = dataset.train_labels

    # The edge count and the un-shuffled edge id range never change between
    # iterations, so compute them once.
    num_edges = train_truths.shape[0]
    num_batches = (num_edges + args.minibatch_size - 1) // args.minibatch_size
    seed = mx.nd.arange(num_edges, dtype='int64')

    print("Start training ...")
    dur = []

    for iter_idx in range(1, args.train_max_iter):
        # The first three iterations are excluded from the timing average.
        if iter_idx > 3:
            t0 = time.time()

        edges = mx.nd.shuffle(seed)
        # each iteration will go through all edges
        for sample_idx in range(num_batches):
            batch_start = sample_idx * args.minibatch_size
            batch_end = min(batch_start + args.minibatch_size, num_edges)
            edge_ids = edges[batch_start:batch_end]
            head_ids, tail_ids = dataset.train_dec_graph.find_edges(edge_ids.asnumpy())

            head_subgraphs = {}
            tail_subgraphs = {}
            head_node_ids = np.unique(head_ids.asnumpy())
            tail_node_ids = np.unique(tail_ids.asnumpy())
            # Edge types are read in (forward, reverse) pairs, one pair per
            # rating value.
            for i, _ in enumerate(args.rating_vals):
                t = enc_graph.canonical_etypes[i * 2]
                rev_t = enc_graph.canonical_etypes[i * 2 + 1]

                head_in_edges = enc_graph.in_edges(head_node_ids, 'eid', etype=rev_t)
                tail_in_edges = enc_graph.in_edges(tail_node_ids, 'eid', etype=t)

                if head_in_edges.shape[0] > 0:
                    head_subgraphs[rev_t] = head_in_edges

                if tail_in_edges.shape[0] > 0:
                    tail_subgraphs[t] = tail_in_edges

            head_subgraph = enc_graph.edge_subgraph(head_subgraphs, preserve_nodes=True)
            tail_subgraph = enc_graph.edge_subgraph(tail_subgraphs, preserve_nodes=True)
            edge_ids = edge_ids.as_in_context(args.ctx)
            true_relation_ratings = train_truths[edge_ids]
            true_relation_labels = train_labels[edge_ids]

            head_NID = head_subgraph.nodes['user'].data[dgl.NID]
            tail_NID = tail_subgraph.nodes['movie'].data[dgl.NID]

            g_user_fea[head_NID] = mx.nd.arange(head_NID.shape[0], dtype='int32')
            g_movie_fea[tail_NID] = mx.nd.arange(tail_NID.shape[0], dtype='int32')

            true_head_idx = g_user_fea[head_ids].as_in_context(args.ctx)
            true_tail_idx = g_movie_fea[tail_ids].as_in_context(args.ctx)

            with mx.autograd.record():
                pred_ratings = net(head_subgraph, tail_subgraph,
                                   true_head_idx, true_tail_idx)
                loss = rating_loss_net(pred_ratings, true_relation_labels).mean()
                loss.backward()
            gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx)
            trainer.step(1.0, ignore_stale_grad=True)
            # Expected rating = softmax probabilities weighted by the rating values.
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                 nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
            rmse = mx.nd.square(real_pred_ratings - true_relation_ratings).mean().asscalar()
            rmse = np.sqrt(rmse)
            loss = loss.asscalar()
            if sample_idx % 100 == 0:
                train_loss_logger.log(iter=iter_idx, idx=sample_idx,
                                      loss=loss, rmse=rmse)
                print("Iter={}, sample_idx={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(iter_idx,
                    sample_idx, gnorm, loss, rmse))
            gc.collect()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(gluon_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        # Rebuild the status line on every iteration: the validation branch
        # below appends to it even when the log and validation intervals do
        # not coincide. It is only printed on log-interval iterations.
        mean_dur = np.average(dur) if dur else float('nan')
        logging_str = "Iter={}, time={:.4f}".format(iter_idx, mean_dur)

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                # Stop only once the LR has already been decayed to its floor.
                if no_better_valid > args.train_early_stopping_patience\
                    and trainer.learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
示例#7
0
    cmd_input = 'python3 ' + ' '.join(sys.argv)
    with open(os.path.join(args.save_dir, 'cmd_input.txt'), 'a') as f:
        f.write(cmd_input)
        f.write("\n")
    print('Command line input: ' + cmd_input + ' is saved.')

    return args


if __name__ == '__main__':
    # Parse the run configuration, then seed Python's and NumPy's global RNGs.
    args = config()
    random.seed(args.seed)
    np.random.seed(args.seed)

    # The dataset is built once here and handed to every training process.
    movielens = MovieLens(args.data_name,
                          testing=args.testing,
                          test_ratio=args.data_test_ratio,
                          valid_ratio=args.data_valid_ratio)

    if args.n_gpus != 1:
        # One worker process per rank; each is started right after it is
        # created and all of them are joined afterwards.
        workers = []
        for rank in range(args.n_gpus):
            worker_args = (rank, args.n_gpus, args, args.devices, movielens)
            worker = mp.Process(target=train, args=worker_args)
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
    else:
        # Single-device run: train directly in this process as rank 0.
        train(0, args.n_gpus, args, args.devices, movielens)
示例#8
0
    if config.data == 'CDs':
        from data import Amazon
        data_set = Amazon.CDs()
    elif config.data == 'Books':
        from data import Amazon
        data_set = Amazon.Books()
    elif config.data == 'Children':
        from data import GoodReads
        data_set = GoodReads.Children()
    elif config.data == 'Comics':
        from data import GoodReads
        data_set = GoodReads.Comics()
    elif config.data == 'ML20M':
        from data import MovieLens
        data_set = MovieLens.ML20M()
    elif config.data == 'ML1M':
        from data import MovieLens
        data_set = MovieLens.ML1M()

    #generate datasets in the 80-20-CUT setting
    train_set, val_set, train_val_set, test_set, num_users, num_items = data_set.generate_dataset(
        index_shift=1)

    #generate datasets in the 3-LOS setting
    if config.setting == 'LOS':
        assert len(train_set) == len(val_set) and len(test_set) == len(
            train_set)

        for i in range(len(train_set)):
            user = train_set[i] + val_set[i] + test_set[i]
示例#9
0
def train(args):
    """Train an IGMC model on MovieLens and report the best evaluation RMSE.

    Builds the train and evaluation datasets and loaders, then runs
    ``args.train_epochs`` epochs. After every epoch the model is evaluated,
    the result is passed to the ``MetricLogger``, and the learning rate is
    multiplied by ``args.train_lr_decay_factor`` every
    ``args.train_lr_decay_step`` epochs. A one-line summary of the best epoch
    is printed and appended to ``log.txt`` in ``args.save_dir``.

    Args:
        args: Run configuration object providing the data, loader, model,
            optimizer and logging settings read below.

    Returns:
        None.
    """
    ### prepare data and set model
    movielens = MovieLens(args.data_name,
                          testing=args.testing,
                          test_ratio=args.data_test_ratio,
                          valid_ratio=args.data_valid_ratio)
    # Evaluate on the test split when args.testing is set, otherwise on the
    # validation split; everything else about the dataset is identical.
    if args.testing:
        eval_pairs = movielens.test_rating_pairs
        eval_values = movielens.test_rating_values
    else:
        eval_pairs = movielens.valid_rating_pairs
        eval_values = movielens.valid_rating_values
    test_dataset = MovieLensDataset(eval_pairs, eval_values,
                                    movielens.train_graph, args.hop,
                                    args.sample_ratio,
                                    args.max_nodes_per_hop)
    train_dataset = MovieLensDataset(movielens.train_rating_pairs,
                                     movielens.train_rating_values,
                                     movielens.train_graph, args.hop,
                                     args.sample_ratio, args.max_nodes_per_hop)

    train_loader = th.utils.data.DataLoader(train_dataset,
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            num_workers=args.num_workers,
                                            collate_fn=collate_movielens)
    test_loader = th.utils.data.DataLoader(test_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_movielens)

    model = IGMC(
        in_feats=(args.hop + 1) * 2,
        latent_dim=[32, 32, 32, 32],
        num_relations=5,  # movielens.num_rating,
        num_bases=4,
        regression=True,
        edge_dropout=args.edge_dropout,
        #  side_features=args.use_features,
        #  n_side_features=n_features,
        #  multiply_by=args.multiply_by
    ).to(args.device)
    loss_fn = nn.MSELoss().to(args.device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.train_lr,
                           weight_decay=0)
    print("Loading network finished ...\n")

    ### prepare the logger
    logger = MetricLogger(args.save_dir, args.valid_log_interval)

    best_epoch = 0
    best_rmse = np.inf
    ### declare the loss information
    print("Start training ...")
    for epoch_idx in range(1, args.train_epochs + 1):
        print('Epoch', epoch_idx)

        train_loss = train_epoch(model, loss_fn, optimizer, args.arr_lambda,
                                 train_loader, args.device,
                                 args.train_log_interval)
        test_rmse = evaluate(model, test_loader, args.device)
        eval_info = {
            'epoch': epoch_idx,
            'train_loss': train_loss,
            'test_rmse': test_rmse,
        }
        # Pass the values explicitly instead of relying on dict value order.
        print('=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(
            epoch_idx, train_loss, test_rmse))

        if epoch_idx % args.train_lr_decay_step == 0:
            for param in optimizer.param_groups:
                param['lr'] = args.train_lr_decay_factor * param['lr']

        logger.log(eval_info, model, optimizer)
        if best_rmse > test_rmse:
            best_rmse = test_rmse
            best_epoch = epoch_idx
    summary = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(
        best_rmse, best_epoch)
    print(summary)
    with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f:
        # The file is opened in append mode: terminate the line so that
        # whatever is appended next starts on a line of its own.
        f.write(summary + "\n")
示例#10
0
文件: benchmark.py 项目: Menooker/dgl
def train(args):
    """Run one kernel micro-benchmark workload on the MovieLens training graph.

    Despite its name this function does not train a model. It loads the
    dataset, builds the workload selected by ``args.workload`` and then,
    depending on ``args.mode``:

    * ``"save"``: runs the workload once and saves its result to
      ``<workload>.mxnd``;
    * ``"compare"``: runs the workload once and prints every element whose
      relative difference from the saved result exceeds 1e-3;
    * anything else: runs 3 untimed calls, then the workload's configured
      number of timed calls, and prints the elapsed wall-clock seconds.

    Args:
        args: Run configuration object; the attributes read are visible in
            the body (dataset options, ``gcn_agg_units``, ``tvm``,
            ``workload``, ``mode``).
    """
    #print(args.ctx)
    #print(args)
    dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")
    graph=dataset.train_enc_graph
    # Per-rating message width: the aggregation units split across rating values.
    msg_in_unit = args.gcn_agg_units // len(dataset.possible_rating_values)
    movie_in_unit = dataset.movie_feature_shape[1]
    users_in_unit = dataset.user_feature_shape[1]
    print((movie_in_unit, users_in_unit, msg_in_unit))
    num_u = graph.number_of_nodes('user')
    
    # Toggle on dgl.kernel; presumably selects TVM-generated kernels in this
    # DGL build -- TODO confirm against the dgl.kernel module in use.
    dgl.kernel.tvm_enabled = args.tvm != 0 
    print("TVM enabled?", dgl.kernel.tvm_enabled)
    def get_copy_reduce():
        """Build the forward copy-reduce workload and return it as a callable."""
        # One random feature matrix per rating value, created once and reused
        # by every call of the returned workload.
        allfeat = [mx.ndarray.random.randn(movie_in_unit, msg_in_unit,dtype=np.float32) for i in dataset.possible_rating_values]
        def do_copy_reduce():
            """Copy movie features along each 'rev-<rating>' edge type, sum them per user, stack across types."""
            funcs=OrderedDict()
            for i, rating in enumerate(dataset.possible_rating_values):
                rating = str(rating)
                graph.nodes['movie'].data['h%d' % i] = allfeat[i]
                #funcs[rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
                funcs['rev-%s' % rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
                # message passing
            graph.multi_update_all(funcs, "stack")
            #graph.nodes['user'].data.pop('h')
            # Remove the result from the graph and flatten it to one row per user.
            return graph.nodes['user'].data.pop('h').reshape(num_u, -1)
        return do_copy_reduce
        #mx.nd.save()
    #time.sleep(20)
    #(943, 75) (1682, 75) (72000,) (72000,) (943, 75)
    def get_copy_reduce_bwd():
        """Build the backward copy-reduce ("sum") workload for the 'rev-1' edge type."""
        x_mx = mx.ndarray.random.randn(movie_in_unit, msg_in_unit,dtype=np.float32)
        X = zerocopy_to_dgl_ndarray(x_mx)
        # Function-local import: `dgl` becomes a local of this factory and is
        # what the nested workload below closes over.
        import dgl
        # NOTE(review): rows are sized by the feature widths (users_in_unit /
        # movie_in_unit) rather than by node counts -- confirm this matches
        # the graph when one-hot features are not used.
        out = dgl.ndarray.empty((users_in_unit, msg_in_unit))
        grad_out = zerocopy_to_dgl_ndarray(mx.ndarray.random.randn(users_in_unit, msg_in_unit,dtype=np.float32))
        outgrad_x=mx.ndarray.zeros((movie_in_unit, msg_in_unit),dtype=np.float32)
        grad_x = zerocopy_to_dgl_ndarray_for_write(outgrad_x)
        etid = graph.get_etype_id('rev-1')
        stid, dtid = getattr(graph,"_graph").metagraph.find_edge(etid)
        gidx=AdaptedHeteroGraph(graph, stid, dtid, etid).get_immutable_gidx(MyContext())
        print(grad_out.shape)
        def do_copy_reduce_bwd():
            """Run the backward kernel and return the MXNet array backing its gradient output."""
            dgl.kernel.backward_copy_reduce("sum", gidx, 0, X, out, grad_out, grad_x)
            return outgrad_x
        return do_copy_reduce_bwd

    def get_binary_op_dot_bwd(islhs):
        """Build the backward workload of the "dot" binary op.

        Args:
            islhs: When true the workload computes the gradient of the
                left-hand operand, otherwise that of the right-hand operand.
        """
        # NOTE(review): 75 and 45450 are hard-coded sizes; presumably the
        # feature width and an edge count of one particular dataset
        # configuration -- confirm before running on other data.
        dot_lhs = mx.ndarray.random.randn(users_in_unit, 75,dtype=np.float32)
        dot_rhs = mx.ndarray.random.randn(movie_in_unit, 75,dtype=np.float32)
        dot_out = mx.ndarray.random.randn(45450,dtype=np.float32)
        dot_outgrad = mx.ndarray.random.randn(45450,dtype=np.float32)
        dot_lhsgrad = mx.ndarray.zeros((users_in_unit, 75),dtype=np.float32)
        dot_rhsgrad = mx.ndarray.zeros((movie_in_unit, 75),dtype=np.float32)
        # Function-local import, closed over by the nested workload below.
        import dgl
        A = zerocopy_to_dgl_ndarray(dot_lhs)
        B = zerocopy_to_dgl_ndarray(dot_rhs)
        out = zerocopy_to_dgl_ndarray(dot_out)
        grad_out = zerocopy_to_dgl_ndarray(dot_outgrad)

        G = graph.local_var()
        # Edge type id 0 is used directly here instead of a lookup by name.
        etid = 0
        stid, dtid = getattr(G,"_graph").metagraph.find_edge(etid)
        gidx=AdaptedHeteroGraph(graph, stid, dtid, etid).get_immutable_gidx(MyContext())
        def do_binary_op_dot_bwd():
            """Run the lhs or rhs backward kernel and return the MXNet array backing its gradient output."""
            if islhs:
                grad_A = zerocopy_to_dgl_ndarray_for_write(dot_lhsgrad)
                dgl.kernel.backward_lhs_binary_op_reduce("none", "dot", gidx, 0, 1, A, B, out, grad_out, grad_A)
                return dot_lhsgrad
            else:
                grad_B = zerocopy_to_dgl_ndarray_for_write(dot_rhsgrad)
                dgl.kernel.backward_rhs_binary_op_reduce("none", "dot", gidx, 0, 1, A, B, out, grad_out, grad_B)
                return dot_rhsgrad
        return do_binary_op_dot_bwd
                

    # Workload name -> (factory returning the callable, number of timed calls).
    workloads = {
        'copyreduce': (get_copy_reduce, 100),
        'copyreduce_bwd': (get_copy_reduce_bwd, 10000),
        'binary_dot_bwd_lhs': (lambda:get_binary_op_dot_bwd(True),10000),
        'binary_dot_bwd_rhs': (lambda:get_binary_op_dot_bwd(False), 10000),
    }
    workload = workloads[args.workload][0]
    times = workloads[args.workload][1]
    if args.mode == "save":
        # Build the workload, run it once and store the result as the reference.
        ret=workload()()
        mx.nd.save(args.workload + ".mxnd",ret)
    elif args.mode == "compare":
        r = workload()()
        loaded = mx.nd.load(args.workload + ".mxnd")[0]
        print(loaded)
        print(r.shape, loaded.shape)
        #print(mx.test_utils.almost_equal(r.asnumpy(),loaded.asnumpy()))
        # Element-by-element comparison; only mismatching entries are printed.
        for idx, row in enumerate(r):
            lrow = loaded[idx]
            for j in range(len(row)):
                # NOTE: `r` is rebound to a scalar from here on; the outer
                # loop is unaffected because enumerate() was created from the
                # original array before the rebinding.
                r=row[j].asscalar()
                l=lrow[j].asscalar()
                # Difference relative to the sum of both values; the small
                # constant keeps the denominator from being exactly zero
                # when both values are zero.
                if abs((r-l)/ (r+l + 1e-11)) > 1e-3:
                    print(idx, j, r, l)
    else:
        workload = workload()
        # Three untimed calls before the measurement starts.
        for i in range(3):
            workload()
        # NOTE(review): the clock is read without an explicit synchronization
        # call -- confirm the workloads have completed when they return.
        t0 = time.time()
        for i in range(times):
            workload()
        print(time.time()-t0)
        print("DONE")
示例#11
0
            sample_ratio=self.sample_ratio,
            max_nodes_per_hop=self.max_nodes_per_hop)

        return subgraph, g_label


def collate_movielens(data):
    """Collate ``(graph, label)`` samples into one batched graph and one label tensor.

    Args:
        data: Sequence of two-element samples, graph first and label second.

    Returns:
        A tuple ``(g, g_label)``: the graphs merged by ``dgl.batch_hetero``
        and the labels combined by ``th.stack``.

    Raises:
        ValueError: If ``data`` is empty or its samples do not have exactly
            two elements (the unpacking below fails).
    """
    # Transpose the list of pairs into a tuple of graphs and a tuple of labels.
    graphs, labels = zip(*data)
    batched_graph = dgl.batch_hetero(list(graphs))
    stacked_labels = th.stack(list(labels))
    return batched_graph, stacked_labels


if __name__ == "__main__":
    from data import MovieLens
    movielens = MovieLens("ml-100k", testing=True)

    train_dataset = MovieLensDataset(movielens.train_rating_pairs,
                                     movielens.train_rating_values,
                                     movielens.train_graph,
                                     hop=1,
                                     sample_ratio=1.0,
                                     max_nodes_per_hop=200)

    train_loader = th.utils.data.DataLoader(train_dataset,
                                            batch_size=4,
                                            shuffle=True,
                                            num_workers=0,
                                            collate_fn=collate_movielens)
    batch = next(iter(train_loader))
    inputs = batch[0].to(th.device('cuda:0'))