Example #1 (score: 0)
def train(args):
    """Train a GCMC-style rating-prediction model (PyTorch backend).

    Runs full-graph training for ``args.train_max_iter - 1`` iterations.
    Every ``args.train_log_interval`` iterations the running train loss and
    error are logged; every ``args.train_valid_interval`` iterations the
    model is evaluated on the validation split, the learning rate is decayed
    on plateau, and training stops early once the LR has bottomed out and
    patience is exhausted.  Metrics are appended to CSV files under
    ``args.save_dir``.

    NOTE(review): the quantity logged as "rmse" is the *mean squared* error
    (sum of squared errors / sample count, no square root) — kept as-is to
    preserve the logged values; confirm the intended metric.
    """
    print(args)
    dataset = DataSetLoader(args.data_name, args.device,
                use_one_hot_fea=args.use_one_hot_fea,
                symm=args.gcn_agg_norm_symm,
                test_ratio=args.data_test_ratio,
                valid_ratio=args.data_valid_ratio)

    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the loggers
    train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    # FIX: initialize so the final summary print cannot raise NameError when
    # training ends before the first validation interval.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0

    # Move graphs to the training device once, up front.  The validation
    # encoder graph is shared with the training encoder graph.
    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    print("Start training ...")
    dur = []  # per-iteration wall-clock times (first 3 iterations skipped as warm-up)
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
                           dataset.user_feature, dataset.movie_feature)
        loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        # Expected rating under the predicted class distribution
        # (soft classification -> real-valued rating).
        real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                             nd_possible_rating_values.view(1, -1)).sum(dim=1)
        rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        # FIX: build the status line fresh every iteration so the validation
        # branch below can never reference an unbound / stale variable when
        # train_valid_interval is not a multiple of train_log_interval.
        logging_str = ""
        if iter_idx % args.train_log_interval == 0:
            # FIX: guard against an empty `dur` (log interval <= warm-up).
            avg_time = np.average(dur) if dur else 0.0
            # FIX: use the same denominator (iter_idx) for the CSV logger and
            # the console string; previously the logger divided by iter_idx+1.
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / iter_idx, rmse=count_rmse / count_num)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss / iter_idx, count_rmse / count_num, avg_time)
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                # Only evaluate on the test split when validation improves.
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience \
                        and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if logging_str:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Example #2 (score: 0)
def train(args):
    """Train a GCMC-style rating-prediction model (MXNet/Gluon backend).

    Full-graph training for ``args.train_max_iter - 1`` iterations with
    periodic logging, validation-driven LR decay and early stopping.
    Metrics are appended to CSV files under ``args.save_dir``.

    NOTE(review): the value logged as "rmse" is sum-of-squared-errors /
    sample count (an MSE, no square root) — confirm the intended metric.
    NOTE(review): the CSV logger divides the running loss by
    ``iter_idx + 1`` while the console string divides by ``iter_idx`` —
    the two disagree; confirm which is intended.
    NOTE(review): ``best_test_rmse`` and ``logging_str`` are only bound
    inside interval-gated branches; with unusual interval settings
    (e.g. ``train_max_iter < train_valid_interval``) the final print can
    raise NameError.  ``np.average(dur)`` also warns on an empty list when
    ``train_log_interval <= 3``.
    """
    print(args)
    dataset = MovieLens(
        args.data_name,
        args.ctx,
        use_one_hot_fea=args.use_one_hot_fea,
        symm=args.gcn_agg_norm_symm,
        test_ratio=args.data_test_ratio,
        valid_ratio=args.data_valid_ratio,
    )
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type="in"), ctx=args.ctx)
    net.hybridize()
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                            ctx=args.ctx,
                                            dtype=np.float32)
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {"learning_rate": args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ["iter", "loss", "rmse"],
        ["%d", "%.4f", "%.4f"],
        os.path.join(args.save_dir, "train_loss%d.csv" % args.save_id),
    )
    valid_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "valid_loss%d.csv" % args.save_id),
    )
    test_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "test_loss%d.csv" % args.save_id),
    )

    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0       # validation intervals since last improvement
    best_iter = -1
    avg_gnorm = 0             # accumulated gradient norm, reset at each log
    count_rmse = 0            # accumulated squared error, reset at each log
    count_num = 0             # accumulated sample count, reset at each log
    count_loss = 0            # running loss total over the whole run (never reset)

    print("Start training ...")
    dur = []  # per-iteration wall-clock times (first 3 iterations are warm-up)
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        # Record the forward pass (and the backward, which is harmless but
        # unconventional inside the record scope) for autograd.
        with mx.autograd.record():
            pred_ratings = net(
                dataset.train_enc_graph,
                dataset.train_dec_graph,
                dataset.user_feature,
                dataset.movie_feature,
            )
            loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
            loss.backward()

        count_loss += loss.asscalar()
        # Global-norm gradient clipping before the optimizer step.
        gnorm = params_clip_global_norm(net.collect_params(),
                                        args.train_grad_clip, args.ctx)
        avg_gnorm += gnorm
        trainer.step(1.0)
        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(
                gluon_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, "net%d.txt" % args.save_id)))

        # Expected rating under the predicted class distribution
        # (soft classification -> real-valued rating).
        real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                             nd_possible_rating_values.reshape(
                                 (1, -1))).sum(axis=1)
        rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
        count_rmse += rmse.asscalar()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / (iter_idx + 1),
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx,
                avg_gnorm / args.train_log_interval,
                count_loss / iter_idx,
                count_rmse / count_num,
                np.average(dur),
            )
            avg_gnorm = 0
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment="valid")
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ",\tVal RMSE={:.4f}".format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                # New best on validation: checkpoint and evaluate on test.
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                net.save_parameters(filename=os.path.join(
                    args.save_dir, "best_valid_net{}.params".format(
                        args.save_id)))
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment="test")
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ", Test RMSE={:.4f}".format(test_rmse)
            else:
                no_better_valid += 1
                # Stop only after patience is exhausted AND the LR has
                # already been decayed down to its floor.
                if (no_better_valid > args.train_early_stopping_patience
                        and trainer.learning_rate <= args.train_min_lr):
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(
                        trainer.learning_rate * args.train_lr_decay_factor,
                        args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        # Decaying the LR resets the patience counter.
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print("Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}".
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Example #3 (score: 0)
def train(args):
    """Train a graph rating model (MXNet), with a classification/regression switch.

    When ``args.gen_r_use_classification`` is set, ratings are predicted as a
    softmax over the discrete rating values; otherwise a standardized L2
    regression on the raw rating values is used.  Training is full-graph,
    with validation-driven LR decay and early stopping, and CSV metric logs
    under ``args.save_dir``.

    NOTE(review): the value logged as "rmse" is sum-of-squared-errors /
    sample count (an MSE, no square root) — confirm the intended metric.
    NOTE(review): ``train_gt_label`` is recomputed from the same arrays on
    every iteration (loop-invariant) — it could be hoisted above the loop.
    NOTE(review): ``best_test_rmse``/``logging_str`` are only bound inside
    interval-gated branches and could be unbound for unusual interval
    settings.
    """
    dataset = MovieLens(args.data_name,
                        args.ctx,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm)
    print("Loading data finished ...\n")

    args.src_key = dataset.name_user
    args.dst_key = dataset.name_movie
    args.src_in_units = dataset.user_feature.shape[1]
    args.dst_in_units = dataset.movie_feature.shape[1]
    args.nratings = dataset.possible_rating_values.size

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    if args.gen_r_use_classification:
        # Discrete rating values used to turn class probabilities into an
        # expected (real-valued) rating.
        nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                                ctx=args.ctx,
                                                dtype=np.float32)
        rating_loss_net = gluon.loss.SoftmaxCELoss()
    else:
        # Regression mode: standardize targets with the train-split moments.
        rating_mean = dataset.train_rating_values.mean()
        rating_std = dataset.train_rating_values.std()
        rating_loss_net = gluon.loss.L2Loss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_rating_pairs = mx.nd.array(dataset.train_rating_pairs,
                                     ctx=args.ctx,
                                     dtype=np.int64)
    train_gt_ratings = mx.nd.array(dataset.train_rating_values,
                                   ctx=args.ctx,
                                   dtype=np.float32)

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(
                                         args.save_dir,
                                         'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(
                                        args.save_dir,
                                        'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0       # validation intervals since last improvement
    best_iter = -1
    avg_gnorm = 0             # accumulated gradient norm, reset at each log
    count_rmse = 0            # accumulated squared error, reset at each log
    count_num = 0             # accumulated sample count, reset at each log
    count_loss = 0            # running loss total over the whole run (never reset)

    print("Start training ...")
    for iter_idx in range(1, args.train_max_iter):
        if args.gen_r_use_classification:
            # Map each rating value to its class index among the sorted
            # possible rating values.
            train_gt_label = mx.nd.array(np.searchsorted(
                dataset.possible_rating_values, dataset.train_rating_values),
                                         ctx=args.ctx,
                                         dtype=np.int32)
        with mx.autograd.record():
            pred_ratings = net(dataset.train_graph, train_rating_pairs)
            if args.gen_r_use_classification:
                loss = rating_loss_net(pred_ratings, train_gt_label).mean()
            else:
                # Regress the standardized rating.
                loss = rating_loss_net(
                    mx.nd.reshape(pred_ratings, shape=(-1, )),
                    (train_gt_ratings - rating_mean) / rating_std).mean()
            #loss.wait_to_read()
            loss.backward()

        count_loss += loss.asscalar()
        # Global-norm gradient clipping before the optimizer step.
        gnorm = params_clip_global_norm(net.collect_params(),
                                        args.train_grad_clip, args.ctx)
        avg_gnorm += gnorm
        trainer.step(1.0)  #, ignore_stale_grad=True)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(
                gluon_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))

        if args.gen_r_use_classification:
            # Expected rating under the predicted class distribution.
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                 nd_possible_rating_values.reshape(
                                     (1, -1))).sum(axis=1)
            rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
        else:
            # Un-standardize the regression output before computing the error.
            rmse = mx.nd.square(
                pred_ratings.reshape((-1, )) * rating_std + rating_mean -
                train_gt_ratings).sum()
        count_rmse += rmse.asscalar()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / (iter_idx + 1),
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(
                iter_idx, avg_gnorm / args.train_log_interval,
                count_loss / iter_idx, count_rmse / count_num)
            avg_gnorm = 0
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                # New best on validation: evaluate on the test split.
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                # Stop only after patience is exhausted AND the LR is at its floor.
                if no_better_valid > args.train_early_stopping_patience\
                    and trainer.learning_rate <= args.train_min_lr:
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(
                        trainer.learning_rate * args.train_lr_decay_factor,
                        args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        # Decaying the LR resets the patience counter.
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Example #4 (score: 0)
def train(args):
    """Train a GCMC-style rating model (MXNet) with minibatch subgraph sampling.

    Each outer iteration shuffles all decoder edges and walks them in
    minibatches of ``args.minibatch_size``.  For every minibatch it builds
    per-rating-type encoder edge-subgraphs around the batch's head (user)
    and tail (movie) nodes, trains on that batch, and logs loss/RMSE every
    100 batches.  Validation/test evaluation, LR decay and early stopping
    run at the outer-iteration level.

    NOTE(review): ``g_user_fea``/``g_movie_fea`` are float arrays
    (``mx.nd.zeros`` default dtype) that are overwritten with int32
    ``arange`` values and then used as position lookups — presumably a
    global-ID -> local-subgraph-index map; verify the float round-trip is
    safe for large node counts.
    NOTE(review): ``logging_str``/``best_test_rmse`` are only bound inside
    interval-gated branches and could be unbound for unusual interval
    settings; ``np.average(dur)`` warns on an empty list when
    ``train_log_interval <= 3``.
    """
    print(args)
    dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'idx', 'loss', 'rmse'], ['%d', '%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0       # validation intervals since last improvement
    best_iter = -1

    enc_graph = dataset.train_enc_graph
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32)
    # Scratch lookup tables: global node id -> local index within the
    # current minibatch subgraph (see NOTE(review) in the docstring).
    g_user_fea = mx.nd.zeros((dataset.num_user,))
    g_movie_fea = mx.nd.zeros((dataset.num_movie,))
    train_truths = dataset.train_truths
    train_labels = dataset.train_labels

    print("Start training ...")
    dur = []  # per-outer-iteration wall-clock times (first 3 iterations are warm-up)

    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()

        # Shuffle all decoder edges; one outer iteration sees every edge once.
        num_edges = dataset.train_truths.shape[0]
        seed = mx.nd.arange(num_edges, dtype='int64')
        edges = mx.nd.shuffle(seed)
        # each iteration will go through all edges
        for sample_idx in range(0, (num_edges + args.minibatch_size - 1) // args.minibatch_size):
            # Slice the next minibatch of edge ids (clipped at num_edges).
            edge_ids = edges[sample_idx * args.minibatch_size: (sample_idx + 1) * args.minibatch_size if (sample_idx + 1) * args.minibatch_size < num_edges else num_edges]
            head_ids, tail_ids = dataset.train_dec_graph.find_edges(edge_ids.asnumpy())

            # Collect, per rating relation, the encoder in-edges that reach
            # the batch's head/tail nodes; etypes come in (forward, reverse)
            # pairs, hence the i*2 / i*2+1 indexing.
            head_subgraphs = {}
            tail_subgraphs = {}
            head_node_ids = np.unique(head_ids.asnumpy())
            tail_node_ids = np.unique(tail_ids.asnumpy())
            for i, _ in enumerate(args.rating_vals):
                t = enc_graph.canonical_etypes[i * 2]
                rev_t = enc_graph.canonical_etypes[i * 2 + 1]

                head_in_edges = enc_graph.in_edges(head_node_ids, 'eid', etype=rev_t)
                tail_in_edges = enc_graph.in_edges(tail_node_ids, 'eid', etype=t)

                if head_in_edges.shape[0] > 0:
                    head_subgraphs[rev_t] = head_in_edges

                if tail_in_edges.shape[0] > 0:
                    tail_subgraphs[t] = tail_in_edges

            # preserve_nodes keeps the original node frame so dgl.NID maps
            # subgraph nodes back to global ids.
            head_subgraph = enc_graph.edge_subgraph(head_subgraphs, preserve_nodes=True)
            tail_subgraph = enc_graph.edge_subgraph(tail_subgraphs, preserve_nodes=True)
            edge_ids = edge_ids.as_in_context(args.ctx)
            true_relation_ratings = train_truths[edge_ids]
            true_relation_labels = train_labels[edge_ids]

            head_NID = head_subgraph.nodes['user'].data[dgl.NID]
            tail_NID = tail_subgraph.nodes['movie'].data[dgl.NID]

            # Write local indices into the global lookup tables, then read
            # back the local index of each batch head/tail node.
            g_user_fea[head_NID] = mx.nd.arange(head_NID.shape[0], dtype='int32')
            g_movie_fea[tail_NID] = mx.nd.arange(tail_NID.shape[0], dtype='int32')

            true_head_idx = g_user_fea[head_ids].as_in_context(args.ctx)
            true_tail_idx = g_movie_fea[tail_ids].as_in_context(args.ctx)

            with mx.autograd.record():
                pred_ratings = net(head_subgraph, tail_subgraph,
                                   true_head_idx, true_tail_idx)
                loss = rating_loss_net(pred_ratings, true_relation_labels).mean()
                loss.backward()
            gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx)
            # Stale grads are expected: only sampled relations get gradients.
            trainer.step(1.0, ignore_stale_grad=True)
            # Expected rating under the predicted class distribution.
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                             nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
            rmse = mx.nd.square(real_pred_ratings - true_relation_ratings).mean().asscalar()
            rmse = np.sqrt(rmse)
            loss = loss.asscalar()
            if sample_idx % 100 == 0:
                train_loss_logger.log(iter=iter_idx, idx=sample_idx,
                                  loss=loss, rmse=rmse)
                print("Iter={}, sample_idx={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(iter_idx,
                    sample_idx, gnorm, loss, rmse))
            # Subgraph construction churns a lot of temporaries per batch.
            gc.collect()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(gluon_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        if iter_idx % args.train_log_interval == 0:
           logging_str = "Iter={}, time={:.4f}".format(
                iter_idx, np.average(dur))

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            valid_loss_logger.log(iter = iter_idx, rmse = valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                # New best on validation: evaluate on the test split.
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                # Stop only after patience is exhausted AND the LR is at its floor.
                if no_better_valid > args.train_early_stopping_patience\
                    and trainer.learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        # Decaying the LR resets the patience counter.
                        no_better_valid = 0
        if iter_idx  % args.train_log_interval == 0:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Example #5 (score: 0)
def train_eval(args):
    """Train and evaluate a KG-enhanced recommender (PyTorch backend).

    Each epoch has two phases: (1) TransR training on knowledge-graph
    triples (positive vs. negative tails), and (2) GNN training on
    collaborative-filtering user/item pairs, optionally with attention
    weights recomputed on the training graph.  Every
    ``args.evaluate_every`` epochs the model is evaluated (recall/NDCG) on
    the validation split, and on the test split as well; metrics go to CSV
    loggers under ``args.save_dir``.

    NOTE(review): ``iter`` and ``eval`` shadow Python builtins here
    (``eval`` is presumably a project-defined evaluation helper — verify).
    NOTE(review): the final summary references ``test_recall``/``test_ndcg``,
    which are only bound when validation recall improves at least once;
    otherwise the last line raises NameError.  ``time`` is called as a
    function, so the module presumably does ``from time import time``.
    """
    logging_config(folder=args.save_dir,
                   name='log{:d}'.format(args.save_id),
                   no_console=False)
    logging.info(args)

    ### check context
    use_cuda = args.gpu >= 0 and th.cuda.is_available()
    if use_cuda:
        th.cuda.set_device(args.gpu)

    ### load data
    dataset = DataLoader(data_name=args.data_name, seed=args.seed)
    print(dataset)
    model = Model(use_KG=True,
                  input_node_dim=args.entity_embed_dim,
                  gnn_model=args.gnn_model,
                  num_gnn_layers=args.gnn_num_layer,
                  n_hidden=args.gnn_hidden_size,
                  dropout=args.dropout_rate,
                  n_entities=dataset.n_KG_entity,
                  n_relations=dataset.n_KG_relation,
                  relation_dim=args.relation_embed_dim,
                  reg_lambda_kg=args.regs,
                  reg_lambda_gnn=args.regs)
    if use_cuda:
        model.cuda()
    logging.info(model)
    ### optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    valid_metric_logger = MetricLogger(
        ['epoch', 'recall', 'ndcg', 'is_best'], ['%d', '%.5f', '%.5f', '%d'],
        os.path.join(args.save_dir, 'valid{:d}.csv'.format(args.save_id)))
    test_metric_logger = MetricLogger(
        ['epoch', 'recall', 'ndcg'], ['%d', '%.5f', '%.5f'],
        os.path.join(args.save_dir, 'test{:d}.csv'.format(args.save_id)))
    best_epoch = -1
    best_recall = 0.0

    # Move the training graph's node ids / edge types to the device once.
    train_g = dataset.train_g
    nid_th = th.LongTensor(train_g.ndata["id"])
    etype_th = th.LongTensor(train_g.edata["type"])
    if use_cuda:
        nid_th, etype_th = nid_th.cuda(), etype_th.cuda()
    train_g.ndata['id'] = nid_th
    train_g.edata['type'] = etype_th

    # Same for the test graph.
    test_g = dataset.test_g
    nid_th = th.LongTensor(test_g.ndata["id"])
    etype_th = th.LongTensor(test_g.edata["type"])
    if use_cuda:
        nid_th, etype_th = nid_th.cuda(), etype_th.cuda()
    test_g.ndata['id'] = nid_th
    test_g.edata['type'] = etype_th

    item_id_range = th.LongTensor(dataset.item_id_range).cuda() if use_cuda \
        else th.LongTensor(dataset.item_id_range)

    for epoch in range(1, args.max_epoch + 1):
        ### train kg (phase 1: TransR embedding loss on KG triples)
        time1 = time()
        kg_sampler = dataset.KG_sampler(batch_size=args.batch_size_kg)
        iter = 0
        total_loss = 0.0
        for h, r, pos_t, neg_t, _ in kg_sampler:
            iter += 1
            model.train()
            h_th = th.LongTensor(h)
            r_th = th.LongTensor(r)
            pos_t_th = th.LongTensor(pos_t)
            neg_t_th = th.LongTensor(neg_t)
            if use_cuda:
                h_th, r_th, pos_t_th, neg_t_th = h_th.cuda(), r_th.cuda(
                ), pos_t_th.cuda(), neg_t_th.cuda()
            loss = model.transR(h_th, r_th, pos_t_th, neg_t_th)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if (iter % args.print_every) == 0 or iter == 1:
                logging.info("Epoch {:04d} Iter {:04d} | Loss {:.4f} ".format(
                    epoch, iter, total_loss / iter))
        logging.info('Time for KGE: {:.1f}s, loss {:.4f}'.format(
            time() - time1, total_loss / iter))

        ### train GNN (phase 2: BPR-style CF loss on user/item pairs)
        if args.use_attention:
            # Recompute edge attention weights with the current embeddings;
            # no gradients are needed for this pass.
            time1 = time()
            print("Compute attention weight in train ...")
            with th.no_grad():
                A_w = model.compute_attention(train_g)
            train_g.edata['w'] = A_w
            print("Time: {:.2f}s".format(time() - time1))
        time1 = time()
        cf_sampler = dataset.CF_pair_sampler(batch_size=args.batch_size)
        iter = 0
        total_loss = 0.0
        for user_ids, item_pos_ids, item_neg_ids, _ in cf_sampler:
            iter += 1
            model.train()
            user_ids_th = th.LongTensor(user_ids)
            item_pos_ids_th = th.LongTensor(item_pos_ids)
            item_neg_ids_th = th.LongTensor(item_neg_ids)
            if use_cuda:
                user_ids_th, item_pos_ids_th, item_neg_ids_th = \
                    user_ids_th.cuda(), item_pos_ids_th.cuda(), item_neg_ids_th.cuda()
            embedding = model.gnn(train_g, train_g.ndata['id'])
            loss = model.get_loss(embedding, user_ids_th, item_pos_ids_th,
                                  item_neg_ids_th)
            loss.backward()
            # th.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm)  # clip gradients
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if (iter % args.print_every) == 0 or iter == 1:
                logging.info("Epoch {:04d} Iter {:04d} | Loss {:.4f} ".format(
                    epoch, iter, total_loss / iter))
        logging.info('Time for GNN: {:.1f}s, loss {:.4f}'.format(
            time() - time1, total_loss / iter))

        if epoch % args.evaluate_every == 0:
            time1 = time()
            val_recall, val_ndcg = eval(model, train_g,
                                        dataset.train_user_dict,
                                        dataset.valid_user_dict, item_id_range,
                                        use_cuda, args.use_attention)

            info = "Epoch{}, [{:.1f}s] val recall:{:.5f}, val ndcg:{:.5f}".format(
                epoch,
                time() - time1, val_recall, val_ndcg)
            # save best model
            if val_recall > best_recall:
                valid_metric_logger.log(epoch=epoch,
                                        recall=val_recall,
                                        ndcg=val_ndcg,
                                        is_best=1)
                best_recall = val_recall
                #best_ndcg = val_ndcg
                best_epoch = epoch
                time1 = time()
                # Test evaluation uses the train+valid interactions as the
                # "seen" set and scores against the test interactions.
                test_recall, test_ndcg = eval(model, test_g,
                                              dataset.train_valid_user_dict,
                                              dataset.test_user_dict,
                                              item_id_range, use_cuda,
                                              args.use_attention)
                test_metric_logger.log(epoch=epoch,
                                       recall=test_recall,
                                       ndcg=test_ndcg)

                info += "\t[{:.1f}s] test recall:{:.5f}, test ndcg:{:.5f}".format(
                    time() - time1, test_recall, test_ndcg)
                #th.save({'state_dict': model.state_dict(), 'epoch': epoch}, model_state_file)
            else:
                valid_metric_logger.log(epoch=epoch,
                                        recall=val_recall,
                                        ndcg=val_ndcg,
                                        is_best=0)
                # Still evaluate on test for visibility, but this result is
                # printed only and not logged to the test CSV.
                recall, ndcg = eval(model, test_g,
                                    dataset.train_valid_user_dict,
                                    dataset.test_user_dict, item_id_range,
                                    use_cuda, args.use_attention)
                print("test recall:{}, test_ndcg: {}".format(recall, ndcg))
            logging.info(info)

    logging.info(
        "Final test recall:{:.5f}, test ndcg:{:.5f}, best epoch:{}".format(
            test_recall, test_ndcg, best_epoch))
Пример #6
0
def train(args):
    """Train a GCMC-style rating model and track validation/test NDCG.

    Loads the dataset and graphs onto ``args.device``, trains for
    ``args.train_max_iter`` iterations with the loss selected by
    ``args.loss_func`` ("CE", "Hinge" or "MSE"), logs train RMSE, and every
    ``args.train_valid_interval`` iterations evaluates validation RMSE and
    NDCG@{20,40,80}. Test evaluation runs only when validation NDCG@20
    improves; LR decay / early stopping are driven by validation stagnation.
    """
    print(args)

    dataset = DataSetLoader(args.data_name,
                            args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio,
                            sample_rate=args.sample_rate)

    print("Loading data finished ...\n")

    # Feature dimensions and the rating vocabulary size the encoder/decoder.
    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    #args.decoder = "MLP"
    net = Net(args=args)

    #print(args)
    net = net.to(args.device)
    # Tensor of the discrete rating values; used to turn the CE softmax over
    # rating classes into a real-valued expected rating.
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### perpare training data
    # train_gt_labels: class indices into possible_rating_values;
    # train_gt_ratings: the actual rating values (used for RMSE).
    train_gt_labels = dataset.train_labels

    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"],
        ['%d', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"],
        ['%d', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_valid_ndcg = -np.inf
    best_test_ndcg = -np.inf  # NOTE(review): stays a scalar -inf if validation never improves; the final print below indexes it and would raise — confirm train_max_iter/valid_interval always allow at least one improvement.
    no_better_valid = 0       # consecutive validations without NDCG@20 improvement
    best_iter = -1
    count_rmse = 0            # running squared-error sum, reset each log interval
    count_num = 0             # running sample count, reset each log interval
    count_loss = 0            # running loss sum, never reset (averaged over all iters)

    # Move all graphs to the target device once, up front. Validation reuses
    # the training encoder graph.
    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)

    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.valid_recall_dec_graph = dataset.valid_recall_dec_graph.to(
        args.device)

    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)
    dataset.test_recall_dec_graph = dataset.test_recall_dec_graph.to(
        args.device)

    print("Start training ...")
    dur = []  # per-iteration wall times; only populated from iter 4 onward (warm-up skipped)
    for iter_idx in range(1, args.train_max_iter):
        '''
        noisy_labels = th.LongTensor(np.random.choice([-1, 0, 1], train_gt_ratings.shape[0], replace=True, p=[0.001, 0.998, 0.001])).to(args.device)

        train_gt_labels += noisy_labels
    
        max_label = dataset.max_l + th.zeros_like(train_gt_labels)
        min_label = dataset.min_l + th.zeros_like(train_gt_labels)
        max_label = max_label.long()
        min_label = min_label.long()
        train_gt_labels = th.where(train_gt_labels > max_label, max_label, train_gt_labels)
        train_gt_labels = th.where(train_gt_labels < min_label, min_label, train_gt_labels)
        '''

        if iter_idx > 3:
            t0 = time.time()
        net.train()
        # NOTE(review): dead code — the if/else result is unconditionally
        # overwritten on the next line, so Two_Stage is always False.
        if iter_idx > 250:
            Two_Stage = True
        else:
            Two_Stage = False
        Two_Stage = False
        # Full-graph forward pass: logits over rating classes plus a
        # regularization term and intermediate embeddings/weights.
        pred_ratings, reg_loss, user_out, movie_out, W = net(
            dataset.train_enc_graph, dataset.train_dec_graph,
            dataset.user_feature, dataset.movie_feature, Two_Stage)
        #print("user_out:\n", user_out[0])
        #print("movie_out:\n", movie_out[0])
        #print("W:\n", W.shape)
        if args.loss_func == "CE":
            # Cross-entropy over rating classes + ARR-weighted regularizer.
            loss = rating_loss_net(
                pred_ratings, train_gt_labels).mean() + args.ARR * reg_loss
            '''
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                nd_possible_rating_values.view(1, -1)).sum(dim=1)
            mse_loss = th.sum((real_pred_ratings - train_gt_ratings) ** 2)
            loss += mse_loss * 0.0001
            '''
        elif args.loss_func == "Hinge":
            # Expected rating under the softmax distribution.
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(
                                     dim=1)
            # NOTE(review): gap compares real-valued predictions against
            # train_gt_labels (class indices), not train_gt_ratings — looks
            # like it should use the true ratings; confirm intent.
            gap = (real_pred_ratings - train_gt_labels)**2
            # Quadratic penalty beyond a squared-gap of 1.0, linear inside.
            hinge_loss = th.where(gap > 1.0, gap * gap, gap).mean()
            loss = hinge_loss
        elif args.loss_func == "MSE":
            '''
            seeds = th.arange(pred_ratings.shape[0])
            random.shuffle(seeds)
            for i in range((pred_ratings.shape[0] - 1) // 50 + 1):
                start = i * 50
                end = (i + 1) * 50
                if end > (pred_ratings.shape[0] - 1):
                    end = pred_ratings.shape[0] - 1
                batch = seeds[start:end]
                loss = F.mse_loss(pred_ratings[batch, 0], nd_possible_rating_values[train_gt_labels[batch]]) + args.ARR * reg_loss
                count_loss += loss.item() * 50 / pred_ratings.shape[0]
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                #nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
                optimizer.step()
                pred_ratings, reg_loss = net(dataset.train_enc_graph, dataset.train_dec_graph,
                                   dataset.user_feature, dataset.movie_feature)
            '''
            # Direct regression on column 0 of the predictions against the
            # rating value each label index maps to.
            loss = th.mean((pred_ratings[:, 0] -
                            nd_possible_rating_values[train_gt_labels])**
                           2) + args.ARR * reg_loss
        count_loss += loss.item()
        optimizer.zero_grad()
        # NOTE(review): retain_graph=True looks unnecessary — the graph is
        # rebuilt by the forward pass every iteration; it only inflates memory.
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(
                torch_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))

        # Convert logits to a real-valued rating for RMSE tracking. For
        # "Hinge" neither branch fires; real_pred_ratings from the loss
        # computation above is reused.
        if args.loss_func == "CE":
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(
                                     dim=1)
        elif args.loss_func == "MSE":
            real_pred_ratings = pred_ratings[:, 0]
        rmse = ((real_pred_ratings - train_gt_ratings)**2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # NOTE(review): inconsistent averaging — the CSV divides by
            # (iter_idx + 1) while the console string divides by iter_idx.
            # Since count_loss has accumulated exactly iter_idx terms,
            # iter_idx is the correct divisor.
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / (iter_idx + 1),
                                  rmse=count_rmse / count_num)
            # NOTE(review): np.average(dur) is NaN/undefined if this fires
            # before iter 4 (dur still empty) — confirm train_log_interval > 3.
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss / iter_idx, count_rmse / count_num,
                np.average(dur))
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            # NOTE(review): the `logging_str +=` lines below assume the log
            # branch above ran this same iteration; if train_valid_interval is
            # not a multiple of train_log_interval this raises NameError.
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment='valid')
            ndcg_valid = evaluate_metric(args=args,
                                         net=net,
                                         dataset=dataset,
                                         segment='valid',
                                         debug=False)
            print("ndcg_valid:", ndcg_valid)
            valid_loss_logger.log(iter=iter_idx,
                                  rmse=valid_rmse,
                                  ndcg_20=ndcg_valid[0],
                                  ndcg_40=ndcg_valid[1],
                                  ndcg_80=ndcg_valid[2])
            print("-" * 80)

            #test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
            #test_loss_logger.log(iter=iter_idx, rmse=test_rmse, ndcg_20 = ndcg_k[0], ndcg_40 = ndcg_k[1], ndcg_80 = ndcg_k[2])
            #logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            logging_str += ',\tndcg_valid_20={:.4f}'.format(ndcg_valid[0])
            logging_str += ',\tndcg_valid_40={:.4f}'.format(ndcg_valid[1])
            logging_str += ',\tndcg_valid_80={:.4f}'.format(ndcg_valid[2])

            # Model selection is driven by NDCG@20 on the validation split.
            ndcg_valid_20 = ndcg_valid[0]
            #print("***********",ndcg_valid_20)

            if ndcg_valid_20 > best_valid_ndcg:
                best_valid_ndcg = ndcg_valid_20
                print("************best_valid_ndcg:", best_valid_ndcg)
                print("************ndcg_valid_20:", ndcg_valid_20)
                no_better_valid = 0
                best_iter = iter_idx
                # Only evaluate on test when validation improves.
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment='test',
                                     debug=True,
                                     idx=iter_idx)
                ndcg_test = evaluate_metric(args=args,
                                            net=net,
                                            dataset=dataset,
                                            segment='test',
                                            debug=False)
                logging_str += ',\tbest ndcg_test={:.4f}'.format(ndcg_test[0])
                logging_str += ',\tbest ndcg_test={:.4f}'.format(ndcg_test[1])
                logging_str += ',\tbest ndcg_test={:.4f}'.format(ndcg_test[2])
                #best_test_rmse = test_rmse
                best_test_ndcg = ndcg_test
                #test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                test_loss_logger.log(iter=iter_idx,
                                     rmse=test_rmse,
                                     ndcg_20=ndcg_test[0],
                                     ndcg_40=ndcg_test[1],
                                     ndcg_80=ndcg_test[2])
                #logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                # Stop only once patience is exhausted AND the LR has already
                # been decayed down to the floor.
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        # Decaying the LR resets the patience counter.
                        no_better_valid = 0
            #print("************best_valid_ndcg:",best_valid_ndcg)
            #print("************ndcg_valid_20:",ndcg_valid_20)
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print(
        'Best Iter Idx={}, best ndcg_20={:.4f}, best ndcg_40={:.4f}, best ndcg_80={:.4f}'
        .format(best_iter, best_test_ndcg[0], best_test_ndcg[1],
                best_test_ndcg[2]))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Пример #7
0
def train(args):
    """Train a recommender with a BPR-style pairwise ranking objective.

    Loads either the Jukebox or MovieLens dataset, encodes users/items with
    the net's graph encoder, and optimizes a pairwise (positive vs sampled
    negative) loss in chunks of 2**14 triples. Negative samples are drawn
    once on the first iteration and reused for all subsequent iterations.
    Validation RMSE drives LR decay, early stopping, and test evaluation.
    """
    print(args)
    if args.data_name == 'jukebox':
        dataset = JukeboxDataset('dataset/listen_count.txt')
    else:
        dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.MSELoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### perpare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    # NOTE(review): the name and format lists disagree in length for
    # train_loss_logger (2 names, 3 formats) and valid_loss_logger (6 names,
    # 7 formats) — verify MetricLogger tolerates the extra format strings.
    train_loss_logger = MetricLogger(['iter', 'loss'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'ndcg','precision','recall','fscore','support'], ['%d','%.4f', '%.4f','%s','%s','%s','%s'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    # count_rmse / count_num start at 1 (not 0) so the logged rmse ratio is
    # defined even though no RMSE is accumulated in this training loop.
    count_rmse = 1
    count_num = 1
    count_loss = 0   # running loss sum over all minibatch steps
    count_step = 0   # number of minibatch optimizer steps taken

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    def batch(iterable, n=1):
        # Yield successive chunks of size n from iterable; the final chunk
        # may be shorter.
        current_batch = []
        for item in iterable:
            current_batch.append(item)
            if len(current_batch) == n:
                yield current_batch
                current_batch = []
        if current_batch:
            yield current_batch
    # (user_idx, pos_item_idx, neg_item_idx) triples, built once on iter 1.
    batches = []
    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        unique_item_list = dataset.train['item_id'].unique().tolist()

        # Encode all users/items on the full training graph.
        ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                   dataset.user_feature, dataset.movie_feature)
        # NOTE(review): import inside the loop; move to module scope.
        from tqdm import tqdm
        if iter_idx ==1:
            # One negative item sampled per training interaction; rejection
            # sampling against the user's observed items.
            # NOTE(review): the per-row dataframe filter is O(rows) per
            # interaction — quadratic overall; precompute a user->items map.
            for row in tqdm(list(dataset.train.itertuples())):
                user, item, rating = row.user_id, row.item_id, row.rating
                userid = dataset.global_user_id_map[user]
                observed = dataset.train[dataset.train['user_id'] == user]['item_id'].unique().tolist()
                negatives = set()
                while len(negatives) < 1:
                    sample = random.choice(unique_item_list)
                    if sample not in observed:
                        negatives.add(sample)
                        batches.append((userid, dataset.global_item_id_map[item], dataset.global_item_id_map[sample]))

        for bt in tqdm(list(batch(batches, 2**14))):
            uidfeat = ufeat[[e[0] for e in bt]]
            posfeat = ifeat[[e[1] for e in bt]]
            negfeat = ifeat[[e[2] for e in bt]]

            # Bilinear scores through the decoder's Q matrix.
            pos_scores = uidfeat @ net.decoder.Q @ posfeat.T
            neg_scores = uidfeat @ net.decoder.Q @ negfeat.T

            lmbd = 1e-5
            # NOTE(review): mixes a BCE term on positive scores with the BPR
            # log-sigmoid margin term, then negates — confirm this is the
            # intended objective rather than plain BPR.
            mf_loss = -nn.BCELoss()(th.sigmoid(pos_scores), th.ones_like(pos_scores)) + nn.LogSigmoid()(pos_scores - neg_scores).mean()
            mf_loss = -1 * mf_loss

            # L2 regularization on the embeddings in the batch plus the
            # decoder weight (note: Q's norm is not squared, unlike the rest).
            regularizer = (th.norm(uidfeat,dim=1)**2).mean() + (th.norm(posfeat,dim=1)**2).mean() + (th.norm(negfeat,dim=1)**2).mean() + (th.norm(net.decoder.Q))
            emb_loss = lmbd * regularizer
            print('mf_loss', mf_loss)
            print('emb_loss', emb_loss)
            optimizer.zero_grad()
            loss = mf_loss + emb_loss
            count_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()
            # Re-encode after every step so the next chunk scores with
            # up-to-date embeddings (full-graph forward per minibatch).
            ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                       dataset.user_feature, dataset.movie_feature)
            count_step += 1

        print('train done')

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / (count_step + 1))
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss/(count_step + 1), count_rmse/count_num,
                np.average(dur))
            count_rmse = 1
            count_num = 1

        if iter_idx % args.train_valid_interval == 0:
            # NOTE(review): `logging_str +=` below assumes the log branch ran
            # this iteration; NameError otherwise.
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            precision, recall, fscore, support = evaluate_others(args=args, net=net, dataset=dataset, segment='valid')
            ndcg = evaluate_ndcg(args=args, net=net, dataset=dataset, segment='valid')
            print('ndcg', ndcg, 'precision', precision, 'recall', recall, 'fscore', fscore, 'support', support)
            valid_loss_logger.log(iter=iter_idx, ndcg=ndcg, precision=precision, recall=recall, fscore=fscore,
                                  support=support)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if iter_idx  % args.train_log_interval == 0:
            print(logging_str)
    # NOTE(review): best_test_rmse is unbound here if validation never
    # improved (no assignment before the loop) — this print would raise.
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Пример #8
0
def train(proc_id, n_gpus, args, devices, movielens):
    """Per-process training entry point for (optionally distributed) IGMC.

    Each process takes its shard of the training rating pairs, builds a
    subgraph-based ``MovieLensDataset``/DataLoader, and trains an ``IGMC``
    regressor with MSE loss. Only rank 0 builds the evaluation loader, logs
    metrics, and tracks the best test RMSE; NCCL barriers keep ranks in sync
    when ``n_gpus > 1``.
    """
    # Start up distributed training, if enabled.
    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=proc_id)
    th.cuda.set_device(dev_id)
    # set random seed in each gpu
    th.manual_seed(args.seed)
    if th.cuda.is_available():
        th.cuda.manual_seed_all(args.seed)

    # Split train_dataset and set dataloader
    # Each rank takes the proc_id-th contiguous shard of the training edges.
    # NOTE(review): uses args.n_gpus for the split but the local n_gpus
    # elsewhere — confirm they always agree.
    train_rating_pairs = th.split(th.stack(movielens.train_rating_pairs),
                                  len(movielens.train_rating_values) //
                                  args.n_gpus,
                                  dim=1)[proc_id]
    train_rating_values = th.split(movielens.train_rating_values,
                                   len(movielens.train_rating_values) //
                                   args.n_gpus,
                                   dim=0)[proc_id]

    train_dataset = MovieLensDataset(train_rating_pairs, train_rating_values,
                                     movielens.train_graph, args.hop,
                                     args.sample_ratio, args.max_nodes_per_hop)
    train_loader = th.utils.data.DataLoader(train_dataset,
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            num_workers=args.num_workers,
                                            collate_fn=collate_movielens)
    # Evaluation data lives only on rank 0; with --testing it evaluates on
    # the test split, otherwise on validation.
    if proc_id == 0:
        if args.testing:
            test_dataset = MovieLensDataset(movielens.test_rating_pairs,
                                            movielens.test_rating_values,
                                            movielens.train_graph, args.hop,
                                            args.sample_ratio,
                                            args.max_nodes_per_hop)
        else:
            # NOTE(review): valid_rating_pairs is passed twice — the second
            # argument should almost certainly be valid_rating_values
            # (compare the testing branch above).
            test_dataset = MovieLensDataset(movielens.valid_rating_pairs,
                                            movielens.valid_rating_pairs,
                                            movielens.train_graph, args.hop,
                                            args.sample_ratio,
                                            args.max_nodes_per_hop)
        test_loader = th.utils.data.DataLoader(test_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.num_workers,
                                               collate_fn=collate_movielens)

    # Node labels are one-hot per hop for users and items, hence 2*(hop+1)
    # input features.
    model = IGMC(
        in_feats=(args.hop + 1) * 2,
        latent_dim=[32, 32, 32, 32],
        num_relations=5,  #dataset_base.num_rating, 
        num_bases=4,
        regression=True,
        edge_dropout=args.edge_dropout,
        #  side_features=args.use_features,
        #  n_side_features=n_features,
        #  multiply_by=args.multiply_by
    ).to(dev_id)
    if n_gpus > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[dev_id],
                                        output_device=dev_id)
    loss_fn = nn.MSELoss().to(dev_id)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.train_lr,
                           weight_decay=0)

    if proc_id == 0:
        print("Loading network finished ...\n")
        # prepare the logger
        logger = MetricLogger(args.save_dir, args.valid_log_interval)

        best_epoch = 0
        best_rmse = np.inf
        print("Start training ...")

    for epoch_idx in range(1, args.train_epochs + 1):
        if proc_id == 0:
            print('Epoch', epoch_idx)

        train_loss = train_epoch(proc_id, n_gpus, model, loss_fn, optimizer,
                                 args.arr_lambda, train_loader, dev_id,
                                 args.train_log_interval)

        # All ranks finish the epoch before rank 0 evaluates.
        if n_gpus > 1:
            th.distributed.barrier()
        if proc_id == 0:
            test_rmse = evaluate(model, test_loader, dev_id)
            eval_info = {
                'epoch': epoch_idx,
                'train_loss': train_loss,
                'test_rmse': test_rmse,
            }
            print(
                '=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(
                    *eval_info.values()))

            # Step-based LR decay.
            # NOTE(review): only rank 0's optimizer LR is decayed here; the
            # other ranks keep the original LR — confirm this is intended.
            if epoch_idx % args.train_lr_decay_step == 0:
                for param in optimizer.param_groups:
                    param['lr'] = args.train_lr_decay_factor * param['lr']

            logger.log(eval_info, model, optimizer)
            if best_rmse > test_rmse:
                best_rmse = test_rmse
                best_epoch = epoch_idx

    if n_gpus > 1:
        th.distributed.barrier()
    if proc_id == 0:
        eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(
            best_rmse, best_epoch)
        print(eval_info)
        with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f:
            f.write(eval_info)
Пример #9
0
def run(proc_id, n_gpus, args, devices, dataset):
    """Per-process minibatch GCMC training with DGL's EdgeDataLoader.

    Samples edge-induced blocks from the training encoder graph, trains with
    cross-entropy over rating classes, and on rank 0 runs validation every
    ``args.train_valid_interval`` epochs with test evaluation on improvement.
    Supports multi-GPU via NCCL + DistributedDataParallel.
    """
    dev_id = devices[proc_id]
    train_labels = dataset.train_labels
    train_truths = dataset.train_truths
    num_edges = train_truths.shape[0]

    # Map each rating etype to its reverse etype and vice versa.
    reverse_types = {
        to_etype_name(k): 'rev-' + to_etype_name(k)
        for k in dataset.possible_rating_values
    }
    reverse_types.update({v: k for k, v in reverse_types.items()})
    # Single-layer full-neighborhood sampler (fanout None).
    sampler = dgl.dataloading.MultiLayerNeighborSampler([None],
                                                        return_eids=True)
    # Iterate all training edges of every rating type in shuffled minibatches.
    dataloader = dgl.dataloading.EdgeDataLoader(dataset.train_enc_graph, {
        to_etype_name(k): th.arange(
            dataset.train_enc_graph.number_of_edges(etype=to_etype_name(k)))
        for k in dataset.possible_rating_values
    },
                                                sampler,
                                                batch_size=args.minibatch_size,
                                                shuffle=True,
                                                drop_last=False)

    # Evaluation loaders exist only on rank 0; decoder graphs are sampled
    # against the corresponding encoder graph via g_sampling.
    if proc_id == 0:
        valid_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.valid_dec_graph,
            th.arange(dataset.valid_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.valid_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)
        test_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.test_dec_graph,
            th.arange(dataset.test_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.test_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)

    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        # NOTE(review): rank=dev_id — other entry points in this file pass
        # rank=proc_id; these differ whenever devices != [0..n-1]. Verify.
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)
    if n_gpus > 0:
        th.cuda.set_device(dev_id)

    nd_possible_rating_values = \
        th.FloatTensor(dataset.possible_rating_values)
    nd_possible_rating_values = nd_possible_rating_values.to(dev_id)

    start = time.time()
    net = Net(args=args, dev_id=dev_id)
    net = net.to(dev_id)
    if n_gpus > 1:
        net = DistributedDataParallel(net,
                                      device_ids=[dev_id],
                                      output_device=dev_id)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0
    best_epoch = -1
    count_rmse = 0   # running squared-error sum (never reset in this loop)
    count_num = 0    # running sample count
    count_loss = 0   # running loss sum
    print("Start training ...")
    dur = []
    iter_idx = 1     # global step counter across epochs
    logging_str = None

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(
                                         args.save_dir,
                                         'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(
                                        args.save_dir,
                                        'test_loss%d.csv' % args.save_id))

    for epoch in range(1, args.train_max_epoch):
        if epoch == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(
                torch_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))
        if epoch > 1:
            t0 = time.time()
        net.train()
        with tqdm.tqdm(dataloader) as tq:
            for step, (input_nodes, pair_graph, blocks) in enumerate(tq):
                # Gather input features for the sampled block and flatten the
                # per-rating etypes of the minibatch pair graph into one graph
                # carrying 'label'/'rating' edge data.
                head_feat, tail_feat, blocks = load_subtensor(
                    input_nodes, pair_graph, blocks, dataset,
                    dataset.train_enc_graph)
                frontier = blocks[0]
                compact_g = flatten_etypes(pair_graph, dataset,
                                           'train').to(dev_id)
                true_relation_labels = compact_g.edata['label']
                true_relation_ratings = compact_g.edata['rating']

                head_feat = head_feat.to(dev_id)
                tail_feat = tail_feat.to(dev_id)
                frontier = frontier.to(dev_id)

                pred_ratings = net(compact_g, frontier, head_feat, tail_feat,
                                   dataset.possible_rating_values)
                loss = rating_loss_net(pred_ratings,
                                       true_relation_labels.to(dev_id)).mean()
                count_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(net.parameters(),
                                         args.train_grad_clip)
                optimizer.step()

                if proc_id == 0 and iter_idx == 1:
                    print("Total #Param of net: %d" %
                          (torch_total_param_num(net)))

                # Expected rating under the softmax over rating classes.
                real_pred_ratings = (
                    th.softmax(pred_ratings, dim=1) *
                    nd_possible_rating_values.view(1, -1)).sum(dim=1)
                rmse = ((real_pred_ratings -
                         true_relation_ratings.to(dev_id))**2).sum()
                count_rmse += rmse.item()
                count_num += pred_ratings.shape[0]

                if iter_idx % args.train_log_interval == 0:
                    # NOTE(review): loss is averaged over (iter_idx + 1) here
                    # but over iter_idx in the progress bar below — one of the
                    # two divisors is off by one.
                    train_loss_logger.log(iter=iter_idx,
                                          loss=count_loss / (iter_idx + 1),
                                          rmse=count_rmse / count_num)

                tq.set_postfix(
                    {
                        'loss': '{:.4f}'.format(count_loss / iter_idx),
                        'rmse': '{:.4f}'.format(count_rmse / count_num)
                    },
                    refresh=False)

                iter_idx += 1

        if epoch > 1:
            epoch_time = time.time() - t0
            print("Epoch {} time {}".format(epoch, epoch_time))

        if epoch % args.train_valid_interval == 0:
            if n_gpus > 1:
                th.distributed.barrier()
            # Only rank 0 evaluates; other ranks wait at the barrier below.
            if proc_id == 0:
                valid_rmse = evaluate(args=args,
                                      dev_id=dev_id,
                                      net=net,
                                      dataset=dataset,
                                      dataloader=valid_dataloader,
                                      segment='valid')
                valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
                logging_str = 'Val RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_epoch = epoch
                    test_rmse = evaluate(args=args,
                                         dev_id=dev_id,
                                         net=net,
                                         dataset=dataset,
                                         dataloader=test_dataloader,
                                         segment='test')
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    # NOTE(review): this break exits only rank 0's loop; with
                    # n_gpus > 1 the other ranks would hang at the next
                    # barrier — confirm single-GPU use or add a broadcast.
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info(
                            "Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            learning_rate = new_lr
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
                            print("Change the LR to %g" % new_lr)
            # sync on evalution
            if n_gpus > 1:
                th.distributed.barrier()

        if logging_str is not None:
            print(logging_str)
    if proc_id == 0:
        # NOTE(review): best_test_rmse is unbound if validation never
        # improved; this format call would raise.
        print(
            'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
            format(best_epoch, best_valid_rmse, best_test_rmse))

        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()

        # NOTE(review): the print inside the `with` goes to stdout, not the
        # file, and f.write gets no trailing newline; also `:d` requires
        # args.save_id to be an int.
        with open(
                os.path.join(args.save_dir, f'duration_{args.save_id:d}.txt'),
                'a') as f:
            print(f'wall: {time.time() - start}')
            f.write(f'wall: {time.time() - start}')
Пример #10
0
def train(args):
    """Train a GCMC-style rating model with periodic valid/test evaluation.

    Runs full-graph training for ``args.train_max_iter`` iterations, logging
    train/valid/test RMSE to CSV files under ``args.save_dir``, decaying the
    learning rate when validation RMSE stops improving, and early-stopping
    once the patience threshold is reached at the minimum learning rate.

    Relies on project-level helpers (``DataSetLoader``, ``Net``,
    ``get_optimizer``, ``MetricLogger``, ``evaluate``, ``dev_step``) and on
    the hyper-parameters carried by ``args``.
    """
    print(args)
    dataset = DataSetLoader(args.data_name, args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio,
                            sample_rate=args.sample_rate)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the loggers (one CSV per split)
    train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss.csv'))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss.csv'))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss.csv'))

    ### declare the loss information
    best_valid_rmse = np.inf
    # FIX: initialize so the final summary print cannot hit a NameError when
    # validation never improves (e.g. train_max_iter smaller than the
    # validation interval).
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0

    # Move all graphs to the target device once, up front.  Validation shares
    # the training encoder graph.
    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    print("Start training ...")
    dur = []

    # Kept for the (currently disabled) negative-sampling path implemented by
    # the two helpers below.
    train_rating_pairs, train_rating_values = dataset._generate_pair_value(dataset.train_rating_info)

    def update_encode_graph(dataset, train_rating_pairs, train_rating_values, sampled_data):
        """Rebuild the training encoder graph with sampled zero-rating edges added."""
        train_rating_pairs_zeros, train_rating_values_zeros = dataset._generate_pair_value_for_zero(
            dataset.train_rating_info, sampled_data)
        train_rating_pairs = (np.append(train_rating_pairs[0], train_rating_pairs_zeros[0]),
                              np.append(train_rating_pairs[1], train_rating_pairs_zeros[1]))
        train_rating_values = np.append(train_rating_values, train_rating_values_zeros)
        dataset.train_enc_graph = dataset._generate_enc_graph(train_rating_pairs,
                                                              train_rating_values,
                                                              add_support=True)
        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.valid_enc_graph = dataset.train_enc_graph
        return dataset.train_enc_graph

    def sample_data(interact_status, random_number, sample_rate):
        """Draw ``sample_rate`` negative items per user, seeded for reproducibility."""
        random.seed(random_number)
        interact_status['negative_samples'] = interact_status['negative_items'].apply(
            lambda x: random.sample(x, sample_rate))
        return interact_status[['user_id', 'negative_items', 'negative_samples']]

    # NOTE(review): seed_list is unused by the active code path, but the call
    # advances NumPy's global RNG, so removing it would change downstream
    # random state.
    seed_list = np.random.randint(0, 10000, (args.train_max_iter,))

    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        # NOTE(review): the original code toggled Two_Stage on after iteration
        # 250 and then unconditionally reset it to False; the effective value
        # was always False, which is preserved here.
        Two_Stage = False
        pred_ratings, reg_loss, user_out, movie_out, W = net(
            dataset.train_enc_graph, dataset.train_dec_graph,
            dataset.user_feature, dataset.movie_feature, Two_Stage)

        if args.loss_func == "CE":
            loss = rating_loss_net(pred_ratings, train_gt_labels).mean() + args.ARR * reg_loss
        elif args.loss_func == "Hinge":
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(dim=1)
            gap = (real_pred_ratings - train_gt_labels) ** 2
            # Penalize large squared gaps quadratically harder.
            loss = th.where(gap > 1.0, gap * gap, gap).mean()
        elif args.loss_func == "MSE":
            loss = th.mean((pred_ratings[:, 0] -
                            nd_possible_rating_values[train_gt_labels]) ** 2) + args.ARR * reg_loss
        count_loss += loss.item()

        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        # Convert class logits back to an expected rating for RMSE tracking
        # (the Hinge branch already computed real_pred_ratings above).
        if args.loss_func == "CE":
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(dim=1)
        elif args.loss_func == "MSE":
            real_pred_ratings = pred_ratings[:, 0]

        rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        # FIX: build logging_str fresh each iteration.  The original reused a
        # stale value (or raised UnboundLocalError) whenever the validation
        # interval fired on an iteration without a train-log entry.
        logging_str = None
        if iter_idx % args.train_log_interval == 0:
            # NOTE(review): the CSV logs loss averaged over iter_idx+1 while
            # the console uses iter_idx — kept as in the original output.
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / (iter_idx + 1),
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss / iter_idx, count_rmse / count_num,
                np.average(dur))
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
            test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
            if logging_str is None:
                logging_str = "Iter={}".format(iter_idx)
            logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            dev_step(args, net, dataset=dataset, segment='test', debug=False)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                # Re-run test evaluation in debug mode so diagnostics are
                # dumped for the new best checkpoint.
                test_rmse = evaluate(args=args, net=net, dataset=dataset,
                                     segment='test', debug=True, idx=iter_idx)
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience \
                        and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0

        if logging_str is not None and iter_idx % args.train_log_interval == 0:
            print(logging_str)

    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Пример #11
0
def train(args):
    """Train a GCMC-style model and log top-K recall/NDCG ranking metrics.

    Builds the training encoder graph once with 99 sampled negative items per
    user, trains with a cross-entropy rating loss for
    ``args.train_max_iter`` iterations, and periodically runs ``dev_step`` to
    compute recall@{50,100,200} and NDCG@{50,100,200}, appending the results
    to ``NDCG.csv`` under ``args.save_dir``.
    """
    print(args)
    dataset = DataSetLoader(args.data_name,
                            args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio,
                            sample_rate=args.sample_rate)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels

    ### prepare the logger
    NDCG_logger = MetricLogger(
        ['recall50', 'recall100', 'recall200', 'ndcg50', 'ndcg100', 'ndcg200'],
        ['%.4f', '%.4f', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'NDCG.csv'))

    ### running training-loss accumulator
    count_loss = 0

    # Move all graphs to the target device once; validation shares the
    # training encoder graph.
    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    # Interaction matrices / test set used by dev_step for ranking metrics.
    train_m = dataset.train_m
    test_m = dataset.test_m
    tset = dataset.tset

    print("Start training ...")

    train_rating_pairs, train_rating_values = dataset._generate_pair_value(
        dataset.train_rating_info)

    def update_encode_graph(dataset, train_rating_pairs, train_rating_values,
                            sampled_data):
        """Rebuild the training encoder graph with sampled zero-rating edges added."""
        train_rating_pairs_zeros, train_rating_values_zeros = dataset._generate_pair_value_for_zero(
            dataset.train_rating_info, sampled_data)
        train_rating_pairs = (np.append(train_rating_pairs[0],
                                        train_rating_pairs_zeros[0]),
                              np.append(train_rating_pairs[1],
                                        train_rating_pairs_zeros[1]))
        train_rating_values = np.append(train_rating_values,
                                        train_rating_values_zeros)
        dataset.train_enc_graph = dataset._generate_enc_graph(
            train_rating_pairs, train_rating_values, add_support=True)
        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.valid_enc_graph = dataset.train_enc_graph
        return dataset.train_enc_graph

    def sample_data(interact_status, random_number, sample_rate):
        """Draw ``sample_rate`` negative items per user, seeded for reproducibility."""
        random.seed(random_number)
        interact_status['negative_samples'] = interact_status[
            'negative_items'].apply(lambda x: random.sample(x, sample_rate))
        return interact_status[[
            'user_id', 'negative_items', 'negative_samples'
        ]]

    # NOTE(review): seed_list is unused by the active code path, but the call
    # advances NumPy's global RNG, so removing it would change downstream
    # random state.
    seed_list = np.random.randint(0, 10000, (args.train_max_iter, ))
    Two_Stage = False
    negitive_all = dataset.negative_all(dataset.train_rating_info)

    # Build the encoder graph once with 99 sampled negatives per user.
    sampled_data = sample_data(negitive_all, random_number=1, sample_rate=99)
    dataset.train_enc_graph = update_encode_graph(dataset, train_rating_pairs,
                                                  train_rating_values,
                                                  sampled_data)
    dataset.valid_enc_graph = dataset.train_enc_graph

    for iter_idx in range(1, args.train_max_iter):
        print("iter:", iter_idx)
        net.train()
        pred_ratings, reg_loss, user_out, movie_out, W = net(
            dataset.train_enc_graph, dataset.train_dec_graph,
            dataset.user_feature, dataset.movie_feature, Two_Stage)
        loss = rating_loss_net(pred_ratings,
                               train_gt_labels).mean() + args.ARR * reg_loss
        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()
        real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                             nd_possible_rating_values.view(1, -1)).sum(dim=1)

        # Evaluate ranking metrics every 10 iterations during warm-up and on
        # every iteration from 500 onwards.
        # FIX: the original nested `if iter_idx >= 500` inside
        # `if iter_idx < 100`, which could never execute, and it logged the
        # stale warm-up values instead of the freshly computed ones.
        if iter_idx < 100:
            if iter_idx % 10 == 0:
                recall50, recall100, recall200, ndcg50, ndcg100, ndcg200 = \
                    dev_step(tset, train_m, test_m, net, dataset, args, nd_possible_rating_values)
                NDCG_logger.log(recall50=recall50,
                                recall100=recall100,
                                recall200=recall200,
                                ndcg50=ndcg50,
                                ndcg100=ndcg100,
                                ndcg200=ndcg200)
        elif iter_idx >= 500:
            recall50, recall100, recall200, ndcg50, ndcg100, ndcg200 = \
                dev_step(tset, train_m, test_m, net, dataset, args, nd_possible_rating_values)
            NDCG_logger.log(recall50=recall50,
                            recall100=recall100,
                            recall200=recall200,
                            ndcg50=ndcg50,
                            ndcg100=ndcg100,
                            ndcg200=ndcg200)

    NDCG_logger.close()
Пример #12
0
def train(args):
    """Run the IGMC training loop on MovieLens and report the best RMSE.

    Trains for ``args.train_epochs`` epochs, evaluating after every epoch on
    the test split (when ``args.testing``) or the validation split otherwise,
    decaying the learning rate every ``args.train_lr_decay_step`` epochs, and
    appending the best result to ``log.txt`` under ``args.save_dir``.
    """
    ### prepare data and set model
    movielens = MovieLens(args.data_name,
                          testing=args.testing,
                          test_ratio=args.data_test_ratio,
                          valid_ratio=args.data_valid_ratio)

    # Evaluate on the held-out test pairs when testing, otherwise on the
    # validation pairs; the subgraph-extraction settings are identical.
    if args.testing:
        eval_pairs = movielens.test_rating_pairs
        eval_values = movielens.test_rating_values
    else:
        eval_pairs = movielens.valid_rating_pairs
        eval_values = movielens.valid_rating_values
    test_dataset = MovieLensDataset(eval_pairs, eval_values,
                                    movielens.train_graph, args.hop,
                                    args.sample_ratio, args.max_nodes_per_hop)
    train_dataset = MovieLensDataset(movielens.train_rating_pairs,
                                     movielens.train_rating_values,
                                     movielens.train_graph, args.hop,
                                     args.sample_ratio, args.max_nodes_per_hop)

    train_loader = th.utils.data.DataLoader(train_dataset,
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            num_workers=args.num_workers,
                                            collate_fn=collate_movielens)
    test_loader = th.utils.data.DataLoader(test_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_movielens)

    model = IGMC(
        in_feats=(args.hop + 1) * 2,  # one-hot hop labels for both node roles
        latent_dim=[32, 32, 32, 32],
        num_relations=5,  # movielens.num_rating
        num_bases=4,
        regression=True,
        edge_dropout=args.edge_dropout,
    ).to(args.device)
    loss_fn = nn.MSELoss().to(args.device)
    optimizer = optim.Adam(model.parameters(), lr=args.train_lr, weight_decay=0)
    print("Loading network finished ...\n")

    ### prepare the logger
    logger = MetricLogger(args.save_dir, args.valid_log_interval)

    best_epoch = 0
    best_rmse = np.inf
    ### declare the loss information
    print("Start training ...")
    for epoch_idx in range(1, args.train_epochs + 1):
        print('Epoch', epoch_idx)

        train_loss = train_epoch(model, loss_fn, optimizer, args.arr_lambda,
                                 train_loader, args.device,
                                 args.train_log_interval)
        test_rmse = evaluate(model, test_loader, args.device)
        eval_info = {
            'epoch': epoch_idx,
            'train_loss': train_loss,
            'test_rmse': test_rmse,
        }
        print('=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(
            epoch_idx, train_loss, test_rmse))

        # Multiply every param group's LR by the decay factor once per step.
        if epoch_idx % args.train_lr_decay_step == 0:
            for group in optimizer.param_groups:
                group['lr'] *= args.train_lr_decay_factor

        logger.log(eval_info, model, optimizer)
        if test_rmse < best_rmse:
            best_rmse = test_rmse
            best_epoch = epoch_idx

    summary = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(
        best_rmse, best_epoch)
    print(summary)
    with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f:
        f.write(summary)