def main():
    """Benchmark NeuMF inference latency on a fixed random batch.

    Builds the model from the parsed command-line arguments, optionally
    restores a checkpoint and enables mixed precision through
    ``amp.initialize`` (when ``args.opt_level == "O2"``), then times
    ``args.num_batches`` forward passes over one random user/item batch.
    The batch size, the best latency/throughput and the full list of
    per-batch latencies (seconds) are written to ``LOGGER``.
    """
    log_hardware()
    args = parse_args()
    log_args(args)

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
                  mlp_layer_sizes=args.layers, dropout=args.dropout)

    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.opt_level == "O2":
        model = amp.initialize(model, opt_level=args.opt_level,
                               keep_batchnorm_fp32=False, loss_scale='dynamic')

    # Put the network in inference mode: the model is constructed with a
    # dropout rate, and leaving it in training mode would keep dropout
    # active while benchmarking inference.
    model.eval()

    # A single random batch is reused for every timed iteration.
    users = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_users)
    items = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_items)

    latencies = []
    for _ in range(args.num_batches):
        # Synchronize before and after the forward pass so that the wall
        # clock covers the complete GPU execution of this batch only.
        torch.cuda.synchronize()
        start = time.time()
        _ = model(users, items, sigmoid=True)
        torch.cuda.synchronize()
        latencies.append(time.time() - start)

    LOGGER.log(key='batch_size', value=args.batch_size)
    LOGGER.log(key='best_inference_throughput', value=args.batch_size / min(latencies))
    LOGGER.log(key='best_inference_latency', value=min(latencies))
    LOGGER.log(key='inference_latencies', value=latencies)
示例#2
0
def main():
    """Time NeuMF forward passes and report latency/throughput via dllogger.

    The parsed arguments configure the logger backends (JSON stream file and
    stdout), the model, an optional checkpoint restore and optional
    ``amp.initialize`` mixed precision (``args.opt_level == "O2"``).  One
    random user/item batch is then pushed through the model
    ``args.num_batches`` times and the best/mean latency and throughput,
    plus all raw latencies (seconds), are logged in a single record.
    """
    args = parse_args()

    # Logging: one JSON stream on disk and one copy on stdout.
    verbosity = dllogger.Verbosity.VERBOSE
    json_backend = dllogger.JSONStreamBackend(verbosity=verbosity,
                                              filename=args.log_path)
    stdout_backend = dllogger.StdOutBackend(verbosity=verbosity)
    dllogger.init(backends=[json_backend, stdout_backend])
    dllogger.log(data=vars(args), step='PARAMETER')

    # Model construction and placement on the GPU.
    net = NeuMF(nb_users=args.n_users,
                nb_items=args.n_items,
                mf_dim=args.factors,
                mlp_layer_sizes=args.layers,
                dropout=args.dropout).cuda()

    if args.load_checkpoint_path:
        net.load_state_dict(torch.load(args.load_checkpoint_path))

    if args.opt_level == "O2":
        net = amp.initialize(net,
                             opt_level=args.opt_level,
                             keep_batchnorm_fp32=False,
                             loss_scale='dynamic')
    net.eval()

    # The same random batch is reused for every timed iteration.
    batch = args.batch_size
    user_ids = torch.cuda.LongTensor(batch).random_(0, args.n_users)
    item_ids = torch.cuda.LongTensor(batch).random_(0, args.n_items)

    latencies = []
    for _ in range(args.num_batches):
        # Bracket the forward pass with synchronizations so the measured
        # interval covers the whole GPU execution of this batch.
        torch.cuda.synchronize()
        tic = time.time()
        _ = net(user_ids, item_ids, sigmoid=True)
        torch.cuda.synchronize()
        latencies.append(time.time() - tic)

    best_latency = min(latencies)
    mean_latency = np.mean(latencies)
    summary = {
        'batch_size': batch,
        'best_inference_throughput': batch / best_latency,
        'best_inference_latency': best_latency,
        'mean_inference_throughput': batch / mean_latency,
        'mean_inference_latency': mean_latency,
        'inference_latencies': latencies,
    }
    dllogger.log(data=summary, step=())
    dllogger.flush()
示例#3
0
def main():
    """Benchmark NeuMF inference for several batch sizes.

    ``args.batch_sizes`` is a comma-separated list of integers.  For each
    batch size one random user/item batch is pushed through the model
    ``args.num_batches`` times; the mean throughput, the mean latency and the
    90th/95th/99th percentile latencies (seconds) are collected and logged as
    one dllogger record.
    """
    args = parse_args()
    dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                       filename=args.log_path),
                            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])

    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
                  mlp_layer_sizes=args.layers, dropout=args.dropout)

    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.fp16:
        model.half()
    model.eval()

    batch_sizes = [int(s) for s in args.batch_sizes.split(',')]

    result_data = {}
    for batch_size in batch_sizes:
        print('benchmarking batch size: ', batch_size)
        users = torch.cuda.LongTensor(batch_size).random_(0, args.n_users)
        items = torch.cuda.LongTensor(batch_size).random_(0, args.n_items)

        latencies = []
        for _ in range(args.num_batches):
            # Synchronize on both sides of the forward pass so the wall
            # clock covers the complete GPU execution of this batch.
            torch.cuda.synchronize()
            start = time.time()
            _ = model(users, items, sigmoid=True)
            torch.cuda.synchronize()
            latencies.append(time.time() - start)

        mean_latency = np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / mean_latency
        result_data[f'batch_{batch_size}_mean_latency'] = mean_latency
        # np.percentile expects q on a 0-100 scale (not 0-1), so the p90/p95/
        # p99 keys must be computed with 90/95/99.
        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
    dllogger.flush()
示例#4
0
def main():
    """Train NeuMF and evaluate HR/NDCG after every epoch.

    Steps performed:

    1. Parse arguments, seed torch/numpy when ``args.seed`` is given, and save
       the run configuration under ``./run/neumf/<timestamp>``.
    2. Load the training dataset, test ratings and test negatives from
       ``args.data``.
    3. Build the model, an Adam optimizer and a ``BCEWithLogitsLoss``
       criterion (moved to the GPU when CUDA is enabled and available).
    4. Evaluate once before training, then train for up to ``args.epochs``
       epochs, evaluating after each one.  Training stops early when the mean
       hit ratio reaches ``args.threshold`` (if a threshold is set).

    Progress markers are emitted through ``mlperf_log.ncf_print``.
    """
    # Note: The run start is in convert.py

    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    # NOTE(review): datetime.utcnow() is naive, so .timestamp() interprets it
    # as local time; the value only names the run directory here.
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    os.makedirs(run_dir, exist_ok=True)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LEARN_RATE,
                         value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda,
                            processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()

        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            if use_cuda:
                # `async` is a reserved keyword since Python 3.7; the
                # supported spelling of the asynchronous-copy flag is
                # `non_blocking` (effective because the DataLoader pins
                # host memory).
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Show the running loss on the progress bar
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": float(np.mean(hits))
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))
        # Early stop once the target hit ratio is reached
        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
示例#5
0
def main():
    """Train NeuMF on pre-processed, scaled data and track HR/NDCG.

    Steps performed:

    1. Parse arguments, seed torch/numpy when ``args.seed`` is given, and save
       the run configuration under ``./run/neumf/<timestamp>``.
    2. Load the chunked test ratings, the pickled sampler cache and the
       chunked test negatives from ``args.data``.
    3. Pre-compute the evaluation inputs (test users/items, duplicate mask and
       the index of the real sample per user).
    4. Build the model, Adam optimizer and an element-wise
       ``BCEWithLogitsLoss`` (traced with ``torch.jit.trace``).
    5. Evaluate once, then for each epoch: generate negatives, shuffle, train
       batch by batch, and evaluate.  Training stops early when the hit ratio
       reaches ``args.threshold`` (if set).

    Progress markers are emitted through ``mlperf_log.ncf_print``; the elapsed
    wall time is printed as ``mlperf_score`` when the threshold was reached.
    """

    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    # Check where to put data loader
    if use_cuda:
        dataloader_device = 'cpu' if args.cpu_dataloader else 'cuda'
    else:
        dataloader_device = 'cpu'

    # These log entries now act more like a trigger for the load timer
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL,
                         value=args.valid_negative)
    # The default of np.random.choice is replace=True, so does pytorch random_()
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
                         value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT,
                         value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    # sync worker before timing.
    torch.cuda.synchronize()

    #===========================================================================
    #== The clock starts on loading the preprocessed data. =====================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    run_start_time = time.time()

    print(datetime.now(), "Loading test ratings.")
    test_ratings = [torch.LongTensor()] * args.user_scaling

    for chunk in range(args.user_scaling):
        test_ratings[chunk] = torch.from_numpy(
            np.load(args.data + '/testx' + str(args.user_scaling) + 'x' +
                    str(args.item_scaling) + '_' + str(chunk) + '.npz',
                    encoding='bytes')['arr_0'])

    fn_prefix = args.data + '/' + CACHE_FN.format(args.user_scaling,
                                                  args.item_scaling)
    sampler_cache = fn_prefix + "cached_sampler.pkl"
    print(datetime.now(), "Loading preprocessed sampler.")
    # NOTE(review): this tests args.data (the data directory), not
    # sampler_cache.  If the test ever fails, `sampler`, `pos_users`,
    # `pos_items` and `nb_items` stay unbound and the code below raises.
    if os.path.exists(args.data):
        print("Using alias file: {}".format(args.data))
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only load sampler caches produced by a trusted source.
        with open(sampler_cache, "rb") as f:
            sampler, pos_users, pos_items, nb_items, _ = pickle.load(f)
    print(datetime.now(), "Alias table loaded.")

    nb_users = len(sampler.num_regions)
    train_users = torch.from_numpy(pos_users).type(torch.LongTensor)
    train_items = torch.from_numpy(pos_items).type(torch.LongTensor)

    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users))
    # produce things not change between epoch
    # mask for filtering duplicates with real sample
    # note: test data is removed before create mask, same as reference
    # create label: ones for the positives followed by zeros for the negatives
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label, neg_label))
    del neg_label

    test_pos = [l[:, 1].reshape(-1, 1) for l in test_ratings]
    test_negatives = [torch.LongTensor()] * args.user_scaling

    print(datetime.now(), "Loading test negatives.")
    for chunk in range(args.user_scaling):
        file_name = (args.data + '/test_negx' + str(args.user_scaling) + 'x' +
                     str(args.item_scaling) + '_' + str(chunk) + '.npz')
        raw_data = np.load(file_name, encoding='bytes')
        test_negatives[chunk] = torch.from_numpy(raw_data['arr_0'])
        print(
            datetime.now(),
            "Test negative chunk {} of {} loaded ({} users).".format(
                chunk + 1, args.user_scaling, test_negatives[chunk].size()))

    test_neg_items = [l[:, 1] for l in test_negatives]

    # create items with real sample at last position
    test_items = [
        torch.cat((a.reshape(-1, args.valid_negative), b), dim=1)
        for a, b in zip(test_neg_items, test_pos)
    ]
    del test_ratings, test_neg_items

    # generate dup mask and real indice for exact same behavior on duplication compare to reference
    # here we need a sort that is stable(keep order of duplicates)
    # this is a version works on integer
    sorted_items, indices = zip(*[torch.sort(l)
                                  for l in test_items])  # [1,1,1,2], [3,1,0,2]
    sum_item_indices = [
        a.float() + b.float() / len(b[0])
        for a, b in zip(sorted_items, indices)
    ]  #[1.75,1.25,1.0,2.5]
    indices_order = [torch.sort(l)[1] for l in sum_item_indices]  #[2,1,0,3]
    stable_indices = [
        torch.gather(a, 1, b) for a, b in zip(indices, indices_order)
    ]  #[0,1,3,2]
    # produce -1 mask
    dup_mask = [(l[:, 0:-1] == l[:, 1:]) for l in sorted_items]
    dup_mask = [
        torch.cat((torch.zeros_like(a, dtype=torch.uint8), b), dim=1)
        for a, b in zip(test_pos, dup_mask)
    ]
    dup_mask = [
        torch.gather(a, 1,
                     b.sort()[1]) for a, b in zip(dup_mask, stable_indices)
    ]
    # produce real sample indices to later check in topk
    sorted_items, indices = zip(*[(a != b).sort()
                                  for a, b in zip(test_items, test_pos)])
    sum_item_indices = [(a.float()) + (b.float()) / len(b[0])
                        for a, b in zip(sorted_items, indices)]
    indices_order = [torch.sort(l)[1] for l in sum_item_indices]
    stable_indices = [
        torch.gather(a, 1, b) for a, b in zip(indices, indices_order)
    ]
    real_indices = [l[:, 0] for l in stable_indices]
    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos

    # For our dataset, test set is identical to user set, so arange() provides
    # all test users.
    test_users = torch.arange(nb_users, dtype=torch.long)
    test_users = test_users[:, None]
    # Broadcast each user id across its (1 + valid_negative) candidate items
    test_users = test_users + torch.zeros(1 + args.valid_negative,
                                          dtype=torch.long)
    # test_items needs to be of type Long in order to be used in embedding
    test_items = torch.cat(test_items).type(torch.long)

    dup_mask = torch.cat(dup_mask)
    real_indices = torch.cat(real_indices)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # we shuffled later with randperm

    print(
        datetime.now(),
        "Data loading done {:.1f} sec. #user={}, #item={}, #train={}, #test={}"
        .format(time.time() - run_start_time, nb_users, nb_items,
                len(train_users), nb_users))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    params = model.parameters()

    optimizer = torch.optim.Adam(params,
                                 lr=args.learning_rate,
                                 betas=(args.beta1, args.beta2),
                                 eps=args.eps)
    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    local_batch = args.batch_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    samples_per_user = test_items.size(1)
    users_per_valid_batch = args.valid_batch_size // samples_per_user

    test_users = test_users.split(users_per_valid_batch)
    test_items = test_items.split(users_per_valid_batch)
    dup_mask = dup_mask.split(users_per_valid_batch)
    real_indices = real_indices.split(users_per_valid_batch)

    hr, ndcg = val_epoch(model,
                         test_users,
                         test_items,
                         dup_mask,
                         real_indices,
                         args.topk,
                         samples_per_user=samples_per_user,
                         num_user=nb_users)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=hr, ndcg=ndcg))
    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):

        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=args.negative_samples)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()

        st = timeit.default_timer()
        if args.random_negatives:
            neg_users = train_users.repeat(args.negative_samples)
            neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(
                0, nb_items)
        else:
            negatives = generate_negatives(sampler, args.negative_samples,
                                           train_users.numpy())
            negatives = torch.from_numpy(negatives)
            neg_users = negatives[:, 0]
            neg_items = negatives[:, 1]

        # The template must be formatted explicitly; passing the value as a
        # second print() argument would emit the literal "{:.2f}".
        print("generate_negatives loop time: {:.2f}".format(
            timeit.default_timer() - st))

        after_neg_gen = time.time()

        st = timeit.default_timer()
        epoch_users = torch.cat((train_users, neg_users))
        epoch_items = torch.cat((train_items, neg_items))
        del neg_users, neg_items

        # shuffle prepared data and split into batches
        epoch_indices = torch.randperm(len(epoch_users),
                                       device=dataloader_device)
        epoch_size = len(epoch_indices)
        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]
        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)

        print("shuffle time: {:.2f}".format(timeit.default_timer() - st))

        # only print progress bar on rank 0
        num_batches = (epoch_size + args.batch_size - 1) // args.batch_size
        qbar = tqdm.tqdm(range(num_batches))
        # handle extremely rare case where last batch size < number of worker
        if len(epoch_users_list) < num_batches:
            print("epoch_size % batch_size < number of worker!")
            exit(1)

        after_shuffle = time.time()

        neg_gen_time = (after_neg_gen - begin)
        shuffle_time = (after_shuffle - after_neg_gen)

        for i in qbar:
            # selecting input from prepared data
            user = epoch_users_list[i].cuda()
            item = epoch_items_list[i].cuda()
            label = epoch_label_list[i].view(-1, 1).cuda()

            # Reset gradients by dropping them instead of zeroing in place
            for p in model.parameters():
                p.grad = None

            outputs = model(user, item)
            loss = traced_criterion(outputs, label).float()
            loss = torch.mean(loss.view(-1), 0)

            loss.backward()
            optimizer.step()

        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin
        begin = time.time()

        mlperf_log.ncf_print(key=mlperf_log.EVAL_START, value=epoch)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=samples_per_user,
                             num_user=nb_users,
                             output=valid_results_file,
                             epoch=epoch,
                             loss=loss.data.item())

        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}, loss = {loss:.4f},'
            ' neg_gen: {neg_gen_time:.4f}, shuffle_time: {shuffle_time:.2f}'.
            format(epoch=epoch,
                   K=args.topk,
                   hit_rate=hr,
                   ndcg=ndcg,
                   train_time=train_time,
                   val_time=val_time,
                   loss=loss.data.item(),
                   neg_gen_time=neg_gen_time,
                   shuffle_time=shuffle_time))

        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": hr
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP, value=epoch)

        # Early stop once the target hit ratio is reached
        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    run_stop_time = time.time()
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)

    # easy way of tracking mlperf score
    if success:
        print("mlperf_score", run_stop_time - run_start_time)
示例#6
0
def main():
    log_hardware()
    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)
    log_args(args)

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Saving results to {}".format(args.checkpoint_dir))
    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # The default of np.random.choice is replace=True, so does pytorch random_()
    LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()
    LOGGER.log(key=tags.RUN_START)

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(
                                   args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(
                                  args.local_rank)))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:{}'.format(
                               args.local_rank)))

    valid_negative = test_negs.shape[1]
    LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=valid_negative)

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1
    LOGGER.log(key=tags.INPUT_SIZE, value=len(train_ratings))

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
    LOGGER.log(key=tags.INPUT_ORDER)  # we shuffled later with randperm

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.opt_level == "O2":
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level,
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(admm_utils.count_parameters(model)))
    LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
    LOGGER.log(key=tags.OPT_NAME, value="Adam")
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA2, value=args.beta2)
    LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
    LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        LOGGER.log(key=tags.EVAL_START, value=0)
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
            K=args.topk, hit_rate=hr, ndcg=ndcg))
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": 0, "value": hr})
        LOGGER.log(key=tags.EVAL_STOP, value=0)
        LOGGER.log(key='best_eval_throughput', value=eval_throughput)
        return

    success = False
    max_hr = 0
    train_throughputs, eval_throughputs = [], []

    LOGGER.log(key=tags.TRAIN_LOOP)
    for epoch in range(args.epochs):

        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
        LOGGER.log(key=tags.INPUT_HP_NUM_NEG, value=args.negative_samples)
        LOGGER.log(key=tags.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.opt_level == "O2":
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)
        LOGGER.log(key='train_throughput', value=train_throughput)
        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        LOGGER.log(key=tags.EVAL_START, value=epoch)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hr,
                ndcg=ndcg,
                train_time=train_time,
                val_time=val_time))

        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)
        LOGGER.log(key='eval_throughput', value=eval_throughput)

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                'model.pth')
            print("New best hr! Saving the model to: ", save_checkpoint_path)
            torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    if args.local_rank == 0:
        LOGGER.log(key='best_train_throughput', value=max(train_throughputs))
        LOGGER.log(key='best_eval_throughput', value=max(eval_throughputs))
        LOGGER.log(key='best_accuracy', value=max_hr)
        LOGGER.log(key='time_to_target', value=time.time() - main_start_time)
        LOGGER.log(key='time_to_best_model',
                   value=best_model_timestamp - main_start_time)

        LOGGER.log(key=tags.RUN_STOP, value={"success": success})
        LOGGER.log(key=tags.RUN_FINAL)
示例#7
0
def main():
    """Train NeuMF and profile 100 mid-epoch iterations of the first epoch.

    Parses the command line, writes the run configuration and a textual
    model description under ``./run/neumf/<timestamp>``, evaluates the
    untrained model once, then trains for ``args.epochs`` epochs with Adam
    and ``BCEWithLogitsLoss``, running ``val_epoch`` after every epoch.
    During epoch 0, ``cuda.profile_start()`` / ``cuda.profile_stop()``
    bracket 100 iterations starting at the middle of the epoch.

    Returns:
        ``0`` as soon as the mean hit rate reaches ``args.threshold``;
        ``None`` if training runs through all epochs.

    Raises:
        SystemExit: if the data loader yields fewer than 101 batches, since
            the 100-iteration profiling window would not fit.
    """
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = dict(vars(args))
    # NOTE: utcnow() is naive, so .timestamp() interprets it as local time;
    # the value is only used as a unique run-directory name here.
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(run_dir, exist_ok=True)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)

    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0.] * len(args.layers))
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda,
                            processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    for epoch in range(args.epochs):

        model.train()
        losses = utils.AverageMeter()

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        length = len(loader)
        if length < 101:
            print(
                'Exiting, cannot profile the required 100 iterations. Please re-run with a larger batch size.'
            )
            cuda.profile_stop()
            raise SystemExit

        # The profiling window covers 100 iterations from the epoch midpoint.
        profile_begin = length // 2
        profile_end = profile_begin + 100
        for batch_index, (user, item, label) in enumerate(loader):
            if epoch == 0 and batch_index == profile_begin:
                print('Starting profiling for 100 iterations.')
                cuda.profile_start()

            if epoch == 0 and batch_index == profile_end:
                print(
                    'Profiling completed, stopping profiling and continuing training.'
                )
                cuda.profile_stop()

            if use_cuda:
                # `async` is a reserved keyword since Python 3.7; the
                # PyTorch argument is named `non_blocking`.
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Show running loss on the progress bar
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        val_time = time.time() - begin
        hit_rate = np.mean(hits)
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hit_rate,
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))

        # Stop early once the target hit rate is reached.
        if args.threshold is not None and hit_rate >= args.threshold:
            print("Hit threshold of {}".format(args.threshold))
            return 0
示例#8
0
def main():
    """Train NeuMF while accumulating per-phase wall-clock timings.

    Parses the command line, writes the run configuration and a textual
    model description under ``./run/neumf/<timestamp>``, evaluates the
    untrained model once, then trains for ``args.epochs`` epochs with Adam
    and ``BCEWithLogitsLoss``. For every epoch it sums the time spent on
    moving the batch to the device, forward pass, ``zero_grad``, backward
    pass, optimizer step and progress-bar update, and prints those totals
    together with the validation metrics from ``val_epoch``.

    Returns:
        ``0`` as soon as the mean hit rate reaches ``args.threshold``;
        ``None`` if training runs through all epochs.
    """
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = dict(vars(args))
    # NOTE: utcnow() is naive, so .timestamp() interprets it as local time;
    # the value is only used as a unique run-directory name here.
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(run_dir, exist_ok=True)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    # TODO: Reading CSVs is slow. Could use HDF or Apache Arrow
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    print('batchsize=%d' % args.batch_size)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=8,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0.] * len(args.layers))
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))
    for epoch in range(args.epochs):
        model.train()
        losses = utils.AverageMeter()

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        # Per-epoch accumulated wall-clock seconds for each training phase.
        counting_data = 0
        counting_forward = 0
        counting_zerograd = 0
        counting_backward = 0
        counting_updateweight = 0
        counting_des = 0
        for batch_index, (user, item, label) in enumerate(loader):
            start0 = time.time()
            if use_cuda:
                # `async` is a reserved keyword since Python 3.7; the
                # PyTorch argument is named `non_blocking`.
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            start1 = time.time()
            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.item(), user.size(0))
            start2 = time.time()

            optimizer.zero_grad()
            start3 = time.time()
            loss.backward()
            start4 = time.time()
            optimizer.step()
            start5 = time.time()

            # Show running loss on the progress bar
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

            start6 = time.time()

            counting_data += start1 - start0
            counting_forward += start2 - start1
            counting_zerograd += start3 - start2
            counting_backward += start4 - start3
            # Optimizer step only: measured from the end of backward
            # (start4), not from its beginning.
            counting_updateweight += start5 - start4
            counting_des += start6 - start5

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch)
        val_time = time.time() - begin
        print(
            'data: {data:.4f}, forward: {ft:.4f}, zerograd: {zg:.4f}, backward: {bw:.4f},'
            ' adam: {adam:.4f}, description: {des:.4f}'.format(
                data=counting_data,
                ft=counting_forward,
                zg=counting_zerograd,
                bw=counting_backward,
                adam=counting_updateweight,
                des=counting_des))
        hit_rate = np.mean(hits)
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hit_rate,
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))
        # Stop early once the target hit rate is reached.
        if args.threshold is not None and hit_rate >= args.threshold:
            print("Hit threshold of {}".format(args.threshold))
            return 0
示例#9
0
def main():
    """Evaluate a fixed-size NeuMF model, optionally with quantized embeddings.

    Builds a NeuMF model with hard-coded user/item counts and layer sizes,
    optionally loads weights from ``args.load_ckp``, optionally wraps the
    embedding layers through ``ModelQuantizer`` when ``args.quantize`` is
    set, then runs ``val`` on the test data loaded from ``args.data`` and
    prints HR@K and NDCG@K.
    """
    args = parse_args()

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Create model
    mlp_layer_sizes = [256, 256, 128, 64]
    model = NeuMF(2197225,
                  855776,
                  mf_dim=64,
                  mf_reg=0.,
                  mlp_layer_sizes=mlp_layer_sizes,
                  mlp_layer_regs=[0.] * len(mlp_layer_sizes))

    print(model)

    if use_cuda:
        # Move model to GPU
        model = model.cuda()

    if args.load_ckp:
        # Without CUDA, remap tensors to CPU so a checkpoint saved from a
        # GPU run can still be deserialized; with CUDA keep torch.load's
        # default placement.
        ckp = torch.load(args.load_ckp,
                         map_location=None if use_cuda else 'cpu')
        model.load_state_dict(ckp)

    if args.quantize:
        # Only the embedding layers are selected for quantization.
        layers = [
            n for n, m in model.named_modules() if isinstance(m, nn.Embedding)
        ]
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapperPost,
            nn.ReLU6: ActivationModuleWrapperPost,
            nn.Linear: ParameterModuleWrapperPost,
            nn.Embedding: ActivationModuleWrapperPost
        }
        # Kept bound so the quantizer object stays alive during evaluation
        # (presumably it rewires the selected modules in `model` - confirm).
        mq = ModelQuantizer(model, args, layers, replacement_factory)  # noqa: F841

    test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user = data_loader(
        args.data)
    data = NcfData(test_users, test_items, dup_mask, real_indices, K,
                   samples_per_user, num_user)

    hr, ndcg = val(model, data)
    print('')
    print('')
    print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(K=K,
                                                                  hit_rate=hr,
                                                                  ndcg=ndcg))
示例#10
0
def main():
    """Train or evaluate NeuMF with dllogger reporting and optional AMP/DDP.

    Initializes distributed state and dllogger (real backends on local rank
    0 only), loads the pre-built rating tensors from ``args.data`` onto
    ``cuda:<local_rank>``, and builds the model, ``FusedAdam`` optimizer and
    an unreduced ``BCEWithLogitsLoss``. In ``args.mode == 'test'`` it runs a
    single ``val_epoch`` and returns. Otherwise it trains for up to
    ``args.epochs`` epochs with gradient accumulation, validates after each
    epoch, checkpoints the best model on local rank 0, stops early when
    ``args.threshold`` is reached, and finally logs a run summary on rank 0.
    """
    args = parse_args()
    init_distributed(args)

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    # Test truthiness first: os.path.exists(None) raises TypeError.
    if args.checkpoint_dir and not os.path.exists(args.checkpoint_dir):
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    device = torch.device('cuda:{}'.format(args.local_rank))
    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=device)
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=device)
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=device)

    valid_negative = test_negs.shape[1]

    # User/item counts are derived from the largest ids in the training set.
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        # Drop the 'module.' prefix from checkpoint parameter names.
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    # Every user is scored against one positive plus all sampled negatives.
    eval_size = all_test_users * (valid_negative + 1)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={
                         'best_eval_throughput': eval_throughput,
                         'hr@10': hr
                     })
        return

    max_hr = 0
    best_epoch = 0
    # Initialized up front so the final summary cannot hit an unbound name
    # when no epoch ever improves on max_hr (or when no epoch runs).
    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):

        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        # One optimizer step per group of `grads_accumulated` batches; any
        # trailing batches that do not fill a group are not used.
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            # Reset gradients by dropping them instead of zeroing in place.
            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch, ),
                     data={
                         'train_throughput': train_throughput,
                         'hr@10': hr,
                         'train_epoch_time': train_time,
                         'validation_epoch_time': val_time,
                         'eval_throughput': eval_throughput
                     })

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                    'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None and hr >= args.threshold:
            print("Hit threshold of {}".format(args.threshold))
            break

    if args.local_rank == 0:
        # Fall back to 0.0 when no epoch ran, instead of failing on empty
        # throughput lists.
        dllogger.log(data={
            'best_train_throughput':
            max(train_throughputs, default=0.0),
            'best_eval_throughput':
            max(eval_throughputs, default=0.0),
            'mean_train_throughput':
            np.mean(train_throughputs) if train_throughputs else 0.0,
            'mean_eval_throughput':
            np.mean(eval_throughputs) if eval_throughputs else 0.0,
            'best_accuracy':
            max_hr,
            'best_epoch':
            best_epoch,
            'time_to_target':
            time.time() - main_start_time,
            'time_to_best_model':
            best_model_timestamp - main_start_time
        },
                     step=tuple())
示例#11
0
def main():
    """Entry point of the NCF (NeuMF) sample: train, evaluate or collect stats.

    The parsed command-line arguments select one of three modes:

    * ``args.qe_calibration`` or ``args.activation_histograms`` set: run
      ``val_epoch`` on a portion of the test users through the matching
      Distiller collector (quantization stats or activation histograms),
      then return ``0``.
    * ``args.eval`` set: optionally apply post-training linear quantization
      and/or FP16 conversion, evaluate once, optionally save a "quantized"
      checkpoint, then return ``0``.
    * otherwise: train with Adam and ``BCEWithLogitsLoss`` for epochs
      ``start_epoch .. args.epochs - 1``, validating and checkpointing after
      every epoch, and stop early once the mean hit rate reaches
      ``args.threshold``.

    Side effects: sets the module-level ``msglogger`` and writes the run
    configuration, ``model.txt``, validation results and checkpoints into the
    logger's output directory.

    Returns:
        ``0`` from the stats-collection and evaluation-only modes, ``None``
        after the training loop.

    Raises:
        SystemExit: with status 1 when ``--gpus`` is malformed, names more
            than one device, or names a device that is not available.
    """
    global msglogger

    args = parse_args()

    # Distiller loggers
    msglogger = apputils.config_pylogger('logging.conf',
                                         args.name,
                                         output_dir=args.output_dir)
    tflogger = TensorBoardLogger(msglogger.logdir)
    # tflogger.log_gradients = True
    # pylogger = PythonLogger(msglogger)

    if args.seed is not None:
        msglogger.info("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Keep only the part after the first '.' of the string form
    # (presumably enum members rendered as 'EnumClass.MEMBER' -- TODO confirm).
    args.qe_mode = str(args.qe_mode).split('.')[1]
    args.qe_clip_acts = str(args.qe_clip_acts).split('.')[1]

    apputils.log_execution_env_state(sys.argv)

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error(
                'ERROR: Argument --gpus must be a comma-separated list of integers only'
            )
            sys.exit(1)
        if len(args.gpus) > 1:
            msglogger.error('ERROR: Only single GPU supported for NCF')
            sys.exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(dev_id, available_gpus))
                sys.exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Save configuration to file
    config = dict(vars(args))
    # NOTE(review): utcnow() is naive, so .timestamp() interprets it as local
    # time; the stored value is offset from the true epoch on non-UTC hosts.
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = msglogger.logdir
    msglogger.info("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    # Training data is only needed when none of the evaluation-style modes
    # (eval, calibration, histogram collection) was requested.
    training = not (args.eval or args.qe_calibration
                    or args.activation_histograms)
    msglogger.info('Loading data')
    if training:
        train_dataset = CFTrainDataset(
            os.path.join(args.data, TRAIN_RATINGS_FILENAME),
            args.negative_samples)
        train_dataloader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True)
        nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    else:
        train_dataset = None
        train_dataloader = None
        # Hard-coded dimensions used when the training set is not loaded
        # (presumably the MovieLens-20M user/item counts -- TODO confirm).
        nb_users, nb_items = (138493, 26744)

    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))

    msglogger.info(
        'Load data done [%.1f s]. #user=%d, #item=%d, #train=%s, #test=%d' %
        (time.time() - t1, nb_users, nb_items,
         str(train_dataset.mat.nnz) if training else 'N/A', len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers],
                  split_final=args.split_final)
    if use_cuda:
        model = model.cuda()
    msglogger.info(model)
    msglogger.info("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    compression_scheduler = None
    start_epoch = 0
    optimizer = None
    if args.load:
        if training:
            model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
                model, args.load)
            if args.reset_optimizer:
                # Keep the loaded weights but restart the schedule and
                # optimizer state from scratch.
                start_epoch = 0
                optimizer = None
        else:
            model = apputils.load_lean_checkpoint(model, args.load)

    # Add loss to graph
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        criterion = criterion.cuda()

    if training and optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.compress:
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress)
        # Re-apply device placement after the compression config touched the
        # model; honour --no-cuda / missing CUDA instead of forcing the GPU.
        if use_cuda:
            model.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    if args.qe_calibration or args.activation_histograms:
        calib = {
            'portion':
            args.qe_calibration,
            'desc_str':
            'quantization calibration stats',
            'collect_func':
            partial(distiller.data_loggers.collect_quant_stats,
                    inplace_runtime_check=True,
                    disable_inplace_attrs=True)
        }
        hists = {
            'portion':
            args.activation_histograms,
            'desc_str':
            'activation histograms',
            'collect_func':
            partial(distiller.data_loggers.collect_histograms,
                    activation_stats=None,
                    nbins=2048,
                    save_hist_imgs=True)
        }
        # Calibration takes precedence when both options are given.
        d = calib if args.qe_calibration else hists

        distiller.utils.assign_layer_fq_names(model)
        num_users = int(np.floor(len(test_ratings) * d['portion']))
        msglogger.info(
            "Generating {} based on {:.1%} of the test-set ({} users)".format(
                d['desc_str'], d['portion'], num_users))

        test_fn = partial(val_epoch,
                          ratings=test_ratings,
                          negs=test_negs,
                          K=args.topk,
                          use_cuda=use_cuda,
                          processes=args.processes,
                          num_users=num_users)
        d['collect_func'](model=model,
                          test_fn=test_fn,
                          save_dir=run_dir,
                          classes=None)

        return 0

    if args.eval:
        if args.quantize_eval and args.qe_calibration is None:
            # The quantizer is prepared on the CPU with a dummy
            # (user, item, flag) input, then the model is moved back.
            model.cpu()
            quantizer = quantization.PostTrainLinearQuantizer.from_args(
                model, args)
            dummy_input = (torch.tensor([1]), torch.tensor([1]),
                           torch.tensor([True], dtype=torch.bool))
            quantizer.prepare_model(dummy_input)
            if use_cuda:
                model.cuda()

        distiller.utils.assign_layer_fq_names(model)

        if args.eval_fp16:
            model = model.half()

        # Calculate initial Hit Ratio and NDCG
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                processes=args.processes)
        val_time = time.time() - begin
        hit_rate = np.mean(hits)
        msglogger.info(
            'Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, val_time = {val_time:.2f}'
            .format(K=args.topk,
                    hit_rate=hit_rate,
                    ndcg=np.mean(ndcgs),
                    val_time=val_time))
        # NOTE(review): hit_rate is reset here, so the 'quantized_hr@10' extra
        # saved below is always 0 rather than the HR measured above -- confirm
        # whether this is intended.
        hit_rate = 0

        if args.quantize_eval:
            checkpoint_name = 'quantized'
            apputils.save_checkpoint(0,
                                     'NCF',
                                     model,
                                     optimizer=None,
                                     extras={'quantized_hr@10': hit_rate},
                                     name='_'.join([args.name, 'quantized'])
                                     if args.name else checkpoint_name,
                                     dir=msglogger.logdir)
        return 0

    total_samples = len(train_dataloader.sampler)
    steps_per_epoch = math.ceil(total_samples / args.batch_size)
    best_hit_rate = 0
    best_epoch = 0
    for epoch in range(start_epoch, args.epochs):
        msglogger.info('')
        model.train()
        losses = utils.AverageMeter()

        begin = time.time()

        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch, optimizer)

        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            if use_cuda:
                # `non_blocking` replaces the old `async` keyword argument,
                # which is a reserved word (SyntaxError) since Python 3.7.
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(
                    epoch, batch_index, steps_per_epoch, optimizer)

            # Third positional argument is a boolean flag tensor; training
            # passes False because BCEWithLogitsLoss consumes raw outputs
            # (presumably the flag toggles a final sigmoid -- TODO confirm).
            outputs = model(user, item, torch.tensor([False],
                                                     dtype=torch.bool))
            loss = criterion(outputs, label)

            if compression_scheduler:
                compression_scheduler.before_backward_pass(
                    epoch,
                    batch_index,
                    steps_per_epoch,
                    loss,
                    optimizer,
                    return_loss_components=False)

            losses.update(loss.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch, batch_index,
                                                       steps_per_epoch,
                                                       optimizer)

            # Show running loss on the progress bar
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

            steps_completed = batch_index + 1
            if steps_completed % args.log_freq == 0:
                stats_dict = OrderedDict()
                stats_dict['Loss'] = losses.avg
                stats = ('Performance/Training/', stats_dict)
                params = model.named_parameters(
                ) if args.log_params_histograms else None
                distiller.log_training_progress(stats, params, epoch,
                                                steps_completed,
                                                steps_per_epoch, args.log_freq,
                                                [tflogger])

                tflogger.log_model_buffers(model,
                                           ['tracked_min', 'tracked_max'],
                                           'Quant/Train/Acts/TrackedMinMax',
                                           epoch, steps_completed,
                                           steps_per_epoch, args.log_freq)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        val_time = time.time() - begin

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        hit_rate = np.mean(hits)
        mean_ndcgs = np.mean(ndcgs)

        stats_dict = OrderedDict()
        stats_dict['HR@{0}'.format(args.topk)] = hit_rate
        stats_dict['NDCG@{0}'.format(args.topk)] = mean_ndcgs
        stats = ('Performance/Validation/', stats_dict)
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        msglogger.info(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, AvgTrainLoss = {loss.avg:.4f}, '
            'train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hit_rate,
                ndcg=mean_ndcgs,
                loss=losses,
                train_time=train_time,
                val_time=val_time))

        # Track the best epoch so the checkpoint can be flagged as "best".
        is_best = False
        if hit_rate > best_hit_rate:
            best_hit_rate = hit_rate
            is_best = True
            best_epoch = epoch
        extras = {
            'current_hr@10': hit_rate,
            'best_hr@10': best_hit_rate,
            'best_epoch': best_epoch
        }
        apputils.save_checkpoint(epoch,
                                 'NCF',
                                 model,
                                 optimizer,
                                 compression_scheduler,
                                 extras,
                                 is_best,
                                 dir=run_dir)

        if args.threshold is not None:
            if hit_rate >= args.threshold:
                msglogger.info("Hit threshold of {}".format(args.threshold))
                break
示例#12
0
def main():
    """Train NeuMF on pre-processed ratings, emitting MLPerf log events.

    Steps, in order:

    1. Parse arguments, initialise distributed mode, seed PyTorch and save
       the run configuration under ``./run/neumf/<timestamp>.<local_rank>``.
    2. Start the run clock, then load ``train_ratings.pt`` and
       ``test_ratings.pt`` from ``args.data`` onto ``cuda:<local_rank>``.
    3. Build the evaluation set: ``args.valid_negative`` sampled negatives
       per test row with the held-out positive in the last column, plus the
       duplicate mask and positive-item indices consumed by ``val_epoch``.
    4. Train for up to ``args.epochs`` epochs with FusedAdam and an
       element-wise BCE-with-logits loss, generating fresh training
       negatives and reshuffling every epoch.
    5. Evaluate after every epoch and stop early once the hit rate reaches
       ``args.threshold``; print ``mlperf_score`` (run wall time) on success.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 when the per-rank data splits into fewer
            batches than the global batch count requires.
    """

    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)

    # Save configuration to file
    config = dict(vars(args))
    # NOTE(review): utcnow() is naive, so .timestamp() interprets it as local
    # time; the value (and the run directory name) is offset on non-UTC hosts.
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}.{}".format(config['timestamp'],args.local_rank)
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # These events now mostly mark the point just before data loading starts
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
    # The default of np.random.choice is replace=True, and so is PyTorch's random_()
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    # Sync workers before timing.
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    #===========================================================================
    #== The clock starts on loading the preprocessed data. =====================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    run_start_time = time.time()

    # Load the unconverted ratings; the test ratings come as a separate file
    train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))

    # Get input data
    # Dimensions are derived from the largest user / item id seen in training
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item()+1
    nb_items = nb_maxs[1].item()+1
    train_users = train_ratings[:,0]
    train_items = train_ratings[:,1]
    del nb_maxs, train_ratings
    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users))
    # Produce things that do not change between epochs
    # Mask for filtering sampled negatives that collide with real samples:
    # 1 everywhere except at (user, item) pairs present in the training set
    # note: test data is removed before creating the mask, same as reference
    mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
    mat[train_users, train_items] = 0
    # Create labels: ones for the positives followed by zeros for
    # `negative_samples` negatives per positive
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label,neg_label))
    del neg_label
    if args.fp16:
        train_label = train_label.half()

    # Produce validation negative samples on GPU
    all_test_users = test_ratings.shape[0]

    test_users = test_ratings[:,0]
    test_pos = test_ratings[:,1].reshape(-1,1)
    test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1]

    # Create items with the real sample at the last position
    test_users = test_users.reshape(-1,1).repeat(1,1+args.valid_negative)
    test_items = torch.cat((test_negs.reshape(-1,args.valid_negative), test_pos), dim=1)
    del test_ratings, test_negs

    # Generate the duplicate mask and real-sample indices so that duplicates
    # are handled exactly like the reference implementation.
    # This needs a stable sort (one that keeps the order of duplicates);
    # the version below emulates it for integer data by adding the scaled
    # original position as a fractional tie-breaker.
    sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
    indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
    stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
    # Produce the mask marking repeated items within each row
    dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
    dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1)
    dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1])
    # Produce real-sample indices to check against the top-k later
    sorted_items, indices = (test_items != test_pos).sort()
    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
    indices_order = torch.sort(sum_item_indices)[1]
    stable_indices = torch.gather(indices, 1, indices_order)
    real_indices = stable_indices[:,0]
    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos

    if args.distributed:
        # Each rank evaluates its own contiguous chunk of the test set
        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
        real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]

    # Make PyTorch memory behavior more consistent later
    torch.cuda.empty_cache()

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)  # we shuffle later with randperm

    # Report the actual number of test rows rather than the user count.
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
          % (time.time()-run_start_time, nb_users, nb_items, len(train_users),
             all_test_users))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])

    if args.fp16:
        model = model.half()

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    if args.fp16:
        # The optimizer then steps on the wrapper's FP32 parameter copies
        fp_optimizer = Fp16Optimizer(model, args.loss_scale)
        params = fp_optimizer.fp32_params
    else:
        params = model.parameters()

    #optimizer = torch.optim.Adam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    # optimizer = AdamOpt(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    optimizer = FusedAdam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False)
    criterion = nn.BCEWithLogitsLoss(reduction = 'none') # use torch.mean() with dim later to avoid copy to host
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    if args.distributed:
        model = DDP(model)
        # args.batch_size is the global batch; each rank processes its share
        local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
    else:
        local_batch = args.batch_size
    traced_criterion = torch.jit.trace(criterion.forward, (torch.rand(local_batch,1),torch.rand(local_batch,1)))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')
    # Calculate initial Hit Ratio and NDCG
    test_x = test_users.view(-1).split(args.valid_batch_size)
    test_y = test_items.view(-1).split(args.valid_batch_size)

    hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
                         num_user=all_test_users, distributed=args.distributed)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'
          .format(K=args.topk, hit_rate=hr, ndcg=ndcg))
    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):

        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG, value=args.negative_samples)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()

        # Prepare data for the epoch: positives followed by fresh negatives,
        # matching the layout of train_label
        neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples)
        epoch_users = torch.cat((train_users,neg_users))
        epoch_items = torch.cat((train_items,neg_items))
        del neg_users, neg_items

        # Shuffle prepared data and split into batches
        epoch_indices = torch.randperm(len(epoch_users), device='cuda:{}'.format(args.local_rank))
        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]
        if args.distributed:
            epoch_users = torch.chunk(epoch_users, args.world_size)[args.local_rank]
            epoch_items = torch.chunk(epoch_items, args.world_size)[args.local_rank]
            epoch_label = torch.chunk(epoch_label, args.world_size)[args.local_rank]
        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)

        # Only print the progress bar on rank 0
        num_batches = (len(epoch_indices) + args.batch_size - 1) // args.batch_size
        if args.local_rank == 0:
            qbar = tqdm.tqdm(range(num_batches))
        else:
            qbar = range(num_batches)
        # Handle the extremely rare case where the last batch size < number of workers
        if len(epoch_users_list) < num_batches:
            print("epoch_size % batch_size < number of worker!")
            # SystemExit directly: the `exit` builtin is only injected by `site`.
            raise SystemExit(1)

        for i in qbar:
            # Select this step's input from the prepared data
            user = epoch_users_list[i]
            item = epoch_items_list[i]
            label = epoch_label_list[i].view(-1,1)

            # Clear gradients by dropping them instead of zero-filling
            for p in model.parameters():
                p.grad = None

            outputs = model(user, item)
            loss = traced_criterion(outputs, label).float()
            loss = torch.mean(loss.view(-1), 0)

            if args.fp16:
                fp_optimizer.step(loss, optimizer)
            else:
                loss.backward()
                optimizer.step()

        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin
        begin = time.time()

        mlperf_log.ncf_print(key=mlperf_log.EVAL_START)

        hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
                             num_user=all_test_users, output=valid_results_file, epoch=epoch, distributed=args.distributed)

        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'
              .format(epoch=epoch, K=args.topk, hit_rate=hr,
                      ndcg=ndcg, train_time=train_time,
                      val_time=val_time))

        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    run_stop_time = time.time()
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)

    # Easy way of tracking the MLPerf score: wall time from RUN_START to RUN_STOP
    if success:
        print("mlperf_score", run_stop_time - run_start_time)
示例#13
0
def main():
    """Train (or, with ``args.mode == 'test'``, only evaluate) a NeuMF model.

    The function parses the command line, loads ``train_ratings.pt`` and
    ``test_ratings.pt`` from ``args.data`` onto this rank's GPU, builds the
    evaluation tensors (sampled negatives with the held-out positive in the
    last column), creates the model, optimizer and loss, runs one evaluation
    before training and then trains for up to ``args.epochs`` epochs with an
    evaluation after each epoch. Training stops early once the hit rate
    reaches ``args.threshold`` (when a threshold is given).

    Side effects: creates ``./run/neumf/<timestamp>.<local_rank>``, writes
    ``model.txt`` into it, hands ``valid_results.csv`` in that directory to
    ``val_epoch``, saves the best state dict to ``args.checkpoint_path`` on
    rank 0 and emits ``LOGGER`` records throughout.

    Returns:
        None.
    """
    log_hardware()

    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)
    log_args(args)

    main_start_time = time.time()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    # Per-run output directory. time.time() is the real UNIX timestamp; a
    # naive utcnow().timestamp() is interpreted as local time and is therefore
    # skewed by the machine's UTC offset.
    timestamp = "{:.0f}".format(time.time())
    run_dir = "./run/neumf/{}.{}".format(timestamp, args.local_rank)
    print("Saving results to {}".format(run_dir))
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(run_dir, exist_ok=True)

    # more like load trigger timer now
    LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
    # The default of np.random.choice is replace=True, so does pytorch random_()
    LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)

    # Sync workers before timing.
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    # Load the not-yet-converted data directly onto this rank's GPU.
    device = torch.device('cuda:{}'.format(args.local_rank))
    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=device)
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=device)

    # Dimensions: column 0 holds user ids, column 1 item ids; the embedding
    # sizes are the largest id seen in the training data plus one.
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1
    train_users = train_ratings[:, 0]
    train_items = train_ratings[:, 1]
    del nb_maxs, train_ratings
    LOGGER.log(key=tags.INPUT_SIZE, value=len(train_users))

    # Things that do not change between epochs.
    # Mask for filtering duplicates with real samples: 1 everywhere except at
    # (user, item) pairs present in the training data.
    # note: test data is removed before creating the mask, same as reference
    mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
    mat[train_users, train_items] = 0

    # Labels: ones for the real interactions followed by zeros for
    # args.negative_samples negatives per interaction.
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label, neg_label))
    del neg_label
    if args.fp16:
        train_label = train_label.half()

    # Produce validation negative samples on GPU.
    all_test_users = test_ratings.shape[0]

    test_users = test_ratings[:, 0]
    test_pos = test_ratings[:, 1].reshape(-1, 1)
    test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative,
                             True)[1]

    # Create items with the real sample at the last position of each row.
    test_users = test_users.reshape(-1, 1).repeat(1, 1 + args.valid_negative)
    test_items = torch.cat(
        (test_negs.reshape(-1, args.valid_negative), test_pos), dim=1)
    del test_ratings, test_negs

    # Generate the duplicate mask and real-sample index so duplicates behave
    # exactly as in the reference. This needs a stable sort (one that keeps
    # the order of duplicates); the trick below works for integer values by
    # adding the original position as a fractional tie-breaker.
    sorted_items, indices = torch.sort(test_items)  # [1,1,1,2], [3,1,0,2]
    sum_item_indices = sorted_items.float() + indices.float() / len(
        indices[0])  # [1.75,1.25,1.0,2.5]
    indices_order = torch.sort(sum_item_indices)[1]  # [2,1,0,3]
    stable_indices = torch.gather(indices, 1, indices_order)  # [0,1,3,2]
    # Mark every repeated item (all but the first of each run of equals).
    dup_mask = (sorted_items[:, 0:-1] == sorted_items[:, 1:])
    dup_mask = torch.cat(
        (torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask), dim=1)
    # Map the mask from sorted order back to the original column order.
    dup_mask = torch.gather(dup_mask, 1, stable_indices.sort()[1])
    # Index of the real sample in each row, to check against the top-k later:
    # the first column (in stable order) whose item equals the positive.
    sorted_items, indices = (test_items != test_pos).sort()
    sum_item_indices = sorted_items.float() + indices.float() / len(indices[0])
    indices_order = torch.sort(sum_item_indices)[1]
    stable_indices = torch.gather(indices, 1, indices_order)
    real_indices = stable_indices[:, 0]
    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos

    # Each rank evaluates only its own chunk of the test data.
    if args.distributed:
        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
        real_indices = torch.chunk(real_indices,
                                   args.world_size)[args.local_rank]

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
    LOGGER.log(key=tags.INPUT_ORDER)  # we shuffle later with randperm

    # '#test' reports the number of test rows (previously nb_users was
    # printed under that label).
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - run_start_time, nb_users, nb_items, len(train_users),
           all_test_users))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for _ in args.layers],
                  dropout=args.dropout)

    if args.fp16:
        model = model.half()

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as model_file:
        model_file.write(str(model))

    # Add optimizer and loss to graph. In fp16 mode the optimizer is built on
    # the fp32 parameter copy exposed by Fp16Optimizer.
    if args.fp16:
        fp_optimizer = Fp16Optimizer(model, args.loss_scale)
        params = fp_optimizer.fp32_params
    else:
        params = model.parameters()

    optimizer = FusedAdam(params,
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps,
                          eps_inside_sqrt=False)
    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
    LOGGER.log(key=tags.OPT_NAME, value="Adam")
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA2, value=args.beta2)
    LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
    LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    # NOTE(review): the world size is read from the WORLD_SIZE environment
    # variable here but from args.world_size elsewhere in this function;
    # presumably both agree -- confirm against init_distributed.
    if args.distributed:
        model = DDP(model)
        local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
    else:
        local_batch = args.batch_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    # Contiguous slice of the (positives + negatives) sample range that this
    # rank trains on.
    train_users_per_worker = len(train_label) / int(os.environ['WORLD_SIZE'])
    train_users_begin = int(train_users_per_worker * args.local_rank)
    train_users_end = int(train_users_per_worker * (args.local_rank + 1))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')
    # Flatten the evaluation tensors and split them into validation batches.
    test_x = test_users.view(-1).split(args.valid_batch_size)
    test_y = test_items.view(-1).split(args.valid_batch_size)

    if args.mode == 'test':
        state_dict = torch.load(args.checkpoint_path)
        model.load_state_dict(state_dict)

    # Calculate initial Hit Ratio and NDCG (logged as epoch -1).
    begin = time.time()
    LOGGER.log(key=tags.EVAL_START, value=-1)

    hr, ndcg = val_epoch(model,
                         test_x,
                         test_y,
                         dup_mask,
                         real_indices,
                         args.topk,
                         samples_per_user=test_items.size(1),
                         num_user=all_test_users,
                         distributed=args.distributed)
    val_time = time.time() - begin
    print(
        'Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, valid_time: {val_time:.4f}'
        .format(K=args.topk, hit_rate=hr, ndcg=ndcg, val_time=val_time))

    LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": -1, "value": hr})
    LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
    LOGGER.log(key=tags.EVAL_STOP, value=-1)

    # In test mode the initial evaluation is all that is wanted.
    if args.mode == 'test':
        return

    success = False
    max_hr = 0
    LOGGER.log(key=tags.TRAIN_LOOP)
    train_throughputs = []
    eval_throughputs = []

    for epoch in range(args.epochs):

        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
        LOGGER.log(key=tags.INPUT_HP_NUM_NEG, value=args.negative_samples)
        LOGGER.log(key=tags.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()

        # Prepare data for the epoch: fresh negatives appended after the
        # real interactions, matching the layout of train_label.
        neg_users, neg_items = generate_neg(train_users, mat, nb_items,
                                            args.negative_samples)
        epoch_users = torch.cat((train_users, neg_users))
        epoch_items = torch.cat((train_items, neg_items))

        del neg_users, neg_items

        # Shuffle this rank's slice of the prepared data and split it into
        # batches.
        epoch_indices = torch.randperm(train_users_end - train_users_begin,
                                       device=device)
        epoch_indices += train_users_begin

        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]

        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)

        num_batches = len(epoch_users_list)
        # Handle the extremely rare case where the last batch size is smaller
        # than the number of workers.
        # NOTE(review): this condition is also true when the remainder is 0
        # (no short last batch at all) -- confirm whether that is intended.
        if len(epoch_users) % args.batch_size < args.world_size:
            print("epoch_size % batch_size < number of worker!")
            # Same effect as exit(1) without depending on the site module.
            raise SystemExit(1)

        # One optimizer step per group of args.grads_accumulated batches;
        # trailing batches that do not fill a whole group are not used.
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users_list[batch_idx]
                item = epoch_items_list[batch_idx]
                label = epoch_label_list[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)
                if args.fp16:
                    fp_optimizer.backward(loss)
                else:
                    loss.backward()

            if args.fp16:
                fp_optimizer.step(optimizer)
            else:
                optimizer.step()

            # Drop gradients instead of zeroing them in place.
            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_users) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)
        LOGGER.log(key='train_throughput', value=train_throughput)
        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        LOGGER.log(key=tags.EVAL_START, value=epoch)

        hr, ndcg = val_epoch(model,
                             test_x,
                             test_y,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=test_items.size(1),
                             num_user=all_test_users,
                             output=valid_results_file,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hr,
                ndcg=ndcg,
                train_time=train_time,
                val_time=val_time))

        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        eval_size = all_test_users * test_items.size(1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)
        LOGGER.log(key='eval_throughput', value=eval_throughput)

        # Track the best hit rate on every rank so that 'best_accuracy' below
        # is meaningful everywhere; only rank 0 writes the checkpoint.
        if hr > max_hr:
            max_hr = hr
            if args.local_rank == 0:
                print("New best hr! Saving the model to: ",
                      args.checkpoint_path)
                torch.save(model.state_dict(), args.checkpoint_path)

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    # Both lists get one entry per completed epoch; with args.epochs == 0
    # they are empty and max() would raise ValueError.
    if train_throughputs:
        LOGGER.log(key='best_train_throughput', value=max(train_throughputs))
        LOGGER.log(key='best_eval_throughput', value=max(eval_throughputs))
    LOGGER.log(key='best_accuracy', value=max_hr)
    LOGGER.log(key='time_to_target', value=time.time() - main_start_time)

    LOGGER.log(key=tags.RUN_STOP, value={"success": success})
    LOGGER.log(key=tags.RUN_FINAL)
示例#14
0
def main():
    """Train (or, with ``args.mode == 'test'``, only evaluate) a NeuMF model.

    The function parses the command line, initialises distributed state and
    dllogger (real backends on rank 0 only), builds the train/test data
    loaders from the feature spec found under ``args.data``, creates the
    model, optimizer and loss, optionally restores a checkpoint, and then
    either runs a single evaluation (test mode) or trains for up to
    ``args.epochs`` epochs with an evaluation after each one. Training stops
    early once the hit rate reaches ``args.threshold`` (when one is given).

    Side effects: may create ``args.checkpoint_dir``, saves the best state
    dict to ``<checkpoint_dir>/model.pth`` on rank 0 and emits dllogger
    records.

    Returns:
        None.
    """
    args = parse_args()
    init_distributed(args)

    # Only rank 0 writes log output; other ranks get a logger with no
    # backends so that the dllogger calls below stay valid everywhere.
    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    # Register display formats for the per-epoch metrics.
    dllogger.metadata('train_throughput', {
        "name": 'train_throughput',
        'format': ":.3e"
    })
    dllogger.metadata('hr@10', {"name": 'hr@10', 'format': ":.5f"})
    dllogger.metadata('train_epoch_time', {
        "name": 'train_epoch_time',
        'format': ":.3f"
    })
    dllogger.metadata('validation_epoch_time', {
        "name": 'validation_epoch_time',
        'format': ":.3f"
    })
    dllogger.metadata('eval_throughput', {
        "name": 'eval_throughput',
        'format': ":.3e"
    })

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    # Test for a configured directory first: os.path.exists(None) raises
    # TypeError, so the previous operand order crashed when no checkpoint
    # directory was given.
    if args.checkpoint_dir and not os.path.exists(args.checkpoint_dir):
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    # Build datasets and loaders from the feature specification.
    feature_spec_path = os.path.join(args.data, args.feature_spec_file)
    feature_spec = FeatureSpec.from_yaml(feature_spec_path)
    trainset = dataloading.TorchTensorDataset(feature_spec,
                                              mapping_name='train',
                                              args=args)
    testset = dataloading.TorchTensorDataset(feature_spec,
                                             mapping_name='test',
                                             args=args)
    train_loader = dataloading.TrainDataloader(trainset, args)
    test_loader = dataloading.TestDataLoader(testset, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model. The first feature listed in each channel is the one used
    # for user ids, item ids and labels respectively.
    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]
    model = NeuMF(
        nb_users=feature_spec.feature_spec[user_feature_name]['cardinality'],
        nb_items=feature_spec.feature_spec[item_feature_name]['cardinality'],
        mf_dim=args.factors,
        mlp_layer_sizes=args.layers,
        dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        # Strip the wrapper prefix so checkpoints saved from a wrapped model
        # can be loaded as well.
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        # The stripped keys match the bare model, not the DDP wrapper (whose
        # keys carry the 'module.' prefix), so load into the wrapped module
        # when running distributed.
        load_target = model.module if args.distributed else model
        load_target.load_state_dict(state_dict)

    if args.mode == 'test':
        # Evaluation only: report throughput and hit rate, then stop.
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_loader,
                             args.topk,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={
                         'best_eval_throughput': eval_throughput,
                         'hr@10': hr
                     })
        return

    # These defaults are normally overridden as soon as an epoch yields
    # hr > 0. They are initialised up front because the hit rate can in
    # theory be zero in the first epoch, and the final summary must not
    # refer to an uninitialised variable.
    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):

        begin = time.time()
        batch_dict_list = train_loader.get_epoch_data()
        num_batches = len(batch_dict_list)
        # One optimizer step per group of args.grads_accumulated batches;
        # trailing batches that do not fill a whole group are not used.
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                batch_dict = batch_dict_list[batch_idx]

                user_features = batch_dict[USER_CHANNEL_NAME]
                item_features = batch_dict[ITEM_CHANNEL_NAME]

                user_batch = user_features[user_feature_name]
                item_batch = item_features[item_feature_name]

                label_features = batch_dict[LABEL_CHANNEL_NAME]
                label_batch = label_features[label_feature_name]

                outputs = model(user_batch, item_batch)
                loss = traced_criterion(outputs, label_batch.view(-1,
                                                                  1)).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            # Drop gradients instead of zeroing them in place.
            for p in model.parameters():
                p.grad = None

        del batch_dict_list
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = train_loader.length_after_augmentation
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_loader,
                             args.topk,
                             distributed=args.distributed)

        val_time = time.time() - begin
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch, ),
                     data={
                         'train_throughput': train_throughput,
                         'hr@10': hr,
                         'train_epoch_time': train_time,
                         'validation_epoch_time': val_time,
                         'eval_throughput': eval_throughput
                     })

        # Best-model bookkeeping and checkpointing happen on rank 0 only,
        # which is also the only rank that reports them below.
        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                    'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        summary = {}
        # Both lists get one entry per completed epoch; with args.epochs == 0
        # they are empty and max() would raise ValueError, so the throughput
        # statistics are only reported when at least one epoch ran.
        if train_throughputs:
            summary.update({
                'best_train_throughput': max(train_throughputs),
                'best_eval_throughput': max(eval_throughputs),
                'mean_train_throughput': np.mean(train_throughputs),
                'mean_eval_throughput': np.mean(eval_throughputs),
            })
        summary.update({
            'best_accuracy': max_hr,
            'best_epoch': best_epoch,
            'time_to_target': time.time() - main_start_time,
            'time_to_best_model': best_model_timestamp - main_start_time,
        })
        dllogger.log(data=summary, step=tuple())
示例#15
0
def main():
    from grace_dl.dist.helper import timer, volume, tensor_bits

    args = parse_args()
    init_distributed(args)
    if args.weak_scaling:
        args.batch_size *= args.world_size
    init_wandb(args)
    init_grace(args)

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:0'))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:0'))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:0'))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    # if args.distributed:
    #     model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    if args.local_rank == 0:
        print(model)
        print("{} parameters".format(utils.count_parameters(model)))
        # [print(parameter) for parameter in model.parameters()]

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={
                         'best_eval_throughput': eval_throughput,
                         'hr@10': hr
                     })
        return

    max_hr = 0
    best_epoch = 0
    train_throughputs, eval_throughputs = [], []

    # broadcast model states from rank0 to other nodes !!! This is important!
    [torch.distributed.broadcast(p.data, src=0) for p in model.parameters()]
    # if args.local_rank == 0:
    #     save_initial_state_path = os.path.join(args.checkpoint_dir, 'model_init.pth')
    #     print("Saving the model to: ", save_initial_state_path)
    #     torch.save(model.state_dict(), save_initial_state_path)

    for epoch in range(args.epochs):

        begin = time.time()
        train_time = 0

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            batch_start = time.time()
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            # check grad sparsity
            if args.sparsity_check:
                total_nonzero = 0
                total_numel = 0
                for index, (name, p) in enumerate(model.named_parameters()):
                    sparsity = 1.0 - torch.sum(
                        p.grad.data.abs() > 0).float() / p.grad.data.numel()
                    total_nonzero += torch.sum(p.grad.data.abs() > 0).float()
                    total_numel += p.grad.data.numel()
                    if args.local_rank == 0:
                        wandb.log(
                            {
                                f"{name}(sparsity)(numel={p.grad.data.numel()})":
                                sparsity,
                            },
                            commit=False)
                if args.local_rank == 0:
                    wandb.log(
                        {
                            f"total_sparsity(numel={total_numel})":
                            1 - total_nonzero / total_numel,
                        },
                        commit=True)

            # add grace just before optimizer.step()
            torch.cuda.synchronize()
            comm_start = time.time()
            for index, (name, p) in enumerate(model.named_parameters()):
                new_grad = args.grc.step(p.grad.data, name)
                p.grad.data = new_grad
            torch.cuda.synchronize()
            timer['comm'] = time.time() - comm_start

            # [torch.distributed.all_reduce(p.grad.data) for p in model.parameters()]
            # for param in model.parameters():
            #     dist.all_reduce(param.grad.data)
            #     param.grad.data /= float(args.world_size)

            optimizer.step()
            for p in model.parameters():
                p.grad = None
            if args.throughput:
                torch.cuda.synchronize()

            if args.log_time and args.local_rank == 0:
                timer['batch_time'] = time.time() - batch_start
                timer['computation'] = timer['batch_time'] - timer['comm']
                print("Timer:", timer, '\n')

                timer['en/decoding'] = 0
                timer['batch_time'] = 0
                timer['computation'] = 0
                timer['comm'] = 0

            if args.log_volume and args.local_rank == 0:
                ratio = volume['compress'] / volume['nocompress']
                volume['ratio_acc'].append(ratio)
                avg_ratio = sum(volume['ratio_acc']) / len(volume['ratio_acc'])
                print(
                    f"Data volume:: compress {volume['compress']} no_compress {volume['nocompress']} ratio {ratio:.4f} avg_ratio {avg_ratio:.4f}"
                )
                volume['compress'] = 0
                volume['nocompress'] = 0

            batch_throughput = args.batch_size / (time.time() - batch_start
                                                  )  # global throughput
            train_time += time.time() - batch_start
            if (args.throughput
                    or args.eval_at_every_batch) and args.local_rank == 0:
                print(
                    f"Train :: Epoch [{epoch}/{args.epochs}] \t Batch [{i}/{num_batches}] \t "
                    f"Time {time.time()-batch_start:.5f} \t Throughput {batch_throughput:.2f}"
                )

            if args.throughput and i == 3:
                break
            if args.local_rank == 0:
                print(
                    f"Train :: Epoch [{epoch}/{args.epochs}] \t Batch [{i}/{num_batches}] \t "
                    f"Time {time.time()-batch_start:.5f} \t Throughput {batch_throughput:.2f}"
                )
            if args.eval_at_every_batch:
                hr, ndcg = val_epoch(model,
                                     test_users,
                                     test_items,
                                     dup_mask,
                                     real_indices,
                                     args.topk,
                                     samples_per_user=valid_negative + 1,
                                     num_user=all_test_users,
                                     epoch=epoch,
                                     distributed=args.distributed)
                if args.local_rank == 0:
                    wandb.log({
                        "eval/hr@10": hr,
                    })

        del epoch_users, epoch_items, epoch_label
        # train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        if args.throughput:
            train_throughput = batch_throughput
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch, ),
                     data={
                         'train_throughput': train_throughput,
                         'hr@10': hr,
                         'train_epoch_time': train_time,
                         'validation_epoch_time': val_time,
                         'eval_throughput': eval_throughput
                     })

        if args.local_rank == 0:
            wandb.log(
                {
                    "train_epoch_time": train_time,
                    'validation_epoch_time': val_time,
                    'eval_throughput': eval_throughput,
                    'train_throughput': train_throughput,
                },
                commit=False)
            if not args.eval_at_every_batch:
                wandb.log({
                    "eval/hr@10": hr,
                }, commit=False)
            wandb.log({"epoch": epoch})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                    'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

        if args.throughput:
            break

    if args.local_rank == 0:
        dllogger.log(data={
            'best_train_throughput':
            max(train_throughputs),
            'best_eval_throughput':
            max(eval_throughputs),
            'mean_train_throughput':
            np.mean(train_throughputs),
            'mean_eval_throughput':
            np.mean(eval_throughputs),
            'best_accuracy':
            max_hr,
            'best_epoch':
            best_epoch,
            'time_to_target':
            time.time() - main_start_time,
            'time_to_best_model':
            best_model_timestamp - main_start_time
        },
                     step=tuple())

        wandb.log({
            'best_train_throughput': max(train_throughputs),
            'best_eval_throughput': max(eval_throughputs),
            'mean_train_throughput': np.mean(train_throughputs),
            'mean_eval_throughput': np.mean(eval_throughputs),
            'best_accuracy': max_hr,
            'best_epoch': best_epoch,
            'time_to_target': time.time() - main_start_time,
            'time_to_best_model': best_model_timestamp - main_start_time
        })
示例#16
0
def main(args, ml_logger):
    """Calibrate quantization clipping scales for NeuMF and report hit rate.

    Steps performed, in order:
      1. Seed Python, NumPy (unless ``args.dont_fix_np_seed``) and torch RNGs.
      2. Build NeuMF, optionally load weights from ``args.load_ckp``.
      3. Wrap ReLU/ReLU6/Linear/Embedding modules through ``ModelQuantizer``.
      4. Evaluate once on the calibration set, read the initial clipping
         values, and log the initial hit rate as ``'HR init'``.
      5. Minimize the calibration loss over the clipping scales with
         ``scipy.optimize.minimize`` (``method=args.min_method``).
      6. Apply the optimized scales and log the hit rate as ``'HR Powell'``.

    Args:
        args: Parsed options. Attributes read here: ``seed``,
            ``dont_fix_np_seed``, ``no_cuda``, ``load_ckp``, ``data``,
            ``init_method``, ``qtype``, ``maxiter``, ``maxfev``,
            ``min_method``. The whole object is also handed to
            ``ModelQuantizer``.
        ml_logger: Logger exposing ``log_metric(name, value, step=...)``.
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Create model
    # NOTE(review): 2197225 / 855776 are presumably the user / item counts of
    # the ml-20mx16x32 dataset -- confirm against NeuMF's signature.
    mlp_layer_sizes = [256, 256, 128, 64]
    model = NeuMF(2197225,
                  855776,
                  mf_dim=64,
                  mf_reg=0.,
                  mlp_layer_sizes=mlp_layer_sizes,
                  mlp_layer_regs=[0.] * len(mlp_layer_sizes))

    print(model)

    if use_cuda:
        # Move model to GPU
        model = model.cuda()
        model.device = torch.device('cuda:{}'.format(0))
    else:
        # model.device is read unconditionally further down (set_clipping),
        # so it has to exist on the CPU path as well.
        model.device = torch.device('cpu')

    if args.load_ckp:
        # Deserialize onto the CPU so a checkpoint saved from a GPU still
        # loads on a host without that device; load_state_dict then copies
        # the values into the parameters wherever the model lives.
        ckp = torch.load(args.load_ckp, map_location='cpu')
        model.load_state_dict(ckp)

    def _module_names(kind):
        # Qualified names of all sub-modules that are instances of `kind`.
        return [n for n, m in model.named_modules() if isinstance(m, kind)]

    # Order matters for the scale vector handed to the optimizer:
    # activations first, then linear layers, then embeddings.
    layers = (_module_names(nn.ReLU) + _module_names(nn.ReLU6) +
              _module_names(nn.Linear) + _module_names(nn.Embedding))
    replacement_factory = {
        nn.ReLU: ActivationModuleWrapperPost,
        nn.ReLU6: ActivationModuleWrapperPost,
        nn.Linear: ParameterModuleWrapperPost,
        nn.Embedding: ActivationModuleWrapperPost
    }
    mq = ModelQuantizer(model, args, layers, replacement_factory)

    test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user = data_loader(
        args.data)
    data = NcfData(test_users, test_items, dup_mask, real_indices, K,
                   samples_per_user, num_user)
    cal_data = CalibrationSet('ml-20mx16x32/cal_set')
    if use_cuda:
        cal_data = cal_data.cuda()
    cal_data.split(batch_size=10000)

    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    if use_cuda:
        criterion = criterion.cuda()

    print("init_method: {}, qtype {}".format(args.init_method, args.qtype))
    # evaluate to initialize dynamic clipping
    loss = evaluate_calibration(model, cal_data, criterion)
    print("Initial loss: {:.4f}".format(loss))

    # get clipping values
    init = get_clipping(mq)

    # evaluate
    hr, ndcg = validate(model, data)
    ml_logger.log_metric('HR init', hr, step='auto')

    # run optimizer; only forward the limits the user actually supplied
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def calibration_loss(x):
        # Objective: calibration-set loss for the candidate scale vector x.
        return run_inference_on_calibration(x, model, mq, cal_data, criterion)

    def local_search_callback(x):
        # Invoked by scipy after each iteration; re-evaluates the current
        # point purely to report progress.
        it = next(_iter)
        loss = calibration_loss(x)
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss))

    res = opt.minimize(calibration_loss,
                       np.array(init),
                       method=args.min_method,
                       options=min_options,
                       callback=local_search_callback)

    print(res)
    scales = res.x
    set_clipping(mq, scales, model.device)
    # evaluate
    # NOTE(review): the metric is labelled 'HR Powell' whatever
    # args.min_method was -- confirm this is intended.
    hr, ndcg = validate(model, data)
    ml_logger.log_metric('HR Powell', hr, step='auto')