Example #1
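Training with the low-level Session API across four nodes of two GPUs each (vvgpu=[[0, 1], [2, 3], [4, 5], [6, 7]]): rank 0 prints the loss every 100 iterations, and every rank reports evaluation metrics every 1000 iterations. The MPI import and rank lookup are added so the snippet runs as-is; they are implied by the original's use of rank.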
from mpi4py import MPI
from hugectr import Session, solver_parser_helper, get_learning_rate_scheduler

# MPI rank of this process; only rank 0 prints the training loss below.
rank = MPI.COMM_WORLD.Get_rank()


def session_impl_test(json_file):
    solver_config = solver_parser_helper(seed=0,
                                         batchsize=16384,
                                         batchsize_eval=16384,
                                         model_file="",
                                         embedding_files=[],
                                         vvgpu=[[0, 1], [2, 3], [4, 5], [6, 7]],
                                         use_mixed_precision=False,
                                         scaler=1.0,
                                         i64_input_key=False,
                                         use_algorithm_search=True,
                                         use_cuda_graph=True,
                                         repeat_dataset=True)
    sess = Session(solver_config, json_file)
    sess.start_data_reading()
    lr_sch = get_learning_rate_scheduler(json_file)
    for i in range(10000):
        lr = lr_sch.get_next()
        sess.set_learning_rate(lr)
        sess.train()
        if i % 100 == 0:
            loss = sess.get_current_loss()
            if rank == 0:
                print("[HUGECTR][INFO] iter: {}; loss: {}".format(i, loss))
        if i % 1000 == 0 and i != 0:
            metrics = sess.evaluation()
            print("[HUGECTR][INFO] rank: {}, iter: {}, {}".format(
                rank, i, metrics))
    return
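A minimal driver for the snippet above (the script and network-file names are placeholders; a multi-node run would be launched with mpirun so every rank executes the same script):

import sys

if __name__ == "__main__":
    # e.g. mpirun -np 4 python session_test.py dcn.json
    session_impl_test(sys.argv[1])

Example #2

The same training loop on a single node with eight GPUs, using the same imports as Example #1: mixed-precision training is enabled (scaler=1024), and evaluation is driven manually, with an overflow check and a weight copy before each pass.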
def session_impl_test(json_file):
    solver_config = solver_parser_helper(seed=0,
                                         batchsize=16384,
                                         batchsize_eval=16384,
                                         model_file="",
                                         embedding_files=[],
                                         vvgpu=[[0, 1, 2, 3, 4, 5, 6, 7]],
                                         use_mixed_precision=True,
                                         scaler=1024,
                                         i64_input_key=False,
                                         use_algorithm_search=True,
                                         use_cuda_graph=True,
                                         repeat_dataset=True)
    lr_sch = get_learning_rate_scheduler(json_file)
    sess = Session(solver_config, json_file)
    sess.start_data_reading()
    for i in range(10000):
        lr = lr_sch.get_next()
        sess.set_learning_rate(lr)
        sess.train()
        if i % 100 == 0:
            loss = sess.get_current_loss()
            print("[HUGECTR][INFO] iter: {}; loss: {}".format(i, loss))
        if i % 1000 == 0 and i != 0:
            sess.check_overflow()               # guard against embedding-table overflow
            sess.copy_weights_for_evaluation()  # snapshot the weights used for eval
            data_reader_eval = sess.get_data_reader_eval()
            for _ in range(solver_config.max_eval_batches):
                sess.eval()
            metrics = sess.get_eval_metrics()
            print("[HUGECTR][INFO] iter: {}, {}".format(i, metrics))
    return
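Example #3

Epoch-style training on raw binary data (repeat_dataset=False), again with the imports from Example #1: the reader sources are set explicitly, and sess.train() returning False marks the end of an epoch.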
def set_source_raw_test(json_file):
    train_data = "./train_data.bin"
    test_data = "./test_data.bin"
    solver_config = solver_parser_helper(seed=0,
                                         batchsize=16384,
                                         batchsize_eval=16384,
                                         max_eval_batches=5441,
                                         model_file="",
                                         embedding_files=[],
                                         vvgpu=[[0, 1, 2, 3, 4, 5, 6, 7]],
                                         use_mixed_precision=True,
                                         scaler=1024,
                                         i64_input_key=False,
                                         use_algorithm_search=True,
                                         use_cuda_graph=True,
                                         repeat_dataset=False)
    lr_sch = get_learning_rate_scheduler(json_file)
    sess = Session(solver_config, json_file)
    data_reader_train = sess.get_data_reader_train()
    data_reader_eval = sess.get_data_reader_eval()
    data_reader_eval.set_source(test_data)
    iteration = 1
    for cnt in range(2):
        data_reader_train.set_source(train_data)
        print("[HUGECTR][INFO] round: {}".format(cnt), flush=True)
        while True:
            lr = lr_sch.get_next()
            sess.set_learning_rate(lr)
            good = sess.train()
            if not good:  # the training data for this epoch is exhausted
                break
            if iteration % 4000 == 0:
                sess.check_overflow()
                sess.copy_weights_for_evaluation()
                data_reader_eval = sess.get_data_reader_eval()
                good_eval = True
                j = 0
                while good_eval and j < solver_config.max_eval_batches:
                    good_eval = sess.eval()
                    j += 1
                if not good_eval:
                    # eval data exhausted: rewind the eval reader to its source
                    data_reader_eval.set_source()
                metrics = sess.get_eval_metrics()
                print("[HUGECTR][INFO] iter: {}, metrics: {}".format(
                    iteration, metrics),
                      flush=True)
            iteration += 1
        print("[HUGECTR][INFO] trained with data in {}".format(train_data),
              flush=True)
Example #4
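Training with the model oversubscriber (imports as in Example #1): the dataset is split into five file lists with matching keyset files, and model_oversubscriber.update(keyset_file) stages the embedding keys each chunk needs before training on it.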
def model_oversubscriber_test(json_file, temp_dir):
    dataset = [("file_list." + str(i) + ".txt",
                "file_list." + str(i) + ".keyset") for i in range(5)]
    solver_config = solver_parser_helper(seed=0,
                                         batchsize=16384,
                                         batchsize_eval=16384,
                                         model_file="",
                                         embedding_files=[],
                                         vvgpu=[[0]],
                                         use_mixed_precision=False,
                                         scaler=1.0,
                                         i64_input_key=False,
                                         use_algorithm_search=True,
                                         use_cuda_graph=True,
                                         repeat_dataset=False)
    lr_sch = get_learning_rate_scheduler(json_file)
    # The extra arguments enable the model oversubscriber, which stages
    # embedding parameters in temp_dir.
    sess = Session(solver_config, json_file, True, temp_dir)
    data_reader_train = sess.get_data_reader_train()
    data_reader_eval = sess.get_data_reader_eval()
    data_reader_eval.set_source("file_list.5.txt")
    model_oversubscriber = sess.get_model_oversubscriber()
    iteration = 0
    for file_list, keyset_file in dataset:
        data_reader_train.set_source(file_list)
        model_oversubscriber.update(keyset_file)
        while True:
            lr = lr_sch.get_next()
            sess.set_learning_rate(lr)
            good = sess.train()
            if not good:  # the current file list is exhausted
                break
            if iteration % 100 == 0:
                sess.check_overflow()
                sess.copy_weights_for_evaluation()
                data_reader_eval = sess.get_data_reader_eval()
                good_eval = True
                j = 0
                while good_eval and j < solver_config.max_eval_batches:
                    good_eval = sess.eval()
                    j += 1
                if not good_eval:
                    # eval data exhausted: rewind the eval reader
                    data_reader_eval.set_source()
                metrics = sess.get_eval_metrics()
                print("[HUGECTR][INFO] iter: {}, metrics: {}".format(
                    iteration, metrics))
            iteration += 1
        print("[HUGECTR][INFO] trained with data in {}".format(file_list))
    sess.download_params_to_files("./", iteration)
Example #5
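A DCN-style configuration on the Criteo Norm dataset, built with the higher-level Model API. The original snippet breaks off inside the SparseEmbedding layer; its closing arguments are filled in below by analogy with the other examples and are marked as assumptions.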
import hugectr
from mpi4py import MPI
solver = hugectr.solver_parser_helper(num_epochs=0,
                                      max_iter=10000,
                                      max_eval_batches=100,
                                      batchsize_eval=2048,
                                      batchsize=2048,
                                      display=200,
                                      eval_interval=1000,
                                      i64_input_key=False,
                                      use_mixed_precision=False,
                                      repeat_dataset=True)
optimizer = hugectr.optimizer.CreateOptimizer(
    optimizer_type=hugectr.Optimizer_t.Adam, use_mixed_precision=False)
model = hugectr.Model(solver, optimizer)
model.add(
    hugectr.Input(data_reader_type=hugectr.DataReaderType_t.Norm,
                  source="./criteo_data/train/_file_list.txt",
                  eval_source="./criteo_data/val/_file_list.txt",
                  check_type=hugectr.Check_t.Non,
                  label_dim=1,
                  label_name="label",
                  dense_dim=13,
                  dense_name="dense",
                  data_reader_sparse_param_array=[
                      hugectr.DataReaderSparseParam(
                          hugectr.DataReaderSparse_t.Distributed, 30, 1, 26)
                  ],
                  sparse_names=["data1"]))
model.add(
    hugectr.SparseEmbedding(
        # The original snippet is truncated here; the remaining arguments are
        # assumed by analogy with the other examples, not taken from the source.
        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
        max_vocabulary_size_per_gpu=1737709,  # assumed value (borrowed from Example #6)
        embedding_vec_size=16,
        combiner=0,
        sparse_embedding_name="sparse_embedding1",
        bottom_name="data1"))
Example #6
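A close variant of Example #5 that uses Check_t.Sum and explicit embedding sizing. It is also truncated in the original; the final two embedding arguments are completed by analogy.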
import hugectr
solver = hugectr.solver_parser_helper(num_epochs=0,
                                      max_iter=10000,
                                      max_eval_batches=100,
                                      batchsize_eval=2048,
                                      batchsize=2048,
                                      eval_interval=1000,
                                      use_mixed_precision=False)
optimizer = hugectr.optimizer.CreateOptimizer(
    optimizer_type=hugectr.Optimizer_t.Adam, use_mixed_precision=False)
model = hugectr.Model(solver, optimizer)
model.add(
    hugectr.Input(data_reader_type=hugectr.DataReaderType_t.Norm,
                  source="./file_list.txt",
                  eval_source="./file_list_test.txt",
                  check_type=hugectr.Check_t.Sum,
                  label_dim=1,
                  label_name="label",
                  dense_dim=13,
                  dense_name="dense",
                  data_reader_sparse_param_array=[
                      hugectr.DataReaderSparseParam(
                          hugectr.DataReaderSparse_t.Distributed, 30, 1, 26)
                  ],
                  sparse_names=["data1"]))
model.add(
    hugectr.SparseEmbedding(
        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
        max_vocabulary_size_per_gpu=1737709,
        embedding_vec_size=16,
        combiner=0,
        # Truncated in the original; the closing arguments are assumed to
        # mirror the other examples.
        sparse_embedding_name="sparse_embedding1",
        bottom_name="data1"))
Example #7
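An end-to-end Deep & Cross Network (DCN) on Parquet data preprocessed with NVTabular: a 26-slot distributed embedding, a six-layer MultiCross branch, and a two-layer MLP with dropout run on slices of the same input and are concatenated into a final BinaryCrossEntropyLoss. The logging and hugectr imports are added for completeness.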
import logging

import hugectr


def train(input_train, input_val, max_iter, batchsize, snapshot, num_gpus,
          eval_interval, dense_model_file, sparse_model_files):

    logging.info(f"GPU Devices: {num_gpus}")

    # Configure and define the HugeCTR model
    solver = hugectr.solver_parser_helper(num_epochs = 0,
                                        max_iter = max_iter,
                                        max_eval_batches = 100,
                                        batchsize_eval = batchsize,
                                        batchsize = batchsize,
                                        model_file = dense_model_file,
                                        embedding_files = sparse_model_files,
                                        display = 200,
                                        eval_interval = eval_interval,
                                        i64_input_key = True,
                                        use_mixed_precision = False,
                                        repeat_dataset = True,
                                        snapshot = snapshot,
                                        vvgpu = [num_gpus],  # num_gpus is the list of GPU ids on this node
                                        use_cuda_graph = False
                                        )

    optimizer = hugectr.optimizer.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam,
                                        use_mixed_precision = False)
    model = hugectr.Model(solver, optimizer)

    # The slot_size_array are the cardinalities of each categorical feature after NVTabular preprocessing
    model.add(hugectr.Input(data_reader_type = hugectr.DataReaderType_t.Parquet,
                                source = input_train,
                                eval_source = input_val,
                                check_type = hugectr.Check_t.Non,
                                label_dim = 1, label_name = "label",
                                dense_dim = 13, dense_name = "dense",
                                slot_size_array = [18576837, 29428, 15128, 7296, 19902, 4, 6466, 1311, 62, 11700067, 622921, 219557, 11, 2209, 9780, 71, 4, 964, 15, 22022124, 4384510, 15960286, 290588, 10830, 96, 35],
                                data_reader_sparse_param_array =
                                [hugectr.DataReaderSparseParam(hugectr.DataReaderSparse_t.Distributed, 30, 1, 26)],
                                sparse_names = ["data1"]))

    # Sparse Embedding Layer
    model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
                                max_vocabulary_size_per_gpu = 88656602,
                                embedding_vec_size = 16,
                                combiner = 0,
                                sparse_embedding_name = "sparse_embedding1",
                                bottom_name = "data1"))
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                                bottom_names = ["sparse_embedding1"],
                                top_names = ["reshape1"],
                                leading_dim=416))

    # Concatenate sparse embedding and dense input
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                                bottom_names = ["reshape1", "dense"], top_names = ["concat1"]))
    # Duplicate the 429-wide concat (416 embedding dims + 13 dense) into two
    # branches: one for the cross network, one for the deep network.
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Slice,
                                bottom_names = ["concat1"],
                                top_names = ["slice11", "slice12"],
                                ranges=[(0,429),(0,429)]))

    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.MultiCross,
                                bottom_names = ["slice11"],
                                top_names = ["multicross1"],
                                num_layers=6))

    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                                bottom_names = ["slice12"],
                                top_names = ["fc1"],
                                num_output=1024))
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                                bottom_names = ["fc1"],
                                top_names = ["relu1"]))
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                                bottom_names = ["relu1"],
                                top_names = ["dropout1"],
                                dropout_rate=0.5))
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                                bottom_names = ["dropout1"],
                                top_names = ["fc2"],
                                num_output=1024))
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                                bottom_names = ["fc2"],
                                top_names = ["relu2"]))
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                                bottom_names = ["relu2"],
                                top_names = ["dropout2"],
                                dropout_rate=0.5))

    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                                bottom_names = ["dropout2", "multicross1"],
                                top_names = ["concat2"]))
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                                bottom_names = ["concat2"],
                                top_names = ["fc3"],
                                num_output=1))
    model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss,
                                bottom_names = ["fc3", "label"],
                                top_names = ["loss"]))
    model.compile()
    model.summary()
    model.fit()
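A hypothetical invocation of train() (every value below is a placeholder, not from the source); logging is configured so the device log line is visible:

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    train(input_train="./train/_file_list.txt",  # placeholder paths
          input_val="./val/_file_list.txt",
          max_iter=10000,
          batchsize=2720,
          snapshot=3200,
          num_gpus=[0, 1],                       # GPU ids on this node
          eval_interval=1000,
          dense_model_file="",                   # "" trains the dense model from scratch
          sparse_model_files=[])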
Example #8
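A compact test model with no dense features (dense_dim=0). Slot sizes and the total cardinality are parameters; the Reshape layer's leading_dim=48 implies three slots of 16-dimensional embeddings feeding a small three-layer MLP. DATA_DIR is not defined in the original snippet, so a placeholder is supplied.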
import hugectr

# Assumption: DATA_DIR is defined outside the original snippet; point it at
# the preprocessed Parquet dataset.
DATA_DIR = "./data/"


def _run_model(slot_sizes, total_cardinality):

    solver = hugectr.solver_parser_helper(
        vvgpu=[[0]],
        max_iter=2000,
        batchsize=2048,
        display=100,
        eval_interval=200,
        batchsize_eval=2048,
        max_eval_batches=160,
        i64_input_key=True,
        use_mixed_precision=False,
        repeat_dataset=True,
        snapshot=1900,
    )

    optimizer = hugectr.optimizer.CreateOptimizer(
        optimizer_type=hugectr.Optimizer_t.Adam, use_mixed_precision=False
    )
    model = hugectr.Model(solver, optimizer)

    model.add(
        hugectr.Input(
            data_reader_type=hugectr.DataReaderType_t.Parquet,
            source=DATA_DIR + "train/_file_list.txt",
            eval_source=DATA_DIR + "valid/_file_list.txt",
            check_type=hugectr.Check_t.Non,
            label_dim=1,
            label_name="label",
            dense_dim=0,
            dense_name="dense",
            slot_size_array=slot_sizes,
            # DataReaderSparseParam positional args: sparse type, max feature
            # number per sample, max nnz, and slot count.
            data_reader_sparse_param_array=[
                hugectr.DataReaderSparseParam(
                    hugectr.DataReaderSparse_t.Distributed, len(slot_sizes) + 1, 1, len(slot_sizes)
                )
            ],
            sparse_names=["data1"],
        )
    )

    model.add(
        hugectr.SparseEmbedding(
            embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
            max_vocabulary_size_per_gpu=total_cardinality,
            embedding_vec_size=16,
            combiner=0,
            sparse_embedding_name="sparse_embedding1",
            bottom_name="data1",
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.Reshape,
            bottom_names=["sparse_embedding1"],
            top_names=["reshape1"],
            leading_dim=48,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["reshape1"],
            top_names=["fc1"],
            num_output=128,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.ReLU,
            bottom_names=["fc1"],
            top_names=["relu1"],
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu1"],
            top_names=["fc2"],
            num_output=128,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.ReLU,
            bottom_names=["fc2"],
            top_names=["relu2"],
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu2"],
            top_names=["fc3"],
            num_output=1,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
            bottom_names=["fc3", "label"],
            top_names=["loss"],
        )
    )
    model.compile()
    model.summary()
    model.fit()
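A hypothetical call: since the Reshape layer uses leading_dim=48 (3 slots x 16 dimensions), the sketch assumes three categorical columns with placeholder cardinalities.

slot_sizes = [1000, 2000, 3000]  # placeholder cardinalities
_run_model(slot_sizes, total_cardinality=sum(slot_sizes))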
Example #9
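A DLRM-style model on NVTabular output: a localized slot-sparse embedding, a bottom MLP over the 13 dense features, an Interaction layer that combines the two, and a top MLP ending in BinaryCrossEntropyLoss. The imports are added for completeness; get_embedding_sizes is assumed to come from NVTabular.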
import hugectr
# Assumed import: NVTabular, which produced the dataset, provides get_embedding_sizes.
from nvtabular.ops import get_embedding_sizes


def train_hugectr(workflow, devices, out_path):
    # Get the embedding cardinalities from the NVTabular workflow, and parse a
    # comma-separated device string such as "0,1" (the [0::2] slice skips the
    # commas). Note vvgpu below is hardcoded to [[0]], so `devices` goes unused.
    embeddings = list(get_embedding_sizes(workflow).values())
    embeddings = [emb[0] for emb in embeddings]
    devices = [[int(d)] for d in list(devices)[0::2]]
    # Set solver and model
    solver = hugectr.solver_parser_helper(
        vvgpu=[[0]],
        max_iter=10000,
        max_eval_batches=100,
        batchsize_eval=2720,
        batchsize=2720,
        display=1000,
        eval_interval=3200,
        snapshot=3200,
        i64_input_key=True,
        use_mixed_precision=False,
        repeat_dataset=True,
    )
    optimizer = hugectr.optimizer.CreateOptimizer(
        optimizer_type=hugectr.Optimizer_t.SGD, use_mixed_precision=False)
    model = hugectr.Model(solver, optimizer)
    model.add(
        hugectr.Input(
            data_reader_type=hugectr.DataReaderType_t.Parquet,
            source=out_path + "/output/train/_file_list.txt",
            eval_source=out_path + "/output/valid/_file_list.txt",
            check_type=hugectr.Check_t.Non,
            label_dim=1,
            label_name="label",
            dense_dim=13,
            dense_name="dense",
            slot_size_array=embeddings,
            data_reader_sparse_param_array=[
                hugectr.DataReaderSparseParam(
                    hugectr.DataReaderSparse_t.Localized, 26, 1, 26)
            ],
            sparse_names=["data1"],
        ))
    model.add(
        hugectr.SparseEmbedding(
            embedding_type=hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash,
            max_vocabulary_size_per_gpu=15500000,
            embedding_vec_size=128,
            combiner=0,
            sparse_embedding_name="sparse_embedding1",
            bottom_name="data1",
        ))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["dense"],
            top_names=["fc1"],
            num_output=512,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc1"],
                           top_names=["relu1"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu1"],
            top_names=["fc2"],
            num_output=256,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc2"],
                           top_names=["relu2"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu2"],
            top_names=["fc3"],
            num_output=128,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc3"],
                           top_names=["relu3"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.Interaction,
            bottom_names=["relu3", "sparse_embedding1"],
            top_names=["interaction1"],
        ))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["interaction1"],
            top_names=["fc4"],
            num_output=1024,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc4"],
                           top_names=["relu4"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu4"],
            top_names=["fc5"],
            num_output=1024,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc5"],
                           top_names=["relu5"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu5"],
            top_names=["fc6"],
            num_output=512,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc6"],
                           top_names=["relu6"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu6"],
            top_names=["fc7"],
            num_output=256,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc7"],
                           top_names=["relu7"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu7"],
            top_names=["fc8"],
            num_output=1,
        ))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
            bottom_names=["fc8", "label"],
            top_names=["loss"],
        ))
    # Run training
    model.compile()
    model.summary()
    model.fit()
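A hypothetical call (all arguments are placeholders): workflow is a fitted NVTabular Workflow whose embedding sizes populate slot_size_array, and devices is a comma-separated string that the [0::2] slice splits into per-GPU groups.

train_hugectr(workflow, devices="0,1", out_path="./criteo_output")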