def convert(topology, backend, device, extra_config=None):
    """
    This function is used to convert a `onnxconverter_common.topology.Topology` object into a *backend* model.

    Args:
        topology: The `onnxconverter_common.topology.Topology` object that will be converted into a backend model
        backend: Which backend the model should be run on
        device: Which device the translated model will be run on
        extra_config: Extra configurations to be used by individual operator converters.
            When omitted, a fresh dictionary is used. A dictionary passed by the caller is
            updated in place (tree implementation for ONNX; context, input names and
            remainder model for TVM).

    Returns:
        A model implemented in the selected backend. The bare backend model is returned when
        `constants.CONTAINER` is present in *extra_config* and falsy; otherwise the model is
        wrapped into the container class matching the task and the backend.

    Raises:
        AssertionError: If *topology*, *backend* or *device* is None, or if no operator with a
            known task type is found.
        MissingConverter: If fetching or running an operator converter raises `ValueError`.
        RuntimeError: If the TVM backend is selected with an unrecognized device.
    """
    assert topology is not None, "Cannot convert a Topology object of type None."
    assert backend is not None, "Cannot convert a Topology object into backend None."
    assert device is not None, "Cannot convert a Topology object into device None."

    # A mutable default argument would be shared (and mutated) across calls.
    if extra_config is None:
        extra_config = {}

    tvm_backend = None
    operator_map = {}

    if tvm_installed():
        import tvm
        from tvm import relay
        from tvm.contrib import graph_runtime

        tvm_backend = tvm.__name__

    for operator in topology.topological_operator_iterator():
        try:
            converter = get_converter(operator.type)

            if backend == onnx.__name__:
                # Pytorch <= 1.6.0 has a bug with exporting GEMM into ONNX.
                # The version gate that used to guard this assignment is disabled,
                # so tree_trav is forced for every ONNX conversion.
                extra_config[constants.TREE_IMPLEMENTATION] = "tree_trav"

            operator_map[operator.full_name] = converter(operator, device, extra_config)
        except ValueError as err:
            raise MissingConverter(
                "Unable to find converter for {} type {} with extra config: {}."
                .format(operator.type, type(getattr(operator, "raw_model", None)), extra_config)) from err

    # Set the parameters for the model / container
    n_threads = extra_config.get(constants.N_THREADS)
    batch_size = extra_config.get(constants.BATCH_SIZE)

    # We set the number of threads for torch here to avoid errors in case we JIT.
    # We set intra op concurrency while we force operators to run sequentially.
    # We can revise this later, but in general we don't have graphs requiring inter-op parallelism.
    if n_threads is not None:
        if torch.get_num_interop_threads() != 1:
            torch.set_num_interop_threads(1)
        torch.set_num_threads(n_threads)

    operators = list(topology.topological_operator_iterator())
    input_names = topology.raw_model.input_names
    output_names = topology.raw_model.output_names
    torch_model = _PyTorchBackendModel(input_names, output_names, operator_map, operators, extra_config).eval()

    if backend == onnx.__name__:
        onnx_model_name = output_model_name = None
        target_opset = extra_config.get(constants.ONNX_TARGET_OPSET, 11)

        # Set optional configuration options for ONNX if any.
        if constants.ONNX_OUTPUT_MODEL_NAME in extra_config:
            onnx_model_name = extra_config[constants.ONNX_OUTPUT_MODEL_NAME]
            output_model_name = onnx_model_name + ".onnx"
        if output_model_name is None:
            output_model_name = str(uuid4().hex) + ".onnx"

        # Put the tracing test input into the right format.
        batch_trace_input, _ = _get_trace_input_from_test_input(
            extra_config[constants.TEST_INPUT], batch_size)

        # Generate the ONNX model through a temporary file, which is removed
        # even when the export or the reload fails.
        try:
            torch.onnx.export(
                torch_model,
                batch_trace_input,
                output_model_name,
                input_names=input_names,
                output_names=output_names,
                keep_initializers_as_inputs=False,
                opset_version=target_opset,
                do_constant_folding=True,
            )
            hb_model = onnx.load(output_model_name)
        finally:
            if os.path.exists(output_model_name):
                os.remove(output_model_name)

        # Set the ONNX model name if any.
        if onnx_model_name is not None:
            hb_model.graph.name = onnx_model_name

        # Fix the model to use arbitrary batch dimensions
        _make_onnx_batch_dim_symbolic(hb_model.graph)
    elif backend == tvm_backend:
        # First we need to generate the torchscript model.
        batch_trace_input, remainder_trace_input = _get_trace_input_from_test_input(
            extra_config[constants.TEST_INPUT], batch_size)
        has_remainder = remainder_trace_input is not None

        ts_model = _jit_model(torch_model, batch_trace_input, "cpu", extra_config)
        if has_remainder:
            remainder_ts_model = _jit_model(torch_model, remainder_trace_input, "cpu", extra_config)

        def named_shapes(trace_input):
            # Pair every model input name with the shape of its tracing tensor.
            if isinstance(trace_input, tuple):
                return [(name, trace_input[i].shape) for i, name in enumerate(input_names)]
            return [(name, trace_input.shape) for name in input_names]

        # Generate the test input in the TVM format. In case we have a remainder
        # beyond the batch, generate a remainder test input as well.
        test_input = named_shapes(batch_trace_input)
        if has_remainder:
            remainder_test_input = named_shapes(remainder_trace_input)

        # Pick the proper target.
        if device == "cuda":
            target = tvm.target.cuda()
            ctx = tvm.gpu()
        elif device == "cpu":
            target = "llvm"
            ctx = tvm.cpu()
        elif "llvm" in device:
            target = device
            ctx = tvm.cpu()
        else:
            raise RuntimeError("Device {} not recognized".format(device))

        # Get configuration parameters.
        # 50 is a good depth for operator fusion. More than that will probably hurt performance.
        # https://github.com/microsoft/hummingbird/issues/232#issuecomment-697979508
        config = {"relay.FuseOps.max_depth": extra_config.get(constants.TVM_MAX_FUSE_DEPTH, 50)}

        # Create the relay version of the model.
        model, params = relay.frontend.from_pytorch(ts_model, test_input)
        if has_remainder:
            remainder_model, remainder_params = relay.frontend.from_pytorch(
                remainder_ts_model, remainder_test_input)

        # Generate the model. We set opt_level=3 to enable all optimizations.
        with tvm.transform.PassContext(opt_level=3, config=config):
            graph, lib, params = relay.build(model, target=target, params=params)
        tvm_model = graph_runtime.create(graph, lib, ctx)
        tvm_model.set_input(**params)
        if has_remainder:
            with tvm.transform.PassContext(opt_level=3, config=config):
                graph, lib, params = relay.build(remainder_model, target=target, params=remainder_params)
            tvm_remainder_model = graph_runtime.create(graph, lib, ctx)
            tvm_remainder_model.set_input(**params)

        # In the container we will be using the context to properly configure the input tensors.
        extra_config[constants.TVM_CONTEXT] = ctx
        extra_config[constants.TVM_INPUT_NAMES] = input_names
        if has_remainder:
            extra_config[constants.TVM_REMAINDER_MODEL] = tvm_remainder_model

        hb_model = tvm_model
    else:
        # Set the device for the model. Every backend reaching this branch is handled as
        # PyTorch / TorchScript (the former `backend == torch or torch.jit` test was always true).
        if device != "cpu":
            torch_model = torch_model.to(device)

        # If the backend is torchscript, jit the model.
        if backend == torch.jit.__name__:
            trace_input, _ = _get_trace_input_from_test_input(
                extra_config[constants.TEST_INPUT], batch_size)
            if device != "cpu":
                # Tensor.to() returns a new tensor, so the result must be kept;
                # tuple inputs are moved element by element.
                if isinstance(trace_input, tuple):
                    trace_input = tuple(tensor.to(device) for tensor in trace_input)
                else:
                    trace_input = trace_input.to(device)
            torch_model = torch.jit.trace(torch_model, trace_input).eval()
            # NOTE(review): torch.jit.optimized_execution looks like a context manager; called
            # without `with` it presumably has no effect. Kept as-is, confirm before removing.
            torch.jit.optimized_execution(torch_model)
        hb_model = torch_model

    # Return if the container is not needed.
    if constants.CONTAINER in extra_config and not extra_config[constants.CONTAINER]:
        return hb_model

    idx = _find_container_operator_index(operators, operator_map)
    container = _select_container_class(operator_map[operators[idx].full_name], backend, tvm_backend)

    return container(hb_model, n_threads, batch_size, extra_config=extra_config)


def _make_onnx_batch_dim_symbolic(graph):
    """Replace the fixed leading dimension of graph inputs/outputs with the symbol "sym"; return how many were changed."""
    num_fixed = 0
    for value_info in (*graph.input, *graph.output):
        if not value_info.type.HasField("tensor_type"):
            continue
        shape = value_info.type.tensor_type.shape
        # Values without any dimension have no batch dimension to rewrite.
        if not shape or not shape.dim:
            continue
        leading_dim = shape.dim[0]
        if leading_dim.HasField("dim_value"):
            leading_dim.Clear()
            leading_dim.dim_param = "sym"
            num_fixed += 1

    # Sub-graphs stored in node attributes are fixed recursively.
    for node in graph.node:
        for attr in node.attribute:
            if attr.HasField("g"):
                num_fixed += _make_onnx_batch_dim_symbolic(attr.g)
    return num_fixed


def _find_container_operator_index(operators, operator_map):
    """Return the index of the operator whose task type decides which container wraps the model."""

    def converted(position):
        return operator_map[operators[position].full_name]

    def is_predictor(op_converter):
        return op_converter.regression or op_converter.classification or op_converter.anomaly_detection

    # We scan the operators backwards until we find an operator with a defined type.
    # This is necessary because ONNX models can have arbitrary operators doing casting, reshaping etc.
    idx = len(operators) - 1
    while idx >= 0 and not (is_predictor(converted(idx)) or converted(idx).transformer):
        idx -= 1

    assert idx >= 0, "Cannot detect container type. Please fill an issue at https://github.com/microsoft/hummingbird."

    # If is a transformer, we need to check whether there is another operator type before.
    # E.g., normalization after classification.
    if converted(idx).transformer:
        predictor_idx = idx
        while predictor_idx >= 0 and not is_predictor(converted(predictor_idx)):
            predictor_idx -= 1
        if predictor_idx >= 0:
            idx = predictor_idx
    return idx


def _select_container_class(op_converter, backend, tvm_backend):
    """Return the container class matching the task of *op_converter* and the selected backend."""
    is_torchscript = backend == torch.jit.__name__
    is_onnx = backend == onnx.__name__
    is_tvm = backend == tvm_backend

    if op_converter.regression:
        # We are doing a regression task.
        if is_torchscript:
            return TorchScriptSklearnContainerRegression
        if is_onnx:
            return ONNXSklearnContainerRegression
        if is_tvm:
            return TVMSklearnContainerRegression
        return PyTorchSklearnContainerRegression

    if op_converter.anomaly_detection:
        # We are doing anomaly detection.
        if is_torchscript:
            return TorchScriptSklearnContainerAnomalyDetection
        if is_onnx:
            return ONNXSklearnContainerAnomalyDetection
        if is_tvm:
            return TVMSklearnContainerAnomalyDetection
        return PyTorchSklearnContainerAnomalyDetection

    if op_converter.transformer:
        # We are just transforming the input data.
        if is_torchscript:
            return TorchScriptSklearnContainerTransformer
        if is_onnx:
            return ONNXSklearnContainerTransformer
        if is_tvm:
            return TVMSklearnContainerTransformer
        return PyTorchSklearnContainerTransformer

    # We are doing a classification task.
    if is_torchscript:
        return TorchScriptSklearnContainerClassification
    if is_onnx:
        return ONNXSklearnContainerClassification
    if is_tvm:
        return TVMSklearnContainerClassification
    return PyTorchSklearnContainerClassification
def main():
    """Train and evaluate the MNIST example network using command-line settings.

    Parses the command-line arguments, reports and sets the PyTorch intra-op
    and inter-op thread counts to ``--num_cpus``, builds the MNIST train/test
    loaders, then trains for ``--epochs`` epochs, printing the wall-clock time
    of every epoch and evaluating after each one. With ``--save-model`` the
    final weights are written to ``mnist_cnn.pt``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    # The help text used to claim "use all available" although the default is 1.
    parser.add_argument('--num_cpus', type=int, default=1, metavar='N',
                        help='number of CPU vCores to train with (default: 1)')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # os.sched_getaffinity only exists on some Unix platforms; fall back to
    # the total CPU count elsewhere so the report does not crash.
    if hasattr(os, "sched_getaffinity"):
        usable_cpus = len(os.sched_getaffinity(0))
    else:
        usable_cpus = os.cpu_count()

    print()
    print("Number of CPU vCores specified to be used {}".format(args.num_cpus))
    print("Total # of CPU threads on OS {}".format(os.cpu_count()))
    print("Total # of usable CPU threads on OS {}".format(usable_cpus))
    print("Total # of Intra-op CPU threads - PyTorch {}".format(
        torch.get_num_threads()))
    print("Total # of Inter-op threads - PyTorch {}".format(
        torch.get_num_interop_threads()))
    print()

    print("Setting # of Intra-op and Inter-op CPU threads in PyTorch to {}".
          format(args.num_cpus))
    torch.set_num_threads(args.num_cpus)
    torch.set_num_interop_threads(args.num_cpus)
    print()

    print("Total # of Intra-op CPU threads - PyTorch {}".format(
        torch.get_num_threads()))
    print("Total # of Inter-op threads - PyTorch {}".format(
        torch.get_num_interop_threads()))
    print()

    # Extra DataLoader options are only passed when training on CUDA.
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Both splits share the same preprocessing pipeline.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, ))
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True, transform=transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transform),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    for epoch in range(1, args.epochs + 1):
        epoch_start = time.time()
        train(args, model, device, train_loader, optimizer, epoch)
        elapse_time = datetime.timedelta(seconds=time.time() - epoch_start)
        print("Epoch training time {}".format(elapse_time))
        test(args, model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
print('[prepare data]') trainset, valset = Criteo.prepare_Criteo(root=args.dataset_root, min_threshold=args.min_threshold, val_split=args.val_split, n_jobs=os.cpu_count()) print('[init process group]') # distributed.init_process_group( # backend=args.backend, # init_method=args.init_method, # world_size=args.world_size, # rank=args.rank # ) torch.set_num_interop_threads( max(args.num_threads, torch.get_num_interop_threads())) torch.manual_seed(args.seed) print('[init dataloader]') trainloader = DataLoader(dataset=trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, drop_last=False) # trainloader = DataLoader( # dataset=trainset, # batch_size=args.batch_size, # sampler=DistributedSampler(trainset), # num_workers=args.num_workers # ) valloader = DataLoader(dataset=valset,
def __enter__(self): self.num_threads_env = torch.get_num_interop_threads() torch.set_num_interop_threads(self.num_threads_exe)
# Creates a matrix M_data = [[1., 2., 3.], [4., 5., 6]] M = torch.tensor(M_data) print(M) # Random matrix 3x4x5 x = torch.randn((3, 4, 5)) print(x) # OPERATIONS x = torch.tensor([1., 2., 3.]) y = torch.tensor([4., 5., 6.]) z = x + y print(z) tStart = time.time() for i in range(100000): t0 = torch.randn((100, 100)) t1 = torch.randn((100, 100)) t2 = torch.randn((100, 100)) for j in range(10): t2 += t0 * t1 # print(t2) print(time.time() - tStart) print(torch.get_num_interop_threads()) print(torch.device('cpu'), )
def ensure_num_interop_threads(n):
    """Raise PyTorch's inter-op thread count to at least *n*.

    Args:
        n: Minimum number of inter-op threads wanted.

    Returns:
        The inter-op thread count reported by PyTorch after the call. The
        setting is left untouched when it is already *n* or higher.
    """
    current = torch.get_num_interop_threads()
    if current >= n:
        # Already large enough: nothing to change.
        return current

    torch.set_num_interop_threads(n)
    return torch.get_num_interop_threads()
def _setup(self, config):
    """Prepare the trainer from *config*: data, model, optimizer, scheduler and loggers.

    Every key of *config* is copied onto ``self`` as an attribute, so the
    attributes read below (``join_tables``, ``dataset``, ``use_cols``,
    ``logdir``, ``epochs``, ...) are presumably supplied by *config* - verify
    against the caller.

    Args:
        config: Mapping of settings. The keys ``'cwd'``, ``'__gpu'``,
            ``'__cpu'`` and ``'__run'`` are read directly in this method.

    Raises:
        AssertionError: If ``self.dataset`` is not ``'imdb'``, if a ``wd_``
            scheduler string does not have three ``_``-separated parts, or if
            ``self.lr_scheduler`` is an unrecognized non-None value.
        ValueError: In the multi-table case, if no loaded table is named
            ``'auth_user'`` (raised by ``list.index``).
    """
    self.config = config
    print('NeuroCard config:')
    pprint.pprint(config)
    # Changes the working directory of the whole process.
    os.chdir(config['cwd'])
    # Expose every config entry as an attribute of the trainer.
    for k, v in config.items():
        setattr(self, k, v)

    # On CPU-only runs, cap the intra-op thread count to the configured value.
    if config['__gpu'] == 0:
        torch.set_num_threads(config['__cpu'])

    # W&B.
    # Do wandb.init() after the os.chdir() above makes sure that the Git
    # diff file (diff.patch) is w.r.t. the directory where this file is in,
    # rather than w.r.t. Ray's package dir.
    wandb_project = config['__run']
    # The run name is the last path component of logdir (one trailing '/' is dropped first).
    wandb.init(name=os.path.basename(
        self.logdir if self.logdir[-1] != '/' else self.logdir[:-1]),
               sync_tensorboard=True,
               config=config,
               project=wandb_project)

    self.epoch = 0

    if isinstance(self.join_tables, int):
        # Hack to support training single-model tables.
        # An integer is used as an index into the sorted table names.
        sorted_table_names = sorted(
            list(datasets.JoinOrderBenchmark.GetJobLightJoinKeys().keys()))
        self.join_tables = [sorted_table_names[self.join_tables]]

    # Try to make all the runs the same, except for input orderings.
    torch.manual_seed(0)
    np.random.seed(0)

    # Common attributes.
    self.loader = None
    self.join_spec = None
    join_iter_dataset = None
    table_primary_index = None

    # New datasets should be loaded here.
    assert self.dataset in ['imdb']
    if self.dataset == 'imdb':
        print('Training on Join({})'.format(self.join_tables))
        loaded_tables = []
        for t in self.join_tables:
            print('Loading', t)
            table = datasets.LoadImdb(t, use_cols=self.use_cols)
            table.data.info()
            loaded_tables.append(table)
        if len(self.join_tables) > 1:
            # Multi-table case: train on a sampler over the join.
            join_spec, join_iter_dataset, loader, table = self.MakeSamplerDatasetLoader(
                loaded_tables)

            self.join_spec = join_spec
            self.train_data = join_iter_dataset
            self.loader = loader

            # Position of the table named 'auth_user' among the loaded tables.
            table_primary_index = [t.name for t in loaded_tables
                                  ].index('auth_user')

            table.cardinality = datasets.JoinOrderBenchmark.GetFullOuterCardinalityOrFail(
                self.join_tables)
            self.train_data.cardinality = table.cardinality

            print('rows in full join', table.cardinality,
                  'cols in full join', len(table.columns), 'cols:', table)
        else:
            # Train on a single table.
            table = loaded_tables[0]

    if self.dataset != 'imdb' or len(self.join_tables) == 1:
        table.data.info()
        self.train_data = self.MakeTableDataset(table)

    self.table = table
    # Provide true cardinalities in a file or implement an oracle CardEst.
    self.oracle = None
    self.table_bits = 0

    # A fixed ordering?
    self.fixed_ordering = self.MakeOrdering(table)

    model = self.MakeModel(self.table,
                           self.train_data,
                           table_primary_index=table_primary_index)

    # NOTE: ReportModel()'s returned value is the true model size in
    # megabytes containing all *trainable* parameters. As impl
    # convenience, the saved ckpts on disk have slightly bigger footprint
    # due to saving non-trainable constants (the masks in each layer) as
    # well. They can be deterministically reconstructed based on RNG seeds
    # and so should not be counted as model size.
    self.mb = train_utils.ReportModel(model)
    if not isinstance(model, transformer.Transformer):
        print('applying train_utils.weight_init()')
        model.apply(train_utils.weight_init)
    self.model = model

    if self.use_data_parallel:
        self.model = DataParallelPassthrough(self.model)

    # Note: the unwrapped `model` (not self.model) is what gets watched and optimized below.
    wandb.watch(model, log='all')

    if self.use_transformer:
        opt = torch.optim.Adam(
            list(model.parameters()),
            2e-4,
            # betas=(0.9, 0.98),  # B in Lingvo; in Trfmr paper.
            betas=(0.9, 0.997),  # A in Lingvo.
            eps=1e-9,
        )
    else:
        if self.optimizer == 'adam':
            opt = torch.optim.Adam(list(model.parameters()), 2e-4)
        else:
            print('Using Adagrad')
            opt = torch.optim.Adagrad(list(model.parameters()), 2e-4)
    print('Optimizer:', opt)
    self.opt = opt

    total_steps = self.epochs * self.max_steps

    # self.lr_scheduler arrives as a string (or None); the first three branches
    # replace it with a scheduler object, the 'wd_' branch stores a custom
    # learning-rate function in self.custom_lr_lambda instead.
    if self.lr_scheduler == 'CosineAnnealingLR':
        # Starts decaying to 0 immediately.
        self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            opt, total_steps)
    elif self.lr_scheduler == 'OneCycleLR':
        # Warms up to max_lr, then decays to ~0.
        self.lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
            opt, max_lr=2e-3, total_steps=total_steps)
    elif self.lr_scheduler is not None and self.lr_scheduler.startswith(
            'OneCycleLR-'):
        # The text after the last '-' is parsed as the warmup fraction.
        warmup_percentage = float(self.lr_scheduler.split('-')[-1])
        # Warms up to max_lr, then decays to ~0.
        self.lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
            opt,
            max_lr=2e-3,
            total_steps=total_steps,
            pct_start=warmup_percentage)
    elif self.lr_scheduler is not None and self.lr_scheduler.startswith(
            'wd_'):
        # Warmups and decays.
        # Expected format: 'wd_<learning rate>_<warmup fraction>'.
        splits = self.lr_scheduler.split('_')
        assert len(splits) == 3, splits
        lr, warmup_fraction = float(splits[1]), float(splits[2])
        self.custom_lr_lambda = train_utils.get_cosine_learning_rate_fn(
            total_steps,
            learning_rate=lr,
            min_learning_rate_mult=1e-5,
            constant_fraction=0.,
            warmup_fraction=warmup_fraction)
    else:
        assert self.lr_scheduler is None, self.lr_scheduler

    self.tbx_logger = tune_logger.TBXLogger(self.config, self.logdir)

    if self.checkpoint_to_load:
        self.LoadCheckpoint()

    self.loaded_queries = None
    self.oracle_cards = None
    if self.dataset == 'imdb' and len(self.join_tables) > 1:
        queries_job_format = utils.JobToQuery(self.queries_csv)
        self.loaded_queries, self.oracle_cards = utils.UnpackQueries(
            self.table, queries_job_format)

    # Parsing step; needs to be replaced.
    # Prints the difference between now and gettimest1()
    # (presumably a start timestamp recorded elsewhere - TODO confirm).
    timepre1 = time.time()
    print('Pretime:\n', "{:.2f}".format(timepre1 - gettimest1()))

    if config['__gpu'] == 0:
        print('CUDA not available, using # cpu cores for intra-op:',
              torch.get_num_threads(), '; inter-op:',
              torch.get_num_interop_threads())
# Command-line options; `parser` and `CONFIGS` are defined outside this fragment.

# Name of the experiment to run; restricted to the keys of CONFIGS and stored as `name`.
parser.add_argument("-e",
                    "--experiment",
                    dest="name",
                    default="default",
                    help="Experiment to run",
                    choices=CONFIGS.keys())
# Defaults to the number of CUDA devices PyTorch reports.
parser.add_argument("-g",
                    "--num-gpus",
                    type=int,
                    default=torch.cuda.device_count(),
                    help="number of GPUs to use")
# NOTE(review): the default CPU count is read from PyTorch's inter-op thread
# setting - confirm this is the intended source.
# The trailing comma after the call turns the statement into a one-element
# tuple expression; it has no effect on the parser.
parser.add_argument(
    "-n",
    "--num-cpus",
    type=int,
    default=torch.get_num_interop_threads(),
    help="number of CPUs to use when GPU is not available."),
parser.add_argument("-r",
                    "--resume",
                    action="store_true",
                    help="Resume training from last known checkpoint")
parser.add_argument("-j",
                    "--workers",
                    type=int,
                    default=6,
                    help="Number of dataloaders workers")
# Distributed backend; only "nccl" and "gloo" are accepted.
parser.add_argument("-b",
                    "--backend",
                    choices=["nccl", "gloo"],
                    help="Pytorch Distributed backend",
                    default="nccl")
def load(location, do_unzip_and_model_type_check=True):
    """
    Method used to load a container from the file system.

    Args:
        location: The location on the file system where to load the model.
            With unzipping enabled this may be given with or without the ".zip" suffix.
        do_unzip_and_model_type_check: Whether to unzip the model and check the type.
            When False, *location* must already be the directory holding the model files.

    Returns:
        The loaded model.

    Raises:
        AssertionError: If the zip file, or the directory it is extracted to, does not exist.
        RuntimeError: If the stored model type is neither "torch.jit" nor "torch".

    Note:
        The model directory at *location* is removed before returning, also when loading fails.
    """
    container = None

    # Unzip the dir.
    if do_unzip_and_model_type_check:
        zip_location = location
        # Match the real ".zip" extension: a bare "zip" suffix (e.g. "modelzip")
        # is not an archive name and must not lose its last four characters.
        if not location.endswith(".zip"):
            zip_location = location + ".zip"
        else:
            location = zip_location[:-4]
        assert os.path.exists(zip_location), "Zip file {} does not exist.".format(zip_location)
        shutil.unpack_archive(zip_location, location, format="zip")

        assert os.path.exists(location), "Model location {} does not exist.".format(location)

    try:
        # Load the model type. The trailing newline, if any, is dropped so that
        # the comparisons below do not depend on how the file was written.
        with open(os.path.join(location, constants.SAVE_LOAD_MODEL_TYPE_PATH), "r") as file:
            model_type = file.readline().strip()

        # Check the versions of the modules used when saving the model.
        configuration_path = os.path.join(location, constants.SAVE_LOAD_MODEL_CONFIGURATION_PATH)
        if os.path.exists(configuration_path):
            with open(configuration_path, "r") as file:
                configuration = file.readlines()
            check_dumped_versions(configuration, hummingbird, torch)
        else:
            warnings.warn(
                "Cannot find the configuration file with versions. You are likely trying to load a model saved with an old version of Hummingbird."
            )

        # SECURITY: pickle.load executes arbitrary code stored in the file.
        # Only load models coming from a trusted source.
        if model_type == "torch.jit":
            # This is a torch.jit model
            model = torch.jit.load(os.path.join(location, constants.SAVE_LOAD_TORCH_JIT_PATH))
            with open(os.path.join(location, "container.pkl"), "rb") as file:
                container = pickle.load(file)
            container._model = model
        elif model_type == "torch":
            # This is a pytorch model
            with open(os.path.join(location, constants.SAVE_LOAD_TORCH_JIT_PATH), "rb") as file:
                container = pickle.load(file)
        else:
            raise RuntimeError("Model type {} not recognized".format(model_type))

        # Need to set the number of threads to use as set in the original container.
        if container._n_threads is not None:
            if torch.get_num_interop_threads() != 1:
                torch.set_num_interop_threads(1)
            torch.set_num_threads(container._n_threads)
    except BaseException:
        # Do not leave the model directory behind when loading fails.
        shutil.rmtree(location, ignore_errors=True)
        raise

    shutil.rmtree(location)
    return container
# One or more configuration file paths; `parser` is created outside this fragment.
parser.add_argument("--config_files", type=str, nargs='+')
opt = parser.parse_args()

# Load Config Files
# Each file is parsed and merged into __C in the order given on the command line.
__C = Config()
for filename in opt.config_files:
    # `ic` is presumably a debug-print helper (e.g. icecream) - defined outside this fragment.
    ic(filename)
    __C.add_from_dict(Config.parse_from_yml(filename))
    # ic(__C)
# ic(__C)

# Limit CPU usage
# The same configured value is used for the intra-op and the inter-op thread counts.
torch.set_num_threads(__C.CPU_THREADS)
torch.set_num_interop_threads(__C.CPU_THREADS)
# `%` binds tighter than `+`, so the message is formatted before the colour prefix is prepended.
print(colorama.Fore.GREEN + "Using %d/%d cores/threads of CPU" %
      (torch.get_num_threads(), torch.get_num_interop_threads()))

# config_file = "__C.YML"
# config = YAMLParser(config_file).data

# ------------------ configuration tests ----------------- #
# Reject unsupported optimizer, dataset and model names early.
assert __C.OPTIMIZER_TYPE in ["Adam", "SGD", "AdamW"]
assert __C.DATASET_TYPE in ["Cora", "Citeseer", "Pubmed"]
assert __C.MODEL_TYPE in ["DGCNN", "GCN", "SGC", "GCNII"]

# ---------------- general configurations ---------------- #
# Derived settings: GPU count, whether more than one GPU is used, and the
# total batch size (per-GPU batch size times the number of GPUs).
__C.NGPU = len(__C.GPU_IDS)
__C.PARALLEL = __C.NGPU > 1
__C.BATCH_SIZE = __C.BATCH_SIZE_SINGLE * __C.NGPU