def generate_from_name(name: str, save=True, verbose=True):
    genotype, compact = compact_from_name(name, verbose=verbose)
    run_configs = '{path_conf_tasks}/d1_dartsv1.run_config, {path_conf_net_search}darts.run_config'

    # create weight sharing cell model
    changes = {
        'cls_data': 'Cifar10Data',
        '{cls_data}.fake': True,
        '{cls_task}.save_del_old': False,

        '{cls_network_body}.cell_order': 'n, r',
        '{cls_network_body}.features_first_cell': 36 * 4,
        '{cls_network_stem}.features': 36 * 3,

        'cls_network_cells_primitives': "%s, %s" % (compact.get('primitives'), compact.get('primitives')),
    }
    task = Main.new_task(run_configs, args_changes=changes)
    net = task.get_method().get_network()
    args = task.args

    wss = StrategyManager().get_strategies()
    assert len(wss) == 1
    ws = wss[list(wss.keys())[0]]

    # fix the arc, all block inputs use different weights
    # go through all weights in the search cell
    for n, w in ws.named_parameters_single():
        # figure out the cell type ("normal", "reduce"), the block index,
        # and whether it is the first, second, ... op of that block
        c_type, block_idx, num_inputs, num_idx = n.split('/')[-4:]
        block_idx = int(block_idx.split('-')[-1])
        num_idx = int(num_idx.split('-')[-1])

        # set all path weights to zero
        w.data.zero_()

        # go through the cell description of the genotype;
        # if input and op number match, set the weight to be higher
        for op_idx, from_idx in compact.get(c_type)[block_idx]:
            if num_idx == from_idx:
                w[op_idx] = 1
    ws.forward()

    # saving the config now will only use the highest weighted connections, since we have a search network
    cfg = net.config(finalize=True, num_block_ops=2)
    if save:
        path = Builder.save_config(cfg, get_net_config_dir(genotype.source), name)
        print('Saved config: %s' % path)
    return net, cfg, args
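# --- Illustrative only (not part of the original source): how the architecture weight names above are
# parsed. The exact name format is an assumption; only the split logic matches generate_from_name.
example_name = 'darts/normal/block-1/inputs-2/input-0'     # hypothetical weight name
c_type, block_idx, num_inputs, num_idx = example_name.split('/')[-4:]
block_idx = int(block_idx.split('-')[-1])                   # -> 1
num_idx = int(num_idx.split('-')[-1])                       # -> 0
print(c_type, block_idx, num_idx)                           # prints: normal 1 0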
def __init__(self, submodules: list, name: str, strategy_name: str):
    """
    :param submodules: list or nn.ModuleList of choices
    :param name: name of the architecture weight
    :param strategy_name: name of the architecture strategy to use
    """
    assert None not in [self._depth, self._act_fun], "this class should not be initialized directly"
    super().__init__(submodules, name, strategy_name)

    # store previous names, get their number of choices, no need to store the own name
    sm = StrategyManager()
    self._all_prev_names = sm.ordered_names(unique=False)[-self._depth - 1:-1]
    self._all_prev_sizes = [sm.get_num_weight_choices(n) for n in self._all_prev_names]
    self._eye = np.eye(N=max(self._all_prev_sizes + [1]))
    self._attention_op = None
    self._expand_axis = []
class MixedOp(SumParallelModules):
    """
    all op choices on one path in parallel, the weight strategy decides which results to compute and combine
    """

    def __init__(self, submodules: list, name: str, strategy_name: str):
        """
        :param submodules: list or nn.ModuleList of choices
        :param name: name of the architecture weight
        :param strategy_name: name of the architecture strategy to use
        """
        super().__init__(submodules)
        self._add_to_kwargs(name=name, strategy_name=strategy_name)
        self.sm = StrategyManager()
        self.ws = self.sm.make_weight(self.strategy_name, name, only_single_path=False, choices=self.submodules)

    def config(self, finalize=True, **_) -> dict:
        if finalize:
            indices = self.ws.get_finalized_indices(self.name)
            if len(indices) == 1:
                return self.submodules[indices[0]].config(finalize=finalize, **_)
            return SumParallelModules([self.submodules[i] for i in indices]).config(finalize=finalize, **_)
        else:
            return super().config(finalize=finalize, **_)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.ws.combine(self.name, x, self.submodules)
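# --- Illustrative usage sketch (not part of the original source), mirroring the test code further below:
# register a strategy first, then create and build the op, then forward it.
# 'op_a', 'op_b', 'op_c' stand for arbitrary candidate modules with matching output shapes,
# 'shape_in', 'c_out' and 'x' for the input shape, the output channel count and an input tensor.
sm = StrategyManager()
sm.add_strategy(RandomChoiceStrategy(max_epochs=1))     # registered under the name 'default'
op = MixedOp(submodules=[op_a, op_b, op_c], name='cell0/op0', strategy_name='default')
op.build(shape_in, c_out)                               # as in the tests: build before StrategyManager().build()
sm.build()                                              # create the architecture weights
sm.forward()                                            # sample / update the current choice(s)
y = op(x)                                               # the strategy combines the chosen submodule outputs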
def __init__(self, args: Namespace, *args_, **kwargs):
    AbstractTask.__init__(self, args, *args_, **kwargs)

    # for architecture weights
    log_headline(self.logger, 'adding Strategy and Data')
    StrategyManager().add_strategy(RandomChoiceStrategy(max_epochs=1))

    # data
    data_set = self._parsed_meta_argument(Register.data_sets, 'cls_data', args, index=None).from_args(args, index=None)
    self.batch_size = data_set.get_batch_size(train=False)

    # device handling
    self.devices_handler = self._parsed_meta_argument(Register.devices_managers, 'cls_device', args, index=None)\
        .from_args(self.seed, self.is_deterministic, args, index=None)
    self.mover = self.devices_handler.allocate_devices(num=-1)

    # network
    log_headline(self.logger, 'adding Network')
    self.net = self._parsed_meta_argument(Register.networks, 'cls_network', args, index=None).from_args(args)
    self.net.build(s_in=data_set.get_data_shape(), s_out=data_set.get_label_shape())
    self.net = self.mover.move_module(self.net)

    # profiler
    log_headline(self.logger, 'adding Profiler')
    self.profiler = self._parsed_meta_argument(Register.profilers, 'cls_profiler', args, index=None)\
        .from_args(args, index=None, is_test_run=self.is_test_run)
    assert isinstance(self.profiler, AbstractProfiler)
def __init__(self, args: Namespace, wildcards: dict, descriptions: dict = None):
    super().__init__()

    # args, seed
    self.args = args
    self.save_dir = self._parsed_argument('save_dir', args)
    self.is_test_run = self._parsed_argument('is_test_run', args)
    self.seed = self._parsed_argument('seed', args)
    self.is_deterministic = self._parsed_argument('is_deterministic', args)
    random.seed(self.seed)
    np.random.seed(self.seed)
    torch.manual_seed(self.seed)
    if self.is_deterministic:
        # see https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
        torch.set_deterministic(self.is_deterministic)

    # maybe delete old dir, note arguments, save run_config
    if self._parsed_argument('save_del_old', args):
        shutil.rmtree(self.save_dir, ignore_errors=True)
    os.makedirs(self.save_dir, exist_ok=True)
    save_as_json(args, get_task_config_path(self.save_dir), wildcards)
    dump_system_info(self.save_dir + 'sysinfo.txt')

    # logging
    self.log_file = '%slog_task.txt' % self.save_dir
    LoggerManager().set_logging(default_save_file=self.log_file)
    self.logger = self.new_logger(index=None)
    log_args(self.logger, None, self.args, add_git_hash=True, descriptions=descriptions)
    Register.log_all(self.logger)

    # reset weight strategies so that consecutive tasks do not conflict with each other
    StrategyManager().reset()

    self.methods = []
def setup_strategy(self) -> StrategyManager:
    """ set up the strategy for architecture weights """
    key, alpha, grace_epochs = self._parsed_arguments(['key', 'alpha', 'grace_epochs'], self.hparams)
    return StrategyManager().add_strategy(
        MdlStrategy(self.max_epochs, key=key, alpha=alpha, grace_epochs=grace_epochs))
def _run(self, save=False):
    # value spaces
    values = set()
    sm = StrategyManager()

    # add all evaluated architectures of the benchmarks
    for bs in self.benchmark_sets:
        assert isinstance(bs, MiniNASTabularBenchmark)
        l0, l1 = len(sm.ordered_names(unique=True)), bs.get_value_space().num_choices()
        assert l0 == l1, "Num choices of the network space (%d) and the bench space (%d) must match" % (l0, l1)
        for r in bs.get_all():
            values.add(r.arch_tuple)
    if len(values) > 0:
        self.logger.info("Added %d architectures from given benchmark set(s) to the list" % len(values))

    # if the space is smaller than desired, add random architectures
    network = self.get_method().get_network()
    assert isinstance(network, SearchUninasNetwork)
    net_space = sm.get_value_space()
    if self.measure_min > len(values):
        self.logger.info("Adding random architectures, have %d/%d" % (len(values), self.measure_min))
        while len(values) < self.measure_min:
            values.add(net_space.random_sample())

    # evaluate the given architectures
    self._architecture_space = SpecificValueSpace(list(values))
    algorithm, population = super()._run(save=save)

    # add info to the candidates, e.g. from profilers, such as loss/flops/latency/macs
    pass

    # create a new bench
    bench = MiniNASSearchTabularBenchmark.make_from_population(population, self.get_method())
    log_headline(self.logger, "Created bench file from super-network")
    bench.print_info(self.logger.info)
    bench.save_in_dir(self.save_dir)
    explore(bench, self.logger, n=10)
def profile(self, network: SearchUninasNetwork, mover: AbstractDeviceMover, batch_size: int):
    """ profile the network """
    assert self.profile_fun is not None, "Can not measure if there is no profile function!"

    # unnecessary here, could check if this is a test and shorten everything
    # is_test_run = self.get('is_test_run')

    # set up the nested structure if it does not exist
    self.data['measured'] = self.data.get('measured', {})

    # stem
    if self.data.get('measured').get('stem', None) is None:
        self.logger.info('Measuring the stem')
        stem = network.get_stem()
        self.data['measured']['stem'] =\
            self.profile_fun.profile(stem, stem.get_shape_in(), mover, batch_size)

    # cells
    self.data['measured']['cells'] = self.data.get('measured').get('cells', {})
    sm = StrategyManager()
    cells = network.get_cells()
    n_choices = sm.get_num_choices()
    if len(cells) != len(n_choices):
        raise ValueError("Number of cells (%d) must match number of arc choices (%d)" % (len(cells), len(n_choices)))
    network.set_forward_strategy(False)
    for i1, (cell, n) in enumerate(zip(cells, n_choices)):
        self.data['measured']['cells'][i1] = self.data['measured']['cells'].get(i1, {})
        for i2 in range(n):
            if self.data['measured']['cells'][i1].get(i2, None) is None:
                self.logger.info('Measuring cell %d, option %d' % (i1, i2))
                sm.forward_const(i2)
                self.data['measured']['cells'][i1][i2] =\
                    self.profile_fun.profile(cell, cell.get_shape_in(), mover, batch_size)

    # final head
    if self.data.get('measured').get('head', None) is None:
        self.logger.info('Measuring the final head')
        head = network.get_heads()[-1]
        self.data['measured']['head'] =\
            self.profile_fun.profile(head, head.get_shape_in(), mover, batch_size)
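# --- Illustrative only: shape of the nested 'measured' data that the method above fills in
# (the numbers are made up; real values come from self.profile_fun).
measured = {
    'stem': 0.8,                                    # one profiled value for the stem
    'cells': {0: {0: 1.2, 1: 1.5}, 1: {0: 0.9}},    # per cell index, per architecture option
    'head': 0.3,                                    # one profiled value for the final head
}
print(measured['cells'][0][1])                      # value of cell 0 with option 1 -> 1.5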
@classmethod
def make_from_population(cls, population: Population, method: AbstractMethod):
    """ creating a mini bench dataset from an evaluated super-network """
    results = {}
    arch_to_idx = {}
    tuple_to_str = {}
    tuple_to_idx = {}
    space = StrategyManager().get_value_space(unique=True)
    data_set_name = method.get_data_set().__class__.__name__
    space_name = method.get_network().get_model_name()
    default_result_type = "test"

    for i, candidate in enumerate(population.get_candidates()):
        # first use all estimated metrics
        # if they contain e.g. "acc1/valid", create a sub dict
        metrics = {}
        for k, v in candidate.metrics.items():
            splits = k.split('/')
            if len(splits) == 1:
                metrics[splits[0]] = {data_set_name: v}
            else:
                metrics[splits[0]] = metrics.get(splits[0], {})
                metrics[splits[0]][data_set_name] = metrics[splits[0]].get(data_set_name, {})
                metrics[splits[0]][data_set_name][splits[1]] = v
                default_result_type = splits[1]

        # now make sure all keys exist
        for k in MiniResult.get_metric_keys():
            metrics[k] = metrics.get(k, {data_set_name: -1})

        # result
        r = MiniResult(
            arch_index=i,
            arch_str="%s(%s)" % (space_name, ", ".join([str(v) for v in candidate.values])),
            arch_tuple=candidate.values,
            **metrics
        )
        assert tuple_to_str.get(r.arch_tuple) is None, "can not yet merge duplicate architecture results"
        results[i] = r
        arch_to_idx[r.arch_str] = i
        tuple_to_idx[r.arch_tuple] = i
        tuple_to_str[r.arch_tuple] = r.arch_str

    data_sets = list(results.get(0).params.keys())
    return MiniNASSearchTabularBenchmark(
        default_data_set=data_sets[0], default_result_type=default_result_type,
        bench_name="%s on %s" % (space_name, data_sets[0]),
        bench_description="super-network evaluation results",
        value_space=space, results=results, arch_to_idx=arch_to_idx,
        tuple_to_str=tuple_to_str, tuple_to_idx=tuple_to_idx)
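# --- Standalone illustration (not part of the original source) of the metric nesting above:
# a candidate metric key like "acc1/valid" ends up as metrics['acc1'][data_set_name]['valid'].
# setdefault() is used here as a condensed equivalent of the explicit .get() chain in the method.
metrics, data_set_name = {}, 'Cifar10Data'
for k, v in {'acc1/valid': 0.93, 'loss': 0.4}.items():
    splits = k.split('/')
    if len(splits) == 1:
        metrics[splits[0]] = {data_set_name: v}
    else:
        metrics.setdefault(splits[0], {}).setdefault(data_set_name, {})[splits[1]] = v
print(metrics)  # {'acc1': {'Cifar10Data': {'valid': 0.93}}, 'loss': {'Cifar10Data': 0.4}}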
def log_detailed(self):
    # log some things
    log_headline(self.logger, 'Trainer, Method, Data, ...')
    rows = [('Trainer', '')]
    for i, trainer in enumerate(self.trainer):
        rows.append((' (%d)' % i, trainer.str()))
    log_in_columns(self.logger, rows)
    for i, method in enumerate(self.methods):
        log_headline(self.logger, "Method %d/%d" % (i + 1, len(self.methods)), target_len=80)
        method.log_detailed(self.logger)
    StrategyManager().log_detailed(self.logger)
def __init__(self, submodules: list, name: str, strategy_name: str, depth=0):
    """
    :param submodules: list or nn.ModuleList of choices
    :param name: name of the architecture weight
    :param strategy_name: name of the architecture strategy to use
    :param depth: depth, how many previous architecture decisions to consider
    """
    super().__init__(submodules, name, strategy_name)

    # store previous names in case this mixed op will be deepened, no need to store the own name
    self._add_to_kwargs(depth=depth)
    self._all_prev_names = StrategyManager().ordered_names(unique=False)[-self.max_depth - 1:-1]
    self._state_dicts = {}
    self._last_state = 'w'
    self.change_depth(new_depth=self.depth)
def _on_epoch_start(self) -> dict:
    log_dict = super()._on_epoch_start()
    tau_0, tau_grace, beta = self._parsed_arguments(['tau_0', 'tau_grace', 'beta'], self.hparams)
    for strategy in StrategyManager().get_strategies_list():
        strategy.tau = tau_0 * beta ** self.current_epoch
        log_dict = self._add_to_dict(log_dict, dict(tau=strategy.tau))
        self.update_architecture_weights = strategy.tau < tau_grace
        if self.update_architecture_weights:
            strategy.mask_all_weights_below(0.4, div_by_numel=True)
            log_dict.update(strategy.get_masks_log_dict(prefix='asap/masks'))
            self.set_loader_multiples((1, 1))
        else:
            self.set_loader_multiples((1, 0))
    return log_dict
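# --- Quick numeric check (illustrative values only, not from the original source) of the
# annealing schedule above: tau decays geometrically with the epoch, and masking/updating
# of the architecture weights starts once tau drops below tau_grace.
tau_0, beta, tau_grace = 1.6, 0.95, 0.4
taus = {epoch: round(tau_0 * beta ** epoch, 3) for epoch in (0, 10, 20, 30, 40)}
print(taus)                                            # {0: 1.6, 10: 0.958, 20: 0.574, 30: 0.343, 40: 0.206}
print({e: t < tau_grace for e, t in taus.items()})     # pruning would start between epoch 20 and 30 here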
def change_depth(self, new_depth=1):
    """
    called by a VariableDepthMixedOpCallback, increases the recursive depth of the op,
    copying the weights, using a copy depending on a previous layer choice
    """
    if new_depth > 0:
        assert new_depth >= self.depth, "Can not reduce the depth"
        assert new_depth <= self.max_depth, "Can not increase the depth beyond %d" % self.max_depth
        assert StrategyManager().is_only_single_path()
    while self.depth < min([new_depth, len(self._all_prev_names)]):
        if len(self._state_dicts) == 0:
            self._state_dicts[self._last_state] = self.submodules.state_dict()
        # enlarge the dict of stored state dicts by one layer
        new_state_dicts = {'0.%s' % k: v for k, v in self._state_dicts.items()}
        self._state_dicts = new_state_dicts
        self._last_state = '0.%s' % self._last_state
        self.depth += 1
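# --- Minimal standalone illustration (simplified, not the original code) of how the stored
# state-dict keys are prefixed once per depth step in change_depth above:
state_dicts = {'w': {'weight': '...'}}            # depth 0: one entry for the current weights
for _ in range(2):                                # deepen twice
    state_dicts = {'0.%s' % k: v for k, v in state_dicts.items()}
print(list(state_dicts))                          # ['0.0.w'] -> the key grows by one '0.' per depth step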
def profile(self, network: SearchUninasNetwork, mover: AbstractDeviceMover, batch_size: int):
    """ profile the network """
    assert self.profile_fun is not None, "Can not measure if there is no profile function!"
    sm = StrategyManager()

    # step 1) generate a dataset
    # at some point, if other predictors are attempted (nearest neighbor, SVM, ...),
    # the step 1 code could be moved to a shared parent class

    # number of choices at every position
    max_choices = sm.get_num_choices()
    print("max choices", max_choices)

    # get the search space, we can sample random architectures from it
    space = sm.get_value_space(unique=True)
    for i in range(10):
        print("random arc %d: %s" % (i, space.random_sample()))

    # make sure that a forward pass will not change the network topology
    network.set_forward_strategy(False)

    # find out the size of the network inputs
    shape_in = network.get_shape_in()

    # fix the network architecture, profile it
    sm.forward(fixed_arc=space.random_sample())
    value = self.profile_fun.profile(module=network, shape_in=shape_in, mover=mover, batch_size=batch_size)
    print('value 1', value)

    # alternate way: instead of using one over-complete network that has unused modules,
    # - get the current network architecture (the last set fixed_arc indices will be used now)
    # - build it stand-alone (exactly as the "true" network would be used later), with the same input/output sizes
    # - place it on the profiled device
    # - profile that instead
    # this takes longer, but the mismatch between over-complete and stand-alone is very interesting to explore
    # can make this an option via Argument
    network_config = network.config(finalize=True)
    network_body = Builder().from_config(network_config)
    standalone = RetrainUninasNetwork(model_name='__tmp__', net=network_body, checkpoint_path='',
                                      assert_output_match=True)
    standalone.build(network.get_shape_in(), network.get_shape_out()[0])
    standalone = mover.move_module(standalone)
    value = self.profile_fun.profile(module=standalone, shape_in=shape_in, mover=mover, batch_size=batch_size)
    print('value 2', value)
def test_rebuild(self):
    """ getting finalized configs from which we can build modules """
    builder = Builder()
    StrategyManager().delete_strategy('default')
    StrategyManager().add_strategy(RandomChoiceStrategy(max_epochs=1))

    n, c, h, w = 2, 8, 16, 16
    x = torch.empty(size=[n, c, h, w])
    shape = Shape([c, h, w])
    layers = [
        FusedMobileInvertedConvLayer(name='mmicl', k_sizes=(3, 5, 7), expansions=(3, 6)),
        SuperConvThresholdLayer(k_sizes=(3, 5, 7)),
        SuperSepConvThresholdLayer(k_sizes=(3, 5, 7)),
        SuperMobileInvertedConvThresholdLayer(k_sizes=(3, 5, 7), expansions=(3, 6),
                                              sse_dict=dict(c_muls=(0.0, 0.25, 0.5))),
        LinearTransformerLayer(),
        SuperConvLayer(k_sizes=(3, 5, 7), name='scl1'),
        SuperSepConvLayer(k_sizes=(3, 5, 7), name='scl2'),
        SuperMobileInvertedConvLayer(k_sizes=(3, 5, 7), name='scl3', expansions=(2, 3, 4, 6)),
    ]
    for layer in layers:
        assert layer.build(shape, c) == shape
    StrategyManager().build()
    StrategyManager().forward()

    for layer in layers:
        print('\n' * 2)
        print(layer.__class__.__name__)
        for i in range(3):
            StrategyManager().randomize_weights()
            StrategyManager().forward()
            for finalize in [False, True]:
                cfg = layer.config(finalize=finalize)
                print('\t', i, 'finalize', finalize)
                print('\t\tconfig dct:', cfg)
                cfg_layer = builder.from_config(cfg)
                assert cfg_layer.build(shape, c) == shape
                cfg_layer.forward(x)
                print('\t\tmodule str:', cfg_layer.str()[1:])
                del cfg, cfg_layer
def setup_strategy(self) -> StrategyManager:
    """ set up the strategy for architecture weights """
    tau_0 = self._parsed_argument('tau_0', self.hparams)
    return StrategyManager().add_strategy(
        DifferentiableStrategy(self.max_epochs, tau=tau_0, use_mask=True))
def setup_strategy(self) -> StrategyManager:
    """ set up the strategy for architecture weights """
    return StrategyManager().add_strategy(
        FairRandomChoiceStrategy(self.max_epochs, assert_same_length=True))
def setup_strategy(self) -> StrategyManager:
    """ set up the strategy for architecture weights """
    return StrategyManager().add_strategy(
        RandomChoiceStrategy(self.max_epochs))
def setup_strategy(self) -> StrategyManager:
    """ set up the strategy for architecture weights """
    return StrategyManager().add_strategy(
        DifferentiableStrategy(self.max_epochs, use_mask=False))
class SearchUninasNetwork(AbstractUninasNetwork):

    def __init__(self, model_name: str, net: AbstractNetworkBody, do_forward_strategy=True, *args, **kwargs):
        super().__init__(model_name=model_name, net=net, *args, **kwargs)
        self.do_forward_strategy = do_forward_strategy  # unnecessary line to remove "error" highlighting
        self._add_to_kwargs(do_forward_strategy=self.do_forward_strategy)

        self.strategy_manager = StrategyManager()
        self.strategies = None

    @classmethod
    def from_args(cls, args: Namespace, index=None, weight_strategies: Union[dict, str] = None)\
            -> 'SearchUninasNetwork':
        """
        :param args: global argparse namespace
        :param index: argument index
        :param weight_strategies: {strategy name: [cell indices]}, or name used for all, or None for defaults
        """
        all_parsed = cls._all_parsed_arguments(args)
        cls_net = cls._parsed_meta_argument(Register.network_bodies, 'cls_network_body', args, index=index)
        net = cls_net.search_network_from_args(args, index=index, weight_strategies=weight_strategies)
        return cls(cls.__name__, net, **all_parsed)

    @classmethod
    def meta_args_to_add(cls) -> [MetaArgument]:
        """
        list meta arguments to add to argparse for when this class is chosen,
        classes specified in meta arguments may have their own respective arguments
        """
        return super().meta_args_to_add() + [
            MetaArgument('cls_network_body', Register.network_bodies, help_name='network', allowed_num=1),
        ]

    def _build2(self, s_in: Shape, s_out: Shape) -> ShapeList:
        """ build the network """
        s = self.net.build(s_in, s_out)
        self.strategies = self.strategy_manager.get_strategies_list()
        self.strategy_manager.build()
        return s

    def get_strategy_manager(self) -> StrategyManager:
        return self.strategy_manager

    def set_forward_strategy(self, forward_strategy: bool):
        self.do_forward_strategy = forward_strategy

    def get_forward_strategy(self) -> bool:
        return self.do_forward_strategy

    def forward(self, x: torch.Tensor, ws_kwargs: dict = None, **net_kwargs) -> [torch.Tensor]:
        """ forward first the weight strategy, then the network """
        if self.do_forward_strategy:
            self.forward_strategy(**({} if ws_kwargs is None else ws_kwargs))
        return super().forward(x, **net_kwargs)

    def forward_net(self, x: torch.Tensor, **net_kwargs) -> [torch.Tensor]:
        """ forward only the network """
        return self.net(x, **net_kwargs)

    def forward_strategy(self, **ws_kwargs):
        """ forward only the weight strategy """
        self.strategy_manager.forward(**ws_kwargs)

    def str(self, depth=0, **_) -> str:
        r = '{d}{name}(\n{ws},{net}\n{d}])'.format(**{
            'd': '{d}',
            'd1': '{d1}',
            'name': self.__class__.__name__,
            'ws': '{d1}Strategies: [%s]' % ', '.join([ws.str() for ws in self.strategies]),
            'net': self.net.str(depth=depth + 1, max_depth=self.log_detail, **_),
        })
        r = r.replace('{d}', '. ' * depth).replace('{d1}', '. ' * (depth + 1))
        return r

    def config(self, finalize=True, **_) -> dict:
        if finalize:
            return self.net.config(finalize=finalize, **_)
        return super().config(finalize=finalize, **_)

    def named_net_arc_parameters(self) -> (list, list):
        # all named parameters
        net_params, arc_params, duplicate_idx = list(self.net.named_parameters()), [], []
        for ws in self.strategies:
            arc_params += list(ws.named_parameters())
        # remove arc parameters from the network
        for an, ap in arc_params:
            for idx, (n, p) in enumerate(net_params):
                if ap is p:
                    duplicate_idx.append(idx)
        for idx in sorted(duplicate_idx, reverse=True):
            net_params.pop(idx)
        return net_params, arc_params

    def track_used_params(self, x: torch.Tensor) -> Tracker:
        """ track which weights are used for the current architecture, and in which cell """
        tracker = Tracker()
        is_train = self.training
        self.eval()
        handles = []
        ws_modules = []
        x = x.to(self.get_device())

        # find all modules that have a weight strategy, add hooks
        for name, module in self.named_modules():
            if hasattr(module, 'ws') and isinstance(module.ws, (AbstractWeightStrategy, StrategyManager)):
                ws_modules.append(module)
                for name2, m2 in module.named_modules():
                    if len(get_to_print(m2)) >= 1:
                        handles.append(m2.register_forward_hook(Hook(tracker, 'net.%s.%s' % (name, name2))))

        # forward pass with the current arc, all used weights are tracked
        self.forward_net(x)
        tracker.finalize()

        for h in handles:
            h.remove()
        self.train(is_train)
        return tracker

    @classmethod
    def get_space_tuple(cls, unique=True, flat=False) -> tuple:
        """ tuple of final topology """
        return tuple(StrategyManager().get_all_finalized_indices(unique=unique, flat=flat))
def test_output_shapes(self):
    """ expected output shapes of standard layers """
    Builder()
    StrategyManager().delete_strategy('default')
    StrategyManager().add_strategy(RandomChoiceStrategy(max_epochs=1))

    bs, c1, c2, hw1, hw2 = 4, 4, 8, 32, 16
    s_in = Shape([c1, hw1, hw1])
    x = torch.empty(size=[bs] + s_in.shape)
    case_s1_c1 = (c1, 1, Shape([c1, hw1, hw1]))
    case_s1_c2 = (c2, 1, Shape([c2, hw1, hw1]))
    case_s2_c1 = (c1, 2, Shape([c1, hw2, hw2]))
    case_s2_c2 = (c2, 2, Shape([c2, hw2, hw2]))

    for cls, cases, kwargs in [
        (SkipLayer, [case_s1_c1, case_s1_c2], dict()),
        (ZeroLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict()),
        (FactorizedReductionLayer, [case_s2_c1, case_s2_c2], dict()),
        (PoolingLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_size=3)),
        (ConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_size=3)),
        (SepConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_size=3)),
        (MobileInvertedConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_size=3)),
        (MobileInvertedConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_size=(3,))),
        (MobileInvertedConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2],
         dict(k_size=(3, 5, 7), k_size_in=(1, 1), k_size_out=(1, 1))),
        (FusedMobileInvertedConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2],
         dict(name='mmicl1', k_sizes=(3, 5, 7), k_size_in=(1, 1), k_size_out=(1, 1))),
        (FusedMobileInvertedConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2],
         dict(name='mmicl2', k_sizes=((3, 5), (3, 5, 7)), k_size_in=(1, 1), k_size_out=(1, 1))),
        (ShuffleNetV2Layer, [case_s1_c1, case_s1_c2, case_s2_c2], dict(k_size=3)),
        (ShuffleNetV2XceptionLayer, [case_s1_c1, case_s1_c2, case_s2_c2], dict(k_size=3)),
        (LinearTransformerLayer, [case_s1_c1, case_s1_c2], dict()),
        (SuperConvThresholdLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_sizes=(3, 5, 7))),
        (SuperSepConvThresholdLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_sizes=(3, 5, 7))),
        (SuperMobileInvertedConvThresholdLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2],
         dict(k_sizes=(3, 5, 7), expansions=(3, 6), sse_dict=dict(c_muls=(0.0, 0.25, 0.5)))),
        (SuperConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_sizes=(3, 5, 7), name='scl')),
        (SuperSepConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2], dict(k_sizes=(3, 5, 7), name='sscl')),
        (SuperMobileInvertedConvLayer, [case_s1_c1, case_s1_c2, case_s2_c1, case_s2_c2],
         dict(k_sizes=(3, 5, 7), name='smicl', expansions=(3, 6))),
        (AttentionLayer, [case_s1_c1], dict(att_dict=dict(att_cls='EfficientChannelAttentionModule'))),
        (AttentionLayer, [case_s1_c1], dict(att_dict=dict(att_cls='SqueezeExcitationChannelModule'))),
    ]:
        for c, stride, shape_out in cases:
            m1 = cls(stride=stride, **kwargs)
            s_out = m1.build(s_in, c)
            assert s_out == shape_out, 'Expected output shape does not match, %s, build=%s / expected=%s' %\
                                       (cls.__name__, s_out, shape_out)
            assert_output_shape(m1, x, [bs] + shape_out.shape)
            print('%s(stride=%d, c_in=%d, c_out=%d)' % (cls.__name__, stride, c1, c))
def setup_strategy(self) -> StrategyManager:
    """ set up the strategy for architecture weights """
    tau0 = self._parsed_argument('tau0', self.hparams)
    return StrategyManager().add_strategy(
        GDASStrategy(self.max_epochs, tau0=tau0, use_mask=False))
def _build2(self, s_in: Shape, s_out: Shape) -> ShapeList:
    """ build the network """
    # find the search config
    if not os.path.isfile(self.search_config_path):
        self.search_config_path = Builder.find_net_config_path(self.search_config_path, pattern='search')

    # create a temporary search strategy
    tmp_s = RandomChoiceStrategy(max_epochs=1, name='__tmp__')
    sm = StrategyManager()
    assert len(sm.get_strategies_list()) == 0, "can not load when there already is a search network"
    sm.add_strategy(tmp_s)
    sm.set_fixed_strategy_name('__tmp__')

    # create a search network
    search_net = Register.builder.load_from_config(self.search_config_path)
    assert isinstance(search_net, SearchUninasNetwork)
    search_net.build(s_in, s_out)
    search_net.set_forward_strategy(False)

    # set the architecture, get the config
    req_gene = ""
    if self.gene == 'random':
        search_net.forward_strategy()
        gene = sm.get_all_finalized_indices(unique=True, flat=True)
        self.model_name = "random(%s)" % str(gene)
        req_gene = " (%s)" % self.gene
    else:
        gene = split(self.gene, int)
        l0, l1 = len(sm.get_all_finalized_indices(unique=True)), len(gene)
        assert l0 == l1, "number of unique choices in the network (%d) must match length of the gene (%d)" % (l0, l1)
        search_net.forward_strategy(fixed_arc=gene)
    config = search_net.config(finalize=True)

    # clean up
    sm.delete_strategy('__tmp__')
    del sm
    del search_net

    # build the actually used finalized network
    LoggerManager().get_logger().info("Extracting architecture %s%s from the super-network" % (gene, req_gene))
    self.net = Register.builder.from_config(config)
    return self.net.build(s_in, s_out)
def _initialize_weights(self, net: AbstractModule, logger: logging.Logger):
    assert isinstance(net, AbstractUninasNetwork), "This initializer will not work with external networks!"
    search_config = Builder.find_net_config_path(self.path, pattern='search')

    checkpoint = CheckpointCallback.load_last_checkpoint(self.path)
    state_dict = checkpoint.get('state_dict')

    # figure out correct weights in super-network checkpoint
    if len(self.gene) > 0:
        log_headline(logger, "tmp network to track used params", target_len=80)
        sm = StrategyManager()
        tmp_s = RandomChoiceStrategy(max_epochs=1, name='__tmp__')
        assert len(sm.get_strategies_list()) == 0, "can not load when there already is a search network"
        sm.add_strategy(tmp_s)
        sm.set_fixed_strategy_name('__tmp__')

        search_net = Builder().load_from_config(search_config)
        assert isinstance(search_net, SearchUninasNetwork)
        s_in, s_out = net.get_shape_in(), net.get_shape_out()
        search_net.build(s_in, s_out[0])
        search_net.set_forward_strategy(False)
        search_net.forward_strategy(fixed_arc=self.gene)
        tracker = search_net.track_used_params(s_in.random_tensor(batch_size=2))
        # tracker.print()

        logger.info(' > loading weights of gene %s from checkpoint "%s"' % (str(self.gene), self.path))
        target_dict = net.state_dict()
        target_names = list(target_dict.keys())
        new_dict = {}

        # add all stem and head weights, they are at the front of the dict and have pretty much the same name
        log_columns = [('shape in checkpoint', 'name in checkpoint', 'name in network', 'shape in network')]
        for k, v in state_dict.items():
            if '.stem.' in k or '.heads.' in k:
                tn = target_names.pop(0)
                ts = target_dict[tn].shape
                log_columns.append((str(list(v.shape)), k, tn, str(list(ts))))
                n = k.replace('net.', '', 1)
                assert n == tn
                new_dict[n] = v

        # add all cell weights, can generally not compare names, only shapes
        for i, tracker_cell_entry in enumerate(tracker.get_cells()):
            for entry in tracker_cell_entry.get_pareto_best():
                tn = target_names.pop(0)
                ts = target_dict[tn].shape
                log_columns.append((str(list(entry.shape)), entry.name, tn, str(list(ts))))
                assert entry.shape == ts,\
                    'Mismatching shapes for "%s" and "%s", is the gene correct?' % (entry.name, tn)
                new_dict[tn] = state_dict[entry.name]

        # log matches, load
        log_in_columns(logger, log_columns, add_bullets=True)
        net.load_state_dict(new_dict, strict=self.strict)

        # clean up
        del search_net
        sm.delete_strategy('__tmp__')
        del sm

    # simply load
    else:
        logger.info(' > simply loading state_dict')
        net.load_state_dict(state_dict, strict=self.strict)
def __init__(self, c_in: int, c_out: int, name: str, strategy_name='default', k_sizes=(3, 5),
             c_multipliers=(0.5, 1.0), dilation=1, stride=1, padding='same', groups=-1, bias=False):
    """
    A super-kernel that applies convolution with a masked weight, using architecture weights to figure out
    the best masking, thus kernel size and num output channels.
    Since the architecture weights are applied to the mask rather than generating different outputs,
    this module can be used efficiently for differentiable weight strategies.

    :param c_in: num input channels
    :param c_out: num output channels
    :param name: name under which to register architecture weights
    :param strategy_name: name of the strategy for architecture weights
    :param k_sizes: kernel sizes
    :param c_multipliers: multipliers for the num output channels
    :param dilation: dilation for the kernel
    :param stride: stride for the kernel
    :param padding: 'same' or number
    :param groups: num groups for the kernel, -1 to match c_out
    :param bias: whether to use a bias
    """
    super().__init__()
    self.name_c = '%s/c' % name
    self.name_k = '%s/k' % name
    self.k_sizes = k_sizes
    self.c_multipliers = c_multipliers
    assert max(c_multipliers) <= 1.0, "Can only reduce max channels, choose a higher c_in/c_out"
    self._stride = stride
    self._groups = get_number(groups, c_out)
    self._dilation = dilation
    assert c_in % self._groups == 0

    max_k = max(k_sizes)
    channels = [int(c_out * ci) for ci in sorted(c_multipliers)]
    masks_c, masks_k = [], []

    # arc weights
    self.ws = StrategyManager().make_weight(strategy_name, self.name_k, only_single_path=True,
                                            num_choices=len(k_sizes))
    self.ws = StrategyManager().make_weight(strategy_name, self.name_c, only_single_path=True,
                                            num_choices=len(channels))

    # conv weight
    self._padding = get_padding(padding, max_k, stride, 1)
    self.weight = nn.Parameter(torch.Tensor(c_out, c_in // self._groups, max_k, max_k), requires_grad=True)
    nn.init.kaiming_normal_(self.weight, mode='fan_out')

    # bias
    if bias:
        self.bias = nn.Parameter(torch.Tensor(c_out))
        nn.init.zeros_(self.bias)
    else:
        self.bias = None

    # mask c
    for cs in channels:
        mask = torch.ones(size=(c_out, 1, 1, 1), dtype=self.weight.dtype)
        mask[cs:c_out, :, :, :].zero_()
        masks_c.append(mask)
    self.register_buffer('masks_c', torch.stack(masks_c, dim=0))

    # mask k
    for k in sorted(k_sizes):
        mask = torch.zeros(size=(1, 1, max_k, max_k), dtype=self.weight.dtype)
        dk = (max_k - k) // 2
        if dk == 0:
            mask += 1
        else:
            mask[:, :, dk:-dk, dk:-dk] += 1
        masks_k.append(mask)
    self.register_buffer('masks_k', torch.stack(masks_k, dim=0))
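# --- Standalone illustration (not part of the original source) of the kernel-size masks built above:
# a 7x7 super-kernel weight emulates a 3x3 kernel by zeroing everything outside the centered 3x3 window.
import torch
max_k, k = 7, 3
mask = torch.zeros(size=(1, 1, max_k, max_k))
dk = (max_k - k) // 2
if dk == 0:
    mask += 1
else:
    mask[:, :, dk:-dk, dk:-dk] += 1
print(int(mask.sum()))   # 9 -> only the central 3x3 entries remain active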
class FusedMobileInvertedConvLayer(AbstractLayer, FusedOp):

    def __init__(self, name: str, strategy_name='default', skip_op: str = None,
                 k_size_in=1, k_size_out=1, k_sizes=(3, 5, 7), stride=1, padding='same', expansions=(3, 6),
                 dilation=1, bn_affine=True, act_fun='relu6', act_inplace=True, att_dict: dict = None):
        """
        A fused layer for several kernel sizes and expansion sizes, to share the 1x1 conv weights.
        Currently only designed for having a single kernel+expansion per forward pass and for the final config.

        :param name: name under which to register architecture weights
        :param strategy_name: name of the strategy for architecture weights
        :param skip_op: optional layer name, adds an op that enables skipping the entire block, e.g. "SkipLayer"
        :param k_size_in: kernel size(s) for the first conv kernel (expanding)
        :param k_size_out: kernel size(s) for the last conv kernel (projecting)
        :param k_sizes: kernel sizes for the spatial kernel
        :param stride: stride for the spatial kernel
        :param padding: 'same' or number
        :param expansions: multipliers for inner channels, based on input channels
        :param dilation: dilation for the spatial kernel
        :param bn_affine: affine batch norm
        :param act_fun: activation function
        :param act_inplace: whether to use the activation function in-place if possible (e.g. ReLU)
        :param att_dict: None to disable attention modules, otherwise a dict with respective kwargs
        """
        super().__init__()
        self._add_to_kwargs(name=name, strategy_name=strategy_name, skip_op=skip_op,
                            k_size_in=k_size_in, k_size_out=k_size_out, k_sizes=k_sizes,
                            stride=stride, expansions=expansions, padding=padding, dilation=dilation,
                            bn_affine=bn_affine, act_fun=act_fun, act_inplace=act_inplace, att_dict=att_dict)
        self._add_to_print_kwargs(has_skip=False, has_att=isinstance(self.att_dict, dict))
        self.ws = None
        self.skip = None
        self.pw_in = nn.ModuleList([])
        self.dw_conv = nn.ModuleList([])
        self.dw_att = nn.ModuleList([])
        self.pw_out = nn.ModuleList([])
        self.drop_path = DropPathModule()
        self._choices_by_idx = []

    def _build(self, s_in: Shape, c_out: int) -> Shape:
        conv_kwargs = dict(dilation=self.dilation, padding=self.padding)
        c_in = s_in.num_features()
        self.has_skip = self.stride == 1 and c_in == c_out

        for e in range(len(self.expansions)):
            for k in range(len(self.k_sizes)):
                self._choices_by_idx.append((e, k))
        if self.has_skip and isinstance(self.skip_op, str):
            self.skip = Register.network_layers.get(self.skip_op)()
            self.skip.build(s_in, c_out)
            self._choices_by_idx.append(('skip', 'skip'))
        self.ws = StrategyManager().make_weight(self.strategy_name, self.name, only_single_path=True,
                                                num_choices=len(self._choices_by_idx))

        for e in self.expansions:
            c_mid = int(c_in * e)
            # pw in
            self.pw_in.append(nn.Sequential(
                get_conv2d(c_in, c_mid, k_size=self.k_size_in, groups=1, **conv_kwargs),
                nn.BatchNorm2d(c_mid, affine=self.bn_affine),
                Register.act_funs.get(self.act_fun)(inplace=self.act_inplace),
            ))
            # dw conv ops with different kernel sizes
            convs = nn.ModuleList([])
            for k in self.k_sizes:
                convs.append(nn.Sequential(
                    get_conv2d(c_mid, c_mid, k_size=k, stride=self.stride, groups=-1, **conv_kwargs),
                    nn.BatchNorm2d(c_mid, affine=self.bn_affine),
                    Register.act_funs.get(self.act_fun)(inplace=self.act_inplace),
                ))
            self.dw_conv.append(convs)
            # dw optional attention module
            if self.has_att:
                self.dw_att.append(AbstractAttentionModule.module_from_dict(c_mid, c_substitute=c_in,
                                                                            att_dict=self.att_dict))
            # pw out
            self.pw_out.append(nn.Sequential(
                get_conv2d(c_mid, c_out, k_size=self.k_size_out, groups=1, **conv_kwargs),
                nn.BatchNorm2d(c_out, affine=self.bn_affine),
            ))
        return self.probe_outputs(s_in)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        idx, _ = self.ws.combine_info(self.name)[0]
        idx_e, idx_k = self._choices_by_idx[idx]
        if idx_e == 'skip':
            return x + self.skip(x)
        x2 = self.pw_in[idx_e](x)
        x2 = self.dw_conv[idx_e][idx_k](x2)
        if self.has_att:
            x2 = self.dw_att[idx_e](x2)
        x2 = self.pw_out[idx_e](x2)
        if self.has_skip:
            return x + self.drop_path(x2)
        return x2

    def config(self, finalize=False, **__) -> dict:
        cfg = super().config(finalize=finalize, **__)
        if finalize:
            idxs = self.ws.get_finalized_indices(self.name)
            assert len(idxs) == 1
            idx_e, idx_k = self._choices_by_idx[idxs[0]]
            if idx_e == 'skip':
                return self.skip.config(finalize=finalize, **__)
            cfg['name'] = MobileInvertedConvLayer.__name__
            kwargs = cfg['kwargs']
            for s in ['name', 'strategy_name', 'skip_op']:
                kwargs.pop(s)
            kwargs['k_size'] = kwargs.pop('k_sizes')[idx_k]
            kwargs['expansion'] = kwargs.pop('expansions')[idx_e]
            cfg['kwargs'] = kwargs
        return cfg