def lay_conv2D(
        input,
        name='conv2d',
        kernels=(3, 5, 7),      # layer kernels
        filters=(36, 12, 6),    # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        useBias=True,
        gatedLU=False,          # Gated Linear Unit architecture
        initializer=None,
        seed=12321,
        verbLev=0):

    if initializer is None: initializer = my_initializer(seed)

    with tf.variable_scope(name):
        variables = []
        subOutList = []
        if type(kernels) is not tuple: kernels = (kernels,)
        if verbLev > 0: print(' > %s: kernels %s, filters %s, dilation %s' % (name, kernels, filters, dilation))

        for k in range(len(kernels)):
            with tf.variable_scope('kernel_%d' % k):

                subKernel = kernels[k]
                if type(filters) is not tuple: subFilters = filters // len(kernels)     # int division, filters must be divisible by len(kernels)
                else: subFilters = filters[k]
                if gatedLU: subFilters *= 2

                convLay = tf.layers.Conv2D(
                    filters=subFilters,
                    kernel_size=subKernel,
                    dilation_rate=dilation,
                    activation=None,
                    use_bias=useBias,
                    kernel_initializer=initializer,
                    padding='valid',
                    data_format='channels_last')
                subOutput = convLay(input)
                for var in convLay.variables: variables.append(var)

                if verbLev > 1: print(' >> subConv: filters %s, kernel %s' % (subFilters, subKernel))
                subOutList.append(subOutput)

        output = tf.concat(subOutList, axis=-1)
        if gatedLU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        else:
            if activation: output = activation(output)

        variables = flatten_LOTens(variables)

    return output, variables
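# Below: a minimal usage sketch for lay_conv2D (an illustrative addition, not part of the original module;
# the _example_* name, placeholder shape and sizes are hypothetical). Assumes TF1.x graph mode.
# Note that with padding='valid' different kernel sizes trim the spatial dims differently, so the
# channel-axis concat above only lines up when all sub-kernels share one size; a single kernel is used here.
def _example_lay_conv2D():
    img_PH = tf.placeholder(tf.float32, shape=[None, 32, 32, 3], name='img_PH')  # [bsz, H, W, C]
    out, var_flat = lay_conv2D(
        input=img_PH,
        kernels=3,      # single kernel size
        filters=24)
    return out, var_flat    # out: [bsz, 30, 30, 24], var_flat: layer variables flattened into one 1D tensor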
def lay_conv1D(
        input,
        name='conv1D',
        kernels=(3, 5, 7),      # layer kernels
        filters=(36, 12, 6),    # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        use_bias=True,
        gated_LU=False,         # Gated Linear Unit architecture
        initializer=None,
        padding='valid',        # 'same' adds padding, 'valid' does not
        seed=12321,
        verb=0):

    if initializer is None: initializer = my_initializer(seed)

    with tf.variable_scope(name):
        sub_out_list = []
        if type(kernels) is not tuple: kernels = (kernels,)
        if verb > 1: print(' > %s: kernels %s, filters %s, dilation %s' % (name, kernels, filters, dilation))

        for k in range(len(kernels)):
            with tf.variable_scope('kernel_%d' % k):

                sub_kernel = kernels[k]
                if type(filters) is not tuple: sub_filters = filters // len(kernels)
                else: sub_filters = filters[k]
                if gated_LU: sub_filters *= 2

                conv_lay = tf.layers.Conv1D(
                    filters=sub_filters,
                    kernel_size=sub_kernel,
                    dilation_rate=dilation,
                    activation=None,
                    use_bias=use_bias,
                    kernel_initializer=initializer,
                    padding=padding,
                    data_format='channels_last')
                sub_output = conv_lay(input)

                if verb > 1: print(' >> sub_conv: filters %s, kernel %s' % (sub_filters, sub_kernel))
                sub_out_list.append(sub_output)

        output = tf.concat(sub_out_list, axis=-1)
        if gated_LU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        elif activation: output = activation(output)

    return output
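# Below: a minimal usage sketch for lay_conv1D (illustrative addition; the _example_* name, placeholder
# shape and sizes are hypothetical). Assumes TF1.x graph mode. With padding='same' the sequence length
# is kept and the sub-convolution outputs are concatenated along the feature axis: 36+12+6 = 54 features.
def _example_lay_conv1D():
    seq_PH = tf.placeholder(tf.float32, shape=[None, 100, 32], name='seq_PH')  # [bsz, seq, feats]
    out = lay_conv1D(
        input=seq_PH,
        kernels=(3, 5, 7),
        filters=(36, 12, 6),
        padding='same')
    return out  # [bsz, 100, 54]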
def merge_heads(x):
    # merges the heads axis (-3) back into the feature axis: [..., heads, seq, feats] -> [..., seq, heads*feats]
    x = tf.unstack(x, axis=-3)
    return tf.concat(x, axis=-1)
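# Below: a shape sketch for merge_heads (illustrative addition; the _example_* name and sizes are hypothetical).
def _example_merge_heads():
    x = tf.zeros([2, 8, 10, 16])    # [bsz, heads, seq, feats_per_head]
    return merge_heads(x)           # [2, 10, 8*16] = [2, 10, 128]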
def enc_CNN(
        input: tf.Tensor,
        history: tf.Tensor = None,      # optional history (state) tensor with shape [bsz, n_layers, kernel-1, n_filters], >> masked cnn
        name='enc_CNN',
        # layer params
        shared_lays: bool = False,      # shared variables in enc_layers
        n_layers: int = 12,             # num of layers
        kernel: int = 3,                # layer kernel
        n_filters: int = 128,           # num of filters
        activation=tf.nn.relu,          # global enc activation func, gelu is really worth a try
        lay_drop: float or None = 0.0,
        ldrt_scale: int or None = 0,    # DRT @enc_lay - scale (*) of first dense, for None or 0 DRT @lay won't be built
        ldrt_drop: float or None = 0.0, # DRT @enc_lay - dropout
        # other
        training_flag: tf.Tensor or bool = None,    # dropout training flag tensor
        initializer=None,
        seed: int = 12321,
        n_hist: int = 4,                # number of histogram layers
        verb=0):

    if verb > 0: print(f'\n *** enc_CNN *** Building {name} ({n_layers}x{n_filters})...')

    if initializer is None: initializer = my_initializer(seed)

    # manage history
    history_lays = None
    if history is not None:
        history_lays = tf.unstack(history, axis=-3)
        if verb > 1: print(f' > state_lays len {len(history_lays)} of: {history_lays[0]}')

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(f' > histogram layers of cnn encoder: {hist_layers}')

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        input_lays = []     # here we will store inputs of the following layers to extract the state (history)
        zsL = []            # zeroes

        # input projection - to match n_filters and input width
        if verb > 1: print(f' > encoder input: {input}')
        if input.shape[-1] != n_filters:
            input = lay_dense(
                input=input,
                units=n_filters,
                name='enc_input_projection',
                initializer=initializer)
            if verb > 1: print(f' > encoder projected input: {input}')

        output = input      # for 0 layers case
        sub_output = input  # first input

        for depth in range(n_layers):

            lay_name = f'enc_CNN_lay_{depth}' if not shared_lays else 'enc_CNN_lay_shared'
            if verb > 1: print(f'<< layer {lay_name}:')

            lay_input = tf.concat([history_lays[depth], sub_output], axis=-2) if history_lays else sub_output
            if verb > 1:
                print(f' > sub_output (previous): {sub_output}')
                print(f' > lay_input (possibly padded with history): {lay_input}')
            input_lays.append(lay_input)

            hist_lay = depth in hist_layers

            with tf.variable_scope(lay_name):

                if hist_lay: hist_summ.append(tf.summary.histogram('a_lay_in', lay_input, family=name))

                # LN
                lay_input = tf.keras.layers.LayerNormalization(axis=-1)(lay_input)
                if hist_lay: hist_summ.append(tf.summary.histogram('b_LN', lay_input, family=name))

                # conv, no activation
                output = lay_conv1D(
                    input=lay_input,
                    name='conv1D',
                    kernels=kernel,
                    filters=n_filters,
                    activation=None,
                    initializer=initializer,
                    padding='same' if history is None else 'valid',
                    seed=seed,
                    verb=0)
                if hist_lay: hist_summ.append(tf.summary.histogram('c_cnn', output, family=name))

                # activation
                if activation:
                    output = activation(output)
                    zsL += [zeroes(output)]     # catch zeroes
                    if hist_lay: hist_summ.append(tf.summary.histogram('d_activation', output, family=name))

                # dropout
                if lay_drop:
                    output = tf.layers.dropout(
                        inputs=output,
                        rate=lay_drop,
                        training=training_flag,
                        seed=seed)
                    if hist_lay: hist_summ.append(tf.summary.histogram('e_drop', output, family=name))

                # RES, here we take sub_output, since lay_input may be padded by history
                output += sub_output
                if hist_lay: hist_summ.append(tf.summary.histogram('f_residual', output, family=name))
                if verb > 1: print(f' > output (layer): {output}')

                if ldrt_scale:
                    lay_out = lay_DRT(
                        input=output,
                        name=lay_name + '_lay_DRT',
                        hist_name=name,
                        dns_scale=ldrt_scale,
                        activation=activation,
                        dropout=ldrt_drop,
                        training_flag=training_flag,
                        initializer=initializer,
                        seed=seed)
                    output = lay_out['output']
                    zsL += lay_out['zeroes']
                    if hist_lay: hist_summ.append(lay_out['hist_summ'])

            sub_output = output

        output = tf.keras.layers.LayerNormalization(axis=-1)(output)    # final LN

    # prepare fin_state
    fin_state = None
    if history is not None:
        state = tf.stack(input_lays, axis=-3)
        if verb > 1: print(f' > state (stacked): {state}')
        fin_state = tf.split(state, num_or_size_splits=[-1, kernel - 1], axis=-2)[1]
        if verb > 1: print(f' > fin_state (split): {fin_state}')

    if verb > 1: print(f' > {name} output: {output}')
    return {
        'output':       output,
        'state':        fin_state,  # history for next
        'hist_summ':    hist_summ,
        'zeroes':       zsL}
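# Below: a minimal usage sketch for enc_CNN (illustrative addition; the _example_* name, placeholder shape
# and sizes are hypothetical). Assumes TF1.x graph mode. With history=None the convolutions use 'same'
# padding, so the output keeps the input sequence length; the returned dict also carries 'zeroes' and
# 'hist_summ' for monitoring, while 'state' stays None (it is only built when a history tensor is given).
def _example_enc_CNN():
    seq_PH = tf.placeholder(tf.float32, shape=[None, 50, 64], name='seq_PH')    # [bsz, seq, feats]
    enc = enc_CNN(
        input=seq_PH,
        n_layers=4,
        kernel=3,
        n_filters=128)
    return enc['output']    # [bsz, 50, 128]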
def flatten_LOTens(tList):
    # flattens a list of tensors (e.g. layer variables) into a single 1D tensor
    resh_vars = [tf.reshape(var, [-1]) for var in tList]
    return tf.concat(resh_vars, axis=-1)
def __init__(
        self,
        fwd_func: GRAPH_FUNC,           # function building forward graph (from PH to loss)
        mdict: DNA,                     # model (graph) parameters dictionary
        devices=-1,                     # check neuralmess.dev_manager.ft_devices for details
        do_optimization: bool = True,   # add optimization part to the graph (for training)
        # values below complement mdict
        name='NEM',
        name_timestamp=False,           # adds timestamp to name
        seed=12321,
        opt_class=tf.train.AdamOptimizer,   # default optimizer, other examples: tf.train.GradientDescentOptimizer, partial(tf.train.AdamOptimizer, beta1=0.7, beta2=0.7)
        iLR=1e-3,
        warm_up=None,
        ann_base=None,
        ann_step=1,
        n_wup_off: float = 1,
        avt_SVal=1,
        avt_window=100,
        avt_max_upd=1.5,
        do_clip=False,
        # save
        read_only=False,                # sets model to be read only (it still may log)
        save_TFD: str = SAVE_TFD,       # top folder of model_FD
        savers_names: tuple = (None,),  # names of savers for MultiSaver (TODO: what is this default value for?)
        load_saver: bool or str = True, # for None does not load, for True loads default
        do_logfile=True,                # enables saving log file in save_TFD
        # GPU management
        sep_device=True,                # separate first device for variables, gradients_avg, optimizer (otherwise those are placed on the first FWD calculations tower)
        collocate_GWO=False,            # collocates gradient calculations with tf.OPs (gradients are calculated on every tower with its operations, but remember that vars are on one device...) (otherwise with first FWD calculations tower)
        # other
        verb: int = 0):                 # verb of NEModel (object/constructor), fwd_func has own verb in mdict

    dict.__init__(self)     # init self as a dict
    self.verb = verb
    if self.verb > 0: print('\n*** NEModel *** initializes...')

    self_args_dict = {      # params dict from NEModel constructor
        'name':         name,
        'seed':         seed,
        'opt_class':    opt_class,
        'iLR':          iLR,
        'warm_up':      warm_up,
        'ann_base':     ann_base,
        'ann_step':     ann_step,
        'n_wup_off':    n_wup_off,
        'avt_SVal':     avt_SVal,
        'avt_window':   avt_window,
        'avt_max_upd':  avt_max_upd,
        'do_clip':      do_clip}

    fwdf_mdict = get_defaults(function=fwd_func)    # params dict with defaults of fwd_func

    # resolve model name and extend with timestamp when needed
    resolved_name = self_args_dict['name']
    if 'name' in fwdf_mdict: resolved_name = fwdf_mdict['name']
    if 'name' in mdict: resolved_name = mdict['name']
    if name_timestamp: resolved_name += '.' + stamp()
    mdict['name'] = resolved_name

    self.model_dir = f'{save_TFD}/{mdict["name"]}'  # here goes everything from the model
    if self.verb > 0:
        print(f' > NEModel name: {mdict["name"]}')
        print(f' > NEModel dir: {self.model_dir}')

    # build folder managed dna with empty dna, it gets dna FROM FOLDER
    self.__dna = ParaDict(
        dna_TFD=save_TFD,
        dna_SFD=mdict['name'],
        fn_pfx=NEMODEL_DNA_PFX,
        verb=self.verb)

    # set logfile
    if do_logfile:
        set_logger(
            log_folder=self.model_dir,
            custom_name=mdict['name'],
            verb=self.verb)

    # resolve model dict (dna) in proper order
    md = {}
    md.update(self_args_dict)   # 1: update with params dict from NEModel constructor
    md.update(fwdf_mdict)       # 2: update with defaults of fwd_func
    md.update(self.__dna)       # 3: update with params from folder
    md.update(mdict)            # 4: update with given mdict
    self.__dna.update(md)
    self.__dna.check_params_sim(SPEC_KEYS)  # safety check

    self.readonly = read_only
    if self.model_dir and not self.readonly: self.__dna.save()
    self.update(self.__dna)     # finally update self with all model building params

    devices = tf_devices(devices, verb=self.verb)

    # report devices
    if self.verb > 0:
        print()
        if len(devices) == 1:
            if 'CPU' in devices[0]: print(f'NEModel builds CPU device setup')
            else: print(f'NEModel builds single-GPU setup')
        else: print(f'NEModel builds multi-dev setup for {len(devices)} devices')

    if len(devices) < 3: sep_device = False     # SEP is available for 3 or more devices

    # build FWD graph(s) >> manage variables >> build OPT graph
    self.gFWD = []  # list of dicts of all FWD graphs (from all devices)
    self.graph = tf.Graph()
    with self.graph.as_default():

        tf.set_random_seed(self['seed'])    # set graph seed
        np.random.seed(self['seed'])
        if self.verb > 0: print(f'\nNEModel set TF & NP seed to {self["seed"]}')

        # builds graph @SEP, this graph won't be run, it is only needed to place variables,
        # if not vars_sep >> variables will be placed with first tower
        if sep_device:
            if self.verb > 0: print(f'\nNEModel places {self["name"]} VARs on {devices[0]}...')
            with tf.device(devices[0]):
                fwd_func(**self)

        tower_devices = [] + devices
        if sep_device: tower_devices = tower_devices[1:]    # trim SEP
        for dev in tower_devices:
            if self.verb > 0: print(f'\nNEModel builds FWD graph of {self["name"]} model @device: {dev}')
            with tf.device(dev):
                with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                    self.gFWD.append(fwd_func(**self))

        self.update(self.gFWD[0])   # update self with dictionary returned by fwd_func

        # get FWD variables returned by fwd_func (for saver)
        train_vars = []     # variables to train
        saver_vars = {}     # dict of variables to save
        for key in self.keys():
            if 'var' in key.lower():
                if key == 'train_vars':
                    train_vars = self[key]
                    if type(train_vars) is not list: train_vars = [train_vars]
                else:
                    if type(self[key]) is not list: saver_vars[key] = [self[key]]
                    else: saver_vars[key] = self[key]

        all_vars = tf.global_variables()

        # if fwd_func returned any variables >> assert that all graph variables are covered by those lists
        if saver_vars:
            all_vars_returned = []
            for key in saver_vars: all_vars_returned += saver_vars[key]
            there_are_all = True
            for var in all_vars:
                if var not in all_vars_returned:
                    print(f' *** variable {var.name} not returned by fwd_func')
                    there_are_all = False
            assert there_are_all, 'ERR: there are some variables not returned by fwd_func in lists!'
        else: saver_vars['fwd_vars'] = all_vars     # put all

        if self.verb > 0:
            print('\nNEModel variables to save from fwd_func:')
            for key in sorted(list(saver_vars.keys())):
                varList = saver_vars[key]
                if varList: print(f' ### vars @{key} - num: {len(varList)}, floats: {short_scin(num_var_floats(varList))} ({varList[0].device})')
                else: print(' ### no vars')
                if self.verb > 1: log_vars(varList)

        if 'loss' not in self:
            do_optimization = False
            if self.verb > 0: print("\nthere is no loss in FWD graph, OPT graph won't be built")

        if not do_optimization:
            if self.verb > 0: print("\nOPT graph won't be built")

        # build optimization graph
        else:
            if self.verb > 0: print(f'\nPreparing OPT part with {self["opt_class"]}')

            # select trainable variables for OPT
            all_tvars = tf.trainable_variables()
            if train_vars:
                # check if all train_vars are trainable
                for var in train_vars:
                    if var not in all_tvars:
                        if self.verb > 0: print(f'variable {var.name} is not trainable but is in train_vars, please check the graph!')
            else:
                for key in saver_vars:
                    for var in saver_vars[key]:
                        if var in all_tvars: train_vars.append(var)
            assert train_vars, 'ERR: there are no trainable variables at the graph!'

            # log train_vars
            if self.verb > 0:
                print('\nNEModel trainable variables:')
                print(f' ### train_vars: {len(train_vars)} floats: {short_scin(num_var_floats(train_vars))}')
                if self.verb > 1: log_vars(train_vars)

            # build gradients for towers
            for ix in range(len(self.gFWD)):
                tower = self.gFWD[ix]
                tower['gradients'] = tf.gradients(
                    ys=tower['loss'],
                    xs=train_vars,
                    colocate_gradients_with_ops=not collocate_GWO)  # TF default is False >> calculates gradients where OPS, for True >> where train_vars

                # log gradients
                if self.verb > 0:
                    nGrad = len(tower['gradients'])
                    # None_as_gradient case
                    device = 'UNKNOWN'
                    for t in tower['gradients']:
                        if t is not None:
                            device = t.device
                            break
                    print(f' > gradients for {ix} tower got {nGrad} tensors ({device})')

                    if self.verb > 1:
                        print('NEModel variables and their gradients:')
                        for gix in range(len(tower['gradients'])):
                            grad = tower['gradients'][gix]
                            var = train_vars[gix]
                            print(var, var.device)
                            print(f' > {grad}')     # grad as a tensor displays device when printed (unless colocated with OP!)

            self['gradients'] = self.gFWD[0]['gradients']

            # None @gradients check
            none_grads = 0
            for grad in self['gradients']:
                if grad is None: none_grads += 1
            if none_grads and self.verb > 0:
                print(f'There are None gradients: {none_grads}/{len(self["gradients"])}, some trainVars may be unrelated to loss, please check the graph!')

            # average gradients
            if len(devices) > 1:
                if self.verb > 0: print(f'\nNEModel builds gradients averaging graph with device {devices[0]} for {len(self.gFWD)} towers')
                with tf.device(devices[0]):
                    towerGrads = [tower['gradients'] for tower in self.gFWD]
                    avgGrads = []
                    for mGrads in zip(*towerGrads):
                        grads = []
                        for grad in mGrads:
                            if grad is not None:    # None for variables not used while training now...
                                expandedG = tf.expand_dims(input=grad, axis=-1)
                                grads.append(expandedG)
                        if grads:
                            grad = tf.concat(values=grads, axis=-1)
                            grad = tf.reduce_mean(input_tensor=grad, axis=-1)
                            avgGrads.append(grad)
                        else: avgGrads.append(None)

                    self['gradients'] = avgGrads    # update with averaged gradients
                    if self.verb > 0: print(f' > NEModel averaged gradients ({self["gradients"][0].device})')

            # build OPT graph
            with tf.variable_scope('OPT', reuse=tf.AUTO_REUSE):

                if self.verb > 0: print(f'\nBuilding OPT graph for {self["name"]} model @device: {devices[0]}')
                with tf.device(devices[0]):

                    self['g_step'] = tf.get_variable(   # global step
                        name='g_step',
                        shape=[],
                        trainable=False,
                        initializer=tf.constant_initializer(0),
                        dtype=tf.int32)

                    self['iLR_var'] = tf.get_variable(  # base LR variable
                        name='iLR',
                        shape=[],
                        trainable=False,
                        initializer=tf.constant_initializer(self['iLR']),
                        dtype=tf.float32)

                    self['scaled_LR'] = lr_scaler(
                        iLR=self['iLR_var'],
                        g_step=self['g_step'],
                        warm_up=self['warm_up'],
                        ann_base=self['ann_base'],
                        ann_step=self['ann_step'],
                        n_wup_off=self['n_wup_off'],
                        verb=self.verb)['scaled_LR']

                    # updates with: optimizer, gg_norm, avt_gg_norm
                    self.update(gc_loss_reductor(
                        optimizer=self['opt_class'](learning_rate=self['scaled_LR']),
                        vars=train_vars,
                        g_step=self['g_step'],
                        gradients=self['gradients'],
                        avt_SVal=self['avt_SVal'],
                        avt_window=self['avt_window'],
                        avt_max_upd=self['avt_max_upd'],
                        do_clip=self['do_clip'],
                        verb=self.verb))

                    # select OPT vars
                    saver_vars['opt_vars'] = tf.global_variables(scope=tf.get_variable_scope().name)
                    if self.verb > 0:
                        print(f' ### opt_vars: {len(saver_vars["opt_vars"])} floats: {short_scin(num_var_floats(saver_vars["opt_vars"]))} ({saver_vars["opt_vars"][0].device})')
                        if self.verb > 1: log_vars(saver_vars['opt_vars'])

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(graph=self.graph, config=config)

        # remove keys with no variables (corner case, for proper saver)
        sKeys = list(saver_vars.keys())
        for key in sKeys:
            if not saver_vars[key]: saver_vars.pop(key)

        # TODO: saver_vars, savers_names, load_saver - need a little refactor!!!
        # add saver and load
        self.__saver = MultiSaver(
            model_name=self['name'],
            vars=saver_vars,
            save_TFD=save_TFD,
            savers=savers_names,
            session=self.session,
            verb=self.verb)
        if load_saver:
            if type(load_saver) is bool: load_saver = None
            self.__saver.load(saver=load_saver)

    self.update_LR(self['iLR'])     # safety update of iLR

    self.__summ_writer = tf.summary.FileWriter(
        logdir=self.model_dir,
        # graph=self.graph, # you can call add_graph() later
        flush_secs=10) if not self.readonly else None

    if self.verb > 0: print(f'{self["name"]} (NEModel) build finished!')
    if self.verb > 2: print(self)
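# Below: a minimal usage sketch for the constructor above (illustrative addition, assuming it belongs to the
# NEModel class, which subclasses dict; my_fwd_graph, its placeholders and the 'hidden' parameter are
# hypothetical). It only relies on what the constructor shows: fwd_func is called with the model parameters
# as keyword arguments (fwd_func(**self)) and returns a dict of graph nodes that gets merged into the model;
# a 'loss' key enables building the OPT graph.
def my_fwd_graph(
        name='my_model',
        hidden=64,      # hypothetical fwd_func parameter, its default lands in the model dna via get_defaults()
        **kwargs):      # catches the remaining NEModel params passed with fwd_func(**self)
    in_PH = tf.placeholder(tf.float32, shape=[None, 10], name='in_PH')
    lbl_PH = tf.placeholder(tf.int32, shape=[None], name='lbl_PH')
    hid = tf.nn.relu(lay_dense(input=in_PH, units=hidden, name='hidden'))
    logits = lay_dense(input=hid, units=2, name='logits')
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=lbl_PH, logits=logits))
    return {'in_PH': in_PH, 'lbl_PH': lbl_PH, 'logits': logits, 'loss': loss}

# model = NEModel(fwd_func=my_fwd_graph, mdict={'name': 'my_model', 'hidden': 128}, verb=1)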