def lay_conv2D(
        input,
        name='conv2d',
        kernels=(3, 5, 7),      # layer kernels
        filters=(36, 12, 6),    # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        useBias=True,
        gatedLU=False,          # Gated Linear Unit architecture
        initializer=None,
        seed=12321,
        verbLev=0):

    if initializer is None: initializer = my_initializer(seed)
    with tf.variable_scope(name):
        variables = []
        subOutList = []
        if type(kernels) is not tuple: kernels = (kernels,)
        if verbLev > 0: print(' > %s: kernels %s, filters %s, dilation %s' % (name, kernels, filters, dilation))
        for k in range(len(kernels)):
            with tf.variable_scope('kernel_%d' % k):
                subKernel = kernels[k]
                if type(filters) is not tuple: subFilters = filters // len(kernels)  # int division keeps filters an int
                else:                          subFilters = filters[k]
                if gatedLU: subFilters *= 2
                convLay = tf.layers.Conv2D(
                    filters=subFilters,
                    kernel_size=subKernel,
                    dilation_rate=dilation,
                    activation=None,
                    use_bias=useBias,
                    kernel_initializer=initializer,
                    padding='valid',
                    data_format='channels_last')
                subOutput = convLay(input)
                for var in convLay.variables: variables.append(var)
                if verbLev > 1: print(' >> subConv: filters %s, kernel %s' % (subFilters, subKernel))
                subOutList.append(subOutput)
        output = tf.concat(subOutList, axis=-1)
        if gatedLU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        else:
            if activation: output = activation(output)
        variables = flatten_LOTens(variables)
    return output, variables
def lay_conv1D(
        input,
        name='conv1D',
        kernels=(3, 5, 7),      # layer kernels
        filters=(36, 12, 6),    # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        use_bias=True,
        gated_LU=False,         # Gated Linear Unit architecture
        initializer=None,
        padding='valid',        # 'same' adds padding, 'valid' does not
        seed=12321,
        verb=0):

    if initializer is None: initializer = my_initializer(seed)
    with tf.variable_scope(name):
        sub_out_list = []
        if type(kernels) is not tuple: kernels = (kernels,)
        if verb > 1: print(' > %s: kernels %s, filters %s, dilation %s' % (name, kernels, filters, dilation))
        for k in range(len(kernels)):
            with tf.variable_scope('kernel_%d' % k):
                sub_kernel = kernels[k]
                if type(filters) is not tuple: sub_filters = filters // len(kernels)
                else:                          sub_filters = filters[k]
                if gated_LU: sub_filters *= 2
                conv_lay = tf.layers.Conv1D(
                    filters=sub_filters,
                    kernel_size=sub_kernel,
                    dilation_rate=dilation,
                    activation=None,
                    use_bias=use_bias,
                    kernel_initializer=initializer,
                    padding=padding,
                    data_format='channels_last')
                sub_output = conv_lay(input)
                if verb > 1: print(' >> sub_conv: filters %s, kernel %s' % (sub_filters, sub_kernel))
                sub_out_list.append(sub_output)
        output = tf.concat(sub_out_list, axis=-1)
        if gated_LU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        elif activation:
            output = activation(output)
    return output
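# A minimal usage sketch (not part of the original module): builds lay_conv1D over a
# [bsz, seq, feats] placeholder. Assumes TF 1.x (or tf.compat.v1 with v2 behaviour
# disabled) and the same `tf` import used above; _demo_lay_conv1D is a hypothetical
# helper name. An explicit initializer is passed so my_initializer() is not required.
def _demo_lay_conv1D():
    x = tf.placeholder(tf.float32, shape=[None, 20, 16], name='demo_conv_in')  # [bsz, seq, feats]
    out = lay_conv1D(
        input=x,
        name='demo_conv1D',
        kernels=(3, 5),             # two parallel kernels
        filters=32,                 # 32 // 2 = 16 filters per kernel, concatenated back to 32
        activation=tf.nn.relu,
        initializer=tf.truncated_normal_initializer(stddev=0.01),
        padding='same')             # keeps the sequence length
    return out                      # Tensor of shape [None, 20, 32]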
def decN(
        input,
        dictW,
        predN=1,            # N samples for every feature
        name='decN',
        hLays=None,         # tuple or list of ints
        hActiv=tf.nn.relu,
        initializer=None,
        seed=12321,
        verbLev=0):

    if verbLev > 0: print('\nBuilding decoderN ...')
    if verbLev > 1: print('decoder input:', input)
    if initializer is None: initializer = my_initializer(seed)
    with tf.variable_scope(name):

        # hidden layers
        if hLays:
            for nLay in range(len(hLays)):
                laySize = hLays[nLay]
                input = lay_dense(
                    input=input,
                    units=laySize,
                    activation=hActiv,
                    use_bias=True,
                    initializer=initializer,
                    seed=seed,
                    name='decoderN_Hlay_%s' % nLay)

        # projection to predN x dictW
        logits = lay_dense(
            input=input,
            units=predN * dictW,
            activation=None,
            use_bias=True,
            initializer=initializer,
            seed=seed,
            name='decoderNProjection')
        if verbLev > 1: print(' > projection to logits (%dx dictW):' % predN, logits)

        if predN > 1:
            logits = tf.reshape(logits, [tf.shape(logits)[0], -1, dictW])
            if verbLev > 1: print(' > reshaped logits (B,%dxS,dictW):' % predN, logits)

        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)

    if verbLev > 1: print(' > predictions:', predictions)
    return logits, predictions
def lay_dense(
        input,
        units: int,             # layer width
        name='dense',
        reuse=False,
        activation=None,
        use_bias=True,
        initializer=None,
        seed=12321):

    if initializer is None: initializer = my_initializer(seed)
    dense_lay = tf.layers.Dense(
        units=units,
        activation=activation,
        use_bias=use_bias,
        kernel_initializer=initializer,
        name=name,
        _reuse=reuse)
    output = dense_lay(input)
    return output
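# A minimal usage sketch (hypothetical, not part of the original module): projects a
# feature tensor with lay_dense under AUTO_REUSE, the same pattern cards_enc below uses
# for its 'c_proj' input projection. An explicit initializer avoids the my_initializer()
# dependency.
def _demo_lay_dense():
    feats = tf.placeholder(tf.float32, shape=[None, 24], name='demo_dense_in')
    proj = lay_dense(
        input=feats,
        units=64,
        name='demo_proj',
        reuse=tf.AUTO_REUSE,        # reuse pattern as in cards_enc ('c_proj')
        use_bias=False,
        initializer=tf.truncated_normal_initializer(stddev=0.01))
    return proj                     # Tensor of shape [None, 64]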
def cards_enc(
        train_flag,                 # train flag (bool tensor)
        c_ids,                      # seven cards (ids tensor)
        tat_case: bool = False,     # task attention transformer architecture
        emb_width: int = 24,        # cards embedding width
        t_drop: float = 0,
        f_drop: float = 0,
        in_proj: int = None,
        n_layers: int = 8,
        dense_mul: int = 4,         # transformer dense multiplication
        activation=tf.nn.relu,
        dropout: float = 0,         # transformer dropout
        seed=12321,
        verb=0):

    if verb > 0: print('\nBuilding card encoder...')
    with tf.variable_scope('cards_enc'):

        zsL = []
        hist_summ = []

        c_emb = tf.get_variable(    # cards embeddings
            name='c_emb',
            shape=[53, emb_width],  # one card for 'no_card'
            dtype=tf.float32,
            initializer=my_initializer(seed=seed))
        hist_summ += [tf.summary.histogram('1.c_emb', c_emb, family='c_emb')]

        c_emb_look = tf.nn.embedding_lookup(params=c_emb, ids=c_ids)
        if verb > 1: print(' > 1.c_emb_look:', c_emb_look)

        myc_emb = tf.get_variable(  # my cards embeddings
            name='myc_emb',
            shape=[2, c_emb.shape[-1]],
            dtype=tf.float32,
            initializer=my_initializer(seed=seed))

        myc_emb_look = tf.nn.embedding_lookup(params=myc_emb, ids=[0, 0, 1, 1, 1, 1, 1])
        if verb > 1: print(' > myc_emb_look:', myc_emb_look)

        input = c_emb_look + myc_emb_look

        if t_drop or f_drop:
            input = tf_drop(
                input=input,
                time_drop=t_drop,
                feat_drop=f_drop,
                train_flag=train_flag,
                seed=seed)

        # input projection (without activation)
        if in_proj:
            input = lay_dense(
                input=input,
                units=in_proj,
                name='c_proj',
                reuse=tf.AUTO_REUSE,
                use_bias=False,
                seed=seed)
            if verb > 1: print(' > input projected:', input)
        elif verb > 1: print(' > input:', input)

        enc_out = enc_TNS(
            in_seq=input,
            name='TAT' if tat_case else 'TNS',
            seq_out=not tat_case,
            add_PE=False,
            n_blocks=n_layers,
            n_heads=1,
            dense_mul=dense_mul,
            activation=activation,
            max_seq_len=7,
            dropout=dropout,
            dropout_att=0,
            drop_flag=train_flag,
            seed=seed,
            n_hist=3,
            verb=verb)
        output = enc_out['output']
        zsL += enc_out['zeroes']
        hist_summ += enc_out['hist_summ']

        if not tat_case:
            output = tf.unstack(output, axis=-2)
            output = tf.concat(output, axis=-1)
            if verb > 1: print(' > encT reshaped output:', output)
        elif verb > 1: print(' > encT output:', output)

        enc_vars = tf.global_variables(scope=tf.get_variable_scope().name)

    return {
        'output':    output,
        'enc_vars':  enc_vars,
        'hist_summ': hist_summ,
        'zeroes':    zsL}
def enc_DRT(
        input,
        name='enc_DRT',
        shared_lays: bool = False,  # shared variables in enc_layers
        n_layers=12,
        lay_width: int = None,      # for None matches input width
        dns_scale=6,                # scale (*) of first dense
        activation=tf.nn.relu,      # gelu is really worth a try
        dropout=0.0,                # dropout after two denses
        training_flag=None,         # training flag tensor (for dropout)
        initializer=None,
        seed=12321,
        n_hist=4,                   # number of histogram layers (for TB)
        verb=0):

    lay_width_matched = ''
    if lay_width is None:
        lay_width = input.shape.as_list()[-1]
        lay_width_matched = '(lay_width taken from input width)'
    if verb > 0:
        drp = 0.0 if not dropout else dropout
        print(f'\nBuilding DRTencoder ({n_layers}x{lay_width} drop:{drp:.2f}) {lay_width_matched}...')

    if initializer is None: initializer = my_initializer(seed)

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(' > histogram layers of DRTencoder:', hist_layers)

    zsL = []  # zeroes list
    with tf.variable_scope(name):

        # input projection
        iW = input.shape[-1]
        if iW != lay_width:
            input = lay_dense(
                input=input,
                units=lay_width,
                use_bias=False,
                initializer=initializer,
                seed=seed)
            if verb > 0: print('projected input to layWidth(%d) since it differs(%d)' % (lay_width, iW))

        input = tf.keras.layers.LayerNormalization(axis=-1)(input)  # input layer_norm

        output = input  # for 0 layers case
        for nL in range(n_layers):
            lay_name = f'DRLay_{nL}' if not shared_lays else 'DRLay_shared'
            lay_out = lay_DRT(
                input=output,
                name=lay_name,
                hist_name=name,
                dns_scale=dns_scale,
                activation=activation,
                dropout=dropout,
                training_flag=training_flag,
                initializer=initializer,
                seed=seed)
            output = lay_out['output']
            if nL in hist_layers: hist_summ.append(lay_out['hist_summ'])
            zsL += lay_out['zeroes']

    return {'output': output, 'hist_summ': hist_summ, 'zeroes': zsL}
def lay_DRT(
        input,
        name='lay_DRT',             # scope name, be careful when stacked since auto_reuse
        hist_name=None,             # family name of histogram
        dns_scale=4,
        activation=tf.nn.relu,      # gelu is really worth a try
        dropout=None,               # dropout (after two denses)
        training_flag=None,         # training flag tensor (for dropout)
        initializer=None,
        seed=12321):

    if not hist_name: hist_name = name
    lay_width = input.shape[-1]
    if initializer is None: initializer = my_initializer(seed)
    hist_summ = []

    with tf.variable_scope(name_or_scope=name, reuse=tf.AUTO_REUSE):

        hist_summ.append(tf.summary.histogram('a_denseSin', input, family=hist_name))

        # dense (scale up)
        output = lay_dense(
            input=input,
            units=int(lay_width * dns_scale),
            activation=None,
            use_bias=True,
            initializer=initializer,
            seed=seed,
            name='denseS')
        hist_summ.append(tf.summary.histogram('b_denseSout', output, family=hist_name))

        # activation
        output = activation(output)
        zsL = [zeroes(output)]  # zeroes list
        hist_summ.append(tf.summary.histogram('c_activation', output, family=hist_name))

        # dense (scale down), no activation
        output = lay_dense(
            input=output,
            units=lay_width,
            name='DRTdenseNA',
            use_bias=True,
            initializer=initializer,
            seed=seed)
        hist_summ.append(tf.summary.histogram('d_denseNAout', output, family=hist_name))

        # layer dropout
        if dropout:
            output = tf.layers.dropout(
                inputs=output,
                rate=dropout,
                training=training_flag,
                seed=seed)

        # residual
        output = lay_res(input, output)
        hist_summ.append(tf.summary.histogram('e_residual', output, family=hist_name))

        # layer_norm
        output = tf.keras.layers.LayerNormalization(axis=-1)(output)
        hist_summ.append(tf.summary.histogram('f_LAYout', output, family=hist_name))

    return {'output': output, 'hist_summ': hist_summ, 'zeroes': zsL}
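# A minimal usage sketch (hypothetical): a single DRT block (dense up-scale, activation,
# dense down-scale, residual, layer_norm) over a [bsz, seq, width] tensor, with dropout
# driven by a bool placeholder. Assumes the module helpers zeroes() and lay_res() are
# available, as they are for the code above; _demo_lay_DRT is an assumed name.
def _demo_lay_DRT():
    x = tf.placeholder(tf.float32, shape=[None, 10, 64], name='demo_drt_in')
    train_flag = tf.placeholder(tf.bool, shape=[], name='demo_train_flag')
    drt_out = lay_DRT(
        input=x,
        name='demo_lay_DRT',
        dns_scale=4,                # first dense widens 64 -> 256
        activation=tf.nn.relu,
        dropout=0.1,
        training_flag=train_flag,
        initializer=tf.truncated_normal_initializer(stddev=0.01))
    return drt_out['output']        # [None, 10, 64], same width as the input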
def enc_CNN(
        input: tf.Tensor,
        history: tf.Tensor = None,          # optional history (state) tensor with shape [bsz, n_layers, kernel-1, n_filters] >> masked cnn
        name='enc_CNN',
        # layer params
        shared_lays: bool = False,          # shared variables in enc_layers
        n_layers: int = 12,                 # num of layers
        kernel: int = 3,                    # layer kernel
        n_filters: int = 128,               # num of filters
        activation=tf.nn.relu,              # global enc activation func, gelu is really worth a try
        lay_drop: float or None = 0.0,
        ldrt_scale: int or None = 0,        # DRT @enc_lay - scale (*) of first dense, for None or 0 DRT @lay won't be built
        ldrt_drop: float or None = 0.0,     # DRT @enc_lay - dropout
        # other
        training_flag: tf.Tensor or bool = None,  # dropout training flag tensor
        initializer=None,
        seed: int = 12321,
        n_hist: int = 4,                    # number of histogram layers
        verb=0):

    if verb > 0: print(f'\n *** enc_CNN *** Building {name} ({n_layers}x{n_filters})...')
    if initializer is None: initializer = my_initializer(seed)

    # manage history
    history_lays = None
    if history is not None:
        history_lays = tf.unstack(history, axis=-3)
        if verb > 1: print(f' > state_lays len {len(history_lays)} of: {history_lays[0]}')

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(f' > histogram layers of cnn encoder: {hist_layers}')

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        input_lays = []  # here we will store inputs of the following layers to extract the state (history)
        zsL = []         # zeroes

        # input projection - to match n_filters and input width
        if verb > 1: print(f' > encoder input: {input}')
        if input.shape[-1] != n_filters:
            input = lay_dense(
                input=input,
                units=n_filters,
                name='enc_input_projection',
                initializer=initializer)
            if verb > 1: print(f' > encoder projected input: {input}')

        output = input      # for 0 layers case
        sub_output = input  # first input

        for depth in range(n_layers):

            lay_name = f'enc_CNN_lay_{depth}' if not shared_lays else 'enc_CNN_lay_shared'
            if verb > 1: print(f'<< layer {lay_name}:')

            lay_input = tf.concat([history_lays[depth], sub_output], axis=-2) if history_lays else sub_output
            if verb > 1:
                print(f' > sub_output (previous): {sub_output}')
                print(f' > lay_input (eventually padded): {lay_input}')
            input_lays.append(lay_input)

            hist_lay = depth in hist_layers

            with tf.variable_scope(lay_name):

                if hist_lay: hist_summ.append(tf.summary.histogram('a_lay_in', lay_input, family=name))

                # LN
                lay_input = tf.keras.layers.LayerNormalization(axis=-1)(lay_input)
                if hist_lay: hist_summ.append(tf.summary.histogram('b_LN', lay_input, family=name))

                # conv, no activation
                output = lay_conv1D(
                    input=lay_input,
                    name='conv1D',
                    kernels=kernel,
                    filters=n_filters,
                    activation=None,
                    initializer=initializer,
                    padding='same' if history is None else 'valid',
                    seed=seed,
                    verb=0)
                if hist_lay: hist_summ.append(tf.summary.histogram('c_cnn', output, family=name))

                # activation
                if activation:
                    output = activation(output)
                    zsL += [zeroes(output)]  # catch zeroes
                    if hist_lay: hist_summ.append(tf.summary.histogram('d_activation', output, family=name))

                # dropout
                if lay_drop:
                    output = tf.layers.dropout(
                        inputs=output,
                        rate=lay_drop,
                        training=training_flag,
                        seed=seed)
                    if hist_lay: hist_summ.append(tf.summary.histogram('e_drop', output, family=name))

                # RES, here we take sub_output, since lay_input may be padded by history
                output += sub_output
                if hist_lay: hist_summ.append(tf.summary.histogram('f_residual', output, family=name))
                if verb > 1: print(f' > output (layer): {output}')

                if ldrt_scale:
                    lay_out = lay_DRT(
                        input=output,
                        name=lay_name + '_lay_DRT',
                        hist_name=name,
                        dns_scale=ldrt_scale,
                        activation=activation,
                        dropout=ldrt_drop,
                        training_flag=training_flag,
                        initializer=initializer,
                        seed=seed)
                    output = lay_out['output']
                    zsL += lay_out['zeroes']
                    if hist_lay: hist_summ.append(lay_out['hist_summ'])

            sub_output = output

        output = tf.keras.layers.LayerNormalization(axis=-1)(output)  # final LN

        # prepare fin_state
        fin_state = None
        if history is not None:
            state = tf.stack(input_lays, axis=-3)
            if verb > 1: print(f' > state (stacked): {state}')
            fin_state = tf.split(state, num_or_size_splits=[-1, kernel - 1], axis=-2)[1]
            if verb > 1: print(f' > fin_state (split): {fin_state}')

    if verb > 1: print(f' > {name} output: {output}')
    return {
        'output':    output,
        'state':     fin_state,  # history for next
        'hist_summ': hist_summ,
        'zeroes':    zsL}
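# A minimal usage sketch (hypothetical): runs the CNN encoder with an explicit history
# (state) tensor, the same way cnn_DMG below wires it. With history given, each layer is
# left-padded with kernel-1 steps of state and convolved with 'valid' padding, so the
# sequence length is preserved and a new state of shape [bsz, n_layers, kernel-1, n_filters]
# is returned. Assumes my_initializer(), zeroes() and list_of_layers() are available.
def _demo_enc_CNN():
    n_lay, kernel, width = 6, 3, 128
    x = tf.placeholder(tf.float32, shape=[None, None, width], name='demo_cnn_in')      # [bsz, seq, width]
    state_PH = tf.placeholder(tf.float32, shape=[None, n_lay, kernel - 1, width])      # per-layer history
    enc = enc_CNN(
        input=x,
        history=state_PH,
        n_layers=n_lay,
        kernel=kernel,
        n_filters=width,
        ldrt_scale=0,               # no DRT sub-layer
        n_hist=0)
    return enc['output'], enc['state']  # [bsz, seq, width], [bsz, n_lay, kernel-1, width]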
def cnn_DMG(
        name: str,
        train_ce: bool = True,      # train cards encoder
        c_embW: int = 12,           # card emb width >> makes network width (x7)
        n_lay=12,                   # number of CNNR layers >> makes network deep (>> context length)
        width=None,                 # representation width (number of filters), for None uses cards_encoded_width
        activation=tf.nn.relu,
        opt_class=partial(tf.compat.v1.train.AdamOptimizer, beta1=0.7, beta2=0.7),
        iLR=3e-5,
        warm_up=100,                # num of steps has to be small (since we do rare updates)
        avt_SVal=0.04,
        avt_window=20,
        do_clip=True,
        verb=0,
        **kwargs):

    if verb > 0: print(f'\nBuilding {name} cnn_DMG (graph)...')

    with tf.variable_scope(name):

        n_hands = tf.get_variable(  # number of hands while learning
            name='n_hands',
            shape=[],
            trainable=False,
            initializer=tf.constant_initializer(0),
            dtype=tf.int32)

        cards_PH = tf.placeholder(  # 7 cards placeholder
            name='cards_PH',
            dtype=tf.int32,
            shape=[None, None, 7])  # [bsz,seq,7cards]
        train_PH = tf.placeholder(  # train placeholder
            name='train_PH',
            dtype=tf.bool,
            shape=[])

        ce_out = cards_enc(
            train_flag=train_PH,
            c_ids=cards_PH,
            emb_width=c_embW)
        cards_encoded = ce_out['output']
        enc_vars = ce_out['enc_vars']
        enc_zsL = ce_out['zeroes']
        if verb > 1: print(' ### num of enc_vars (%d) %s' % (len(enc_vars), short_scin(num_var_floats(enc_vars))))
        if verb > 1: print(' > cards encoded:', cards_encoded)

        switch_PH = tf.placeholder( # switch placeholder
            name='switch_PH',
            dtype=tf.int32,         # 0 for move, 1 for cards
            shape=[None, None, 1])  # [bsz,seq,1]

        event_PH = tf.placeholder(  # event id placeholder
            name='event_PH',
            dtype=tf.int32,
            shape=[None, None])     # [bsz,seq]

        n_events = 1 + N_TABLE_PLAYERS + len(TBL_MOV) * (N_TABLE_PLAYERS - 1)
        event_emb = tf.get_variable( # event type embeddings
            name='event_emb',
            shape=[n_events, cards_encoded.shape[-1]],
            dtype=tf.float32,
            initializer=my_initializer())
        event_in = tf.nn.embedding_lookup(params=event_emb, ids=event_PH)
        if verb > 1: print(' > event_in:', event_in)

        # tried with tf.where and switching inputs, but speed was the same...
        switch = tf.cast(switch_PH, dtype=tf.float32)
        input = switch * cards_encoded + (1 - switch) * event_in
        if verb > 1: print(' > input (merged):', input)

        # projection without activation and bias
        if width:
            input = lay_dense(
                input=input,
                units=width,
                use_bias=False)
            if verb > 1: print(' > projected input (projected):', input)
        else:
            width = cards_encoded.shape[-1]

        # layer_norm
        sub_output = tf.contrib.layers.layer_norm(
            inputs=input,
            begin_norm_axis=-1,
            begin_params_axis=-1)

        state_shape = [n_lay, 2, width]
        single_zero_state = tf.zeros(shape=state_shape)  # [n_lay,2,width]

        state_PH = tf.placeholder(
            name='state_PH',
            dtype=tf.float32,
            shape=[None] + state_shape)  # [bsz,n_lay,2,width]

        cnn_enc_out = enc_CNN(
            input=sub_output,
            history=state_PH,
            n_layers=n_lay,
            n_filters=width,
            activation=activation,
            n_hist=0)
        out = cnn_enc_out['output']
        fin_state = cnn_enc_out['state']
        cnn_zsL = cnn_enc_out['zeroes']
        if verb > 1:
            print(' > out:', out)
            print(' > fin_state (split):', fin_state)

        # projection to logits
        logits = lay_dense(
            input=out,
            units=len(TBL_MOV),
            use_bias=False)
        if verb > 1: print(' > logits:', logits)

        probs = tf.nn.softmax(logits)

        cnn_vars = tf.trainable_variables(scope=tf.get_variable_scope().name) + [n_hands]
        cnn_vars = [var for var in cnn_vars if var not in enc_vars]
        if verb > 1: print(' ### num of cnn_vars (%d) %s' % (len(cnn_vars), short_scin(num_var_floats(cnn_vars))))

        move_PH = tf.placeholder(   # move made (label)
            name='move_PH',
            dtype=tf.int32,
            shape=[None, None])     # [bsz,seq]
        rew_PH = tf.placeholder(    # reward for move made
            name='rew_PH',
            dtype=tf.float32,
            shape=[None, None])     # [bsz,seq]

        # this loss is auto-averaged with the reduction parameter
        # loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
        # loss = loss(y_true=move, y_pred=logits, sample_weight=rew)
        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=move_PH,
            logits=logits,
            weights=rew_PH)

        train_vars = [] + cnn_vars
        if train_ce: train_vars += enc_vars

    return {
        'name':              name,
        'cards_PH':          cards_PH,
        'train_PH':          train_PH,
        'switch_PH':         switch_PH,
        'event_PH':          event_PH,
        'move_PH':           move_PH,
        'rew_PH':            rew_PH,
        'state_PH':          state_PH,
        'single_zero_state': single_zero_state,
        'probs':             probs,
        'fin_state':         fin_state,
        'enc_zeroes':        tf.concat(enc_zsL, axis=-1),
        'cnn_zeroes':        tf.concat(cnn_zsL, axis=-1),
        'loss':              loss,
        'n_hands':           n_hands,
        'enc_vars':          enc_vars,
        'cnn_vars':          cnn_vars,
        'train_vars':        train_vars}
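# A minimal usage sketch (hypothetical): builds the cnn_DMG graph and runs a single
# forward step, feeding the zero state. _demo_cnn_DMG is an assumed name; the module
# constants (TBL_MOV, N_TABLE_PLAYERS) and the encoder helpers used by the graph above
# must be importable for this to run under TF 1.x.
def _demo_cnn_DMG():
    g = cnn_DMG(name='demo_dmg', c_embW=12, n_lay=6)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        zero_state = sess.run(g['single_zero_state'])        # [n_lay, 2, width]
        probs, state = sess.run(
            [g['probs'], g['fin_state']],
            feed_dict={
                g['cards_PH']:  [[[52] * 7]],                # one batch, one step, all 'no_card' ids
                g['event_PH']:  [[0]],
                g['switch_PH']: [[[0]]],                     # 0 >> take the event embedding
                g['train_PH']:  False,
                g['state_PH']:  [zero_state]})               # zero history for every layer
    return probs, state                                      # probs: [1, 1, len(TBL_MOV)]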