def build_model(hp: HyperParameters):
    inputs = tf.keras.Input((15,))
    x = inputs
    y = inputs
    t_dropout = hp.Float('target_dropout', 0.0, 0.5, 0.1, default=0.2)
    p_dropout = hp.Float('pretrain_dropout', 0.0, 0.5, 0.1, default=0.2)
    for i in range(1):
        # hidden layer
        x = tf.keras.layers.Dense(2**hp.Int('target_exponent_{}'.format(i), 5, 8, default=6),
                                  activation='relu',
                                  kernel_initializer='he_uniform',
                                  name='target_dense_{}'.format(i))(x)
        y = tf.keras.layers.Dense(2**hp.Int('pretrain_exponent_{}'.format(i), 5, 8, default=6),
                                  activation='relu',
                                  kernel_initializer='he_uniform',
                                  name='pretrain_dense_{}'.format(i))(y)
        a = tf.keras.layers.Dense(2**hp.Int('adapter_exponent_{}'.format(i), 2, 6, default=4),
                                  activation='relu',
                                  kernel_initializer='he_uniform',
                                  name='target_adapter_{}'.format(i))(y)
        # dropout layer
        x = tf.keras.layers.Dropout(t_dropout, name='target_dropout_{}'.format(i))(x)
        x = tf.keras.layers.concatenate([x, a], name='target_concat_{}'.format(i))
        y = tf.keras.layers.Dropout(p_dropout, name='pretrain_dropout_{}'.format(i))(y)
    x = tf.keras.layers.Dense(18, activation='softmax', dtype='float32', name='target_output')(x)
    y = tf.keras.layers.Dense(18, activation='softmax', dtype='float32', name='pretrain_output')(y)
    model = tf.keras.Model(inputs=inputs, outputs=[x, y])
    return model

def build_model(hp: HyperParameters):
    inputs = tf.keras.Input((15,))
    x = inputs
    dropout = hp.Float('dropout', 0.0, 0.5, 0.1, default=0.2)
    for i in range(1):
        x = tf.keras.layers.Dense(
            2**hp.Int('exponent_{}'.format(i), 5, 8, default=6), 'relu')(x)
        x = tf.keras.layers.Dropout(dropout)(x)
    x = tf.keras.layers.Dense(18, activation='softmax', dtype='float32')(x)
    model = tf.keras.Model(inputs=inputs, outputs=x)
    model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

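# --- Hedged usage sketch (not part of the original code): how a build_model
# function like the one above is typically handed to Keras Tuner. The synthetic
# data below is an assumption chosen only to match the (15,) input and the
# 18-class sparse-categorical output; depending on the installed version the
# package is imported as `kerastuner` or `keras_tuner`.
import numpy as np
import kerastuner as kt

x = np.random.rand(256, 15).astype('float32')
y = np.random.randint(0, 18, size=256)

tuner = kt.RandomSearch(build_model,
                        objective='val_accuracy',
                        max_trials=10,
                        overwrite=True,
                        directory='tuning',
                        project_name='dense_baseline')
tuner.search(x, y, epochs=5, validation_split=0.2)
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = build_model(best_hp)
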
def build_optimizer(hp: HyperParameters):
    """Helper that defines the hyperparameter search space for the optimizer."""
    optimizer = hp.Choice(name="optimizer", values=["adam", "sgd", "rms"], default="adam")
    learning_rate = hp.Float(name="learning_rate", min_value=1e-4, max_value=5e-3,
                             sampling="log", default=1e-3)
    # probably could use enums here
    if optimizer == "adam":
        return Adam(learning_rate=learning_rate)
    elif optimizer == "sgd":
        return SGD(learning_rate=learning_rate)
    elif optimizer == "rms":
        return RMSprop(learning_rate=learning_rate)
    else:
        raise NotImplementedError()

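# --- Hedged sketch (assumed, not from the original source): build_optimizer is
# meant to be called from a model-building function so that the same `hp`
# object tunes both the architecture and the optimizer. The layer sizes and the
# (15,)-input / 18-class output below are illustrative only.
def build_model_with_tuned_optimizer(hp: HyperParameters):
    inputs = tf.keras.Input((15,))
    x = tf.keras.layers.Dense(hp.Int('units', 32, 256, step=32), activation='relu')(inputs)
    outputs = tf.keras.layers.Dense(18, activation='softmax')(x)
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer=build_optimizer(hp),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model
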
def build(self, hp: HyperParameters):
    model = keras.models.Sequential([
        keras.layers.Reshape((28, 28, 1, 1)),
        # Introduce streams
        keras.layers.Lambda(lambda v: tf.stack((v, tf.zeros_like(v)), axis=-1)),  # Imaginary part initialized to 0
        keras.layers.Lambda(print_return),
        # Block 1: Shape [batch, 28, 28, channels=8, streams=2, 2]
        Conv2DH(out_orders=2, out_channels=8),
        HNonLinearity(),  # Defaults to ReLU
        Conv2DH(out_orders=2, out_channels=8),
        HBatchNormalization(),
        # Block 2: Shape [batch, 14, 14, channels=16, streams=2, 2]
        AvgPool2DH(strides=(2, 2)),
        Conv2DH(out_orders=2, out_channels=16),
        HNonLinearity(),
        Conv2DH(out_orders=2, out_channels=16),
        HBatchNormalization(),
        # Block 3: Shape [batch, 7, 7, channels=35, streams=2, 2]
        AvgPool2DH(),
        Conv2DH(out_orders=2, out_channels=35),
        HNonLinearity(),
        Conv2DH(out_orders=2, out_channels=35),
        # Block 4: Reduce to magnitudes and apply final activation
        HFlatten(),
        keras.layers.Lambda(print_return),
        keras.layers.Dense(10),
        keras.layers.Softmax(),
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=10 ** hp.Float('log_learning_rate', -6, -1, step=0.5, default=-3)),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

def build_model(hp: kt.HyperParameters, use_avs_model: bool = False):
    batch_size = config.generation.batch_size if stateful else None
    layer_names = name_generator('layer')
    inputs = {}
    last_layer = []
    for col in seq.x_cols:
        shape = None, *seq.shapes[col][2:]
        inputs[col] = layers.Input(batch_size=batch_size, shape=shape, name=col)
        last_layer.append(inputs[col])
    random.seed(43)
    # Randomly wired stack: each layer is a set of parallel LSTM blocks, each reading
    # a random sample of the previous layer's outputs.
    for i in range(hp.Int('lstm_layers', 2, 7)):
        outs = []
        depth = hp.Int(f'depth_{i}', 4, 64, sampling='log')
        connections = min(hp.Int(f'connections_{i}', 1, 3), len(last_layer))
        dropout = hp.Float(f'dropout_{i}', 0, 0.5)
        for width_i in range(hp.Int(f'width_{i}', 1, 16)):
            t = layers.LSTM(depth,
                            return_sequences=True,
                            name=f'lstm{i:03}_{width_i:03}_{next(layer_names)}',
                            stateful=stateful,
                            )(forgiving_concatenate(random.sample(last_layer, connections),
                                                    name=next(layer_names)))
            t = layers.BatchNormalization(name=next(layer_names))(t)
            t = layers.Dropout(dropout, name=next(layer_names))(t)
            outs.append(t)
        last_layer = outs
    x = forgiving_concatenate(last_layer)
    outputs = {}
    loss = {}
    for col in seq.y_cols:
        if col in seq.categorical_cols:
            shape = seq.shapes[col][-1]
            outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation='softmax'), name=col)(x)
            loss[col] = keras.losses.CategoricalCrossentropy(
                label_smoothing=tf.cast(hp.Float('label_smoothing', 0.0, 0.7), 'float32'),
            )  # does not work well with mixed precision and stateful model
        if col in seq.regression_cols:
            shape = seq.shapes[col][-1]
            outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation=None), name=col)(x)
            loss[col] = 'mse'
    if stateful or config.training.AVS_proxy_ratio == 0:
        if config.training.AVS_proxy_ratio == 0:
            logging.log(logging.WARNING,
                        f'Not using AVSModel with superior optimizer due to '
                        f'{config.training.AVS_proxy_ratio=}.')
        model = Model(inputs=inputs, outputs=outputs)
        opt = keras.optimizers.Adam()
    else:
        if use_avs_model:
            model = AVSModel(inputs=inputs, outputs=outputs, config=config)
        else:
            model = Model(inputs=inputs, outputs=outputs)
        lr_schedule = FlatCosAnnealSchedule(
            decay_start=len(seq) * 30,  # Give extra epochs to big batch_size
            initial_learning_rate=hp.Choice('initial_learning_rate', [3e-2, 1e-2, 8e-3]),
            decay_steps=len(seq) * 40,
            alpha=0.01,
        )
        # Ranger hyper params based on https://github.com/fastai/imagenette/blob/master/2020-01-train.md
        opt = tfa.optimizers.RectifiedAdam(learning_rate=lr_schedule, beta_1=0.95, beta_2=0.99, epsilon=1e-6)
        opt = tfa.optimizers.Lookahead(opt, sync_period=6, slow_step_size=0.5)
    model.compile(
        optimizer=opt,
        loss=loss,
        metrics=metrics.create_metrics((not stateful), config),
    )
    return model

def build_model(hp: kt.HyperParameters, use_avs_model: bool = True):
    batch_size = config.generation.batch_size if stateful else None
    layer_names = name_generator('layer')
    inputs = {}
    per_stream = {}
    cnn_activation = {'relu': keras.activations.relu,
                      'elu': keras.activations.elu,
                      'mish': tfa.activations.mish}[hp.Choice('cnn_activation', ['relu', 'mish'])]
    cat_cnn_repetition = hp.Int('cat_cnn_repetition', 0, 4)
    cnn_spatial_dropout = hp.Float('spatial_dropout', 0.0, 0.5)
    cat_cnn_filters = hp.Int('cat_cnn_filters', 64, 256, sampling='log')
    reg_cnn_repetition = hp.Int('reg_cnn_repetition', 0, 4)
    reg_cnn_filters = hp.Int('reg_cnn_filters', 64, 256, sampling='log')
    # Each character of the chosen string is one kernel size, e.g. '35' means
    # parallel causal convolutions with kernel sizes 3 and 5.
    cnn_kernel_size = hp.Choice('cnn_kernel_size', ['1', '3', '35', '37'])
    for col in seq.x_cols:
        if col in seq.categorical_cols:
            shape = None, *seq.shapes[col][2:]
            inputs[col] = layers.Input(batch_size=batch_size, shape=shape, name=col)
            per_stream[col] = inputs[col]
            for _ in range(cat_cnn_repetition):
                per_stream[col] = forgiving_concatenate(
                    inputs=[layers.Conv1D(filters=cat_cnn_filters,
                                          kernel_size=int(s),
                                          activation=cnn_activation,
                                          padding='causal',
                                          kernel_initializer='lecun_normal',
                                          name=next(layer_names))(per_stream[col])
                            for s in cnn_kernel_size],
                    axis=-1,
                    name=next(layer_names),
                )
                per_stream[col] = layers.BatchNormalization(name=next(layer_names))(per_stream[col])
                per_stream[col] = layers.SpatialDropout1D(cnn_spatial_dropout)(per_stream[col])
        if col in seq.regression_cols:
            shape = None, *seq.shapes[col][2:]
            inputs[col] = layers.Input(batch_size=batch_size, shape=shape, name=col)
            per_stream[col] = inputs[col]
            for _ in range(reg_cnn_repetition):
                per_stream[col] = forgiving_concatenate(
                    inputs=[layers.Conv1D(filters=reg_cnn_filters,
                                          kernel_size=int(s),
                                          activation=cnn_activation,
                                          padding='causal',
                                          kernel_initializer='lecun_normal',
                                          name=next(layer_names))(per_stream[col])
                            for s in cnn_kernel_size],
                    axis=-1,
                    name=next(layer_names),
                )
                per_stream[col] = layers.BatchNormalization(name=next(layer_names))(per_stream[col])
                per_stream[col] = layers.SpatialDropout1D(cnn_spatial_dropout)(per_stream[col])
    per_stream_list = list(per_stream.values())
    x = forgiving_concatenate(inputs=per_stream_list, axis=-1, name=next(layer_names))
    lstm_repetition = hp.Int('lstm_repetition', 0, 4)
    lstm_dropout = hp.Float('lstm_dropout', 0.0, 0.6)
    lstm_l2_regularizer = hp.Choice('lstm_l2_regularizer', [1e-2, 1e-4, 1e-6, 0.0])
    for i in range(lstm_repetition):
        if i > 0:
            x = layers.Dropout(lstm_dropout)(x)
        x = layers.LSTM(hp.Int(f'lstm_{i}_units', 128, 384, sampling='log'),
                        return_sequences=True,
                        stateful=stateful,
                        name=next(layer_names),
                        kernel_regularizer=keras.regularizers.l2(lstm_l2_regularizer),
                        )(x)
        x = layers.BatchNormalization(name=next(layer_names))(x)
    end_cnn_repetition = hp.Int('end_cnn_repetition', 0, 2)
    end_spatial_dropout = hp.Float('end_spatial_dropout', 0.0, 0.5)
    end_cnn_filters = hp.Int('end_cnn_filters', 128, 384, sampling='log')
    end_cnn_kernel_size = hp.Choice('end_cnn_kernel_size', ['1', '3'])
    for _ in range(end_cnn_repetition):
        x = layers.SpatialDropout1D(end_spatial_dropout)(x)
        x = forgiving_concatenate(
            inputs=[layers.Conv1D(filters=end_cnn_filters,
                                  kernel_size=int(s),
                                  activation=cnn_activation,
                                  padding='causal',
                                  kernel_initializer='lecun_normal',
                                  name=next(layer_names))(x)
                    for s in end_cnn_kernel_size],
            axis=-1,
            name=next(layer_names),
        )
        x = layers.BatchNormalization(name=next(layer_names))(x)
    x = layers.SpatialDropout1D(end_spatial_dropout)(x)
    outputs = {}
    loss = {}
    for col in seq.y_cols:
        if col in seq.categorical_cols:
            shape = seq.shapes[col][-1]
            outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation='softmax'), name=col)(x)
            loss[col] = keras.losses.CategoricalCrossentropy(
                label_smoothing=tf.cast(hp.Float('label_smoothing', 0.0, 0.6), 'float32'),
            )  # does not work well with mixed precision and stateful model
        if col in seq.regression_cols:
            shape = seq.shapes[col][-1]
            outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation=None), name=col)(x)
            loss[col] = 'mse'
    if stateful or config.training.AVS_proxy_ratio == 0:
        if config.training.AVS_proxy_ratio == 0:
            logging.log(logging.WARNING,
                        f'Not using AVSModel with superior optimizer due to '
                        f'{config.training.AVS_proxy_ratio=}.')
        model = Model(inputs=inputs, outputs=outputs)
        opt = keras.optimizers.Adam()
    else:
        model = AVSModel(inputs=inputs, outputs=outputs, config=config)
        decay_start_epoch = hp.Int('decay_start_epoch', 15, 40)
        decay_end_epoch = (decay_start_epoch * 4) // 3
        lr_schedule = FlatCosAnnealSchedule(
            decay_start=len(seq) * decay_start_epoch,  # Give extra epochs to big batch_size
            initial_learning_rate=hp.Choice('initial_learning_rate', [3e-2, 1e-2, 8e-3]),
            decay_steps=len(seq) * decay_end_epoch,
            alpha=0.001,
        )
        # Ranger hyper params based on https://github.com/fastai/imagenette/blob/master/2020-01-train.md
        opt = tfa.optimizers.RectifiedAdam(learning_rate=lr_schedule, beta_1=0.95, beta_2=0.99, epsilon=1e-6)
        opt = tfa.optimizers.Lookahead(opt, sync_period=6, slow_step_size=0.5)
    model.compile(
        optimizer=opt,
        loss=loss,
        metrics=metrics.create_metrics((not stateful), config),
    )
    return model

def fit_sim_model(X_train, X_test, y_train, y_test, model1, model2,
                  results_file='results.csv', embedding_file='sim_embeddings',
                  num_runs=1, hp_file1=None, hp_file2=None, hp_pred_file=None,
                  params=None):
    params = params or PARAMS
    kg1 = pd.read_csv('./data/chemicals0.csv')
    kg2 = pd.read_csv('./data/taxonomy0.csv')
    kg1 = list(zip(kg1['subject'], kg1['predicate'], kg1['object']))
    kg2 = list(zip(kg2['subject'], kg2['predicate'], kg2['object']))
    entities1 = set([s for s, p, o in kg1]) | set([o for s, p, o in kg1])
    relations1 = set([p for s, p, o in kg1])
    entities2 = set([s for s, p, o in kg2]) | set([o for s, p, o in kg2])
    relations2 = set([p for s, p, o in kg2])
    me1 = {k: i for i, k in enumerate(entities1)}
    me2 = {k: i for i, k in enumerate(entities2)}
    mr1 = {k: i for i, k in enumerate(relations1)}
    mr2 = {k: i for i, k in enumerate(relations2)}
    kg1 = [(me1[s], mr1[p], me1[o]) for s, p, o in kg1]
    kg2 = [(me2[s], mr2[p], me2[o]) for s, p, o in kg2]
    output_dim = 1
    X_train, y_train = np.asarray([
        (me1[a], me2[b], float(x)) for a, b, x in X_train
        if a in entities1 and b in entities2
    ]), np.asarray([
        float(x) for x, a in zip(y_train, X_train)
        if a[0] in entities1 and a[1] in entities2
    ])
    X_test, y_test = np.asarray([
        (me1[a], me2[b], float(x)) for a, b, x in X_test
        if a in entities1 and b in entities2
    ]), np.asarray([
        float(x) for x, a in zip(y_test, X_test)
        if a[0] in entities1 and a[1] in entities2
    ])
    scores = []
    k_best_predictions = []
    hp = HyperParameters()
    kg_lengths = list(map(len, [kg1, kg2]))
    output_lengths = len(X_train)
    hp.Fixed('num_entities1', len(entities1))
    hp.Fixed('num_entities2', len(entities2))
    hp.Fixed('num_relations1', len(relations1))
    hp.Fixed('num_relations2', len(relations2))
    hp.Fixed('embedding_model1', model1)
    hp.Fixed('embedding_model2', model2)
    hp.Fixed('output_dim', output_dim)
    bs = 1024
    if hp_file1 and hp_file2:
        for i, hp_file in enumerate([hp_file1, hp_file2]):
            with open(hp_file, 'r') as fp:
                data = json.load(fp)
            for k in data:
                hp.Fixed(k + str(i + 1), data[k])
                if k == 'batch_size':
                    bs = min(bs, data[k])
    else:
        for i, m in zip(['1', '2'], [model1, model2]):
            hp.Choice('dim' + i, [100, 200, 400], default=200)
            hp.Choice('negative_samples' + i, [10, 100], default=10)
            if m in ['ConvE', 'ConvR', 'ConvKB']:
                bs = 128
            hp.Choice('loss_function' + i, [
                'pairwize_hinge', 'pairwize_logistic',
                'pointwize_hinge', 'pointwize_logistic'
            ], default='pairwize_hinge')
            w = kg_lengths[int(i) - 1] / max(kg_lengths)
    if hp_pred_file:
        with open(hp_pred_file, 'r') as fp:
            data = json.load(fp)
        for k in data:
            hp.Fixed(k, data[k])
    else:
        MAX_LAYERS = 3
        hp.Int('branching_num_layers_chemical', 0, MAX_LAYERS, default=1)
        hp.Int('branching_num_layers_species', 0, MAX_LAYERS, default=1)
        hp.Int('branching_num_layers_conc', 0, MAX_LAYERS, default=1)
        hp.Int('num_layers1', 0, 3, default=1)
        for i in range(MAX_LAYERS + 1):
            hp.Choice('branching_units_chemical_' + str(i + 1), [32, 128, 512], default=128)
            hp.Choice('branching_units_species_' + str(i + 1), [32, 128, 512], default=128)
            hp.Choice('branching_units_conc_' + str(i + 1), [32, 128, 512], default=128)
            hp.Choice('units_' + str(i + 1), [32, 128, 512], default=128)
    # Since inputs are oversampled, we must reduce the weight of losses accordingly.
    w = output_lengths / max(kg_lengths)
    hp.Float('loss_weight1', w, 5 * w, sampling='log')
    hp.Float('loss_weight2', w, 5 * w, sampling='log')
    hp.Float('classification_loss_weight', w, 5 * w, sampling='log')
    hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])
    hp.Fixed('batch_size', bs)
    # Pad the longest input up to a multiple of the batch size.
    longest = max(map(len, [kg1, kg2, X_train]))
    m = longest + (bs - longest % bs)
    Xtr, ytr = prep_data_v2(kg1, kg2, X_train, y_train, max_length=m)
    Xte, yte = prep_data_v2(kg1, kg2, X_test, y_test, test=True, max_length=max(bs, len(y_test)))
    tuner = CVTuner(hypermodel=build_model,
                    oracle=kt.oracles.BayesianOptimization(
                        hyperparameters=hp,
                        objective=Objective('val_auc', 'max'),
                        max_trials=params['MAX_TRIALS']),
                    overwrite=True,
                    project_name='tmp/' + ''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(11)))
    tuner.search(Xtr, ytr,
                 epochs=params['SEARCH_MAX_EPOCHS'],
                 batch_size=bs,
                 callbacks=[EarlyStopping('loss', mode='min', patience=params['PATIENCE'])],
                 kfolds=params['NUM_FOLDS'],
                 class_weight=params['cw'])
    results = []
    prediction = []
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    model = tuner.hypermodel.build(best_hps)
    out = dict()
    for k in best_hps.values.keys():
        out[k] = best_hps.values[k]
    with open('./sim_hp/%s.json' % hp_pred_file.split('/')[-1].split('_')[0], 'w') as fp:
        json.dump(out, fp)
    for _ in range(num_runs):
        reset_weights(model)
        model.fit(Xtr, ytr,
                  epochs=params['MAX_EPOCHS'],
                  batch_size=bs,
                  verbose=2,
                  class_weight=params['cw'],
                  callbacks=[EarlyStopping('loss', mode='min', patience=params['PATIENCE'])])
        r = model.evaluate(Xte, yte, verbose=0, batch_size=bs)
        results.append(r)
    W1 = model.get_layer('embedding').get_weights()[0]
    W2 = model.get_layer('embedding_2').get_weights()[0]
    np.save(embedding_file + '_chemical_embeddings.npy', W1)
    np.save(embedding_file + '_chemical_ids.npy',
            np.asarray(list(zip(entities1, range(len(entities1))))))
    np.save(embedding_file + '_taxonomy_embeddings.npy', W2)
    np.save(embedding_file + '_taxonomy_ids.npy',
            np.asarray(list(zip(entities2, range(len(entities2))))))
    var = np.var(np.asarray(results), axis=0)
    results = np.mean(np.asarray(results), axis=0)
    df = pd.DataFrame(data={
        'metric': model.metrics_names,
        'value': list(results),
        'variance': list(var)
    })
    df.to_csv(results_file)

def build_hyper_l2_constrained(hp: HyperParameters,
                               n_tasks: int,
                               all_columns: List[str],
                               cat_features_dim: Dict[str, int],
                               restricted_hyperparameter_search: bool,
                               feature_sparsity_min: int = 4,
                               feature_sparsity_max: int = 9,
                               min_layers: int = 3,
                               max_layers: int = 6,
                               min_units_per_layer: int = 32,
                               max_units_per_layer: int = 64,
                               min_l2_alpha: float = 1e-1,
                               max_l2_alpha: float = 1e+2
                               ) -> Model:
    """Build an L2-constrained multi-task learning model.

    Parameters
    ----------
    hp: instance of HyperParameters
        Hyperparameters that define the architecture and training of the neural networks
    n_tasks: int
        Number of tasks
    all_columns: list
        Names of the features
    cat_features_dim: dict
        Dictionary that maps the name of each categorical feature to its dimensionality
    restricted_hyperparameter_search: bool
        If True, fixes the following hyperparameters and does not optimize them:
        - batch_size = 1024
        - hidden_layer_activation = relu
        - optimizer = sgd
    feature_sparsity_min: int
        Minimum possible value of the feature sparsity threshold
    feature_sparsity_max: int
        Maximum possible value of the feature sparsity threshold
    min_layers: int
        Minimum number of layers
    max_layers: int
        Maximum number of layers
    min_units_per_layer: int
        Minimum number of neurons per layer
    max_units_per_layer: int
        Maximum number of neurons per layer
    min_l2_alpha: float
        Minimum possible value of the L2 regularization coefficient
    max_l2_alpha: float
        Maximum possible value of the L2 regularization coefficient

    Returns
    -------
    model: tensorflow.keras.models.Model
        Compiled L2-constrained model
    """
    # define activation functions and preprocessing layer
    build_activation_functions(hp, restricted_hyperparameter_search)
    preprocessing_layer = build_preprocessing_layer_uci_income(hp, all_columns, cat_features_dim,
                                                               feature_sparsity_min, feature_sparsity_max)

    # propagate input through the preprocessing layer
    input_layer = Input(shape=(len(all_columns),))
    x = preprocessing_layer(input_layer)

    # build the L2-constrained model
    n_layers = hp.Int("number_of_hidden_layers", min_value=min_layers, max_value=max_layers)
    for i in range(n_layers):
        n_units = hp.Int("n_units_layer_{0}".format(i),
                         min_value=min_units_per_layer, max_value=max_units_per_layer)
        mtl_layers = [Dense(n_units, hp['hidden_layer_activation']) for _ in range(n_tasks)]
        l2_regularizer = hp.Float("l2_regularizer_layer_{0}".format(i),
                                  min_value=min_l2_alpha, max_value=max_l2_alpha)
        constrained_l2 = ConstrainedMTL(mtl_layers, l1_regularizer=0., l2_regualrizer=l2_regularizer)
        x = constrained_l2(x)

    output_layers = [Dense(1, hp['output_layer_activation'])(x[i]) for i in range(n_tasks)]
    model = Model(inputs=input_layer, outputs=output_layers)
    return model

def build(self, hp: kerastuner.HyperParameters) -> keras.Model:
    """Build DAN model

    Notes:
        This is normally called within a HyperModel context.

    Args:
        hp (:obj:`HyperParameters`): `HyperParameters` instance

    Returns:
        A built/compiled keras model ready for hyperparameter tuning
    """
    # L1/L2 vals
    reg_vals = [0.0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

    # --- Model Topology
    # Feature Embedding Params
    emb_l1 = hp.Choice("Feature Embedding L1", reg_vals, default=0.0)
    emb_l2 = hp.Choice("Feature Embedding L2", reg_vals, default=0.0)
    emb_n = hp.Int("Embedding Dimension", min_value=64, max_value=2048, default=1024, step=64)
    emb_dropout = hp.Float("Dropout from Embeddings",
                           min_value=0.0, max_value=0.9, step=0.05, default=0.0)
    final_dropout = hp.Float("Dropout before prediction",
                             min_value=0.0, max_value=0.9, step=0.05, default=0.5)

    # Final dense layer
    dense_size = hp.Int("Dense Units", min_value=2, max_value=128, sampling="log", default=14)

    # --- Model
    feat_input = keras.Input(shape=(self.input_size, ))

    # Feature Embeddings
    embeddings = keras.layers.Embedding(
        input_dim=self.vocab_size,
        output_dim=emb_n,
        embeddings_regularizer=keras.regularizers.l1_l2(emb_l1, emb_l2),
        mask_zero=True,
        name="Feature_Embeddings")(feat_input)
    dropout_1 = keras.layers.Dropout(rate=emb_dropout)(embeddings)

    # Averaging the embeddings
    embedding_avg = keras.backend.mean(dropout_1, 1)

    # Dense layers
    dense = keras.layers.Dense(dense_size, activation="relu", name="dense_1")(embedding_avg)
    dropout_2 = keras.layers.Dropout(final_dropout)(dense)

    activation_fn = "softmax" if self.n_classes > 2 else "sigmoid"
    output = keras.layers.Dense(
        units=self.n_classes if self.n_classes > 2 else 1,
        activation=activation_fn,
        name="Output")(dropout_2)

    model = keras.Model(feat_input, output)

    # --- Learning rate and momentum
    # lr = hp.Choice(
    #     "Learning Rate",
    #     [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1])
    # momentum = hp.Float("Momentum", min_value=0.0, max_value=0.9, step=0.1)
    # opt = keras.optimizers.SGD(lr, momentum=momentum)
    # NOTE: I've had a lot of issues with SGD getting even comparable performance to Adam
    # so I'm saying we scrap it and just go with Adam.
    opt = keras.optimizers.Adam()

    # --- Loss FN
    # NOTE: I was messing around with focal loss here, but I think that's
    # harder to justify and explain in this context
    if self.loss is None:
        if self.n_classes > 2:
            loss_fn = keras.losses.categorical_crossentropy
        else:
            loss_fn = keras.losses.binary_crossentropy
    else:
        loss_fn = self.loss

    model.compile(optimizer=opt, loss=loss_fn, metrics=self.metrics)

    return model

def build(self, hp: kerastuner.HyperParameters) -> keras.Model:
    """Build LSTM model

    Notes:
        This is normally called within a HyperModel context.

    Args:
        hp (:obj:`HyperParameters`): `HyperParameters` instance

    Returns:
        A built/compiled keras model ready for hyperparameter tuning
    """
    # L1/L2 vals
    reg_vals = [0.0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

    # Model Topology
    # Should we multiply the feature embeddings by their averages?
    weighting = hp.Boolean("Feature Weighting")
    # Should we add a dense layer between RNN and output?
    final_dense = hp.Boolean("Final Dense Layer")

    # Feature Embedding Params
    emb_l1 = hp.Choice("Feature Embedding L1", reg_vals)
    emb_l2 = hp.Choice("Feature Embedding L2", reg_vals)
    emb_n = hp.Int("Embedding Dimension", min_value=64, max_value=512, default=64, step=64)

    # Demog Embedding
    demog_emb_n = hp.Int("Demographics Embedding Dimension",
                         min_value=1, max_value=64, default=self.n_demog)

    # Average Embedding Params
    avg_l1 = hp.Choice("Average Embedding L1", reg_vals,
                       parent_name="Feature Weighting", parent_values=[True])
    avg_l2 = hp.Choice("Average Embedding L2", reg_vals,
                       parent_name="Feature Weighting", parent_values=[True])

    # LSTM Params
    lstm_n = hp.Int("LSTM Units", min_value=32, max_value=512, default=32, step=32)
    lstm_dropout = hp.Float("LSTM Dropout", min_value=0.0, max_value=0.9, default=0.4, step=0.01)
    lstm_recurrent_dropout = hp.Float("LSTM Recurrent Dropout",
                                      min_value=0.0, max_value=0.9, default=0.4, step=0.01)
    lstm_l1 = hp.Choice("LSTM weights L1", reg_vals)
    lstm_l2 = hp.Choice("LSTM weights L2", reg_vals)

    # Final dense layer
    dense_n = hp.Int("Dense Units", min_value=2, max_value=128, sampling="log",
                     parent_name="Final Dense Layer", parent_values=[True])

    # Model code
    feat_input = keras.Input(shape=(None, None), ragged=True)
    demog_input = keras.Input(shape=(self.n_demog_bags, ))

    demog_emb = keras.layers.Embedding(
        self.n_demog,
        output_dim=demog_emb_n,
        mask_zero=True,
        name="Demographic_Embeddings")(demog_input)
    demog_avg = keras.layers.Flatten()(demog_emb)

    emb1 = keras.layers.Embedding(
        self.vocab_size,
        output_dim=emb_n,
        embeddings_regularizer=keras.regularizers.l1_l2(emb_l1, emb_l2),
        mask_zero=True,
        name="Feature_Embeddings")(feat_input)

    if weighting:
        emb2 = keras.layers.Embedding(
            self.vocab_size,
            output_dim=1,
            embeddings_regularizer=keras.regularizers.l1_l2(avg_l1, avg_l2),
            mask_zero=True,
            name="Average_Embeddings")(feat_input)
        # Multiplying the code embeddings by their respective weights
        mult = keras.layers.Multiply(name="Embeddings_by_Average")([emb1, emb2])
        avg = keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=2),
                                  name="Averaging")(mult)
    else:
        avg = keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=2),
                                  name="Averaging")(emb1)

    lstm_layer = keras.layers.LSTM(
        lstm_n,
        dropout=lstm_dropout,
        recurrent_dropout=lstm_recurrent_dropout,
        recurrent_regularizer=keras.regularizers.l1_l2(lstm_l1, lstm_l2),
        name="Recurrent")(avg)

    lstm_layer = keras.layers.Concatenate()([lstm_layer, demog_avg])

    if final_dense:
        lstm_layer = keras.layers.Dense(dense_n, activation="relu",
                                        name="pre_output")(lstm_layer)

    activation_fn = "softmax" if self.n_classes > 2 else "sigmoid"
    output = keras.layers.Dense(
        self.n_classes if self.n_classes > 2 else 1,
        activation=activation_fn,
        name="Output")(lstm_layer)

    model = keras.Model([feat_input, demog_input], output)

    # --- Learning rate and momentum
    # lr = hp.Choice(
    #     "Learning Rate",
    #     [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1])
    # momentum = hp.Float("Momentum", min_value=0.0, max_value=0.9, step=0.1)
    # opt = keras.optimizers.SGD(lr, momentum=momentum)
    opt = keras.optimizers.Adam()

    # --- Loss FN
    # NOTE: I was messing around with focal loss here, but I think that's
    # harder to justify and explain in this context
    if self.loss is None:
        if self.n_classes > 2:
            loss_fn = keras.losses.categorical_crossentropy
        else:
            loss_fn = keras.losses.binary_crossentropy
    else:
        loss_fn = self.loss

    model.compile(optimizer=opt, loss=loss_fn, metrics=self.metrics)

    return model
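
# --- Hedged usage sketch (the names below are illustrative, not from the
# original code): a HyperModel subclass whose build() method looks like the one
# above is handed to a tuner, which then calls build(hp) once per trial.
#
# hypermodel = LSTMHyperModel(vocab_size=vocab_size, n_demog=n_demog,          # hypothetical
#                             n_demog_bags=n_demog_bags, n_classes=2,          # constructor
#                             metrics=["accuracy"], loss=None)
# tuner = kerastuner.tuners.BayesianOptimization(hypermodel,
#                                                objective="val_loss",
#                                                max_trials=20)
# tuner.search([features, demographics], labels, epochs=10, validation_split=0.2)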