def train(model, data):
    x, y = data
    # Split the data into training and validation sets
    train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, shuffle=True)
    # Set up checkpoints
    callbacks_list = [
        keras.callbacks.ModelCheckpoint(filepath=weight_path, monitor='loss', save_best_only=True),
        keras.callbacks.LearningRateScheduler(schedule),
    ]
    # Compile the model
    model.compile(optimizer=keras.optimizers.Adam(lr=lr), loss=total_loss)
    # Initialize Lookahead
    lookahead = Lookahead(k=5, alpha=0.5)
    # Inject it into the model
    lookahead.inject(model)
    # Start time
    start_time = time()
    # Train the model
    if not data_augmentation:
        print("Not using data augmentation")
        history = model.fit(train_x, train_y,
                            epochs=epoch,
                            batch_size=batch_size,
                            validation_data=(valid_x, valid_y),
                            verbose=1,
                            callbacks=callbacks_list)
    else:
        print("Using data augmentation")
        history = model.fit_generator(
            generator=data_generator(train_x, train_y, batch_size),
            steps_per_epoch=(len(train_x) + batch_size - 1) // batch_size,
            epochs=epoch,
            verbose=1,
            callbacks=callbacks_list,
            validation_data=data_generator(valid_x, valid_y, batch_size),
            validation_steps=(len(valid_x) + batch_size - 1) // batch_size)
    model.save(weight_path)  # Save the model
    # Elapsed time
    duration = time() - start_time
    print("Train Finished takes:", "{:.2f} h".format(duration / 3600.0))
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='valid')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(loc='upper right')
    plt.show()
    return model
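# The train() function above assumes a `data_generator` helper that is not
# defined in this snippet. Below is a minimal sketch of one possible
# implementation (the original's augmentation logic is unknown, so this is
# only an assumption): an infinite generator of shuffled mini-batches, which
# is the contract fit_generator expects.
import numpy as np

def data_generator(x, y, batch_size):
    """Yield shuffled (x, y) mini-batches indefinitely."""
    num_samples = len(x)
    while True:
        indices = np.random.permutation(num_samples)
        for start in range(0, num_samples, batch_size):
            batch_idx = indices[start:start + batch_size]
            yield x[batch_idx], y[batch_idx]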
def model_fn(objective, optimizer, metrics):
    # Alternative backbones tried during experiments:
    # base_model = seresnext50(include_top=False, ...)
    # base_model = xception(include_top=False, ...)
    # base_model = densenet201(include_top=False, ...)
    # base_model = inceptionresnetv2(include_top=False, ...)
    base_model = efn.EfficientNetB4(
        include_top=False,
        input_shape=(input_size, input_size, 3),
        classes=num_classes,
        weights='imagenet',
    )
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    predictions = Dense(num_classes, activation='softmax')(x)
    model1 = Model(inputs=base_model.input, outputs=predictions)
    # model2 = multi_gpu_model(model1, gpus=3)
    # model2 = model1
    model1.compile(loss=objective, optimizer=optimizer, metrics=metrics)
    lookahead = Lookahead(k=5, alpha=0.5)  # Initialize Lookahead
    lookahead.inject(model1)  # Inject it into the model
    model1.summary()
    return model1
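# A hypothetical call to model_fn; the argument values below are illustrative
# assumptions, not taken from the original source:
from keras.optimizers import Adam

model = model_fn(
    objective='categorical_crossentropy',
    optimizer=Adam(lr=1e-3),
    metrics=['accuracy'],
)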
def nvidia(optimizer, source_path, train_generator, validation_generator, train_epochs,
           num_train_samples, num_validation_samples, batch_size, conv_dropout, fc_dropout):
    '''
    NVIDIA model from the paper:
    https://images.nvidia.com/content/tegra/automotive/images/2016/solutions/pdf/end-to-end-dl-using-px.pdf
    '''
    model = Sequential()
    # Layer0: normalization layer
    model.add(Lambda(lambda x: (x / 255.0) - 0.5, input_shape=(64, 64, 3)))
    # Layer1: convolutional feature map 24@31x98
    model.add(Convolution2D(24, 5, 5, activation='relu', subsample=(2, 2)))
    model.add(Dropout(conv_dropout))
    # Layer2: convolutional feature map 36@14x47
    model.add(Convolution2D(36, 5, 5, activation='relu', subsample=(2, 2)))
    model.add(Dropout(conv_dropout))
    # Layer3: convolutional feature map 48@5x22
    model.add(Convolution2D(48, 5, 5, activation='relu', subsample=(2, 2)))
    model.add(Dropout(conv_dropout))
    # Layer4: convolutional feature map 64@3x20
    model.add(Convolution2D(64, 3, 3, activation='relu'))
    model.add(Dropout(conv_dropout))
    # Layer5: convolutional feature map 64@1x18
    model.add(Convolution2D(64, 3, 3, activation='relu'))
    model.add(Dropout(conv_dropout))
    # FC1
    model.add(Flatten())
    model.add(Dense(1164, activation='relu'))
    model.add(Dropout(fc_dropout))
    # FC2
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(fc_dropout))
    # FC3
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(fc_dropout))
    # FC4
    model.add(Dense(10, activation='relu'))
    # Output layer
    model.add(Dense(1))
    model.summary()

    if optimizer == 'adam':
        # The default Adam optimizer works for trace1
        model.compile(loss='mse', optimizer='adam')
    elif optimizer == 'sgd':
        # The SGD optimizer works for trace2 with a 0.005 learning rate
        sgd = optimizers.SGD(lr=0.005, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='mse', optimizer=sgd)
    elif optimizer == 'lookahead':
        # New optimizer named Lookahead.
        # Paper: https://arxiv.org/abs/1907.08610
        # Keras implementation: https://github.com/bojone/keras_lookahead
        model.compile(optimizer=optimizers.Adam(1e-3), loss='mse')  # any inner optimizer
        lookahead = Lookahead(k=5, alpha=0.5)  # Initialize Lookahead
        lookahead.inject(model)  # Inject it into the model

    history_object = model.fit_generator(
        train_generator,
        steps_per_epoch=np.ceil(num_train_samples / batch_size),
        validation_data=validation_generator,
        validation_steps=np.ceil(num_validation_samples / batch_size),
        epochs=train_epochs,
        verbose=1)
    model.save('model.h5')
    print('nvidia-model-epoch{}-{}-{}.h5'.format(train_epochs, source_path, optimizer))
    plot_loss(history_object)
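# The nvidia() function calls a `plot_loss` helper that is not defined in the
# snippet. A minimal sketch, assuming it plots the History object returned by
# fit_generator:
import matplotlib.pyplot as plt

def plot_loss(history_object):
    """Plot training and validation MSE loss per epoch."""
    plt.plot(history_object.history['loss'], label='training loss')
    plt.plot(history_object.history['val_loss'], label='validation loss')
    plt.xlabel('epoch')
    plt.ylabel('mse loss')
    plt.legend(loc='upper right')
    plt.show()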
t1 = time()
dataset = Dataset(args.path + args.dataset, k)
train, user_review_fea, item_review_fea, testRatings = dataset.trainMatrix, dataset.user_review_fea, \
    dataset.item_review_fea, dataset.testRatings
num_users, num_items = train.shape
print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d"
      % (time() - t1, num_users, num_items, train.nnz, len(testRatings)))

# Build model
model = get_model(num_users, num_items, k, num_factors, regs)
if learner.lower() == "adagrad":
    model.compile(optimizer=Adagrad(lr=learning_rate), loss="mean_squared_error")
elif learner.lower() == "rmsprop":
    model.compile(optimizer=RMSprop(lr=learning_rate), loss="mean_squared_error")
elif learner.lower() == "adam":
    model.compile(optimizer=Adam(lr=learning_rate), loss="mean_squared_error")
else:
    model.compile(optimizer=SGD(lr=learning_rate), loss="mean_squared_error")
# Initialize Lookahead and inject it into the compiled model
# (identical for every learner, so it is done once after the branches)
lookahead = Lookahead(k=5, alpha=0.5)
lookahead.inject(model)
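# Conceptually, what Lookahead(k=5, alpha=0.5).inject(model) adds to training
# (per the paper, https://arxiv.org/abs/1907.08610): the inner optimizer takes
# k "fast" steps, then the "slow" weights are interpolated a fraction alpha
# toward the fast weights and the fast weights are reset. A NumPy sketch of
# one outer step, where `fast_step` stands in for any inner optimizer update:
import numpy as np

def lookahead_outer_step(slow_weights, fast_step, k=5, alpha=0.5):
    """Run k fast-optimizer steps, then update the slow weights."""
    fast_weights = slow_weights.copy()
    for _ in range(k):
        fast_weights = fast_step(fast_weights)
    # Slow weights move a fraction alpha toward the fast weights ...
    slow_weights = slow_weights + alpha * (fast_weights - slow_weights)
    # ... and the fast weights are reset to the new slow weights.
    return slow_weights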
def main():
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_dir = cp["DEFAULT"].get("output_dir")
    image_source_dir = cp["DEFAULT"].get("image_source_dir")
    base_model_name = cp["DEFAULT"].get("base_model_name")
    class_names = cp["DEFAULT"].get("class_names").split(",")

    # train config
    use_base_model_weights = cp["TRAIN"].getboolean("use_base_model_weights")
    use_trained_model_weights = cp["TRAIN"].getboolean("use_trained_model_weights")
    use_best_weights = cp["TRAIN"].getboolean("use_best_weights")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    initial_learning_rate = cp["TRAIN"].getfloat("initial_learning_rate")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    image_dimension = cp["TRAIN"].getint("image_dimension")
    train_steps = cp["TRAIN"].get("train_steps")
    patience_reduce_lr = cp["TRAIN"].getint("patience_reduce_lr")
    min_lr = cp["TRAIN"].getfloat("min_lr")
    validation_steps = cp["TRAIN"].get("validation_steps")
    positive_weights_multiply = cp["TRAIN"].getfloat("positive_weights_multiply")
    dataset_csv_dir = cp["TRAIN"].get("dataset_csv_dir")

    # if previously trained weights are used, never re-split
    if use_trained_model_weights:
        # resuming mode
        print("** use trained model weights **")
        # load training status for resuming
        training_stats_file = os.path.join(output_dir, ".training_stats.json")
        if os.path.isfile(training_stats_file):
            # TODO: add loading previous learning rate?
            training_stats = json.load(open(training_stats_file))
        else:
            training_stats = {}
    else:
        # start over
        training_stats = {}

    show_model_summary = cp["TRAIN"].getboolean("show_model_summary")
    # end parser config

    # check output_dir, create it if it does not exist
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    running_flag_file = os.path.join(output_dir, ".training.lock")
    if os.path.isfile(running_flag_file):
        raise RuntimeError("A process is already running in this directory!")
    else:
        open(running_flag_file, "a").close()

    try:
        print(f"backup config file to {output_dir}")
        shutil.copy(config_file, os.path.join(output_dir, os.path.split(config_file)[1]))

        datasets = ["train", "dev", "test"]
        for dataset in datasets:
            shutil.copy(os.path.join(dataset_csv_dir, f"{dataset}.csv"), output_dir)

        # get train/dev sample counts
        train_counts, train_pos_counts = get_sample_counts(output_dir, "train", class_names)
        dev_counts, _ = get_sample_counts(output_dir, "dev", class_names)

        # compute steps
        if train_steps == "auto":
            train_steps = int(train_counts / batch_size)
        else:
            try:
                train_steps = int(train_steps)
            except ValueError:
                raise ValueError(f"train_steps: {train_steps} is invalid, please use 'auto' or an integer.")
        print(f"** train_steps: {train_steps} **")

        if validation_steps == "auto":
            validation_steps = int(dev_counts / batch_size)
        else:
            try:
                validation_steps = int(validation_steps)
            except ValueError:
                raise ValueError(f"validation_steps: {validation_steps} is invalid, please use 'auto' or an integer.")
        print(f"** validation_steps: {validation_steps} **")

        # compute class weights
        print("** compute class weights from training data **")
        class_weights = get_class_weights(
            train_counts,
            train_pos_counts,
            multiply=positive_weights_multiply,
        )
        print("** class_weights **")
        print(class_weights)

        print("** load model **")
        if use_trained_model_weights:
            if use_best_weights:
                model_weights_file = os.path.join(output_dir, f"best_{output_weights_name}")
            else:
                model_weights_file = os.path.join(output_dir, output_weights_name)
        else:
            model_weights_file = None

        model_factory = ModelFactory()
        model = model_factory.get_model(
            class_names,
            model_name=base_model_name,
            use_base_weights=use_base_model_weights,
            weights_path=model_weights_file,
            input_shape=(image_dimension, image_dimension, 3))

        if show_model_summary:
            print(model.summary())

        print("** create image generators **")
        train_sequence = AugmentedImageSequence(
            dataset_csv_file=os.path.join(output_dir, "train.csv"),
            class_names=class_names,
            source_image_dir=image_source_dir,
            batch_size=batch_size,
            target_size=(image_dimension, image_dimension),
            augmenter=augmenter,
            steps=train_steps,
        )
        validation_sequence = AugmentedImageSequence(
            dataset_csv_file=os.path.join(output_dir, "dev.csv"),
            class_names=class_names,
            source_image_dir=image_source_dir,
            batch_size=batch_size,
            target_size=(image_dimension, image_dimension),
            augmenter=augmenter,
            steps=validation_steps,
            shuffle_on_epoch_end=False,
        )

        output_weights_path = os.path.join(output_dir, output_weights_name)
        print(f"** set output weights path to: {output_weights_path} **")

        print("** check multiple gpu availability **")
        gpus = len(os.getenv("CUDA_VISIBLE_DEVICES", "1").split(","))
        if gpus > 1:
            print(f"** multi_gpu_model is used! gpus={gpus} **")
            model_train = multi_gpu_model(model, gpus)
            # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
            checkpoint = MultiGPUModelCheckpoint(
                filepath=output_weights_path,
                base_model=model,
            )
        else:
            model_train = model
            checkpoint = ModelCheckpoint(
                output_weights_path,
                save_weights_only=True,
                save_best_only=True,
                verbose=1,
            )

        print("** compile model with class weights **")
        # model.compile(RAdam(), loss='mse')
        # optimizer = Adam(lr=initial_learning_rate)
        optimizer = RAdam(lr=initial_learning_rate)
        model_train.compile(optimizer=optimizer, loss=[focal_loss])
        lookahead = Lookahead(k=5, alpha=0.5)  # Initialize Lookahead
        lookahead.inject(model_train)  # Inject it into the model

        auroc = MultipleClassAUROC(
            sequence=validation_sequence,
            class_names=class_names,
            weights_path=output_weights_path,
            stats=training_stats,
            workers=generator_workers,
        )
        callbacks = [
            checkpoint,
            TensorBoard(log_dir=os.path.join(output_dir, "logs"), batch_size=batch_size),
            ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=patience_reduce_lr, verbose=1,
                              mode="min", min_lr=min_lr),
            auroc,
            EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25),
        ]

        print("** start training **")
        history = model_train.fit_generator(
            generator=train_sequence,
            steps_per_epoch=train_steps,
            epochs=epochs,
            validation_data=validation_sequence,
            validation_steps=validation_steps,
            callbacks=callbacks,
            class_weight=class_weights,
            workers=generator_workers,
            shuffle=False,
        )

        # dump history
        print("** dump history **")
        with open(os.path.join(output_dir, "history.pkl"), "wb") as f:
            pickle.dump({
                "history": history.history,
                "auroc": auroc.aurocs,
            }, f)
        print("** done! **")
    finally:
        os.remove(running_flag_file)