def train():
    """Run a training job described by the container's training environment.

    Reads the checkpoint location, step counts and S3 timeout from the
    hyperparameters, downloads and imports the user module, runs training
    through a ``Trainer`` and, on the master task, exports the saved model when
    checkpoints were written somewhere other than ``env.model_dir``.

    This function does not return: the process is terminated with
    ``os._exit(0)`` once the work finishes or an exception propagates.
    """
    env = cs.TrainingEnvironment()
    hyperparameters = env.hyperparameters

    checkpoint_dir = hyperparameters.get("checkpoint_path", env.model_dir)
    train_steps = hyperparameters.get('training_steps', 1000)
    eval_steps = hyperparameters.get('evaluation_steps', 100)

    # https://github.com/tensorflow/tensorflow/issues/15868
    # The S3 request timeout of the C++ SDK defaults to 3 seconds, which is too
    # short for saving larger checkpoints, so it is raised here.
    s3_timeout_msec = hyperparameters.get('s3_checkpoint_save_timeout', 60000)
    os.environ['S3_REQUEST_TIMEOUT_MSEC'] = str(s3_timeout_msec)

    env.download_user_module()
    env.pip_install_requirements()
    customer_script = env.import_user_module()

    trainer_kwargs = dict(
        customer_script=customer_script,
        current_host=env.current_host,
        hosts=env.hosts,
        train_steps=train_steps,
        eval_steps=eval_steps,
        training_path=env.channel_dirs[CHANNEL_DIR],
        model_path=checkpoint_dir,
        output_path=env.output_dir,
        customer_params=hyperparameters,
    )
    trainer = Trainer(**trainer_kwargs)

    tf_config = trainer.build_tf_config()

    # A parameter server is only started for distributed (multi-host) runs.
    is_distributed = len(env.hosts) > 1
    if is_distributed:
        _run_ps_server(env.current_host, env.hosts, tf_config)

    save_tf_config_env_var(tf_config)

    try:
        run.train_and_log_exceptions(trainer, env.output_dir)

        if trainer.task_type == 'master':
            # Only the master exports the model at the end of the execution,
            # and only when checkpoints were not already written to model_dir.
            if checkpoint_dir != env.model_dir:
                serve.export_saved_model(checkpoint_dir, env.model_dir)
        else:
            _wait_until_master_is_down(_get_master(tf_config))
    finally:
        # Python threads cannot be stopped, so a hard exit is the only way to
        # stop the application.
        # https://stackoverflow.com/questions/9591350/what-is-difference-between-sys-exit0-and-os-exit0
        os._exit(0)
def train():
    """Run a training job using the trainer class chosen by ``_get_trainer_class``.

    Reads step counts and the S3 timeout from the hyperparameters, fetches the
    user module when its archive lives on S3, installs requirements, imports the
    user module, trains, and on the master task exports the saved model when the
    trainer reports that it saves training output and checkpoints were written
    somewhere other than ``env.model_dir``.
    """
    env = cs.TrainingEnvironment()
    hyperparameters = env.hyperparameters

    checkpoint_dir = _get_checkpoint_dir(env)
    train_steps = hyperparameters.get('training_steps', 1000)
    eval_steps = hyperparameters.get('evaluation_steps', 100)

    # https://github.com/tensorflow/tensorflow/issues/15868
    # The S3 request timeout of the C++ SDK defaults to 3 seconds, which is too
    # short for saving larger checkpoints, so it is raised here.
    s3_timeout_msec = hyperparameters.get('s3_checkpoint_save_timeout', 60000)
    os.environ['S3_REQUEST_TIMEOUT_MSEC'] = str(s3_timeout_msec)

    # The user module is only downloaded when its archive is an S3 URI.
    archive_on_s3 = env.user_script_archive.lower().startswith('s3://')
    if archive_on_s3:
        env.download_user_module()

    env.pip_install_requirements()
    customer_script = env.import_user_module()

    trainer_class = _get_trainer_class()
    trainer_kwargs = dict(
        customer_script=customer_script,
        current_host=env.current_host,
        hosts=env.hosts,
        train_steps=train_steps,
        eval_steps=eval_steps,
        input_channels=env.channel_dirs,
        model_path=checkpoint_dir,
        output_path=env.output_dir,
        customer_params=hyperparameters,
    )
    trainer = trainer_class(**trainer_kwargs)

    tf_config = trainer.build_tf_config()

    # A parameter server is only started for distributed (multi-host) runs.
    is_distributed = len(env.hosts) > 1
    if is_distributed:
        _run_ps_server(env.current_host, env.hosts, tf_config)

    save_tf_config_env_var(tf_config)

    configure_mkl()

    trainer.train()

    if trainer.task_type == 'master':
        # Only the master exports the model at the end of the execution.
        if checkpoint_dir != env.model_dir and trainer.saves_training():
            serve.export_saved_model(checkpoint_dir, env.model_dir)
    else:
        _wait_until_master_is_down(_get_master(tf_config))
def train():
    """Train the steering-angle classifier and export it as a SavedModel.

    Extracts any ``.zip`` archives found under ``/opt/ml/input/data/train``,
    loads every ``record*.json`` file there, and trains a convolutional network
    that classifies each camera image into one of 15 steering-angle bins. The
    checkpoint with the lowest validation loss is written to
    ``/opt/ml/model/model_cat``, and the final graph is saved with tag ``myTag``
    to ``/opt/ml/model/tfModel``.

    Hyperparameters:
        with_slide: when truthy, each image is paired with the angle recorded
            two records later (the first two angles and last two images are
            dropped).
    """
    env = cs.TrainingEnvironment()

    print(device_lib.list_local_devices())

    os.system('mkdir -p logs')

    # ### Loading the files ###
    # Training data is read from the "train" input channel directory.
    train_dir = '/opt/ml/input/data/train'

    # Extract archives first so the records they contain are found below.
    for root, dirs, files in os.walk(train_dir):
        for f in files:
            if f.endswith('.zip'):
                unzip_file(root, f)

    data = []
    for root, dirs, files in os.walk(train_dir):
        data.extend(
            [get_data(root, f) for f in sorted(files, key=str.lower)
             if f.startswith('record') and f.endswith('.json')])

    # ### Loading angle ###
    # NOTE(review): assumes get_data returns [angle, image_dir, image_name]
    # -- confirm against its definition.
    angle = [d[0] for d in data]
    angle_array = np.array(angle)

    # ### Loading images ###
    images = np.array(
        [img_to_array(load_img(os.path.join(d[1], d[2]))) for d in data], 'f')

    # Slide images vs. orders: pair image i with the angle of record i + 2.
    if env.hyperparameters.get('with_slide', False):
        images = images[:len(images) - 2]
        angle_array = angle_array[2:]

    # ### Start training ###
    def linear_bin(a):
        """Return a 15-element one-hot vector for an angle in [-1, 1]."""
        a = a + 1
        # 2.0 keeps this a true division on Python 2 as well as Python 3.
        b = int(round(a / (2.0 / 14)))
        # Clamp to the valid bins: a negative index would silently wrap to the
        # opposite end of the vector and an index above 14 would raise.
        b = min(max(b, 0), 14)
        arr = np.zeros(15)
        arr[b] = 1
        return arr

    logs = callbacks.TensorBoard(log_dir='logs', histogram_freq=0,
                                 write_graph=True, write_images=True)
    save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat',
                                          monitor='val_loss', verbose=1,
                                          save_best_only=True, mode='min')
    early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=.0005,
                                         patience=10, verbose=1, mode='auto')
    callbacks_list = [save_best, early_stop, logs]

    # An explicit session is only needed to export the model to TensorFlow.
    sess = tf.Session()
    try:
        K.set_session(sess)

        # First layer, input layer. Shape comes from camera.py resolution, RGB.
        img_in = Input(shape=(128, 160, 3), name='img_in')
        x = img_in
        # 24 features, 5x5 pixel kernel (convolution, feature) window,
        # 2x2 stride, relu activation
        x = Convolution2D(24, (5, 5), strides=(2, 2), activation='relu')(x)
        # 32 features, 5x5 kernel window, 2x2 stride, relu activation
        x = Convolution2D(32, (5, 5), strides=(2, 2), activation='relu')(x)
        # 64 features, 5x5 kernel window, 2x2 stride, relu
        x = Convolution2D(64, (5, 5), strides=(2, 2), activation='relu')(x)
        # 64 features, 3x3 kernel window, 2x2 stride, relu
        x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x)
        # 64 features, 3x3 kernel window, 1x1 stride, relu
        x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x)

        # Possibly add MaxPooling (will make it less sensitive to position in
        # image). Camera angle is fixed, so it may not be needed.

        # Flatten to 1D (fully connected)
        x = Flatten(name='flattened')(x)
        # Classify the data into 100 features, make all negatives 0
        x = Dense(100, activation='relu')(x)
        # Randomly drop out 10% of the neurons (prevent overfitting)
        x = Dropout(.1)(x)
        x = Dense(50, activation='relu')(x)
        x = Dropout(.1)(x)

        # Categorical output of the angle: connect every input with every
        # output and produce 15 units. Softmax gives a probability (0.0-1.0)
        # per category; the best category is the one with the highest value.
        angle_out = Dense(15, activation='softmax', name='angle_out')(x)

        angle_cat_array = np.array([linear_bin(a) for a in angle_array])

        model = Model(inputs=[img_in], outputs=[angle_out])
        model.compile(optimizer='adam',
                      loss={'angle_out': 'categorical_crossentropy'},
                      loss_weights={'angle_out': 0.9})
        model.fit({'img_in': images}, {'angle_out': angle_cat_array},
                  batch_size=32, epochs=100, verbose=1, validation_split=0.2,
                  shuffle=True, callbacks=callbacks_list)

        # Save the model for TensorFlow using the SavedModel builder.
        builder = tf.saved_model.builder.SavedModelBuilder(
            "/opt/ml/model/tfModel")
        # Tag the model, required for Go
        builder.add_meta_graph_and_variables(sess, ["myTag"])
        builder.save()
    finally:
        # Release the session even if building, training or export fails.
        sess.close()
def train():
    """Train a network that predicts steering angle and throttle from images.

    Extracts any ``.zip`` archives found under ``/opt/ml/input/data/train``,
    loads every ``record*.json`` file there in numeric file-name order, drops
    records whose user throttle is not above 0.1, raises the remaining throttle
    values to at least 0.2, and trains a convolutional network with two
    outputs: a 15-bin categorical steering angle and a scalar throttle. The
    checkpoint with the lowest ``angle_out_loss`` is written to
    ``/opt/ml/model/model_cat``.

    Hyperparameters:
        with_slide: when truthy, each image is paired with the angle and
            throttle recorded two records later.

    Raises:
        ValueError: if no usable record remains after filtering.
    """
    env = cs.TrainingEnvironment()

    print(device_lib.list_local_devices())

    os.system('mkdir -p logs')

    # ### Loading the files ###
    # Training data is read from the "train" input channel directory.
    train_dir = '/opt/ml/input/data/train'
    numbers = re.compile(r'(\d+)')
    data = []

    def get_data(root, f):
        """Load one record file into a list of its fields.

        Layout: [mode, throttle, angle, directory, image name], followed by
        [pilot throttle, pilot angle] when the record contains pilot data.
        """
        # Close the file deterministically instead of leaking the handle.
        with open(os.path.join(root, f)) as record_file:
            d = json.load(record_file)
        record = [
            d['user/mode'], d['user/throttle'], d['user/angle'], root,
            d['cam/image_array']
        ]
        if 'pilot/throttle' in d:
            record.extend([d['pilot/throttle'], d['pilot/angle']])
        return record

    def numericalSort(value):
        """Sort key that compares the digit runs of a name as integers."""
        parts = numbers.split(value)
        parts[1::2] = map(int, parts[1::2])
        return parts

    def unzip_file(root, f):
        """Extract the archive ``f`` located in ``root`` into ``root``."""
        with zipfile.ZipFile(os.path.join(root, f), 'r') as zip_ref:
            zip_ref.extractall(root)

    # Extract archives first so the records they contain are found below.
    for root, dirs, files in os.walk(train_dir):
        for f in files:
            if f.endswith('.zip'):
                unzip_file(root, f)

    for root, dirs, files in os.walk(train_dir):
        data.extend([
            get_data(root, f) for f in sorted(files, key=numericalSort)
            if f.startswith('record') and f.endswith('.json')
        ])

    # Normalize / correct data: drop near-zero throttle records and raise the
    # remaining throttle values to at least 0.2.
    data = [d for d in data if d[1] > 0.1]
    if not data:
        raise ValueError(
            'no training records with throttle > 0.1 found under ' + train_dir)
    for d in data:
        if d[1] < 0.2:
            d[1] = 0.2

    # ### Loading throttle and angle ###
    angle_array = np.array([d[2] for d in data])
    throttle_array = np.array([d[1] for d in data])

    # ### Loading images ###
    images = np.array(
        [img_to_array(load_img(os.path.join(d[3], d[4]))) for d in data], 'f')

    # Slide images vs. orders: pair image i with the orders of record i + 2.
    if env.hyperparameters.get('with_slide', False):
        images = images[:len(images) - 2]
        angle_array = angle_array[2:]
        throttle_array = throttle_array[2:]

    # ### Start training ###
    def linear_bin(a):
        """Return a 15-element one-hot vector for an angle in [-1, 1]."""
        a = a + 1
        # 2.0 keeps this a true division on Python 2 as well as Python 3.
        b = int(round(a / (2.0 / 14)))
        # Clamp to the valid bins: a negative index would silently wrap to the
        # opposite end of the vector and an index above 14 would raise.
        b = min(max(b, 0), 14)
        arr = np.zeros(15)
        arr[b] = 1
        return arr

    logs = callbacks.TensorBoard(log_dir='logs', histogram_freq=0,
                                 write_graph=True, write_images=True)
    # NOTE(review): both callbacks monitor the training loss of the angle
    # output even though a validation split is used -- confirm this is intended.
    save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat',
                                          monitor='angle_out_loss', verbose=1,
                                          save_best_only=True, mode='min')
    early_stop = callbacks.EarlyStopping(monitor='angle_out_loss',
                                         min_delta=.0005, patience=10,
                                         verbose=1, mode='auto')
    callbacks_list = [save_best, early_stop, logs]

    # First layer, input layer. Shape comes from camera.py resolution, RGB.
    img_in = Input(shape=(120, 160, 3), name='img_in')
    x = img_in
    # 24 features, 5x5 pixel kernel (convolution, feature) window,
    # 2x2 stride, relu activation
    x = Convolution2D(24, (5, 5), strides=(2, 2), activation='relu')(x)
    # 32 features, 5x5 kernel window, 2x2 stride, relu activation
    x = Convolution2D(32, (5, 5), strides=(2, 2), activation='relu')(x)
    # 64 features, 5x5 kernel window, 2x2 stride, relu
    x = Convolution2D(64, (5, 5), strides=(2, 2), activation='relu')(x)
    # 64 features, 3x3 kernel window, 2x2 stride, relu
    x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x)
    # 64 features, 3x3 kernel window, 1x1 stride, relu
    x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x)

    # Possibly add MaxPooling (will make it less sensitive to position in
    # image). Camera angle is fixed, so it may not be needed.

    # Flatten to 1D (fully connected)
    x = Flatten(name='flattened')(x)
    # Classify the data into 100 features, make all negatives 0
    x = Dense(100, activation='relu')(x)
    x = Dropout(.1)(x)
    x = Dense(50, activation='relu')(x)
    # Randomly drop out 10% of the neurons (prevent overfitting)
    x = Dropout(.1)(x)

    # Categorical output of the angle: connect every input with every output
    # and produce 15 units. Softmax gives a probability (0.0-1.0) per category;
    # the best category is the one with the highest value.
    angle_out = Dense(15, activation='softmax', name='angle_out')(x)

    # Continuous output of throttle: reduce to 1 number, positive only.
    throttle_out = Dense(1, activation='relu', name='throttle_out')(x)

    angle_cat_array = np.array([linear_bin(a) for a in angle_array])

    model = Model(inputs=[img_in], outputs=[angle_out, throttle_out])
    model.compile(optimizer='adam',
                  loss={
                      'angle_out': 'categorical_crossentropy',
                      'throttle_out': 'mean_absolute_error'
                  },
                  loss_weights={
                      'angle_out': 0.9,
                      'throttle_out': .001
                  })
    model.fit({'img_in': images}, {
        'angle_out': angle_cat_array,
        'throttle_out': throttle_array
    },
              batch_size=32,
              epochs=100,
              verbose=1,
              validation_split=0.2,
              shuffle=True,
              callbacks=callbacks_list)