def input_fn(filenames, channel='training', batch_size=32, num_epochs=1, perform_shuffle=False):
    """Build the input pipeline (SageMaker parameter-server variant).

    Parses libSVM-formatted lines ("label id:val id:val ...") into a
    features dict ``{"feat_ids", "feat_vals"}`` plus a float32 label.

    Args:
        filenames: one filename or a list of filenames (File mode only;
            ignored in Pipe mode, where data comes from ``channel``).
        channel: SageMaker channel name, e.g. 'training' (Pipe mode only).
        batch_size: examples per batch; the final partial batch is dropped.
            With multiple CPUs/GPUs this should be a multiple of the device
            count to fully utilize compute.
        num_epochs: number of passes over the data.
        perform_shuffle: shuffle with a 256-element buffer.
            NOTE(review): only honored in File mode; Pipe mode ignores it.

    Returns:
        File mode (``FLAGS.pipe_mode == 0``): a ``(features, labels)`` tensor
        pair pulled from a one-shot iterator.
        Pipe mode: the ``tf.data.Dataset`` itself.
    """
    print('Parsing', filenames)

    def decode_libsvm(line):
        # "label id:val id:val ..." -> ({"feat_ids", "feat_vals"}, label)
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        # Each "id:val" pair splits into an id column and a value column.
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    if FLAGS.pipe_mode == 0:
        # File mode: extract lines via the Dataset API; accepts a single
        # filename or a filename list. Multi-thread parse, then prefetch.
        dataset = tf.data.TextLineDataset(filenames).map(
            decode_libsvm, num_parallel_calls=10).prefetch(500000)
        # Randomizes input using a window of 256 elements (read into memory)
        # to keep epochs from blending together.
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)
        dataset = dataset.repeat(num_epochs)
        # drop_remainder=True discards any final batch smaller than batch_size.
        dataset = dataset.batch(batch_size, drop_remainder=True)
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        return batch_features, batch_labels
    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')
        # Under the SageMaker PS strategy each training instance runs exactly
        # one worker and one ps, so the host count equals the worker count.
        # Shard the training set across hosts; the validation set is not
        # sharded.
        if channel == 'training':
            number_host = len(FLAGS.hosts)
            if number_host > 1:
                index = FLAGS.hosts.index(FLAGS.current_host)
                print("index is ", index)
                dataset = dataset.shard(number_host, index)
        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)
        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        return dataset
def input_fn(filenames='', channel='training', batch_size=32, num_epochs=1, perform_shuffle=False):
    """Build the input pipeline (Horovod variant).

    Parses libSVM-formatted lines ("label id:val id:val ...") into a
    features dict ``{"feat_ids", "feat_vals"}`` plus a float32 label.

    Args:
        filenames: one filename or a list of filenames (File mode only;
            ignored in Pipe mode, where data comes from ``channel``).
        channel: SageMaker channel name, e.g. 'training' (Pipe mode only).
        batch_size: examples per batch; the final partial batch is dropped.
        num_epochs: number of passes over the data.
        perform_shuffle: shuffle with a 256-element buffer.
            NOTE(review): only honored in File mode; Pipe mode ignores it.

    Returns:
        File mode (``FLAGS.pipe_mode == 0``): a ``(features, labels)`` tensor
        pair pulled from a one-shot iterator.
        Pipe mode: the ``tf.data.Dataset`` itself.
    """
    print('Parsing', filenames)

    def decode_libsvm(line):
        # "label id:val id:val ..." -> ({"feat_ids", "feat_vals"}, label)
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        # Each "id:val" pair splits into an id column and a value column.
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    print("pipe mode ", FLAGS.pipe_mode)
    if FLAGS.pipe_mode == 0:
        # File mode: extract lines via the Dataset API; accepts a single
        # filename or a filename list.
        dataset = tf.data.TextLineDataset(filenames)
        # Assumes SageMaker uses S3 FullReplicate, i.e. every channel's data
        # is replicated onto every training instance, so we shard directly by
        # each worker's global Horovod rank.
        dataset = dataset.shard(hvd.size(), hvd.rank())
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        # Multi-thread pre-process then prefetch.
        dataset = dataset.prefetch(500000)
        # Randomizes input using a window of 256 elements (read into memory)
        # to keep epochs from blending together.
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)
        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)
        # drop_remainder=True discards any final batch smaller than batch_size.
        dataset = dataset.batch(batch_size, drop_remainder=True)
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        return batch_features, batch_labels
    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')
        number_host = len(FLAGS.hosts)
        # Horovod + Pipe mode: when an instance runs several workers, each
        # worker should read a distinct channel whose data was pre-split.
        # Sharding is only needed when there are multiple instances AND more
        # workers than instances, so that the same channel read on different
        # hosts yields disjoint data.
        if number_host > 1 and hvd.size() > number_host:
            # Under SageMaker Horovod, FLAGS.current_host is identical on all
            # processes, so derive the host index from the global rank instead.
            index = hvd.rank() // FLAGS.worker_per_host
            dataset = dataset.shard(number_host, index)
        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)
        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        return dataset