def set_samples(self, sample_bboxs):
    timer = common.Timer()
    bboxs = self.get_bbox_array(sample_bboxs)
    self.sample_bbox.set_value(bboxs)
    self.sample_bbox_list = sample_bboxs
    logging.debug("Took %i ms to set_samples" % timer.current_ms())
    return bboxs

def predict_output(self, dataset):
    dataset_x, dataset_y, dataset_size = dataset.export(self.batch_size)

    #dummy call to build function
    self.predict_output_step(dataset_x[:self.batch_size])

    #evaluate function
    timer = common.Timer()
    n = math.ceil(dataset_size / self.batch_size)
    pr = []
    for index in range(n):
        data_x = dataset_x[index * self.batch_size:(index + 1) * self.batch_size]
        pr_batch = self.predict_output_step(data_x)
        pr.append(pr_batch)

    pr = numpy.concatenate(pr, axis=0)
    logging.verbose("Prediction took %.3f sec for %i samples" % (timer.current(), pr.shape[0]))

    #crop dummy data
    if (dataset_size % self.batch_size) != 0:
        s = [dataset_size] + list(pr.shape[1:])
        pr.resize(tuple(s), refcheck=False)

    return pr

def get_bbox_array(self, sample_bboxs):
    timer = common.Timer()
    bboxs = numpy.zeros((self.batch_size, self.sample_num, self.sample_num, 4), dtype=numpy.float32)
    c_code.build_bbox_array(sample_bboxs, bboxs)
    logging.debug("Took %i ms to get_bbox_array" % timer.current_ms())
    return bboxs

def run(self):
    logging.info("Exporting subset (%i/%i)" % (self.subset + 1, self.data.subset_num))
    timer = common.Timer()
    self.data.load_from_subset(self.subset)
    timer.mark()
    self.data_export = self.data.export(self.batch_size)
    timer.mark()
    logging.info("Finished exporting subset (%i/%i)" % (self.subset + 1, self.data.subset_num),
                 "- load took %i sec, export took %i sec" % (timer.delta(0), timer.delta(1)))

def get_samples(self, data_x, train=False, store_shared=False):
    global profile

    if self.corner_func is None:
        logging.verbose("Building corner function - store samples:", store_shared, "train:", train)
        updates = [(self.corner_layer.sample_shared, self.corner_layer.sample)] if store_shared else []
        self.corner_func = theano.function([self.model_input],
                                           self.corner_layer.corner_pr,
                                           updates=updates,
                                           profile=profile,
                                           givens=[(get_train(), tensor.cast(int(train), 'int8'))],
                                           on_unused_input='ignore')

    #find corners
    timer = common.Timer()
    logging.debug("Running corner function")
    corner_pr = self.corner_func(data_x)

    if profile:
        logging.debug("Profiling corner function")
        theano_util.profile(self.corner_func, 10, data_x)
        theano_util.export_graph("./corner.graph", self.corner_func)
        logging.debug("Done")
        exit(0)

    #build sampling bounding boxs
    timer.mark()
    logging.debug("Build samples (%i threads)" % self.thread_num)
    samples = c_code.build_samples(self.thread_num, corner_pr, self.corner_threshold, self.sample_num,
                                   self.corner_max, self.local_max, self.nms_threshold)
    timer.mark()
    logging.verbose("Took %i ms to get_samples (%i model, %i build, %i max corners)" %
                    (timer.current_ms(), timer.delta_ms(0), timer.delta_ms(1), self.corner_max))
    return samples

def train_epoch(self, dataset, epoch, learning_rate, momentum=[0, 1, 0], decay=0.0, solver_mode="sgd"):
    #train over batches (assumes dataset size is a multiple of batch_size!)
    logging.info("Evaluating training function")
    dataset_x, dataset_m, dataset_size = dataset.export(self.batch_size)
    index_num = math.ceil(dataset_size / self.batch_size)
    total_cost = 0
    for index in range(index_num):

        #upload data to GPU and perform train step
        timer = common.Timer()
        data_x = dataset_x[index * self.batch_size:(index + 1) * self.batch_size]
        data_m = dataset_m[index * self.batch_size:(index + 1) * self.batch_size]
        cost, _ = self.train_step(data_x, data_m, epoch, self.iteration, learning_rate, momentum, decay)

        #watch out for GPUs randomly producing NaN!
        if math.isnan(cost):
            raise Exception("ERROR: Cost is NaN")

        logging.verbose("Batch %i.%i - iteration: %i cost:" % (epoch, index * self.batch_size, self.iteration),
                        cost, "took: %i ms" % timer.current_ms())
        total_cost += cost
        self.iteration += 1

    return total_cost

def main():
    #load arguments:
    parser = argparse.ArgumentParser(description='Train a convolutional network using labelled data.')
    logging.add_arguments(parser)
    parser.add_argument("--model", required=False, default=None, help="Model to continue training.")
    parser.add_argument("--cost-factors", default=[], nargs="+", help="Multiplicative factors for model costs")
    parser.add_argument("--thread-num", type=int, default=1,
                        help="Number of threads to use for supported operations (e.g. loading/distorting datasets)")
    parser.add_argument("--extension", default="ppm", help="Image file extension")
    parser.add_argument("--train", default=None, help="The folder with training / validation data")
    parser.add_argument("--test", default=None, help="The folder with testing data (optional)")
    parser.add_argument("--test-epochs", type=int, default=1, help="Epochs between each test evaluation")
    parser.add_argument("--test-mode", default="default", help="Mode to use for testing")
    parser.add_argument("--border-mode", default="valid",
                        help="Border mode for convolutional layers (full, valid)")
    parser.add_argument("--output-prefix", default="./model", help="Output prefix for model files")
    parser.add_argument("--activation", default="relu",
                        help="Activation function used in convolution / hidden layers (tanh, relu, leaky-relu)")
    parser.add_argument("--solver", type=str, default="nesterov", help="Solver used for training updates")
    parser.add_argument("--weight-init", nargs="+", default=["he-backward"], help="Weight initialization scheme")
    parser.add_argument("--learn-rate", type=float, default=0.1, help="Learning rate for weights and biases.")
    parser.add_argument("--learn-momentum", type=float, default=[0.0, 0.0], nargs="+",
                        help="Learning momentum for weights and biases (0.0 - 1.0).")
    parser.add_argument("--learn-anneal", type=float, default=1,
                        help="Annealing factor per epoch for weight and bias learning rate")
    parser.add_argument("--learn-anneal-epochs", nargs="+", type=int, default=[],
                        help="Epochs to apply learning rate annealing (default every epoch)")
    parser.add_argument("--learn-decay", type=float, default=0.0,
                        help="L2 weight decay (not applied to biases).")
    parser.add_argument("--epochs", type=int, default=30, help="The number of training epochs")
    parser.add_argument("--max-samples", type=int, default=None, help="Maximum samples to load from training set")
    parser.add_argument("--batch-size", type=int, default=32, help="Size of processing batches")
    parser.add_argument("--seed", type=int, default=23455, help="Random seed for weights")
    parser.add_argument("--distort-mode", default=[], nargs="+",
                        help="Distortions to apply to training data (default, cifar10, disable)")
    parser.add_argument("--disable-intermediate", default=False, action="store_true",
                        help="Disable outputting of intermediate model files")
    parser.add_argument("--augment-mirror", default=False, action="store_true",
                        help="Augment training data with horizontally mirrored copies")
    parser.add_argument("--skip-train", default=False, action="store_true", help="Skip training of model")
    parser.add_argument("--skip-layer-updates", type=int, nargs="+", default=[],
                        help="Skip training updates to specified layers")
    parser.add_argument("--model-desc", nargs="+", type=str,
                        default=["C[100,7]", "P[2]", "C[150,4]", "P[2]", "C[250,4]", "P[2]", "C[300,1]", "R"],
                        help="Network layer description")
    args = parser.parse_args()
    logging.init(args)

    #set random seeds
    random.seed(args.seed)
    numpy.random.seed(args.seed)

    #load training dataset
    logging.info("Loading training data:", args.train)
    train_data = dataset.load(args.train, args.extension, is_training=True, thread_num=args.thread_num)
    data_shape = train_data.get_data_shape()
    class_num = train_data.get_class_num()
    class_labels = train_data.class_labels
    logging.info("Found %i class labels:\n" % class_num, class_labels)

    #hack for reducing training data size
    if args.max_samples is not None:
        train_data.data = random.sample(train_data.data, args.max_samples)

    #mirror training data
    if args.augment_mirror:
        train_data.augment_mirror()
    logging.info("Training: %i samples" % len(train_data))

    #load test dataset
    if args.test:
        logging.info("Loading test: " + args.test)
        test_data = dataset.load(args.test, args.extension, is_training=False,
                                 thread_num=args.thread_num, class_labels=class_labels)

    #initialize model
    model = model_cnn.initialize(args, data_shape, class_labels, class_num)
    model.build_train_func(args.solver, args.cost_factors)

    #run training
    best_test_error = 100.0
    learn_rate = args.learn_rate
    for epoch in range(args.epochs):
        logging.info("----- Training Epoch: %i -----" % epoch)

        #perform training
        if not args.skip_train:
            logging.info("Training with solver " + args.solver + ", learning rate " + str(learn_rate) +
                         " and momentum " + str(args.learn_momentum))

            #shuffle dataset:
            train_data.shuffle()
            for subset in range(train_data.subset_num):
                timer = common.Timer()
                train_data.load_from_subset(subset)

                logging.info("Performing Gradient Descent...")
                cost = model.train_epoch(train_data, epoch, learn_rate, args.learn_momentum, args.learn_decay)
                nbatch = math.ceil(len(train_data) / model.batch_size)
                logging.info("Training subset %i - Cost: %.3f, Took %.1f sec" % (subset, cost, timer.current()))

            if len(args.learn_anneal_epochs) == 0 or (epoch + 1) in args.learn_anneal_epochs:
                logging.verbose("Annealing learning rate")
                learn_rate *= args.learn_anneal

        #perform testing
        test_error = 0
        if args.test is not None and ((epoch % args.test_epochs) == 0 or epoch == (args.epochs - 1)):
            test_error, test_class_errors = compute_error(test_data, model)
            logging.info("Epoch %i test error: %.2f%% (%i samples)" %
                         (epoch, test_error, int(test_error * len(test_data) / 100.0)))
            save_results(args.output_prefix + "_epoch%03i.test" % epoch, test_error, test_class_errors)

        #save intermediate models
        if not args.disable_intermediate:
            model_cnn.save_to_file(model, args.output_prefix + "_epoch%03i.mdl.gz" % epoch)

    #save final model
    model_cnn.save_to_file(model, args.output_prefix + "_epoch%03i_final.mdl.gz" % epoch)
    logging.info("Finished Training")

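# Usage sketch (not from the source): assuming this file is invoked directly as a training
# script (the entry-point name "train.py" is a guess), a typical single-machine run using
# only the argparse options defined above might look like:
#
#   python train.py --train ./data/train --test ./data/test --extension ppm \
#       --epochs 30 --batch-size 32 --learn-rate 0.1 --learn-anneal 0.95 \
#       --solver nesterov --output-prefix ./model
#
# All flags shown are defined by the parser above; the script filename and the example
# values are assumptions for illustration only.
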
def get_detections(self, model, data_x, data_m, params):
    pr_threshold = params.get("prThreshold", 0.01)
    nms_threshold = params.get("nmsThreshold", 0.5)
    corner_threshold = params.get("cornerThreshold", self.sparse_layer.corner_threshold)
    corner_max = params.get("cornerMax", 1024)
    t = (pr_threshold, nms_threshold, corner_threshold, corner_max)
    logging.verbose("Using detection params - pr threshold: %f, nms threshold: %f, corner_threshold: %f, corner_max: %i" % t)

    first_detect = False
    if self.detect_func is None:

        #get all model outputs
        outputs = []
        outputs.append(self.det_pr)
        if self.use_bbox_reg:
            outputs.append(self.bbox_reg)

        logging.info("Building detection function")
        self.detect_func = theano.function([model.input], outputs,
                                           givens=[(get_train(), tensor.cast(0, 'int8'))],
                                           on_unused_input='ignore')

        logging.verbose("Exporting graph...")
        with open("detect_graph.txt", "w") as f:
            theano.printing.debugprint(self.detect_func, file=f, print_type=True)
        first_detect = True

    #get sampling bounding boxs
    logging.verbose("Detecting sample bboxs (%.2f)" % corner_threshold)
    timer = common.Timer()
    sample_bboxs = self.sparse_layer.get_samples(data_x, train=False, store_shared=True)
    timer.mark()
    logging.verbose("Found sample bboxs: {}".format([len(bbox) for bbox in sample_bboxs]))

    #upload sampling bounding boxs
    bboxs = self.sparse_layer.set_samples(sample_bboxs)
    timer.mark()

    #classify sampling bounding boxs
    r = list(self.detect_func(data_x))

    #get outputs
    det_pr = r[0]
    r_index = 1
    if self.use_bbox_reg:
        bbox_reg = r[r_index]
        r_index += 1

        #update bbox array
        bboxs_cx = 0.5 * (bboxs[:, :, :, 0] + bboxs[:, :, :, 2])
        bboxs_cy = 0.5 * (bboxs[:, :, :, 1] + bboxs[:, :, :, 3])
        bboxs_w = bboxs[:, :, :, 2] - bboxs[:, :, :, 0]
        bboxs_h = bboxs[:, :, :, 3] - bboxs[:, :, :, 1]
        predict_cx = bbox_reg[:, 0, :, :] * bboxs_w + bboxs_cx
        predict_cy = bbox_reg[:, 1, :, :] * bboxs_h + bboxs_cy
        predict_w = numpy.exp(bbox_reg[:, 2, :, :]) * bboxs_w
        predict_h = numpy.exp(bbox_reg[:, 3, :, :]) * bboxs_h
        bboxs[:, :, :, 0] = predict_cx - predict_w * 0.5
        bboxs[:, :, :, 1] = predict_cy - predict_h * 0.5
        bboxs[:, :, :, 2] = predict_cx + predict_w * 0.5
        bboxs[:, :, :, 3] = predict_cy + predict_h * 0.5

    timer.mark()
    detlists = c_code.build_detections_nms(pr_threshold, nms_threshold, det_pr, bboxs,
                                           [len(s) for s in sample_bboxs])
    timer.mark()

    logging.verbose("Found detections:", [len(detlist) for detlist in detlists])
    logging.verbose("FPS=%.1f, Timing (ms) - get samples: %i, upload: %i, classify: %i, build+nms %i" %
                    tuple([self.batch_size / timer.current()] + timer.deltas_ms()))

    if not first_detect:
        global detect_time, detect_num
        detect_time += timer.current()
        detect_num += self.batch_size
        logging.info("Average FPS=%.1f" % (detect_num / detect_time))

    #results format
    results = []
    for i, detlist in enumerate(detlists):
        results.append({"detections": detlist, "meta": data_m[i]})
    return results

def get_target(self, model, samples, metas):
    timer = common.Timer()

    #build sample targets (default every sample to the null class)
    det_pr = numpy.zeros(self.det_shape, dtype=numpy.float32)
    det_pr[:, self.null_class, ...] = 1.0
    if self.use_bbox_reg:
        bbox_valid = numpy.zeros((self.batch_size, self.sample_num, self.sample_num), dtype=numpy.float32)
        bbox_reg = numpy.ones((self.batch_size, 8, self.sample_num, self.sample_num), dtype=numpy.float32)

    for b, meta in enumerate(metas):
        samples = [bbox for _, bbox in self.sparse_layer.sample_bbox_list[b]]
        if len(meta["bbox"]) > 0 and len(samples) > 0:
            overlap = theano_util.get_overlap_iou(meta["bbox"], samples)
            bbox_indexs, sample_indexs = numpy.where(overlap > self.overlap_threshold)
            for obj, index in zip(bbox_indexs.tolist(), sample_indexs.tolist()):
                sample_i = index % self.sparse_layer.sample_num
                sample_j = index // self.sparse_layer.sample_num
                sample_cls = meta["class"][obj]
                sample_bbox = samples[index]
                det_pr[b, sample_cls, sample_j, sample_i] = 1.0
                det_pr[b, self.null_class, sample_j, sample_i] = 0.0

            if self.use_bbox_reg:
                #channels 0-3 hold the target box (cx, cy, w, h), channels 4-7 the sample box (cx, cy, w, h)
                overlap_max = overlap.argmax(axis=0)
                for index in range(len(samples)):
                    obj = overlap_max[index]
                    if overlap[obj, index] <= self.overlap_threshold:
                        continue

                    sample = samples[index]
                    target = meta["bbox"][obj]
                    sample_i = index % self.sparse_layer.sample_num
                    sample_j = index // self.sparse_layer.sample_num
                    bbox_valid[b, sample_j, sample_i] = 1.0
                    bbox_reg[b, 0, sample_j, sample_i] = 0.5 * (target[0] + target[2])
                    bbox_reg[b, 1, sample_j, sample_i] = 0.5 * (target[1] + target[3])
                    bbox_reg[b, 2, sample_j, sample_i] = target[2] - target[0]
                    bbox_reg[b, 3, sample_j, sample_i] = target[3] - target[1]
                    bbox_reg[b, 4, sample_j, sample_i] = 0.5 * (sample[0] + sample[2])
                    bbox_reg[b, 5, sample_j, sample_i] = 0.5 * (sample[1] + sample[3])
                    bbox_reg[b, 6, sample_j, sample_i] = sample[2] - sample[0]
                    bbox_reg[b, 7, sample_j, sample_i] = sample[3] - sample[1]

    #normalize probabilities
    det_pr /= det_pr.sum(axis=1)[:, None, ...]

    #normalize by number of samples
    nfactor = self.sample_num * self.sample_num
    det_pr /= nfactor
    if self.use_bbox_reg:
        bbox_valid /= nfactor

    #pack indices / values
    yt_value = det_pr.flatten()
    if self.use_bbox_reg:
        yt_value = numpy.concatenate((yt_value, bbox_valid.flatten(), bbox_reg.flatten()))

    return numpy.array([], dtype=numpy.int64), yt_value

def main():
    #load arguments:
    parser = argparse.ArgumentParser(description='Train a convolutional network using labelled data')
    logging.add_arguments(parser)
    parser.add_argument("--use-acc-mode", default=False, action="store_true",
                        help="Use model accumulation over multiple batches (uses more VRAM)")
    parser.add_argument("--cost-factors", default=[], nargs="+", help="Multiplicative factors for model costs")
    parser.add_argument("--export-model-dims", default=False, action="store_true",
                        help="Ignore, don't use this option!")
    parser.add_argument("--model-dims", default="./model-dims.json", type=str,
                        help="Export file for shared model dimensions")
    parser.add_argument("--model-save-dt", default=30, type=int,
                        help="Minimum time (min) between saving an intermediate model. Use 0 to disable.")
    parser.add_argument("--model", required=False, default=None, help="Model to continue training.")
    parser.add_argument("--gpus", nargs="+", default=["gpu0"], help="List of GPUs to train over")
    parser.add_argument("--update-server", metavar="<addr> [port] [offset] [delta]", nargs="+", default=None,
                        help="Model update server for synchronizing multiple networked machines. "
                             "Set <addr> to 'mpi' for MPI networking.")
    parser.add_argument("--subset-max", type=int, default=10000000,
                        help="Specify maximum number of subsets to be used in each training epoch")
    parser.add_argument("--train", default=None, help="The folder with training / validation data")
    parser.add_argument("--test", default=None, help="The folder with testing data (optional)")
    parser.add_argument("--test-mode", default="default", help="Testing mode")
    parser.add_argument("--test-epochs", type=int, default=1, help="Epochs between each test evaluation")
    parser.add_argument("--thread-num", type=int, default=1,
                        help="Number of threads to use for supported operations (e.g. loading/distorting datasets)")
    parser.add_argument("--extension", default="ppm", help="Image file extension")
    parser.add_argument("--activation", default="relu",
                        help="Activation function used in convolution / hidden layers (tanh, relu, leaky-relu)")
    parser.add_argument("--border-mode", default="half",
                        help="Border mode for convolutional layers (full, valid)")
    parser.add_argument("--output-prefix", default="./model", help="Output prefix for model files")
    parser.add_argument("--solver", type=str, default="nesterov", help="Solver used for training updates")
    parser.add_argument("--weight-init", nargs="+", default=["he-backward"], help="Weight initialization scheme")
    parser.add_argument("--initial-tune", type=float, default=0.0,
                        help="Perform initial tuning with learning rate")
    parser.add_argument("--learn-rate", type=float, default=0.1, help="Learning rate for weights and biases.")
    parser.add_argument("--learn-momentum", type=float, default=[0.0, 0.0], nargs="+",
                        help="Learning momentum for weights and biases (0.0 - 1.0).")
    parser.add_argument("--learn-anneal", type=float, default=1,
                        help="Annealing factor per epoch for weight and bias learning rate")
    parser.add_argument("--learn-anneal-epochs", nargs="+", type=int, default=[],
                        help="Epochs to apply learning rate annealing (default every epoch)")
    parser.add_argument("--learn-decay", type=float, default=0.0,
                        help="L2 weight decay (not applied to biases).")
    parser.add_argument("--epochs", type=int, default=30, help="The number of training epochs")
    parser.add_argument("--epoch-start", type=int, default=0, help="Epoch to start from")
    parser.add_argument("--subset-start", type=int, default=0, help="Subset to start from")
    parser.add_argument("--max-samples", type=int, default=None, help="Maximum samples to load from training set")
    parser.add_argument("--batch-size", type=int, default=32, help="Size of each processing batch (per GPU)")
    parser.add_argument("--batch-size-factor", type=int, default=1,
                        help="Batch size multiplier, use when desired batch size won't fit in memory.")
    parser.add_argument("--batch-data-size", type=int, default=1,
                        help="Number of batches to upload to GPU for processing")
    parser.add_argument("--seed", type=int, default=23455, help="Random seed for weights")
    parser.add_argument("--split-seed", type=int, default=0,
                        help="Random seed for splitting into validation / training")
    parser.add_argument("--export-symbolic", default=None, help="Save datasets as symbolic links")
    parser.add_argument("--distort-mode", default=[], nargs="+",
                        help="Distortions to apply to training data (default, cifar10, disable)")
    parser.add_argument("--augment-mirror", default=False, action="store_true",
                        help="Augment training data with horizontally mirrored copies")
    parser.add_argument("--skip-train", default=False, action="store_true", help="Skip training of model")
    parser.add_argument("--skip-layer-updates", type=int, nargs="+", default=[],
                        help="Skip training updates to specified layers")
    parser.add_argument("--model-desc", nargs="+", type=str,
                        default=["C100,7", "P2", "C150,4", "P2", "C250,4", "P2", "C300,1", "CR"],
                        help="Network layer description")
    parser.add_argument("--theano-flags", type=str, default="lib.cnmem=1.0",
                        help="Additional THEANO_FLAGS environment variables for worker threads")
    parser.add_argument("--restart", default=False, action="store_true", help="Restart training of model")
    args = parser.parse_args()
    logging.init(args)

    #continue training
    args_fname = "./train.args"
    if args.restart:
        args = load_restart_args(args_fname, args)
    else:
        logging.info("Exporting arguments:", args_fname)
        with open(args_fname, "wb") as f:
            pickle.dump(args, f)

    #start MPI update server if this is the master node:
    if args.update_server is not None and args.update_server[0] == "mpi":
        from mpi4py import MPI
        if MPI.COMM_WORLD.Get_rank() == 0:
            momentum = float(args.update_server[1])
            update_server = UpdateServer(args.model_dims, momentum=momentum, use_mpi=True, use_async=True)
            sys.exit(update_server.start())

    #set random seeds
    random.seed(args.seed)
    numpy.random.seed(args.seed)

    #load training dataset
    logging.info("Loading training data: " + str(args.train))
    train_data = dataset.load(args.train, args.extension, is_training=True, thread_num=args.thread_num)
    data_shape = train_data.get_data_shape()
    class_num = train_data.get_class_num()
    class_labels = train_data.class_labels
    logging.info("Found %i samples across %i class labels:\n" % (train_data.get_total_size(), class_num),
                 class_labels)

    #HACK to determine model parameter dimensions for shared models without initializing theano...
    #No longer needed in theano-0.8.0
    if not os.path.isfile(args.model_dims):
        logging.info("Exporting model dims file to " + args.model_dims)
        import model_cnn
        model = model_cnn.initialize(args, data_shape, class_labels, class_num)
        model.build_train_func(args.solver, skip_build=True)
        shared.ModelUpdate.save_dims(args.model_dims, model)
        logging.info("Done")
        exit(0)

    #construct worker processes (must be done before model due to Theano init! No longer true in theano 0.8.0):
    logging.info("Initializing worker procs for", args.gpus)
    workers = [WorkerProcess(gpu, args, data_shape, class_labels) for gpu in args.gpus]

    #initialize model (and Theano)
    import model_cnn
    model = model_cnn.initialize(args, data_shape, class_labels, class_num)
    model.build_train_func(args.solver, skip_build=True)

    #mirror training data
    if args.augment_mirror:
        train_data.augment_mirror()

    #load test dataset
    if args.test:
        logging.info("Loading test: " + str(args.test))
        test_data = dataset.load(args.test, args.extension, is_training=False,
                                 class_labels=class_labels, thread_num=args.thread_num)
        logging.info("Testing: " + str(test_data.get_total_size()) + " samples")
        assert test_data.get_total_size() != 0

    #connect with update server
    if args.update_server is not None:
        addr = args.update_server[0]
        use_mpi = bool(addr == "mpi")
        use_async = bool(len(args.update_server) == 2)
        port = 0 if use_mpi else int(args.update_server[1])
        offset = 0 if use_async else int(args.update_server[2])
        delta = 0 if use_async else int(args.update_server[3])
        logging.info("Connecting to update server (async=%i, mpi=%i): " % (use_async, use_mpi), addr, port)
        sock = 0 if use_mpi else socket.create_connection((addr, port))
        update_client = UpdateClient(args.epoch_start, args.subset_start, train_data.subset_num,
                                     sock, use_async, use_mpi, offset, delta)
    else:
        update_client = UpdateClient(args.epoch_start, args.subset_start, train_data.subset_num)

    #perform initial fine tune
    if args.initial_tune > 0:
        logging.info("----- Initial Fine Tune -----")
        logging.info("Running initial tune with learning rate:", args.initial_tune)
        run_train_epoch(args, update_client, workers, model, train_data, args.initial_tune)

    #anneal learning rate up to the starting epoch
    learn_rate = args.learn_rate
    for epoch in range(0, args.epoch_start):
        if len(args.learn_anneal_epochs) == 0 or (epoch + 1) in args.learn_anneal_epochs:
            logging.verbose("Annealing learning rate")
            learn_rate *= args.learn_anneal

    #run training
    best_test_error = 100.0
    for epoch in range(args.epoch_start, args.epochs):
        logging.info("----- Training Epoch: " + str(epoch) + " -----")

        #perform training and save models
        if not args.skip_train:
            logging.info("Training with learning rates " + str(learn_rate) + " and momentum " +
                         str(args.learn_momentum))
            timer = common.Timer()
            cost = run_train_epoch(args, update_client, workers, model, train_data, learn_rate)
            logging.info("Training - mean cost:", cost, ", took %.0f sec" % timer.current())

        #anneal learning rate
        if len(args.learn_anneal_epochs) == 0 or (epoch + 1) in args.learn_anneal_epochs:
            logging.verbose("Annealing learning rate")
            learn_rate *= args.learn_anneal

        #perform testing
        test_error = 0
        if args.test and ((epoch % args.test_epochs) == 0 or epoch == (args.epochs - 1)):
            ts = time.time()
            test_error, test_class_errors = compute_error(workers, model, test_data)
            logging.info("Epoch %i Test Error: %.2f%%, Took %.0f sec" % (epoch, test_error, time.time() - ts))
            save_results(args.output_prefix + "_epoch%03i.test" % epoch, test_error, test_class_errors)

    logging.info("Finished Training")

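# Usage sketch (not from the source): assuming this multi-GPU trainer is invoked directly as a
# script (the entry-point name "train_multi.py" is a guess), a two-GPU run using only the
# argparse options defined above might look like:
#
#   python train_multi.py --train ./data/train --gpus gpu0 gpu1 \
#       --batch-size 32 --batch-size-factor 2 --epochs 30 --learn-rate 0.1 \
#       --model-dims ./model-dims.json --output-prefix ./model
#
# All flags shown are defined by the parser above; the script filename and the example
# values are assumptions for illustration only.
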
def run_train_epoch(args, update_client, workers, model, train_data, learn_rate):
    import model_cnn

    logging.info("Perform train...")
    batch_size_factor = args.batch_size_factor
    output_prefix = args.output_prefix
    model_dims = args.model_dims
    model_save_dt = args.model_save_dt * 60

    #update learning rates:
    for worker in workers:
        with worker.learn_rate.get_lock():
            worker.learn_rate.value = learn_rate

    #randomly shuffle data before each epoch, seed so every node has the same data order
    random.seed(args.seed + update_client.epoch)
    train_data.shuffle()

    #perform initial sync so that all nodes have the same model
    model_update = shared.ModelUpdate(model_dims)
    model_update.import_updates(model)
    # update_client.sync(model_update, workers, initial=True)

    #get next subset
    subset_next = update_client.get_subset_next()

    #start export of data
    batch_size = len(workers) * model.batch_size * batch_size_factor
    logging.info("SGD batch size is %ix%ix%i = %i" %
                 (batch_size_factor, len(workers), model.batch_size, batch_size))
    export_thread = DatasetExportThread(model, train_data, subset_next, batch_size, True)

    #start processing
    total_cost = 0
    total_it = 0
    subset_current = subset_next
    epoch_current = update_client.epoch
    for worker in workers:
        worker.set_epoch(epoch_current)

    timer = common.Timer()
    timer_save = common.Timer()
    while subset_next >= 0:

        #wait until export is ready
        timer.reset()
        export_thread.wait()
        data_x, data_y, data_size = export_thread.get_export()
        subset_current = subset_next
        del export_thread
        if timer.current() > 1:
            logging.warning("Warning: needed an additional %.1f seconds for dataset export" % timer.current())

        #print training classes for checking random seed etc
        logging.debug("Sample Metas: ", data_y[0:min(3, len(data_y))])

        #start exporting next subset
        subset_next = update_client.get_subset_next()
        if subset_next >= 0:
            export_thread = DatasetExportThread(model, train_data, subset_next, batch_size, True)

        # #store initial model before changes
        # model_update_delta = model_update.copy()

        logging.info("Evaluating training function")
        timer.reset()
        batch_num = data_x.shape[0] // model.batch_size
        it_num = batch_num // (len(workers) * batch_size_factor)
        index = 0
        subset_cost = 0
        while index < batch_num:
            total_ts = time.time()

            def train_worker_thread(worker, indexs):
                worker.wait()
                worker.model_write(model_update)
                worker.train_begin()
                for i in indexs:
                    dx = data_x[i * model.batch_size:(i + 1) * model.batch_size]
                    dy = data_y[i * model.batch_size:(i + 1) * model.batch_size]
                    worker.train_step(dx, dy)
                    worker.wait()
                worker.train_end()
                worker.model_read()
                worker.wait()

            threads = []
            for worker in workers:
                worker_indexs = []
                for _ in range(batch_size_factor):
                    if index < batch_num:
                        worker_indexs.append(index)
                        index += 1
                t = threading.Thread(target=train_worker_thread, args=(worker, worker_indexs))
                t.start()
                threads.append((t, time.time()))

            proc_ts = []
            for t, start_ts in threads:
                t.join()
                proc_ts.append(int(1000 * (time.time() - start_ts)))

            #average models between GPUs and print batch info
            combine_ts = time.time()
            batch_cost = 0
            model_update.set_mean_init()
            for worker in workers:
                model_update.set_mean_update(worker.model_update)
                with worker.cost.get_lock():
                    batch_cost += worker.cost.value
            model_update.set_mean_finish()
            batch_cost /= len(workers)
            subset_cost += batch_cost

            it_index = index // (len(workers) * batch_size_factor)
            combine_ts = int(1000 * (time.time() - combine_ts))
            logging.verbose("Processing times (ms):", proc_ts, ", Combine time: %i ms" % combine_ts)
            logging.info("Subset %i/%i, Batch It %i/%i" %
                         (subset_current + 1, train_data.subset_num, it_index, it_num),
                         "- Cost:", batch_cost, "Time: %i ms" % (1000 * (time.time() - total_ts)))

        logging.info("Training subset %i took %0.1f sec, mean cost:" % (subset_current + 1, timer.current()),
                     subset_cost / it_num)
        total_it += it_num
        total_cost += subset_cost

        #update with server (if one exists)
        model_update.export_updates(model)
        # model_update_delta.set_delta(model_update)
        # update_client.update(model_update_delta, model_update, workers)

        #save intermediate models
        if timer_save.current() > model_save_dt and model_save_dt > 0:
            model_cnn.save_to_file(model, output_prefix + "_epoch%03i_subset%03i.mdl.gz" %
                                   (epoch_current, subset_current + 1))
            timer_save.reset()

    #perform final sync so that all nodes have the same model
    update_client.sync(model_update, workers)

    #save final models
    model_cnn.save_to_file(model, output_prefix + "_epoch%03i_final.mdl.gz" % epoch_current)

    return total_cost / total_it

def get_detections(self, model, data_x, data_m, params):
    pr_threshold = params.get("prThreshold", 0.01)
    nms_threshold = params.get("nmsThreshold", 0.5)
    corner_threshold = params.get("cornerThreshold", self.sparse_layer.corner_threshold)
    corner_max = params.get("cornerMax", 1024)
    use_soft_nms = params.get("useSoftNMS", 0) == 1
    t = (pr_threshold, nms_threshold, corner_threshold, corner_max)
    logging.verbose("Using detection params - pr threshold: %f, nms threshold: %f, corner_threshold: %f, corner_max: %i" % t)

    first_detect = False
    if self.detect_func is None:

        #get all model outputs
        outputs = []
        if self.use_jointfit:
            det_fit = self.det_pr
            det_fit_null = det_fit[:, self.null_class, :, :]
            det_fit = det_fit[:, :self.class_num * self.fitness_num, :, :]
            det_fit = det_fit.reshape((self.batch_size, self.class_num, self.fitness_num,
                                       self.sample_num, self.sample_num))
            det_fit_pr = tensor.exp(det_fit)

            #marginalize over fitness bins (log-sum-exp) to get per-class detection log-probabilities
            m = tensor.max(det_fit, axis=2)
            det_pr = m + tensor.log(tensor.sum(tensor.exp(det_fit - m[:, :, None, :, :]), axis=2))
            det_pr = tensor.concatenate([det_pr, det_fit_null[:, None, :, :]], axis=1)
            outputs.append(det_pr)

            #expected fitness over the fitness bins (in log space)
            val = [self.overlap_threshold[0] + i * (1.0 - self.overlap_threshold[0]) / self.fitness_num
                   for i in range(self.fitness_num)]
            fitness_val = theano.shared(numpy.array(val, dtype=numpy.float32))
            fitness = tensor.log(tensor.sum(det_fit_pr * fitness_val[None, None, :, None, None], axis=2))
            outputs.append(fitness)
        else:
            outputs.append(self.det_pr)

        if self.use_bbox_reg:
            outputs.append(self.bbox_predict)

        if self.use_indfit:
            outputs.append(tensor.exp(self.indfit_pr))

        logging.info("Building detection function")
        self.detect_func = theano.function([model.input], outputs,
                                           givens=[(get_train(), tensor.cast(0, 'int8'))],
                                           on_unused_input='ignore')

        logging.verbose("Exporting graph...")
        with open("detect_graph.txt", "w") as f:
            theano.printing.debugprint(self.detect_func, file=f, print_type=True)
        first_detect = True

    #get sampling bounding boxs
    logging.verbose("Detecting sample bboxs (%.2f)" % corner_threshold)
    timer = common.Timer()
    sample_bboxs = self.sparse_layer.get_samples(data_x, train=False, store_shared=True)
    timer.mark()
    logging.verbose("Found sample bboxs: {}".format([len(bbox) for bbox in sample_bboxs]))

    #upload sampling bounding boxs
    bboxs = self.sparse_layer.set_samples(sample_bboxs)
    timer.mark()

    #classify sampling bounding boxs
    r = list(self.detect_func(data_x))

    #get outputs
    if self.use_jointfit:
        det_pr = r[0]
        fitness = r[1]
        r_index = 2
    else:
        det_pr = r[0]
        fitness = numpy.copy(det_pr)
        r_index = 1

    if self.use_bbox_reg:
        bboxs = r[r_index]
        r_index += 1
    else:
        bboxs = self.sparse_layer.get_bbox_array(sample_bboxs)

    if self.use_indfit:
        indfit_pr = r[r_index]
        fitness_val = numpy.array([0.0] + [self.overlap_threshold[0] +
                                           i * (1.0 - self.overlap_threshold[0]) / (self.fitness_num - 1)
                                           for i in range(self.fitness_num - 1)])
        fitness_exp = numpy.sum(indfit_pr * fitness_val[None, :, None, None], axis=1).astype(numpy.float32)
        fitness += numpy.log(fitness_exp)[:, None, :, :]
        r_index += 1

    timer.mark()
    sample_bbox_num = [len(s) for s in sample_bboxs]
    detlists = c_code.build_detections_nms(pr_threshold, nms_threshold, use_soft_nms, det_pr, fitness,
                                           bboxs, sample_bbox_num)
    timer.mark()

    logging.verbose("Found detections:", [len(detlist) for detlist in detlists])
    logging.verbose("FPS=%.1f, Timing (ms) - get samples: %i, upload: %i, classify: %i, build+nms %i" %
                    tuple([self.batch_size / timer.current()] + timer.deltas_ms()))

    if not first_detect:
        global detect_time, detect_num
        detect_time += timer.current()
        detect_num += self.batch_size
        logging.info("Average FPS=%.1f" % (detect_num / detect_time))

    #results format
    results = []
    for i, detlist in enumerate(detlists):
        results.append({"detections": detlist, "meta": data_m[i]})
    return results