def main(num_epoch):
    """Build the dataset, the AutoSTG network and its RunManager, then train.

    Every hyper-parameter is read from the module-level ``cfg`` object
    after ``system_init()`` has run.

    Args:
        num_epoch: forwarded unchanged to ``RunManager.train``.
    """
    system_init()

    data_cfg = cfg.data
    model_cfg = cfg.model
    trainer_cfg = cfg.trainer

    # load data
    dataset_fields = (
        'path', 'train_prop', 'valid_prop', 'num_sensors',
        'in_length', 'out_length', 'batch_size_per_gpu',
    )
    dataset_kwargs = {
        field: getattr(data_cfg, field) for field in dataset_fields
    }
    dataset = TrafficDataset(num_gpus=1, **dataset_kwargs)

    # Network arguments copied verbatim from the data / model sections.
    net_kwargs = {
        field: getattr(data_cfg, field)
        for field in ('in_length', 'out_length',
                      'in_channels', 'out_channels')
    }
    net_kwargs.update(
        (field, getattr(model_cfg, field))
        for field in ('hidden_channels', 'skip_channels', 'end_channels',
                      'layer_names', 'num_mixed_ops',
                      'candidate_op_profiles')
    )
    # The hidden-size lists are prefixed with sizes taken from the dataset.
    node_in = dataset.node_fts.shape[1]
    edge_in = dataset.adj_mats.shape[2]
    net = AutoSTG(
        node_hiddens=[node_in] + model_cfg.node_hiddens,
        edge_hiddens=[edge_in] + model_cfg.edge_hiddens,
        **net_kwargs
    )

    trainer_fields = (
        'arch_lr', 'arch_lr_decay_milestones', 'arch_lr_decay_ratio',
        'arch_decay', 'arch_clip_gradient',
        'weight_lr', 'weight_lr_decay_ratio', 'weight_decay',
        'weight_clip_gradient',
        'num_search_iterations', 'num_search_arch_samples',
        'num_train_iterations',
        'criterion', 'metric_names', 'metric_indexes', 'print_frequency',
    )
    trainer_kwargs = {
        field: getattr(trainer_cfg, field) for field in trainer_fields
    }
    # Hard-coded here instead of cfg.trainer.weight_lr_decay_milestones.
    trainer_kwargs['weight_lr_decay_milestones'] = [20, 40, 60, 80]

    run_manager = RunManager(
        name=model_cfg.name,
        net=net,
        dataset=dataset,
        device_ids=[0],
        **trainer_kwargs
    )

    run_manager.load(mode='train')
    run_manager.clear_records()
    run_manager.initialize()

    weight_param_count = run_manager._net.num_weight_parameters()
    print('# of params', weight_param_count)

    run_manager.train(num_epoch)
def run(self, hostlist_fname, popular_host_fraction,
        mean_think_time, users_per_popular_host,
        users_per_less_popular_host,
        connection_timeout_msecs, socket_timeout_msecs,
        results_dir="./results", run_duration_secs=60,
        config_dir="./config", pipe_port=7851):
    """Launch a single fixed-URL Rain run and persist its artifacts.

    Builds a ``FixedUrlTestConfig`` from the arguments, serializes it to
    JSON under ``config_dir``, hands that file to ``RunManager.run_rain``
    and writes the raw output plus the parsed per-track results under
    ``results_dir``.

    Args:
        hostlist_fname: stored as ``config.hostListFile``.
        popular_host_fraction: stored as ``config.popularHostFraction``.
        mean_think_time: stored as ``config.meanThinkTime``.
        users_per_popular_host: stored as ``config.usersPerPopularHost``.
        users_per_less_popular_host: stored as
            ``config.usersPerLessPopularHost``.
        connection_timeout_msecs: generator parameter
            ``connectionTimeoutMsecs``.
        socket_timeout_msecs: generator parameter ``socketTimeoutMsecs``.
        results_dir: directory for the log and result files; passed to
            ``self.create_dir`` first.
        run_duration_secs: stored as ``config.duration``.
        config_dir: directory for the JSON config file; passed to
            ``self.create_dir`` first.
        pipe_port: stored as ``config.pipePort``.
    """
    # Pre-reqs: make sure both output directories exist.
    self.create_dir(config_dir)
    self.create_dir(results_dir)

    # With a single Rain launch, load an entire block of ip's.
    config = FixedUrlTestConfig()
    config.hostListFile = hostlist_fname
    config.duration = run_duration_secs
    config.popularHostFraction = popular_host_fraction
    config.usersPerPopularHost = users_per_popular_host
    config.usersPerLessPopularHost = users_per_less_popular_host
    config.meanThinkTime = mean_think_time
    config.pipePort = pipe_port

    # Parameters for the workload generator.
    generator_params = FixedUrlGeneratorParameters()
    generator_params.connectionTimeoutMsecs = connection_timeout_msecs
    generator_params.socketTimeoutMsecs = socket_timeout_msecs
    config.generatorParameters = generator_params

    json_data = json.dumps(config, sort_keys=True,
                           default=FixedUrlTestConfig.to_json)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("[FixedUrlTestRunner] json config: {0}".format(json_data))

    run_classpath = ".:rain.jar:workloads/httptest.jar"
    run_config_filename = config_dir + "/" + \
        "run_fixed_url_config" + "_nodes.json"
    run_output_filename = results_dir + "/" + \
        "run_fixed_url_log" + "_nodes.txt"
    run_results_filename = results_dir + "/" + \
        "run_fixed_url_result" + "_nodes.txt"

    # Write the config file; the run manager is given its path.
    print("[FixedUrlTestRunner] Writing config file: {0}"
          .format(run_config_filename))
    # ``with`` closes (and therefore flushes) the file even if write fails.
    with open(run_config_filename, 'w') as config_file:
        config_file.write(json_data)

    run_output = RunManager.run_rain(run_config_filename, run_classpath)
    track_results = RainOutputParser.parse_output(run_output)

    # Thresholds are set on each track result before results are printed.
    for result in track_results:
        result.pct_overhead_ops_threshold = 10.0
        result.pct_failed_ops_threshold = 5.0
        # Desired 90th and 99th percentile thresholds: 500 ms = 0.5 s.
        # Threshold units = seconds.
        result.op_response_time_thresholds['FixedUrl'] = (0.5, 0.5)

    # Write out the raw run output.
    print("[FixedUrlTestRunner] Writing output: {0}"
          .format(run_output_filename))
    with open(run_output_filename, 'w') as run_output_file:
        run_output_file.write(run_output)

    # Write out the run results.
    print("[FixedUrlTestRunner] Writing results: {0}"
          .format(run_results_filename))
    with open(run_results_filename, 'w') as run_results_file:
        RainOutputParser.print_results(track_results, run_results_file)
        run_results_file.write("\n")
        # After the table for all tracks, one CSV line per operation with
        # the first two entries of its response-time tuple (the 90th and
        # 99th percentiles per the original comments).
        for result in track_results:
            for op_name, pctiles in result.op_response_times.items():
                run_results_file.write(
                    "{0},{1},{2},{3}\n".format(
                        result.name, op_name, pctiles[0], pctiles[1]))
def step_run(self, start_ip, num_apps_to_load, apps_powered_on,
             results_dir="./results", run_duration_secs=60,
             config_dir="./config"):
    """Run a series of Rain experiments over a growing number of hosts.

    Given a starting IP and a step size, e.g.:
    1) run servers on ip addresses 11.0.0.1 - 11.0.0.200
    2) with a step size of 10, run experiments on 11.0.0.1 - 10,
       11.0.0.1 - 20, ... 11.0.0.1 - 200

    For each step a ``HttpTestConfig`` is serialized to JSON under
    ``config_dir``, passed to ``RunManager.run_rain``, and the raw output
    and parsed results are written under ``results_dir``.

    Args:
        start_ip: stored as ``config.baseHostIp`` for every step and
            embedded in the generated file names.
        num_apps_to_load: step size; step ``i`` targets
            ``(i + 1) * num_apps_to_load`` hosts.
        apps_powered_on: total host count; the number of steps is
            ``apps_powered_on // num_apps_to_load``.
        results_dir: directory for the log and result files; passed to
            ``self.create_dir`` first.
        run_duration_secs: stored as ``config.duration``.
        config_dir: directory for the JSON config files; passed to
            ``self.create_dir`` first.
    """
    # Pre-reqs: make sure both output directories exist.
    self.create_dir(config_dir)
    self.create_dir(results_dir)

    # Floor division keeps the step count an int on Python 3 as well;
    # plain "/" would produce a float there and range() would raise.
    num_tests = apps_powered_on // num_apps_to_load
    for i in range(num_tests):
        # With one Rain launch an entire block of ip's is loaded; the
        # base IP stays the same and only the target count grows.
        config = HttpTestConfig()
        config.baseHostIp = start_ip
        config.numHostTargets = (i + 1) * num_apps_to_load
        config.duration = run_duration_secs

        json_data = json.dumps(config, sort_keys=True,
                               default=HttpTestConfig.to_json)
        # Single-argument print() behaves the same on Python 2 and 3.
        print("[HttpTestStepRunner] json config: {0}".format(json_data))

        run_classpath = ".:rain.jar:workloads/httptest.jar"
        step_tag = start_ip + "_" + str(config.numHostTargets)
        run_config_filename = config_dir + "/" + \
            "run_config_" + step_tag + "_nodes.json"
        run_output_filename = results_dir + "/" + \
            "run_log_" + step_tag + "_nodes.txt"
        run_results_filename = results_dir + "/" + \
            "run_result_" + step_tag + "_nodes.txt"

        # Write the config file; the run manager is given its path.
        print("[HttpTestStepRunner] Writing config file: {0}"
              .format(run_config_filename))
        # ``with`` closes (and flushes) the file even if a write fails.
        with open(run_config_filename, 'w') as config_file:
            config_file.write(json_data)

        run_output = RunManager.run_rain(run_config_filename,
                                         run_classpath)
        track_results = RainOutputParser.parse_output(run_output)

        # Write out the raw run output.
        print("[HttpTestStepRunner] Writing output: {0}"
              .format(run_output_filename))
        with open(run_output_filename, 'w') as run_output_file:
            run_output_file.write(run_output)

        # Write out the run results.
        print("[HttpTestStepRunner] Writing results: {0}"
              .format(run_results_filename))
        with open(run_results_filename, 'w') as run_results_file:
            RainOutputParser.print_results(track_results,
                                           run_results_file)
def run(self, hostlist_fname, popular_host_fraction,
        mean_think_time, users_per_popular_host,
        users_per_less_popular_host,
        connection_timeout_msecs, socket_timeout_msecs,
        results_dir="./results", run_duration_secs=60,
        config_dir="./config", pipe_port=7851):
    """Launch a single fixed-URL Rain run and persist its artifacts.

    Builds a ``FixedUrlTestConfig`` from the arguments, serializes it to
    JSON under ``config_dir``, hands that file to ``RunManager.run_rain``
    and writes the raw output plus the parsed per-track results under
    ``results_dir``.

    Args:
        hostlist_fname: stored as ``config.hostListFile``.
        popular_host_fraction: stored as ``config.popularHostFraction``.
        mean_think_time: stored as ``config.meanThinkTime``.
        users_per_popular_host: stored as ``config.usersPerPopularHost``.
        users_per_less_popular_host: stored as
            ``config.usersPerLessPopularHost``.
        connection_timeout_msecs: generator parameter
            ``connectionTimeoutMsecs``.
        socket_timeout_msecs: generator parameter ``socketTimeoutMsecs``.
        results_dir: directory for the log and result files; passed to
            ``self.create_dir`` first.
        run_duration_secs: stored as ``config.duration``.
        config_dir: directory for the JSON config file; passed to
            ``self.create_dir`` first.
        pipe_port: stored as ``config.pipePort``.
    """
    # Pre-reqs: make sure both output directories exist.
    self.create_dir(config_dir)
    self.create_dir(results_dir)

    # With a single Rain launch, load an entire block of ip's.
    config = FixedUrlTestConfig()
    config.hostListFile = hostlist_fname
    config.duration = run_duration_secs
    config.popularHostFraction = popular_host_fraction
    config.usersPerPopularHost = users_per_popular_host
    config.usersPerLessPopularHost = users_per_less_popular_host
    config.meanThinkTime = mean_think_time
    config.pipePort = pipe_port

    # Parameters for the workload generator.
    generator_params = FixedUrlGeneratorParameters()
    generator_params.connectionTimeoutMsecs = connection_timeout_msecs
    generator_params.socketTimeoutMsecs = socket_timeout_msecs
    config.generatorParameters = generator_params

    json_data = json.dumps(config, sort_keys=True,
                           default=FixedUrlTestConfig.to_json)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("[FixedUrlTestRunner] json config: {0}".format(json_data))

    run_classpath = ".:rain.jar:workloads/httptest.jar"
    run_config_filename = config_dir + "/" + \
        "run_fixed_url_config" + "_nodes.json"
    run_output_filename = results_dir + "/" + \
        "run_fixed_url_log" + "_nodes.txt"
    run_results_filename = results_dir + "/" + \
        "run_fixed_url_result" + "_nodes.txt"

    # Write the config file; the run manager is given its path.
    print("[FixedUrlTestRunner] Writing config file: {0}"
          .format(run_config_filename))
    # ``with`` closes (and therefore flushes) the file even if write fails.
    with open(run_config_filename, 'w') as config_file:
        config_file.write(json_data)

    run_output = RunManager.run_rain(run_config_filename, run_classpath)
    track_results = RainOutputParser.parse_output(run_output)

    # Thresholds are set on each track result before results are printed.
    for result in track_results:
        result.pct_overhead_ops_threshold = 10.0
        result.pct_failed_ops_threshold = 5.0
        # Desired 90th and 99th percentile thresholds: 500 ms = 0.5 s.
        # Threshold units = seconds.
        result.op_response_time_thresholds['FixedUrl'] = (0.5, 0.5)

    # Write out the raw run output.
    print("[FixedUrlTestRunner] Writing output: {0}"
          .format(run_output_filename))
    with open(run_output_filename, 'w') as run_output_file:
        run_output_file.write(run_output)

    # Write out the run results.
    print("[FixedUrlTestRunner] Writing results: {0}"
          .format(run_results_filename))
    with open(run_results_filename, 'w') as run_results_file:
        RainOutputParser.print_results(track_results, run_results_file)
        run_results_file.write("\n")
        # After the table for all tracks, one CSV line per operation with
        # the first two entries of its response-time tuple (the 90th and
        # 99th percentiles per the original comments).
        for result in track_results:
            for op_name, pctiles in result.op_response_times.items():
                run_results_file.write(
                    "{0},{1},{2},{3}\n".format(
                        result.name, op_name, pctiles[0], pctiles[1]))
network=['vae64'], shuffle=[True], num_workers=[5], loss='vae', model='vae64_1') train_set = SLFDatasetUnsampled(root_dir=os.path.join(ROOT, 'slf_mat'), csv_file=os.path.join(ROOT, 'details.csv'), total_data=500000) validation_set = SLFDatasetUnsampled( root_dir=os.path.join(VALIDATION_SET_PATH, 'slf_mat'), csv_file=os.path.join(VALIDATION_SET_PATH, 'details.csv'), total_data=5000) m = RunManager() vae_loss = VAELoss() mse_loss = torch.nn.MSELoss() for run in RunBuilder.get_runs(params): device = torch.device(run.device) network = networks[run.network]().to(run.device) loader = torch.utils.data.DataLoader(train_set, batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers) validation_loader = torch.utils.data.DataLoader( validation_set, batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers)
net_origin = nn.DataParallel(ResNet_ImageNet(depth=50, num_classes=run_config.data_provider.n_classes)) elif args.model=="mobilenetv2": assert args.dataset=='imagenet', 'mobilenetv2 only supports imagenet dataset' net = MobileNetV2(num_classes=run_config.data_provider.n_classes, cfg=eval(args.cfg)) if args.base_path!=None: weight_path = args.base_path+'/checkpoint/model_best.pth.tar' net_origin = nn.DataParallel(MobileNetV2(num_classes=run_config.data_provider.n_classes)) elif args.model=="mobilenet": assert args.dataset=='imagenet', 'mobilenet only supports imagenet dataset' net = MobileNet(num_classes=run_config.data_provider.n_classes, cfg=eval(args.cfg)) if args.base_path!=None: weight_path = args.base_path+'/checkpoint/model_best.pth.tar' net_origin = nn.DataParallel(MobileNet(num_classes=run_config.data_provider.n_classes)) # build run manager run_manager = RunManager(args.path, net, run_config) if args.local_rank == 0: run_manager.save_config(print_info=True) # load checkpoints if args.base_path!=None: weight_path = args.base_path+'/checkpoint/model_best.pth.tar' if args.resume: run_manager.load_model() if args.train and run_manager.best_acc == 0: loss, acc1, acc5 = run_manager.validate(is_test=True, return_top5=True) run_manager.best_acc = acc1 elif weight_path!=None and os.path.isfile(weight_path): assert net_origin != None, "original network is None" net_origin.load_state_dict(torch.load(weight_path)['state_dict']) net_origin = net_origin.module
# build net from args if 'proxyless' in args.net: from models.normal_nets.proxyless_nets import proxyless_base net_config_url = 'https://hanlab.mit.edu/files/proxylessNAS/%s.config' % args.net net = proxyless_base( net_config=net_config_url, n_classes=run_config.data_provider.n_classes, bn_param=(args.bn_momentum, args.bn_eps), dropout_rate=args.dropout, ) else: raise ValueError('do not support: %s' % args.net) # build run manager run_manager = RunManager(args.path, net, run_config, measure_latency=args.latency) run_manager.save_config(print_info=True) # load checkpoints best_model_path = '%s/checkpoint/model_best.pth.tar' % args.path if os.path.isfile(best_model_path): init_path = best_model_path else: init_path = '%s/init' % args.path if args.resume: run_manager.load_model() if args.train and run_manager.best_acc == 0: loss, acc1, acc5 = run_manager.validate(is_test=False, return_top5=True)
def main(args, myargs):
    """Build (or reload) a run config and network, optionally train, then evaluate.

    Steps, in order: seed the RNGs, load ``<args.path>/run.config`` if it
    exists (otherwise build the run config from ``args``), load
    ``<args.path>/net.config`` if it exists (otherwise build a proxyless
    network from ``args.net``), initialise weights, optionally train,
    evaluate, and write the metrics to ``<args.path>/output`` as JSON.

    Args:
        args: namespace of options; mutated in place when the run config
            is built from it (``lr_schedule_param``, ``opt_param``,
            ``no_decay_keys``).
        myargs: not used by this function.

    Raises:
        ValueError: if no ``net.config`` exists and ``args.net`` does not
            contain ``'proxyless'``.
    """
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed_all(args.manual_seed)
    np.random.seed(args.manual_seed)

    os.makedirs(args.path, exist_ok=True)

    # prepare run config
    run_config_path = '%s/run.config' % args.path
    if os.path.isfile(run_config_path):
        # load run config from file; ``with`` makes sure the handle is closed
        with open(run_config_path, 'r') as run_config_file:
            saved_run_config = json.load(run_config_file)
        run_config = ImagenetRunConfig(**saved_run_config)
        if args.valid_size:
            run_config.valid_size = args.valid_size
    else:
        # build run config from args
        args.lr_schedule_param = None
        args.opt_param = {
            'momentum': args.momentum,
            'nesterov': not args.no_nesterov,
        }
        # The literal string 'None' is mapped to a real None.
        if args.no_decay_keys == 'None':
            args.no_decay_keys = None
        run_config = ImagenetRunConfig(**args.__dict__)
    print('Run config:')
    for k, v in run_config.config.items():
        print('\t%s: %s' % (k, v))

    # prepare network
    net_config_path = '%s/net.config' % args.path
    if os.path.isfile(net_config_path):
        # load net from file
        from models import get_net_by_name
        with open(net_config_path, 'r') as net_config_file:
            net_config = json.load(net_config_file)
        net = get_net_by_name(net_config['name']).build_from_config(net_config)
    elif 'proxyless' in args.net:
        # build net from args
        from models.normal_nets.proxyless_nets import proxyless_base
        net_config_url = 'https://hanlab.mit.edu/files/proxylessNAS/%s.config' % args.net
        net = proxyless_base(
            net_config=net_config_url,
            n_classes=run_config.data_provider.n_classes,
            bn_param=(args.bn_momentum, args.bn_eps),
            dropout_rate=args.dropout,
        )
    else:
        raise ValueError('do not support: %s' % args.net)

    # build run manager
    run_manager = RunManager(args.path, net, run_config, measure_latency=args.latency)
    run_manager.save_config(print_info=True)

    # load checkpoints
    init_path = '%s/init' % args.path
    if args.resume:
        run_manager.load_model()
        # A resumed run with no recorded best accuracy gets one from a
        # validation pass before training continues.
        if args.train and run_manager.best_acc == 0:
            _, acc1, _ = run_manager.validate(is_test=False, return_top5=True)
            run_manager.best_acc = acc1
    elif os.path.isfile(init_path):
        if torch.cuda.is_available():
            checkpoint = torch.load(init_path)
        else:
            checkpoint = torch.load(init_path, map_location='cpu')
        # The file may hold either a bare state dict or a wrapping dict.
        if 'state_dict' in checkpoint:
            checkpoint = checkpoint['state_dict']
        run_manager.net.module.load_state_dict(checkpoint)
    elif 'proxyless' in args.net and not args.train:
        from utils.latency_estimator import download_url
        pretrained_weight_url = 'https://hanlab.mit.edu/files/proxylessNAS/%s.pth' % args.net
        print('Load pretrained weights from %s' % pretrained_weight_url)
        init_path = download_url(pretrained_weight_url)
        init = torch.load(init_path, map_location='cpu')
        net.load_state_dict(init['state_dict'])
    else:
        print('Random initialization')

    # train
    if args.train:
        print('Start training')
        run_manager.train(print_top5=True)
        run_manager.save_model()

    output_dict = {}
    # validate
    if run_config.valid_size:
        print('Test on validation set')
        loss, acc1, acc5 = run_manager.validate(is_test=False, return_top5=True)
        log = 'valid_loss: %f\t valid_acc1: %f\t valid_acc5: %f' % (loss, acc1, acc5)
        run_manager.write_log(log, prefix='valid')
        # NOTE(review): these three formats are ' % f' (leading space plus
        # the space sign flag) while the test metrics below use '%f'. Kept
        # byte-for-byte; confirm whether the difference is intentional.
        output_dict.update({
            'valid_loss': ' % f' % loss,
            'valid_acc1': ' % f' % acc1,
            'valid_acc5': ' % f' % acc5,
            'valid_size': run_config.valid_size,
        })

    # test
    print('Test on test set')
    loss, acc1, acc5 = run_manager.validate(is_test=True, return_top5=True)
    log = 'test_loss: %f\t test_acc1: %f\t test_acc5: %f' % (loss, acc1, acc5)
    run_manager.write_log(log, prefix='test')
    output_dict.update({
        'test_loss': '%f' % loss,
        'test_acc1': '%f' % acc1,
        'test_acc5': '%f' % acc5,
    })
    # ``with`` guarantees the metrics file is flushed and closed.
    with open('%s/output' % args.path, 'w') as output_file:
        json.dump(output_dict, output_file, indent=4)
elif args.model == "resnet50": assert args.dataset == 'imagenet', 'resnet50 only supports imagenet dataset' net = ResNet_ImageNet(depth=50, num_classes=run_config.data_provider.n_classes, cfg=eval(args.cfg)) elif args.model == "mobilenetv2": assert args.dataset == 'imagenet', 'mobilenetv2 only supports imagenet dataset' net = MobileNetV2(num_classes=run_config.data_provider.n_classes, cfg=eval(args.cfg)) elif args.model == "mobilenet": assert args.dataset == 'imagenet', 'mobilenet only supports imagenet dataset' net = MobileNet(num_classes=run_config.data_provider.n_classes, cfg=eval(args.cfg)) # build run manager run_manager = RunManager(args.path, net, run_config) # load checkpoints best_model_path = '%s/checkpoint/model_best.pth.tar' % args.path assert os.path.isfile(best_model_path), 'wrong path' if torch.cuda.is_available(): checkpoint = torch.load(best_model_path) else: checkpoint = torch.load(best_model_path, map_location='cpu') if 'state_dict' in checkpoint: checkpoint = checkpoint['state_dict'] run_manager.net.load_state_dict(checkpoint) output_dict = {} # test
def step_run(self, start_ip, num_apps_to_load, apps_powered_on,
             results_dir="./results", run_duration_secs=60,
             config_dir="./config"):
    """Run a series of Rain experiments over a growing number of hosts.

    Given a starting IP and a step size, e.g.:
    1) run servers on ip addresses 11.0.0.1 - 11.0.0.200
    2) with a step size of 10, run experiments on 11.0.0.1 - 10,
       11.0.0.1 - 20, ... 11.0.0.1 - 200

    For each step a ``HttpTestConfig`` is serialized to JSON under
    ``config_dir``, passed to ``RunManager.run_rain``, and the raw output
    and parsed results are written under ``results_dir``.

    Args:
        start_ip: stored as ``config.baseHostIp`` for every step and
            embedded in the generated file names.
        num_apps_to_load: step size; step ``i`` targets
            ``(i + 1) * num_apps_to_load`` hosts.
        apps_powered_on: total host count; the number of steps is
            ``apps_powered_on // num_apps_to_load``.
        results_dir: directory for the log and result files; passed to
            ``self.create_dir`` first.
        run_duration_secs: stored as ``config.duration``.
        config_dir: directory for the JSON config files; passed to
            ``self.create_dir`` first.
    """
    # Pre-reqs: make sure both output directories exist.
    self.create_dir(config_dir)
    self.create_dir(results_dir)

    # Floor division keeps the step count an int on Python 3 as well;
    # plain "/" would produce a float there and range() would raise.
    num_tests = apps_powered_on // num_apps_to_load
    for i in range(num_tests):
        # With one Rain launch an entire block of ip's is loaded; the
        # base IP stays the same and only the target count grows.
        config = HttpTestConfig()
        config.baseHostIp = start_ip
        config.numHostTargets = (i + 1) * num_apps_to_load
        config.duration = run_duration_secs

        json_data = json.dumps(config, sort_keys=True,
                               default=HttpTestConfig.to_json)
        # Single-argument print() behaves the same on Python 2 and 3.
        print("[HttpTestStepRunner] json config: {0}".format(json_data))

        run_classpath = ".:rain.jar:workloads/httptest.jar"
        step_tag = start_ip + "_" + str(config.numHostTargets)
        run_config_filename = config_dir + "/" + \
            "run_config_" + step_tag + "_nodes.json"
        run_output_filename = results_dir + "/" + \
            "run_log_" + step_tag + "_nodes.txt"
        run_results_filename = results_dir + "/" + \
            "run_result_" + step_tag + "_nodes.txt"

        # Write the config file; the run manager is given its path.
        print("[HttpTestStepRunner] Writing config file: {0}"
              .format(run_config_filename))
        # ``with`` closes (and flushes) the file even if a write fails.
        with open(run_config_filename, 'w') as config_file:
            config_file.write(json_data)

        run_output = RunManager.run_rain(run_config_filename,
                                         run_classpath)
        track_results = RainOutputParser.parse_output(run_output)

        # Write out the raw run output.
        print("[HttpTestStepRunner] Writing output: {0}"
              .format(run_output_filename))
        with open(run_output_filename, 'w') as run_output_file:
            run_output_file.write(run_output)

        # Write out the run results.
        print("[HttpTestStepRunner] Writing results: {0}"
              .format(run_results_filename))
        with open(run_results_filename, 'w') as run_results_file:
            RainOutputParser.print_results(track_results,
                                           run_results_file)
# Game bootstrap script: creates the shared manager and character objects,
# loads the level into the display manager's map, and starts the game.
import flags
from run_manager import RunManager
from character import Character
import level

# Publish the shared objects as attributes of the ``flags`` module. This
# must happen before the calls below, which read ``flags.run_manager``.
flags.run_manager = RunManager()
flags.character = Character()

# Load the level into the map owned by the run manager's display manager,
# then hand control to the run manager.
level.load_level(flags.run_manager.display_manager.map)
flags.run_manager.start_game()
net = ResNet_CIFAR(cfg=None, depth=56, num_classes=10, cutout=False) weight_path = 'Exp_base/resnet56_base/checkpoint/model_best.pth.tar' net_origin = nn.DataParallel( ResNet_CIFAR(depth=56, num_classes=10, cutout=False)) elif args.model == "resnet110": assert args.dataset == 'cifar10', 'resnet110 only supports cifar10 dataset' net = ResNet_CIFAR(cfg=None, depth=110, num_classes=10, cutout=False) weight_path = 'Exp_base/resnet110_base/checkpoint/model_best.pth.tar' net_origin = nn.DataParallel( ResNet_CIFAR(depth=110, num_classes=10, cutout=False)) net_origin.load_state_dict(torch.load(weight_path)['state_dict']) net_origin = net_origin.module base_flops = net.cfg2flops(net.config['cfg_base']) # build run manager run_manager = RunManager(args.path, net, run_config) if args.local_rank == 0: run_manager.save_config(print_info=True) # cfg to fitness cfg2fit_dict = {} def cfg2fitness(cfg): if args.local_rank == 0: print(str(cfg)) if str(cfg) in cfg2fit_dict.keys(): return cfg2fit_dict[str(cfg)] elif cfg == run_manager.net.module.config['cfg_base']: return 0. else: run_manager.run_config.n_epochs = run_manager.run_config.search_epoch
def main(args):
    """Retrain a searched architecture, either fresh or resumed.

    Two entry modes, selected by ``args.retrain_resume``:

    * resume: reload ``retrain.config`` from ``args.resume_file``, rebuild
      the network from the saved retrain checkpoint, and restore the
      optimizer, scheduler, best-monitor values and start epoch;
    * fresh: read ``search.config`` from ``args.checkpoint_file`` (only
      its ``random_seed`` and ``path`` entries are used here), and build
      the network from the architecture checkpoint of the search phase.

    Both modes finish by calling ``retrain_run_manager.train()`` and
    closing the logger. ``args`` is mutated in place (``path``,
    ``random_seed``, ``optimizer_config``, ``reg_loss_params``,
    ``actual_path``, ``cell_genotypes``).

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost. In particular the section from "optimizer
    params" through "reg_loss_params" is placed at function level (it
    could also have lived inside the non-resume ``else`` branch) -- confirm
    against the original file.

    Raises:
        AssertionError: if CUDA is unavailable or an expected config /
            checkpoint file is missing.
        ValueError: if ``args.search_space`` is not one of the handled
            names.
    """
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume:
        # Resume: restore the argument set saved by the last retrain run.
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the last retrain phase'.format(
            config_file_path)
        f = open(config_file_path, 'r')
        config_dict = json.load(f)
        f.close()
        configs_resume(args, config_dict, 'retrain')
        # get EXP_time in last_retrain for flag: the last component of the
        # saved path is reused as a suffix of the new experiment directory.
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(EXP_time_last_retrain))
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log(
            '=> loading configs {:} from the last retrain phase.'.format(
                config_file_path),
            mode='info')
        # Note: this chain does not accept 'counter', unlike the chain
        # further down.
        if args.search_space == 'autodeeplab':
            conv_candidates = autodeeplab
        elif args.search_space == 'proxyless':
            conv_candidates = proxyless
        elif args.search_space == 'my_search_space':
            conv_candidates = my_search_space
        else:
            raise ValueError('search space {:} is not supported'.format(
                args.search_space))
    else:
        # resume partial configs setting and arch_checkpoint from the search phase by default.
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the search phase'.format(
            config_file_path)
        f = open(config_file_path, 'r')
        config_dict = json.load(f)
        f.close()
        # Only the seed is taken over from the search-phase config.
        args.random_seed = config_dict['random_seed']
        # get EXP_time in search phase, for flag
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(EXP_time_search))
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log(
            '=> starting retrain from the search phase config {:}.'.format(
                config_file_path),
            mode='info')

    # optimizer params: only SGD and RMSprop get extra parameters.
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None

    # scheduler params
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None

    # criterion params
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None

    # Bundled onto args so that it reaches RunConfig via args.__dict__.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }

    # conv_candidates is indexed below (non-resume branch) to turn the
    # selected op indexes of each genotype into candidate entries.
    if args.search_space == 'autodeeplab':
        conv_candidates = autodeeplab
    elif args.search_space == 'proxyless':
        conv_candidates = proxyless
    elif args.search_space == 'counter':
        conv_candidates = counter
    elif args.search_space == 'my_search_space':
        conv_candidates = my_search_space
    else:
        raise ValueError('search_space : {:} is not supported'.format(
            args.search_space))

    # related to entropy constraint loss
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

    # create run_config from every attribute currently set on args
    run_config = RunConfig(**args.__dict__)

    #if args.open_test == False: # retrain and validate
    if args.open_vis:  # only open_vis in re-train phase, rather than both re-train and test.
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None

    if args.retrain_resume:
        logger.log(
            '=> Loading checkpoint from {:} of the last retrain phase'.format(
                args.resume_file),
            mode='info')
        # checkpoint_file from the last retrain phase.
        checkpoint_path = os.path.join(
            args.resume_file, 'checkpoints',
            'seed-{:}-retrain.pth'.format(args.random_seed))
        assert os.path.exists(
            checkpoint_path), 'cannot find retrain checkpoint file {:}'.format(
                checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path,
                                         normal_network,
                                         logger,
                                         run_config,
                                         vis,
                                         out_log=True)
        # Weights are loaded after the run manager has been constructed.
        normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager,
                                         logger)
        # Restore the training state saved with the checkpoint.
        retrain_run_manager.optimizer.load_state_dict(
            checkpoint['weight_optimizer'])
        retrain_run_manager.scheduler.load_state_dict(checkpoint['scheduler'])
        retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0]
        retrain_run_manager.best_monitor = checkpoint['best_monitor'][1]
        retrain_run_manager.start_epoch = checkpoint['start_epoch']
        logger.log(
            '=> loaded checkpoint file {:} from the last retrain phase, starts with {:}-th epoch'
            .format(checkpoint_path, checkpoint['start_epoch']),
            mode='info')
    else:
        # todo from the search phase, read the last arch_checkpoint, rather than the best one.
        arch_checkpoint_path = os.path.join(
            args.checkpoint_file, 'checkpoints',
            'seed-{:}-arch.pth'.format(args.random_seed))
        assert os.path.exists(
            arch_checkpoint_path
        ), 'cannot find arch_checkpoint file {:} from search phase'.format(
            arch_checkpoint_path)
        checkpoint = torch.load(arch_checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        # Build a copy of the genotypes, used only for the log message,
        # in which each selected index is replaced by the matching
        # conv_candidates entry.
        new_genotypes = []
        for _index, genotype in cell_genotypes:
            xlist = []
            for edge_genotype in genotype:
                for (node_str, select_index) in edge_genotype:
                    xlist.append((node_str, conv_candidates[select_index]))
            new_genotypes.append((_index, xlist))
        log_str = 'Obtained actual_path and cell_genotypes:\n' \
                  'Actual_path: {:}\n' \
                  'Genotype:\n'.format(actual_path)
        for _index, genotype in new_genotypes:
            log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
        logger.log(log_str, mode='info')
        # The network itself is built from the raw (index-based) genotypes.
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path,
                                         normal_network,
                                         logger,
                                         run_config,
                                         vis,
                                         out_log=True)
        # No weights are loaded in this branch (the call is disabled):
        #normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager,
                                         logger)
        logger.log(
            '=> Construct NewGumbelAutoDeeplab according to the last-arch obtained from search phase',
            mode='info')

    # perform train and validation in train() method
    retrain_run_manager.train()

    # The string literal below is a disabled test-phase branch kept as an
    # inert expression statement; it is never executed.
    '''
    else: # test phase
        checkpoint_path = os.path.join(args.resume_file, 'checkpoints', 'seed-{:}-retrain-best.pth'.format(args.random_seed))
        assert os.path.exists(checkpoint_path), 'cannot find best checkpoint {:} from the retrain phase'.format(checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
        normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        test_manager = RunManager(args.path, normal_network, logger, run_config, vis=None, out_log=True)
        display_all_families_information(args, 'retrain', test_manager, logger)
        # save testing configs
        save_configs(args.__dict__, args.path, 'test')
        test_manager.validate(epoch=None, is_test= True, use_train_mode = False)
    '''
    logger.close()
def run(self, start_ip, num_apps_to_load, apps_powered_on,
        host_port, popular_host_fraction,
        operation_work_done, operation_mix,
        operation_busy_pct, memory_sizes, memory_mix,
        mean_think_time, users_per_popular_host,
        users_per_less_popular_host,
        connection_timeout_msecs, socket_timeout_msecs,
        results_dir="./results", run_duration_secs=60,
        config_dir="./config"):
    '''
    Run a series of Rain experiments against a growing block of hosts.

    Given a starting IP, a step size, e.g.,
    1) run servers on ip addressed 11.0.0.1 - 11.0.0.200
    2) with a step size of 10 run experiments on
       11.0.0.1 - 10
       11.0.0.1 - 20, ...
       11.0.0.1 - 200

    One experiment is launched per step. Experiment ``i`` (0-based)
    targets ``(i + 1) * num_apps_to_load`` hosts. For every experiment
    three files are written: the JSON config (under ``config_dir``),
    the raw run output and the parsed results (both under
    ``results_dir``).

    Parameters:
        start_ip: base host IP as a string; also embedded in the
            generated file names.
        num_apps_to_load: step size, i.e. how many more hosts each
            successive experiment targets.
        apps_powered_on: total number of hosts available. The number of
            experiments is ``apps_powered_on // num_apps_to_load``, so a
            trailing partial step is not run.
        host_port, popular_host_fraction, mean_think_time,
        users_per_popular_host, users_per_less_popular_host,
        run_duration_secs: copied verbatim onto the test config.
        operation_work_done, operation_mix, operation_busy_pct,
        memory_sizes, memory_mix, connection_timeout_msecs,
        socket_timeout_msecs: copied verbatim onto the generator
            parameters attached to the config.
        results_dir: directory for the log/result files (created via
            ``self.create_dir``).
        config_dir: directory for the config files (created via
            ``self.create_dir``).

    Raises:
        ZeroDivisionError: if ``num_apps_to_load`` is 0.
    '''
    # Some pre-reqs:
    # 1) create the config_dir if it doesn't exist
    # 2) create the results_dir if it doesn't exist
    self.create_dir(config_dir)
    self.create_dir(results_dir)

    # Floor division keeps this an int on both Python 2 and Python 3;
    # plain "/" produces a float on Python 3 and range() rejects it.
    num_tests = apps_powered_on // num_apps_to_load
    for i in range(num_tests):
        # With a single Rain launch, load an entire block of ip's
        config = PreditableAppTestConfig()
        config.baseHostIp = start_ip
        config.numHostTargets = (i + 1) * num_apps_to_load
        config.duration = run_duration_secs
        config.hostPort = host_port
        config.popularHostFraction = popular_host_fraction
        config.usersPerPopularHost = users_per_popular_host
        config.usersPerLessPopularHost = users_per_less_popular_host
        config.meanThinkTime = mean_think_time

        # Add in the parameters for the workload generator
        # the operation mixes etc.
        generatorParams = PredictableAppGeneratorParameters()
        generatorParams.operationWorkDone = operation_work_done
        generatorParams.operationMix = operation_mix
        generatorParams.operationBusyPct = operation_busy_pct
        generatorParams.memorySizes = memory_sizes
        generatorParams.memoryMix = memory_mix
        generatorParams.connectionTimeoutMsecs = connection_timeout_msecs
        generatorParams.socketTimeoutMsecs = socket_timeout_msecs
        config.generatorParameters = generatorParams

        json_data = json.dumps(config, sort_keys=True,
                               default=PreditableAppTestConfig.to_json)

        # Write this data out to a file, then invoke the run manager
        # passing in the path to this file
        print("[PredictableAppTestRunner] json config: {0}"
              .format(json_data))

        run_classpath = ".:rain.jar:workloads/httptest.jar"
        # Every file of this experiment shares the "<ip>_<targets>" tag.
        run_tag = start_ip + "_" + str(config.numHostTargets)
        run_config_filename = config_dir + "/" + \
            "run_predictable_config_" + run_tag + "_nodes.json"
        run_output_filename = results_dir + "/" + \
            "run_predictable_log_" + run_tag + "_nodes.txt"
        run_results_filename = results_dir + "/" + \
            "run_predictable_result_" + run_tag + "_nodes.txt"

        # write the json data out to the config file
        # invoke the run manager passing the location of the config file
        # collect the results and write them out to the results_dir
        print("[PredictableAppTestRunner] Writing config file: {0}"
              .format(run_config_filename))
        # "with" guarantees the handle is closed even if the write fails.
        with open(run_config_filename, 'w') as config_file:
            config_file.write(json_data)

        run_output = RunManager.run_rain(run_config_filename,
                                         run_classpath)
        track_results = RainOutputParser.parse_output(run_output)

        # Configure the pass/fail thresholds on each track result
        for result in track_results:
            result.pct_overhead_ops_threshold = 10.0
            result.pct_failed_ops_threshold = 5.0
            # Set the desired 90th and 99th percentile thresholds for
            # the 50ms, 100ms, 200ms operations - set everything to
            # 500 ms = 0.5s. Threshold units = seconds
            # NOTE(review): the keys are spelled "Predicatable"; they
            # presumably have to match the operation names reported by
            # the workload - confirm before "fixing" the spelling.
            result.op_response_time_thresholds['PredicatableOp_50'] = \
                (0.5, 0.5)
            result.op_response_time_thresholds['PredicatableOp_100'] = \
                (0.5, 0.5)
            result.op_response_time_thresholds['PredicatableOp_200'] = \
                (0.5, 0.5)

        # Write out the run output
        print("[PredictableAppTestRunner] Writing output: {0}"
              .format(run_output_filename))
        with open(run_output_filename, 'w') as run_output_file:
            run_output_file.write(run_output)

        # Write out the run results
        print("[PredictableAppTestRunner] Writing results: {0}"
              .format(run_results_filename))
        with open(run_results_filename, 'w') as run_results_file:
            RainOutputParser.print_results(track_results,
                                           run_results_file)
            run_results_file.write("\n")
            # After writing out the table for all the tracks
            # Spit out the 90th and 99th percentiles
            for result in track_results:
                for k, v in result.op_response_times.items():
                    run_results_file.write(
                        "{0},{1},{2},{3}\n"
                        .format(result.name, k, v[0], v[1]))
def run(
    self,
    start_ip,
    num_apps_to_load,
    apps_powered_on,
    host_port,
    popular_host_fraction,
    operation_work_done,
    operation_mix,
    operation_busy_pct,
    memory_sizes,
    memory_mix,
    mean_think_time,
    users_per_popular_host,
    users_per_less_popular_host,
    connection_timeout_msecs,
    socket_timeout_msecs,
    results_dir="./results",
    run_duration_secs=60,
    config_dir="./config",
):
    """
    Run a series of Rain experiments against a growing block of hosts.

    Given a starting IP, a step size, e.g.,
    1) run servers on ip addressed 11.0.0.1 - 11.0.0.200
    2) with a step size of 10 run experiments on
       11.0.0.1 - 10
       11.0.0.1 - 20, ...
       11.0.0.1 - 200

    Experiment ``i`` (0-based) targets ``(i + 1) * num_apps_to_load``
    hosts, and ``apps_powered_on // num_apps_to_load`` experiments are
    run in total (a trailing partial step is skipped). Each experiment
    writes its JSON config into ``config_dir`` and its raw output and
    parsed results into ``results_dir``; both directories are created
    through ``self.create_dir``.

    ``start_ip`` must be a string because it is concatenated into the
    generated file names. The remaining positional arguments and
    ``run_duration_secs`` are copied unchanged onto the test config or
    onto its generator parameters.

    Raises:
        ZeroDivisionError: if ``num_apps_to_load`` is 0.
    """
    # Some pre-reqs:
    # 1) create the config_dir if it doesn't exist
    # 2) create the results_dir if it doesn't exist
    self.create_dir(config_dir)
    self.create_dir(results_dir)

    # "//" stays an int under Python 3 as well; "/" would hand range()
    # a float there and raise TypeError.
    num_tests = apps_powered_on // num_apps_to_load
    for i in range(num_tests):
        # With a single Rain launch, load an entire block of ip's
        config = PreditableAppTestConfig()
        config.baseHostIp = start_ip
        config.numHostTargets = (i + 1) * num_apps_to_load
        config.duration = run_duration_secs
        config.hostPort = host_port
        config.popularHostFraction = popular_host_fraction
        config.usersPerPopularHost = users_per_popular_host
        config.usersPerLessPopularHost = users_per_less_popular_host
        config.meanThinkTime = mean_think_time

        # Add in the parameters for the workload generator
        # the operation mixes etc.
        generatorParams = PredictableAppGeneratorParameters()
        generatorParams.operationWorkDone = operation_work_done
        generatorParams.operationMix = operation_mix
        generatorParams.operationBusyPct = operation_busy_pct
        generatorParams.memorySizes = memory_sizes
        generatorParams.memoryMix = memory_mix
        generatorParams.connectionTimeoutMsecs = connection_timeout_msecs
        generatorParams.socketTimeoutMsecs = socket_timeout_msecs
        config.generatorParameters = generatorParams

        json_data = json.dumps(
            config, sort_keys=True, default=PreditableAppTestConfig.to_json
        )

        # Write this data out to a file, then invoke the run manager
        # passing in the path to this file
        print("[PredictableAppTestRunner] json config: {0}".format(json_data))

        run_classpath = ".:rain.jar:workloads/httptest.jar"
        # Common "<ip>_<targets>_nodes" suffix of this experiment's files.
        suffix = start_ip + "_" + str(config.numHostTargets) + "_nodes"
        run_config_filename = (
            config_dir + "/" + "run_predictable_config_" + suffix + ".json"
        )
        run_output_filename = (
            results_dir + "/" + "run_predictable_log_" + suffix + ".txt"
        )
        run_results_filename = (
            results_dir + "/" + "run_predictable_result_" + suffix + ".txt"
        )

        # write the json data out to the config file
        # invoke the run manager passing the location of the config file
        # collect the results and write them out to the results_dir
        print(
            "[PredictableAppTestRunner] Writing config file: {0}".format(
                run_config_filename
            )
        )
        # Context managers close the files even when a write raises.
        with open(run_config_filename, "w") as config_file:
            config_file.write(json_data)

        run_output = RunManager.run_rain(run_config_filename, run_classpath)
        track_results = RainOutputParser.parse_output(run_output)

        # Configure the pass/fail thresholds on each track result
        for result in track_results:
            result.pct_overhead_ops_threshold = 10.0
            result.pct_failed_ops_threshold = 5.0
            # Set the desired 90th and 99th percentile thresholds for
            # the 50ms, 100ms, 200ms operations - set everything to
            # 500 ms = 0.5s. Threshold units = seconds
            # NOTE(review): keys are spelled "Predicatable"; presumably
            # they must match the workload's operation names - confirm
            # before changing the spelling.
            for op_name in (
                "PredicatableOp_50",
                "PredicatableOp_100",
                "PredicatableOp_200",
            ):
                result.op_response_time_thresholds[op_name] = (0.5, 0.5)

        # Write out the run output
        print(
            "[PredictableAppTestRunner] Writing output: {0}".format(
                run_output_filename
            )
        )
        with open(run_output_filename, "w") as run_output_file:
            run_output_file.write(run_output)

        # Write out the run results
        print(
            "[PredictableAppTestRunner] Writing results: {0}".format(
                run_results_filename
            )
        )
        with open(run_results_filename, "w") as run_results_file:
            RainOutputParser.print_results(track_results, run_results_file)
            run_results_file.write("\n")
            # After writing out the table for all the tracks
            # Spit out the 90th and 99th percentiles
            for result in track_results:
                for k, v in result.op_response_times.items():
                    run_results_file.write(
                        "{0},{1},{2},{3}\n".format(result.name, k, v[0], v[1])
                    )
def train(model, train_set, dev_set, test_set, hyper_params, batch_size, device):
    """Fit ``model`` on ``train_set`` and then score it on all three splits.

    The model is optimised with AdamW (learning rate taken from
    ``hyper_params.learning_rate``) for ``hyper_params.num_epoch`` epochs
    using binary cross-entropy on logits. When the model returns a
    non-``None`` second output, that output is used for the loss instead
    of the first one. Per-run bookkeeping goes through a ``RunManager``,
    whose records are saved under ``../results/``. Afterwards the train,
    dev and test splits are each passed through ``evaluate`` and
    ``compute_scores``.
    """

    def make_loader(split):
        # Every loader in this function is configured the same way.
        return DataLoader(split, batch_size=batch_size, shuffle=True,
                          num_workers=1)

    train_loader = make_loader(train_set)
    tracker = RunManager()
    optimizer = optim.AdamW(model.parameters(),
                            lr=hyper_params.learning_rate)

    logging.info("Training Started...")
    tracker.begin_run(hyper_params, model, train_loader)
    for epoch_number in range(1, hyper_params.num_epoch + 1):
        tracker.begin_epoch(epoch_number)
        model.train()
        for batch in train_loader:
            texts = batch['text'].to(device)
            lens = batch['length']  # looked up but not used below
            targets = batch['codes'].to(device)

            outputs, ldam_outputs, _ = model(texts, targets)
            # Prefer the second output for the loss whenever it exists.
            logits = outputs if ldam_outputs is None else ldam_outputs
            loss = F.binary_cross_entropy_with_logits(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            tracker.track_loss(loss)
            # m.track_num_correct(preds, affinities)
        tracker.end_epoch()
    tracker.end_run()

    # Encode every hyper-parameter as "<name>_<value>" in the file name.
    hype = '_'.join(
        f'{name}_{value}' for name, value in hyper_params._asdict().items()
    )
    tracker.save(f'../results/train_results_{hype}')
    logging.info("Training finished.\n")

    # Training
    train_loader = make_loader(train_set)
    probabs, targets, _, _ = evaluate(model, train_loader, device,
                                      dtset='train')
    compute_scores(probabs, targets, hyper_params, dtset='train')

    # Validation
    dev_loader = make_loader(dev_set)
    probabs, targets, _, _ = evaluate(model, dev_loader, device,
                                      dtset='dev')
    compute_scores(probabs, targets, hyper_params, dtset='dev')

    # test_dataset
    test_loader = make_loader(test_set)
    probabs, targets, full_hadm_ids, full_attn_weights = evaluate(
        model, test_loader, device, dtset='test')
    compute_scores(probabs, targets, hyper_params, dtset='test',
                   full_hadm_ids=full_hadm_ids,
                   full_attn_weights=full_attn_weights)
# else: # print (run_config.init_lr) # build net from args from search.models.normal_nets import * net_config_url = "https://hanlab.mit.edu/files/proxylessNAS/proxyless_cifar.config" # net_config_url2 = 'https://hanlab.mit.edu/files/proxylessNAS/proxyless_cpu.config' # print(net_config_url) net = pyramid_base(net_config=net_config_url, n_classes=run_config.data_provider.n_classes, bn_param=(args.bn_momentum, args.bn_eps), dropout_rate=args.dropout) # net2 = proxyless_base(net_config=net_config_url2, n_classes=run_config.data_provider.n_classes, # bn_param=(args.bn_momentum, args.bn_eps), dropout_rate=args.dropout) # # build run manager run_manager = RunManager(args.path, net, run_config, measure_latency=args.latency) # run_manager.save_config(print_info=True) # load checkpoints init_path = '%s/init' % args.path if args.resume: run_manager.load_model() if args.train and run_manager.best_acc == 0: loss, acc1, acc5 = run_manager.validate(ist=False, return_top5=True) run_manager.best_acc = acc1 elif os.path.isfile(init_path): if torch.cuda.is_available(): checkpoint = torch.load(init_path) else: checkpoint = torch.load(init_path, map_location='cpu')
def main(args):
    """Set up and launch the search phase for a ``CounterMBConvNet``.

    Steps, in order: require CUDA and pin cuDNN to deterministic mode,
    seed the RNGs, create a time-stamped experiment directory under
    ``args.path``, derive the optimizer / scheduler / criterion
    parameter dicts from ``args``, build and save the run and
    architecture-search configs, create the logger (and optionally the
    visdom client), then train the network through ``RunManager``.

    ``args`` is mutated in place: ``path``, ``optimizer_config``,
    ``conv_candidates``, ``arch_optimizer_params`` and
    ``reg_loss_params`` are (re)assigned.
    """
    assert torch.cuda.is_available(), 'CUDA is not available'
    cudnn = torch.backends.cudnn
    cudnn.enabled = True
    cudnn.benchmark = False
    cudnn.deterministic = True
    torch.set_num_threads(args.workers)
    set_manual_seed(args.random_seed)
    # print_experiment_environment()

    # Each run gets its own <path>/<exp_name>/<timestamp> directory.
    timestamp = time_for_file()
    args.path = os.path.join(args.path, args.exp_name, timestamp)
    os.makedirs(args.path, exist_ok=True)
    create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # weight optimizer config, related to network_weight_optimizer,
    # scheduler, and criterion. Unknown types fall through to None.
    optimizer_kwargs = None
    if args.weight_optimizer_type == 'SGD':
        optimizer_kwargs = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        optimizer_kwargs = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }

    scheduler_kwargs = None
    if args.scheduler == 'cosine':
        scheduler_kwargs = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_kwargs = {
            'milestones': args.milestones,
            'gammas': args.gammas,
        }
    elif args.scheduler == 'exponential':
        scheduler_kwargs = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_kwargs = {'min_lr': args.min_lr}

    criterion_kwargs = (
        {'label_smooth': args.label_smoothing}
        if args.criterion == 'SmoothSoftmax'
        else None
    )

    # weight_optimizer_config, used in run_manager to get
    # weight_optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': optimizer_kwargs,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_kwargs,
        'criterion': args.criterion,
        'criterion_params': criterion_kwargs,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }

    # TODO need modification, not need in counter_network
    args.conv_candidates = [
        '3x3_MBConv3', '3x3_MBConv6',
        '5x5_MBConv3', '5x5_MBConv6',
        '7x7_MBConv3', '7x7_MBConv6',
        'Zero',
        # 'Identity'
    ]

    # Built before the arch-search fields below are attached to args,
    # so it only sees the attributes set up to this point.
    run_config = RunConfig(**args.__dict__)

    # arch_optimizer_config
    args.arch_optimizer_params = (
        {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps,
        }
        if args.arch_optimizer_type == 'adam'
        else None
    )

    # related to hardware constraint
    # TODO: get rid of
    if args.reg_loss_type == 'add#linear':
        reg_loss_kwargs = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        reg_loss_kwargs = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta,
        }
    else:
        reg_loss_kwargs = None
    args.reg_loss_params = reg_loss_kwargs

    arch_search_config = ArchSearchConfig(**args.__dict__)

    # perform config save, for run_configs and arch_search_configs
    save_configs(run_config.config, arch_search_config.config,
                 args.path, 'search')

    logger = prepare_logger(args)
    vis = None
    if args.open_vis:
        vis = visdomer(args.port, args.server, args.exp_name,
                       args.compare_phase, args.elements, init_params=None)

    # Previously used super network, kept for reference:
    # super_network = GumbelAutoDeepLab(
    #     args.filter_multiplier, args.block_multiplier, args.steps,
    #     args.nb_classes, args.nb_layers, args.bn_momentum, args.bn_eps,
    #     args.conv_candidates, logger
    # )
    super_network = CounterMBConvNet(2, search_space=args.search_space)
    train_manager = RunManager(args.path, super_network, logger,
                               run_config, vis=vis, out_log=True)

    # train search phase
    train_manager.train()
    logger.close()