Example #1
def main(num_epoch):
    system_init()

    # load data
    dataset = TrafficDataset(path=cfg.data.path,
                             train_prop=cfg.data.train_prop,
                             valid_prop=cfg.data.valid_prop,
                             num_sensors=cfg.data.num_sensors,
                             in_length=cfg.data.in_length,
                             out_length=cfg.data.out_length,
                             batch_size_per_gpu=cfg.data.batch_size_per_gpu,
                             num_gpus=1)

    net = AutoSTG(in_length=cfg.data.in_length,
                  out_length=cfg.data.out_length,
                  node_hiddens=[
                      dataset.node_fts.shape[1],
                  ] + cfg.model.node_hiddens,
                  edge_hiddens=[
                      dataset.adj_mats.shape[2],
                  ] + cfg.model.edge_hiddens,
                  in_channels=cfg.data.in_channels,
                  out_channels=cfg.data.out_channels,
                  hidden_channels=cfg.model.hidden_channels,
                  skip_channels=cfg.model.skip_channels,
                  end_channels=cfg.model.end_channels,
                  layer_names=cfg.model.layer_names,
                  num_mixed_ops=cfg.model.num_mixed_ops,
                  candidate_op_profiles=cfg.model.candidate_op_profiles)

    run_manager = RunManager(
        name=cfg.model.name,
        net=net,
        dataset=dataset,
        arch_lr=cfg.trainer.arch_lr,
        arch_lr_decay_milestones=cfg.trainer.arch_lr_decay_milestones,
        arch_lr_decay_ratio=cfg.trainer.arch_lr_decay_ratio,
        arch_decay=cfg.trainer.arch_decay,
        arch_clip_gradient=cfg.trainer.arch_clip_gradient,
        weight_lr=cfg.trainer.weight_lr,
        weight_lr_decay_milestones=[
            20, 40, 60, 80
        ],  # cfg.trainer.weight_lr_decay_milestones,
        weight_lr_decay_ratio=cfg.trainer.weight_lr_decay_ratio,
        weight_decay=cfg.trainer.weight_decay,
        weight_clip_gradient=cfg.trainer.weight_clip_gradient,
        num_search_iterations=cfg.trainer.num_search_iterations,
        num_search_arch_samples=cfg.trainer.num_search_arch_samples,
        num_train_iterations=cfg.trainer.num_train_iterations,
        criterion=cfg.trainer.criterion,
        metric_names=cfg.trainer.metric_names,
        metric_indexes=cfg.trainer.metric_indexes,
        print_frequency=cfg.trainer.print_frequency,
        device_ids=[0])

    run_manager.load(mode='train')
    run_manager.clear_records()
    run_manager.initialize()
    print('# of params', run_manager._net.num_weight_parameters())
    run_manager.train(num_epoch)
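
A minimal command-line wrapper for this entry point might look like the sketch below; the argparse flag name and the __main__ guard are assumptions for illustration, not part of the original script.

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wrapper; '--num-epoch' is an assumed flag name.
    parser = argparse.ArgumentParser(description='Train AutoSTG via RunManager')
    parser.add_argument('--num-epoch', type=int, default=100,
                        help='number of training epochs passed to main()')
    cli_args = parser.parse_args()
    main(cli_args.num_epoch)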
Example #2
    def run(self, hostlist_fname, popular_host_fraction,
            mean_think_time, users_per_popular_host,
            users_per_less_popular_host,
            connection_timeout_msecs, socket_timeout_msecs,
            results_dir="./results", run_duration_secs=60,
            config_dir="./config", pipe_port=7851):

        # Some pre-reqs:
        # 1) create the config_dir if it doesn't exist
        # 2) create the results_dir if it doesn't exist
        self.create_dir(config_dir)
        self.create_dir(results_dir)

        num_tests = 1
        for i in range(num_tests):
            # With a single Rain launch, load an entire block of ip's
            config = FixedUrlTestConfig()
            config.hostListFile = hostlist_fname
            config.duration = run_duration_secs
            config.popularHostFraction = popular_host_fraction
            config.usersPerPopularHost = users_per_popular_host
            config.usersPerLessPopularHost = users_per_less_popular_host
            config.meanThinkTime = mean_think_time
            config.pipePort = pipe_port
            # Add in the parameters for the workload generator
            # the operation mixes etc.
            generatorParams = FixedUrlGeneratorParameters()
            generatorParams.connectionTimeoutMsecs = connection_timeout_msecs
            generatorParams.socketTimeoutMsecs = socket_timeout_msecs
            config.generatorParameters = generatorParams

            json_data = json.dumps(config, sort_keys=True,
                                   default=FixedUrlTestConfig.to_json)
            # Write this data out to a file, then invoke the run manager,
            # passing in the path to this file

            print( "[FixedUrlTestRunner] json config: {0}"\
                       .format(json_data) )

            run_classpath = ".:rain.jar:workloads/httptest.jar"
            run_config_filename = config_dir + "/" + \
                "run_fixed_url_config" + "_nodes.json"
            run_output_filename = results_dir + "/" + \
                "run_fixed_url_log" + "_nodes.txt"
            run_results_filename = results_dir + "/" + \
                "run_fixed_url_result" + "_nodes.txt"

            # write the json data out to the config file
            # invoke the run manager passing the location of the config file
            # collect the results and write them out to the results_dir

            print("[FixedUrlTestRunner] Writing config file: {0}"
                  .format(run_config_filename))
            config_file = open(run_config_filename, 'w')
            config_file.write(json_data)
            config_file.flush()
            config_file.close()
            run_output = RunManager.run_rain( run_config_filename,\
                                               run_classpath )
            #print run_output
            track_results = RainOutputParser.parse_output(run_output)
            # Validate each of the track_results instances

            for result in track_results:
                # Set some 90th and 99th pctile thresholds
                result.pct_overhead_ops_threshold = 10.0
                result.pct_failed_ops_threshold = 5.0
                # Set the desired 90th and 99th percentile thresholds for
                # the 50ms, 100ms, 200ms operations - set everything to
                # 500 ms = 0.5s. Threshold units = seconds
                result.op_response_time_thresholds['FixedUrl'] = (0.5, 0.5)

            # Write out the run output
            print("[FixedUrlTestRunner] Writing output: {0}"
                  .format(run_output_filename))
            run_output_file = open(run_output_filename, 'w')
            run_output_file.write(run_output)
            run_output_file.flush()
            run_output_file.close()

            # Write out the run results
            print("[FixedUrlTestRunner] Writing results: {0}"
                  .format(run_results_filename))
            run_results_file = open(run_results_filename, 'w')
            RainOutputParser.print_results(track_results, run_results_file)

            run_results_file.write("\n")
            # After writing out the table for all the tracks
            # Spit out the 90th and 99th percentiles
            for result in track_results:
                for k, v in result.op_response_times.items():
                    run_results_file.write( "{0},{1},{2},{3}\n"\
                               .format(result.name, k, v[0], v[1]) )

            run_results_file.flush()
            run_results_file.close()
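
The runners above call self.create_dir(...) before writing any files; a plausible implementation, assuming it is only a thin wrapper around os.makedirs, could be:

import os

class RunnerDirMixin(object):
    def create_dir(self, path):
        # Hypothetical helper: create the directory only if it is missing.
        if not os.path.isdir(path):
            os.makedirs(path)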
Example #3
    def step_run(self, start_ip, num_apps_to_load, apps_powered_on,
                 results_dir="./results", run_duration_secs=60,
                 config_dir="./config"):
        '''
        Given a starting IP and a step size, e.g.:
        1) run servers on IP addresses 11.0.0.1 - 11.0.0.200
        2) with a step size of 10, run experiments on 11.0.0.1 - 10,
           11.0.0.1 - 20, ... 11.0.0.1 - 200
        '''

        # Some pre-reqs:
        # 1) create the config_dir if it does not exist
        # 2) create the results_dir if it does not exist
        self.create_dir(config_dir)
        self.create_dir(results_dir)

        num_tests = apps_powered_on // num_apps_to_load
        for i in range(num_tests):
            # with one Rain launch we can load an entire block of ip's
            # using the track feature
            #ip_address_parts = start_ip.split( "." )
            #print len(ip_address_parts)
            # throw exception if we don't find a numeric ip v4 address
            #if len(ip_address_parts) != 4:
            #    raise Exception( "Expected a numeric IPv4 address"\
            #                         + " (format N.N.N.N)" )
            #lastOctet = int( ip_address_parts[3] )
            #base_ip = "{0}.{1}.{2}.{3}".format( ip_address_parts[0],\
            #                        ip_address_parts[1],\
            #                        ip_address_parts[2],\
            #                        str(lastOctet+(num_apps_to_load*i)))

            # Create config objects to write out as files
            base_ip = start_ip
            config = HttpTestConfig()
            config.baseHostIp = base_ip
            config.numHostTargets = (i + 1) * num_apps_to_load
            config.duration = run_duration_secs

            json_data = json.dumps(config, sort_keys=True,
                                   default=HttpTestConfig.to_json)
            # Write this data out to a file, then invoke the run manager,
            # passing in the path to this file
            print("[HttpTestStepRunner] json config: {0}".format(json_data))

            run_classpath = ".:rain.jar:workloads/httptest.jar"
            run_config_filename = config_dir + "/" + \
                "run_config_" + base_ip + "_" + \
                str(config.numHostTargets) + "_nodes.json"
            run_output_filename = results_dir + "/" + \
                "run_log_" + base_ip + "_" + \
                str(config.numHostTargets) + "_nodes.txt"
            run_results_filename = results_dir + "/" + \
                "run_result_" + base_ip + "_" + \
                str(config.numHostTargets) + "_nodes.txt"

            # write the json data out to the config file
            # invoke the run manager passing the location of the config file
            # collect the results and write them out to the results_dir
            print("[HttpTestStepRunner] Writing config file: {0}"
                  .format(run_config_filename))
            config_file = open(run_config_filename, 'w')
            config_file.write(json_data)
            config_file.flush()
            config_file.close()
            run_output = RunManager.run_rain( run_config_filename,\
                                               run_classpath )
            #print run_output
            track_results = RainOutputParser.parse_output(run_output)

            # Write out the run output
            print("[HttpTestStepRunner] Writing output: {0}"
                  .format(run_output_filename))
            run_output_file = open(run_output_filename, 'w')
            run_output_file.write(run_output)
            run_output_file.flush()
            run_output_file.close()

            # Write out the run results
            print("[HttpTestStepRunner] Writing results: {0}"
                  .format(run_results_filename))
            run_results_file = open(run_results_filename, 'w')
            RainOutputParser.print_results(track_results, run_results_file)
            run_results_file.flush()
            run_results_file.close()
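
RunManager.run_rain(...) is not shown in these snippets; a rough sketch, assuming it shells out to the Rain driver on the given classpath and returns the captured output for RainOutputParser, might be (the radlab.rain.Benchmark entry point is an assumption):

import subprocess

class RunManager(object):
    @staticmethod
    def run_rain(config_filename, classpath):
        # Hypothetical sketch: launch Rain as a Java process and return its
        # combined stdout/stderr as text for later parsing.
        cmd = ['java', '-cp', classpath, 'radlab.rain.Benchmark', config_filename]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        output, _ = proc.communicate()
        return output.decode('utf-8', errors='replace')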
Example #5
                     network=['vae64'],
                     shuffle=[True],
                     num_workers=[5],
                     loss='vae',
                     model='vae64_1')

train_set = SLFDatasetUnsampled(root_dir=os.path.join(ROOT, 'slf_mat'),
                                csv_file=os.path.join(ROOT, 'details.csv'),
                                total_data=500000)

validation_set = SLFDatasetUnsampled(
    root_dir=os.path.join(VALIDATION_SET_PATH, 'slf_mat'),
    csv_file=os.path.join(VALIDATION_SET_PATH, 'details.csv'),
    total_data=5000)

m = RunManager()
vae_loss = VAELoss()
mse_loss = torch.nn.MSELoss()
for run in RunBuilder.get_runs(params):
    device = torch.device(run.device)
    network = networks[run.network]().to(run.device)
    loader = torch.utils.data.DataLoader(train_set,
                                         batch_size=run.batch_size,
                                         shuffle=run.shuffle,
                                         num_workers=run.num_workers)
    validation_loader = torch.utils.data.DataLoader(
        validation_set,
        batch_size=run.batch_size,
        shuffle=run.shuffle,
        num_workers=run.num_workers)
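
RunBuilder.get_runs(params) is not included in this snippet; a common minimal version, assuming params is an OrderedDict mapping hyperparameter names to lists of candidate values, expands the cartesian product into named tuples:

from collections import namedtuple
from itertools import product

class RunBuilder:
    @staticmethod
    def get_runs(params):
        # One named tuple per combination of hyperparameter values,
        # e.g. Run(network='vae64', shuffle=True, num_workers=5, ...).
        Run = namedtuple('Run', params.keys())
        return [Run(*values) for values in product(*params.values())]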
Example #6
            net_origin = nn.DataParallel(ResNet_ImageNet(depth=50, num_classes=run_config.data_provider.n_classes))
    elif args.model=="mobilenetv2":
        assert args.dataset=='imagenet', 'mobilenetv2 only supports imagenet dataset'
        net = MobileNetV2(num_classes=run_config.data_provider.n_classes, cfg=eval(args.cfg))
        if args.base_path!=None:
            weight_path = args.base_path+'/checkpoint/model_best.pth.tar'
            net_origin = nn.DataParallel(MobileNetV2(num_classes=run_config.data_provider.n_classes))
    elif args.model=="mobilenet":
        assert args.dataset=='imagenet', 'mobilenet only supports imagenet dataset'
        net = MobileNet(num_classes=run_config.data_provider.n_classes, cfg=eval(args.cfg))
        if args.base_path!=None:
            weight_path = args.base_path+'/checkpoint/model_best.pth.tar'
            net_origin = nn.DataParallel(MobileNet(num_classes=run_config.data_provider.n_classes))

    # build run manager
    run_manager = RunManager(args.path, net, run_config)
    if args.local_rank == 0:
        run_manager.save_config(print_info=True)

    # load checkpoints
    if args.base_path!=None:
        weight_path = args.base_path+'/checkpoint/model_best.pth.tar'
    if args.resume:
        run_manager.load_model()
        if args.train and run_manager.best_acc == 0:
            loss, acc1, acc5 = run_manager.validate(is_test=True, return_top5=True)
            run_manager.best_acc = acc1
    elif weight_path!=None and os.path.isfile(weight_path):
        assert net_origin != None, "original network is None"
        net_origin.load_state_dict(torch.load(weight_path)['state_dict'])
        net_origin = net_origin.module
Example #7
        # build net from args
        if 'proxyless' in args.net:
            from models.normal_nets.proxyless_nets import proxyless_base
            net_config_url = 'https://hanlab.mit.edu/files/proxylessNAS/%s.config' % args.net
            net = proxyless_base(
                net_config=net_config_url,
                n_classes=run_config.data_provider.n_classes,
                bn_param=(args.bn_momentum, args.bn_eps),
                dropout_rate=args.dropout,
            )
        else:
            raise ValueError('do not support: %s' % args.net)

    # build run manager
    run_manager = RunManager(args.path,
                             net,
                             run_config,
                             measure_latency=args.latency)
    run_manager.save_config(print_info=True)

    # load checkpoints
    best_model_path = '%s/checkpoint/model_best.pth.tar' % args.path
    if os.path.isfile(best_model_path):
        init_path = best_model_path
    else:
        init_path = '%s/init' % args.path

    if args.resume:
        run_manager.load_model()
        if args.train and run_manager.best_acc == 0:
            loss, acc1, acc5 = run_manager.validate(is_test=False,
                                                    return_top5=True)
Example #8
def main(args, myargs):

    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed_all(args.manual_seed)
    np.random.seed(args.manual_seed)

    # os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    os.makedirs(args.path, exist_ok=True)

    # prepare run config
    run_config_path = '%s/run.config' % args.path
    if os.path.isfile(run_config_path):
        # load run config from file
        run_config = json.load(open(run_config_path, 'r'))
        run_config = ImagenetRunConfig(**run_config)
        if args.valid_size:
            run_config.valid_size = args.valid_size
    else:
        # build run config from args
        args.lr_schedule_param = None
        args.opt_param = {
            'momentum': args.momentum,
            'nesterov': not args.no_nesterov,
        }
        if args.no_decay_keys == 'None':
            args.no_decay_keys = None
        run_config = ImagenetRunConfig(**args.__dict__)
    print('Run config:')
    for k, v in run_config.config.items():
        print('\t%s: %s' % (k, v))

    # prepare network
    net_config_path = '%s/net.config' % args.path
    if os.path.isfile(net_config_path):
        # load net from file
        from models import get_net_by_name
        net_config = json.load(open(net_config_path, 'r'))
        net = get_net_by_name(net_config['name']).build_from_config(net_config)
    else:
        # build net from args
        if 'proxyless' in args.net:
            from models.normal_nets.proxyless_nets import proxyless_base
            net_config_url = 'https://hanlab.mit.edu/files/proxylessNAS/%s.config' % args.net
            net = proxyless_base(
                net_config=net_config_url,
                n_classes=run_config.data_provider.n_classes,
                bn_param=(args.bn_momentum, args.bn_eps),
                dropout_rate=args.dropout,
            )
        else:
            raise ValueError('do not support: %s' % args.net)

    # build run manager
    run_manager = RunManager(args.path,
                             net,
                             run_config,
                             measure_latency=args.latency)
    run_manager.save_config(print_info=True)

    # load checkpoints
    init_path = '%s/init' % args.path
    if args.resume:
        run_manager.load_model()
        if args.train and run_manager.best_acc == 0:
            loss, acc1, acc5 = run_manager.validate(is_test=False,
                                                    return_top5=True)
            run_manager.best_acc = acc1
    elif os.path.isfile(init_path):
        if torch.cuda.is_available():
            checkpoint = torch.load(init_path)
        else:
            checkpoint = torch.load(init_path, map_location='cpu')
        if 'state_dict' in checkpoint:
            checkpoint = checkpoint['state_dict']
        run_manager.net.module.load_state_dict(checkpoint)
    elif 'proxyless' in args.net and not args.train:
        from utils.latency_estimator import download_url
        pretrained_weight_url = 'https://hanlab.mit.edu/files/proxylessNAS/%s.pth' % args.net
        print('Load pretrained weights from %s' % pretrained_weight_url)
        init_path = download_url(pretrained_weight_url)
        init = torch.load(init_path, map_location='cpu')
        net.load_state_dict(init['state_dict'])
    else:
        print('Random initialization')

    # train
    if args.train:
        print('Start training')
        run_manager.train(print_top5=True)
        run_manager.save_model()

    output_dict = {}
    # validate
    if run_config.valid_size:
        print('Test on validation set')
        loss, acc1, acc5 = run_manager.validate(is_test=False,
                                                return_top5=True)
        log = 'valid_loss: %f\t valid_acc1: %f\t valid_acc5: %f' % (loss, acc1,
                                                                    acc5)
        run_manager.write_log(log, prefix='valid')
        output_dict = {
            **output_dict, 'valid_loss': '%f' % loss,
            'valid_acc1': '%f' % acc1,
            'valid_acc5': '%f' % acc5,
            'valid_size': run_config.valid_size
        }

    # test
    print('Test on test set')
    loss, acc1, acc5 = run_manager.validate(is_test=True, return_top5=True)
    log = 'test_loss: %f\t test_acc1: %f\t test_acc5: %f' % (loss, acc1, acc5)
    run_manager.write_log(log, prefix='test')
    output_dict = {
        **output_dict, 'test_loss': '%f' % loss,
        'test_acc1': '%f' % acc1,
        'test_acc5': '%f' % acc5
    }
    json.dump(output_dict, open('%s/output' % args.path, 'w'), indent=4)
Example #9
    elif args.model == "resnet50":
        assert args.dataset == 'imagenet', 'resnet50 only supports imagenet dataset'
        net = ResNet_ImageNet(depth=50,
                              num_classes=run_config.data_provider.n_classes,
                              cfg=eval(args.cfg))
    elif args.model == "mobilenetv2":
        assert args.dataset == 'imagenet', 'mobilenetv2 only supports imagenet dataset'
        net = MobileNetV2(num_classes=run_config.data_provider.n_classes,
                          cfg=eval(args.cfg))
    elif args.model == "mobilenet":
        assert args.dataset == 'imagenet', 'mobilenet only supports imagenet dataset'
        net = MobileNet(num_classes=run_config.data_provider.n_classes,
                        cfg=eval(args.cfg))

    # build run manager
    run_manager = RunManager(args.path, net, run_config)

    # load checkpoints
    best_model_path = '%s/checkpoint/model_best.pth.tar' % args.path
    assert os.path.isfile(best_model_path), 'wrong path'
    if torch.cuda.is_available():
        checkpoint = torch.load(best_model_path)
    else:
        checkpoint = torch.load(best_model_path, map_location='cpu')
    if 'state_dict' in checkpoint:
        checkpoint = checkpoint['state_dict']
    run_manager.net.load_state_dict(checkpoint)

    output_dict = {}

    # test
Example #11
import flags
from run_manager import RunManager
from character import Character
import level

#################################
flags.run_manager = RunManager()#
flags.character = Character()   #
#################################

level.load_level(flags.run_manager.display_manager.map)
flags.run_manager.start_game()
Example #12
        net = ResNet_CIFAR(cfg=None, depth=56, num_classes=10, cutout=False)
        weight_path = 'Exp_base/resnet56_base/checkpoint/model_best.pth.tar'
        net_origin = nn.DataParallel(
            ResNet_CIFAR(depth=56, num_classes=10, cutout=False))
    elif args.model == "resnet110":
        assert args.dataset == 'cifar10', 'resnet110 only supports cifar10 dataset'
        net = ResNet_CIFAR(cfg=None, depth=110, num_classes=10, cutout=False)
        weight_path = 'Exp_base/resnet110_base/checkpoint/model_best.pth.tar'
        net_origin = nn.DataParallel(
            ResNet_CIFAR(depth=110, num_classes=10, cutout=False))
    net_origin.load_state_dict(torch.load(weight_path)['state_dict'])
    net_origin = net_origin.module
    base_flops = net.cfg2flops(net.config['cfg_base'])

    # build run manager
    run_manager = RunManager(args.path, net, run_config)
    if args.local_rank == 0:
        run_manager.save_config(print_info=True)

    # cfg to fitness
    cfg2fit_dict = {}

    def cfg2fitness(cfg):
        if args.local_rank == 0:
            print(str(cfg))
        if str(cfg) in cfg2fit_dict.keys():
            return cfg2fit_dict[str(cfg)]
        elif cfg == run_manager.net.module.config['cfg_base']:
            return 0.
        else:
            run_manager.run_config.n_epochs = run_manager.run_config.search_epoch
Example #13
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume:
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the last retrain phase'.format(
            config_file_path)
        f = open(config_file_path, 'r')
        config_dict = json.load(f)
        f.close()
        configs_resume(args, config_dict, 'retrain')
        # get EXP_time in last_retrain for flag
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(EXP_time_last_retrain))
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log(
            '=> loading configs {:} from the last retrain phase.'.format(
                config_file_path),
            mode='info')
        if args.search_space == 'autodeeplab':
            conv_candidates = autodeeplab
        elif args.search_space == 'proxyless':
            conv_candidates = proxyless
        elif args.search_space == 'my_search_space':
            conv_candidates = my_search_space
        else:
            raise ValueError('search space {:} is not supported'.format(
                args.search_space))
    else:
        # resume partial configs setting and arch_checkpoint from the search phase by default.
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the search phase'.format(
            config_file_path)
        f = open(config_file_path, 'r')
        config_dict = json.load(f)
        f.close()
        args.random_seed = config_dict['random_seed']
        # get EXP_time in search phase, for flag
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(EXP_time_search))
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log(
            '=> starting retrain from the search phase config {:}.'.format(
                config_file_path),
            mode='info')

        # optimizer params
        if args.weight_optimizer_type == 'SGD':
            weight_optimizer_params = {
                'momentum': args.momentum,
                'nesterov': args.nesterov,
                'weight_decay': args.weight_decay,
            }
        elif args.weight_optimizer_type == 'RMSprop':
            weight_optimizer_params = {
                'momentum': args.momentum,
                'weight_decay': args.weight_decay,
            }
        else:
            weight_optimizer_params = None
        # scheduler params
        if args.scheduler == 'cosine':
            scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
        elif args.scheduler == 'multistep':
            scheduler_params = {
                'milestones': args.milestones,
                'gammas': args.gammas
            }
        elif args.scheduler == 'exponential':
            scheduler_params = {'gamma': args.gamma}
        elif args.scheduler == 'linear':
            scheduler_params = {'min_lr': args.min_lr}
        else:
            scheduler_params = None
        # criterion params
        if args.criterion == 'SmoothSoftmax':
            criterion_params = {'label_smooth': args.label_smoothing}
        else:
            criterion_params = None

        args.optimizer_config = {
            'optimizer_type': args.weight_optimizer_type,
            'optimizer_params': weight_optimizer_params,
            'scheduler': args.scheduler,
            'scheduler_params': scheduler_params,
            'criterion': args.criterion,
            'criterion_params': criterion_params,
            'init_lr': args.init_lr,
            'epochs': args.epochs,
            'class_num': args.nb_classes,
        }
        if args.search_space == 'autodeeplab':
            conv_candidates = autodeeplab
        elif args.search_space == 'proxyless':
            conv_candidates = proxyless
        elif args.search_space == 'counter':
            conv_candidates = counter
        elif args.search_space == 'my_search_space':
            conv_candidates = my_search_space
        else:
            raise ValueError('search_space : {:} is not supported'.format(
                args.search_space))

        # related to entropy constraint loss
        if args.reg_loss_type == 'add#linear':
            args.reg_loss_params = {'lambda': args.reg_loss_lambda}
        elif args.reg_loss_type == 'mul#log':
            args.reg_loss_params = {
                'alpha': args.reg_loss_alpha,
                'beta': args.reg_loss_beta
            }
        else:
            args.reg_loss_params = None

    # create run_config
    run_config = RunConfig(**args.__dict__)

    #if args.open_test == False: # retrain and validate
    if args.open_vis:  # only open_vis in re-train phase, rather than both re-train and test.
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None
    if args.retrain_resume:
        logger.log(
            '=> Loading checkpoint from {:} of the last retrain phase'.format(
                args.resume_file),
            mode='info')
        # checkpoint_file from the last retrain phase.
        checkpoint_path = os.path.join(
            args.resume_file, 'checkpoints',
            'seed-{:}-retrain.pth'.format(args.random_seed))
        assert os.path.exists(
            checkpoint_path), 'cannot find retrain checkpoint file {:}'.format(
                checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path,
                                         normal_network,
                                         logger,
                                         run_config,
                                         vis,
                                         out_log=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager,
                                         logger)
        retrain_run_manager.optimizer.load_state_dict(
            checkpoint['weight_optimizer'])
        retrain_run_manager.scheduler.load_state_dict(checkpoint['scheduler'])
        retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0]
        retrain_run_manager.best_monitor = checkpoint['best_monitor'][1]
        retrain_run_manager.start_epoch = checkpoint['start_epoch']
        logger.log(
            '=> loaded checkpoint file {:} from the last retrain phase, starts with {:}-th epoch'
            .format(checkpoint_path, checkpoint['start_epoch']),
            mode='info')
    else:
        # todo from the search phase, read the last arch_checkpoint, rather than the best one.
        arch_checkpoint_path = os.path.join(
            args.checkpoint_file, 'checkpoints',
            'seed-{:}-arch.pth'.format(args.random_seed))
        assert os.path.exists(
            arch_checkpoint_path
        ), 'cannot find arch_checkpoint file {:} from search phase'.format(
            arch_checkpoint_path)
        checkpoint = torch.load(arch_checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        new_genotypes = []
        for _index, genotype in cell_genotypes:
            xlist = []
            for edge_genotype in genotype:
                for (node_str, select_index) in edge_genotype:
                    xlist.append((node_str, conv_candidates[select_index]))
            new_genotypes.append((_index, xlist))
        log_str = 'Obtained actual_path and cell_genotypes:\n' \
                  'Actual_path: {:}\n' \
                  'Genotype:\n'.format(actual_path)
        for _index, genotype in new_genotypes:
            log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
        logger.log(log_str, mode='info')
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path,
                                         normal_network,
                                         logger,
                                         run_config,
                                         vis,
                                         out_log=True)
        #normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager,
                                         logger)
        logger.log(
            '=> Construct NewGumbelAutoDeeplab according to the last-arch obtained from search phase',
            mode='info')
    # perform train and validation in train() method
    retrain_run_manager.train()
    '''
    else: # test phase
        checkpoint_path = os.path.join(args.resume_file, 'checkpoints', 'seed-{:}-retrain-best.pth'.format(args.random_seed))
        assert os.path.exists(checkpoint_path), 'cannot find best checkpoint {:} from the retrain phase'.format(checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
        normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier,
                                              args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        test_manager = RunManager(args.path, normal_network, logger, run_config, vis=None, out_log=True)
        display_all_families_information(args, 'retrain', test_manager, logger)

        # save testing configs
        save_configs(args.__dict__, args.path, 'test')
        test_manager.validate(epoch=None, is_test=    True, use_train_mode = False)
    '''
    logger.close()
Example #14
    def run(self, start_ip, num_apps_to_load, apps_powered_on,
            host_port, popular_host_fraction,
            operation_work_done, operation_mix,
            operation_busy_pct, memory_sizes, memory_mix,
            mean_think_time, users_per_popular_host,
            users_per_less_popular_host,
            connection_timeout_msecs, socket_timeout_msecs,
            results_dir="./results", run_duration_secs=60,
            config_dir="./config"):
        '''
        Given a starting IP and a step size, e.g.:
        1) run servers on IP addresses 11.0.0.1 - 11.0.0.200
        2) with a step size of 10, run experiments on 11.0.0.1 - 10,
           11.0.0.1 - 20, ... 11.0.0.1 - 200
        '''
        # Some pre-reqs:
        # 1) create the config_dir if it doesn't exist
        # 2) create the results_dir if it doesn't exist
        self.create_dir(config_dir)
        self.create_dir(results_dir)

        num_tests = apps_powered_on // num_apps_to_load
        for i in range(num_tests):
            # With a single Rain launch, load an entire block of ip's
            config = PreditableAppTestConfig()
            config.baseHostIp = start_ip
            config.numHostTargets = (i + 1) * num_apps_to_load
            config.duration = run_duration_secs
            config.hostPort = host_port
            config.popularHostFraction = popular_host_fraction
            config.usersPerPopularHost = users_per_popular_host
            config.usersPerLessPopularHost = users_per_less_popular_host
            #config.popularHostLoadFraction = popular_host_load_fraction
            #config.userPopulation = user_population
            config.meanThinkTime = mean_think_time
            # Add in the parameters for the workload generator
            # the operation mixes etc.
            generatorParams = PredictableAppGeneratorParameters()
            generatorParams.operationWorkDone = operation_work_done
            generatorParams.operationMix = operation_mix
            generatorParams.operationBusyPct = operation_busy_pct
            generatorParams.memorySizes = memory_sizes
            generatorParams.memoryMix = memory_mix
            generatorParams.connectionTimeoutMsecs = connection_timeout_msecs
            generatorParams.socketTimeoutMsecs = socket_timeout_msecs
            config.generatorParameters = generatorParams

            json_data = json.dumps(config, sort_keys=True,
                                   default=PreditableAppTestConfig.to_json)
            # Write this data out to a file, then invoke the run manager,
            # passing in the path to this file

            print( "[PredictableAppTestRunner] json config: {0}"\
                       .format(json_data) )

            run_classpath = ".:rain.jar:workloads/httptest.jar"
            run_config_filename = config_dir + "/" + \
                "run_predictable_config_" + start_ip + "_" + \
                str(config.numHostTargets) + "_nodes.json"
            run_output_filename = results_dir + "/" + \
                "run_predictable_log_" + start_ip + "_" + \
                str(config.numHostTargets) + "_nodes.txt"
            run_results_filename = results_dir + "/" + \
                "run_predictable_result_" + start_ip + "_" + \
                str(config.numHostTargets) + "_nodes.txt"

            # write the json data out to the config file
            # invoke the run manager passing the location of the config file
            # collect the results and write them out to the results_dir

            print("[PredictableAppTestRunner] Writing config file: {0}"
                  .format(run_config_filename))
            config_file = open(run_config_filename, 'w')
            config_file.write(json_data)
            config_file.flush()
            config_file.close()
            run_output = RunManager.run_rain( run_config_filename,\
                                               run_classpath )
            #print run_output
            track_results = RainOutputParser.parse_output(run_output)
            # Validate each of the track_results instances

            for result in track_results:
                # Set some 90th and 99th pctile thresholds
                result.pct_overhead_ops_threshold = 10.0
                result.pct_failed_ops_threshold = 5.0
                # Set the desired 90th and 99th percentile thresholds for
                # the 50ms, 100ms, 200ms operations - set everything to
                # 500 ms = 0.5s. Threshold units = seconds
                result.op_response_time_thresholds['PredicatableOp_50'] = (0.5, 0.5)
                result.op_response_time_thresholds['PredicatableOp_100'] = (0.5, 0.5)
                result.op_response_time_thresholds['PredicatableOp_200'] = (0.5, 0.5)

            # Write out the run output
            print("[PredictableAppTestRunner] Writing output: {0}"
                  .format(run_output_filename))
            run_output_file = open(run_output_filename, 'w')
            run_output_file.write(run_output)
            run_output_file.flush()
            run_output_file.close()

            # Write out the run results
            print("[PredictableAppTestRunner] Writing results: {0}"
                  .format(run_results_filename))
            run_results_file = open(run_results_filename, 'w')
            RainOutputParser.print_results(track_results, run_results_file)

            run_results_file.write("\n")
            # After writing out the table for all the tracks
            # Spit out the 90th and 99th percentiles
            for result in track_results:
                for k, v in result.op_response_times.items():
                    run_results_file.write( "{0},{1},{2},{3}\n"\
                               .format(result.name, k, v[0], v[1]) )

            run_results_file.flush()
            run_results_file.close()
Example #15
    def run(
        self,
        start_ip,
        num_apps_to_load,
        apps_powered_on,
        host_port,
        popular_host_fraction,
        operation_work_done,
        operation_mix,
        operation_busy_pct,
        memory_sizes,
        memory_mix,
        mean_think_time,
        users_per_popular_host,
        users_per_less_popular_host,
        connection_timeout_msecs,
        socket_timeout_msecs,
        results_dir="./results",
        run_duration_secs=60,
        config_dir="./config",
    ):
        """
        Given a starting IP and a step size, e.g.:
        1) run servers on IP addresses 11.0.0.1 - 11.0.0.200
        2) with a step size of 10, run experiments on 11.0.0.1 - 10,
           11.0.0.1 - 20, ... 11.0.0.1 - 200
        """
        # Some pre-reqs:
        # 1) create the config_dir if it doesn't exist
        # 2) create the results_dir if it doesn't exist
        self.create_dir(config_dir)
        self.create_dir(results_dir)

        num_tests = apps_powered_on // num_apps_to_load
        for i in range(num_tests):
            # With a single Rain launch, load an entire block of ip's
            config = PreditableAppTestConfig()
            config.baseHostIp = start_ip
            config.numHostTargets = (i + 1) * num_apps_to_load
            config.duration = run_duration_secs
            config.hostPort = host_port
            config.popularHostFraction = popular_host_fraction
            config.usersPerPopularHost = users_per_popular_host
            config.usersPerLessPopularHost = users_per_less_popular_host
            # config.popularHostLoadFraction = popular_host_load_fraction
            # config.userPopulation = user_population
            config.meanThinkTime = mean_think_time
            # Add in the parameters for the workload generator
            # the operation mixes etc.
            generatorParams = PredictableAppGeneratorParameters()
            generatorParams.operationWorkDone = operation_work_done
            generatorParams.operationMix = operation_mix
            generatorParams.operationBusyPct = operation_busy_pct
            generatorParams.memorySizes = memory_sizes
            generatorParams.memoryMix = memory_mix
            generatorParams.connectionTimeoutMsecs = connection_timeout_msecs
            generatorParams.socketTimeoutMsecs = socket_timeout_msecs
            config.generatorParameters = generatorParams

            json_data = json.dumps(config, sort_keys=True, default=PreditableAppTestConfig.to_json)
            # Write this data out to a file, then invoke the run manager,
            # passing in the path to this file

            print("[PredictableAppTestRunner] json config: {0}".format(json_data))

            run_classpath = ".:rain.jar:workloads/httptest.jar"
            run_config_filename = (
                config_dir
                + "/"
                + "run_predictable_config_"
                + start_ip
                + "_"
                + str(config.numHostTargets)
                + "_nodes.json"
            )
            run_output_filename = (
                results_dir + "/" + "run_predictable_log_" + start_ip + "_" + str(config.numHostTargets) + "_nodes.txt"
            )
            run_results_filename = (
                results_dir
                + "/"
                + "run_predictable_result_"
                + start_ip
                + "_"
                + str(config.numHostTargets)
                + "_nodes.txt"
            )

            # write the json data out to the config file
            # invoke the run manager passing the location of the config file
            # collect the results and write them out to the results_dir

            print("[PredictableAppTestRunner] Writing config file: {0}".format(run_config_filename))
            config_file = open(run_config_filename, "w")
            config_file.write(json_data)
            config_file.flush()
            config_file.close()
            run_output = RunManager.run_rain(run_config_filename, run_classpath)
            # print run_output
            track_results = RainOutputParser.parse_output(run_output)
            # Validate each of the track_results instances

            for result in track_results:
                # Set some 90th and 99th pctile thresholds
                result.pct_overhead_ops_threshold = 10.0
                result.pct_failed_ops_threshold = 5.0
                # Set the desired 90th and 99th percentile thresholds for
                # the 50ms, 100ms, 200ms operations - set everything to
                # 500 ms = 0.5s. Threshold units = seconds
                result.op_response_time_thresholds["PredicatableOp_50"] = (0.5, 0.5)
                result.op_response_time_thresholds["PredicatableOp_100"] = (0.5, 0.5)
                result.op_response_time_thresholds["PredicatableOp_200"] = (0.5, 0.5)

            # Write out the run output
            print("[PredictableAppTestRunner] Writing output: {0}".format(run_output_filename))
            run_output_file = open(run_output_filename, "w")
            run_output_file.write(run_output)
            run_output_file.flush()
            run_output_file.close()

            # Write out the run results
            print("[PredictableAppTestRunner] Writing results: {0}".format(run_results_filename))
            run_results_file = open(run_results_filename, "w")
            RainOutputParser.print_results(track_results, run_results_file)

            run_results_file.write("\n")
            # After writing out the table for all the tracks
            # Spit out the 90th and 99th percentiles
            for result in track_results:
                for k, v in result.op_response_times.items():
                    run_results_file.write("{0},{1},{2},{3}\n".format(result.name, k, v[0], v[1]))

            run_results_file.flush()
            run_results_file.close()
Example #16
def train(model, train_set, dev_set, test_set, hyper_params, batch_size,
          device):
    train_loader = DataLoader(train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=1)
    m = RunManager()
    optimizer = optim.AdamW(model.parameters(), lr=hyper_params.learning_rate)

    logging.info("Training Started...")
    m.begin_run(hyper_params, model, train_loader)
    for epoch in range(hyper_params.num_epoch):
        m.begin_epoch(epoch + 1)
        model.train()
        for batch in train_loader:
            texts = batch['text']
            lens = batch['length']
            targets = batch['codes']

            texts = texts.to(device)
            targets = targets.to(device)
            outputs, ldam_outputs, _ = model(texts, targets)

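            # prefer the margin-adjusted (LDAM) logits for the loss when the model provides them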
            if ldam_outputs is not None:
                loss = F.binary_cross_entropy_with_logits(
                    ldam_outputs, targets)
            else:
                loss = F.binary_cross_entropy_with_logits(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            m.track_loss(loss)
            # m.track_num_correct(preds, affinities)

        m.end_epoch()
    m.end_run()
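    # encode the hyperparameters into the results filename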
    hype = '_'.join([f'{k}_{v}' for k, v in hyper_params._asdict().items()])
    m.save(f'../results/train_results_{hype}')
    logging.info("Training finished.\n")

    # Evaluate on the training set
    train_loader = DataLoader(train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=1)
    probabs, targets, _, _ = evaluate(model,
                                      train_loader,
                                      device,
                                      dtset='train')
    compute_scores(probabs, targets, hyper_params, dtset='train')

    # Evaluate on the validation set
    dev_loader = DataLoader(dev_set,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1)
    probabs, targets, _, _ = evaluate(model, dev_loader, device, dtset='dev')
    compute_scores(probabs, targets, hyper_params, dtset='dev')

    # Evaluate on the test set
    test_loader = DataLoader(test_set,
                             batch_size=batch_size,
                             shuffle=True,
                             num_workers=1)
    probabs, targets, full_hadm_ids, full_attn_weights = evaluate(model,
                                                                  test_loader,
                                                                  device,
                                                                  dtset='test')
    compute_scores(probabs,
                   targets,
                   hyper_params,
                   dtset='test',
                   full_hadm_ids=full_hadm_ids,
                   full_attn_weights=full_attn_weights)
Example No. 17
# build net from args
from search.models.normal_nets import *
net_config_url = "https://hanlab.mit.edu/files/proxylessNAS/proxyless_cifar.config"
# net_config_url2 = 'https://hanlab.mit.edu/files/proxylessNAS/proxyless_cpu.config'
net = pyramid_base(net_config=net_config_url,
                   n_classes=run_config.data_provider.n_classes,
                   bn_param=(args.bn_momentum, args.bn_eps),
                   dropout_rate=args.dropout)
# net2 = proxyless_base(net_config=net_config_url2, n_classes=run_config.data_provider.n_classes,
#     bn_param=(args.bn_momentum, args.bn_eps), dropout_rate=args.dropout)
# build run manager
run_manager = RunManager(args.path,
                         net,
                         run_config,
                         measure_latency=args.latency)
# run_manager.save_config(print_info=True)
# load checkpoints
init_path = '%s/init' % args.path
if args.resume:
    run_manager.load_model()
    if args.train and run_manager.best_acc == 0:
        loss, acc1, acc5 = run_manager.validate(is_test=False,
                                                return_top5=True)
        run_manager.best_acc = acc1
elif os.path.isfile(init_path):
    if torch.cuda.is_available():
        checkpoint = torch.load(init_path)
    else:
        checkpoint = torch.load(init_path, map_location='cpu')
Example No. 18
def main(args):

    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)
    set_manual_seed(args.random_seed)
    #print_experiment_environment()
    EXP_time = time_for_file()
    args.path = os.path.join(args.path, args.exp_name, EXP_time)
    os.makedirs(args.path, exist_ok=True)
    create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # weight optimizer config, related to network_weight_optimizer, scheduler, and criterion
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None
    # weight_optimizer_config, used in run_manager to get weight_optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }
    # TODO: needs modification; not needed in counter_network
    args.conv_candidates = [
        '3x3_MBConv3',
        '3x3_MBConv6',
        '5x5_MBConv3',
        '5x5_MBConv6',
        '7x7_MBConv3',
        '7x7_MBConv6',
        'Zero',  #'Identity'
    ]
    run_config = RunConfig(**args.__dict__)
    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None

    # related to hardware constraint
    # TODO: get rid of
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

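    # gather the architecture-search related args (arch optimizer, regularization loss) into one config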
    arch_search_config = ArchSearchConfig(**args.__dict__)
    # save the run config and arch search config for this experiment
    save_configs(run_config.config, arch_search_config.config, args.path,
                 'search')
    logger = prepare_logger(args)
    if args.open_vis:
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None
    '''
    super_network = GumbelAutoDeepLab(
        args.filter_multiplier, args.block_multiplier, args.steps,
        args.nb_classes, args.nb_layers, args.bn_momentum, args.bn_eps, args.conv_candidates, logger
    )
    '''
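    # the GumbelAutoDeepLab super-network above is disabled; a counter-style MBConv search network is used instead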
    super_network = CounterMBConvNet(2, search_space=args.search_space)
    train_manager = RunManager(args.path,
                               super_network,
                               logger,
                               run_config,
                               vis=vis,
                               out_log=True)
    # train search phase
    train_manager.train()
    logger.close()