def predict_lat(self, config):
        with torch.no_grad():
            features = utils.get_config_features(config)
            features_norm = np.array(features) / self.feature_norm

            prediction = self.model(torch.Tensor(features_norm)).item() * self.lat_norm

        return prediction
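# A self-contained sketch of the normalize -> MLP -> denormalize pattern that
# predict_lat implements. The layer sizes, norm constants, and feature vector
# below are made-up placeholders for illustration, not values from this repo:
import numpy as np
import torch
import torch.nn as nn

feature_norm = np.array([640., 6., 2048., 6., 640., 6., 2048., 6., 6., 2.])  # placeholder scales
lat_norm = 200.  # placeholder latency scale (ms)
model = nn.Sequential(nn.Linear(10, 400), nn.ReLU(), nn.Linear(400, 1))  # stand-in predictor

features = [512, 6, 1024, 4, 512, 6, 1024, 4, 2, 1]  # placeholder config features
with torch.no_grad():
    pred = model(torch.Tensor(np.array(features) / feature_norm)).item() * lat_norm
print(f'predicted latency: {pred:.2f} ms')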
Example #2
import time

import numpy as np
import torch
from tqdm import tqdm

# tasks and utils are assumed to come from the HAT fork of fairseq
from fairseq import tasks, utils


def main(args):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Print args
    print(args)

    # Setup task
    task = tasks.setup_task(args)

    # Build model
    model = task.build_model(args)
    print(model)

    # specify the length of the dummy input for profiling
    # for IWSLT the average sentence length is 23; for WMT it is 30
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    if 'iwslt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['iwslt']
    elif 'wmt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['wmt']
    else:
        raise NotImplementedError

    dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1)
    dummy_prev = [7] * (dummy_sentence_length - 1) + [2]
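    # token id 2 is fairseq's default EOS and 7 stands in for an ordinary word;
    # the exact ids do not matter here since only timing is measured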

    # for latency predictor: latency dataset generation
    with open(args.lat_dataset_path, 'w') as fid:
        src_tokens_test = torch.tensor([dummy_src_tokens], dtype=torch.long)
        src_lengths_test = torch.tensor([dummy_sentence_length])
        prev_output_tokens_test_with_beam = torch.tensor([dummy_prev] * args.beam, dtype=torch.long)
        if args.latcpu:
            model.cpu()
            print('Measuring model latency on CPU for dataset generation...')
        elif args.latgpu:
            model.cuda()
            src_tokens_test = src_tokens_test.cuda()
            src_lengths_test = src_lengths_test.cuda()
            prev_output_tokens_test_with_beam = prev_output_tokens_test_with_beam.cuda()
            print('Measuring model latency on GPU for dataset generation...')
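            # torch.cuda.Event timing: record() enqueues a marker on the CUDA
            # stream, and elapsed_time() (valid after synchronize) reports milliseconds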
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)

        feature_info = utils.get_feature_info()
        fid.write(','.join(feature_info) + ',')
        latency_info = ['latency_mean_encoder', 'latency_mean_decoder', 'latency_std_encoder', 'latency_std_decoder']
        fid.write(','.join(latency_info) + '\n')

        for i in range(args.lat_dataset_size):
            print(i)
            config_sam = utils.sample_configs(utils.get_all_choices(args), reset_rand_seed=False, super_decoder_num_layer=args.decoder_layers)

            features = utils.get_config_features(config_sam)
            fid.write(','.join(map(str, features)) + ',')
            
            model.set_sample_config(config_sam)

            # dry runs
            for _ in range(5):
                encoder_out_test = model.encoder(src_tokens=src_tokens_test, src_lengths=src_lengths_test)
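            # the last dry-run output is reused below when profiling the decoder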

            encoder_latencies = []
            print('Measuring encoder for dataset generation...')
            for _ in tqdm(range(args.latiter)):
                if args.latgpu:
                    start.record()
                elif args.latcpu:
                    start = time.time()

                model.encoder(src_tokens=src_tokens_test, src_lengths=src_lengths_test)

                if args.latgpu:
                    end.record()
                    torch.cuda.synchronize()
                    encoder_latencies.append(start.elapsed_time(end))
                    if not args.latsilent:
                        print('Encoder one run on GPU (for dataset generation): ', start.elapsed_time(end))

                elif args.latcpu:
                    end = time.time()
                    encoder_latencies.append((end - start) * 1000)
                    if not args.latsilent:
                        print('Encoder one run on CPU (for dataset generation): ', (end - start) * 1000)

            # keep only the 10th-90th percentile latencies to avoid outliers
            encoder_latencies.sort()
            encoder_latencies = encoder_latencies[int(args.latiter * 0.1): -max(1, int(args.latiter * 0.1))]
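            # e.g. with --latiter 300, the slice keeps sorted samples 30..269 (240 values)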
            print(f'Encoder latency for dataset generation: Mean: {np.mean(encoder_latencies)} ms; \t Std: {np.std(encoder_latencies)} ms')

            bsz = 1
            new_order = torch.arange(bsz).view(-1, 1).repeat(1, args.beam).view(-1).long()
            if args.latgpu:
                new_order = new_order.cuda()

            encoder_out_test_with_beam = model.encoder.reorder_encoder_out(encoder_out_test, new_order)
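            # reorder_encoder_out tiles the encoder states to the beam size:
            # with bsz=1 and beam=5, new_order=[0, 0, 0, 0, 0] duplicates the
            # single sentence's encoder output five times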

            # dry runs
            for _ in range(5):
                model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam,
                                   encoder_out=encoder_out_test_with_beam)

            # the decoder is more involved: it runs autoregressively, one token per
            # step, and we must manage its incremental (cached) states
            decoder_iterations_dict = {'iwslt': 23, 'wmt': 30}
            if 'iwslt' in args.arch:
                decoder_iterations = decoder_iterations_dict['iwslt']
            elif 'wmt' in args.arch:
                decoder_iterations = decoder_iterations_dict['wmt']

            decoder_latencies = []
            print('Measuring decoder for dataset generation...')
            for _ in tqdm(range(args.latiter)):
                if args.latgpu:
                    start.record()
                elif args.latcpu:
                    start = time.time()
                incre_states = {}
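                # the incremental_state dict caches per-layer keys/values, so each
                # step only computes attention for the newest token, as in real
                # beam-search decoding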
                for k_regressive in range(decoder_iterations):
                    model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam[:, :k_regressive + 1],
                                       encoder_out=encoder_out_test_with_beam, incremental_state=incre_states)
                if args.latgpu:
                    end.record()
                    torch.cuda.synchronize()
                    decoder_latencies.append(start.elapsed_time(end))
                    if not args.latsilent:
                        print('Decoder one run on GPU (for dataset generation): ', start.elapsed_time(end))

                elif args.latcpu:
                    end = time.time()
                    decoder_latencies.append((end - start) * 1000)
                    if not args.latsilent:
                        print('Decoder one run on CPU (for dataset generation): ', (end - start) * 1000)

            # keep only the 10th-90th percentile latencies to avoid outliers
            decoder_latencies.sort()
            decoder_latencies = decoder_latencies[int(args.latiter * 0.1): -max(1, int(args.latiter * 0.1))]

            print(decoder_latencies)
            print(f'Decoder latency for dataset generation: Mean: {np.mean(decoder_latencies)} ms; \t Std: {np.std(decoder_latencies)} ms')

            lats = [np.mean(encoder_latencies), np.mean(decoder_latencies), np.std(encoder_latencies), np.std(decoder_latencies)]
            fid.write(','.join(map(str, lats)) + '\n')
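# The CSV written above can be loaded directly when training the latency
# predictor; a minimal sketch (the path below is hypothetical, and the column
# names come from utils.get_feature_info() plus the latency_info header):
import csv

with open('latency_dataset/lat.tmp') as f:  # hypothetical --lat-dataset-path value
    rows = list(csv.DictReader(f))
print(len(rows), 'samples; first mean encoder latency:', rows[0]['latency_mean_encoder'])

Example #3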
import random
import subprocess

import torch

# tasks and utils are assumed to come from the HAT fork of fairseq
from fairseq import tasks, utils

# assumed module-level settings that the original listing leaves undefined:
dry_run_iters = 5  # warm-up iterations, matching the latency script above
dynamorio_dir = '/path/to/DynamoRIO'  # hypothetical DynamoRIO install dir
dev_conf_dir = '/path/to/device_configs'  # hypothetical drcachesim config dir


def main(args):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    torch.manual_seed(args.seed)

    # Print args
    print(args)

    # Setup task
    task = tasks.setup_task(args)

    # Build model
    model = task.build_model(args)
    print(model)

    # specify the length of the dummy input for profiling
    # for IWSLT the average sentence length is 23; for WMT it is 30
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    if 'iwslt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['iwslt']
    elif 'wmt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['wmt']
    else:
        raise NotImplementedError

    dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1)
    dummy_prev = [7] * (dummy_sentence_length - 1) + [2]

    # for device predictor: device dataset generation
    # The temporary format is (SubTransformer, DeviceFeature) -> CacheMisses
    with open(args.lat_dataset_path, 'w') as fid:
        src_tokens_test = torch.tensor([dummy_src_tokens], dtype=torch.long)
        src_lengths_test = torch.tensor([dummy_sentence_length])
        prev_output_tokens_test_with_beam = torch.tensor([dummy_prev] *
                                                         args.beam,
                                                         dtype=torch.long)
        if args.latcpu:
            model.cpu()
            print(
                'Measuring model cache misses on CPU for dataset generation...'
            )
        elif args.latgpu:
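            # drcachesim simulates CPU caches only, so GPU profiling is skipped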
            return

        feature_info = utils.get_feature_info()
        fid.write(','.join(feature_info) + ',')
        device_feature_info = [
            'l1d_size', 'l1i_size', 'l2_size', 'l3_size', 'num_cores'
        ]  # unit for cache sizes is KB
        fid.write(','.join(device_feature_info) + ',')
        misses_info = ['l1d_misses', 'l1i_misses', 'l2_misses', 'l3_misses']
        fid.write(','.join(misses_info) + '\n')

        device_feature_set = [[32, 32, 256, 4096, 1], [32, 32, 512, 4096, 1],
                              [32, 32, 512, 8192, 1], [64, 64, 512, 8192, 1]]
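        # e.g. the first device models 32 KB L1d/L1i caches, a 256 KB L2,
        # a 4 MB L3, and a single core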
        num_devices = len(device_feature_set)

        for i in range(args.lat_dataset_size):
            print(i)

            config_sam = utils.sample_configs(
                utils.get_all_choices(args),
                reset_rand_seed=False,
                super_decoder_num_layer=args.decoder_layers)
            features = utils.get_config_features(config_sam)
            fid.write(','.join(map(str, features)) + ',')

            device_id = random.randint(0, num_devices - 1)  # randomly pick a device
            device_features = device_feature_set[device_id]
            fid.write(','.join(map(str, device_features)) + ',')

            model.set_sample_config(config_sam)
            # dry runs
            for _ in range(dry_run_iters):
                encoder_out_test = model.encoder(src_tokens=src_tokens_test,
                                                 src_lengths=src_lengths_test)

            # run encoder inference in a fresh Python process under DynamoRIO's
            # drcachesim tool (drrun -t drcachesim -config_file <dev.conf> -- <app>),
            # so cache misses are simulated with the selected device's cache config
            arguments = [
                dynamorio_dir + '/bin64/drrun', '-t', 'drcachesim',
                '-config_file',
                dev_conf_dir + '/dev' + str(device_id) + '.conf', '--',
                'python', '-u', 'encoder_infer.py',
                str(dummy_sentence_length)
            ]
            print(arguments)
            enproc = subprocess.Popen(arguments, stdout=subprocess.PIPE)
            while True:
                line = enproc.stdout.readline()
                if not line:
                    break
                print('line: ' + line.decode('utf-8'))
            encoder_misses = []
            print('Measuring encoder for dataset generation...')
            # TODO: run model.encoder(src_tokens=src_tokens_test, src_lengths=src_lengths_test)
            # under the simulator and write the reported miss counts into encoder_misses
            # (see the parsing sketch after this listing)
            print(encoder_misses)

            bsz = 1
            new_order = torch.arange(bsz).view(-1, 1).repeat(
                1, args.beam).view(-1).long()

            encoder_out_test_with_beam = model.encoder.reorder_encoder_out(
                encoder_out_test, new_order)

            # dry runs
            for _ in range(dry_run_iters):
                model.decoder(
                    prev_output_tokens=prev_output_tokens_test_with_beam,
                    encoder_out=encoder_out_test_with_beam)

            # the decoder is more involved: it runs autoregressively, one token per
            # step, and we must manage its incremental (cached) states
            decoder_iterations_dict = {'iwslt': 23, 'wmt': 30}
            if 'iwslt' in args.arch:
                decoder_iterations = decoder_iterations_dict['iwslt']
            elif 'wmt' in args.arch:
                decoder_iterations = decoder_iterations_dict['wmt']

            decoder_misses = []
            print('Measuring decoder for dataset generation...')
            #TODO run the below commands in simulator and write results into decoder_misses
            #incre_states = {}
            #for k_regressive in range(decoder_iterations):
            #    model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam[:, :k_regressive + 1],
            #                  encoder_out=encoder_out_test_with_beam, incremental_state=incre_states)
            print(decoder_misses)

            # TODO: decide whether the encoder and decoder misses should be summed
            fid.write(','.join(map(str, encoder_misses)) + '\n')  # temporary: encoder misses only
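# Hedged sketch of the missing parsing step for the TODOs above. The exact text
# drcachesim prints varies across DynamoRIO versions, so the pattern below is an
# assumption to adapt, not a documented output format:
import re

def parse_misses(sim_output: str):
    """Collect every 'Misses: <n>' figure in order of appearance."""
    return [int(m.replace(',', '')) for m in re.findall(r'Misses:\s*([\d,]+)', sim_output)]

# e.g. (hypothetical): encoder_misses = parse_misses(enproc.stdout.read().decode('utf-8'))[:4]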