def restartPAPI(aspect):
    """Stop any running PAPI counters and start a single counter for ``aspect``.

    Args:
        aspect: Name of the event to monitor. Only ``'PAPI_L1_DCM'`` and
            ``'PAPI_TOT_INS'`` are recognised; for any other value the
            counters are stopped and nothing is started.
    """
    try:
        papi_high.stop_counters()
    except Exception:
        # Best-effort stop: a failure here is deliberately ignored so the
        # restart below can proceed.  Unlike a bare ``except`` this still
        # lets KeyboardInterrupt / SystemExit propagate.
        pass

    if aspect == 'PAPI_L1_DCM':
        papi_high.start_counters([papi_events.PAPI_L1_DCM])
    elif aspect == 'PAPI_TOT_INS':
        papi_high.start_counters([papi_events.PAPI_TOT_INS])
def main():
    """Run one matrix-multiplication benchmark under PAPI counters.

    Command-line arguments (read from ``sys.argv``):
        1: algorithm selector (1 = basic multiply, 2 = line multiply)
        2: size of the square matrix A
        3: size of the square matrix B

    Returns:
        1 if ``right_args`` rejects the arguments, 2 if ``valid_algorithm``
        rejects the selector, otherwise ``None`` after printing the results.
    """
    if not right_args(len(sys.argv), sys.argv):
        return 1

    size_A = int(sys.argv[2])
    size_B = int(sys.argv[3])
    tot_FLOPS = 2 * MX * MX * size_A

    if valid_algorithm(sys.argv[1]):
        algorithm = int(sys.argv[1])
    else:
        return 2

    matrix_A = Matrix(size_A, size_A)
    fill(matrix_A)
    matrix_B = Matrix(size_B, size_B)
    fill(matrix_B)

    # Test Matrix Values
    # print_matrix(matrix_A)
    # print_matrix(matrix_B)

    # Starts some counters
    papi_high.start_counters([
        papi_events.PAPI_L1_DCM,
        papi_events.PAPI_L1_ICM,
        papi_events.PAPI_L1_TCM,
        papi_events.PAPI_L2_TCM,
        papi_events.PAPI_L3_TCM,
        papi_events.PAPI_TOT_INS
    ])
    try:
        # NOTE(review): presumably valid_algorithm() only accepts 1 or 2;
        # any other value leaves start_time/end_time unbound below.
        if algorithm == 1:
            print('Basic Matrix Multiplication')
            start_time = time.time()
            result = basic_multiply(matrix_A, matrix_B)
            end_time = time.time()
            # print_matrix(result)
        elif algorithm == 2:
            print('Line Matrix Multiplication')
            start_time = time.time()
            result = line_multiply(matrix_A, matrix_B)
            end_time = time.time()
            # print_matrix(result)

        # Reads values from counters and reset them
        results = papi_high.read_counters()

        # Print results
        print_times(results, tot_FLOPS, start_time, end_time)
    finally:
        # Stop counters even when the multiplication or the reporting fails,
        # so they are not left running.
        papi_high.stop_counters()
def profile(model, inputs, repeats=1000):
    """Collect parameter count, PAPI operation count and latency of a model.

    One forward pass is executed with the ``PAPI_SP_OPS`` counter running;
    afterwards ``repeats`` forward passes are timed with ``perf_counter``.

    Args:
        model: Object exposing ``forward(*inputs)`` and ``parameters()``.
        inputs: Positional arguments unpacked into ``model.forward``.
        repeats: Number of timed forward passes.

    Returns:
        dict with the keys ``"params(M)"``, ``"flops(M)"``,
        ``"inf_time_mean(ms)"`` and ``"inf_time_std(ms)"``.
    """
    # Reference for counting flops: http://www.bnikolic.co.uk/blog/python/flops/2019/10/01/pytorch-count-flops.html
    from pypapi import papi_high
    from pypapi import events as papi_events
    from time import perf_counter

    million = 1000000.0

    # Counter value of a single forward pass, scaled to millions.
    papi_high.start_counters([papi_events.PAPI_SP_OPS])
    model.forward(*inputs)
    counter_values = papi_high.stop_counters()
    mflops = counter_values[0] / million

    # Wall-clock duration of each repeated forward pass, in seconds.
    durations = []
    for _ in range(repeats):
        began = perf_counter()
        model.forward(*inputs)
        finished = perf_counter()
        durations.append(finished - began)

    n_params = sum(weight.numel() for weight in model.parameters())
    durations_ms = np.array(durations) * 1000

    report = {}
    report["params(M)"] = n_params / million
    report["flops(M)"] = mflops
    report["inf_time_mean(ms)"] = np.mean(durations_ms)
    report["inf_time_std(ms)"] = np.std(durations_ms)
    return report
def run(self):
    """Time a float32 matrix product and record the results in ``self.params``.

    Reads ``self.matrix_size`` as ``(M, N, K)`` and multiplies an ``M x N``
    by an ``N x K`` random matrix ``nb_epoch`` times.

    Writes to ``self.params``:
        ``"time"``: mean wall-clock time of one product.
        ``"GFLOP/sec"``: ``self.params["GFLOP"]`` divided by that time.
        ``"GFLOP_papi"``: only when the PAPI counter could be started; the
            ``PAPI_SP_OPS`` reading over all epochs divided by ``1024**3``.

    Raises:
        Exception: if ``self.params["nb_gpus"]`` is present and positive.
    """
    if "nb_gpus" in self.params:
        if self.params["nb_gpus"] > 0:
            raise Exception(
                "Numpy framework does not work with GPU back-end")

    M, N, K = self.matrix_size
    dtype = np.float32
    a = np.random.random((M, N)).astype(dtype)
    b = np.random.random((N, K)).astype(dtype)
    # Overwritten by the product below; kept so the global NumPy random
    # stream is consumed exactly as before.
    c = np.random.random((M, K)).astype(dtype)
    nb_epoch = 2

    papi_available = True
    try:
        high.start_counters([
            events.PAPI_SP_OPS,
        ])
    except Exception:
        # PAPI is optional: fall back to timing only.  A bare ``except``
        # would also have swallowed KeyboardInterrupt / SystemExit.
        papi_available = False

    time_start = timer()
    for _ in range(nb_epoch):
        c = a @ b  # + c
    time_end = timer()

    if papi_available:
        gflop_papi = high.stop_counters()[0] / (1024**3)
        self.params["GFLOP_papi"] = gflop_papi

    elapsed_time = (time_end - time_start) / nb_epoch
    self.params["time"] = elapsed_time
    self.params["GFLOP/sec"] = self.params["GFLOP"] / elapsed_time
def time_kernel(self, *args, **kwargs):
    """Execute ``self.kernel`` under the instrumentation selected by the flags.

    * ``POLYBENCH_TIME`` or ``POLYBENCH_GFLOPS``: wrap one kernel run between
      the timer start/stop helpers.
    * ``POLYBENCH_PAPI``: run the kernel once per entry of the PAPI counter
      list, collecting each result.
    * otherwise: prepare the instruments and run the kernel once.

    Afterwards, if ``POLYBENCH_LINUX_FIFO_SCHEDULER`` is set, the standard
    Linux scheduler helper is invoked.

    Args:
        *args, **kwargs: forwarded unchanged to ``self.kernel`` (and to
            ``self.initialize_array`` in the PAPI mode).
    """
    if self.POLYBENCH_TIME or self.POLYBENCH_GFLOPS:
        # Simple time measurement
        self.__timer_start()
        self.kernel(*args, **kwargs)
        self.__timer_stop()
    elif self.POLYBENCH_PAPI:
        # Measuring performance counters is a bit tricky. The API allows to
        # monitor multiple counters at once, but that is not accurate so each
        # counter is measured independently within a loop.
        self.__papi_init()  # Initializes self.__papi_counters and self.__papi_available_counters
        self.__prepare_instruments()

        # self.__papi_counters holds a list of available counter ids and
        # self.__papi_counters_result holds the actual counter return values.
        for position, counter in enumerate(self.__papi_counters):
            if position:
                # Every run after the first one starts from fresh arrays.
                self.initialize_array(*args, **kwargs)
            papi_high.start_counters([counter])  # requires a list of counters
            self.kernel(*args, **kwargs)
            readings = papi_high.stop_counters()  # returns a list of counter results
            self.__papi_counters_result.extend(readings)
    else:
        # Default kernel run
        self.__prepare_instruments()
        self.kernel(*args, **kwargs)

    # Something like stop_instruments()
    if self.POLYBENCH_LINUX_FIFO_SCHEDULER:
        self.__linux_standard_scheduler()
def count_flops(
    self,
    dataset,
    repeat,
):
    """Use PYPAPI library to count average flops for model inference.

    Note: It only works if the model is being run on cpu

    Args:
        dataset: Sized iterable of samples; each one is passed to
            ``self.forward``.
        repeat: How many times ``self.forward`` is called per sample.

    Returns:
        The ``PAPI_DP_OPS`` reading divided by ``repeat * len(dataset)``,
        rounded to an integer.

    Raises:
        ValueError: if ``dataset`` is empty or ``repeat`` is smaller than 1,
            because the average would otherwise divide by zero.
    """
    n_samples = len(dataset)
    if n_samples == 0 or repeat < 1:
        # Checked before any counter is started so nothing is left running.
        raise ValueError(
            "count_flops needs a non-empty dataset and repeat >= 1")

    logger.info("Starting flop counter")
    high.start_counters([events.PAPI_DP_OPS])
    try:
        for i, sample in enumerate(dataset):
            for _r in range(repeat):
                self.forward(sample)
            if i % 100 == 0:
                logger.info(f"Counted flops for {i}/{n_samples} samples")
    finally:
        # Always stop the counter, even when forward() raises.
        flops = high.stop_counters()

    return round(flops[0] / (repeat * n_samples))
def spoped(*args, **kwargs):
    """Call ``func`` while counting ``PAPI_SP_OPS`` around the call.

    The optional keyword ``log_spops`` is not forwarded to ``func``; when
    given, the counter reading is stored in it under ``func.__name__``.
    When ``show`` is truthy the reading is also printed.

    If the event cannot be started (``PapiNoEventError``), a warning is
    issued and ``func`` is executed without counting.

    Returns:
        Whatever ``func(*args, **kwargs_without_log_spops)`` returns.
    """
    kkwargs = kwargs.copy()
    # Drop the bookkeeping keyword if it is provided.
    kkwargs.pop("log_spops", None)

    # Only the counter start-up is guarded: wrapping the call to func as
    # well would run func a second time if the error surfaced after it had
    # already executed.
    try:
        high.start_counters([events.PAPI_SP_OPS])
    except pypapi.exceptions.PapiNoEventError as e:
        warnings.warn(
            "{} \nYour kernel might not "
            "support this function. Function {} is executed without SPOP "
            "counting.".format(e, func.__name__))
        return func(*args, **kkwargs)

    try:
        result = func(*args, **kkwargs)
    finally:
        # Stop the counter even if func raises, so the next start succeeds.
        readings = high.stop_counters()

    spops = readings[0]
    if "log_spops" in kwargs:
        kwargs["log_spops"][func.__name__] = spops
    if show:
        print("{}\tSPOPS:\t\t{}".format(func.__name__, spops))
    return result
# Rewrite every saliency record from `saliency_path` into `output_path` with
# random per-class scores, counting PAPI_FP_OPS for each record.
# NOTE(review): the nesting below was reconstructed from a flattened source;
# confirm that the two summary statements belong after the per-line loop.
with open(saliency_path) as out, open(output_path, 'w') as output_sal:
    saliency_flops = []
    for line in out:
        high.start_counters([
            events.PAPI_FP_OPS,
        ])
        try:
            instance_saliency = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError: skip the unparsable
            # line and use the following one instead.
            line = next(out)
            instance_saliency = json.loads(line)

        for i, token in enumerate(instance_saliency['tokens']):
            if token['token'] == '[PAD]':
                continue
            for _c in classes:
                instance_saliency['tokens'][i][str(_c)] = np.random.rand()

        output_sal.write(json.dumps(instance_saliency) + '\n')
        x = sum(high.stop_counters())
        saliency_flops.append(x)

    print(np.mean(saliency_flops), np.std(saliency_flops))
    flops.append(np.mean(saliency_flops))

# The backslash is doubled so the literal text `$\pm$` no longer relies on an
# invalid escape sequence; the printed output is unchanged.
print('FLOPs', f'{np.mean(flops):.2f} ($\\pm${np.std(flops):.2f})')
if __name__ == '__main__': datadir = '../data' # x_train: (n_samples, width, height) (x_train, y_train), (x_test, y_test) = load_data(args.dataset, num_classes, datadir) if args.count_flops: rcdt_ns_obj = RCDT_NS(num_classes, theta, rm_edge) for n_samples in [1, 10, 100]: high.start_counters([ events.PAPI_DP_OPS, ]) rcdt_test = x_train[:n_samples] rcdt_test = rcdt_ns_obj.fun_rcdt_batch(rcdt_test) x = high.stop_counters()[0] print('rcdt_test.shape {} GFLOPS {}'.format( rcdt_test.shape, x / 1e9)) rcdt_gflops = (x / 1e9) / n_samples print('rcdt_gflops: {}'.format(rcdt_gflops)) num_repeats = 10 accs = [] all_preds = [] if args.count_flops: all_train_gflops, all_test_gflops = [], [] for n_samples_perclass in [2**i for i in range(0, po_train_max + 1)]: for repeat in range(num_repeats): x_train_sub, y_train_sub = take_train_samples( x_train, y_train, n_samples_perclass, num_classes, repeat)
# https://github.com/Lyken17/pytorch-OpCounter
# https://github.com/sovrasov/flops-counter.pytorch/issues/16
def train_gflops(model, epochs=1, num_train_samples=1, input_size=28):
    """Estimate training GFLOPs from the single-sample inference estimate.

    Args:
        model: Model name accepted by ``test_gflops``.
        epochs: Number of training epochs.
        num_train_samples: Number of training samples per epoch.
        input_size: Side length of the square input image.

    Returns:
        ``epochs * num_train_samples * 2 * test_gflops(model, input_size)``.
    """
    # test_gflops takes (model, input_size); the previous three-argument
    # call raised TypeError.
    gflops = epochs * num_train_samples * 2 * test_gflops(model, input_size)
    return gflops


# Module-level measurement: PAPI_DP_OPS of one double-precision forward pass
# of MNISTNet on a random 1x3x28x28 input, printed after scaling by 1e9.
himodel = MNISTNet(3, 10, img_size=28).double()
high.start_counters([
    events.PAPI_DP_OPS,
])
himodel(torch.randn(1, 3, 28, 28).double())
print(high.stop_counters()[0] / 1e9)


def test_gflops(model, input_size):
    """Estimate inference GFLOPs of a named model for one square RGB input.

    Args:
        model: One of ``'shallowcnn'``, ``'resnet18'`` or ``'vgg11'``.
        input_size: Side length of the random ``1 x 3 x size x size`` input.

    Returns:
        ``2 * macs / 1e9`` where ``macs`` is the first value returned by
        ``profile``; the value is also printed.

    Raises:
        AssertionError: if ``model`` is not one of the supported names.
    """
    assert model in ['shallowcnn', 'resnet18', 'vgg11']
    if model == 'shallowcnn':
        model = MNISTNet(3, 10, img_size=input_size)
    if model == 'resnet18':
        model = models.resnet18(num_classes=10)
    if model == 'vgg11':
        model = models.vgg11_bn(num_classes=10)

    # Named `sample` rather than `input` to avoid shadowing the builtin.
    sample = torch.randn(1, 3, input_size, input_size)
    macs, params = profile(model, inputs=(sample, ))
    gflops = 2 * macs / 1e9
    print(gflops)
    return gflops
def generate_saliency(model_path, saliency_path):
    """Write LIME token saliencies for the test split, one JSON line each.

    The checkpoint at ``model_path`` is loaded into the model selected by
    ``args.model`` and wrapped for the explainer. For every instance the
    token ids are explained with ``LimeTextExplainer``; if the explainer
    raises, the exception is printed and all-zero saliencies are written
    instead. Unless ``args.no_time`` is set, ``PAPI_FP_OPS`` is counted
    around the tokenisation and explanation of each instance.

    Args:
        model_path: Path of the checkpoint to load with ``torch.load``.
        saliency_path: Output file, overwritten.

    Returns:
        List with one counter reading per instance (empty when
        ``args.no_time`` is set).
    """
    test = get_dataset(path=args.dataset_dir, mode=args.split,
                       dataset=args.dataset)
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    model_args = argparse.Namespace(**checkpoint['args'])

    if args.model == 'trans':
        model_args.batch_size = 7
        transformer_config = BertConfig.from_pretrained(
            'bert-base-uncased', num_labels=model_args.labels)
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=transformer_config).to(device)
        model.load_state_dict(checkpoint['model'])
        modelw = BertModelWrapper(model, device, tokenizer, model_args)
    elif args.model == 'lstm':
        model_args.batch_size = 200
        model = LSTM_MODEL(tokenizer, model_args,
                           n_labels=checkpoint['args']['labels'],
                           device=device).to(device)
        model.load_state_dict(checkpoint['model'])
        modelw = ModelWrapper(model, device, tokenizer, model_args)
    else:
        model_args.batch_size = 300
        model = CNN_MODEL(tokenizer, model_args,
                          n_labels=checkpoint['args']['labels']).to(device)
        model.load_state_dict(checkpoint['model'])
        modelw = ModelWrapper(model, device, tokenizer, model_args)
    modelw.eval()

    explainer = LimeTextExplainer()
    saliency_flops = []
    min_length = 6

    with open(saliency_path, 'w') as out:
        for instance in tqdm(test):
            # SALIENCY
            if not args.no_time:
                high.start_counters([events.PAPI_FP_OPS, ])

            if args.dataset in ('imdb', 'tweet'):
                token_ids = tokenizer.encode(instance[0])
            else:
                token_ids = tokenizer.encode(instance[0], instance[1])

            # Right-pad short inputs up to the minimum length.
            shortfall = min_length - len(token_ids)
            if shortfall > 0:
                token_ids = token_ids + [tokenizer.pad_token_id] * shortfall

            failed = False
            try:
                exp = explainer.explain_instance(
                    " ".join([str(i) for i in token_ids]),
                    modelw,
                    num_features=len(token_ids),
                    top_labels=args.labels)
            except Exception as e:
                print(e)
                failed = True

            if not args.no_time:
                saliency_flops.append(high.stop_counters()[0])

            if failed:
                # Explainer failed: emit a zero score for every class.
                explanation = None
            else:
                # SERIALIZE
                explanation = {}
                for cls_ in range(args.labels):
                    explanation[cls_] = {
                        int(w): s for (w, s) in exp.as_list(label=cls_)
                    }

            saliencies = []
            for raw_id in token_ids:
                token_id = int(raw_id)
                token_saliency = {'token': tokenizer.ids_to_tokens[token_id]}
                for cls_ in range(args.labels):
                    if explanation is None:
                        token_saliency[int(cls_)] = 0
                    else:
                        token_saliency[int(cls_)] = explanation[cls_].get(
                            token_id)
                saliencies.append(token_saliency)

            out.write(json.dumps({'tokens': saliencies}) + '\n')
            out.flush()

    return saliency_flops
def generate_saliency(model_path, saliency_path, saliency, aggregation):
    """Compute per-token attributions for the test split and write them as JSON lines.

    Args:
        model_path: Checkpoint loaded with ``torch.load``; also the prefix of
            the ``.predictions`` cache file.
        saliency_path: Output file, overwritten.
        saliency: Attribution method name: ``'deeplift'``, ``'guided'``,
            ``'sal'``, ``'inputx'`` or ``'occlusion'``.
        aggregation: Passed as ``type`` to ``summarize_attributions``.

    Returns:
        List with one value per batch: the summed ``PAPI_FP_OPS`` readings
        divided by the batch size (empty when ``args.no_time`` is set).
    """
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    model_args = Namespace(**checkpoint['args'])
    # Build the model selected by args.model and restore its weights.
    if args.model == 'lstm':
        model = LSTM_MODEL(tokenizer, model_args,
                           n_labels=checkpoint['args']['labels']).to(device)
        model.load_state_dict(checkpoint['model'])
    elif args.model == 'trans':
        transformer_config = BertConfig.from_pretrained(
            'bert-base-uncased', num_labels=model_args.labels)
        model_cp = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=transformer_config).to(device)
        # NOTE(review): the checkpoint is loaded a second time here.
        checkpoint = torch.load(model_path,
                                map_location=lambda storage, loc: storage)
        model_cp.load_state_dict(checkpoint['model'])
        model = BertModelWrapper(model_cp)
    else:
        model = CNN_MODEL(tokenizer, model_args,
                          n_labels=checkpoint['args']['labels']).to(device)
        model.load_state_dict(checkpoint['model'])
    model.train()
    pad_to_max = False
    # Select the attribution method.
    # NOTE(review): any other `saliency` value leaves `ablator` unbound and
    # fails later with NameError.
    if saliency == 'deeplift':
        ablator = DeepLift(model)
    elif saliency == 'guided':
        ablator = GuidedBackprop(model)
    elif saliency == 'sal':
        ablator = Saliency(model)
    elif saliency == 'inputx':
        ablator = InputXGradient(model)
    elif saliency == 'occlusion':
        ablator = Occlusion(model)
    coll_call = get_collate_fn(dataset=args.dataset, model=args.model)
    return_attention_masks = args.model == 'trans'
    collate_fn = partial(coll_call,
                         tokenizer=tokenizer,
                         device=device,
                         return_attention_masks=return_attention_masks,
                         pad_to_max_length=pad_to_max)
    test = get_dataset(path=args.dataset_dir,
                       mode=args.split,
                       dataset=args.dataset)
    # A batch size given on the command line overrides the checkpoint's.
    batch_size = args.batch_size if args.batch_size != None else \
        model_args.batch_size
    test_dl = DataLoader(batch_size=batch_size,
                         dataset=test,
                         shuffle=False,
                         collate_fn=collate_fn)
    # PREDICTIONS
    # Written once to `<model_path>.predictions`; skipped if the file exists.
    predictions_path = model_path + '.predictions'
    if not os.path.exists(predictions_path):
        predictions = defaultdict(lambda: [])
        for batch in tqdm(test_dl, desc='Running test prediction... '):
            if args.model == 'trans':
                logits = model(batch[0],
                               attention_mask=batch[1],
                               labels=batch[2].long())
            else:
                logits = model(batch[0])
            logits = logits.detach().cpu().numpy().tolist()
            predicted = np.argmax(np.array(logits), axis=-1)
            predictions['class'] += predicted.tolist()
            predictions['logits'] += logits
        with open(predictions_path, 'w') as out:
            json.dump(predictions, out)
    # COMPUTE SALIENCY
    # All methods except occlusion attribute on embeddings, so the embedding
    # layer is wrapped here and restored after the loop.
    if saliency != 'occlusion':
        embedding_layer_name = 'model.bert.embeddings' if args.model == \
                               'trans' else \
            'embedding'
        interpretable_embedding = configure_interpretable_embedding_layer(
            model, embedding_layer_name)
    class_attr_list = defaultdict(lambda: [])
    token_ids = []
    saliency_flops = []
    for batch in tqdm(test_dl, desc='Running Saliency Generation...'):
        # Extra forward arguments depend on the model type.
        if args.model == 'cnn':
            additional = None
        elif args.model == 'trans':
            additional = (batch[1], batch[2])
        else:
            additional = batch[-1]
        token_ids += batch[0].detach().cpu().numpy().tolist()
        if saliency != 'occlusion':
            input_embeddings = interpretable_embedding.indices_to_embeddings(
                batch[0])
        # The counter covers the attribution of every class for this batch;
        # the embedding lookup above is outside the counted region.
        if not args.no_time:
            high.start_counters([
                events.PAPI_FP_OPS,
            ])
        for cls_ in range(checkpoint['args']['labels']):
            if saliency == 'occlusion':
                attributions = ablator.attribute(
                    batch[0],
                    sliding_window_shapes=(args.sw, ),
                    target=cls_,
                    additional_forward_args=additional)
            else:
                attributions = ablator.attribute(
                    input_embeddings,
                    target=cls_,
                    additional_forward_args=additional)
            attributions = summarize_attributions(
                attributions, type=aggregation, model=model,
                tokens=batch[0]).detach().cpu().numpy().tolist()
            class_attr_list[cls_] += [[_li for _li in _l]
                                      for _l in attributions]
        if not args.no_time:
            # Normalise the reading by the number of instances in the batch.
            saliency_flops.append(
                sum(high.stop_counters()) / batch[0].shape[0])
    if saliency != 'occlusion':
        remove_interpretable_embedding_layer(model, interpretable_embedding)
    # SERIALIZE
    # One JSON line per test instance: a list of {token, class -> score}.
    print('Serializing...', flush=True)
    with open(saliency_path, 'w') as out:
        for instance_i, _ in enumerate(test):
            saliencies = []
            for token_i, token_id in enumerate(token_ids[instance_i]):
                token_sal = {'token': tokenizer.ids_to_tokens[token_id]}
                for cls_ in range(checkpoint['args']['labels']):
                    token_sal[int(
                        cls_)] = class_attr_list[cls_][instance_i][token_i]
                saliencies.append(token_sal)
            out.write(json.dumps({'tokens': saliencies}) + '\n')
            out.flush()
    return saliency_flops
# Script fragment: measure PAPI_DP_OPS of one forward pass, then start the
# sweep over training-set sizes.
# NOTE(review): nesting reconstructed from a flattened source, and the sweep
# loop continues beyond this excerpt — confirm against the full file.
if args.model == 'resnet18':
    model = models.resnet18(num_classes=num_classes).double().to(device)
# Save the initial weights so every sweep iteration starts from the same state.
torch.save(model.state_dict(), './model_init.pth')
model.eval()
with torch.no_grad():
    high.start_counters([
        events.PAPI_DP_OPS,
    ])
    x_test_batch = torch.rand(1,
                              3,
                              img_size,
                              img_size,
                              dtype=torch.float64)
    test_logit = model(x_test_batch)
    # Counter reading for the single-image forward pass, scaled by 1e9.
    test_gflops = high.stop_counters()[0] / 1e9
print('test gflops: {}'.format(test_gflops))
all_train_gflops = []
# Training-set sizes per class are powers of two up to 2**po_train_max.
for n_samples_perclass in [2**i for i in range(0, po_train_max + 1)]:
    model.load_state_dict(torch.load('./model_init.pth'))
    # No validation split below 16 samples per class; otherwise
    # int(10% of the per-class count) * num_classes.
    x_val_size = 0 if n_samples_perclass < 16 else int(
        n_samples_perclass * 0.1) * num_classes
    x_train_sub_size = n_samples_perclass * num_classes - x_val_size
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    # Counter started here; the matching stop is outside this excerpt.
    high.start_counters([
        events.PAPI_DP_OPS,
    ])
def generate_saliency(model_path, saliency_path):
    """Write Shapley-value-sampling token saliencies as JSON lines.

    The checkpoint at ``model_path`` is loaded into the model selected by
    ``args.model``. Predictions are cached in ``<model_path>.predictions``
    if that file does not exist yet. For each batch the attributions of
    every class are computed and one JSON line per instance is written,
    skipping padding tokens. Unless ``args.no_time`` is set,
    ``PAPI_FP_OPS`` is counted around the attribution of each batch.

    Args:
        model_path: Path of the checkpoint to load with ``torch.load``.
        saliency_path: Output file, overwritten.

    Returns:
        List with one value per batch: the summed counter readings divided
        by the batch size (empty when ``args.no_time`` is set).
    """
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    model_args = Namespace(**checkpoint['args'])
    # A batch size given on the command line overrides the checkpoint's.
    if args.batch_size is not None:
        model_args.batch_size = args.batch_size

    if args.model == 'transformer':
        transformer_config = BertConfig.from_pretrained(
            'bert-base-uncased', num_labels=model_args.labels)
        modelb = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=transformer_config).to(device)
        modelb.load_state_dict(checkpoint['model'])
        model = BertModelWrapper(modelb)
    elif args.model == 'lstm':
        model = LSTM_MODEL(tokenizer, model_args,
                           n_labels=checkpoint['args']['labels'],
                           device=device).to(device)
        model.load_state_dict(checkpoint['model'])
        model.train()
        model = ModelWrapper(model)
    else:
        # model_args.batch_size = 1000
        model = CNN_MODEL(tokenizer, model_args,
                          n_labels=checkpoint['args']['labels']).to(device)
        model.load_state_dict(checkpoint['model'])
        model.train()
        model = ModelWrapper(model)

    ablator = ShapleyValueSampling(model)

    coll_call = get_collate_fn(dataset=args.dataset, model=args.model)
    collate_fn = partial(coll_call,
                         tokenizer=tokenizer,
                         device=device,
                         return_attention_masks=False,
                         pad_to_max_length=False)
    test = get_dataset(args.dataset_dir, mode=args.split)
    test_dl = DataLoader(batch_size=model_args.batch_size,
                         dataset=test,
                         shuffle=False,
                         collate_fn=collate_fn)

    # PREDICTIONS
    predictions_path = model_path + '.predictions'
    if not os.path.exists(predictions_path):
        predictions = defaultdict(list)
        for batch in tqdm(test_dl, desc='Running test prediction... '):
            logits = model(batch[0])
            logits = logits.detach().cpu().numpy().tolist()
            predicted = np.argmax(np.array(logits), axis=-1)
            predictions['class'] += predicted.tolist()
            predictions['logits'] += logits
        with open(predictions_path, 'w') as out:
            json.dump(predictions, out)

    # COMPUTE SALIENCY
    saliency_flops = []
    with open(saliency_path, 'w') as out_mean:
        for batch in tqdm(test_dl, desc='Running Saliency Generation...'):
            class_attr_list = defaultdict(list)

            # NOTE(review): the model branches above test for 'lstm', not
            # 'rnn', so this condition may never be true for the LSTM model.
            # Left unchanged because it alters what is passed to the model —
            # confirm the intended value.
            if args.model == 'rnn':
                additional = batch[-1]
            else:
                additional = None

            if not args.no_time:
                high.start_counters([events.PAPI_FP_OPS])

            token_ids = batch[0].detach().cpu().numpy().tolist()
            for cls_ in range(args.labels):
                attributions = ablator.attribute(
                    batch[0].float(),
                    target=cls_,
                    additional_forward_args=additional)
                attributions = attributions.detach().cpu().numpy().tolist()
                class_attr_list[cls_] += attributions

            if not args.no_time:
                x = sum(high.stop_counters())
                # Normalise the reading by the number of instances.
                saliency_flops.append(x / batch[0].shape[0])

            # One JSON line per instance; padding tokens are skipped.
            for i, instance_ids in enumerate(token_ids):
                saliencies = []
                for token_i, token_id in enumerate(instance_ids):
                    if token_id == tokenizer.pad_token_id:
                        continue
                    token_sal = {'token': tokenizer.ids_to_tokens[token_id]}
                    for cls_ in range(args.labels):
                        token_sal[int(cls_)] = \
                            class_attr_list[cls_][i][token_i]
                    saliencies.append(token_sal)
                out_mean.write(json.dumps({'tokens': saliencies}) + '\n')
                out_mean.flush()

    return saliency_flops
def papiStopCounters():
    """Stop the running PAPI counters and hand back what the call returns."""
    readings = papi_high.stop_counters()
    return readings
def finish(self):
    """Stop the running PAPI counters; the readings are discarded."""
    papi_high.stop_counters()
    return None