def time_kernel(self, *args, **kwargs):
    if self.POLYBENCH_TIME or self.POLYBENCH_GFLOPS:
        # Simple time measurement
        self.__timer_start()
        self.kernel(*args, **kwargs)
        self.__timer_stop()
    elif self.POLYBENCH_PAPI:
        # Measuring performance counters is a bit tricky. The API allows
        # monitoring multiple counters at once, but that is not accurate, so we
        # measure each counter independently within a loop to ensure proper
        # operation.
        i = 0
        self.__papi_init()  # Initializes self.__papi_counters and self.__papi_available_counters
        self.__prepare_instruments()
        # Information for the following loop:
        #   * self.__papi_counters holds a list of available counter ids
        #   * self.__papi_counters_result holds the actual counter return values
        for counter in self.__papi_counters:
            if i > 0:
                self.initialize_array(*args, **kwargs)  # force initialization
            i += 1
            papi_high.start_counters([counter])  # requires a list of counters
            self.kernel(*args, **kwargs)
            self.__papi_counters_result.extend(papi_high.stop_counters())  # returns a list of counter results
    else:
        # Default kernel run
        self.__prepare_instruments()
        self.kernel(*args, **kwargs)
        # Something like stop_instruments()
    if self.POLYBENCH_LINUX_FIFO_SCHEDULER:
        self.__linux_standard_scheduler()
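# A minimal, self-contained sketch of the per-counter pattern used above:
# starting one event per run sidesteps the inaccuracy of multiplexing several
# hardware events onto a limited set of counter registers. burn() is a
# hypothetical stand-in for self.kernel().
from pypapi import papi_high
from pypapi import events as papi_events


def burn(n=1_000_000):
    s = 0.0
    for i in range(n):
        s += i * 0.5
    return s


results = []
for event in (papi_events.PAPI_TOT_INS, papi_events.PAPI_TOT_CYC):
    papi_high.start_counters([event])  # one event per run
    burn()
    results.extend(papi_high.stop_counters())
print(results)  # one value per event, each measured in a separate run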
def profile(model, inputs, repeats=1000):
    # Reference for counting flops:
    # http://www.bnikolic.co.uk/blog/python/flops/2019/10/01/pytorch-count-flops.html
    from pypapi import papi_high
    from pypapi import events as papi_events

    # Count single-precision floating-point operations for one forward pass.
    papi_high.start_counters([papi_events.PAPI_SP_OPS])
    model.forward(*inputs)
    flops = papi_high.stop_counters()[0] / 1000000.0

    # Time repeated forward passes.
    from time import perf_counter
    import numpy as np

    times = []
    for _ in range(repeats):
        t = perf_counter()
        model.forward(*inputs)
        times.append(perf_counter() - t)

    params = sum(p.numel() for p in model.parameters()) / 1000000.0
    times = np.array(times) * 1000
    return {
        "params(M)": params,
        "flops(M)": flops,
        "inf_time_mean(ms)": np.mean(times),
        "inf_time_std(ms)": np.std(times),
    }
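# Hypothetical usage of profile() with a small PyTorch model; torch and the
# layer sizes below are illustrative assumptions, not part of the snippet.
import torch

model = torch.nn.Linear(256, 10)
inputs = (torch.randn(32, 256),)
stats = profile(model, inputs, repeats=100)
print(stats)  # {'params(M)': ..., 'flops(M)': ..., 'inf_time_mean(ms)': ..., ...}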
def run(self):
    if "nb_gpus" in self.params:
        if self.params["nb_gpus"] > 0:
            raise Exception("Numpy framework does not work with GPU back-end")

    M, N, K = self.matrix_size
    dtype = np.float32
    a = np.random.random((M, N)).astype(dtype)
    b = np.random.random((N, K)).astype(dtype)
    c = np.random.random((M, K)).astype(dtype)

    nb_epoch = 2
    papi_available = True
    try:
        high.start_counters([events.PAPI_SP_OPS])
    except Exception:  # PAPI not available on this system
        papi_available = False

    time_start = timer()
    for _ in range(nb_epoch):
        c = a @ b  # + c
    time_end = timer()

    if papi_available:
        gflop_papi = high.stop_counters()[0] / (1024 ** 3)
        self.params["GFLOP_papi"] = gflop_papi

    elapsed_time = (time_end - time_start) / nb_epoch
    self.params["time"] = elapsed_time
    self.params["GFLOP/sec"] = self.params["GFLOP"] / elapsed_time
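# A sketch of the analytic count that self.params["GFLOP"] is assumed to hold:
# an (M, N) x (N, K) matmul needs N multiplies and N adds per output element,
# i.e. 2*M*N*K flops in total. The helper name is illustrative.
def matmul_gflop(M, N, K):
    return 2.0 * M * N * K / 1e9

# matmul_gflop(1024, 1024, 1024) -> ~2.15 GFLOP per multiplication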
def restartPAPI(aspect):
    # Stop any running counters before starting a new measurement.
    try:
        papi_high.stop_counters()
    except Exception:
        pass
    if aspect == 'PAPI_L1_DCM':
        papi_high.start_counters([papi_events.PAPI_L1_DCM])
    elif aspect == 'PAPI_TOT_INS':
        papi_high.start_counters([papi_events.PAPI_TOT_INS])
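# A table-free variant of restartPAPI, shown as a sketch: looking the event up
# on the events module with getattr avoids growing the if/elif chain for every
# new aspect string. The function name is illustrative.
def restart_papi_generic(aspect):
    try:
        papi_high.stop_counters()
    except Exception:
        pass  # no counters were running
    papi_high.start_counters([getattr(papi_events, aspect)])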
def main():
    if not right_args(len(sys.argv), sys.argv):
        return 1

    size_A = int(sys.argv[2])
    size_B = int(sys.argv[3])
    tot_FLOPS = 2 * MX * MX * size_A

    if valid_algorithm(sys.argv[1]):
        algorithm = int(sys.argv[1])
    else:
        return 2

    matrix_A = Matrix(size_A, size_A)
    fill(matrix_A)
    matrix_B = Matrix(size_B, size_B)
    fill(matrix_B)

    # Test matrix values
    # print_matrix(matrix_A)
    # print_matrix(matrix_B)

    # Start some counters
    papi_high.start_counters([
        papi_events.PAPI_L1_DCM,
        papi_events.PAPI_L1_ICM,
        papi_events.PAPI_L1_TCM,
        papi_events.PAPI_L2_TCM,
        papi_events.PAPI_L3_TCM,
        papi_events.PAPI_TOT_INS,
    ])

    if algorithm == 1:
        print('Basic Matrix Multiplication')
        start_time = time.time()
        result = basic_multiply(matrix_A, matrix_B)
        end_time = time.time()
        # print_matrix(result)
    elif algorithm == 2:
        print('Line Matrix Multiplication')
        start_time = time.time()
        result = line_multiply(matrix_A, matrix_B)
        end_time = time.time()
        # print_matrix(result)

    # Read values from the counters and reset them
    results = papi_high.read_counters()  # -> one value per started event

    # Print results
    print_times(results, tot_FLOPS, start_time, end_time)

    # Stop counters
    papi_high.stop_counters()  # -> []
def __init__(self, cpu_events: List[str] = None):
    # Check that the environment supports hardware counters before starting.
    logging.info(f"CPU monitor supports {papi_high.num_counters()} counters in "
                 f"{papi_high.num_components()} components")
    if papi_high.num_counters() == 0:
        raise CPUEventsNotSupportedException("No CPU events to measure")

    # Events are defined at https://flozz.github.io/pypapi/events.html
    try:
        self._event_names = ["PAPI_REF_CYC", "PAPI_TOT_INS", "PAPI_L3_TCA",
                             "PAPI_L3_TCM", "PAPI_BR_INS", "PAPI_BR_MSP"]
        cpu_events = [getattr(papi_events, event) for event in self._event_names]
        papi_high.start_counters(cpu_events)
    except (PapiNoEventError, AttributeError) as e:
        raise CPUEventsNotSupportedException(e)
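# A sketch of the matching read-out method for this monitor class (the method
# name is an assumption): read_counters() returns one value per started event
# and resets the counters, so the values can be zipped back to the event names.
def read(self):
    values = papi_high.read_counters()  # resets the running counters
    return dict(zip(self._event_names, values))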
def count_flops(self, dataset, repeat):
    """Use the PyPAPI library to count average flops for model inference.

    Note: it only works if the model is being run on CPU."""
    logger.info("Starting flop counter")
    high.start_counters([events.PAPI_DP_OPS])
    for i, sample in enumerate(dataset):
        for _r in range(repeat):
            self.forward(sample)
        if i % 100 == 0:
            logger.info(f"Counted flops for {i}/{len(dataset)} samples")
    flops = high.stop_counters()
    flops = round(flops[0] / (repeat * len(dataset)))
    return flops
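# Hypothetical usage: stop_counters() returns the PAPI_DP_OPS total over every
# forward pass, so with a 500-sample dataset and repeat=3 the total is divided
# by 1500 to give the average double-precision flops of one inference.
# avg_flops = model.count_flops(dataset=test_set, repeat=3)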
def spoped(*args, **kwargs):
    # Drop the log_spops keyword (if provided) before calling func.
    kkwargs = kwargs.copy()
    kkwargs.pop("log_spops", None)
    # If the event is available, count single-precision ops while running func.
    try:
        high.start_counters([events.PAPI_SP_OPS])
        result = func(*args, **kkwargs)
        spops = high.stop_counters()[0]
        if "log_spops" in kwargs:
            kwargs["log_spops"][func.__name__] = spops
        if show:
            print("{}\tSPOPS:\t\t{}".format(func.__name__, spops))
        return result
    except pypapi.exceptions.PapiNoEventError as e:
        warnings.warn(
            "{} \nYour kernel might not support this event. Function {} is "
            "executed without SPOP counting.".format(e, func.__name__))
        return func(*args, **kkwargs)
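# A sketch of the decorator factory that spoped() above plugs into; the outer
# names count_spops and show are assumptions inferred from the closure
# variables func and show used inside spoped.
import functools


def count_spops(show=False):
    def decorator(func):
        @functools.wraps(func)
        def spoped(*args, **kwargs):
            ...  # body as in spoped above
        return spoped
    return decorator

# Usage:
# @count_spops(show=True)
# def my_kernel(x):
#     return x * x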
flops = []
for saliency_path, output_path, seed in zip(args.saliency_paths,
                                            args.output_paths, args.seeds):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    np.random.seed(seed)

    with open(saliency_path) as out:
        with open(output_path, 'w') as output_sal:
            saliency_flops = []
            for j, line in enumerate(out):
                high.start_counters([events.PAPI_FP_OPS])
                try:
                    instance_saliency = json.loads(line)
                except json.JSONDecodeError:
                    line = next(out)
                    instance_saliency = json.loads(line)
                for i, token in enumerate(instance_saliency['tokens']):
                    if token['token'] == '[PAD]':
                        continue
                    for _c in classes:
                        instance_saliency['tokens'][i][str(_c)] = np.random.rand()
def generate_saliency(model_path, saliency_path):
    test = get_dataset(path=args.dataset_dir, mode=args.split,
                       dataset=args.dataset)
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    model_args = argparse.Namespace(**checkpoint['args'])

    if args.model == 'trans':
        model_args.batch_size = 7
        transformer_config = BertConfig.from_pretrained(
            'bert-base-uncased', num_labels=model_args.labels)
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=transformer_config).to(device)
        model.load_state_dict(checkpoint['model'])
        modelw = BertModelWrapper(model, device, tokenizer, model_args)
    else:
        if args.model == 'lstm':
            model_args.batch_size = 200
            model = LSTM_MODEL(tokenizer, model_args,
                               n_labels=checkpoint['args']['labels'],
                               device=device).to(device)
        else:
            model_args.batch_size = 300
            model = CNN_MODEL(tokenizer, model_args,
                              n_labels=checkpoint['args']['labels']).to(device)
        model.load_state_dict(checkpoint['model'])
        modelw = ModelWrapper(model, device, tokenizer, model_args)

    modelw.eval()

    explainer = LimeTextExplainer()
    saliency_flops = []

    with open(saliency_path, 'w') as out:
        for instance in tqdm(test):
            # SALIENCY
            if not args.no_time:
                high.start_counters([events.PAPI_FP_OPS])

            saliencies = []
            if args.dataset in ['imdb', 'tweet']:
                token_ids = tokenizer.encode(instance[0])
            else:
                token_ids = tokenizer.encode(instance[0], instance[1])
            if len(token_ids) < 6:
                token_ids = token_ids + [tokenizer.pad_token_id] * (6 - len(token_ids))

            try:
                exp = explainer.explain_instance(
                    " ".join([str(i) for i in token_ids]), modelw,
                    num_features=len(token_ids), top_labels=args.labels)
            except Exception as e:
                print(e)
                if not args.no_time:
                    x = high.stop_counters()[0]
                    saliency_flops.append(x)
                for token_id in token_ids:
                    token_id = int(token_id)
                    token_saliency = {'token': tokenizer.ids_to_tokens[token_id]}
                    for cls_ in range(args.labels):
                        token_saliency[int(cls_)] = 0
                    saliencies.append(token_saliency)
                out.write(json.dumps({'tokens': saliencies}) + '\n')
                out.flush()
                continue

            if not args.no_time:
                x = high.stop_counters()[0]
                saliency_flops.append(x)

            # SERIALIZE
            explanation = {}
            for cls_ in range(args.labels):
                cls_expl = {}
                for (w, s) in exp.as_list(label=cls_):
                    cls_expl[int(w)] = s
                explanation[cls_] = cls_expl

            for token_id in token_ids:
                token_id = int(token_id)
                token_saliency = {'token': tokenizer.ids_to_tokens[token_id]}
                for cls_ in range(args.labels):
                    token_saliency[int(cls_)] = explanation[cls_].get(token_id, None)
                saliencies.append(token_saliency)

            out.write(json.dumps({'tokens': saliencies}) + '\n')
            out.flush()

    return saliency_flops
def generate_saliency(model_path, saliency_path, saliency, aggregation):
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    model_args = Namespace(**checkpoint['args'])

    if args.model == 'lstm':
        model = LSTM_MODEL(tokenizer, model_args,
                           n_labels=checkpoint['args']['labels']).to(device)
        model.load_state_dict(checkpoint['model'])
    elif args.model == 'trans':
        transformer_config = BertConfig.from_pretrained(
            'bert-base-uncased', num_labels=model_args.labels)
        model_cp = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=transformer_config).to(device)
        checkpoint = torch.load(model_path,
                                map_location=lambda storage, loc: storage)
        model_cp.load_state_dict(checkpoint['model'])
        model = BertModelWrapper(model_cp)
    else:
        model = CNN_MODEL(tokenizer, model_args,
                          n_labels=checkpoint['args']['labels']).to(device)
        model.load_state_dict(checkpoint['model'])

    model.train()

    pad_to_max = False
    if saliency == 'deeplift':
        ablator = DeepLift(model)
    elif saliency == 'guided':
        ablator = GuidedBackprop(model)
    elif saliency == 'sal':
        ablator = Saliency(model)
    elif saliency == 'inputx':
        ablator = InputXGradient(model)
    elif saliency == 'occlusion':
        ablator = Occlusion(model)

    coll_call = get_collate_fn(dataset=args.dataset, model=args.model)
    return_attention_masks = args.model == 'trans'
    collate_fn = partial(coll_call, tokenizer=tokenizer, device=device,
                         return_attention_masks=return_attention_masks,
                         pad_to_max_length=pad_to_max)
    test = get_dataset(path=args.dataset_dir, mode=args.split,
                       dataset=args.dataset)
    batch_size = args.batch_size if args.batch_size is not None else \
        model_args.batch_size
    test_dl = DataLoader(batch_size=batch_size, dataset=test, shuffle=False,
                         collate_fn=collate_fn)

    # PREDICTIONS
    predictions_path = model_path + '.predictions'
    if not os.path.exists(predictions_path):
        predictions = defaultdict(lambda: [])
        for batch in tqdm(test_dl, desc='Running test prediction... '):
            if args.model == 'trans':
                logits = model(batch[0], attention_mask=batch[1],
                               labels=batch[2].long())
            else:
                logits = model(batch[0])
            logits = logits.detach().cpu().numpy().tolist()
            predicted = np.argmax(np.array(logits), axis=-1)
            predictions['class'] += predicted.tolist()
            predictions['logits'] += logits

        with open(predictions_path, 'w') as out:
            json.dump(predictions, out)

    # COMPUTE SALIENCY
    if saliency != 'occlusion':
        embedding_layer_name = 'model.bert.embeddings' if args.model == 'trans' \
            else 'embedding'
        interpretable_embedding = configure_interpretable_embedding_layer(
            model, embedding_layer_name)

    class_attr_list = defaultdict(lambda: [])
    token_ids = []
    saliency_flops = []

    for batch in tqdm(test_dl, desc='Running Saliency Generation...'):
        if args.model == 'cnn':
            additional = None
        elif args.model == 'trans':
            additional = (batch[1], batch[2])
        else:
            additional = batch[-1]

        token_ids += batch[0].detach().cpu().numpy().tolist()
        if saliency != 'occlusion':
            input_embeddings = interpretable_embedding.indices_to_embeddings(
                batch[0])

        if not args.no_time:
            high.start_counters([events.PAPI_FP_OPS])
        for cls_ in range(checkpoint['args']['labels']):
            if saliency == 'occlusion':
                attributions = ablator.attribute(
                    batch[0], sliding_window_shapes=(args.sw,), target=cls_,
                    additional_forward_args=additional)
            else:
                attributions = ablator.attribute(
                    input_embeddings, target=cls_,
                    additional_forward_args=additional)
            attributions = summarize_attributions(
                attributions, type=aggregation, model=model,
                tokens=batch[0]).detach().cpu().numpy().tolist()
            class_attr_list[cls_] += [[_li for _li in _l] for _l in attributions]
        if not args.no_time:
            saliency_flops.append(sum(high.stop_counters()) / batch[0].shape[0])

    if saliency != 'occlusion':
        remove_interpretable_embedding_layer(model, interpretable_embedding)

    # SERIALIZE
    print('Serializing...', flush=True)
    with open(saliency_path, 'w') as out:
        for instance_i, _ in enumerate(test):
            saliencies = []
            for token_i, token_id in enumerate(token_ids[instance_i]):
                token_sal = {'token': tokenizer.ids_to_tokens[token_id]}
                for cls_ in range(checkpoint['args']['labels']):
                    token_sal[int(cls_)] = class_attr_list[cls_][instance_i][token_i]
                saliencies.append(token_sal)

            out.write(json.dumps({'tokens': saliencies}) + '\n')
            out.flush()

    return saliency_flops
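# The per-batch normalization used above, in isolation: stop_counters() returns
# one total per started event, so summing and dividing by the batch size gives
# average flops per instance. A minimal sketch with a matmul standing in for
# the attribution computation:
from pypapi import papi_high as high
from pypapi import events
import numpy as np

batch = np.random.rand(16, 128)
high.start_counters([events.PAPI_FP_OPS])
_ = batch @ batch.T  # stand-in for ablator.attribute(...)
flops_per_instance = sum(high.stop_counters()) / batch.shape[0]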
def generate_saliency(model_path, saliency_path):
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    model_args = Namespace(**checkpoint['args'])
    model_args.batch_size = args.batch_size if args.batch_size is not None \
        else model_args.batch_size

    if args.model == 'transformer':
        transformer_config = BertConfig.from_pretrained(
            'bert-base-uncased', num_labels=model_args.labels)
        modelb = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=transformer_config).to(device)
        modelb.load_state_dict(checkpoint['model'])
        model = BertModelWrapper(modelb)
    elif args.model == 'lstm':
        model = LSTM_MODEL(tokenizer, model_args,
                           n_labels=checkpoint['args']['labels'],
                           device=device).to(device)
        model.load_state_dict(checkpoint['model'])
        model.train()
        model = ModelWrapper(model)
    else:
        # model_args.batch_size = 1000
        model = CNN_MODEL(tokenizer, model_args,
                          n_labels=checkpoint['args']['labels']).to(device)
        model.load_state_dict(checkpoint['model'])
        model.train()
        model = ModelWrapper(model)

    ablator = ShapleyValueSampling(model)

    coll_call = get_collate_fn(dataset=args.dataset, model=args.model)
    collate_fn = partial(coll_call, tokenizer=tokenizer, device=device,
                         return_attention_masks=False,
                         pad_to_max_length=False)
    test = get_dataset(args.dataset_dir, mode=args.split)
    test_dl = DataLoader(batch_size=model_args.batch_size, dataset=test,
                         shuffle=False, collate_fn=collate_fn)

    # PREDICTIONS
    predictions_path = model_path + '.predictions'
    if not os.path.exists(predictions_path):
        predictions = defaultdict(lambda: [])
        for batch in tqdm(test_dl, desc='Running test prediction... '):
            logits = model(batch[0])
            logits = logits.detach().cpu().numpy().tolist()
            predicted = np.argmax(np.array(logits), axis=-1)
            predictions['class'] += predicted.tolist()
            predictions['logits'] += logits

        with open(predictions_path, 'w') as out:
            json.dump(predictions, out)

    # COMPUTE SALIENCY
    saliency_flops = []
    with open(saliency_path, 'w') as out_mean:
        for batch in tqdm(test_dl, desc='Running Saliency Generation...'):
            class_attr_list = defaultdict(lambda: [])
            if args.model == 'rnn':
                additional = batch[-1]
            else:
                additional = None

            if not args.no_time:
                high.start_counters([events.PAPI_FP_OPS])

            token_ids = batch[0].detach().cpu().numpy().tolist()
            for cls_ in range(args.labels):
                attributions = ablator.attribute(
                    batch[0].float(), target=cls_,
                    additional_forward_args=additional)
                attributions = attributions.detach().cpu().numpy().tolist()
                class_attr_list[cls_] += attributions

            if not args.no_time:
                x = sum(high.stop_counters())
                saliency_flops.append(x / batch[0].shape[0])

            for i in range(len(batch[0])):
                saliencies = []
                for token_i, token_id in enumerate(token_ids[i]):
                    if token_id == tokenizer.pad_token_id:
                        continue
                    token_sal = {'token': tokenizer.ids_to_tokens[token_id]}
                    for cls_ in range(args.labels):
                        token_sal[int(cls_)] = class_attr_list[cls_][i][token_i]
                    saliencies.append(token_sal)

                out_mean.write(json.dumps({'tokens': saliencies}) + '\n')
                out_mean.flush()

    return saliency_flops
def papiStartCounters():
    papi_high.start_counters([
        papi_events.PAPI_L1_DCM,
        papi_events.PAPI_L2_DCM,
    ])
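# Sketches of the matching read/stop helpers for papiStartCounters(); the
# function names are illustrative. read_counters() returns values in the order
# the events were started and resets them; stop_counters() returns the final
# values and stops counting.
def papiReadCounters():
    l1_dcm, l2_dcm = papi_high.read_counters()
    return {"PAPI_L1_DCM": l1_dcm, "PAPI_L2_DCM": l2_dcm}


def papiStopCounters():
    return papi_high.stop_counters()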