#!/usr/bin/env python
import sys

import numpy as np  # assumed: the measured workload sums a NumPy array
import pylikwid

# Assumed setup: derive the CPU list from the topology module (as the tuner
# code below does) and count retired instructions while summing an array.
pylikwid.inittopology()
topo = pylikwid.getcputopology()
cpus = list(range(topo["activeHWThreads"]))
pylikwid.finalizetopology()

eventset = "INSTR_RETIRED_ANY:FIXC0"
arr = np.random.rand(1000000)

err = pylikwid.init(cpus)
if err != 0:  # perfmon init returns 0 on success
    print("Cannot initialize LIKWID")
    sys.exit(1)

group = pylikwid.addeventset(eventset)
if group >= 0:
    print("Eventset {} added with ID {}".format(eventset, group))
else:
    print("Failed to add eventset {}".format(eventset))
    sys.exit(1)

err = pylikwid.setup(group)
if err < 0:
    print("Setup of group {} failed".format(group))
    sys.exit(1)

err = pylikwid.start()
s = arr.sum()
print(s)
err = pylikwid.stop()
if err < 0:
    print("Stop of group {} failed".format(group))
    sys.exit(1)

for thread in range(len(cpus)):
    for i in range(pylikwid.getnumberofevents(group)):
        print("Result Event {} : {}".format(
            pylikwid.getnameofevent(group, i),
            pylikwid.getresult(group, i, thread)))

pylikwid.finalize()
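# The block above stops the counters before reading them. For long-running
# work the counters can instead be sampled while the group keeps running:
# pylikwid.read() refreshes the counts and pylikwid.getlastresult() returns
# the delta since the previous read (this is the pattern the metering loop
# below relies on). A minimal sketch; the CPU list, event index 0, and the
# one-second cadence are illustrative assumptions.
import time

import pylikwid

cpus = [0, 1]
pylikwid.init(cpus)
group = pylikwid.addeventset("INSTR_RETIRED_ANY:FIXC0")
pylikwid.setup(group)
pylikwid.start()
for _ in range(5):
    time.sleep(1.0)  # let the workload run for one interval
    pylikwid.read()  # refresh counters without stopping the group
    for thread in range(len(cpus)):
        # delta since the previous read(), not the running total
        print(pylikwid.getlastresult(group, 0, thread))
pylikwid.stop()
pylikwid.finalize()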
#!/usr/bin/env python
import pylikwid

liste = []
cpus = [0, 1]

pylikwid.init(cpus)
group = pylikwid.addeventset("INSTR_RETIRED_ANY:FIXC0")
pylikwid.setup(group)
pylikwid.start()
for i in range(0, 1000000):
    liste.append(i)
pylikwid.stop()
for thread in range(0, len(cpus)):
    print("Result CPU %d : %f" % (cpus[thread],
                                  pylikwid.getresult(group, 0, thread)))
pylikwid.finalize()
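# pylikwid also wraps the LIKWID Marker API, for scripts launched under
# likwid-perfctr (e.g. `likwid-perfctr -C 0 -g <group> -m python script.py`)
# rather than programming the counters themselves. A sketch of the same
# list-append measurement using a named marker region:
import pylikwid

pylikwid.markerinit()
pylikwid.markerthreadinit()
liste = []
pylikwid.markerstartregion("listappend")
for i in range(0, 1000000):
    liste.append(i)
pylikwid.markerstopregion("listappend")
# markergetregion returns the event count, the per-event values,
# the accumulated region time, and the number of region calls
nr_events, eventlist, region_time, count = pylikwid.markergetregion("listappend")
for i, e in enumerate(eventlist):
    print(i, e)
pylikwid.markerclose()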
def run(self):
    self._logtofile(
        "t [s]\tMFLOP/s\tMEM bandwidth [MB/s]\tOperational Intensity\t" +
        "\t".join(
            "Power {0} [W]\tDRAM Pow. {0} [W]\tUncore freq. {0} [MHz]".format(i)
            for i in self.sockets))
    for i in range(0, pylikwid.getnumberofevents(self._gid)):
        logging.info("event {}: {}".format(
            i, pylikwid.getnameofevent(self._gid, i)))
    logging.info("Start metering!")
    t = 0
    pylikwid.start()
    while True:
        if self.stop.wait(timeout=self.sleep_dt):
            pylikwid.stop()
            logging.info("Stopped metering")
            pylikwid.finalize()
            if self._outfile:
                self._outfile.close()
            break
        pylikwid.read()
        tprime = pylikwid.gettimeofgroup(self._gid)
        d = {}
        dt = tprime - t
        t = tprime
        d["t"] = t
        d["dt"] = dt

        # Read the FP PMCs on all cores and accumulate FLOPs:
        # PMC0 counts 128-bit packed ops (2 FLOPs each), PMC1 scalar ops,
        # PMC2 256-bit packed ops (4 FLOPs each).
        flop = 0
        for core in self.cores:
            cidx = self.measured_cores.index(core)
            PMC0 = pylikwid.getlastresult(self._gid, 0, cidx)
            PMC1 = pylikwid.getlastresult(self._gid, 1, cidx)
            PMC2 = pylikwid.getlastresult(self._gid, 2, cidx)
            flop += PMC0 * 2.0 + PMC1 + PMC2 * 4.0
        d["flop/s"] = flop / dt

        # Read the uncore counters once per socket
        datavolume = 0
        for socket in self.sockets:
            sidx = self.measured_cores.index(socket)
            # Events 3..18 are the CAS_COUNT_RD/WR pairs of MBOX0..MBOX7;
            # each CAS event transfers one 64-byte cache line.
            cas = sum(pylikwid.getlastresult(self._gid, ev, sidx)
                      for ev in range(3, 19))
            datavolume += cas * 64.0
            # RAPL energy (events 19/20) divided by time gives average power
            PWR0 = pylikwid.getlastresult(self._gid, 19, sidx)
            d["pkg-power_" + str(socket)] = PWR0 / d["dt"]
            PWR3 = pylikwid.getlastresult(self._gid, 20, sidx)
            d["dram-power_" + str(socket)] = PWR3 / d["dt"]
            # Uncore clock (event 21) is read on the first measured thread
            UNCORE_CLOCK = pylikwid.getlastresult(self._gid, 21, 0)
            d["uncore-freq_" + str(socket)] = UNCORE_CLOCK / d["dt"]

        d["memory-bandwidth"] = datavolume / dt
        if datavolume != 0:
            d["operational-intensity"] = flop / datavolume
        else:
            d["operational-intensity"] = float('inf')

        if self._regulator:
            self._regulator.regulate(d)

        self._logtofile("\t".join(
            str(i) for i in [
                d["t"],
                1.0E-06 * d["flop/s"],
                1.0E-06 * d["memory-bandwidth"],
                d["operational-intensity"],
                "\t".join(
                    str(d["pkg-power_" + str(s)]) + "\t" +
                    str(d["dram-power_" + str(s)]) + "\t" +
                    str(1.0E-06 * d["uncore-freq_" + str(s)])
                    for s in self.sockets)
            ]))
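# The derived metrics above follow the usual LIKWID formulas, assuming PMC0
# counts 128-bit packed double ops (2 FLOPs each), PMC1 scalar ops, PMC2
# 256-bit packed ops (4 FLOPs each), and that every CAS_COUNT_RD/WR event
# moves one 64-byte cache line. A small helper making that arithmetic
# explicit (the event-to-counter mapping is this script's assumption, not a
# LIKWID guarantee):

CACHE_LINE_BYTES = 64.0

def derived_metrics(pmc0, pmc1, pmc2, cas_counts, dt):
    """Return (FLOP/s, bytes/s, FLOP/byte) for one measurement interval.

    pmc0/pmc1/pmc2: raw FP event counts as described above;
    cas_counts: iterable of CAS_COUNT_RD/WR counts from all memory channels;
    dt: interval length in seconds.
    """
    flop = pmc0 * 2.0 + pmc1 + pmc2 * 4.0
    datavolume = sum(cas_counts) * CACHE_LINE_BYTES
    intensity = flop / datavolume if datavolume else float("inf")
    return flop / dt, datavolume / dt, intensity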
def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(),
         si_prefix='G', likwid_event=None, save_features=False, random=False):
    """Begin tuning

    Parameters
    ----------
    n_trial: int
        Maximum number of configs to try (measure on real hardware)
    measure_option: dict
        The options for how to measure generated code.
        You should use the return value of autotvm.measure_option for this argument.
    early_stopping: int, optional
        Early stop the tuning when not finding better configs in this number of trials
    callbacks: List of callable
        A list of callback functions. The signature of callback function is
        (Tuner, List of MeasureInput, List of MeasureResult)
        with no return value. These callback functions will be called on
        every measurement pair. See autotvm/tuner/callback.py for some examples.
    si_prefix: str
        One of tvm.autotvm.util.SI_PREFIXES. The SI prefix to use when reporting FLOPS.
    likwid_event: str, optional
        A LIKWID eventset string. If given, every measured config is rebuilt
        and re-run with LIKWID counters active, and the per-thread counts are
        stored in the cost model's saved features.
    save_features: bool, optional
        If True (and no likwid_event is given), store the measurement result
        of every config in the cost model's saved features.
    random: bool, optional
        If True, draw the next batch of configs randomly instead of from the
        tuner's model.
    """
    measure_batch = create_measure_batch(self.task, measure_option)
    n_parallel = getattr(measure_batch, 'n_parallel', 1)
    early_stopping = early_stopping or 1e9
    self.n_trial = n_trial
    self.early_stopping = early_stopping

    start_time = time.time()

    # Validate si_prefix arg
    format_si_prefix(0, si_prefix)

    old_level = logger.level

    GLOBAL_SCOPE.in_tuning = True
    i = error_ct = 0

    if likwid_event is not None:
        # Get the conv2d shapes from the task arguments; the per-config
        # input/weight/output arrays are created inside the loop below.
        N, CI, H, W = self.task.args[0][1]
        CO, _, KH, KW = self.task.args[1][1]
        padding = self.task.args[3]
        ctx = tvm.context(str(self.task.target), 0)

    while i < n_trial:
        if not self.has_next():
            break

        if random:
            configs = self.random_next_batch(min(n_parallel, n_trial - i))
        else:
            configs = self.next_batch(min(n_parallel, n_trial - i))

        inputs = [MeasureInput(self.task.target, self.task, config)
                  for config in configs]
        results = measure_batch(inputs)

        # keep best config
        for k, (inp, res) in enumerate(zip(inputs, results)):
            config = inp.config
            if res.error_no == 0:
                flops = inp.task.flop / np.mean(res.costs)
                error_ct = 0
            else:
                flops = 0
                error_ct += 1

            if flops > self.best_flops:
                self.best_flops = flops
                self.best_config = config
                self.best_measure_pair = (inp, res)
                self.best_iter = i + k

            logger.debug("No: %d\t%sFLOPS: %.2f/%.2f\tresult: %s\t%s",
                         i + k + 1, si_prefix,
                         format_si_prefix(flops, si_prefix),
                         format_si_prefix(self.best_flops, si_prefix),
                         res, config)

        i += len(results)
        self.ttl = min(early_stopping + self.best_iter, n_trial) - i

        if random:
            self.update_random(inputs, results)
        else:
            self.update(inputs, results)

        if likwid_event is not None:
            # Measure every active hardware thread
            pylikwid.inittopology()
            cpu_topo = pylikwid.getcputopology()
            cpus = list(range(cpu_topo['activeHWThreads']))
            pylikwid.finalizetopology()
            err = pylikwid.init(cpus)
            group = pylikwid.addeventset(likwid_event)
            err = pylikwid.setup(group)
            for k, (inp, res) in enumerate(zip(inputs, results)):
                with inp.target:
                    sch, args = self.task.instantiate(inp.config)
                    func = tvm.build(sch, args,
                                     target_host=inp.task.target_host)
                evaluator = func.time_evaluator(func.entry_name, ctx,
                                                repeat=3, number=4)
                dshape = (N, CI // inp.config['tile_ic'].size[-1], H, W,
                          inp.config['tile_ic'].size[-1])
                kshape = (CO // inp.config['tile_oc'].size[-1],
                          CI // inp.config['tile_ic'].size[-1], KH, KW,
                          inp.config['tile_ic'].size[-1],
                          inp.config['tile_oc'].size[-1])
                oshape = (N, CO // inp.config['tile_oc'].size[-1],
                          H + KH - 2 * padding - 1,
                          W + KW - 2 * padding - 1,
                          inp.config['tile_oc'].size[-1])
                a_tvm = tvm.nd.array(
                    np.random.uniform(size=dshape).astype(np.float32), ctx)
                w_tvm = tvm.nd.array(
                    np.random.uniform(size=kshape).astype(np.float32), ctx)
                c_tvm = tvm.nd.array(np.zeros(oshape, dtype=np.float32), ctx)
                # Note: warming up the function (running it ~10 times before
                # measuring) was tried here; the LIKWID counts it produced
                # were barely better than random when training a RandomForest
                # model on the post-tuning data, so no warm-up is done.
                # LIKWID PERFCTR
                err = pylikwid.start()
                if tuple(args[1].shape) == w_tvm.shape:
                    evaluator(c_tvm, w_tvm, a_tvm)
                else:
                    evaluator(c_tvm, a_tvm, w_tvm)
                err = pylikwid.stop()
                likwid_results = []
                for thread in range(0, len(cpus)):
                    likwid_results.append({})
                    for event_num in range(pylikwid.getnumberofevents(group)):
                        key = pylikwid.getnameofevent(group, event_num)
                        if key in likwid_results[-1]:
                            likwid_results[-1][key] += pylikwid.getresult(
                                group, event_num, thread)
                        else:
                            likwid_results[-1][key] = pylikwid.getresult(
                                group, event_num, thread)
                # END LIKWID PERFCTR
                if inp.config.index in self.cost_model.saved_features:
                    self.cost_model.saved_features[
                        inp.config.index].set_result(res)
                    self.cost_model.saved_features[
                        inp.config.index].set_counters(likwid_results)
                else:
                    self.cost_model.saved_features[
                        inp.config.index] = SavedFeature(
                            result=res, counters=likwid_results)
            pylikwid.finalize()
        elif save_features:
            for k, (inp, res) in enumerate(zip(inputs, results)):
                if inp.config.index in self.cost_model.saved_features:
                    self.cost_model.saved_features[
                        inp.config.index].set_result(res)
                else:
                    self.cost_model.saved_features[
                        inp.config.index] = SavedFeature(result=res)

        if len(self.cost_model.saved_features['scores']) > 0:
            self.cost_model.saved_features['scores'][-1].append(
                time.time() - start_time)

        for callback in callbacks:
            callback(self, inputs, results)

        if i >= self.best_iter + early_stopping:
            logger.debug("Early stopped. Best iter: %d.", self.best_iter)
            break

        if error_ct > 150:
            logging.basicConfig()
            logger.warning("Too many errors happen in the tuning. "
                           "Now is in debug mode")
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(old_level)

    GLOBAL_SCOPE.in_tuning = False
    del measure_batch
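# A hypothetical call site for the modified tune() above; the tuner class
# name, the task object, and the eventset string are illustrative
# placeholders, not part of the code above.
from tvm import autotvm

task = ...  # an autotvm conv2d task created elsewhere

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=4, repeat=3))

tuner = LikwidXGBTuner(task)  # hypothetical tuner subclass defining tune() above
tuner.tune(n_trial=100,
           measure_option=measure_option,
           likwid_event="INSTR_RETIRED_ANY:FIXC0",
           save_features=True)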