Exemplo n.º 1
0
def is_local_optimum(parameters, template, sizes, context):
    """Tell whether ``parameters`` is a local optimum of ``template`` for ``sizes``.

    The reference configuration is benchmarked first.  Every combination
    obtained by halving, keeping or doubling each swept parameter is then
    benchmarked and compared against the reference.

    Args:
        parameters: Sequence of tuning parameters; the template is
            instantiated as ``template(*parameters)``.
        template: Template class being tuned.
        sizes: Problem sizes forwarded to ``tools.tree_of``.
        context: Driver context forwarded to ``tools.tree_of``.

    Returns:
        ``False`` when the reference benchmark is infinite or when some
        neighbour runs more than 2% faster than the reference; ``True``
        otherwise (including when the reference takes less than 1e-5).
    """
    tree, _ = tools.tree_of(template, sizes, context)
    genetic_infos = tools.genetic_infos_of(template)

    # Indices of the parameters perturbed during the sweep.
    # NOTE: a template outside these families leaves `sweep_over` unbound and
    # the neighbourhood construction below fails with a NameError.
    if (issubclass(template, sc.templates.elementwise_1d)
            or issubclass(template, sc.templates.reduce_1d)):
        sweep_over = [0, 1, 2]
    elif (issubclass(template, sc.templates.elementwise_2d)
          or issubclass(template, sc.templates.reduce_2d)
          or issubclass(template, sc.templates.gemm)):
        sweep_over = [0, 1, 2, 3, 4]

    # Evaluate the provided parameters guess
    reference = tools.benchmark(template(*parameters), tree)
    if reference == float('inf'):
        return False
    # Latency bound -- ignore
    if reference < 1e-5:
        return True

    # Build the neighbourhood: swept parameters may be halved or doubled as
    # long as they stay within [1, 2**2**nbits]; the others are kept fixed.
    nbits = genetic_infos['nbits']
    domain = []
    for i, x in enumerate(parameters):
        if i in sweep_over:
            upper = 2 ** 2 ** nbits[i]
            # Floor division keeps the candidates integral on any Python version
            domain.append([v for v in (x // 2, x, x * 2) if 1 <= v <= upper])
        else:
            domain.append([x])

    # Determine if local minimum
    reference_point = tuple(parameters)
    for candidate in product(*domain):
        # The reference itself was already timed; benchmarking it again would
        # only let measurement noise trigger a spurious rejection.
        if candidate == reference_point:
            continue
        elapsed = tools.benchmark(template(*candidate), tree)
        if elapsed / reference < .98:
            return False
    return True
Exemplo n.º 2
0
def is_local_optimum(parameters, template, sizes, context):
    """Tell whether ``parameters`` is a local optimum of ``template`` for ``sizes``.

    The reference configuration is benchmarked first.  Every combination
    obtained by halving, keeping or doubling each swept parameter is then
    benchmarked and compared against the reference.

    Args:
        parameters: Sequence of tuning parameters; the template is
            instantiated as ``template(*parameters)``.
        template: Template class being tuned.
        sizes: Problem sizes forwarded to ``tools.tree_of``.
        context: Driver context forwarded to ``tools.tree_of``.

    Returns:
        ``False`` when the reference benchmark is infinite or when some
        neighbour runs more than 2% faster than the reference; ``True``
        otherwise (including when the reference takes less than 1e-5).
    """
    tree, _ = tools.tree_of(template, sizes, context)
    genetic_infos = tools.genetic_infos_of(template)

    # Indices of the parameters perturbed during the sweep.
    # NOTE: a template outside these families leaves `sweep_over` unbound and
    # the neighbourhood construction below fails with a NameError.
    if (issubclass(template, sc.templates.elementwise_1d)
            or issubclass(template, sc.templates.reduce_1d)):
        sweep_over = [0, 1, 2]
    elif (issubclass(template, sc.templates.elementwise_2d)
          or issubclass(template, sc.templates.reduce_2d)
          or issubclass(template, sc.templates.gemm)):
        sweep_over = [0, 1, 2, 3, 4]

    # Evaluate the provided parameters guess
    reference = tools.benchmark(template(*parameters), tree)
    if reference == float('inf'):
        return False
    # Latency bound -- ignore
    if reference < 1e-5:
        return True

    # Build the neighbourhood: swept parameters may be halved or doubled as
    # long as they stay within [1, 2**2**nbits]; the others are kept fixed.
    nbits = genetic_infos['nbits']
    domain = []
    for i, x in enumerate(parameters):
        if i in sweep_over:
            upper = 2 ** 2 ** nbits[i]
            # Floor division keeps the candidates integral on any Python version
            domain.append([v for v in (x // 2, x, x * 2) if 1 <= v <= upper])
        else:
            domain.append([x])

    # Determine if local minimum
    reference_point = tuple(parameters)
    for candidate in product(*domain):
        # The reference itself was already timed; benchmarking it again would
        # only let measurement noise trigger a spurious rejection.
        if candidate == reference_point:
            continue
        elapsed = tools.benchmark(template(*candidate), tree)
        if elapsed / reference < .98:
            return False
    return True
Exemplo n.º 3
0
def exhaustive(template, sizes, context):
    """Benchmark every point of the template's search space and return the fastest.

    Each gene ``i`` ranges over ``range(2**nbits[i])``.  A gene listed in
    ``categorical`` is mapped through ``fetch_types``; any other gene ``x``
    becomes the parameter value ``2**x``.  Progress is reported on stdout.

    Args:
        template: Template being tuned.
        sizes: Problem sizes forwarded to ``tools.tree_of`` and to the metric.
        context: Driver context forwarded to ``tools.tree_of``.

    Returns:
        The parameter tuple with the smallest benchmark time.

    Raises:
        RuntimeError: If no configuration could be benchmarked successfully.
    """
    tree, _ = tools.tree_of(template, sizes, context)
    metric = tools.metric_of(template)
    genetic_infos = tools.genetic_infos_of(template)
    nbits = genetic_infos['nbits']
    categorical = genetic_infos['categorical']

    # The search space has prod(2**nb) == 2**sum(nbits) points; computing the
    # count arithmetically lets the cartesian product be consumed lazily
    # instead of being materialised in memory.
    total = 2 ** sum(nbits)
    best = None
    for idx, r in enumerate(product(*[range(2 ** nb) for nb in nbits])):
        parameters = tuple(fetch_types[x] if i in categorical else 2 ** x
                           for i, x in enumerate(r))
        try:
            elapsed = tools.benchmark(template, parameters, tree)
        except profile_execution_failure:
            # Invalid configuration for this device: skip it
            elapsed = None
        if elapsed is not None and (best is None or elapsed < best[1]):
            best = parameters, elapsed
        if best is not None:
            stdout.write('%.2f %% | Best %.2f [ for %s ]\r'%(float(idx * 100) / total, metric(sizes, best[1]), best[0]))
    if best is None:
        raise RuntimeError('exhaustive search found no valid profile')
    return best[0]
Exemplo n.º 4
0
def is_local_optimum(parameters, template, sizes, context):
    """Tell whether ``parameters`` is a local optimum of ``template`` for ``sizes``.

    The reference configuration is benchmarked first.  Every combination
    obtained by halving, keeping or doubling each swept parameter is then
    benchmarked and compared against the reference.  Configurations that
    fail to execute are ignored.

    Args:
        parameters: Sequence of tuning parameters passed to ``tools.benchmark``.
        template: Template class being tuned.
        sizes: Problem sizes forwarded to ``tools.tree_of``.
        context: Driver context forwarded to ``tools.tree_of``.

    Returns:
        ``False`` when the reference configuration fails to execute or when
        some neighbour runs more than 2% faster than the reference; ``True``
        otherwise (including when the reference takes less than 1e-5).
    """
    tree, _ = tools.tree_of(template, sizes, context)
    genetic_infos = tools.genetic_infos_of(template)

    # Indices of the parameters perturbed during the sweep.
    # NOTE: a template outside these families leaves `sweep_over` unbound and
    # the neighbourhood construction below fails with a NameError.
    if (issubclass(template, sc.templates.elementwise_1d)
            or issubclass(template, sc.templates.reduce_1d)):
        sweep_over = [0, 1, 2]
    elif (issubclass(template, sc.templates.elementwise_2d)
          or issubclass(template, sc.templates.reduce_2d)):
        sweep_over = [0, 1, 2, 3, 4]
    elif issubclass(template, sc.templates.matrix_product):
        sweep_over = [1, 2, 3, 4, 5, 6, 7]

    # Evaluate the provided parameters guess
    try:
        reference = tools.benchmark(template, parameters, tree)
    except profile_execution_failure:
        return False

    # Latency bound -- ignore
    if reference < 1e-5:
        return True

    # Build the neighbourhood: swept parameters may be halved or doubled as
    # long as they stay within [1, 2**2**nbits]; the others are kept fixed.
    nbits = genetic_infos['nbits']
    domain = []
    for i, x in enumerate(parameters):
        if i in sweep_over:
            upper = 2 ** 2 ** nbits[i]
            # Floor division keeps the candidates integral on any Python version
            domain.append([v for v in (x // 2, x, x * 2) if 1 <= v <= upper])
        else:
            domain.append([x])

    reference_point = tuple(parameters)
    for candidate in product(*domain):
        # The reference itself was already timed; benchmarking it again would
        # only let measurement noise trigger a spurious rejection.
        if candidate == reference_point:
            continue
        try:
            elapsed = tools.benchmark(template, candidate, tree)
        except profile_execution_failure:
            continue
        if elapsed / reference < .98:
            return False
    return True
Exemplo n.º 5
0
    def run(self, level='intermediate'):
        """Tune ``self.operation`` over a set of training sizes and export the result.

        For every training size the best already-known profile is looked up;
        when there is none, or when it is not a local optimum, a genetic
        search is run.  Progress is checkpointed as CSV files under
        ``save/<dtype>/<operation>``.  The surviving profiles, plus a
        predictor when more than one profile survives, are finally written
        to a JSON file.

        Args:
            level: One of 'simple', 'intermediate' or 'full'.  It is only
                validated here; the training sizes below do not depend on it.
        """
        assert level in ['simple', 'intermediate', 'full']
        tools.dtype = self.dtype
        device = self.device
        operation = self.operation
        context = sc.driver.context(device)

        if self.logger:
            self.logger.info("----------------")
            self.logger.info(operation.__name__.replace('_', '-').upper())
            self.logger.info(tools.dtype.__name__.upper())
            self.logger.info("----------------")

        # NOTE: an operation matching none of the three families below leaves
        # `sizes` unbound and the tuning loop fails with a NameError.

        #BLAS1 training sizes
        if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
            sizes = [(10**x, ) for x in range(3, 8)]

        #BLAS2 training sizes
        if operation in [
                sc.templates.elementwise_2d, sc.templates.reduce_2d_rows,
                sc.templates.reduce_2d_cols
        ]:
            sizes = []
            #Square
            for N in [896, 1280, 1760, 2560]:
                sizes += [(N, N)]
            #Short/Fat
            for M in [16, 32, 64, 128, 512, 1024]:
                for N in [1024, 4096, 16384, 65536]:
                    sizes += [(M, N)]
            #Tall/Skinny
            for N in [16, 32, 64, 128, 512, 1024]:
                for M in [1024, 4096, 16384, 65536]:
                    sizes += [(M, N)]

        #BLAS3 training sizes
        if operation in [
                sc.templates.gemm_nn, sc.templates.gemm_nt,
                sc.templates.gemm_tn, sc.templates.gemm_tt
        ]:
            sizes = []
            #Square
            for N in [896, 1760, 2048, 2560]:
                sizes += [(N, N, N)]
            #LaPack
            for N in [896, 1760, 2048, 2560]:
                for K in [16, 32, 64, 128]:
                    sizes += [(N, N, K)]
            #Covariance
            for N in [16, 32, 64, 128, 256]:
                for K in [16000, 32000, 64000, 128000]:
                    sizes += [(N, N, K)]
            #DeepSpeech
            for M in [1760, 2048, 2560, 4096]:
                for N in [16, 32, 64, 128, 7000]:
                    sizes += [(M, N, M)]
            for K in [1760, 2048, 2560, 4096]:
                for M, N in [(5124, 9124), (35, 8457)]:
                    sizes += [(M, N, K)]
            for M, K in [(7680, 2560), (3072, 1024)]:
                for N in [16, 32, 64, 128]:
                    sizes += [(M, N, K)]

        #Training data
        performance = tools.metric_of(operation)
        profiles, X, Y = [], [], []

        #Restore progress
        savepath = os.path.join('save', tools.dtype.__name__,
                                operation.__name__)
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        try:
            with open(os.path.join(savepath, 'X.csv')) as f:
                saved_X = [
                    tuple(map(int, row))
                    for row in csv.reader(f, delimiter=',')
                ]
            with open(os.path.join(savepath, 'profiles.csv')) as f:
                # Tuples, so that the `best not in profiles` test below can
                # match the tuples returned by the optimizer
                saved_profiles = [
                    tuple(map(int, row))
                    for row in csv.reader(f, delimiter=',')
                ]
            with open(os.path.join(savepath, 'Y.csv')) as f:
                saved_Y = [
                    list(map(float, row))
                    for row in csv.reader(f, delimiter=',')
                ]
        except (IOError, OSError, ValueError, csv.Error):
            # Missing or unreadable checkpoint: start from scratch
            pass
        else:
            # Commit only a complete checkpoint so that X, Y and profiles
            # can never end up out of sync with each other
            X, Y, profiles = saved_X, saved_Y, saved_profiles

        #Save data
        def save():
            for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'],
                                     [X, Y, profiles]):
                with open(os.path.join(savepath, fname), 'wb') as f:
                    csv.writer(f).writerows(data)

        #Tuning
        for idx, x in enumerate(sizes):
            #Create new line on log
            if idx > 0:
                self.progress_bar.set_finished()
            self.progress_bar.set_prefix(', '.join(map(str, x)))
            #Skip if already saved
            if x in X:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[np.argmax(row)],
                                         max(row))
                continue
            #Best existing profile for x
            tree, operands = tools.tree_of(operation, x, context)
            y = [
                performance(x, tools.benchmark(operation(*p), tree))
                for p in profiles
            ]
            best = profiles[np.argmax(y)] if y else None
            #Retune if necessary
            tune = not (best and optimize.is_local_optimum(
                best, operation, x, context))
            if tune:
                optimizer = optimize.GeneticOptimizer(
                    self.logger,
                    naccept=1000,
                    niter=1000,
                    cxpb=.4,
                    mutpb=.4,
                    popsize=20,
                    progress_bar=self.progress_bar)
                best = optimizer.run(operation, x, context, prior=best)[0]
                if best not in profiles:
                    profiles.append(best)
                    # Back-fill the new profile's performance on every size
                    # that was already explored
                    for xx, yy in zip(X, Y):
                        tree, _ = tools.tree_of(operation, xx, context)
                        time = tools.benchmark(operation(*best), tree)
                        yy.append(performance(xx, time))
            #Update dataset
            X.append(x)
            # `tree` may have been rebound by the back-fill loop above
            tree, operands = tools.tree_of(operation, x, context)
            y = [
                performance(x, tools.benchmark(operation(*prf), tree))
                for prf in profiles
            ]
            Y.append(y)
            #Save data
            save()
            #print performance info in case no tuning was done
            if not tune:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[np.argmax(row)],
                                         max(row))
        self.progress_bar.set_finished()
        save()
        #Adding external profiles
        for prof in tools.external_profiles(operation):
            profiles.append(prof.__class__.__name__)
            for x, y in zip(X, Y):
                tree, operands = tools.tree_of(operation, x, context)
                perf = performance(x, tools.benchmark(prof, tree, operation))
                y.append(perf)
        #Pruning of useless profiles
        X = np.array(X)
        Y = np.array(Y)
        if len(Y) > 0 and len(Y[0]) > 1:
            # A profile is useless when it is the fastest for no training size
            idx = np.where(
                np.bincount(np.argmax(Y, 1), minlength=len(profiles)) == 0)[0]
            profiles = [p for ip, p in enumerate(profiles) if ip not in idx]
            Y = np.delete(Y, idx, axis=1)
        #Exporting to JSON
        if self.json_path:
            json_path = self.json_path
        else:
            json_path = tools.sanitize(device.name) + '.json'
        if os.path.isfile(json_path):
            with open(json_path, 'r') as f:
                json_data = json.load(f)
        else:
            json_data = {}
            json_data["version"] = "1.0"
        operation_name = operation.__name__
        if operation_name not in json_data:
            json_data[operation_name] = {}
        json_data[operation_name][tools.dtype.__name__] = {}
        D = json_data[operation_name][tools.dtype.__name__]
        if len(profiles) > 1:
            clf, nrmse = model.train(X, Y, profiles)
            D['predictor'] = [{
                'children_left':
                e.tree_.children_left.tolist(),
                'children_right':
                e.tree_.children_right.tolist(),
                'threshold':
                e.tree_.threshold.astype('float64').tolist(),
                'feature':
                e.tree_.feature.astype('float64').tolist(),
                'value':
                e.tree_.value[:, :, 0].astype('float64').tolist()
            } for e in clf.estimators_]
        D['profiles'] = [tools.convert(x) for x in profiles]
        with open(json_path, 'w') as f:
            json.dump(json_data, f)
Exemplo n.º 6
0
    def run(self, template, sizes, context, initializer=None, prior=None):
        """Search the template's parameter space with a genetic algorithm.

        Each parameter is a power of two ``2**e``; the exponent ``e`` is
        stored in the genome as ``nbits[i]`` Gray-coded bits.  Benchmark
        times are minimised and memoised per genome.

        Args:
            template: Template class; instantiated as ``template(*parameters)``.
            sizes: Problem sizes forwarded to ``tools.tree_of`` and the metric.
            context: Driver context forwarded to ``tools.tree_of``.
            initializer: Optional iterator yielding lists of exponents (one
                per parameter) used to seed the population.  Defaults to
                uniformly random exponents.
            prior: Optional parameter tuple in decoded form, i.e. as returned
                by this method, used as the first individual.

        Returns:
            A tuple ``(best, x, y)`` where ``best`` is the decoded parameter
            tuple of the best individual found; ``x`` and ``y`` are lists
            that this implementation leaves empty.
        """
        tree, _ = tools.tree_of(template, sizes, context)
        metric = tools.metric_of(template)
        genetic_infos = tools.genetic_infos_of(template)
        nbits = genetic_infos['nbits']
        # offsets[i]:offsets[i+1] is the slice of the genome holding gene i
        offsets = cumsum([0] + nbits)

        def bin2gray(A):
            # Binary digit string -> list of Gray-code bits
            g = [int(A[0])]
            for i in range(1, len(A)):
                g += [int(A[i - 1] != A[i])]
            return g

        def gray2int(A):
            # List of Gray-code bits -> integer
            b = [A[0]]
            for i in range(1, len(A)):
                b += [int(b[i - 1] != A[i])]
            return int(''.join(map(str, b)), 2)

        def encode(genome):
            # List of exponents -> flat list of Gray-coded bits
            encoded = [
                bin2gray(bin(x)[2:].zfill(nb)) for x, nb in zip(genome, nbits)
            ]
            return sum(encoded, [])

        def decode(genome):
            # Flat list of Gray-coded bits -> list of parameters (2**exponent)
            return [
                2**gray2int(genome[off1:off2])
                for off1, off2 in zip(offsets[:-1], offsets[1:])
            ]

        def to_exponents(profile):
            # decode() yields 2**e for each gene while encode() expects e, so
            # a decoded profile must be mapped back to exponent space (and
            # clamped to the gene width) before it can be encoded
            return [
                min(max(int(v).bit_length() - 1, 0), 2**nb - 1)
                for v, nb in zip(profile, nbits)
            ]

        def evaluate(genome):
            idx = tuple(genome)
            if idx not in cache:
                time = tools.benchmark(template(*decode(genome)), tree)
                # Invalid configurations are reported but never cached
                if time == float('inf'):
                    return time,
                cache[idx] = time
            self.progress_bar.update(max(len(cache), it), self.niter,
                                     decode(min(cache, key=cache.get)),
                                     metric(sizes, min(cache.values())))
            return cache[idx],

        cache = {}
        hof = deap_tools.HallOfFame(1)

        creator.create("FitnessMin", base.Fitness, weights=(-1.0, ))
        creator.create("Individual", list, fitness=creator.FitnessMin)

        toolbox = base.Toolbox()
        toolbox.register("evaluate", evaluate)
        toolbox.register("mate", deap_tools.cxTwoPoint)
        toolbox.register("mutate", deap_tools.mutFlipBit)
        toolbox.register("select", deap_tools.selNSGA2)

        x = []
        y = []
        it = 0

        population = []
        #Initialization
        if initializer is None:
            # random.randint is inclusive on both ends: the largest exponent
            # that fits in nb bits is 2**nb - 1
            initializer = ([random.randint(0, 2**nb - 1) for nb in nbits]
                           for _ in iter(int, 1))
        genome = encode(
            to_exponents(prior) if prior else list(next(initializer)))
        while len(population) < self.popsize:
            individual = creator.Individual(genome)
            individual.fitness.values = toolbox.evaluate(genome)
            if max(individual.fitness.values) != float('inf'):
                population += [individual]
            genome = encode(list(next(initializer)))
        hof.update(population)

        #Main iteration
        while len(cache) < self.naccept and it < self.niter:

            #Generate offspring
            offspring = []
            while len(offspring) < self.popsize:
                op_choice = random.random()
                #Cross-over
                if op_choice < self.cxpb:
                    ind1, ind2 = map(toolbox.clone,
                                     random.sample(population, 2))
                    ind, _ = toolbox.mate(ind1, ind2)
                    # The clone still carries its parent's fitness: store the
                    # fresh evaluation before testing it
                    ind.fitness.values = toolbox.evaluate(ind)
                    if max(ind.fitness.values) != float('inf'):
                        offspring += [ind]
                #Mutation
                elif op_choice < self.cxpb + self.mutpb:
                    ind = toolbox.clone(random.choice(population))
                    ind, = toolbox.mutate(ind, 1.0 / offsets[-1])
                    ind.fitness.values = toolbox.evaluate(ind)
                    if max(ind.fitness.values) != float('inf'):
                        offspring += [ind]
                #Reproduction
                else:
                    offspring += [random.choice(population)]

            # Every offspring now carries an up-to-date, finite fitness, so
            # no separate re-evaluation pass is needed before selection

            #Update population
            population[:] = toolbox.select(population + offspring,
                                           self.popsize)
            hof.update(population)

            it += 1
        return tuple(decode(hof[0])), x, y
Exemplo n.º 7
0
    def run(self, level='intermediate'):
        """Tune ``self.operation`` over a set of training sizes and export the result.

        For each training size a random forest fitted on the sizes explored
        so far predicts the most promising known profiles.  The best of them
        is kept when it is a local optimum; otherwise a genetic search is
        run.  Progress is checkpointed as CSV files under
        ``save/<operation>``.  The surviving profiles, plus a predictor when
        more than one profile survives, are finally written to a JSON file.

        Args:
            level: 'simple', 'intermediate' or 'full'; selects how many
                training sizes are explored.
        """
        assert level in ['simple', 'intermediate', 'full']

        device = self.device
        operation = self.operation
        context = sc.driver.context(device)

        if self.logger:
            self.logger.info("----------------")
            self.logger.info(operation.__name__.replace('_', '-').upper())
            self.logger.info("----------------")

        # NOTE: an operation matching none of the three families below leaves
        # `sizes` unbound and the filtering step fails with a NameError.

        #BLAS1 training sizes
        if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
            if level == 'simple':
                sizes = [(10000000, )]
            elif level == 'intermediate':
                sizes = [(x, ) for x in tools.expspace(1e3, 1e8, 10)]
            else:
                sizes = [(x, ) for x in tools.expspace(1e3, 1e8, 100)]

        #BLAS2 training sizes
        if operation in [
                sc.templates.elementwise_2d, sc.templates.reduce_2d_rows,
                sc.templates.reduce_2d_cols
        ]:
            if level == 'simple':
                sizes = [(1536, 1536)]
            elif level == 'intermediate':
                sizes = [(896, 896), (1536, 1536), (256, 256), (1024, 256),
                         (4096, 256), (16384, 256), (256, 1024), (256, 4096),
                         (256, 16384), (3025, 96)]
            else:
                sizes = product(pow2range(4, 17), pow2range(4, 17))

        #BLAS3 training sizes
        if operation in [
                sc.templates.matrix_product_nn, sc.templates.matrix_product_nt,
                sc.templates.matrix_product_tn, sc.templates.matrix_product_tt
        ]:
            if level == 'simple':
                sizes = [(2560, 2560, 2560)]
            elif level == 'intermediate':
                sizes = [  #Square
                    (896, 896, 896),
                    (1536, 1536, 1536),
                    (2176, 2176, 2176),
                    #Rank-32 updates
                    (896, 896, 32),
                    (1536, 1536, 32),
                    (2176, 2176, 32),
                    #Covariance
                    (32, 32, 16000),
                    (64, 64, 64000),
                    (256, 256, 32000),
                    #Convolutions
                    (3025, 64, 363),
                    (729, 192, 1200),
                    (169, 384, 1728),
                    (169, 256, 3456),
                    (169, 128, 2304),
                    (169, 2304, 256),
                    (169, 3456, 256),
                    (169, 1728, 384),
                    (729, 1600, 192),
                    (3025, 363, 64),
                    (2304, 256, 169),
                    (3456, 256, 169),
                    (1728, 384, 169),
                    (1600, 192, 729),
                    (363, 64, 3025)
                ]
            elif level == 'full':
                sizes = product(pow2range(5, 12), pow2range(5, 12),
                                pow2range(5, 17))

        #Remove duplicates and or too small/big tuples
        sizes = [
            x for x in sizes
            if 1e-4 <= tools.memory_footprint(operation, x) <= 2e-1
        ]

        #Training data
        performance = tools.metric_of(operation)
        profiles, X, Y = [], [], []

        #Restore previous run
        savepath = os.path.join('save', operation.__name__)
        if not os.path.exists(savepath):
            os.makedirs(savepath)

        def mmap(x):
            # Fetch types are stored by name in the CSV; everything else is
            # an integer
            if x == 'FETCH_FROM_LOCAL':
                return sc.templates.fetch_type.FETCH_FROM_LOCAL
            if x == 'FETCH_FROM_GLOBAL_CONTIGUOUS':
                return sc.templates.fetch_type.FETCH_FROM_GLOBAL_CONTIGUOUS
            if x == 'FETCH_FROM_GLOBAL_STRIDED':
                return sc.templates.fetch_type.FETCH_FROM_GLOBAL_STRIDED
            return int(x)

        try:
            with open(os.path.join(savepath, 'X.csv')) as f:
                saved_X = [
                    tuple(map(int, row))
                    for row in csv.reader(f, delimiter=',')
                ]

            with open(os.path.join(savepath, 'Y.csv')) as f:
                saved_Y = [
                    list(map(float, row))
                    for row in csv.reader(f, delimiter=',')
                ]

            with open(os.path.join(savepath, 'profiles.csv')) as f:
                # Tuples, so that the `new not in profiles` test below can
                # match the tuples returned by the optimizer
                saved_profiles = [
                    tuple(map(mmap, row))
                    for row in csv.reader(f, delimiter=',')
                ]
        except (IOError, OSError, ValueError, csv.Error):
            # Missing or unreadable checkpoint: start from scratch
            pass
        else:
            # Commit only a complete checkpoint so that X, Y and profiles
            # can never end up out of sync with each other
            X, Y, profiles = saved_X, saved_Y, saved_profiles

        ##### Exploration #####
        for idx, x in enumerate(sizes):
            if idx > 0:
                self.progress_bar.set_finished()

            self.progress_bar.set_prefix(', '.join(map(str, x)))
            #Skip if saved
            if x in X:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[np.argmax(row)],
                                         max(row))
                continue

            #Check if the current best prediction is not a local optimum
            nsamples = len(X)
            nparams = len(profiles)
            # Reset per size so that a prediction made for a previous size
            # can never leak into the training step below
            predperf = None
            tree, operands = tools.tree_of(operation, x, context)
            if nsamples == 0:
                retune = True
                predicted = None
            else:
                if nparams == 1:
                    predicted = profiles[0]
                else:
                    clf = RandomForestRegressor(
                        min(10, nsamples + 1),
                        max_depth=min(10, nsamples + 1)).fit(X, Y)
                    #clf, nrmse = model.train(X, Y, profiles)
                    # predict() expects a 2-D (n_samples, n_features) input
                    predperf = clf.predict([x])[0]
                    best = (-predperf).argsort()[:5]
                    # Keep each measurement paired with its profile index: a
                    # failed candidate must not shift the winner's position
                    measured = []
                    for b in best:
                        try:
                            measured.append((performance(
                                x,
                                tools.benchmark(operation, profiles[b],
                                                tree)), b))
                        except profile_execution_failure:
                            pass
                    if measured:
                        predicted = profiles[max(measured,
                                                 key=lambda m: m[0])[1]]
                    else:
                        # None of the predicted profiles runs on this size
                        predicted = None
                retune = predicted is None or not optimize.is_local_optimum(
                    predicted, operation, x, context)

            #Retune if necessary
            if retune:
                optimizer = optimize.GeneticOptimizer(
                    self.logger,
                    naccept=1000,
                    niter=1000,
                    cxpb=.4,
                    mutpb=.4,
                    popsize=20,
                    progress_bar=self.progress_bar)
                new = optimizer.run(operation, x, context, prior=predicted)[0]
                if new not in profiles:
                    profiles.append(new)
                    if nsamples > 0:
                        # Back-fill the new profile's performance on every
                        # size that was already explored
                        for xx, yy in zip(X, Y):
                            _tree, _operands = tools.tree_of(
                                operation, xx, context)
                            try:
                                time = tools.benchmark(operation, new, _tree)
                                perf = performance(xx, time)
                            except profile_execution_failure:
                                perf = 0
                            yy.append(0 if isinf(perf) else perf)

            ##### Training #####
            y = []
            fastest = max(predperf) if predperf is not None else None
            for ip, p in enumerate(profiles):
                try:
                    # Profiles predicted to be more than 10x slower than the
                    # best prediction are recorded as 0 without being run
                    if (fastest and ip < nparams
                            and predperf[ip] / fastest < .1):
                        perf = 0
                    else:
                        perf = performance(
                            x, tools.benchmark(operation, p, tree))
                except profile_execution_failure:
                    perf = 0
                y.append(0 if isinf(perf) else perf)
            X.append(x)
            Y.append(y)

            #Save data
            for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'],
                                     [X, Y, profiles]):
                with open(os.path.join(savepath, fname), 'wb') as f:
                    csv.writer(f).writerows(data)

            #print performance info in case no tuning was done
            if not retune:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[np.argmax(row)],
                                         max(row))
        self.progress_bar.set_finished()

        #Remove unused profiles
        if len(Y) > 0 and len(Y[0]) > 1:
            # minlength makes the count cover every profile, including
            # trailing ones that are never the fastest
            unused = np.where(
                np.bincount(np.argmax(Y, 1), minlength=len(profiles)) == 0)[0]
            profiles = [p for ip, p in enumerate(profiles) if ip not in unused]
            Y = np.delete(Y, unused, axis=1).tolist()

        ##### Exportation #####
        if self.json_path:
            json_path = self.json_path
        else:
            json_path = tools.sanitize(device.name) + '.json'
        if os.path.isfile(json_path):
            with open(json_path, 'r') as f:
                json_data = json.load(f)
        else:
            json_data = {}
            json_data["version"] = "1.0"
        operation_name = operation.__name__
        if operation_name not in json_data:
            json_data[operation_name] = {}
        json_data[operation_name]['float32'] = {}
        D = json_data[operation_name]['float32']
        if len(profiles) > 1:
            clf, nrmse = model.train(X, Y, profiles)
            D['predictor'] = [{
                'children_left':
                e.tree_.children_left.tolist(),
                'children_right':
                e.tree_.children_right.tolist(),
                'threshold':
                e.tree_.threshold.astype('float64').tolist(),
                'feature':
                e.tree_.feature.astype('float64').tolist(),
                'value':
                e.tree_.value[:, :, 0].astype('float64').tolist()
            } for e in clf.estimators_]
        D['profiles'] = [list(map(int, x)) for x in profiles]
        with open(json_path, 'w') as f:
            json.dump(json_data, f)
Exemplo n.º 8
0
    def run(self, level = 'intermediate'): 
        
        assert level in ['simple', 'intermediate', 'full']
        tools.dtype = self.dtype
        device = self.device
        operation = self.operation
        context = sc.driver.context(device)
        
        if self.logger:
            self.logger.info("----------------")
            self.logger.info(operation.__name__.replace('_','-').upper())
            self.logger.info(tools.dtype.__name__.upper())
            self.logger.info("----------------")

        #BLAS1 training sizes
        if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
            sizes = [(10**x,) for x in range(3,8)]
        
        #BLAS2 training sizes
        if operation in [sc.templates.elementwise_2d, sc.templates.reduce_2d_rows, sc.templates.reduce_2d_cols]:
            sizes = []
            #Square
            for N in [896, 1280, 1760, 2560]:
                sizes += [(N, N)]
            #Short/Fat
            for M in [16, 32, 64, 128, 512, 1024]:
                for N in [1024, 4096, 16384, 65536]:
                    sizes += [(M, N)]
            #Tall/Skinny
            for N in [16, 32, 64, 128, 512, 1024]:
                for M in [1024, 4096, 16384, 65536]:
                    sizes += [(M, N)]
        
        #BLAS3 training sizes
        if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
            sizes = []
            #Square
            for N in [896, 1760, 2048, 2560]:
                sizes += [(N, N, N)]
            #LaPack
            for N in [896, 1760, 2048, 2560]:
			   for K in [16, 32, 64, 128]:
				   sizes += [(N, N, K)]
            #Covariance
            for N in [16, 32, 64, 128, 256]:
			   for K in [16000,32000,64000,128000]:
				   sizes += [(N, N, K)]
            #DeepSpeech
            for M in [1760, 2048, 2560, 4096]:
                for N in [16, 32, 64, 128, 7000]:
                    sizes += [(M, N, M)]
            for K in [1760, 2048, 2560, 4096]:
				for M, N in [(5124,9124),(35,8457)]:
					sizes += [(M, N, K)]
            for M, K in [(7680,2560),(3072,1024)]:
                for N in [16, 32, 64, 128]:
					sizes += [(M, N, K)]

        #Training data
        performance = tools.metric_of(operation)
        profiles, X, Y = [], [], []
        
        #Restore progress
        savepath = os.path.join('save', tools.dtype.__name__, operation.__name__)
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        try:
            with open(os.path.join(savepath, 'X.csv')) as f:
                X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
            with open(os.path.join(savepath, 'profiles.csv')) as f:
                profiles = [map(int,row) for v in row for row in csv.reader(f, delimiter=',')]
            with open(os.path.join(savepath, 'Y.csv')) as f:
                Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
            #Recompute Y
            #Y = []
            #for x in X:
            #    tree, _ = tools.tree_of(operation, x, context)
            #    Y.append([performance(x, tools.benchmark(operation(*best), tree)) for best in profiles])
        except:
            pass
        
      	#Save data
        def save():
            for (fname, data) in zip(['X.csv',  'Y.csv', 'profiles.csv'], [X, Y, profiles]):
                with open(os.path.join(savepath, fname), 'wb') as f:
                    csv.writer(f).writerows(data)
        #Tuning
        for idx, x in enumerate(sizes):
            #Create new line on log
            if idx>0:
             self.progress_bar.set_finished()
            self.progress_bar.set_prefix(', '.join(map(str, x)))
            #Skip if already saved
            if x in X:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
                continue
            #Best existing profile for x
            tree, operands = tools.tree_of(operation, x, context)
            y = [performance(x, tools.benchmark(operation(*p), tree)) for p in profiles]
            best = profiles[np.argmax(y)] if y else None            
            #Retune if necessary
            tune =  not (best and optimize.is_local_optimum(best, operation, x, context))
            if tune:
                optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
                best = optimizer.run(operation, x, context, prior=best)[0]
                if best not in profiles:
                    profiles.append(best)
                    for xx,yy in zip(X, Y):
                        tree, _ = tools.tree_of(operation, xx, context)
                        time = tools.benchmark(operation(*best), tree)
                        yy.append(performance(xx, time))
            #Update dataset
            X.append(x)
            tree, operands = tools.tree_of(operation, x, context)
            y = [performance(x,tools.benchmark(operation(*prf), tree)) for prf in profiles]
            Y.append(y)
            #Save data
            save()
            #print performance info in case no tuning was done
            if not tune:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
        self.progress_bar.set_finished()
        save()     
        #Adding external profiles
        for prof in tools.external_profiles(operation):
            profiles.append(prof.__class__.__name__)
            for x, y in zip(X, Y):
                tree, operands = tools.tree_of(operation, x, context)
                perf = performance(x,tools.benchmark(prof, tree, operation))
                if max(y) <  perf:
                    print x, '\t', prof.__class__.__name__, '\toutperform: \t', int(perf), tools.metric_name_of(operation)
                y.append(perf)
        #Pruning of useless profiles
        X = np.array(X)
        Y = np.array(Y)
        if len(Y[0]) > 1:
            idx = np.where(np.bincount(np.argmax(Y, 1), minlength=len(profiles))==0)[0]
            profiles = [p for ip,p in enumerate(profiles) if ip not in idx]
            Y = np.delete(Y, idx, axis=1) 
        #Exporting to JSON
        json_path = tools.sanitize(device.name) + '.json' if not self.json_path else self.json_path
        if os.path.isfile(json_path):
            json_data = json.load(open(json_path, 'r'))
        else:
            json_data = {}
            json_data["version"] = "1.0"
        operation_name = operation.__name__
        if operation_name not in json_data:
            json_data[operation_name] = {}
        json_data[operation_name][tools.dtype.__name__] = {}
        D = json_data[operation_name][tools.dtype.__name__]
        if len(profiles) > 1:
            clf, nrmse = model.train(X, Y, profiles)
            D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
                                'children_right': e.tree_.children_right.tolist(),
                                'threshold': e.tree_.threshold.astype('float64').tolist(),
                                'feature': e.tree_.feature.astype('float64').tolist(),
                                'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
        D['profiles'] = [tools.convert(x) for x in profiles]
        json.dump(json_data, open(json_path,'w'))
Exemplo n.º 9
0
    def run(self, template, sizes, context, initializer = None, prior = None):
        """Search for the fastest parameters of `template` with a genetic algorithm.

        A candidate is a list of integers, one per template parameter. Each
        integer is Gray-coded on ``nbits[i]`` bits and the bit fields are
        concatenated into a genome; decoding a genome yields ``2**value``
        for every field, and those decoded values are what gets passed to
        ``template(*parameters)`` for benchmarking.

        Arguments:
            template: kernel template class, instantiated as ``template(*parameters)``.
            sizes: problem sizes, forwarded to ``tools.tree_of`` and to the metric.
            context: context forwarded to ``tools.tree_of``.
            initializer: optional iterator yielding candidates in the form
                ``encode`` expects (one integer per parameter). Defaults to
                an endless stream of uniformly random candidates.
            prior: optional candidate used to seed the first individual.
                NOTE(review): `prior` is handed to `encode` unchanged, while
                `decode` returns ``2**value``; confirm whether callers are
                expected to pass exponents or already-decoded parameters.

        Returns:
            A 3-tuple ``(best, x, y)``: ``best`` is the tuple of decoded
            parameters of the best individual found; ``x`` and ``y`` are
            never filled by this method and are always empty lists.
        """
        tree, _ = tools.tree_of(template, sizes, context)
        metric = tools.metric_of(template)
        genetic_infos = tools.genetic_infos_of(template)
        nbits = genetic_infos['nbits']
        # offsets[i]:offsets[i+1] is the slice of the genome holding field i
        offsets = cumsum([0] + nbits)
        infeasible = float('inf')

        def bin2gray(A):
            """Convert a string of binary digits into a list of Gray-code bits."""
            g = [int(A[0])]
            for i in range(1, len(A)):
                g += [int(A[i-1] != A[i])]
            return g

        def gray2int(A):
            """Convert a list of Gray-code bits back into an integer."""
            b = [A[0]]
            for i in range(1, len(A)):
                b += [int(b[i-1] != A[i])]
            return int(''.join(map(str,b)), 2)

        def encode(genome):
            """Gray-code every integer on its own bit width and concatenate."""
            encoded = [bin2gray(bin(x)[2:].zfill(nb)) for x, nb in zip(genome, nbits)]
            return sum(encoded, [])

        def decode(genome):
            """Turn a bit genome into the list of template parameters (2**field)."""
            return [2**gray2int(genome[off1:off2])
                    for off1, off2 in zip(offsets[:-1], offsets[1:])]

        def evaluate(genome):
            """Return the benchmark time of `genome` as a 1-tuple fitness.

            Finite timings are memoized in `cache`; infinite timings are
            returned immediately and never cached.
            """
            idx = tuple(genome)
            if idx not in cache:
                time = tools.benchmark(template(*decode(genome)), tree)
                if time == infeasible:
                    return time,
                cache[idx] = time
            self.progress_bar.update(max(len(cache), it), self.niter, decode(min(cache, key=cache.get)), metric(sizes, min(cache.values())))
            return cache[idx],

        cache = {}
        hof = deap_tools.HallOfFame(1)

        # creator.create registers the class globally inside deap.creator;
        # run() is called repeatedly, so only create the classes once.
        if not hasattr(creator, "FitnessMin"):
            creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        if not hasattr(creator, "Individual"):
            creator.create("Individual", list, fitness=creator.FitnessMin)

        toolbox = base.Toolbox()
        toolbox.register("evaluate", evaluate)
        toolbox.register("mate", deap_tools.cxTwoPoint)
        toolbox.register("mutate", deap_tools.mutFlipBit)
        toolbox.register("select", deap_tools.selNSGA2)

        # Never populated here; returned empty to keep the 3-tuple result shape.
        x = []
        y = []
        # Generation counter; also read by evaluate() for the progress bar.
        it = 0

        population = []
        #Initialization
        if initializer is None:
            def random_candidates():
                while True:
                    # randrange excludes 2**nb, which would need nb+1 bits and
                    # shift every following field of the genome.
                    yield [random.randrange(2**nb) for nb in nbits]
            initializer = random_candidates()
        genome = encode(prior if prior else list(next(initializer)))
        while len(population) < self.popsize:
            individual = creator.Individual(genome)
            individual.fitness.values = toolbox.evaluate(genome)
            # Only feasible candidates enter the population
            if max(individual.fitness.values) != infeasible:
                population += [individual]
            genome = encode(list(next(initializer)))
        hof.update(population)

        #Main iteration
        while len(cache) < self.naccept and it<self.niter:

            #Generate offspring
            offspring = []
            while len(offspring) < self.popsize:
                op_choice = random.random()
                #Cross-over
                if op_choice < self.cxpb:
                    ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
                    ind1, ind2 = toolbox.mate(ind1, ind2)
                    ind = ind1
                    # Store the fresh fitness: the clone still carries its
                    # parent's value, which would defeat the check below.
                    ind.fitness.values = toolbox.evaluate(ind)
                    if max(ind.fitness.values) != infeasible:
                        offspring += [ind]
                #Mutation
                elif op_choice < self.cxpb + self.mutpb:
                    ind = toolbox.clone(random.choice(population))
                    # Flip each bit with probability 1/genome_length
                    ind, = toolbox.mutate(ind, 1.0/offsets[-1])
                    ind.fitness.values = toolbox.evaluate(ind)
                    if max(ind.fitness.values) != infeasible:
                        offspring += [ind]
                #Reproduction
                else:
                    # Already-evaluated individual, fitness is up to date
                    offspring += [random.choice(population)]

            #Update population (every offspring already has a valid fitness)
            population[:] = toolbox.select(population + offspring, self.popsize)
            hof.update(population)

            it += 1
        return tuple(decode(hof[0])), x, y