Example #1
    def _adapt(self, info):
        # First, have an adaptive algorithm
        if self.n_initial_parameters == "grid":
            start = len(ParameterGrid(self.parameters))
        else:
            start = self.n_initial_parameters

        def inverse(time):
            """ Decrease target number of models inversely with time """
            return int(start / (1 + time)**self.decay_rate)

        example = toolz.first(info.values())
        time_step = example[-1]["partial_fit_calls"]

        current_time_step = time_step + 1
        next_time_step = current_time_step

        if inverse(current_time_step) == 0:
            # we'll never get out of here
            next_time_step = 1

        while inverse(current_time_step) == inverse(next_time_step) and (
                self.decay_rate and not self.patience
                or next_time_step - current_time_step < self.fits_per_score):
            next_time_step += 1

        target = max(1, inverse(next_time_step))
        best = toolz.topk(target, info, key=lambda k: info[k][-1]["score"])

        if len(best) == 1:
            [best] = best
            return {best: 0}
        steps = next_time_step - current_time_step
        instructions = {b: steps for b in best}
        return instructions
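The selection step above boils down to toolz.topk over per-model score histories. A minimal sketch, assuming a hypothetical info dict shaped like the records _adapt receives:

import toolz

# Hypothetical score histories keyed by model id; each record mirrors the
# {"partial_fit_calls": ..., "score": ...} dicts used by _adapt above.
info = {
    "model-0": [{"partial_fit_calls": 2, "score": 0.61}],
    "model-1": [{"partial_fit_calls": 2, "score": 0.74}],
    "model-2": [{"partial_fit_calls": 2, "score": 0.58}],
}

# Keep the two best-scoring models, as the topk call in _adapt does.
best = toolz.topk(2, info, key=lambda k: info[k][-1]["score"])
print(best)  # ('model-1', 'model-0')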
Example #2
    def _adapt(self, info, first_step_completed=False):
        if all(v[-1]["partial_fit_calls"] == 1 for v in info.values()):
            # Do all the models have one partial fit call?
            self._steps = 0
        if first_step_completed:
            # Sometimes, IncrementalSearchCV completes one step for us. We
            # recurse in this case -- see below for a note on the condition
            self._steps = 1
        n, eta = self.n_initial_parameters, self.aggressiveness
        r = self.n_initial_iter

        n_i = int(math.floor(n * eta**-self._steps))
        r_i = np.round(r * eta**self._steps).astype(int)
        if r_i == 1:
            # if r_i == 1, a step has already been completed for us (because
            # IncrementalSearchCV completes 1 partial_fit call automatically)
            return self._adapt(info, first_step_completed=True)

        best = toolz.topk(n_i, info, key=lambda k: info[k][-1]["score"])
        self._steps += 1

        if len(best) == 0:
            return {id_: 0 for id_ in info}

        pf_calls = {k: info[k][-1]["partial_fit_calls"] for k in best}
        additional_calls = {k: r_i - pf_calls[k] for k in best}
        return additional_calls
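The n_i and r_i formulas above implement a successive-halving schedule: each step keeps roughly 1/eta of the models and gives the survivors eta times more partial_fit calls. A worked sketch with arbitrary values of n, eta, and r chosen so the schedule is exact:

import math

import numpy as np

n, eta, r = 27, 3, 1
for step in range(4):
    n_i = int(math.floor(n * eta**-step))  # models kept at this step
    r_i = int(np.round(r * eta**step))     # partial_fit calls per survivor
    print(step, n_i, r_i)
# 0 27 1
# 1 9 3
# 2 3 9
# 3 1 27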
Example #3
    def tune_tolerance(self):
        """
        Tune tolerance (1 - threshold) for setting as the default API parameter

        (Uses GPU to reduce computation from ~ 1 hour to a matter of seconds)
        """

        if len(self.val_generator) > 1:
            warnings.warn(
                'Less than the entire validation set will be used...')

        images, truth_masks = self.val_generator[0]
        print(
            f'Tuning of the tolerance parameter will occur on {images.shape[0]} images.'
        )

        predictions = self.model.predict(images)

        tolerances = np.linspace(0.02, 1, 50)
        results = self._calculate_f1_scores(tolerances, truth_masks,
                                            predictions)

        self._plot_tolerances(tolerances, results)

        populations = [(f'{tolerance:.2f}', np.median(scores), np.std(scores))
                       for tolerance, scores in zip(tolerances, results)]

        tolerance, f1_median, f1_stdev = min(topk(5, populations, key=second),
                                             key=third)

        print(
            f'Tuned tolerance: {tolerance} w/ median={f1_median:.4f} stdev={f1_stdev:.4f}'
        )
        return float(tolerance)
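The final selection combines topk and min: among the five tolerances with the highest median F1, take the one with the smallest spread. A standalone sketch; toolz provides second, but third is not a toolz function, so it is assumed to be a small local helper and defined here:

from toolz import topk, second

def third(seq):
    # assumed helper; not part of toolz
    return seq[2]

populations = [
    ('0.10', 0.80, 0.05),
    ('0.20', 0.85, 0.09),
    ('0.30', 0.84, 0.03),
    ('0.40', 0.70, 0.02),
    ('0.50', 0.83, 0.04),
    ('0.60', 0.60, 0.01),  # lowest stdev, but excluded by its low median
]

best = min(topk(5, populations, key=second), key=third)
print(best)  # ('0.40', 0.70, 0.02)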
Example #4
    def _additional_calls(self, info):
        # First, have an adaptive algorithm
        if self.n_initial_parameters == "grid":
            start = len(ParameterGrid(self.parameters))
        else:
            start = self.n_initial_parameters

        def inverse(time):
            """ Decrease target number of models inversely with time """
            return int(start / (1 + time)**self.decay_rate)

        example = toolz.first(info.values())
        time_step = example[-1]["partial_fit_calls"]

        current_time_step = time_step + 1
        next_time_step = current_time_step

        if inverse(current_time_step) == 0:
            # we'll never get out of here
            next_time_step = 1

        while inverse(current_time_step) == inverse(next_time_step) and (
                self.decay_rate and not self.patience
                or next_time_step - current_time_step < self.scores_per_fit):
            next_time_step += 1

        target = max(1, inverse(next_time_step))
        best = toolz.topk(target, info, key=lambda k: info[k][-1]["score"])

        if len(best) == 1:
            [best] = best
            return {best: 0}
        steps = next_time_step - current_time_step
        instructions = {b: steps for b in best}

        # Second, stop on plateau if any models have already converged
        out = {}
        for k, steps in instructions.items():
            records = info[k]
            current_calls = records[-1]["partial_fit_calls"]
            if self.max_iter and current_calls >= self.max_iter:
                out[k] = 0
            elif self.patience and current_calls >= self.patience:
                plateau = [
                    h["score"] for h in records
                    if current_calls - h["partial_fit_calls"] <= self.patience
                ]
                if all(score <= plateau[0] + self.tol
                       for score in plateau[1:]):
                    out[k] = 0
                else:
                    out[k] = steps

            else:
                out[k] = steps
        return out
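The plateau branch reads cleanly in isolation: gather the scores from the last patience partial_fit calls and stop the model when none of them beats the start of the window by more than tol. A sketch with illustrative numbers:

patience, tol = 2, 0.001
records = [
    {"partial_fit_calls": 1, "score": 0.70},
    {"partial_fit_calls": 2, "score": 0.80},
    {"partial_fit_calls": 3, "score": 0.8005},
    {"partial_fit_calls": 4, "score": 0.8001},
]
current_calls = records[-1]["partial_fit_calls"]
plateau = [h["score"] for h in records
           if current_calls - h["partial_fit_calls"] <= patience]
# plateau == [0.80, 0.8005, 0.8001]; later scores never beat 0.80 by more
# than tol, so the model would get 0 additional calls.
print(all(score <= plateau[0] + tol for score in plateau[1:]))  # True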
Example #5
    def _additional_calls(self, info):
        if self.n_initial_parameters == "grid":
            start = len(ParameterGrid(self.parameters))
        else:
            start = self.n_initial_parameters

        def inverse(time):
            """ Decrease target number of models inversely with time """
            return int(start / (1 + time)**self.decay_rate)

        example = toolz.first(info.values())
        time_step = example[-1]["partial_fit_calls"]

        current_time_step = time_step + 1
        next_time_step = current_time_step

        if inverse(current_time_step) == 0:
            # we'll never get out of here
            next_time_step = 1

        while inverse(current_time_step) == inverse(next_time_step) and (
                not self.patience
                or next_time_step - current_time_step < self.scores_per_fit):
            next_time_step += 1

        target = max(1, inverse(next_time_step))
        best = toolz.topk(target, info, key=lambda k: info[k][-1]["score"])

        if len(best) == 1:
            [best] = best
            return {best: 0}

        out = {}
        for k in best:
            records = info[k]
            if self.max_iter and len(records) >= self.max_iter:
                out[k] = 0
            elif self.patience and len(records) >= self.patience:
                old = records[-self.patience]["score"]
                if all(d["score"] < old + self.tol
                       for d in records[-self.patience:]):
                    out[k] = 0
                else:
                    out[k] = next_time_step - current_time_step

            else:
                out[k] = next_time_step - current_time_step

        return out
Example #6
File: zmq.py Project: mindw/partd
def keys_to_flush(lengths, fraction=0.1, maxcount=100000):
    """ Which keys to remove

    >>> lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15,
    ...            'e': 10, 'f': 25, 'g': 5}
    >>> keys_to_flush(lengths, 0.5)
    ['f', 'a']
    """
    top = topk(max(len(lengths) // 2, 1), lengths.items(), key=1)
    total = sum(lengths.values())
    cutoff = min(
        maxcount,
        max(1, bisect(list(accumulate(add, pluck(1, top))), total * fraction)))
    result = [k for k, v in top[:cutoff]]
    assert result
    return result
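The cutoff line is dense, so here it is stepped through with the doctest's values: accumulate running sizes over the top keys, then bisect to find how many keys are needed to cover the requested fraction of the total data.

from bisect import bisect
from operator import add

from toolz import accumulate, pluck, topk

lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15, 'e': 10, 'f': 25, 'g': 5}
top = topk(max(len(lengths) // 2, 1), lengths.items(), key=1)
print(top)  # (('f', 25), ('a', 20), ('c', 15))

sizes = list(accumulate(add, pluck(1, top)))
print(sizes)  # [25, 45, 60]

# total is 100, so fraction=0.5 means 50, which bisects to index 2:
# flushing 'f' and 'a' is enough to free half the data.
print(bisect(sizes, sum(lengths.values()) * 0.5))  # 2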
Example #7
def keys_to_flush(lengths, fraction=0.1, maxcount=100000):
    """ Which keys to remove

    >>> lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15,
    ...            'e': 10, 'f': 25, 'g': 5}
    >>> keys_to_flush(lengths, 0.5)
    ['f', 'a']
    """
    top = topk(max(len(lengths) // 2, 1),
               lengths.items(),
               key=1)
    total = sum(lengths.values())
    cutoff = min(maxcount, max(1,
                   bisect(list(accumulate(add, pluck(1, top))),
                          total * fraction)))
    result = [k for k, v in top[:cutoff]]
    assert result
    return result
Example #8
def get_closest_indices(new_sent, feats, k=3):
    """
    Parameters
    ----------
    new_sent : str
        New sentence
    feats : array-like, 2d
        Features for different sentences.
    k : int, optional
        Number of closest indices to return.

    Returns
    -------
    idxs : list of int
        Indices of the ``k`` rows of ``feats`` closest to ``new_sent``.

    """
    model = initialize(download=False)
    feat = model.encode([new_sent], bsize=128, tokenize=False)
    dists = cdist(feat, feats, "cosine")
    assert dists.shape[0] == 1
    dists = dists[0]
    vals = topk(k, -dists)
    idxs = [i for i, v in enumerate(dists) if -v in vals]
    return idxs[::-1]
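The negated-distance trick is a generic k-nearest idiom: toolz.topk only selects maxima, so the k largest values of -dists are the k smallest distances. In isolation, with made-up distances (np.argsort(dists)[:k] would be the more direct NumPy spelling):

import numpy as np
from toolz import topk

dists = np.array([0.9, 0.1, 0.5, 0.3])
k = 3
vals = topk(k, -dists)  # k largest negated distances == k smallest distances
idxs = [i for i, v in enumerate(dists) if -v in vals]
print(idxs[::-1])  # [3, 2, 1]: the k closest indices, highest index first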
Example #9
def topk_dict(d, k=10):
    return dict(toolz.topk(k, d.items(), key=lambda x: x[1]))
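A quick usage check of the helper above, with illustrative values:

print(topk_dict({'a': 3, 'b': 1, 'c': 7, 'd': 5}, k=2))
# {'c': 7, 'd': 5}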
Example #10
    def balance(self):
        with log_errors():
            i = 0
            s = self.scheduler
            occupancy = s.occupancy
            idle = s.idle
            saturated = s.saturated
            if not idle or len(idle) == len(self.scheduler.workers):
                return

            log = list()
            start = time()

            seen = False
            acted = False

            if not s.saturated:
                saturated = topk(10, s.workers, key=occupancy.get)
                saturated = [
                    w for w in saturated if occupancy[w] > 0.2
                    and len(s.processing[w]) > s.ncores[w]
                ]
            elif len(s.saturated) < 20:
                saturated = sorted(saturated, key=occupancy.get, reverse=True)

            if len(idle) < 20:
                idle = sorted(idle, key=occupancy.get)

            for level, cost_multiplier in enumerate(self.cost_multipliers):
                if not idle:
                    break
                for sat in list(saturated):
                    stealable = self.stealable[sat][level]
                    if not stealable or not idle:
                        continue
                    else:
                        seen = True

                    for key in list(stealable):
                        i += 1
                        if not idle:
                            break
                        idl = idle[i % len(idle)]
                        duration = s.processing[sat][key]

                        if (occupancy[idl] + cost_multiplier * duration <=
                                occupancy[sat] - duration / 2):
                            self.move_task(key, sat, idl)
                            log.append((start, level, key, duration, sat,
                                        occupancy[sat], idl, occupancy[idl]))
                            self.scheduler.check_idle_saturated(sat)
                            self.scheduler.check_idle_saturated(idl)
                            seen = True

                if self.cost_multipliers[
                        level] < 20:  # don't steal from public at cost
                    stealable = self.stealable_all[level]
                    if stealable:
                        seen = True
                    for key in list(stealable):
                        if not idle:
                            break

                        sat = s.rprocessing[key]
                        if occupancy[sat] < 0.2:
                            continue
                        if len(s.processing[sat]) <= s.ncores[sat]:
                            continue

                        i += 1
                        idl = idle[i % len(idle)]
                        duration = s.processing[sat][key]

                        if (occupancy[idl] + cost_multiplier * duration <=
                                occupancy[sat] - duration / 2):
                            self.move_task(key, sat, idl)
                            log.append((start, level, key, duration, sat,
                                        occupancy[sat], idl, occupancy[idl]))
                            self.scheduler.check_idle_saturated(sat)
                            self.scheduler.check_idle_saturated(idl)
                            seen = True

                if seen and not acted:
                    break

            if log:
                self.log.append(log)
                self.count += 1
            stop = time()
            if self.scheduler.digests:
                self.scheduler.digests['steal-duration'].add(stop - start)
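The fallback near the top, topk(10, s.workers, key=occupancy.get), is the same dict-keyed pattern as the other examples: rank workers by occupancy (estimated seconds of pending work) and keep the busiest. A sketch with illustrative numbers, dropping the processing/ncores check:

from toolz import topk

occupancy = {'w1': 0.5, 'w2': 0.05, 'w3': 1.2, 'w4': 0.3}
saturated = topk(10, occupancy, key=occupancy.get)  # busiest first
saturated = [w for w in saturated if occupancy[w] > 0.2]
print(saturated)  # ['w3', 'w1', 'w4']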
Example #11
    def run(self):
        print("Evolution has been launched")
        start_run_time = self.current_secs_time()

        population = Population([
            CNNGenome([
                AugmentationGene(),
                SequentialModelGene(),
                OptimizerGene(),
                OutputActivationGene()
            ]) for _ in range(5)
        ])

        # Get timebox segments reversed (descending lengths)
        timeboxes = self.calculate_exp_segments(self.__number_of_evolutions,
                                                self.__max_runtime)[::-1]

        # Get databox segments, rounded to whole row counts
        databoxes = list(
            map(
                round,
                self.calculate_exp_segments(
                    self.__number_of_evolutions,
                    self.__data_context.train_nrows())))
        assert sum(databoxes) == self.__data_context.train_nrows()

        best_individual = None

        evaluator = Evaluator()

        for (evolution_number,
             timebox), databox in zip(enumerate(timeboxes, 1), databoxes):
            generation_number = 1
            start_evolution_time = self.current_secs_time()

            X_train_smpl, Y_train_smpl = self.databox_train_sample(databox)

            while self.current_secs_time() - start_evolution_time < timebox \
                    and self.current_secs_time() - start_run_time < self.__max_runtime:

                # Split randomly into train and the validation set for the fitting
                sample_data_ctx = self.get_shuffled_sample_data_ctx(
                    X_train_smpl, Y_train_smpl)

                # Build individuals from the population's genomes
                individuals = list(
                    map(lambda cnn_ind: cnn_ind.build(self.__seed),
                        population.get_individuals()))
                print("Individuals: " + str(individuals))

                # Evaluate population
                ev_individuals = list(
                    map(
                        lambda cnn_ind: evaluator.evaluate(
                            cnn_ind, sample_data_ctx), individuals))

                # Print evaluation results
                self.__print_ev_individuals(ev_individuals)

                # TODO Select parents for cross-over and mutations. The minus
                # sign makes topk select the smallest validation losses
                parents = toolz.topk(3,
                                     ev_individuals,
                                     key=lambda ev_individual: -ev_individual.
                                     get_fitness().get_valid_loss())
                self.__print_ev_individuals(parents)

                # Crossover or mutate individuals
                mutator = Mutator()
                offspring_genomes = list(
                    map(
                        lambda parent: mutator.mutate(parent.
                                                      get_original_genome()),
                        copy.deepcopy(parents)))

                # Materialize offspring
                offspring = list(
                    map(
                        lambda offspring_genome: offspring_genome.build(
                            self.__seed), offspring_genomes))

                # Evaluate offspring
                ev_offspring = list(
                    map(
                        lambda offspring_ind: evaluator.evaluate(
                            offspring_ind, sample_data_ctx), offspring))

                print("\n Evaluated offspring \n")
                self.__print_ev_individuals(ev_offspring)

                # Combine original population and offspring
                expanded_individuals = ev_individuals + ev_offspring

                # Run survival phase for evaluated individuals
                survived = toolz.topk(len(individuals),
                                      expanded_individuals,
                                      key=lambda ev_individual: -ev_individual.
                                      get_fitness().get_valid_loss())

                # Update best individual
                best_individual = toolz.topk(
                    1,
                    survived,
                    key=lambda ev_individual: -ev_individual.get_fitness(
                    ).get_valid_loss())

                survived_genomes = list(
                    map(
                        lambda survived_ind: survived_ind.get_original_genome(
                        ), survived))

                population = Population(individuals=survived_genomes)
                print("Evolution #{} | generation #{} is finished".format(
                    evolution_number, generation_number))
                generation_number += 1

        # Evaluate best individual on whole data
        total_sample_data_ctx = self.get_shuffled_sample_data_ctx(
            *self.__data_context.get_train())
        best_individual_evaluated = evaluator.evaluate(
            best_individual[0].get_individual(), total_sample_data_ctx)

        return (population, best_individual_evaluated)
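All three topk calls in this example rely on the same negated-key trick: toolz.topk selects maxima, so negating the validation loss selects the k fittest (lowest-loss) individuals. A minimal sketch with illustrative losses:

from toolz import topk

losses = {'ind-1': 0.42, 'ind-2': 0.17, 'ind-3': 0.58, 'ind-4': 0.25}
parents = topk(3, losses, key=lambda ind: -losses[ind])
print(parents)  # ('ind-2', 'ind-4', 'ind-1')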