Example No. 1
    def update(self) -> bool:
        new_lines = _find_new_lines(self._filename, start_from=self._lines_read)
        if len(new_lines) > 0:
            self._lines_read += len(new_lines)
            print(f"read {len(new_lines)} new lines")
            events_by_type = _lines_to_dict(new_lines)
            # Determine where the previous evaluations left off, so the new
            # rows continue the numbering and reuse the original search start.
            if len(self.evaluations) == 0:
                search_start = None
            else:
                search_start = self.evaluations.start.min()
            start_n = self.evaluations.n.max()
            if math.isnan(start_n):
                start_n = -1

            new_evaluations = _evaluations_to_dataframe(
                events_by_type[TOKENS.EVALUATION_RESULT],
                metric_names=self.metrics,
                search_start=search_start,
                start_n=start_n + 1,
            )
            self.evaluations = pd.concat([self.evaluations, new_evaluations])
            for metric in self.metrics:
                self.evaluations[f"{metric}_cummax"] = self.evaluations[metric].cummax()
            new_individuals = {
                id_: Individual.from_string(pipeline, pset)
                for id_, pipeline in zip(new_evaluations.id, new_evaluations.pipeline)
            }
            self.individuals.update(new_individuals)
        return len(new_lines) > 0
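
A usage sketch: `update()` returns True only when new log lines were parsed, so it can drive a simple polling loop. The report class, constructor, and log path below are assumptions based on the attributes this method uses (see also Example No. 12).

import time

report = GamaReport(logfile="gama.log")   # hypothetical construction
idle_polls = 0
while idle_polls < 5:                     # stop after ~10 s without new lines
    if report.update():
        idle_polls = 0
        print(len(report.evaluations), "evaluations parsed so far")
    else:
        idle_polls += 1
    time.sleep(2)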
Example No. 2
def pipeline_to_children(pipeline, automl):
    ''' Convert a pipeline string from a GAMA log file into its individual
            scikit-learn components.

    Parameters:
    -----------
    pipeline: str
        A pipeline string in GAMA's format.
    automl: GamaClassifier or GamaRegressor
        The GamaClassifier or GamaRegressor whose primitive set is used to
        parse the pipeline.

    Returns:
    --------
    scikit-learn predictor, up to three scikit-learn preprocessors (Optional)
        The predictor and any preprocessors of the GAMA individual; absent
        preprocessor slots are filled with np.nan.
    '''
    ind = Individual.from_string(pipeline, automl._pset)
    # inds[0] is the predictor; any remaining entries are preprocessing steps.
    inds = [p.str_nonrecursive for p in ind.primitives]
    if len(inds) == 1:
        return inds[0], np.nan, np.nan, np.nan
    elif len(inds) == 2:
        return inds[0], inds[1], np.nan, np.nan
    elif len(inds) == 3:
        return inds[0], inds[2], inds[1], np.nan
    else:
        return inds[0], inds[3], inds[2], inds[1]
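
A usage sketch: the function always returns a 4-tuple whose entries are component strings or np.nan padding, so it can be unpacked directly. `pipeline_str` and `automl` below are hypothetical placeholders for a logged pipeline string and a GamaClassifier/GamaRegressor.

predictor, prep_1, prep_2, prep_3 = pipeline_to_children(pipeline_str, automl)
preprocessors = [p for p in (prep_1, prep_2, prep_3) if isinstance(p, str)]
print(predictor, preprocessors)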
Example No. 3
def evaluate_individual(
    individual: Individual,
    evaluate_pipeline: Callable,
    timeout: float = 1e6,
    deadline: Optional[float] = None,
    add_length_to_score: bool = True,
    **kwargs,
) -> Evaluation:
    """ Evaluate the pipeline specified by individual, and record

    Parameters
    ----------
    individual: Individual
        Blueprint for the pipeline to evaluate.
    evaluate_pipeline: Callable
        Function which takes the pipeline and produces validation predictions,
        scores, estimators and errors.
    timeout: float (default=1e6)
        Maximum time in seconds that the evaluation is allowed to take.
        The limit is approximate; do not rely on high precision.
        A shorter timeout is imposed if `deadline` is less than `timeout` seconds away.
    deadline: float, optional
        A time in seconds since epoch.
        Cut off evaluation at `deadline` even if `timeout` seconds have not yet elapsed.
    add_length_to_score: bool (default=True)
        Add the length of the individual to the score result of the evaluation.
    **kwargs: Dict, optional (default=None)
        Passed to `evaluate_pipeline` function.

    Returns
    -------
    Evaluation

    """
    result = Evaluation(individual, pid=os.getpid())
    result.start_time = datetime.now()

    if deadline is not None:
        time_to_deadline = deadline - time.time()
        timeout = min(timeout, time_to_deadline)

    with Stopwatch() as wall_time, Stopwatch(
            time.process_time) as process_time:
        evaluation = evaluate_pipeline(individual.pipeline,
                                       timeout=timeout,
                                       **kwargs)
        result._predictions, result.score, result._estimators, result.error = evaluation
    result.duration = wall_time.elapsed_time

    if add_length_to_score:
        result.score = result.score + (-len(individual.primitives), )
    individual.fitness = Fitness(
        result.score,
        result.start_time,
        wall_time.elapsed_time,
        process_time.elapsed_time,
    )

    return result
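
The `add_length_to_score` branch appends the negative number of primitives to the score tuple, so lexicographic tuple comparison breaks ties in favour of shorter pipelines. A small self-contained illustration of that comparison:

score_short = (0.93,) + (-2,)    # same metric value, 2 primitives
score_long = (0.93,) + (-5,)     # same metric value, 5 primitives
assert score_short > score_long  # the shorter pipeline wins the tie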
Example No. 4
def InvalidLinearSVC(pset):
    # 'Invalid' because scikit-learn does not support penalty='l1' together
    # with loss='squared_hinge' and dual=True.
    individual_str = """LinearSVC(data,
            LinearSVC.C=0.001,
            LinearSVC.dual=True,
            LinearSVC.loss='squared_hinge',
            LinearSVC.penalty='l1',
            LinearSVC.tol=1e-05)"""
    individual_str = "".join(individual_str.split()).replace(",", ", ")
    return Individual.from_string(individual_str, pset, compile_individual)
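
A standalone check of why this hyperparameter combination is invalid, assuming scikit-learn is installed (the dataset and its size are illustrative only):

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=50, random_state=0)
try:
    LinearSVC(C=0.001, dual=True, loss="squared_hinge",
              penalty="l1", tol=1e-05).fit(X, y)
except ValueError as exc:
    print(exc)  # scikit-learn rejects penalty='l1' with this loss and dual=True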
Example No. 5
def LinearSVC(pset):
    individual_str = """LinearSVC(data,
            LinearSVC.C=0.001,
            LinearSVC.dual=True,
            LinearSVC.loss='squared_hinge',
            LinearSVC.penalty='l2',
            LinearSVC.tol=1e-05)"""
    individual_str = ''.join(individual_str.split()).replace(',', ', ')
    return Individual.from_string(individual_str, pset, None)
Example No. 6
def crossover_terminals(
        individual1: Individual,
        individual2: Individual) -> Tuple[Individual, Individual]:
    """ Crossover two individuals in-place by exchanging two Terminals with shared output type but different values.

    Parameters
    ----------
    individual1: Individual
        The individual to crossover with individual2.
    individual2: Individual
        The individual to crossover with individual1.
    """
    candidates = list(
        _shared_terminals(individual1,
                          individual2,
                          with_indices=True,
                          value_match='different'))
    i, ind1_term, j, ind2_term = random.choice(candidates)
    individual1.replace_terminal(i, ind2_term)
    individual2.replace_terminal(j, ind1_term)
    return individual1, individual2
Example No. 7
def crossover_terminals(ind1: Individual,
                        ind2: Individual) -> Tuple[Individual, Individual]:
    """ Crossover two individuals in-place by exchanging two Terminals.

    Terminals must share output type but have different values.

    Parameters
    ----------
    ind1: Individual
        The individual to crossover with ind2.
    ind2: Individual
        The individual to crossover with ind1.
    """
    options = _shared_terminals(ind1,
                                ind2,
                                with_indices=True,
                                value_match="different")
    i, ind1_term, j, ind2_term = random.choice(list(options))
    ind1.replace_terminal(i, ind2_term)
    ind2.replace_terminal(j, ind1_term)
    return ind1, ind2
Example No. 8
    def update(self, force: bool = False) -> bool:
        if not force and not self.incomplete:
            return False

        with open(os.path.join(self._log_directory, "evaluations.log"),
                  "r") as fh:
            header = fh.readline()[:-1]
            self._last_tell = max(self._last_tell, fh.tell())
            fh.seek(self._last_tell)
            try:
                df = pd.read_csv(fh, sep=";", header=None, index_col=False)
            except pd.errors.EmptyDataError:
                return False
            self._last_tell = fh.tell()

            df.columns = header.split(";")
            df["n"] = df.index
            df = df.rename(
                columns=dict(t_start="start", t_wallclock="duration"))

            def tuple_to_metrics(tuple_str):
                return pd.Series(
                    [float(value) for value in tuple_str[1:-1].split(",")])

            df[self.metrics] = df.score.apply(tuple_to_metrics)
            df.start = pd.to_datetime(df.start)  # parse timestamps for relative_end below
            df.duration = pd.to_timedelta(df.duration, unit="s")

            new_individuals = {
                id_: Individual.from_string(pipeline, pset)
                for id_, pipeline in zip(df.id, df.pipeline)
            }

            # Merge with previous records
            self.individuals.update(new_individuals)
            if self.evaluations.empty:
                self.evaluations = df
            else:
                df["n"] += self.evaluations.n.max() + 1
                self.evaluations = pd.concat([self.evaluations, df])
            df = self.evaluations

            search_start = df.start.min()
            for metric in self.metrics:
                df[f"{metric}_cummax"] = df[metric].cummax()
            if len(df.start) > 0:
                df["relative_end"] = ((df.start + df.duration) -
                                      search_start).dt.total_seconds()
            else:
                df["relative_end"] = pd.Series()
        return True
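
The `tell()`/`seek()` bookkeeping above is what lets repeated calls parse only the rows appended since the previous call. A self-contained sketch of the same pattern on a plain text file (file name and contents are illustrative only):

last_tell = 0

def read_new_lines(path):
    global last_tell
    with open(path, "r") as fh:
        fh.seek(last_tell)          # jump past everything already processed
        new_lines = fh.readlines()
        last_tell = fh.tell()       # remember where to resume next time
    return new_lines

with open("demo.log", "w") as fh:
    fh.write("first\n")
print(read_new_lines("demo.log"))   # ['first\n']
with open("demo.log", "a") as fh:
    fh.write("second\n")
print(read_new_lines("demo.log"))   # ['second\n']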
Example No. 9
def ForestPipeline(pset):
    individual_str = """RandomForestClassifier(
            FeatureAgglomeration(
                    data,
                    FeatureAgglomeration.affinity='l2',
                    FeatureAgglomeration.linkage='complete'
                    ),
            RandomForestClassifier.bootstrap=True,
            RandomForestClassifier.criterion='gini',
            RandomForestClassifier.max_features=0.6,
            RandomForestClassifier.min_samples_leaf=7,
            RandomForestClassifier.min_samples_split=6,
            RandomForestClassifier.n_estimators=100)"""
    individual_str = "".join(individual_str.split()).replace(",", ", ")

    return Individual.from_string(individual_str, pset, None)
Example No. 10
def _test_mutation(individual: Individual, mutation, mutation_check, pset):
    """ Test if an individual mutated by `mutation` passes `mutation_check` and compiles.

    :param individual: The individual to be mutated.
    :param mutation: function: ind -> (ind,). Should mutate the individual in-place.
    :param mutation_check: function: (ind1, ind2) -> (bool, str).
       A function to check if ind2 could have been created by `mutation(ind1)`, see above functions.
    :param pset: The primitive set used for the mutation and for compiling the result.
    """
    ind_clone = individual.copy_as_new()
    mutation(ind_clone, pset)

    applied, message = mutation_check(individual, ind_clone)
    assert applied, message

    # Should be able to compile the individual, will raise an Exception if not.
    compile_individual(ind_clone, pset)
Example No. 11
def BernoulliNBThreeScalers(pset):
    return Individual.from_string(
        "BernoulliNB(StandardScaler(RobustScaler(StandardScaler(data))), alpha=0.1, fit_prior=True)",
        pset, compile_individual)
Example No. 12
    def __init__(
        self,
        logfile: Optional[str] = None,
        log_lines: Optional[List[str]] = None,
        name: Optional[str] = None,
    ):
        """ Parse the logfile or log lines provided.

        Parameters
        ----------
        logfile: str, optional (default=None)
            Path to the log file. If not specified, log_lines must be provided.
        log_lines: List[str], optional (default=None)
            A list with each element one line from the log file.
            If not specified, logfile must be provided.
        name: str, optional (default=None)
            Name of the report.
            If set to None, defaults to `logfile` if it is not None else 'nameless'.
        """
        if not ((logfile is None) ^ (log_lines is None)):
            raise ValueError("Must provide exactly one of 'logfile' or 'loglines'.")

        if log_lines is None:
            log_lines = _find_new_lines(cast(str, logfile))

        self._lines_read = len(log_lines)
        self._individuals = None
        self.name = (
            name
            if name is not None
            else (logfile if logfile is not None else "nameless")
        )

        events_by_type = _lines_to_dict(log_lines)

        if len(events_by_type[TOKENS.INIT]) == 0:
            raise ValueError("The log must contain at least contain an INIT string.")

        config = _find_metric_configuration(events_by_type[TOKENS.INIT])
        self.metrics, self.search_method, self.postprocessing, self._filename = config

        self.phases: List[Tuple[str, str, datetime, float]] = _find_phase_information(
            events_by_type
        )
        search_start = self.phases[1][2] if len(self.phases) > 1 else None
        self.evaluations: pd.DataFrame = _evaluations_to_dataframe(
            events_by_type[TOKENS.EVALUATION_RESULT],
            metric_names=self.metrics,
            search_start=search_start,
        )

        # This can take a while for long logs (e.g. ~1sec for 10k individuals)
        self.individuals: Dict[str, Individual] = {
            id_: Individual.from_string(pipeline, pset)
            for id_, pipeline in zip(self.evaluations.id, self.evaluations.pipeline)
        }

        # For search methods without extra log data, the parser defaults to a
        # no-op that returns None.
        parse_method_data: Dict[str, Callable[..., pd.DataFrame]] = defaultdict(
            lambda: lambda *args: None,
            AsynchronousSuccessiveHalving=_ASHA_data_to_dataframe,
        )
        # search_method is formatted like NAME(kwargs)
        # where kwargs could contain additional parentheses.
        method_name, _ = self.search_method.split("(", maxsplit=1)
        method_token = METHOD_TOKENS.get(method_name)
        self.method_data = parse_method_data[method_name](
            events_by_type[method_token], self.metrics
        )

        self.incomplete = len(self.phases) < 3
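
The `^` check at the top enforces that exactly one of `logfile` and `log_lines` is given. A minimal standalone illustration of the same exclusive-or pattern:

def exactly_one_provided(a, b) -> bool:
    # XOR of the None-checks: True only when exactly one argument is not None.
    return (a is None) ^ (b is None)

assert exactly_one_provided("gama.log", None)
assert exactly_one_provided(None, ["INIT;..."])
assert not exactly_one_provided(None, None)
assert not exactly_one_provided("gama.log", ["INIT;..."])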
Example No. 13
def SS_RBS_SS_BNB(pset):
    return Individual.from_string(
        "BernoulliNB(StandardScaler(RobustScaler(StandardScaler(data))), alpha=0.1, fit_prior=True)",  # noqa: E501
        pset,
        compile_individual,
    )
Example No. 14
def SS_BNB(pset):
    return Individual.from_string(
        "BernoulliNB(StandardScaler(data), alpha=0.1, fit_prior=True)",
        pset,
        compile_individual,
    )
Example No. 15
def RS_MNB(pset):
    return Individual.from_string(
        "MultinomialNB(RobustScaler(data), alpha=1.0, fit_prior=True)",
        pset,
        compile_individual,
    )
Example No. 16
def GNB(pset):
    return Individual.from_string("GaussianNB(data)", pset, compile_individual)
Example No. 17
def execute_recommendations(X, y, cat_ind, recommendations, task, n_jobs=1):
    ''' Execute the recommendations made by the nearest neighbor model for a
            given learning task, setting the number of jobs for the estimators
            and preprocessing algorithms to n_jobs.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target column.
    y: pd.Series
        Contains the series of the target of a given dataset.
    cat_ind: list
        Contains boolean values indicating whether each column is categorical.
    recommendations: list
        Contains the list with the recommendations made by the nearest neighbor model.
    task: str
        Contains the learning task (i.e. "classification" or "regression").
    n_jobs: int
        The number of jobs to set for the estimators and preprocessing
        algorithms in the recommended pipelines.

    Returns:
    --------
    list
        Contains scores of each pipeline run on X and y.
    '''
    categorical, numeric, string = category_numeric_or_string(X, cat_ind)

    if task.lower() == "classification":
        gama = GamaClassifier(scoring='accuracy')
    elif task.lower() == "regression":
        gama = GamaRegressor(scoring='r2')
    else:
        return "{} is not implemented, please try 'classification' or 'regression'".format(
            task)

    scores = []

    for recommendation in recommendations:
        pipeline, k, did = recommendation
        ind = Individual.from_string(pipeline, gama._pset)

        X_pipe = deepcopy(X)
        y_pipe = deepcopy(y)

        X_pipe, y_pipe = onehot_or_targ(X_pipe, y_pipe, categorical, k)

        pipeline = [eval(p.str_nonrecursive) for p in ind.primitives]
        pipeline.reverse()

        try:
            for i, component in enumerate(pipeline):
                if i == len(pipeline) - 1:
                    # Final component: the estimator. Set n_jobs if supported.
                    try:
                        setattr(component, 'n_jobs', n_jobs)
                    except AttributeError:
                        pass

                    X_train, X_test, y_train, y_test = train_test_split(
                        X_pipe, y_pipe, test_size=0.30, random_state=42)
                    cv_scores = cross_val_score(component,
                                                X_pipe,
                                                y_pipe,
                                                cv=10)
                    score = sum(cv_scores) / 10
                    #component.fit(X_train, y_train)
                    #score = component.score(X_test, y_test)
                    scores.append(score)
                else:
                    # Preprocessing component: feature selectors need the target.
                    if isinstance(component, (SelectPercentile, SelectFwe)):
                        X_pipe = component.fit_transform(X_pipe, y_pipe)
                    else:
                        X_pipe = component.fit_transform(X_pipe)
        except Exception:
            scores.append(0)

    return scores
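
The inner loop above applies every non-final component with fit_transform and cross-validates only the final estimator. A self-contained sketch of the same pattern with plain scikit-learn components (the dataset and component choices are illustrative only):

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
# Components in application order: preprocessors first, estimator last.
components = [SelectPercentile(percentile=80), StandardScaler(), GaussianNB()]
for i, component in enumerate(components):
    if i == len(components) - 1:
        print(cross_val_score(component, X, y, cv=10).mean())
    elif isinstance(component, SelectPercentile):
        X = component.fit_transform(X, y)   # feature selection needs the target
    else:
        X = component.fit_transform(X)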