Example #1
    def get_data_summary(self) -> pd.DataFrame:
        """Get summary statistics and identified intent for training dataset.

        Returns:
            A DataFrame in which each column holds the summary statistics of
            the corresponding column in the training data, or None if the
            foreshadow object has not been fitted yet

        """
        if not self.has_fitted:
            logging.info("The foreshadow object is not trained yet.")
            return None
        X_summary = self.X_preparer.cache_manager[AcceptedKey.SUMMARY]
        y_summary = self.y_preparer.cache_manager[AcceptedKey.SUMMARY]

        X_summary[y_summary.columns[0]] = y_summary
        return X_summary
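
A minimal pandas-only sketch of the summary merge performed above: the single y summary column is appended to the X summary frame so both appear in one DataFrame. The column names and statistics below are hypothetical.

import pandas as pd

# Hypothetical per-column summaries, shaped like the cached SUMMARY frames.
X_summary = pd.DataFrame({
    "age": {"mean": 35.2, "missing_ratio": 0.0},
    "income": {"mean": 52000.0, "missing_ratio": 0.1},
})
y_summary = pd.DataFrame({"target": {"mean": 0.4, "missing_ratio": 0.0}})

# Same pattern as get_data_summary: attach the y summary as one more column.
X_summary[y_summary.columns[0]] = y_summary.iloc[:, 0]
print(X_summary)
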
Example #2
def execute_model(fs, X_train, y_train, X_test, y_test):
    """Execute the model produced by `generate_model()`.

    Also, exports the data to json and returns the exported json object
    containing the results and the serialized Foreshadow object. Also, prints
    simple model accuracy metrics.

    Args:
        fs (foreshadow.Foreshadow): An unfit foreshadow object.
        X_train (:obj:`DataFrame <pandas.DataFrame>`): The X train data.
        X_test (:obj:`DataFrame <pandas.DataFrame>`): The X test data.
        y_train (:obj:`DataFrame <pandas.DataFrame>`): The y train data.
        y_test (:obj:`DataFrame <pandas.DataFrame>`): The y test data.

    """
    logging.info("Fitting final model...")
    fs.fit(X_train, y_train)

    logging.info("Scoring final model...")
    score = fs.score(X_test, y_test)

    logging.info("Final Results: ")
    logging.info(score)

    # TODO Temporarily turn off the serialization as this requires
    #  additional change and plus we may not need it at all.
    # json_file_location = "foreshadow.json"
    # fs.to_json(json_file_location)

    pickled_fitted_pipeline_location = "foreshadow_fitted_pipeline.p"
    fs.pickle_fitted_pipeline(pickled_fitted_pipeline_location)

    # logging.info(
    #     "Serialized foreshadow pipeline has been saved to {} "
    #     "and {}. Refer to docs to read and process.".format(
    #         json_file_location, pickled_fitted_pipeline_location
    #     )
    # )

    logging.info(
        "Serialized foreshadow pipeline has been saved to {}. Refer to docs "
        "to read and process.".format(pickled_fitted_pipeline_location))
Example #3
    def resolve(self, X, *args, **kwargs):
        """Pick the appropriate transformer if necessary.

        Note:
            Column info sharer is set based on the chosen transformer.

        Args:
            X: input observations
            *args: positional args passed to the parent resolve
            **kwargs: keyword args passed to the parent resolve

        """
        # Override the SmartTransformer resolve method to allow the setting of
        # column info sharer data when resolving.
        super().resolve(X, *args, **kwargs)
        column_name = self.column
        self.column_intent = self.transformer.__class__.__name__
        logging.info("Column {} has intent type: {}".format(
            column_name, self.column_intent))
Example #4
    def _has_column_in_cache_manager(self, column: str) -> bool:
        """Check if the column exists in the cache manager.

        If the foreshadow object has not been trained, it will return False.

        Args:
            column: the column name

        Returns:
            Whether the column exists in the cache manager (False if the
            foreshadow object has not been fitted yet)

        """
        if not self.has_fitted:
            logging.info(
                "The foreshadow object is not trained yet. Please make sure "
                "the column {} exist to ensure the override takes "
                "effect.".format(column))
            return False
        cache_manager = self.X_preparer.cache_manager
        return column in cache_manager[AcceptedKey.INTENT]
Example #5
    def get_intent(self, column_name: str) -> Union[str, None]:
        """Retrieve the intent of a column.

        Args:
            column_name: the column name

        Returns:
            str: the intent of the column, or None if no intent is recorded
                for it

        """
        # Note: this retrieves the intent from the cache_manager. Only columns
        # that have been processed will be visible.
        cache_manager = self.X_preparer.cache_manager
        if self._has_column_in_cache_manager(column_name):
            return cache_manager[AcceptedKey.INTENT][column_name]
        else:
            logging.info("No intent exists for column {}. Either the column "
                         "doesn't exist or foreshadow object has not "
                         "been fitted yet.".format(column_name))
            return None
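
Illustrative usage of the lookup above, assuming fs is a Foreshadow instance that has already been fitted and "age" is one of the training columns (both names are placeholders).

intent = fs.get_intent("age")
if intent is not None:
    print("Column age resolved to intent: {}".format(intent))
else:
    print("Column age was not processed or the model is not fitted yet.")
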
Example #6
    def _check_empty_columns(self, original_columns: List) -> List:
        """Identify columns the cleaner dropped for excessive missing values.

        Args:
            original_columns: the column names before cleaning

        Returns:
            A list of the dropped (empty) column names

        Raises:
            ValueError: if every column would be dropped

        """
        empty_columns = []
        for cleaner_tuple in self.feature_processor.transformers_:
            _, cleaner, column_name = cleaner_tuple
            if isinstance(cleaner.transformer, DropCleaner):
                empty_columns.append(column_name)

        if len(empty_columns) == len(original_columns):
            error_message = (
                "All columns were dropped because they all have over 90% "
                "missing values. Aborting foreshadow.")
            logging.error(error_message)
            raise ValueError(error_message)
        elif len(empty_columns) > 0:
            logging.info(
                "Identified columns with over 90% missing values: {}. "
                "They will be dropped.".format(", ".join(empty_columns)))

        return empty_columns
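
The DropCleaner decision that _check_empty_columns inspects is driven by the share of missing values per column. A standalone pandas sketch of that 90% threshold check (illustrative only, not the library's actual implementation):

import pandas as pd

df = pd.DataFrame({
    "mostly_empty": [None] * 95 + list(range(5)),
    "dense": list(range(100)),
})
missing_ratio = df.isnull().mean()  # fraction of NaN values per column
empty_columns = missing_ratio[missing_ratio > 0.9].index.tolist()
print(empty_columns)  # ['mostly_empty']
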
Example #7
    def transform(self, X):
        """Execute fancyimpute transformer on X data.

        Args:
            X (:obj:`pandas.DataFrame`): Input data

        Returns:
            :obj:`pandas.DataFrame`: Output data

        """
        if X.isnull().values.any():
            """This is a temporary fix since the newer version of
            fancyimpute package has already fixed the issue of throwing
            exception when there's no missing value. However, due to the
            constraint on the requirements, we have to stay with an older
            version and use this workaround until we figure out how to
            upgrade all the associated dependencies.
            """
            return self.imputer.complete(X)
        else:
            logging.info("No missing value found in column {}".format(
                X.columns[0]))
            return X
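
The guard above only matters when the column actually contains missing values. A small standalone sketch of the check, with hypothetical data and mean imputation as a stand-in for what the fancyimpute transformer produces:

import numpy as np
import pandas as pd

col = pd.DataFrame({"amount": [1.0, np.nan, 3.0]})
if col.isnull().values.any():  # same check as in transform above
    print(col.fillna(col.mean()))  # stand-in for self.imputer.complete(col)
else:
    print(col)
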
Example #8
def generate_model(args):  # noqa: C901
    """Process command line args and generate a Foreshadow model to fit.

    Args:
        args (list): A list of string arguments to process

    Returns:
        tuple: A tuple of `fs, X_train, y_train, X_test, y_test` which \
            represents the foreshadow model along with the split data.

    Raises:
        ValueError: if invalid file or invalid y.

    """
    cargs = process_argument(args)

    if cargs.level == 3 and cargs.method is not None:
        warnings.warn(
            "WARNING: Level 3 model search enabled. Method will be ignored.")

    if cargs.level != 3 and cargs.time != 10:
        warnings.warn("WARNING: Time parameter not applicable "
                      "to feature engineering. Must be in level 3.")

    try:
        df = pd.read_csv(cargs.data)
    except Exception:
        raise ValueError(
            "Failed to load file. Please verify it exists and is a valid CSV.")

    try:
        X_df = df.drop(columns=cargs.target)
        y_df = df[[cargs.target]]
    except Exception:
        raise ValueError("Invalid target variable")

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    if cargs.level == 1:
        # Default everything with basic estimator
        fs = Foreshadow(
            problem_type=cargs.problem_type,
            estimator=get_method(cargs.method, y_train, cargs.family,
                                 cargs.problem_type),
        )

    # elif cargs.level == 2:
    #     # Parameter search on all matched intents
    #
    #     if cargs.x_config is not None:
    #         try:
    #             with open(cargs.x_config, "r") as f:
    #                 X_search = Preprocessor(from_json=json.load(f))
    #         except Exception:
    #             raise ValueError(
    #                 "Could not read X config file {}".format(cargs.x_config)
    #             )
    #         print("Reading config for X Preprocessor")
    #     else:
    #         X_search = search_intents(X_train)
    #         print("Searching over valid intent space for X data")
    #
    #     if cargs.y_config is not None:
    #         try:
    #             with open(cargs.y_config, "r") as f:
    #                 y_search = Preprocessor(from_json=json.load(f))
    #         except Exception:
    #             raise ValueError(
    #                 "Could not read y config file {}".format(cargs.y_config)
    #             )
    #         print("Reading config for y Preprocessor")
    #     else:
    #         y_search = search_intents(y_train, y_var=True)
    #         print("Searching over valid intent space for y data")
    #
    #     # If level 3 also do model parameter search with AutoEstimator
    #     # Input time limit into Foreshadow to be passed into AutoEstimator
    #
    #     fs = Foreshadow(
    #         X_preparer=X_search,
    #         y_preparer=y_search,
    #         estimator=get_method(cargs.method, y_train),
    #         optimizer=GridSearchCV,
    #     )
    #
    elif cargs.level == 3:
        # Default intent and advanced model search using 3rd party AutoML

        estimator = AutoEstimator(problem_type=cargs.problem_type, auto="tpot")
        estimator.construct_estimator(y_train)

        # TODO move this into the configure_estimator method. "max_time_mins"
        #  is an argument for the TPOT library, so we cannot assign it based
        #  on the problem type here. For testing purposes, it is hardcoded
        #  for TPOT.
        # kwargs = (
        #     "max_time_mins"
        #     if estimator.problem_type == ProblemType.REGRESSION
        #     else "time_left_for_this_task"
        # )
        kwargs = "max_time_mins"
        estimator.estimator_kwargs = {
            kwargs: cargs.time,
            **estimator.estimator_kwargs,
        }

        fs = Foreshadow(problem_type=cargs.problem_type, estimator=estimator)

    else:
        raise ValueError("Invalid Level. Only levels 1 and 3 supported.")

    if cargs.multiprocess:
        fs.configure_multiprocessing(-1)
        logging.info("multiprocessing enabled.")

    return fs, X_train, y_train, X_test, y_test
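
One detail worth noting in the level 3 branch above: because estimator.estimator_kwargs is unpacked after the time-limit entry, any value already stored under that key wins. A tiny standalone illustration (values are made up):

existing_kwargs = {"max_time_mins": 60, "n_jobs": 1}
merged = {"max_time_mins": 10, **existing_kwargs}
print(merged)  # {'max_time_mins': 60, 'n_jobs': 1} -- the pre-existing limit is kept

If the command-line time limit were meant to take precedence, the unpacking order would need to be reversed.
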
Example #9
    def transform(self, X, y=None):
        """Clean string columns.

        Here, we assume that a list output means the values are intended to
        become new columns in our dataset. This contract could change so that
        an explicit boolean flag indicates when new columns are desired, but
        as of right now there should be no need to return a list for any case
        other than this one.

        The same is assumed for dicts, where each key is a new column name
        and the value is that row's value in that column. NaN is automatically
        placed in columns that don't exist for a given row.

        Args:
            X (:obj:`pandas.Series`): X data
            y: input labels

        Returns:
            :obj:`pandas.DataFrame`: Transformed data

        Raises:
            InvalidDataFrame: If unexpected output returned that was not
                handled correctly. This happens if the output specified by the
                child does not match what is actually returned. The child
                should ensure its implementation is consistent.

        """
        X = check_df(X, single_column=True)
        logging.info("Starting cleaning rows...")
        out = X[X.columns[0]].apply(self.transform_row, return_tuple=False)
        logging.info("Ending cleaning rows...")
        # access single column as series and apply the list of
        # transformations to each row in the series.
        if any(
            [
                isinstance(out.iloc[i], (list, tuple))
                for i in range(out.shape[0])
            ]
        ):  # out are lists == new columns
            if not all(
                [
                    len(out.iloc[0]) == len(out.iloc[i])
                    for i in range(out.shape[0])
                ]
            ):
                raise InvalidDataFrame(
                    "Lists returned per row do not all have the same "
                    "length: {}".format(
                        [out.iloc[i] for i in range(out.shape[0])]
                    )
                )
            columns = self.output_columns
            if columns is None:
                # by default, pandas would have given a unique integer to
                # each column, instead, we keep the previous column name and
                # add that integer.
                columns = [
                    X.columns[0] + str(c) for c in range(len(out.iloc[0]))
                ]
            # We need to set the index. Otherwise, the new data frame might
            # misalign with other columns.
            X = pd.DataFrame([*out.values], index=out.index, columns=columns)
        elif any(
            [isinstance(out.iloc[i], (dict)) for i in range(out.shape[0])]
        ):  # out are dicts ==  named new columns
            all_keys = dict()
            for row in out:
                all_keys.update({key: True for key in row})  # get all columns
            columns = list(all_keys.keys())
            out = pd.DataFrame([*out.values], columns=columns)
            out.columns = [X.columns[0] + "_" + c for c in columns]
            X = out
            # by default, this will create a DataFrame where if a row
            # contains the value, it will be added, if not NaN is added.
        else:  # no lists, still 1 column output
            X[X.columns[0]] = out
        return X
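
A pandas-only sketch of the column-expansion contract described in the docstring: list outputs become positionally named new columns, while dict outputs become named new columns (with NaN filled in for keys missing from a given row). The date-splitting lambdas below are hypothetical stand-ins for transform_row.

import pandas as pd

raw = pd.Series(["2020-01", "2021-07"], name="date")

# List output -> new columns named after the original column plus an index.
as_lists = raw.apply(lambda v: v.split("-"))
expanded = pd.DataFrame([*as_lists.values], index=as_lists.index,
                        columns=["date0", "date1"])
print(expanded)

# Dict output -> new columns named from the dict keys.
as_dicts = raw.apply(lambda v: {"year": v[:4], "month": v[5:]})
named = pd.DataFrame([*as_dicts.values])
named.columns = ["date_" + c for c in named.columns]
print(named)
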
Example #10
    def _export_data(self, X, is_train=True):
        """Export the processed data frame to CSV and log the location."""
        data_path = self._determine_export_path(is_train)
        X.to_csv(data_path, index=False)
        logging.info("Exported processed data to {}".format(data_path))