Пример #1
0
    def __init__(self, problem=None, database="featurehub"):
        """Create the ORMManager and connect to DB.

        If problem name is given, load it.

        Parameters
        ----------
        problem : str, optional (default=None)
            Name of problem
        database : str, optional (default="featurehub")
            Name of database within DBMS.
        """

        self.__orm = ORMManager(database)

        if not database_exists(self.__orm.engine.url):
            print("database {} does not seem to exist.".format(database))
            print("You might want to create it by calling set_up method")
        elif  problem:
            try:
                with self.__orm.session_scope() as session:
                    problem = session.query(Problem)\
                                     .filter(Problem.name == problem).one()
                    self.__problemid = problem.id
            except NoResultFound:
                print("WARNING: Problem {} does not exist!".format(problem))
                print("You might want to create it by calling create_problem"
                      " method")
Пример #2
0
def log_evaluation_attempt(user):
    """Log user evaluation of feature.

    Extracts 'database', 'problem_id', and 'code' from POST body.
    """

    try:
        try:
            database = request.form["database"]
            problem_id = request.form["problem_id"]
            code = request.form["code"]
        except Exception:
            app.logger.exception("Couldn't read parameters from form.")
        app.logger.debug("Read parameters from form.")

        try:
            user_name = user["name"]
            orm = ORMManager(database, admin=True)
            with orm.session_scope() as session:
                user_obj = session.query(User).filter(
                    User.name == user_name).one()
                problem_obj = session.query(Problem).filter(
                    Problem.id == problem_id).one()
                evaluation_attempt_obj = EvaluationAttempt(user=user_obj,
                                                           problem=problem_obj,
                                                           code=code)
                session.add(evaluation_attempt_obj)
        except Exception:
            app.logger.exception("Couldn't insert evaluation attempt into "
                                 "database.")
        app.logger.debug("Inserted evaluation attempt into database.")
    finally:
        return Response()
Пример #3
0
    def __init__(self, database="featurehub"):
        """Create the ORMManager and connect to DB.

        Parameters
        ----------
        database : str, optional (default="featurehub")
            Name of database within DBMS.
        """

        self.__orm = ORMManager(database)

        if not database_exists(self.__orm.engine.url):
            print("Database {} does not exist.".format(database),
                  file=sys.stderr)
            print("You might want to create it by calling set_up method",
                  file=sys.stderr)
Пример #4
0
def create_user(user):
    """Create User object in the database.
    """

    user_name = user["name"]
    try:
        database = request.form["database"]
        orm = ORMManager(database, admin=True)
        with orm.session_scope() as session:
            session.add(User(name=user_name))
    except Exception:
        app.logger.exception(
            "Couldn't create new user (name '{}') in db".format(user_name))
        return Response(status=500)

    app.logger.debug("Created new user (name '{}') in db".format(user_name))
    return Response(status=201)
Пример #5
0
    def __init__(self, problem, database="featurehub"):
        self.__database = database
        self.__orm = ORMManager(database)
        self.__username = None

        with self.__orm.session_scope() as session:
            try:
                problem = session.query(Problem)\
                                 .filter(Problem.name == problem)\
                                 .one()
                self.__problem_id = problem.id
            except NoResultFound:
                raise ValueError("Invalid problem name: {}".format(problem))
            except MultipleResultsFound:
                raise ValueError("Unexpected issue talking to database. " +
                                 TRY_AGAIN_LATER)

        # "log in" to the system
        self._login()

        # initialize evaluation client
        self.__evaluation_client = EvaluatorClient(self.__problem_id,
                                                   self.__username, self.__orm)
Пример #6
0
def submit(user):
    """Process user request to submit feature.

    Extracts 'database', 'problem_id', 'code', and 'description' from POST
    body.
    """

    try:
        database = request.form["database"]
        problem_id = request.form["problem_id"]
        feature_dill = request.form["feature_dill"]
        code = request.form["code"]
        description = request.form["description"]
    except Exception:
        app.logger.exception("Couldn't read parameters from form.")
        return EvaluationResponse(
            status_code=EvaluationResponse.STATUS_CODE_BAD_REQUEST)
    app.logger.debug("Read parameters from form.")

    # preprocessing
    # - look up the problem in the databasse
    # - look up the user in the database
    # - compute the md5 hash of the feature code
    # - convert the feature code into a function
    orm = ORMManager(database, admin=True)
    with orm.session_scope() as session:
        try:
            problem_obj = session.query(Problem)\
                    .filter(Problem.id == problem_id).one()
        except (NoResultFound, MultipleResultsFound) as e:
            app.logger.exception(
                "Couldn't access problem (id '{}') from db".format(problem_id))
            return EvaluationResponse(
                status_code=EvaluationResponse.STATUS_CODE_BAD_REQUEST)
        except Exception:
            app.logger.exception(
                "Unexpected issue accessing problem (id '{}') from db".format(
                    problem_id))
            return EvaluationResponse(
                status_code=EvaluationResponse.STATUS_CODE_SERVER_ERROR)

        app.logger.debug(
            "Accessed problem (id '{}') from db".format(problem_id))

        user_name = user["name"]
        try:
            user_obj = session.query(User).filter(User.name == user_name).one()
        except (NoResultFound, MultipleResultsFound) as e:
            app.logger.exception(
                "Couldn't access user (name '{}') from db".format(user_name))
            return EvaluationResponse(
                status_code=EvaluationResponse.STATUS_CODE_BAD_REQUEST)
        app.logger.debug("Accessed user (name '{}') from db".format(user_name))

        md5 = myhash(code)
        app.logger.debug("Computed feature hash.")

        evaluator = EvaluatorServer(problem_id, user_name, orm)
        try:
            is_registered = evaluator.check_if_registered(code)
            if is_registered:
                app.logger.debug("feature already registered.")
                return EvaluationResponse(status_code=EvaluationResponse.
                                          STATUS_CODE_DUPLICATE_FEATURE)
        except Exception:
            app.logger.exception("Unexpected error checking if feature is "
                                 "registered")
            return EvaluationResponse(
                status_code=EvaluationResponse.STATUS_CODE_SERVER_ERROR)
        app.logger.debug("Confirmed that feature is not already registered")

        try:
            feature = dill.loads(unquote_to_bytes(feature_dill))
        except Exception:
            app.logger.exception(
                "Couldn't extract function (code '{}')".format(code))
            return EvaluationResponse(
                status_code=EvaluationResponse.STATUS_CODE_BAD_FEATURE)
        app.logger.debug("Extracted function.")

        # processing
        # - compute the CV score
        # - compute any other metrics
        try:
            metrics = evaluator.evaluate(feature)
            # TODO expand schema
        except ValueError:
            app.logger.exception(
                "Couldn't evaluate feature (code '{}')".format(code))
            # feature is invalid
            return EvaluationResponse(
                status_code=EvaluationResponse.STATUS_CODE_BAD_FEATURE)
        except Exception:
            app.logger.exception(
                "Unexpected error evaluating feature (code '{}')".format(code))
            return EvaluationResponse(
                status_code=EvaluationResponse.STATUS_CODE_SERVER_ERROR)
        app.logger.debug("Evaluated feature.")

        try:
            # write to db
            feature_obj = Feature(description=description,
                                  feature_dill_quoted=feature_dill,
                                  code=code,
                                  md5=md5,
                                  user=user_obj,
                                  problem=problem_obj)
            session.add(feature_obj)
            for metric in metrics:
                metric_db = metric.convert(kind="db")
                metric_obj = Metric(feature=feature_obj,
                                    name=metric_db["name"],
                                    scoring=metric_db["scoring"],
                                    value=metric_db["value"])
                session.add(metric_obj)
        except Exception:
            app.logger.exception("Unexpected error inserting into db")
            return EvaluationResponse(
                status_code=EvaluationResponse.STATUS_CODE_DB_ERROR)
        app.logger.debug("Inserted into db.")

        # post to forum
        problem_name = problem_obj.name
        if is_positive_env(os.environ.get("USE_DISCOURSE")) and \
            problem_name != DEMO_PROBLEM_NAME:
            try:
                topic_url = post_feature(feature_obj, metrics)
                app.logger.debug("Posted to forum")
            except Exception:
                topic_url = ""
                app.logger.exception("Unexpected error posting to forum")
        else:
            topic_url = ""

    # return
    # - status code
    # - metrics dict
    return EvaluationResponse(
        status_code=EvaluationResponse.STATUS_CODE_OKAY,
        metrics=metrics,
        topic_url=topic_url,
    )
Пример #7
0
class Commands(object):
    """Admin interface for the database.

    Create the schema, add or remove problems, and view problems, users, and
    features.
    """

    def __init__(self, problem=None, database="featurehub"):
        """Create the ORMManager and connect to DB.

        If problem name is given, load it.

        Parameters
        ----------
        problem : str, optional (default=None)
            Name of problem
        database : str, optional (default="featurehub")
            Name of database within DBMS.
        """

        self.__orm = ORMManager(database)

        if not database_exists(self.__orm.engine.url):
            print("database {} does not seem to exist.".format(database))
            print("You might want to create it by calling set_up method")
        elif  problem:
            try:
                with self.__orm.session_scope() as session:
                    problem = session.query(Problem)\
                                     .filter(Problem.name == problem).one()
                    self.__problemid = problem.id
            except NoResultFound:
                print("WARNING: Problem {} does not exist!".format(problem))
                print("You might want to create it by calling create_problem"
                      " method")

    def set_up(self, drop=False):
        """Create a new DB and create the initial scheme.

        If the database exists and drop=True, the existing database is dropped
        and recreated. Regardless, any tables defined by the schema that do not
        exist are created.

        Parameters
        ----------
        drop : bool, optional (default=False)
            Drop database if it already exists.
        """

        # todo extract database name from engine url and report for brevity
        engine = self.__orm.engine
        if database_exists(engine.url):
            print("Database {} already exists.".format(engine.url))
            if drop:
                print("Dropping old database {}".format(engine.url))
                drop_database(engine.url)
                with possibly_talking_action("Re-creating database..."):
                    create_database(engine.url)
        else:
            with possibly_talking_action("Creating database..."):
                create_database(engine.url)

        with possibly_talking_action("Creating tables..."):
            Base.metadata.create_all(engine)

        print("Database {} created successfully".format(engine.url))

    def bulk_create_problem_yml(self, path):
        """Create new problem entries in database from yml document stream.

        Can create a yml file with individual problems delimited into documents
        using the `---` ... `---` document stream syntax.

        Parameters
        ----------
        path: str or path-like
            Path to yml file
        """
        with open(path, "r") as f:
            obj_all = yaml.load_all(f)
            for obj in obj_all:
                self.create_problem(**obj)

    def create_problem_yml(self, path):
        """Create new problem entry in database from yml file.

        Parameters
        ----------
        path: str or path-like
            Path to yml file
        """
        with open(path, "r") as f:
            obj = yaml.load(f)

        self.create_problem(**obj)

    def create_problem(self, name="", problem_type="", problem_type_details={},
            data_dir_train="", data_dir_test="", files=[], table_names=[],
            entities_table_name="", entities_featurized_table_name="",
            target_table_name=""):
        """Creates new problem entry in database.

        Parameters
        ----------
        name : str
        problem_type : str
            Classification or regression
        problem_type_details : dict
            Dict with additional details about problem.
            For example, the dict may be {"classification_type" : "multiclass"}.
        data_dir_train : str
            Absolute path of containing directory of data files for training.
        data_dir_test : str
            Absolute path of containing directory of data files for testing
        files : list of str
            List of file paths relative to data_dir; files must be named
            identically within both data_dir directories.
        table_names : list of str
            List of table names, corresponding exactly to `files`
        entities_table_name : str
            Name of table that contains the entity variables. Must be found in
            `table_names`.
        entities_featurized_table_name : str
            Name of table that contains the pre-processed, featurized, entity
            variables. Must be found in `table_names`.
        target_table_name : str
            Name of table that contains the target variable (label). Must be
            found in table_names. Table must hold a single column with label
            values only.
        """

        with self.__orm.session_scope() as session:
            try:
                problem = session.query(Problem).filter(Problem.name == name).one()
                self.__problemid = problem.id
                print("Problem {} already exists".format(name))
                return
            except NoResultFound:
                pass    # we will create it

            problem = Problem(
                name                           = name,
                problem_type                   = problem_type,
                problem_type_details           = json.dumps(problem_type_details),
                data_dir_train                 = data_dir_train,
                data_dir_test                  = data_dir_test,
                files                          = json.dumps(files),
                table_names                    = json.dumps(table_names),
                entities_table_name            = entities_table_name,
                entities_featurized_table_name = entities_featurized_table_name,
                target_table_name              = target_table_name,
            )
            session.add(problem)
            self.__problemid = problem.id
            print("Problem {} successfully created".format(name))

    def get_problems(self):
        """Return a list of problems in the database."""

        with self.__orm.session_scope() as session:
            try:
                problems = session.query(Problem.name).all()
                return [problem[0] for problem in problems]
            except NoResultFound:
                return []

    def get_features(self, user_name=None):
        """Get a DataFrame with the details about all registered features."""
        with self.__orm.session_scope() as session:
            results = self._get_features(session, user_name).all()
            feature_dicts = []
            for feature, user_name in results:
                d = {
                    "user"        : user_name,
                    "description" : feature.description,
                    "md5"         : feature.md5,
                    "created_at"  : feature.created_at,
                }
                feature_metrics = session.query(Metric.name,
                        Metric.value).filter(Metric.feature_id ==
                                feature.id).all()
                # feature_metrics = feature.metrics
                for metric in feature_metrics:
                    d[metric.name] = metric.value

                feature_dicts.append(d)

            if not feature_dicts:
                print("No features found")
            else:
                return pd.DataFrame(feature_dicts)

    def _get_features(self, session, user_name=None):
        """Return a query filtering a given user for the current problem.

        Parameters
        ----------
        user_name : str, optional(default=None)
            If no user name provided, returns features for all users.
        """

        #TODO pivot metrics tables
        query = session.query(Feature, User.name)
        #query = session.query(Feature, User.name, Metric)

        if user_name:
            query = query.filter(User.name == user_name)

        query = query.filter(Feature.problem_id == self.__problemid)

        return query
Пример #8
0
class Session(object):
    """Represents a user's session within FeatureHub.

    Includes commands for discovering, testing, and registering new features.
    """
    def __init__(self, problem, database="featurehub"):
        self.__database = database
        self.__orm = ORMManager(database)
        self.__username = None

        with self.__orm.session_scope() as session:
            try:
                problem = session.query(Problem)\
                                 .filter(Problem.name == problem)\
                                 .one()
                self.__problem_id = problem.id
            except NoResultFound:
                raise ValueError("Invalid problem name: {}".format(problem))
            except MultipleResultsFound:
                raise ValueError("Unexpected issue talking to database. " +
                                 TRY_AGAIN_LATER)

        # "log in" to the system
        self._login()

        # initialize evaluation client
        self.__evaluation_client = EvaluatorClient(self.__problem_id,
                                                   self.__username, self.__orm)

    @property
    def __dataset(self):
        return self.__evaluation_client.dataset

    @property
    def __entities_featurized(self):
        return self.__evaluation_client.entities_featurized

    @property
    def __target(self):
        return self.__evaluation_client.target

    @staticmethod
    def _eval_server_post(route, data):
        url = "http://{}:{}/services/eval-server/{}".format(
            os.environ.get("EVAL_CONTAINER_NAME"),
            os.environ.get("EVAL_CONTAINER_PORT"), route)
        headers = {
            "Authorization":
            "token {}".format(os.environ.get("JUPYTERHUB_API_TOKEN")),
        }
        return requests.post(url=url, data=data, headers=headers)

    def _login(self):
        name = os.environ.get("USER")
        if not name:
            raise ValueError("Missing environment variable 'USER'. FeatureHub"
                             " session not initialized.")

        with self.__orm.session_scope() as session:
            try:
                user = session.query(User)\
                              .filter(User.name == name)\
                              .one()
                self.__username = user.name
            except NoResultFound:
                data = {"database": self.__orm.database}
                response = Session._eval_server_post("create-user", data)
                if response.ok:
                    self.__username = name
                else:
                    raise ValueError("Couldn't log in to FeatureHub. " \
                                     + TRY_AGAIN_LATER)

            except MultipleResultsFound as e:
                raise ValueError("Unexpected error logging in to FeatureHub. " \
                                 + TRY_AGAIN_LATER)

    def get_sample_dataset(self):
        """Loads sample of problem training dataset.

        Returns the dataset a dict mapping table names to pandas DataFrames.

        Returns
        -------
        dataset : dict (str => pd.DataFrame)
            A dict mapping table names to pandas DataFrames.
        target : pd.DataFrame
            A DataFrame that holds a single column: the target variable (label).

        Examples
        --------
        >>> dataset = commands.get_sample_dataset()
        >>> dataset["users"] # -> returns DataFrame
        >>> dataset["stores"] # -> returns DataFrame
        """
        self.__evaluation_client._load_dataset()

        # Return a *copy* of the dataset, ensuring we have enough memory.
        gc.collect()
        dataset = {
            table_name: self.__dataset[table_name].copy()
            for table_name in self.__dataset
        }
        target = self.__target.copy()  # pylint: disable=no-member

        return (dataset, target)

    def get_entity_features(self):
        """Loads preprocessed entity-level features of problem training dataset.
        
        The entity-level features are the same length as the entity DataFrame
        and the target DataFrame.

        Returns
        -------
        entity_features : pd.DataFrame or None
        """
        self.__evaluation_client._load_dataset()
        if not pd.DataFrame(self.__entities_featurized).empty:
            entity_features = self.__entities_featurized.copy()
        else:
            entity_features = None
        return entity_features

    def discover_features(self, code_fragment=None):
        """Print features written by other users.

        A code fragment can be used to filter search results. For each feature,
        prints feature id, feature description, metrics, and source code.

        Parameters
        ----------
        code_fragment : string, default=None
            Source code fragment to filter for.
        """
        self._print_some_features(code_fragment, User.name != self.__username)

    def print_my_features(self, code_fragment=None):
        """Print features written by me.

        A code fragment can be used to filter search results. For each feature,
        prints feature id, feature description, metrics, and source code.

        Parameters
        ----------
        code_fragment : string, default=None
            Source code fragment to filter for.
        """
        self._print_some_features(code_fragment, User.name == self.__username)

    def _print_some_features(self, code_fragment, predicate):
        """Driver function for discover_features and print_my_features."""
        with self.__orm.session_scope() as session:
            query = self._filter_features(session, code_fragment)

            # Filter only users that are not me
            query = query.join(Feature.user).filter(predicate)
            features = query.all()

            if features:
                for feature in features:
                    # Join with metrics table
                    query = session.query(Metric.name, Metric.value)\
                                .filter(Metric.feature_id == feature.id)
                    metrics = query.all()
                    metric_list = [(metric.name, metric.value)
                                   for metric in metrics]
                    self._print_one_feature(feature.description, feature.id,
                                            feature.code, metric_list)
            else:
                print("No features found.")

    def evaluate(self, feature):
        """Evaluate feature on training dataset and return key performance metrics.

        Runs the feature in an isolated environment to extract the feature
        values. Validates the feature values. Then, builds a model on that one
        feature and computes key cross-validated metrics. Prints results and
        returns a dictionary with (metric => value) entries. If the feature is
        invalid, prints reason and returns empty dictionary.

        Parameters
        ----------
        feature : function
            Feature to evaluate
        """

        if self.__evaluation_client.check_if_registered(feature, verbose=True):
            return

        return self.__evaluation_client.evaluate(feature)

    def submit(self, feature, description=""):
        """Submit feature to server for evaluation on test data.
        
        If successful, registers feature in feature database and returns key
        performance metrics.

        Runs the feature in an isolated environment to extract the feature
        values. Validates the feature values. Then, builds a model on that one
        feature, performs cross validation, and returns key performance
        metrics.

        Parameters
        ----------
        feature : function
            Feature to evaluate
        description : str
            Feature description. If left empty, will prompt for user imput.
        """

        if not description:
            description = self._prompt_description()

        self.__evaluation_client.submit(feature, description)

    def _filter_features(self, session, code_fragment):
        """Return query that filters this problem and given code fragment.
        
        Return a query object that filters features written for the appropriate
        problem by code snippets. This query object can be added to by the
        caller.
        """
        filter_ = (Feature.problem_id == self.__problem_id, )

        if code_fragment:
            filter_ = filter_ + (Feature.code.contains(code_fragment), )

        return session.query(Feature).filter(*filter_)

    def _prompt_description(self):
        """Prompt user for description of feature"""
        print(
            "First, enter feature description. Your feature description "
            "should be clear, concise, and meaningful to non-data scientists."
            " (If your feature fails to register, this description will be "
            "discarded.)")

        description = input("Enter description: ")
        print("")
        return description

    @staticmethod
    def _print_one_feature(feature_description, feature_id, feature_code,
                           metric_list):
        """Print one feature in user-readable format.
        
        Parameters
        ----------
        feature_description : str
        feature_id : int
        feature_code : str
        metric_list : MetricList

        Examples
        --------
        >>> Session._print_one_feature("Age", 1, "def age(dataset):    pass\n",
                metric_list_)
        -------------------
        Feature id: 1
        Feature description: Age

        Feature code:
            def age(dataset):    pass

        Feature metrics:
            Accuracy: 0.5
            ROC AUC: 0.35
        """
        result = "------------------\n" + \
                 "Feature id: {}\n".format(feature_id) + \
                 "Feature description: {}\n".format(feature_description)

        result += "\n" + \
                  "Feature code:\n"

        indent = "    "
        tmp = []
        for line in feature_code.split("\n"):
            tmp.append(indent + line)
        result += "\n".join(tmp)

        result += "\n" + \
                  "Feature metrics:\n"

        for metric_name, metric_value in metric_list:
            result += "    {}: {}\n".format(metric_name, metric_value)

        print(result)
Пример #9
0
class Commands(object):
    """Admin interface for the database.

    Create the schema, add or remove problems, and view problems, users, and features.
    """
    def __init__(self, database="featurehub"):
        """Create the ORMManager and connect to DB.

        Parameters
        ----------
        database : str, optional (default="featurehub")
            Name of database within DBMS.
        """

        self.__orm = ORMManager(database)

        if not database_exists(self.__orm.engine.url):
            print("Database {} does not exist.".format(database),
                  file=sys.stderr)
            print("You might want to create it by calling set_up method",
                  file=sys.stderr)

    def set_up(self, drop=False):
        """Create a new DB and create the initial scheme.

        If the database exists and drop=True, the existing database is dropped
        and recreated. Regardless, any tables defined by the schema that do not
        exist are created.

        Parameters
        ----------
        drop : bool, optional (default=False)
            Drop database if it already exists.
        """

        # todo extract database name from engine url and report for brevity
        engine = self.__orm.engine
        if database_exists(engine.url):
            print("Database {} already exists.".format(engine.url))
            if drop:
                print("Dropping old database {}".format(engine.url))
                drop_database(engine.url)
                with possibly_talking_action("Re-creating database..."):
                    create_database(engine.url)
        else:
            with possibly_talking_action("Creating database..."):
                create_database(engine.url)

        with possibly_talking_action("Creating tables..."):
            Base.metadata.create_all(engine)

        print("Database {} created successfully".format(engine.url))

    def bulk_create_problem_yml(self, path):
        """Create new problem entries in database from yml document stream.

        Can create a yml file with individual problems delimited into documents
        using the `---` ... `---` document stream syntax.

        Parameters
        ----------
        path: str or path-like
            Path to yml file
        """
        with open(path, "r") as f:
            obj_all = yaml.load_all(f)
            for obj in obj_all:
                self.create_problem(**obj)

    def create_problem_yml(self, path):
        """Create new problem entry in database from yml file.

        Parameters
        ----------
        path: str or path-like
            Path to yml file
        """
        with open(path, "r") as f:
            obj = yaml.load(f)

        self.create_problem(**obj)

    def create_problem(self,
                       name="",
                       problem_type="",
                       problem_type_details={},
                       data_dir_train="",
                       data_dir_test="",
                       files=[],
                       table_names=[],
                       entities_table_name="",
                       entities_featurized_table_name="",
                       target_table_name=""):
        """Creates new problem entry in database.

        Parameters
        ----------
        name : str
        problem_type : str
            Classification or regression
        problem_type_details : dict
            Dict with additional details about problem.
            For example, the dict may be {"classification_type" : "multiclass"}.
        data_dir_train : str
            Absolute path of containing directory of data files for training.
        data_dir_test : str
            Absolute path of containing directory of data files for testing
        files : list of str
            List of file paths relative to data_dir; files must be named
            identically within both data_dir directories.
        table_names : list of str
            List of table names, corresponding exactly to `files`
        entities_table_name : str
            Name of table that contains the entity variables. Must be found in
            `table_names`.
        entities_featurized_table_name : str
            Name of table that contains the pre-processed, featurized, entity
            variables. Must be found in `table_names`.
        target_table_name : str
            Name of table that contains the target variable (label). Must be
            found in table_names. Table must hold a single column with label
            values only.
        """

        with self.__orm.session_scope() as session:
            try:
                problem = session.query(Problem).filter(
                    Problem.name == name).one()
                print("Problem {} already exists".format(name))
                return
            except NoResultFound:
                pass  # we will create it

            problem = Problem(
                name=name,
                problem_type=problem_type,
                problem_type_details=json.dumps(problem_type_details),
                data_dir_train=data_dir_train,
                data_dir_test=data_dir_test,
                files=json.dumps(files),
                table_names=json.dumps(table_names),
                entities_table_name=entities_table_name,
                entities_featurized_table_name=entities_featurized_table_name,
                target_table_name=target_table_name,
            )
            session.add(problem)
            print("Problem {} successfully created".format(name))

    def get_problems(self):
        """Return a list of problems in the database."""

        with self.__orm.session_scope() as session:
            try:
                problems = session.query(Problem.name).all()
                return [problem[0] for problem in problems]
            except NoResultFound:
                return []

    def get_features(self, problem_name=None, user_name=None):
        """Get a DataFrame with the details about all registered features."""
        with self.__orm.session_scope() as session:
            results = self._get_features(session, problem_name,
                                         user_name).all()
            feature_dicts = []
            for feature, user_name in results:
                d = {
                    "user": user_name,
                    "description": feature.description,
                    "md5": feature.md5,
                    "created_at": feature.created_at,
                }
                feature_metrics = session.query(
                    Metric.name, Metric.value).filter(
                        Metric.feature_id == feature.id).all()
                # feature_metrics = feature.metrics
                for metric in feature_metrics:
                    d[metric.name] = metric.value

                feature_dicts.append(d)

            if not feature_dicts:
                print("No features found")
            else:
                return pd.DataFrame(feature_dicts)

    def _get_features(self, session, problem_name="", user_name=""):
        """Return a query filtering a given user for the current problem.

        Parameters
        ----------
        problem_name : str, optional
            If no problem name provided, returns features for all problems.
        user_name : str, optional
            If no user name provided, returns features for all users.
        """

        #TODO pivot metrics tables
        query = session.query(Feature, User.name)
        #query = session.query(Feature, User.name, Metric)

        if user_name:
            query = query.filter(User.name == user_name)

        if problem_name:
            query = query.filter(Feature.problem.name == problem_name)

        return query

    def load_dataset(self, problem_name="", split="train"):
        """Load dataset for given problem with given split.

        Parameters
        ----------
        problem_name : str, optional
            If no problem name provided, returns dataset for *first problem in
            database*.
        split : str
            Valid options include "train", "test", "both" (concatenated)
        """

        orm = self.__orm
        username = "******"  # should be unused (unless submit new feature to db)

        with orm.session_scope() as session:
            if not problem_name:
                problem_name = session.query(Problem.name)\
                        .filter(Problem.name != "demo").scalar()
            problem_id = session.query(Problem.id)\
                    .filter(Problem.name == problem_name).scalar()

            data_dir = os.path.join("/data", split)
            dataset, entities_featurized, target = load_dataset_from_dir(
                session, data_dir, problem_name)

        suffix = "_" + split

        return problem_name, dataset, entities_featurized, target

    def _extract_everything(self, suffix):
        with self.__orm.session_scope() as session:
            build_and_save_all_features(self, session, suffix)
            extract_and_save_all_tables(session, suffix)

    def _get_final_model_X_Y(self, problem_name, split, suffix):
        feature_matrix = load_feature_matrix(problem_name, split, suffix)
        _, _, entities_featurized, target = self.load_dataset(
            problem_name=problem_name, split=split)
        X = pd.concat([entities_featurized, feature_matrix], axis=1)
        Y = target
        return X, Y

    def _train_model(self, problem_name, split, suffix, **kwargs):
        with self.__orm.session_scope() as session:
            result = session.query(Problem)\
                        .filter(Problem.name == problem_name).one()
            problem_type = result.problem_type
        X_train, Y_train = self._get_final_model_X_Y(problem_name, split,
                                                     suffix)
        automl = AutoModel(problem_type, **kwargs)  # TODO increase time
        automl.fit(X_train, Y_train, dataset_name=problem_name)
        absname = prepare_automl_file_name(problem_name, split, suffix)
        automl.dump(absname)
        return automl, X_train, Y_train

    def _do_final_model(self,
                        problem_name,
                        suffix,
                        split_train="train",
                        split_test="test",
                        **kwargs):
        """
        """
        automl, X_train, Y_train = self._train_model(problem_name, split_train,
                                                     suffix, **kwargs)
        X_test, Y_test = self._get_final_model_X_Y(problem_name, split_test,
                                                   suffix)
        if automl._is_classification():
            Y_test_pred = automl.predict_proba(X_test)
        else:
            Y_test_pred = automl.predict(X_test)
        return automl, X_train, Y_train, X_test, Y_test, Y_test_pred