Example #1
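    # Excerpt from a Dask task (static method); json, logger, step_count, and the
    # Django models (Dataset, AnalyticalModel) are assumed to be imported at module level.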
    def execute_task(df, model_id, model_name, dataset_id):
        logger.info(
            "Starting VB task -------- Model ID: {}; Model Type: {}; step 1/{}"
            .format(model_id, model_name, step_count[model_name]))
        DaskTasks.update_status(model_id, "Loading and validating data",
                                "1/{}".format(step_count[model_name]))

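        # Resolve the target column and optional feature list from the dataset's stored metadata.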
        dataset_m = Metadata(parent=Dataset.objects.get(
            id=dataset_id)).get_metadata("DatasetMetadata")
        target = dataset_m.get("response", "Response")
        attributes = dataset_m.get("attributes")
        y = df[target]
        if attributes:
            attributes_list = json.loads(attributes.replace("'", "\""))
            x = df[attributes_list]
        else:
            x = df.drop(target, axis=1)

        logger.info("Model ID: {}, loading hyper-parameters step 2/{}".format(
            model_id, step_count[model_name]))
        DaskTasks.update_status(model_id, "Loading hyper-parameters",
                                "2/{}".format(step_count[model_name]))
        parameters = Metadata(parent=AnalyticalModel.objects.get(
            id=model_id)).get_metadata("ModelMetadata")

        if model_name == "lra":
            DaskTasks.execute_lra(model_id, parameters, x, y,
                                  step_count[model_name])
Example #2
 def set_prediction_estimators(project_id, model_id, selected_models: dict):
     project = Project.objects.get(id=int(project_id))
     model = Model.objects.get(id=int(model_id))
     m = load_model(model.id, model.model)
     model_metadata = Metadata(parent=model).get_metadata("ModelMetadata")
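     # Default to a single prediction model unless the metadata specifies otherwise.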
     m.prediction_model_type = model_metadata.get("prediction_model_type", "single")
     m.refitPredictionModels(selected_models=selected_models)
     m.save(n=4, model_id=model_id, message="Model selection")
Example #3
    def set_prediction_estimators(project_id, model_id, selected_models: dict):
        project = Project.objects.get(id=int(project_id))
        dataset = Dataset.objects.get(id=project.dataset)
        df = load_dataset(dataset.id, dataset)
        project_metadata = Metadata(parent=project).get_metadata("ProjectMetadata")

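        # Resolve the target and optional feature list from the project's metadata.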
        target_label = "response" if "target" not in project_metadata.keys() else project_metadata["target"]
        features_label = None if "features" not in project_metadata.keys() else project_metadata["features"]
        target = df[target_label]
        if features_label:
            features_list = json.loads(features_label.replace("'", "\""))
            features = df[features_list]
        else:
            features = df.drop(target_label, axis=1)
        model = Model.objects.get(id=int(model_id))
        m = load_model(model.id, model.model)
        # TODO: update predictive_model_type from model metadata
        m.refitPredictiveModels(selected_models=selected_models, y_df=target, x_df=features)
        m.save(n=4, model_id=model_id)
Example #4
    def make_prediction(amodel_id, data=None):
        amodel = AnalyticalModel.objects.get(id=int(amodel_id))
        dataset = Dataset.objects.get(id=int(amodel.dataset))
        y_data = None

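        # The dataset's raw bytes hold a CSV payload; decode and parse it into a DataFrame.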
        df = pd.read_csv(StringIO(bytes(dataset.data).decode()))
        dataset_m = Metadata(parent=dataset).get_metadata("DatasetMetadata")
        target = "Response" if "response" not in dataset_m.keys(
        ) else dataset_m["response"]
        attributes = None if "attributes" not in dataset_m.keys(
        ) else dataset_m["attributes"]
        y = df[target]
        if attributes:
            attributes_list = json.loads(attributes.replace("\'", "\""))
            x = df[attributes_list]
        else:
            x = df.drop(target, axis=1)

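        # Rebuild the train/test split produced by the automated VB linear-regression setup.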
        t = LinearRegressionAutomatedVB()
        t.set_data(x, y)
        x_train = t.x_train
        y_train = t.y_train
        x_data = t.x_test
        y_test = t.y_test.to_numpy().flatten()

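        # Predict on caller-supplied data when given; otherwise score against the held-out test split.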
        if data is not None:
            x_data = data
        model = pickle.loads(amodel.model)
        response = {
            "results": model.predict(x_data),
            "train_score": model.score(x_train, y_train)
        }
        if data is None:
            response["residuals"] = y_test - response["results"]
            response["test_score"] = model.score(x_data, y_test)
        return response
Example #5
    def execute_task(project_id, dataset_id, pipeline_id):
        # STAGE 1 - Data and parameter load from db
        update_status(pipeline_id,
                      "Data and Model Setup: Retrieving dataset and pipeline",
                      "1/{}".format(pre_processing_steps),
                      log="Pipeline: {}, Type: {}, Setup: 1/{}".format(
                          pipeline_id, None, pre_processing_steps),
                      message="Cross validation")
        project = Project.objects.get(id=int(project_id))
        dataset = Dataset.objects.get(id=int(dataset_id))
        pipeline = Pipeline.objects.get(id=int(pipeline_id))

        project.dataset = int(dataset_id)
        project.save()

        df = load_dataset(dataset_id, dataset)
        dataset_metadata = Metadata(
            parent=dataset).get_metadata("DatasetMetadata")
        pipeline_metadata = Metadata(
            parent=pipeline).get_metadata("PipelineMetadata")
        project_metadata = Metadata(
            parent=project).get_metadata("ProjectMetadata")

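        # Project metadata supplies defaults; dataset metadata overrides them when present.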
        target_label = project_metadata.get("target")
        features_label = project_metadata.get("features")

        if "target" in dataset_metadata:
            target_label = dataset_metadata["target"]
        elif target_label is None:
            target_label = "target"

        if "features" in dataset_metadata:
            features_label = dataset_metadata["features"]
        if features_label is None or features_label == "*":
            features_label = list(df.columns)
            features_label.remove(target_label)
        else:
            features_label = json.loads(features_label)
        drop_vars = json.loads(
            project_metadata.get("drop_features", "[]").replace("'", "\""))
        for d in drop_vars:
            if d in features_label:
                features_label.remove(d)

        # STAGE 2 - Data prep
        update_status(pipeline_id,
                      "Data and Model Setup: Loading data",
                      "2/{}".format(pre_processing_steps),
                      log="Pipeline: {}, Type: {}, Setup: 2/{}".format(
                          pipeline_id, pipeline.name, pre_processing_steps),
                      message="Cross validation")

        target = df[target_label].to_frame()
        if features_label:
            features = df[features_label]
        else:
            features = df.drop(target_label, axis=1)

        # STAGE 3 - VBHelper execution
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading all parameters and settings",
            "3/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 3/{}".format(
                pipeline_id, pipeline.name, pre_processing_steps),
            message="Cross validation")
        if pipeline_metadata and "parameters" in pipeline_metadata:
            vbhelper_parameters = json.loads(
                pipeline_metadata["parameters"].replace("'", "\""))
        else:
            vbhelper_parameters = {}

        vbhelper_parameters["pipeline_id"] = pipeline_id
        outer_cv = pipeline_metadata.get("outer_cv", "True") if pipeline_metadata else "True"
        vbhelper = None
        try:
            vbhelper = VBHelper(**vbhelper_parameters)
            if pipeline_metadata and "estimators" in pipeline_metadata:
                est_str = pipeline_metadata["estimators"].replace("'", "\"")
                estimators = json.loads(est_str)
            else:
                update_status(
                    pipeline_id,
                    "Error: VB Helper requires an estimator.",
                    "-1/{}".format(pre_processing_steps),
                    log="Pipeline: {}, Type: {}, Setup: -1/{}".format(
                        pipeline_id, pipeline.name, pre_processing_steps),
                    message="Cross validation")
                return
            vbhelper.setData(X_df=features, y_df=target)
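            # Inner cross-validation for the pipelines: 1 repetition of 5 folds with a ('quantile', 5) strategy.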
            inner_cv_dict = {
                'cv_reps': 1,
                'cv_folds': 5,
                'cv_strategy': ('quantile', 5)
            }
            inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
            # prep_dict = {'cat_approach': 'together', 'impute_strategy': 'IterativeImputer', 'cat_idx': vbhelper.cat_idx}
            prep_dict = {
                'cat_approach': 'together',
                'impute_strategy': 'impute_middle',
                'cat_idx': vbhelper.cat_idx
            }
            pipe_kwargs = dict(do_prep=not vbhelper.run_stacked,
                               prep_dict=prep_dict,
                               inner_cv=inner_cv,
                               cat_idx=vbhelper.cat_idx,
                               float_idx=vbhelper.float_idx,
                               bestT=False)
            estimators_dict = {}
            e_i = 0
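            # Give each estimator a unique key, suffixing duplicate names with a counter.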
            for e in estimators:
                name = e["name"] if "name" in e.keys(
                ) else e["type"] + "-{}".format(e_i)
                n_i = 1
                n_name = name
                while n_name in estimators_dict.keys():
                    n_name = name + "-{}".format(n_i)
                    n_i += 1
                name = n_name
                estimator = DaskTasks.get_estimator(e["type"])
                e_kwargs = copy.copy(pipe_kwargs)
                e_kwargs.update(e["parameters"])
                estimators_dict[name] = {
                    "pipe": estimator,
                    "pipe_kwargs": e_kwargs
                }
                e_i += 1
            vbhelper.setPipeDict(estimators_dict)
            vbhelper.setModelDict()
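            # The outer_cv flag is stored as a string in the pipeline metadata.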
            if outer_cv == "True":
                vbhelper.runCrossValidate(verbose=True)
                vbhelper.buildCVScoreDict()
            else:
                # TODO: check processing of the non-outer-cv case for data cleanup
                vbhelper.fitEstimators()
            try:
                model = Model.objects.get(pipeline=pipeline)
                model_id = model.id
                del model
            except Model.DoesNotExist:
                model_id = None
            vbhelper.save(message="Completed.")
        except Exception as e:
            update_status(pipeline_id,
                          "Error: Unknown error executing pipeline",
                          "-0/{}".format(pre_processing_steps),
                          log="Pipeline: {}, Type: {}, Error: {}".format(
                              pipeline_id, pipeline.name, e),
                          message="Cross validation")
        del vbhelper
Example #6
class DataExploration:
    def __init__(self, dataset_id):
        # TODO: replace the need for the project_id with providing the target variable
        self.dataset_id = dataset_id
        self.dataset = Dataset.objects.get(pk=dataset_id)

        self.df = load_dataset(dataset_id, self.dataset)
        self.dataset_metadata = Metadata(
            parent=self.dataset).get_metadata("DatasetMetadata")

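        # A "features" entry of "*" (or no entry at all) means every column except the target.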
        self.target_label = "target" if "target" not in self.dataset_metadata.keys(
        ) else self.dataset_metadata["target"]
        self.features_label = None if "features" not in self.dataset_metadata.keys(
        ) else self.dataset_metadata["features"]
        if self.features_label is None or self.features_label == "*":
            self.features_label = list(self.df.columns)
            self.features_label.remove(self.target_label)
        else:
            self.features_label = json.loads(self.features_label)

        self.y_df = self.df[self.target_label].to_frame()
        self.X_df = self.df[self.features_label]

        self.vbhelper = VBHelper(pipeline_id=-1)
        self.vbhelper.setData(X_df=self.X_df, y_df=self.y_df)
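        # setData captures X_df_start_order / y_df_start_order, which the summary views below reuse.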

    def _summary(self):
        # All of the summary views need the same float-converted X/y snapshot,
        # so build one VBSummary per call here.
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs

    def get_missing_vals(self):
        return self._summary().missingVals()

    def get_components(self, num_cols, keep_cats=False):
        # num_cols arrives as a string such as "3" or "1,2,3"; fall back to a
        # single component if it cannot be parsed.
        try:
            num_cols = [int(n) for n in str(num_cols).split(",")]
        except ValueError:
            num_cols = [1]
        return self._summary().viewComponents(num_cols=num_cols,
                                              keep_cats=keep_cats)

    def get_kerneldensity(self):
        return self._summary().kernelDensityPie()

    def get_dendrogram(self, linkage='ward', dist='spearmanr'):
        return self._summary().hierarchicalDendrogram(linkage=linkage,
                                                      dist=dist)
Example #7
    def execute_task(project_id, dataset_id, pipeline_id):
        # STAGE 1 - Data and parameter load from db
        update_status(
            pipeline_id,
            "Data and Model Setup: Retrieving dataset and pipeline", "1/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 1/{}".format(pipeline_id, None, pre_processing_steps)
        )
        project = Project.objects.get(id=int(project_id))
        dataset = Dataset.objects.get(id=int(dataset_id))
        pipeline = Pipeline.objects.get(id=int(pipeline_id))

        project.dataset = int(dataset_id)
        project.save()

        df = load_dataset(dataset_id, dataset)
        pipeline_metadata = Metadata(parent=Pipeline.objects.get(id=pipeline_id)).get_metadata("PipelineMetadata")
        project_metadata = Metadata(parent=Project.objects.get(id=project_id)).get_metadata("ProjectMetadata")

        target_label = "response" if "target" not in project_metadata.keys() else project_metadata["target"]
        features_label = None if "features" not in project_metadata.keys() else project_metadata["features"]

        # STAGE 2 - Data prep
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading data", "2/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 2/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
        )

        target = df[target_label]
        if features_label:
            features_list = json.loads(features_label.replace("'", "\""))
            features = df[features_list]
        else:
            features = df.drop(target_label, axis=1)

        # STAGE 3 - VBHelper execution
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading all parameters and settings", "3/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 3/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
        )
        if pipeline_metadata and "parameters" in pipeline_metadata:
            vbhelper_parameters = json.loads(pipeline_metadata["parameters"].replace("'", "\""))
        else:
            vbhelper_parameters = {}

        vbhelper_parameters["pipeline_id"] = pipeline_id
        outer_cv = pipeline_metadata.get("outer_cv", "True") if pipeline_metadata else "True"
        try:
            vbhelper = VBHelper(**vbhelper_parameters)
            if "estimators" in pipeline_metadata.keys():
                estimators = json.loads(pipeline_metadata["estimators"].replace("\'", "\""))
            else:
                update_status(pipeline_id, "Error: VB Helper requires an estimator.",
                              "-1/{}".format(pre_processing_steps),
                              log="Pipeline: {}, Type: {}, Setup: -1/{}".format(pipeline_id, pipeline.name,
                                                                                pre_processing_steps)
                              )
                return
            vbhelper.setData(X_df=features, y_df=target)
            inner_cv_dict = {'cv_reps': 1, 'cv_folds': 5, 'cv_strategy': ('quantile', 5)}
            inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
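            # This variant preps with the 'impute_knn5' strategy (KNN imputation)
            # rather than the 'impute_middle' strategy used in Example #5.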
            prep_dict = {'impute_strategy': 'impute_knn5', 'cat_idx': vbhelper.cat_idx}
            pipe_kwargs = dict(do_prep=not vbhelper.run_stacked, prep_dict=prep_dict, inner_cv=inner_cv,
                               gridpoints=4, cat_idx=vbhelper.cat_idx, float_idx=vbhelper.float_idx,
                               bestT=False)
            estimators_dict = {}
            e_i = 0
            for e in estimators:
                name = e["name"] if "name" in e.keys() else e["type"] + "-{}".format(e_i)
                n_i = 1
                n_name = name
                while n_name in estimators_dict.keys():
                    n_name = name + "-{}".format(n_i)
                    n_i += 1
                name = n_name
                estimator = DaskTasks.get_estimator(e["type"])
                e_kwargs = copy.copy(pipe_kwargs)
                for k, p in e["parameters"].items():
                    e_kwargs[k] = p
                estimators_dict[name] = {"pipe": estimator, "pipe_kwargs": e_kwargs}
                e_i += 1
            vbhelper.setPipeDict(estimators_dict)
            vbhelper.setModelDict()
            if outer_cv == "True":
                vbhelper.runCrossValidate()
                vbhelper.buildCVScoreDict()
            else:
                vbhelper.fitEstimators()
            vbhelper.save()
        except Exception as e:
            update_status(pipeline_id, "Error: Unknown error executing pipeline",
                          "-0/{}".format(pre_processing_steps),
                          log="Pipeline: {}, Type: {}, Error: {}".format(pipeline_id, pipeline.name, e)
                          )