Example #1
    # When n_estimators is a valid value not equal to the default
    est = forest(n_estimators=100)
    est = assert_no_warnings(est.fit, X, y)


class MyBackend(LokyBackend):
    def __init__(self, *args, **kwargs):
        self.count = 0
        super(MyBackend, self).__init__(*args, **kwargs)

    def start_call(self):
        self.count += 1
        return super(MyBackend, self).start_call()


register_parallel_backend('testing', MyBackend)


@pytest.mark.skipif(_joblib.__version__ < LooseVersion('0.12'),
                    reason='tests not yet supported in joblib <0.12')
@skip_if_no_parallel
def test_backend_respected():
    clf = RandomForestClassifier(n_estimators=10, n_jobs=2)

    with parallel_backend("testing") as (ba, n_jobs):
        clf.fit(X, y)

    assert ba.count > 0

    # predict_proba requires shared memory. Ensure that's honored.
    with parallel_backend("testing") as (ba, _):
        clf.predict_proba(X)
Example #2
class Worker(QThread):

    register_parallel_backend("threading", ThreadingBackend, make_default=True)

    def __init__(
        self,
        rlayer,
        vlayer,
        outlayer,
        fields,
        classifier,
        model_params,
        split_params,
        tiles,
        accass,
        max_pix,
        tr,
    ):
        super(Worker, self).__init__()
        self.signals = Signals(self)
        try:
            self.starttime = time()
            self.rlayer = rlayer
            self.vlayer = vlayer
            self.outlayer = outlayer
            self.fields = fields
            self.classif = classifier
            self.params = model_params
            self.sp_params = split_params
            self.tiles = tiles
            self.acc = accass
            self.max_pix = max_pix
            self.tr = tr

        except Exception as e:
            import traceback

            self.signals.error.emit(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                                    traceback.format_exc())
            self.signals.finished.emit(None)

    def run(self):
        self.signals.status.emit(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                                 "Start...")
        try:

            self.train()
            result = self.classify()
            self.signals.finished.emit(result)

        except Exception as e:
            import traceback

            self.signals.error.emit(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                                    traceback.format_exc())
            self.signals.finished.emit(None)

    def train(self):
        def split(X, Y, test_size, stratify):
            return train_test_split(X,
                                    Y,
                                    test_size=test_size,
                                    stratify=stratify)

        shp = self.vlayer.dataProvider().dataSourceUri()
        train_shp = shp.split("|")[0]
        self.signals.status.emit(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                                 "Open vector file: " + train_shp)

        field_data = []
        geoms = []
        with fiona.open(train_shp, 'r') as source:
            for f in source:
                geoms.append(shape(f['geometry']))
                field_data.append(f['properties'][self.fields])

        labelencoder = LabelEncoder()

        #### TODO: special case self.fields == 'ClassificationTool_encoding'

        fde = labelencoder.fit_transform(field_data)
        shp_data = pd.DataFrame({self.fields: field_data, 'CTencoding': fde})
        self.shp_stats = (shp_data.groupby([
            self.fields, 'CTencoding'
        ]).size().reset_index().rename(columns={0: "Number of Polygons"}))
        #geoms = shapefile.geometry.values

        self.signals.status.emit(
            strftime("%Y-%m-%d %H:%M:%S", gmtime()),
            "Open raster file: " + self.rlayer.dataProvider().dataSourceUri(),
        )
        with rasterio.open(self.rlayer.dataProvider().dataSourceUri()) as src:
            img_bands = src.count
            crs = src.crs

        #p1 = pyproj.Proj(crs)
        #p2 = pyproj.Proj(shapefile.crs)
        #if p1.srs != p2.srs:
        #    raise RuntimeError("Error: data sets have different projections")

        X = np.array([]).reshape(0, img_bands)
        y = np.array([], dtype=np.int8)

        self.signals.status.emit(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                                 "Extract raster values ...")
        with rasterio.open(self.rlayer.dataProvider().dataSourceUri()) as src:
            meta = src.meta
            for index, geom in enumerate(geoms):
                feature = [mapping(geom)]

                out_image, out_transform = mask(src, feature, crop=True)

                out_image_trimmed = out_image[:, ~np.all(
                    out_image == meta["nodata"], axis=0)]
                out_image_reshaped = np.transpose(out_image_trimmed)
                if self.max_pix > -1:
                    out_image_reshaped = out_image_reshaped[np.random.choice(
                        out_image_reshaped.shape[0],
                        self.max_pix,
                        replace=False)]
                y = np.append(
                    y,
                    [fde[index]] * out_image_reshaped.shape[0],
                )

                X = np.vstack((X, out_image_reshaped))

        self.signals.status.emit(
            strftime("%Y-%m-%d %H:%M:%S", gmtime()),
            "Split data into training and test subset: " +
            str(100 - (self.sp_params["test_size"])) + "% training data " +
            str(self.sp_params["test_size"]) + "% test data",
        )

        if self.sp_params["stratify"]:
            stratify = y
        else:
            stratify = None

        test_size = self.sp_params["test_size"] / 100
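        # Example: sp_params["test_size"] = 30 yields test_size = 0.3,
        # i.e. a 70% training / 30% test split.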

        X_train, X_test, y_train, y_test = split(X, y, test_size, stratify)

        if self.classif == "KNearestNeighbor":
            self.signals.status.emit(
                strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                "Train model using " + self.classif,
            )
            from PyQt5 import QtTest

            QtTest.QTest.qWait(30000)  # pause for 30 seconds before training
            # NOTE: the rescaled features are computed but never used below;
            # the classifier is fitted on the unscaled training split.
            scaler = MinMaxScaler(feature_range=(0, 1))
            rescaledX = scaler.fit_transform(X)
            classifier = KNeighborsClassifier(**self.params)
            classifier.fit(X_train, y_train)

        if self.classif == "RandomForest":
            self.signals.status.emit(
                strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                "Train model using " + self.classif,
            )
            # NOTE: unlike the other classifiers, the forest is fitted on the full
            # dataset, so the accuracy assessment on X_test is not an independent hold-out.
            classifier = RandomForestClassifier(**self.params)
            classifier.fit(X, y)

        if self.classif == "SVC":
            self.signals.status.emit(
                strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                "Train model using " + self.classif,
            )

            classifier = SVC(**self.params)
            classifier.fit(X_train, y_train)

        self.classifier = classifier

        if self.acc:
            y_pred = classifier.predict(X_test)
            cm = metrics.confusion_matrix(y_test, y_pred)
            stat_sort = self.shp_stats.sort_values("CTencoding")

            cmpd = pd.DataFrame(
                data=cm,
                index=stat_sort[self.fields].values,
                columns=stat_sort[self.fields].values,
            )
            cmpd_out = os.path.splitext(self.outlayer)[0] + "CM.csv"
            cmpd.to_csv(cmpd_out)

            clsf_report = pd.DataFrame(
                metrics.classification_report(y_true=y_test,
                                              y_pred=y_pred,
                                              output_dict=True)).transpose()
            clist = stat_sort[self.fields].to_list()
            clist.extend(["micro avg", "macro avg", "weighted avg"])
            clsf_report.index = clist
            clsf_out = os.path.splitext(self.outlayer)[0] + "class_report.csv"
            clsf_report.to_csv(clsf_out)

    def classify(self):
        def calculate_chunks(width, height, tiles):
            pixels = width * height
            max_pixels = pixels / tiles
            chunk_size = int(math.floor(math.sqrt(max_pixels)))
            ncols = int(math.ceil(width / chunk_size))
            nrows = int(math.ceil(height / chunk_size))
            chunk_windows = []

            for col in range(ncols):
                col_offset = col * chunk_size
                w = min(chunk_size, width - col_offset)
                for row in range(nrows):
                    row_offset = row * chunk_size
                    h = min(chunk_size, height - row_offset)
                    chunk_windows.append(
                        ((row, col), Window(col_offset, row_offset, w, h)))
            return chunk_windows
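        # Worked example (illustrative): for a 1000 x 1000 pixel raster and tiles=4,
        # max_pixels is 250000 and chunk_size is 500, yielding a 2 x 2 grid of
        # 500 x 500 pixel windows.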

        with rasterio.open(self.rlayer.dataProvider().dataSourceUri()) as src:
            width = src.width
            height = src.height
            bands = src.count
            meta = src.meta
            dtype = src.dtypes

            self.signals.status.emit(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                                     "Predicting image values ... ")

            chunk_blocks = calculate_chunks(width, height, self.tiles)
            meta.update({"count": 1, "dtype": dtype[0]})

            with rasterio.open(self.outlayer, "w", **meta) as dst:
                counter = 1
                for idx, window in chunk_blocks:
                    self.signals.status.emit(
                        strftime("%Y-%m-%d %H:%M:%S",
                                 gmtime()), "Processing Block: " +
                        str(counter) + " of " + str(len(chunk_blocks)))
                    img = src.read(window=window)
                    dtype = rasterio.dtypes.get_minimum_dtype(img)
                    reshaped_img = reshape_as_image(img)
                    rows, cols, bands_n = reshaped_img.shape

                    class_prediction = self.classifier.predict(
                        reshaped_img.reshape(-1, bands))
                    classification = np.zeros((rows, cols, 1)).astype(dtype)
                    classification[:, :, 0] = class_prediction.reshape(
                        reshaped_img[:, :, 1].shape).astype(dtype)
                    final = reshape_as_raster(classification)
                    dst.write(final, window=window)
                    counter += 1

        seconds_elapsed = time() - self.starttime
        self.signals.status.emit(
            strftime("%Y-%m-%d %H:%M:%S", gmtime()),
            "Execution completed in " +
            str(np.around(seconds_elapsed, decimals=2)) + " seconds",
        )
        return self.outlayer
Example #3
class ModelTrainer(QRunnable):
    '''
    QRunnable tasked with running all model training/tuning.
    This could potentially take days to complete.
    '''
    # Setting parallel_backend to threading allows for multi-threading from a thread.  The GUI will not freeze and
    # multithreading seems functional.
    # However, the program now uses dask for the backend.  This code is left in for posterity.
    # ! NOTE: some models, e.g. RandomForestClassifier, will not train using any backend attempted when n_jobs > 1.
    # ! This is regardless of using the dask or joblib backend.  RandomForestClassifier will fail with n_jobs > 1.
    # An exception is caught and the log updated if this occurs.
    register_parallel_backend('threading', ThreadingBackend, make_default=True)
    parallel_backend('threading')
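    # Illustrative sketch (assumes the default registration above takes effect):
    # with ThreadingBackend registered as joblib's default, a plain
    # joblib.Parallel call made from this runnable is expected to execute its
    # tasks in threads of the current process rather than spawned workers, e.g.
    #
    #     from joblib import Parallel, delayed
    #     squares = Parallel(n_jobs=2)(delayed(pow)(i, 2) for i in range(4))
    #
    # which avoids spawning worker processes from inside the Qt thread pool.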

    def __init__(self,
                 selected_models,
                 version_directory,
                 training_eval_params,
                 training_data,
                 tune_models,
                 tuning_params,
                 use_proba=False,
                 train_stacking_algorithm=True,
                 **kwargs):
        super(ModelTrainer, self).__init__()
        self.logger = logging.getLogger(__name__)
        self.signals = ModelTrainerSignals()

        self.allowed_pipeline_types = [
            'feature_extraction', 'feature_selection'
        ]
        self.version_directory = version_directory
        self.selected_models = selected_models
        self.training_eval_params = training_eval_params
        self.training_data = training_data
        self.tune_models = tune_models
        self.tuning_params = tuning_params
        self.use_proba = use_proba
        self.train_stacking_algorithm = train_stacking_algorithm
        self.kwargs = kwargs
        self.all_predictions_df = pd.DataFrame(index=self.training_data.index)
        self.grid_search_time = None
        self.model_checksums = {}
        self._is_running = True
        self.tag_suffix = CONFIG.get('VARIABLES', 'TagDelimiter') + CONFIG.get(
            'VARIABLES', 'TagDataColumnSuffix')

    @pyqtSlot()
    def run(self):
        self._update_log('Beginning ModelTrain run')
        # * Run through the enumeration of columns.  The second argument to enumerate
        # * tells Python where to begin the idx count.  Here, 1 for our offset.
        try:
            for col_idx, col in enumerate(self.training_data.columns, 1):
                if col.endswith(self.tag_suffix):
                    self._update_log(f'Current classification task: {col}',
                                     False)
                    col_label = col.split(
                        CONFIG.get('VARIABLES', 'TagDelimiter'))[0]
                    col_path = os.path.join(self.version_directory, col_label)
                    # * Find and drop any samples missing an index
                    missing_idx_count = self.training_data.index.isna().sum()
                    if (missing_idx_count > 0):
                        self._update_log(
                            f"<b>Found {missing_idx_count} samples missing a value for index </b> \
                                        (index_col = {CONFIG.get('VARIABLES', 'IndexColumn')}).  Removing those samples..."
                        )
                        valid_indexes = self.training_data.index.dropna()
                        self.training_data = self.training_data[
                            self.training_data.index.isin(valid_indexes)]
                        self._update_log(
                            f'Shape of dataset after removal: {self.training_data.shape}'
                        )
                    # * Create dict to fill na samples with 'unanswered' and score of 0
                    # NOTE: fill_dict is built but not used; missing values are
                    # currently filled with 0 on the line below.
                    label_col_name = self.training_data.columns[col_idx]
                    fill_dict = pd.DataFrame(data={
                        col: 'unanswered',
                        label_col_name: 0
                    },
                                             index=[0])
                    self.training_data.fillna(value=0, inplace=True, axis=1)
                    x = self.training_data[col].copy()
                    y = self.training_data[
                        self.training_data.columns[col_idx]].copy().values

                    results = pd.DataFrame(index=self.training_data.index)
                    results[TRUTH_LABEL_SUFFIX] = y
                    preds = np.empty(y.shape)
                    probs = np.empty(shape=(y.shape[0], len(np.unique(y))))

                    # * Initialize sklearn evaluation parameters
                    sk_eval_type = self.training_eval_params['sklearn']['type']
                    sk_eval_value = self.training_eval_params['sklearn'][
                        'value']
                    # * SKLEARN
                    for model, selected in self.selected_models[
                            'sklearn'].items():
                        if not self._is_running:
                            self.signals.training_complete.emit(pd.DataFrame())
                            break
                        if selected:
                            try:
                                if self.tune_models:
                                    self._tune_model(x, y, model, col_path)
                                model_params = self.get_params_from_file(
                                    model, col_path)
                                self._update_log(f'Begin training {model}')
                                pipeline = Pipeline(
                                    self.get_pipeline(model_params['params']))
                                try:
                                    if sk_eval_type == 'cv':
                                        skf = StratifiedKFold(
                                            n_splits=sk_eval_value,
                                            random_state=RANDOM_SEED)
                                        for train, test in skf.split(x, y):
                                            with joblib.parallel_backend(
                                                    'dask'):
                                                preds[test] = pipeline.fit(
                                                    x.iloc[train],
                                                    y[train]).predict(
                                                        x.iloc[test])
                                            if self.use_proba and hasattr(
                                                    pipeline, 'predict_proba'):
                                                try:
                                                    probs[
                                                        test] = pipeline.predict_proba(
                                                            x.iloc[test])
                                                except AttributeError:
                                                    self.logger.debug(
                                                        '{} does not support predict_proba'
                                                        .format(model))
                                                    print(
                                                        model,
                                                        'does not support predict_proba'
                                                    )
                                            else:
                                                probs = np.array([])
                                    elif sk_eval_type == 'test_split':
                                        x_train, x_test, y_train, y_test = train_test_split(
                                            x,
                                            y,
                                            test_size=sk_eval_value,
                                            stratify=y,
                                            random_state=CONFIG.getfloat(
                                                'VARIABLES', 'RandomSeed'))
                                        preds = np.empty(len(y_test))
                                    else:
                                        self._update_log(
                                            f'No evaluation type chosen.')
                                except (KeyboardInterrupt, SystemExit):
                                    raise
                                except Exception:
                                    self.logger.warning(
                                        '{} threw an exception during fit. \
                                            Possible error with joblib multithreading.'
                                        .format(model),
                                        exc_info=True)
                                    tb = traceback.format_exc()
                                    print(tb)
                                    self._update_log(
                                        '{} threw an exception during fit. \
                                            Possible error with joblib multithreading.'
                                        .format(model), True, False)
                                model_scores = self.get_model_scores(y, preds)

                                self._update_log(
                                    f'Task completed on <b>{model}</b>.')
                                table_str = '''<table>
                                                    <thead>
                                                        <tr>
                                                            <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
                                                        </tr>
                                                    </thead>
                                                <tbody>
                                                    <tr>
                                            '''
                                for metric, score in model_scores.items():
                                    table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
                                table_str += '</tr></tbody></table><br>'
                                if sk_eval_type is not None:
                                    self._update_log(table_str, False, True)
                                self._update_log(
                                    f'Training {model} on full dataset')
                                with joblib.parallel_backend('dask'):
                                    pipeline.fit(x, y)

                                pred_col_name = col_label + TAG_DELIMITER + model + PRED_LABEL_SUFFIX
                                prob_col_name = col_label + TAG_DELIMITER + model + PROB_LABEL_SUFFIX
                                results[pred_col_name] = preds.astype(int)
                                # If predicting probabilities and the probability array has values,
                                # use those values for the results.
                                if self.use_proba and probs.size:
                                    results[prob_col_name] = np.amax(probs,
                                                                     axis=1)

                                save_path = os.path.join(col_path, model)
                                if not os.path.exists(save_path):
                                    os.makedirs(save_path)
                                self.save_model(model, pipeline, save_path,
                                                model_scores)
                            except (KeyboardInterrupt, SystemExit):
                                raise
                            except Exception as e:
                                self.logger.error(f'ModelTrainer.run {model}:',
                                                  exc_info=True)
                                tb = traceback.format_exc()
                                print(tb)
                                self._update_log(tb)
                    # Tensorflow__ would reside here
                    try:
                        if self.train_stacking_algorithm and self._is_running:
                            self.train_stacker(
                                results.drop(TRUTH_LABEL_SUFFIX, axis=1),
                                results[TRUTH_LABEL_SUFFIX].values, col_path)
                        else:
                            self._update_log('Skipping Stacker training.')
                    except ValueError as ve:
                        self.signals.training_complete.emit(pd.DataFrame())
                        self._update_log(
                            f'Unable to train Stacking algorithm on {col_label}.'
                        )
                        tb = traceback.format_exc()
                        print(tb)
                    except Exception as e:
                        self.logger.error(f'ModelTrainer.run {model}:',
                                          exc_info=True)
                        tb = traceback.format_exc()
                        print(tb)
                        self._update_log(tb)
            self._is_running = False
            self.signals.training_complete.emit(self.all_predictions_df)

        except Exception as e:
            self.signals.training_complete.emit(pd.DataFrame())
            self.logger.error('ModelTrainer.run (General):', exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb)

    def get_model_scores(self, y, y_hat):
        '''
            Generate scores for a given model
                # Arguments
                    y: list, ground truth for a given classification task
                    y_hat: list, predictions generated by the model
                # Returns
                    scores: dict, generated scores.  Key is metric name and value is score
        '''
        scores = {}
        try:
            scores['accuracy'] = accuracy_score(y, y_hat)
            scores['f1_score'] = f1_score(y, y_hat, average='weighted')
            scores['cohen_kappa'] = cohen_kappa_score(y, y_hat)
        except ValueError as ve:
            self._update_log(
                "Unable to generate performance metrics.  Returning all values as zero."
            )
            scores['accuracy'] = 0
            scores['f1_score'] = 0
            scores['cohen_kappa'] = 0
        except Exception as e:
            # self.signals.training_complete.emit(0, False)
            self.logger.error('ModelTrainer.get_model_scores:', exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb)

        return scores

    def get_params_from_file(self, model_name, base_path=None, tpot=False):
        '''
            Loads model parameters either from file (if version has been saved), or grabs the defaults
                # Arguments
                    model_name: string, model name used to specify path
                    base_path: string, optional pathing used for loading custom model parameters
                # Returns
                    model_params: dict, parameters from file or defaults
        '''
        try:
            if tpot or base_path is not None:
                model_path = os.path.join(base_path, model_name,
                                          model_name + '.json')
                if not os.path.isfile(model_path):
                    model_path = os.path.join(
                        CONFIG.get('PATHS', 'DefaultModelDirectory'),
                        model_name, model_name + '.json')

            # elif base_path is not None:
            #     model_path = os.path.join(
            #         base_path, model_name, model_name + '.json')
            #     if not os.path.isfile(model_path):
            #         model_path = os.path.join(CONFIG.get('PATHS', 'DefaultModelDirectory'),
            #                                   model_name,
            #                                   model_name + '.json')
            else:
                model_path = os.path.join(
                    CONFIG.get('PATHS', 'DefaultModelDirectory'), model_name,
                    model_name + '.json')

            with open(model_path, 'r') as param_file:
                model_params = json.load(param_file, object_hook=cat_decoder)
            return model_params
        except Exception as e:
            self.logger.error('ModelTrainer.get_params_from_file:',
                              exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb, True, False)

    def get_pipeline(self, param_dict, include_feature_selection=True):
        '''Builds pipeline steps required for sklearn models.  
            Includes Feature extraction, feature selection, and classifier.
                # Arguments
                    param_dict: dict, dictionary of current model parameter values.
                # Returns
                    pipeline_steps: list, list of steps with initialized classes.
        '''
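        # Illustrative example (key name assumed): a param_dict entry such as
        # 'sklearn.feature_extraction.text.TfidfVectorizer' is split on '.', its
        # second component ('feature_extraction') maps to priority 0, so it is
        # placed before feature selectors (50) and the classifier (100).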
        pipeline_queue = PriorityQueue()
        for args, values in param_dict.items():
            full_class = args.split('.')
            current_module = '.'.join(full_class[0:-1])
            current_type = full_class[1]

            if current_type == 'feature_extraction':
                priority = 0
            elif current_type == 'feature_selection':
                if include_feature_selection:
                    priority = 50
                else:
                    continue
            else:
                priority = 100
            inst_module = importlib.import_module(current_module)
            current_class = getattr(inst_module, full_class[-1])
            if values:
                pipeline_queue.put(
                    (priority, (full_class[-1], current_class(**values))))
            else:
                pipeline_queue.put(
                    (priority, (full_class[-1], current_class())))

        pipeline = []
        while not pipeline_queue.empty():
            pipeline.append(pipeline_queue.get()[-1])
        return pipeline

    def get_tpot_pipeline(self,
                          param_dict,
                          tpot_params,
                          include_feature_selection=False):
        pipeline_queue = PriorityQueue()
        for args, values in param_dict.items():
            full_class = args.split('.')
            current_module = '.'.join(full_class[0:-1])
            current_type = full_class[1]

            if current_type == 'feature_extraction':
                priority = 0
            elif current_type == 'feature_selection':
                if include_feature_selection:
                    priority = 50
                else:
                    continue
            else:
                continue
            inst_module = importlib.import_module(current_module)
            current_class = getattr(inst_module, full_class[-1])
            if values:
                pipeline_queue.put(
                    (priority, (full_class[-1], current_class(**values))))
            else:
                pipeline_queue.put(
                    (priority, (full_class[-1], current_class())))

        pipeline_queue.put(
            (100, ('TPOTClassifier',
                   TPOTClassifier(**tpot_params['tpot.TPOTClassifier']))))

        pipeline = []
        while not pipeline_queue.empty():
            pipeline.append(pipeline_queue.get()[-1])
        return pipeline

    def grid_search(self,
                    model,
                    x,
                    y,
                    pipeline,
                    tuning_params,
                    n_jobs=-1,
                    n_iter=20,
                    scoring=None,
                    include_tfidf=False,
                    keras_params=None):
        '''Performs grid search on selected pipeline.

            # Arguments

                model: string, name of classifier in pipeline
                x: pandas.DataFrame, training data
                y: numpy.array, training labels
                pipeline: sklearn.model_selection.Pipeline, pipeline object containing feature extractors, feature selectors and estimator
                n_jobs: int, Number of jobs to run in parallel.
                n_iter: int, number of iterations to perform search
                scoring: list, scoring metrics to be used by the evaluator
                include_tfidf: bool, flag to indicate tfidf is included in the pipeline
                keras_params: dict, parameters necessary for model training outside of the regular hyperparams.  e.g. input_shape, num_classes, num_features
        '''
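        # Illustrative example (parameter name assumed): a tunable 'int' entry with
        # min=50 and max=200 for a model's n_estimators would be added to grid_params
        # below as {'<model>__n_estimators': scipy.stats.randint(50, 201)}.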
        try:
            start_time = time.time()
            filepath = os.path.join(CONFIG.get('PATHS', 'BaseModelDirectory'),
                                    model + '.json')
            with open(filepath, 'r') as f:
                model_data = json.load(f, object_hook=cat_decoder)

            grid_params = {}
            default_params = model_data[model]

            for param_types, types in default_params.items():
                for t, params in types.items():
                    if params['tunable']:
                        param_name = model + '__' + t
                        if params['type'] == 'dropdown':
                            param_options = list(params['options'].values())
                        elif params['type'] == 'double':
                            param_options = scipy.stats.expon(
                                scale=params['step_size'])
                        elif params['type'] == 'int':
                            param_options = scipy.stats.randint(
                                params['min'], params['max'] + 1)
                        elif params['type'] == 'range':
                            param_options = [(1, 1), (1, 2), (1, 3), (1, 4)]
                        grid_params.update({param_name: param_options})
                    else:
                        continue

            if include_tfidf:
                with open(CONFIG.get('PATHS', 'BaseTfidfDirectory'), 'r') as f:
                    model_data = json.load(f, object_hook=cat_decoder)
                model_class = model_data['model_class']
                default_params = model_data[model_class]

                for param_types, types in default_params.items():
                    for t, params in types.items():
                        if params['tunable']:
                            param_name = model_class + '__' + t
                            if params['type'] == 'dropdown':
                                param_options = list(
                                    params['options'].values())
                            elif params['type'] == 'double':
                                param_options = scipy.stats.expon(
                                    scale=params['step_size'])
                            elif params['type'] == 'int':
                                param_options = scipy.stats.randint(
                                    params['min'], params['max'] + 1)
                            elif params['type'] == 'range':
                                param_options = [(1, 1), (1, 2), (1, 3),
                                                 (1, 4)]
                            else:
                                param_options = None
                            grid_params.update({param_name: param_options})
                        else:
                            continue
            # Remnant from __TENSORFLOW work.
            # if keras_params:
            #     updated_key_dict = {f'{model}__{k}':
            #         [v] for k, v in keras_params.items()}
            #     grid_params.update(updated_key_dict)

            self._update_log(f'Beginning RandomizedSearchCV on {model}...')
            rscv = RandomizedSearchCV(
                pipeline,
                grid_params,
                n_jobs=tuning_params['gridsearch']['n_jobs']
                if tuning_params['gridsearch']['n_jobs'] != 0 else None,
                cv=tuning_params['gridsearch']['cv'],
                n_iter=n_iter,
                pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
                verbose=CONFIG.getint('VARIABLES',
                                      'RandomizedSearchVerbosity'),
                scoring=tuning_params['gridsearch']['scoring']
                if len(tuning_params['gridsearch']['scoring']) > 0 else None,
                refit='accuracy')
            #   refit='accuracy' if len(tuning_params['gridsearch']['scoring']) > 0 else None)  # ! FIXME: Should we allow other, non accuracy metrics here?
            with joblib.parallel_backend('dask'):
                rscv.fit(x, y)
            self.grid_search_time = time.time() - start_time
            self._update_log(
                f'RandomizedSearchCV on {model} completed in {self.grid_search_time}'
            )
            self._update_log(f'Best score for {model}: {rscv.best_score_}',
                             False)
            return rscv

        except FileNotFoundError as fnfe:
            self.logger.debug(
                'ModelTrainer.grid_search {} not found'.format(filepath))
        except Exception as e:
            self.logger.error('ModelTrainer.grid_search {}:'.format(model),
                              exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb)

    def save_model(self, model_name, pipeline, save_path, scores={}):
        save_file = os.path.join(save_path, model_name + '.pkl')
        self._update_log(f'Saving {model_name} to : {save_file}', False)
        joblib.dump(pipeline, save_file, compress=1)
        self.model_checksums[model_name] = hashlib.md5(
            open(save_file, 'rb').read()).hexdigest()
        self._update_log(
            f'{model_name} checksum: {self.model_checksums[model_name]}',
            False)
        if model_name == 'TPOTClassifier':
            self.save_tpot_params_to_file(pipeline, save_path, scores)
        else:
            self.save_params_to_file(model_name, pipeline.get_params(),
                                     save_path, scores)
        # if self.tune_models:
        #     if model_name == 'TPOTClassifier':
        #         self.save_tpot_params_to_file(pipeline, save_path, scores)
        #     else:
        #         self.save_params_to_file(
        #             model_name, pipeline.get_params(), save_path, scores)

    def save_params_to_file(self,
                            model,
                            best_params,
                            model_param_path,
                            score_dict={}):
        try:
            model_path = os.path.join(model_param_path, model + '.json')
            if not os.path.isfile(model_path):
                # Get default values
                model_path = os.path.join(
                    CONFIG.get('PATHS', 'DefaultModelDirectory'), model,
                    model + '.json')
            with open(model_path, 'r') as param_file:
                model_params = json.load(param_file)
            current_time = time.localtime()
            model_params['meta']['training_meta'].update({
                'last_train_date':
                time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                'train_eval_score':
                score_dict,
                'checksum':
                self.model_checksums[model]
            })
            if self.tune_models:
                model_params['meta']['tuning_meta'].update({
                    'last_tune_date':
                    time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                    'n_iter':
                    self.tuning_params['gridsearch']['n_iter'],
                    'tuning_duration':
                    self.grid_search_time,
                    'tune_eval_score':
                    score_dict
                })

            # Update model params to those discovered during tuning
            for param_type, parameters in model_params['params'].items():
                param_key = param_type.split('.')[-1]
                for k, v in best_params.items():
                    best_param_key = k.split('__')[-1]
                    if k.startswith(
                            param_key) and best_param_key in parameters.keys():
                        parameters[best_param_key] = v
            save_path = os.path.join(model_param_path, model + '.json')
            # print(f'Saving {model} params: {model_params} to {save_path}')
            with open(save_path, 'w') as outfile:
                json.dump(model_params, outfile, indent=2, cls=CATEncoder)

        except FileNotFoundError as fnfe:
            self.logger.debug(
                'ModelTrainer.save_params_to_file {} not found'.format(
                    model_path))
        except Exception as e:
            self.logger.error(
                'ModelTrainer.save_params_to_file {}:'.format(model),
                exc_info=True)
            tb = traceback.format_exc()
            print(tb)

    def save_tpot_params_to_file(self, pipeline, model_param_path, score_dict):
        try:
            model = 'TPOTClassifier'
            model_path = os.path.join(model_param_path, model + '.json')
            if not os.path.isfile(model_path):
                # Get default values
                model_path = os.path.join(
                    CONFIG.get('PATHS', 'DefaultModelDirectory'), model,
                    model + '.json')
            with open(model_path, 'r') as param_file:
                model_params = json.load(param_file)

            best_params = pipeline.get_params()

            tpot_params = model_params['tpot_params']
            # * Remove any models under params that are not TfidfVectorizers
            for param_type in list(model_params['params'].keys()):
                param_key = param_type.split('.')[1]
                if param_key != 'feature_extraction':
                    del model_params['params'][param_type]

            # * Update tfidf params to the best
            for param_type, parameters in model_params['params'].items():
                param_key = param_type.split('.')[-1]
                for k, v in best_params.items():
                    best_param_key = k.split('__')[-1]
                    if k.startswith(
                            param_key) and best_param_key in parameters.keys():
                        parameters[best_param_key] = v
            current_time = time.localtime()
            model_params['meta']['training_meta'].update({
                'last_train_date':
                time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                'train_eval_score':
                score_dict,
                'checksum':
                self.model_checksums[model]
            })

            if self.tune_models:
                model_params['meta']['tuning_meta'].update({
                    'last_tune_date':
                    time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                    'n_iter':
                    self.tuning_params['gridsearch']['n_iter'],
                    'tuning_duration':
                    self.grid_search_time,
                    'tune_eval_score':
                    score_dict
                })
            # * Now to get the new model parameters
            for name, obj in pipeline.named_steps.items():
                if name == 'TfidfVectorizer':
                    continue
                module_name = str(obj.__class__).split("'")[1]
                module_params = obj.get_params()
                model_params['params'].update({module_name: module_params})

            model_params['tpot_params'] = tpot_params

            with open(os.path.join(model_param_path, model + '.json'),
                      'w') as outfile:
                json.dump(model_params, outfile, indent=2, cls=CATEncoder)

        except FileNotFoundError as fnfe:
            self.logger.debug(
                'ModelTrainer.save_params_to_file {} not found'.format(
                    model_path))
        except Exception as e:
            self.logger.error(
                'ModelTrainer.save_params_to_file {}:'.format(model),
                exc_info=True)
            tb = traceback.format_exc()
            print(tb)

    # @pyqtSlot()

    def stop_thread(self):
        self._update_log(
            'Attempting to stop ModelTrainer.<br>Current task must complete before stopping...'
        )
        self._is_running = False

    def train_stacker(self, x, y, col_path):
        def get_ratio(row):
            """
            Returns the ratio of agreement between column values (here, predictors) in a given row.
            """
            try:
                pred_value = row.iloc[-1]
                total_same = 0.0
                col_count = float(len(row.iloc[:-1]))
                for data in row.iloc[:-1]:
                    if data == pred_value:
                        total_same += 1.0
                return total_same / col_count
            except ZeroDivisionError as zde:
                return 0
            except Exception as e:
                self.logger.error("ModelTrainer.get_ratio", exc_info=True)
                exceptionWarning(
                    'Exception occurred in ModelTrainer.get_ratio.', repr(e))

        def get_bamboozled_score(row):
            """
            Returns the difference between the number of models and the number of models that predicted incorrectly.
            The lower this value, the more bamboozling the sample.
            """
            try:
                pred_value = row.iloc[-1]
                total_wrong = 0
                col_count = len(row.iloc[:-1])
                for data in row.iloc[:-1]:
                    if data != pred_value:
                        total_wrong += 1
                return col_count - total_wrong
            except Exception as e:
                self.logger.error("ModelTrainer.get_bamboozled_score",
                                  exc_info=True)
                exceptionWarning(
                    'Exception occurred in ModelTrainer.get_bamboozled_score.',
                    repr(e))

        stacker_full_class = CONFIG.get(
            'VARIABLES', 'StackingAlgorithmCLassName').split('.')

        final_preds = np.empty(y.shape)
        stacker_module = '.'.join(stacker_full_class[0:-1])
        inst_module = importlib.import_module(stacker_module)
        stacker_class = getattr(inst_module, stacker_full_class[-1])
        stacker = stacker_class()
        if self.tuning_params['gridsearch']['tune_stacker']:
            self._update_log(
                f'Beginning tuning run on Stacker <b>{".".join(stacker_full_class)}</b>...'
            )
            rscv = RandomizedSearchCV(
                estimator=stacker,
                n_jobs=self.tuning_params['gridsearch']['n_jobs']
                if self.tuning_params['gridsearch']['n_jobs'] != 0 else None,
                cv=self.tuning_params['gridsearch']['cv'],
                n_iter=self.tuning_params['gridsearch']['n_iter'],
                pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
                verbose=CONFIG.getint('VARIABLES',
                                      'RandomizedSearchVerbosity'),
                scoring=self.tuning_params['gridsearch']['scoring'] if
                len(self.tuning_params['gridsearch']['scoring']) > 0 else None,
                refit='accuracy')
            rscv.fit(x, y)
            best_params = rscv.best_params_
            stacker = stacker_class(**best_params)
            self._update_log('Stacker tuning completed!  Re-evaluating...')

        self._update_log(
            f'Training Stacking algorithm <b>{".".join(stacker_full_class)}</b>'
        )
        skf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED)

        for train, test in skf.split(x, y):
            with joblib.parallel_backend('dask'):
                stacker.fit(x.iloc[train], y[train])
            final_preds[test] = stacker.predict(x.iloc[test])
        # stack_preds = [1 if x > .5 else 0 for x in np.nditer(final_preds)]
        self._update_log('Stacking training complete')
        stack_scores = self.get_model_scores(y, final_preds)

        table_str = '''<table>
                            <thead>
                                <tr>
                                    <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
                                </tr>
                            </thead>
                        <tbody>
                            <tr>
                    '''
        for metric, score in stack_scores.items():
            table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
        table_str += '</tr></tbody></table><br>'
        self._update_log(table_str, False, True)
        self._update_log('Retraining Stacker on full dataset')
        stacker.fit(x, y)
        save_path = os.path.join(col_path, 'Stacker')
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        save_file = os.path.join(save_path, 'Stacker.pkl')
        self._update_log(f'Saving Stacking algorithm to : {save_file}', False)
        joblib.dump(stacker, save_file, compress=1)
        self.model_checksums['Stacker'] = hashlib.md5(
            open(save_file, 'rb').read()).hexdigest()
        self._update_log(f'Stacking hash: {self.model_checksums["Stacker"]}')

        # Save particulars to file
        col_name = col_path.split('\\')[-1]  # NOTE: assumes Windows-style path separators
        stacker_info = {
            'column': col_name,
            'version_directory': self.version_directory,
            'last_train_date': time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime()),
            'train_eval_score': stack_scores,
            'model_checksums': self.model_checksums
        }
        stacker_json_save_file = os.path.join(save_path, 'Stacker.json')
        with open(stacker_json_save_file, 'w') as outfile:
            json.dump(stacker_info, outfile, indent=2)
        x[col_name + TRUTH_LABEL_SUFFIX] = y
        agreement_ratios = x.apply(get_ratio, axis=1)
        bamboozled = x.apply(get_bamboozled_score, axis=1)

        x[col_name + TAG_DELIMITER + 'agreement_ratio'] = agreement_ratios
        x[col_name + TAG_DELIMITER + 'bamboozled_score'] = bamboozled
        pc_len = len(x[x[col_name + TAG_DELIMITER +
                         'agreement_ratio'] <= DISAGREEMENT_THRESHOLD])
        bamboozled_len = len(x[x[col_name + TAG_DELIMITER +
                                 'bamboozled_score'] <= BAMBOOZLED_THRESHOLD])
        self._update_log(
            f"Found {pc_len} samples for {col_name} that fall at or below the predictor agreement threshold of {DISAGREEMENT_THRESHOLD}."
        )
        self._update_log(
            f"Found {bamboozled_len} samples for {col_name} that have a bamboozled score of {BAMBOOZLED_THRESHOLD} or below."
        )
        # print('HEAD OF X IN TRAIN_STACKER')
        # print(x.head())
        # print(x.columns)
        # ? What X is a dataframe  [col_name + CONFIG.get('VARIABLES', 'StackerLabelSuffix')] = final_preds
        self.all_predictions_df = pd.merge(self.all_predictions_df,
                                           x,
                                           how='outer',
                                           left_index=True,
                                           right_index=True)
        # print('HEAD OF all_redictions_df IN TRAIN_STACKER')
        # print(self.all_predictions_df.head())
        # print(self.all_predictions_df.columns)
        self._update_log('Run complete')
        self._update_log('<hr>', False, True)

    def _tune_model(self, x, y, model, col_path):
        model_params = self.get_params_from_file(model, col_path, True)
        if (model.lower() == 'tpotclassifier'):
            self._update_log('Begin TPOT Optimization')
            tpot_pipeline = Pipeline(
                self.get_tpot_pipeline(model_params['params'],
                                       model_params['tpot_params']))
            with joblib.parallel_backend('dask'):
                tpot_pipeline.fit(x, y)
            new_steps = []
            new_steps.append(('TfidfVectorizer',
                              tpot_pipeline.named_steps['TfidfVectorizer']))
            fitted_pipeline = tpot_pipeline.named_steps[
                'TPOTClassifier'].fitted_pipeline_
            for n, p in fitted_pipeline.named_steps.items():
                new_steps.append((n, p))
            pipeline = Pipeline(new_steps)
        else:
            gs_pipeline = Pipeline(
                self.get_pipeline(model_params['params'],
                                  include_feature_selection=False))
            self._update_log(f'Begin tuning on {model}')
            with joblib.parallel_backend('dask'):
                pipeline = self.grid_search(
                    model=model,
                    x=x,
                    y=y,
                    pipeline=gs_pipeline,
                    tuning_params=self.tuning_params,
                    n_iter=self.tuning_params['gridsearch']['n_iter'],
                    n_jobs=self.tuning_params['gridsearch']['n_jobs'],
                    include_tfidf=True).best_estimator_

        if pipeline is None:
            self._update_log(
                f'Grid search failed for {model} on task {col_path}.  Skipping...'
            )
            return False
        else:
            save_path = os.path.join(col_path, model)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            self.save_model(model, pipeline, save_path)
            return True

    def _generate_best_param_dict(self, model_param_keys, best_params):
        try:
            result_dict = {el: {} for el in model_param_keys}
            for param_type in model_param_keys:
                key = param_type.split('.')[-1]
                result_dict[key] = {
                    k: v
                    for k, v in best_params.items()
                    if k.startswith(key.split('.')[-1])
                }

            return result_dict
        except Exception as e:
            self.logger.error(
                'ModelTrainer._generate_best_param_dict {}:'.format(e),
                exc_info=True)
            tb = traceback.format_exc()
            print(tb)

    def _update_log(self, msg, include_time=True, as_html=True):
        # outbound = f'{time.strftime('%Y-%m-%d %H:%M:%S', current_time)} - {msg}<br>'
        self.signals.update_training_logger.emit(msg, include_time, as_html)