Example No. 1
    def run(self):
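        # Collect every column whose name ends in '_text'; only those are preprocessed.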
        apply_cols = [
            col for col in self.data.columns if col.endswith('_text')
        ]
        self.data[apply_cols] = self.data[apply_cols].applymap(
            lambda x: processText(str(x), **self.options)
        )
        if self.options['spell_correction']:
            sentences = self.data[apply_cols].applymap(
                lambda x: str(x).split()
            ).values
            sc = SpellCheck(sentences, CONFIG.getint('VARIABLES', 'TopKSpellCheck'))

            self.data[apply_cols] = self.data[apply_cols].applymap(
                lambda x: sc.correct_spelling(x)
            )
        self.preprocessing_complete.emit(self.data)
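The pattern above applies a cleaning function element-wise to every '_text' column. Below is a minimal, self-contained sketch of the same idea, with a stand-in clean_text in place of the project-specific processText and SpellCheck. (Note that applymap was renamed DataFrame.map in pandas 2.1; the older name is kept here to match the example.)

import pandas as pd

def clean_text(value):
    # Stand-in cleaner: lowercase and collapse whitespace.
    return ' '.join(str(value).lower().split())

df = pd.DataFrame({
    'id': [1, 2],
    'comment_text': ['  Hello   WORLD ', 'Foo\tBar'],
    'label': [0, 1],
})
text_cols = [col for col in df.columns if col.endswith('_text')]
df[text_cols] = df[text_cols].applymap(clean_text)
print(df)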
Example No. 2
    def load_file(self, f_path):
        """
        Load data from a CSV file to the workspace.
        Column 0 is used for the index column.
        chardet attempts to determine encoding if file is not utf-8.
            # Attributes
                f_path(String): The filename selected via open_file
        """
        # FIXME: Reset status bar when new data is loaded.
        try:
            self.graph.clear_graph()
            self.available_column_model.loadData([], include_labels=False)
            self.prediction_data = pd.read_csv(
                f_path,
                encoding='utf-8',
                index_col=CONFIG.getint(
                    'VARIABLES',
                    'IndexColumn'))  #TODO: user defined index column
        except UnicodeDecodeError as ude:
            self.logger.warning("UnicodeDecode error opening file",
                                exc_info=True)
            self.comms.update_statusbar.emit(
                "Attempting to determine file encoding...")
            detector = UniversalDetector()
            try:
                with open(f_path, 'rb') as raw_file:
                    for line in raw_file:
                        detector.feed(line)
                        if detector.done:
                            break
                detector.close()
                print("chardet determined encoding type to be {}".format(
                    detector.result['encoding']))
                self.prediction_data = pd.read_csv(
                    f_path, encoding=detector.result['encoding'], index_col=0)
            except Exception as e:
                self.logger.error("Error detecing encoding", exc_info=True)
                exceptionWarning("Exception has occured.", exception=e)
        except IOError as ioe:
            self.logger.error("IOError detecting encoding", exc_info=True)
            exceptionWarning("IO Exception occured while opening file.",
                             exception=ioe)
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Error occured opening file.", exception=e)

        try:
            columns = self.prediction_data.columns
            self.available_columns = []
            self.columns_with_truth = []

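            # Columns with no NaN values in any row are treated as
            # ground-truth ('__actual') columns.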
            self.ground_truth_columns = self.prediction_data.columns[
                ~self.prediction_data.isna().any()].tolist()

            for column in columns:
                if column.lower().endswith("text"):
                    self.available_columns.append(column)
                    column_tag = column.split('__')[0]
                    if column_tag + '__actual' in self.ground_truth_columns:
                        self.columns_with_truth.append(column)

            if self.available_columns:
                self.available_column_model.loadData(self.available_columns,
                                                     include_labels=False)

            if self.columns_with_truth:
                self.available_column_model.setTruthData(
                    self.columns_with_truth)
                # self.full_text_count.setText(str(self.prediction_data.shape[0]))
                # self.display_selected_row(None)
                # self.select_all_btn.setEnabled(True)
                # self.deselect_all_btn.setEnabled(True)

            self.comms.update_statusbar.emit("CSV loaded.")
        except pd.errors.EmptyDataError as ede:
            exceptionWarning('Empty Data Error.\n', exception=ede)
        except Exception as e:
            self.logger.error("Error loading dataframe", exc_info=True)
            exceptionWarning("Exception occured.  PredictWidget.load_file.",
                             exception=e)
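The fallback above leans on chardet's incremental detector. Here is a minimal sketch of just the detection step, using chardet's real UniversalDetector API; the helper name detect_encoding is ours, not the project's:

from chardet.universaldetector import UniversalDetector

def detect_encoding(path):
    # Feed the file line by line until the detector is confident.
    detector = UniversalDetector()
    with open(path, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']

# encoding = detect_encoding('data.csv')  # e.g. 'ISO-8859-1'

Feeding incrementally and stopping at detector.done avoids reading the whole file once a confident guess is reached.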
Example No. 3
    def train_stacker(self, x, y, col_path):
        def get_ratio(row):
            """
            Returns the ratio of agreement between column values (here, predictors) in a given row.
            """
            try:
                pred_value = row.iloc[-1]
                total_same = 0.0
                col_count = float(len(row.iloc[:-1]))
                for data in row.iloc[:-1]:
                    if data == pred_value:
                        total_same += 1.0
                return total_same / col_count
            except ZeroDivisionError as zde:
                return 0
            except Exception as e:
                self.logger.error("ModelTrainer.get_ratio", exc_info=True)
                exceptionWarning(
                    'Exception occurred in ModelTrainer.get_ratio.', repr(e))

        def get_bamboozled_score(row):
            """
            Returns the difference between the number of models and the number of models who predicted incorrectly.
            The higher this value, the more bamboozling the sample
            """
            try:
                pred_value = row.iloc[-1]
                total_wrong = 0
                col_count = len(row.iloc[:-1])
                for data in row.iloc[:-1]:
                    if data != pred_value:
                        total_wrong += 1
                return col_count - total_wrong
            except Exception as e:
                self.logger.error("ModelTrainer.get_bamboozled_score",
                                  exc_info=True)
                exceptionWarning(
                    'Exception occurred in ModelTrainer.get_bamboozled_score.',
                    repr(e))

        stacker_full_class = CONFIG.get(
            'VARIABLES', 'StackingAlgorithmClassName').split('.')

        final_preds = np.empty(y.shape)
        stacker_module = '.'.join(stacker_full_class[0:-1])
        inst_module = importlib.import_module(stacker_module)
        stacker_class = getattr(inst_module, stacker_full_class[-1])
        stacker = stacker_class()
        if self.tuning_params['gridsearch']['tune_stacker']:
            self._update_log(
                f'Beginning tuning run on Stacker <b>{".".join(stacker_full_class)}</b>...'
            )
            rscv = RandomizedSearchCV(
                estimator=stacker,
                # FIXME: param_distributions is a required argument of
                # RandomizedSearchCV; the stacker's search space is missing here.
                n_jobs=self.tuning_params['gridsearch']['n_jobs']
                if self.tuning_params['gridsearch']['n_jobs'] != 0 else None,
                cv=self.tuning_params['gridsearch']['cv'],
                n_iter=self.tuning_params['gridsearch']['n_iter'],
                pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
                verbose=CONFIG.getint('VARIABLES',
                                      'RandomizedSearchVerbosity'),
                scoring=self.tuning_params['gridsearch']['scoring'] if
                len(self.tuning_params['gridsearch']['scoring']) > 0 else None,
                refit='accuracy')
            rscv.fit(x, y)
            best_params = rscv.best_params_
            stacker = stacker_class(**best_params)
            self._update_log('Stacker tuning completed!  Re-evaluating...')

        self._update_log(
            f'Training Stacking algorithm <b>{".".join(stacker_full_class)}</b>'
        )
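        # Out-of-fold predictions: each test fold is predicted by a stacker
        # trained only on the remaining folds, so final_preds never contains
        # a prediction made on rows the model saw during fitting.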
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

        for train, test in skf.split(x, y):
            with joblib.parallel_backend('dask'):
                stacker.fit(x.iloc[train], y[train])
            final_preds[test] = stacker.predict(x.iloc[test])
        self._update_log('Stacking training complete')
        stack_scores = self.get_model_scores(y, final_preds)

        table_str = '''<table>
                            <thead>
                                <tr>
                                    <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
                                </tr>
                            </thead>
                        <tbody>
                            <tr>
                    '''
        for metric, score in stack_scores.items():
            table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
        table_str += '</tr></tbody></table><br>'
        self._update_log(table_str, False, True)
        self._update_log('Retraining Stacker on full dataset')
        stacker.fit(x, y)
        save_path = os.path.join(col_path, 'Stacker')
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        save_file = os.path.join(save_path, 'Stacker.pkl')
        self._update_log(f'Saving Stacking algorithm to: {save_file}', False)
        joblib.dump(stacker, save_file, compress=1)
        with open(save_file, 'rb') as saved_model:
            self.model_checksums['Stacker'] = hashlib.md5(
                saved_model.read()).hexdigest()
        self._update_log(f'Stacking hash: {self.model_checksums["Stacker"]}')

        # Save particulars to file
        col_name = os.path.basename(col_path)
        stacker_info = {
            'column': col_name,
            'version_directory': self.version_directory,
            'last_train_date': time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime()),
            'train_eval_score': stack_scores,
            'model_checksums': self.model_checksums
        }
        stacker_json_save_file = os.path.join(save_path, 'Stacker.json')
        with open(stacker_json_save_file, 'w') as outfile:
            json.dump(stacker_info, outfile, indent=2)
        x[col_name + TRUTH_LABEL_SUFFIX] = y
        agreement_ratios = x.apply(get_ratio, axis=1)
        bamboozled = x.apply(get_bamboozled_score, axis=1)

        x[col_name + TAG_DELIMITER + 'agreement_ratio'] = agreement_ratios
        x[col_name + TAG_DELIMITER + 'bamboozled_score'] = bamboozled
        pc_len = len(x[x[col_name + TAG_DELIMITER +
                         'agreement_ratio'] <= DISAGREEMENT_THRESHOLD])
        bamboozled_len = len(x[x[col_name + TAG_DELIMITER +
                                 'bamboozled_score'] <= BAMBOOZLED_THRESHOLD])
        self._update_log(
            f"Found {pc_len} samples for {col_name} at or below the {DISAGREEMENT_THRESHOLD} predictor agreement ratio."
        )
        self._update_log(
            f"Found {bamboozled_len} samples for {col_name} that have a bamboozled score of {BAMBOOZLED_THRESHOLD} or below."
        )
        self.all_predictions_df = pd.merge(self.all_predictions_df,
                                           x,
                                           how='outer',
                                           left_index=True,
                                           right_index=True)
        self._update_log('Run complete')
        self._update_log('<hr>', False, True)
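To make the two per-row metrics concrete, here is a small demo of the same logic outside the class. The column names are illustrative, and the last column plays the role of the appended truth label; the functions are compact reimplementations, not the originals:

import pandas as pd

def agreement_ratio(row):
    # Fraction of predictor columns that match the truth value (last column).
    truth, preds = row.iloc[-1], row.iloc[:-1]
    return float((preds == truth).sum()) / len(preds) if len(preds) else 0.0

def bamboozled_score(row):
    # Number of predictor columns that match the truth value.
    truth, preds = row.iloc[-1], row.iloc[:-1]
    return int((preds == truth).sum())

df = pd.DataFrame({
    'model_a': [1, 0, 1],
    'model_b': [1, 1, 0],
    'model_c': [1, 0, 0],
    'truth':   [1, 0, 1],
})
print(df.apply(agreement_ratio, axis=1))   # approx 1.0, 0.667, 0.333
print(df.apply(bamboozled_score, axis=1))  # 3, 2, 1

Rows where few predictors agree with the truth (low ratio, low score) are the "bamboozling" samples that the thresholds above flag for inspection.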
Example No. 4
    def grid_search(self,
                    model,
                    x,
                    y,
                    pipeline,
                    tuning_params,
                    n_jobs=-1,
                    n_iter=20,
                    scoring=None,
                    include_tfidf=False,
                    keras_params=None):
        '''Performs grid search on selected pipeline.

            # Arguments

                model: string, name of classifier in pipeline
                x: pandas.DataFrame, training data
                y: numpy.array, training labels
                pipeline: sklearn.pipeline.Pipeline, pipeline object containing feature extractors, feature selectors, and an estimator
                n_jobs: int, number of jobs to run in parallel
                n_iter: int, number of search iterations to perform
                scoring: list, scoring metrics to be used by the evaluator
                include_tfidf: bool, flag indicating that tfidf is included in the pipeline
                keras_params: dict, parameters necessary for model training outside of the regular hyperparams, e.g. input_shape, num_classes, num_features
        '''
        try:
            start_time = time.time()
            filepath = os.path.join(CONFIG.get('PATHS', 'BaseModelDirectory'),
                                    model + '.json')
            with open(filepath, 'r') as f:
                model_data = json.load(f, object_hook=cat_decoder)

            grid_params = {}
            default_params = model_data[model]

            for param_types, types in default_params.items():
                for t, params in types.items():
                    if params['tunable']:
                        param_name = model + '__' + t
                        if params['type'] == 'dropdown':
                            param_options = list(params['options'].values())
                        elif params['type'] == 'double':
                            param_options = scipy.stats.expon(
                                scale=params['step_size'])
                        elif params['type'] == 'int':
                            param_options = scipy.stats.randint(
                                params['min'], params['max'] + 1)
                        elif params['type'] == 'range':
                            param_options = [(1, 1), (1, 2), (1, 3), (1, 4)]
                        else:
                            # Unknown parameter type; mirror the tfidf branch
                            # below and fall back to None.
                            param_options = None
                        grid_params.update({param_name: param_options})
                    else:
                        continue

            if include_tfidf:
                with open(CONFIG.get('PATHS', 'BaseTfidfDirectory'), 'r') as f:
                    model_data = json.load(f, object_hook=cat_decoder)
                model_class = model_data['model_class']
                default_params = model_data[model_class]

                for param_types, types in default_params.items():
                    for t, params in types.items():
                        if params['tunable']:
                            param_name = model_class + '__' + t
                            if params['type'] == 'dropdown':
                                param_options = list(
                                    params['options'].values())
                            elif params['type'] == 'double':
                                param_options = scipy.stats.expon(
                                    scale=params['step_size'])
                            elif params['type'] == 'int':
                                param_options = scipy.stats.randint(
                                    params['min'], params['max'] + 1)
                            elif params['type'] == 'range':
                                param_options = [(1, 1), (1, 2), (1, 3),
                                                 (1, 4)]
                            else:
                                param_options = None
                            grid_params.update({param_name: param_options})
                        else:
                            continue
            # Remnant from __TENSORFLOW work.
            # if keras_params:
            #     updated_key_dict = {f'{model}__{k}':
            #         [v] for k, v in keras_params.items()}
            #     grid_params.update(updated_key_dict)

            self._update_log(f'Beginning RandomizedSearchCV on {model}...')
            rscv = RandomizedSearchCV(
                pipeline,
                grid_params,
                n_jobs=tuning_params['gridsearch']['n_jobs']
                if tuning_params['gridsearch']['n_jobs'] != 0 else None,
                cv=tuning_params['gridsearch']['cv'],
                n_iter=n_iter,
                pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
                verbose=CONFIG.getint('VARIABLES',
                                      'RandomizedSearchVerbosity'),
                scoring=tuning_params['gridsearch']['scoring']
                if len(tuning_params['gridsearch']['scoring']) > 0 else None,
                refit='accuracy')
            #   refit='accuracy' if len(tuning_params['gridsearch']['scoring']) > 0 else None)  # ! FIXME: Should we allow other, non accuracy metrics here?
            with joblib.parallel_backend('dask'):
                rscv.fit(x, y)
            self.grid_search_time = time.time() - start_time
            self._update_log(
                f'RandomizedSearchCV on {model} completed in {self.grid_search_time:.2f}s'
            )
            self._update_log(f'Best score for {model}: {rscv.best_score_}',
                             False)
            return rscv

        except FileNotFoundError as fnfe:
            self.logger.debug(
                f'ModelTrainer.grid_search {filepath} not found')
        except Exception as e:
            self.logger.error('ModelTrainer.grid_search {}:'.format(model),
                              exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb)
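Stripped of the config plumbing, the search above samples pipeline hyperparameters from lists and frozen scipy distributions. Below is a minimal, self-contained sketch; the estimator choice, parameter ranges, and toy data are stand-ins, not the project's:

import scipy.stats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('TfidfVectorizer', TfidfVectorizer()),
    ('SGDClassifier', SGDClassifier()),
])
# Keys are step-name-prefixed, mirroring model + '__' + param above.
grid_params = {
    'TfidfVectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],  # 'range' type
    'SGDClassifier__alpha': scipy.stats.expon(scale=0.001),    # 'double' type
    'SGDClassifier__max_iter': scipy.stats.randint(5, 101),    # 'int' type
}
texts = ['good movie', 'bad film', 'great plot', 'awful acting'] * 5
labels = [1, 0, 1, 0] * 5
rscv = RandomizedSearchCV(pipe, grid_params, n_iter=5, cv=2,
                          scoring=['accuracy', 'f1'], refit='accuracy')
rscv.fit(texts, labels)
print(rscv.best_score_, rscv.best_params_)

Lists are sampled uniformly while frozen scipy distributions are sampled via their rvs method, which is why the 'double' and 'int' parameter types map to expon and randint above.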
Example No. 5
# import package.utils.SequenceTransformer as seq_trans

RANDOM_SEED = 1337
TOP_K = 20000
MAX_SEQUENCE_LENGTH = 1500
BASE_MODEL_DIR = './package/data/base_models'
BASE_TFIDF_DIR = './package/data/feature_extractors/TfidfVectorizer.json'
INPUT_SHAPE = (0, 0)

TAG_DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
PRED_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'PredictedLabelSuffix')
PROB_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'ProbabilityLabelSuffix')
TRUTH_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')
STACKER_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'StackerLabelSuffix')
DISAGREEMENT_THRESHOLD = CONFIG.getfloat('VARIABLES', 'DisagreementThreshold')
BAMBOOZLED_THRESHOLD = CONFIG.getint('VARIABLES', 'BamboozledThreshold')


class ModelTrainerSignals(QObject):
    training_complete = pyqtSignal(pd.DataFrame)
    tuning_complete = pyqtSignal(bool, dict)
    update_progressbar = pyqtSignal(int, bool)
    update_training_logger = pyqtSignal(str, bool, bool)


class ModelTrainer(QRunnable):
    '''
    QRunnable tasked with running all model training/tuning.
    This could potentially take days to complete.
    '''
    # Setting parallel_backend to threading allows for multi-threading from a thread.  GUI will not freeze and