Example #1
    def __init__(self,
                 selected_models,
                 version_directory,
                 training_eval_params,
                 training_data,
                 tune_models,
                 tuning_params,
                 use_proba=False,
                 train_stacking_algorithm=True,
                 **kwargs):
        super(ModelTrainer, self).__init__()
        self.logger = logging.getLogger(__name__)
        self.signals = ModelTrainerSignals()

        self.allowed_pipeline_types = [
            'feature_extraction', 'feature_selection'
        ]
        self.version_directory = version_directory
        self.selected_models = selected_models
        self.training_eval_params = training_eval_params
        self.training_data = training_data
        self.tune_models = tune_models
        self.tuning_params = tuning_params
        self.use_proba = use_proba
        self.train_stacking_algorithm = train_stacking_algorithm
        self.kwargs = kwargs
        self.all_predictions_df = pd.DataFrame(index=self.training_data.index)
        self.grid_search_time = None
        self.model_checksums = {}
        self._is_running = True
        self.tag_suffix = CONFIG.get('VARIABLES', 'TagDelimiter') + CONFIG.get(
            'VARIABLES', 'TagDataColumnSuffix')
Example #2
    def chartSingleClassFrequency(self, data):
        """Display a bar chart of frequencies per label
            # Arguments
                data: list, List of integer values corresponding to the actual
                question score.
        """

        num_classes = self.getNumClasses(data)
        count_map = Counter(data)
        counts = [count_map[i] for i in range(num_classes)]
        total_count = sum(counts)
        majority_class_count = max(counts)
        majority_acc = round((majority_class_count / total_count), 2)
        idx = np.arange(num_classes)
        colors = []
        for count in counts:
            if count < (total_count * CONFIG.getfloat('VARIABLES', 'MinorityClassThreshold')):
                colors.append('r')
            else:
                colors.append('b')
        self.axes.cla()
        self.axes.bar(idx, counts, color=colors)
        self.axes.set_xlabel('Class')
        self.axes.set_ylabel('Number of Samples')
        self.axes.set_xticks(idx)
        self.axes.set_title(f"Majority class accuracy: {majority_acc}")
        rects = self.axes.patches
        for rect, label in zip(rects, counts):
            height = rect.get_height()
            self.axes.text(rect.get_x() + rect.get_width() / 2,
                           height + 5, label, ha='center', va='bottom')
        self.draw()
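
# A minimal standalone sketch of the counting step above, with hypothetical
# toy data; it assumes getNumClasses amounts to max(data) + 1.  Counter
# returns 0 for absent classes, so every index in range(num_classes) gets a bar.
from collections import Counter

toy_data = [0, 0, 1, 2, 2, 2]
toy_num_classes = max(toy_data) + 1
toy_counts = [Counter(toy_data)[i] for i in range(toy_num_classes)]  # [2, 1, 3]
toy_majority_acc = round(max(toy_counts) / sum(toy_counts), 2)       # 0.5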
Example #3
    def save_params_to_file(self,
                            model,
                            best_params,
                            model_param_path,
                            score_dict=None):
        # Avoid sharing a mutable default argument across calls.
        score_dict = score_dict if score_dict is not None else {}
        try:
            model_path = os.path.join(model_param_path, model + '.json')
            if not os.path.isfile(model_path):
                # Get default values
                model_path = os.path.join(
                    CONFIG.get('PATHS', 'DefaultModelDirectory'), model,
                    model + '.json')
            with open(model_path, 'r') as param_file:
                model_params = json.load(param_file)
            current_time = time.localtime()
            model_params['meta']['training_meta'].update({
                'last_train_date':
                time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                'train_eval_score':
                score_dict,
                'checksum':
                self.model_checksums[model]
            })
            if self.tune_models:
                model_params['meta']['tuning_meta'].update({
                    'last_tune_date':
                    time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                    'n_iter':
                    self.tuning_params['gridsearch']['n_iter'],
                    'tuning_duration':
                    self.grid_search_time,
                    'tune_eval_score':
                    score_dict
                })

            # Update model params to those discovered during tuning
            for param_type, parameters in model_params['params'].items():
                param_key = param_type.split('.')[-1]
                for k, v in best_params.items():
                    best_param_key = k.split('__')[-1]
                    if k.startswith(
                            param_key) and best_param_key in parameters.keys():
                        parameters[best_param_key] = v
            save_path = os.path.join(model_param_path, model + '.json')
            # print(f'Saving {model} params: {model_params} to {save_path}')
            with open(save_path, 'w') as outfile:
                json.dump(model_params, outfile, indent=2, cls=CATEncoder)

        except FileNotFoundError as fnfe:
            self.logger.debug(
                'ModelTrainer.save_params_to_file {} not found'.format(
                    model_path))
        except Exception as e:
            self.logger.error(
                'ModelTrainer.save_params_to_file {}:'.format(model),
                exc_info=True)
            tb = traceback.format_exc()
            print(tb)
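
# Standalone sketch of the best-params merge above, using hypothetical model
# and parameter names.  RandomizedSearchCV-style keys such as 'SVC__C' are
# mapped back onto the stored parameter dicts; keys that are not stored are skipped.
stored_params = {'sklearn.svm.SVC': {'C': 1.0, 'kernel': 'rbf'}}
tuned_params = {'SVC__C': 10.0, 'SVC__gamma': 0.1}  # gamma is not stored

for param_type, parameters in stored_params.items():
    param_key = param_type.split('.')[-1]        # 'SVC'
    for k, v in tuned_params.items():
        best_param_key = k.split('__')[-1]       # 'C', then 'gamma'
        if k.startswith(param_key) and best_param_key in parameters:
            parameters[best_param_key] = v       # only C is updated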
Example #4
    def get_params_from_file(self, model_name, base_path=None, tpot=False):
        '''
            Loads model parameters either from file (if the version has been saved) or grabs the defaults.
                # Arguments
                    model_name: string, model name used to specify path
                    base_path: string, optional path used for loading custom model parameters
                    tpot: bool, whether the parameters belong to a TPOT pipeline
                # Returns
                    model_params: dict, parameters from file or defaults
        '''
        try:
            if tpot or base_path is not None:
                # base_path may be None when only tpot is set; fall back to
                # the default model directory so os.path.join never sees None.
                search_base = base_path if base_path is not None else CONFIG.get(
                    'PATHS', 'DefaultModelDirectory')
                model_path = os.path.join(search_base, model_name,
                                          model_name + '.json')
                if not os.path.isfile(model_path):
                    model_path = os.path.join(
                        CONFIG.get('PATHS', 'DefaultModelDirectory'),
                        model_name, model_name + '.json')

            else:
                model_path = os.path.join(
                    CONFIG.get('PATHS', 'DefaultModelDirectory'), model_name,
                    model_name + '.json')

            with open(model_path, 'r') as param_file:
                model_params = json.load(param_file, object_hook=cat_decoder)
            return model_params
        except Exception as e:
            self.logger.error('ModelTrainer.get_params_from_file:',
                              exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb, True, False)
Example #5
    def run(self):
        apply_cols = [
            col for col in self.data.columns if col.endswith('_text')
        ]
        self.data[apply_cols] = self.data[apply_cols].applymap(
            lambda x: processText(str(x), **self.options)
        )
        if self.options['spell_correction']:
            sentences = self.data[apply_cols].applymap(
                lambda x: str(x).split()
            ).values
            sc = SpellCheck(sentences, CONFIG.getint('VARIABLES', 'TopKSpellCheck'))

            self.data[apply_cols] = self.data[apply_cols].applymap(
                lambda x: sc.correct_spelling(x)
            )
        self.preprocessing_complete.emit(self.data)
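
# Toy sketch of the column-wise applymap pattern above.  processText and
# SpellCheck are project-specific; a simple lowercasing stands in here.
import pandas as pd

toy = pd.DataFrame({'q1_text': ['Hello World', 'FOO'], 'q1_actual': [1, 0]})
text_cols = [c for c in toy.columns if c.endswith('_text')]
# Element-wise transform restricted to the text columns only.
toy[text_cols] = toy[text_cols].applymap(lambda x: str(x).lower())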
Example #6
    def __init__(self, parent=None):
        super(SelectModelWidget, self).__init__(parent)
        self.logger = logging.getLogger(__name__)
        self.parent = parent
        self.threadpool = QThreadPool()
        self.logger.info(
            f"Multithreading enabled with a maximum of {self.threadpool.maxThreadCount()} threads."
        )

        print("Multithreading with maximum %d threads" %
              self.threadpool.maxThreadCount())
        self.training_data = pd.DataFrame()
        self.training_predictions = pd.DataFrame()
        self.selected_version = CONFIG.get('PATHS', 'DefaultModelDirectory')
        self.comms = Communicate()

        self.selected_models = {'sklearn': {}, 'tensorflow': {}}
        self.model_checkboxes = []
        # * Initialize training parameter dict.
        # * Has an entry for both model base types.
        self.training_params = {
            'sklearn': {
                'type': None,
                'value': None
            },
            'tensorflow': {}
        }
        # * Init tuning param dict.
        # * Currently only using gridsearch.
        self.tuning_params = {
            'gridsearch': {
                'n_iter': 20,
                'cv': 3,
                'n_jobs': -1,
                'scoring': ['accuracy'],
                'tune_stacker': False
            }
        }

        self.sklearn_model_dialogs = []
        self.sklearn_model_dialog_btns = []
        self.sklearn_training_inputs = []

        self.tensorflow_training_inputs = []
        self.tensorflow_model_dialogs = []
        self.tensorflow_model_dialog_btns = []

        self.main_layout = QVBoxLayout()
        self.upper_hbox = QHBoxLayout()

        self.version_form = QFormLayout()
        self.header_hbox = QHBoxLayout()
        self.header_hbox.addLayout(self.version_form)
        self.header_hbox.addStretch()
        self.tune_models_chkbox = QCheckBox("Tune Models")
        self.header_hbox.addWidget(self.tune_models_chkbox)
        self.tune_models_chkbox.stateChanged.connect(
            lambda state: self._enable_tuning_ui(state))
        self.main_layout.addLayout(self.header_hbox)
        self.main_layout.addLayout(self.upper_hbox)

        self.model_vbox = QVBoxLayout()
        self.tuning_vbox = QVBoxLayout()

        self.upper_hbox.addLayout(self.model_vbox)
        self.upper_hbox.addSpacing(10)
        self.upper_hbox.addLayout(self.tuning_vbox)
        self.upper_hbox.addSpacing(200)
        # * Build sklearn ui components
        self.sklearn_hbox = QHBoxLayout()
        self.sklearn_groupbox = QGroupBox("Sklearn")
        self.sklearn_groupbox.setLayout(self.sklearn_hbox)

        self.skmodel_groupbox = QGroupBox("Model Selection")
        self.sklearn_hbox.addWidget(self.skmodel_groupbox)
        self.sklearn_model_form = QFormLayout()
        self.sklearn_model_form.setFormAlignment(Qt.AlignTop)
        self.skmodel_groupbox.setLayout(self.sklearn_model_form)

        # * Sklearn training and tuning ui components
        self.sklearn_training_groupbox = QGroupBox("Training")
        self.sklearn_training_form = QFormLayout()
        self.sklearn_training_groupbox.setLayout(self.sklearn_training_form)
        self.sklearn_hbox.addWidget(self.sklearn_training_groupbox)

        self.model_vbox.addWidget(self.sklearn_groupbox)

        # * Build Tensorflow ui components
        self.tensorflow_hbox = QHBoxLayout()
        self.tensorflow_groupbox = QGroupBox("Tensorflow")
        self.tensorflow_groupbox.setLayout(self.tensorflow_hbox)

        self.tensorflow_model_groupbox = QGroupBox("Model Selection")
        self.tensorflow_hbox.addWidget(self.tensorflow_model_groupbox)
        self.tensorflow_model_form = QFormLayout()

        self.tensorflow_model_groupbox.setLayout(self.tensorflow_model_form)
        self.tensorflow_training_groupbox = QGroupBox("Training")
        self.tensorflow_training_form = QFormLayout()
        self.tensorflow_training_groupbox.setLayout(
            self.tensorflow_training_form)
        self.tensorflow_hbox.addWidget(self.tensorflow_training_groupbox)

        # * This is the tensorflow groupbox for models and training params.
        # self.model_vbox.addWidget(self.tensorflow_groupbox)

        self.tuning_groupbox = QGroupBox("Tuning")
        self.tuning_form = QFormLayout()
        self.tuning_groupbox.setLayout(self.tuning_form)
        self.tuning_vbox.addWidget(self.tuning_groupbox)
        self.tuning_groupbox.setEnabled(False)
        self.model_form_grid = QGridLayout()

        self.setup_model_selection_ui()
        self.setup_training_ui()
        self.setup_tuning_ui()
        # * QTextEdit box for training/tuning status
        self.training_logger = QTextEdit()
        self.training_logger.setReadOnly(True)
        self.training_logger.setAcceptRichText(True)
        self.training_logger.insertHtml(
            "<i>Multithreading with maximum %d threads</i><br>" %
            self.threadpool.maxThreadCount())
        self.training_logger.setMinimumHeight(400)
        self.main_layout.addWidget(self.training_logger)
        self.clear_btn_hbox = QHBoxLayout()
        self.clear_text_btn = QPushButton('Clear')
        self.clear_text_btn.setMaximumWidth(50)
        self.clear_text_btn.clicked.connect(
            lambda: self.training_logger.clear())
        self.clear_btn_hbox.addStretch()
        self.clear_btn_hbox.addWidget(self.clear_text_btn)

        self.main_layout.addLayout(self.clear_btn_hbox)

        self.main_layout.addStretch()
        self.run_btn = QPushButton("&Train Models")
        self.run_btn.setMinimumWidth(200)
        self.run_btn.clicked.connect(lambda: self.train_models())
        self.run_btn.setEnabled(False)

        self.stop_btn = QPushButton('Sto&p')
        self.stop_btn.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Fixed)

        self.comms.enable_training_btn.connect(self.set_training_btn_state)
        self.button_hbox = QHBoxLayout()

        icon = QIcon()
        icon.addPixmap(QPixmap('icons/Programming-Save-icon.png'))
        self.save_results_btn = QPushButton()
        self.save_results_btn.setIcon(icon)
        self.save_results_btn.setEnabled(False)
        self.save_results_btn.setToolTip(
            'Save model evaluation predictions, agreement ratio, and bamboozled score'
        )
        self.save_results_btn.clicked.connect(lambda: self.save_predictions())

        self.button_hbox.addWidget(self.run_btn)
        self.button_hbox.addWidget(self.stop_btn)
        self.button_hbox.addStretch()
        self.button_hbox.addWidget(self.save_results_btn)
        self.main_layout.addLayout(self.button_hbox)
        self.setLayout(self.main_layout)

        # Trigger update to load model parameters
        self._update_version(self.version_selection.currentData())
Example #7
    def setup_model_selection_ui(self):
        """
        Setup model selection ui.

        The order of the parameters in ModelDialog matters.  model_data must come first!
        """
        self.version_selection_label = QLabel("Select version: ")
        self.version_selection = QComboBox(objectName='version_select')
        self.version_selection.setMinimumWidth(100)
        # Default models were moved to a dedicated directory; available
        # versions are discovered from BASE_VERSION_DIR below.
        available_versions = os.listdir(BASE_VERSION_DIR)
        for version in available_versions:
            v_path = os.path.join(BASE_VERSION_DIR, version)
            if os.path.isdir(v_path):
                self.version_selection.addItem(version, v_path)
        self.version_selection.currentIndexChanged.connect(
            lambda x, y=self.version_selection: self._update_version(
                y.currentData()))
        self.version_form.addRow(self.version_selection_label,
                                 self.version_selection)

        # Load base TF-IDF features
        # and feature selection data
        try:
            with open(CONFIG.get('PATHS', 'BaseTfidfDirectory'), 'r') as f:
                tfidf_data = json.load(f)
        except IOError as ioe:
            self.logger.error("Error loading base TFIDF params", exc_info=True)
            exceptionWarning(
                'Error occurred while loading base TFIDF parameters.',
                repr(ioe))
        try:
            with open(CONFIG.get('PATHS', 'BaseFeatureSeletionDirectory'),
                      'r') as f:
                self.fs_params = json.load(f)
        except IOError as ioe:
            self.logger.error("Error loading base feature selector params",
                              exc_info=True)
            exceptionWarning(
                'Error occurred while loading base feature selector parameters.',
                repr(ioe))
        # Dynamically generate ModelDialogs for each model in the base model directory.
        # Only considers *.json file extension.
        try:
            for filename in os.listdir(
                    CONFIG.get('PATHS', 'BaseModelDirectory')):
                if filename.endswith('.json'):
                    with open(
                            os.path.join(
                                CONFIG.get('PATHS', 'BaseModelDirectory'),
                                filename), 'r') as f:
                        model_data = json.load(f)
                        model = model_data['model_class']
                        model_base = model_data['model_base']
                        model_module = model_data['model_module']
                        #! The order of the arguments matters!  model_data must come first.
                        # Tensorflow models are currently disabled and skipped.
                        if model_base == 'tensorflow':
                            continue
                        if model_module == 'tpot':
                            model_dialog = TPOTModelDialog(
                                self, model_data, tfidf_data)
                        else:
                            model_dialog = SkModelDialog(
                                self, model_data, tfidf_data, self.fs_params)
                        self.comms.version_change.connect(
                            model_dialog.update_version)
                        # Initialize model as unselected
                        self.selected_models[model_base][model] = False
                        btn = QPushButton(model, objectName=model + '_btn')
                        # Partial allows the connection of dynamically generated QObjects
                        btn.clicked.connect(
                            partial(self.open_dialog, model_dialog))
                        chkbox = QCheckBox(objectName=model)
                        chkbox.stateChanged.connect(
                            lambda state, x=model, y=model_base: self.
                            _update_selected_models(x, y, state))
                        # * Unreachable while tensorflow models are skipped
                        # * above; kept for when they are re-enabled.
                        if model_base == 'tensorflow':
                            self.tensorflow_model_form.addRow(chkbox, btn)
                            self.tensorflow_model_dialogs.append(model_dialog)
                            self.tensorflow_model_dialog_btns.append(btn)
                        else:
                            self.sklearn_model_form.addRow(chkbox, btn)
                            self.sklearn_model_dialogs.append(model_dialog)
                            self.sklearn_model_dialog_btns.append(btn)
                        self.model_checkboxes.append(chkbox)
        except OSError as ose:
            self.logger.error("OSError opening model config files",
                              exc_info=True)
            exceptionWarning('OSError opening model config files!', ose)
            tb = traceback.format_exc()
            print(tb)
        except Exception as e:
            self.logger.error("Error opening model config files",
                              exc_info=True)
            exceptionWarning('Error occurred.', e)
            tb = traceback.format_exc()
            print(tb)
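
# The partial(...) connection above sidesteps Python's late-binding closure
# pitfall (the lambda connections do the same via default arguments).  A
# minimal illustration, independent of Qt:
from functools import partial

handlers_lambda = [lambda: print(name) for name in ('a', 'b')]
handlers_partial = [partial(print, name) for name in ('a', 'b')]

handlers_lambda[0]()   # prints 'b' -- the closure sees the final loop value
handlers_partial[0]()  # prints 'a' -- partial bound the value at creation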
Example #8
    def load_file(self, f_path):
        """
        Load data from a CSV file to the workspace.
        Column 0 is used for the index column.
        chardet attempts to determine encoding if file is not utf-8.
            # Arguments
                f_path(String): The filename selected via open_file
        """
        # FIXME: Reset status bar when new data is loaded.
        try:
            self.graph.clear_graph()
            self.available_column_model.loadData([], include_labels=False)
            self.prediction_data = pd.read_csv(
                f_path,
                encoding='utf-8',
                index_col=CONFIG.getint(
                    'VARIABLES',
                    'IndexColumn'))  #TODO: user defined index column
        except UnicodeDecodeError as ude:
            self.logger.warning("UnicodeDecode error opening file",
                                exc_info=True)
            self.comms.update_statusbar.emit(
                "Attempting to determine file encoding...")
            detector = UniversalDetector()
            try:
                for line in open(f_path, 'rb'):
                    detector.feed(line)
                    if detector.done:
                        break
                detector.close()
                print("chardet determined encoding type to be {}".format(
                    detector.result['encoding']))
                self.prediction_data = pd.read_csv(
                    f_path,
                    encoding=detector.result['encoding'],
                    index_col=CONFIG.getint('VARIABLES', 'IndexColumn'))
            except Exception as e:
                self.logger.error("Error detecing encoding", exc_info=True)
                exceptionWarning("Exception has occured.", exception=e)
        except IOError as ioe:
            self.logger.error("IOError detecting encoding", exc_info=True)
            exceptionWarning("IO Exception occured while opening file.",
                             exception=ioe)
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Error occured opening file.", exception=e)

        try:
            columns = self.prediction_data.columns
            self.available_columns = []
            self.columns_with_truth = []

            self.ground_truth_columns = self.prediction_data.columns[
                ~self.prediction_data.isna().any()].tolist()

            for column in columns:
                if column.lower().endswith("text"):
                    self.available_columns.append(column)
                    column_tag = column.split('__')[0]
                    if (column_tag + '__actual' in self.ground_truth_columns):
                        self.columns_with_truth.append(column)

            if self.available_columns:
                self.available_column_model.loadData(self.available_columns,
                                                     include_labels=False)

            if self.columns_with_truth:
                self.available_column_model.setTruthData(
                    self.columns_with_truth)

            self.comms.update_statusbar.emit("CSV loaded.")
        except pd.errors.EmptyDataError as ede:
            exceptionWarning('Empty Data Error.\n', exception=ede)
        except Exception as e:
            self.logger.error("Error loading dataframe", exc_info=True)
            exceptionWarning("Exception occured.  PredictWidget.load_file.",
                             exception=e)
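
# The encoding fallback above is the standard incremental UniversalDetector
# pattern; in isolation (the path here is hypothetical):
from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
with open('some_file.csv', 'rb') as f:
    for line in f:
        detector.feed(line)
        if detector.done:  # enough evidence gathered to decide
            break
detector.close()
encoding = detector.result['encoding']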
Example #9
from functools import partial
import hashlib

from chardet.universaldetector import UniversalDetector
import pandas as pd
import pkg_resources

from PyQt5.QtCore import QObject, pyqtSignal
from PyQt5.QtWidgets import QWidget

from package.evaluate.EvaluateTableModel import EvaluateTableModel
from package.utils.catutils import exceptionWarning, clearLayout
from package.utils.DataframeTableModel import DataframeTableModel
from package.utils.GraphWidget import GraphWidget
from package.utils.config import CONFIG

from sklearn.metrics import f1_score, accuracy_score, cohen_kappa_score, precision_score, recall_score

DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
PRED_SUFFIX = CONFIG.get('VARIABLES', 'PredictedLabelSuffix')
TRUTH_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')
AVG_TYPE = CONFIG.get('VARIABLES', 'MetricsAverageType')


class Communicate(QObject):
    version_change = pyqtSignal(str)
    enable_eval_btn = pyqtSignal(bool)
    data_load = pyqtSignal(pd.DataFrame)
    update_statusbar = pyqtSignal(str)
    update_progressbar = pyqtSignal(int, bool)


class EvaluateWidget(QWidget):
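
# A hedged sketch of how the imported metrics are presumably combined with
# the configured AVG_TYPE; the label arrays here are illustrative only.
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score

y_true = [0, 1, 2, 2, 1]
y_pred = [0, 2, 2, 2, 1]
scores = {
    'accuracy': accuracy_score(y_true, y_pred),
    'f1_score': f1_score(y_true, y_pred, average=AVG_TYPE),
    'cohen_kappa': cohen_kappa_score(y_true, y_pred),
}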
Example #10
    def train_stacker(self, x, y, col_path):
        def get_ratio(row):
            """
            Returns the ratio of agreement between column values (here, predictors) in a given row.
            """
            try:
                pred_value = row.iloc[-1]
                total_same = 0.0
                col_count = float(len(row.iloc[:-1]))
                for data in row.iloc[:-1]:
                    if data == pred_value:
                        total_same += 1.0
                return total_same / col_count
            except ZeroDivisionError as zde:
                return 0
            except Exception as e:
                self.logger.error("ModelTrainer.get_ratio", exc_info=True)
                exceptionWarning(
                    'Exception occured in ModelTrainer.get_ratio.', repr(e))

        def get_bamboozled_score(row):
            """
            Returns the number of models that predicted the sample correctly,
            i.e. the column count minus the number of incorrect predictions.
            The lower this value, the more bamboozling the sample.
            """
            try:
                pred_value = row.iloc[-1]
                total_wrong = 0
                col_count = len(row.iloc[:-1])
                for data in row.iloc[:-1]:
                    if data != pred_value:
                        total_wrong += 1
                return col_count - total_wrong
            except Exception as e:
                self.logger.error("ModelTrainer.get_bamboozled_score",
                                  exc_info=True)
                exceptionWarning(
                    'Exception occured in ModelTrainer.get_bamboozled_score.',
                    repr(e))

        stacker_full_class = CONFIG.get(
            'VARIABLES', 'StackingAlgorithmClassName').split('.')

        final_preds = np.empty(y.shape)
        stacker_module = '.'.join(stacker_full_class[0:-1])
        inst_module = importlib.import_module(stacker_module)
        stacker_class = getattr(inst_module, stacker_full_class[-1])
        stacker = stacker_class()
        if self.tuning_params['gridsearch']['tune_stacker']:
            self._update_log(
                f'Beginning tuning run on Stacker <b>{".".join(stacker_full_class)}</b>...'
            )
            # NOTE: RandomizedSearchCV requires param_distributions as its
            # second argument; the stacker's distributions must be supplied here.
            rscv = RandomizedSearchCV(
                estimator=stacker,
                n_jobs=self.tuning_params['gridsearch']['n_jobs']
                if self.tuning_params['gridsearch']['n_jobs'] != 0 else None,
                cv=self.tuning_params['gridsearch']['cv'],
                n_iter=self.tuning_params['gridsearch']['n_iter'],
                pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
                verbose=CONFIG.getint('VARIABLES',
                                      'RandomizedSearchVerbosity'),
                scoring=self.tuning_params['gridsearch']['scoring'] if
                len(self.tuning_params['gridsearch']['scoring']) > 0 else None,
                refit='accuracy')
            rscv.fit(x, y)
            best_params = rscv.best_params_
            stacker = stacker_class(**best_params)
            self._update_log('Stacker tuning completed!  Re-evaluating...')

        self._update_log(
            f'Training Stacking algorithm <b>{".".join(stacker_full_class)}</b>'
        )
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

        for train, test in skf.split(x, y):
            with joblib.parallel_backend('dask'):
                stacker.fit(x.iloc[train], y[train])
            final_preds[test] = stacker.predict(x.iloc[test])
        self._update_log('Stacking training complete')
        stack_scores = self.get_model_scores(y, final_preds)

        table_str = '''<table>
                            <thead>
                                <tr>
                                    <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
                                </tr>
                            </thead>
                        <tbody>
                            <tr>
                    '''
        for metric, score in stack_scores.items():
            table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
        table_str += '</tr></tbody></table><br>'
        self._update_log(table_str, False, True)
        self._update_log('Retraining Stacker on full dataset')
        stacker.fit(x, y)
        save_path = os.path.join(col_path, 'Stacker')
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        save_file = os.path.join(save_path, 'Stacker.pkl')
        self._update_log(f'Saving Stacking algorithm to : {save_file}', False)
        joblib.dump(stacker, save_file, compress=1)
        self.model_checksums['Stacker'] = hashlib.md5(
            open(save_file, 'rb').read()).hexdigest()
        self._update_log(f'Stacking hash: {self.model_checksums["Stacker"]}')

        # Save particulars to file
        col_name = os.path.basename(col_path)
        stacker_info = {
            'column': col_name,
            'version_directory': self.version_directory,
            'last_train_date': time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime()),
            'train_eval_score': stack_scores,
            'model_checksums': self.model_checksums
        }
        stacker_json_save_file = os.path.join(save_path, 'Stacker.json')
        with open(stacker_json_save_file, 'w') as outfile:
            json.dump(stacker_info, outfile, indent=2)
        x[col_name + TRUTH_LABEL_SUFFIX] = y
        agreement_ratios = x.apply(get_ratio, axis=1)
        bamboozled = x.apply(get_bamboozled_score, axis=1)

        x[col_name + TAG_DELIMITER + 'agreement_ratio'] = agreement_ratios
        x[col_name + TAG_DELIMITER + 'bamboozled_score'] = bamboozled
        pc_len = len(x[x[col_name + TAG_DELIMITER +
                         'agreement_ratio'] <= DISAGREEMENT_THRESHOLD])
        bamboozled_len = len(x[x[col_name + TAG_DELIMITER +
                                 'bamboozled_score'] <= BAMBOOZLED_THRESHOLD])
        self._update_log(
            f"Found {pc_len} samples for {col_name} that fall at or below the {DISAGREEMENT_THRESHOLD} predictor agreement."
        )
        self._update_log(
            f"Found {bamboozled_len} samples for {col_name} that have a bamboozled score of {BAMBOOZLED_THRESHOLD} or below."
        )
        self.all_predictions_df = pd.merge(self.all_predictions_df,
                                           x,
                                           how='outer',
                                           left_index=True,
                                           right_index=True)
        self._update_log('Run complete')
        self._update_log('<hr>', False, True)
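
# Self-contained sketch of the row-wise helpers above on a toy frame,
# assuming (as in train_stacker) that the last column holds the ground truth.
import pandas as pd

toy = pd.DataFrame({'m1': [1, 0, 1], 'm2': [1, 1, 0], 'truth': [1, 0, 0]})
# Fraction of predictor columns that agree with the truth column.
toy_agreement = toy.apply(
    lambda row: (row.iloc[:-1] == row.iloc[-1]).mean(), axis=1)
# Count of predictors that got the sample right (low = bamboozling).
toy_bamboozled = toy.apply(
    lambda row: (row.iloc[:-1] == row.iloc[-1]).sum(), axis=1)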
Example #11
    def save_tpot_params_to_file(self, pipeline, model_param_path, score_dict):
        try:
            model = 'TPOTClassifier'
            model_path = os.path.join(model_param_path, model + '.json')
            if not os.path.isfile(model_path):
                # Get default values
                model_path = os.path.join(
                    CONFIG.get('PATHS', 'DefaultModelDirectory'), model,
                    model + '.json')
            with open(model_path, 'r') as param_file:
                model_params = json.load(param_file)

            best_params = pipeline.get_params()

            tpot_params = model_params['tpot_params']
            # * Remove any models under params that are not TfidfVectorizers
            for param_type in list(model_params['params'].keys()):
                param_key = param_type.split('.')[1]
                if param_key != 'feature_extraction':
                    del model_params['params'][param_type]

            # * Update tfidf params to the best
            for param_type, parameters in model_params['params'].items():
                param_key = param_type.split('.')[-1]
                for k, v in best_params.items():
                    best_param_key = k.split('__')[-1]
                    if k.startswith(
                            param_key) and best_param_key in parameters.keys():
                        parameters[best_param_key] = v
            current_time = time.localtime()
            model_params['meta']['training_meta'].update({
                'last_train_date':
                time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                'train_eval_score':
                score_dict,
                'checksum':
                self.model_checksums[model]
            })

            if self.tune_models:
                model_params['meta']['tuning_meta'].update({
                    'last_tune_date':
                    time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                    'n_iter':
                    self.tuning_params['gridsearch']['n_iter'],
                    'tuning_duration':
                    self.grid_search_time,
                    'tune_eval_score':
                    score_dict
                })
            # * Now to get the new model parameters
            for name, obj in pipeline.named_steps.items():
                if name == 'TfidfVectorizer':
                    continue
                module_name = str(obj.__class__).split("'")[1]
                module_params = obj.get_params()
                model_params['params'].update({module_name: module_params})

            model_params['tpot_params'] = tpot_params

            with open(os.path.join(model_param_path, model + '.json'),
                      'w') as outfile:
                json.dump(model_params, outfile, indent=2, cls=CATEncoder)

        except FileNotFoundError as fnfe:
            self.logger.debug(
                'ModelTrainer.save_tpot_params_to_file {} not found'.format(
                    model_path))
        except Exception as e:
            self.logger.error(
                'ModelTrainer.save_tpot_params_to_file {}:'.format(model),
                exc_info=True)
            tb = traceback.format_exc()
            print(tb)
Example #12
    def grid_search(self,
                    model,
                    x,
                    y,
                    pipeline,
                    tuning_params,
                    n_jobs=-1,
                    n_iter=20,
                    scoring=None,
                    include_tfidf=False,
                    keras_params=None):
        '''Performs randomized grid search on the selected pipeline.

            # Arguments

                model: string, name of classifier in pipeline
                x: pandas.DataFrame, training data
                y: numpy.array, training labels
                pipeline: sklearn.pipeline.Pipeline, pipeline object containing feature extractors, feature selectors and estimator
                tuning_params: dict, gridsearch settings (cv, n_jobs, scoring, etc.)
                n_jobs: int, number of jobs to run in parallel
                n_iter: int, number of iterations to perform search
                scoring: list, scoring metrics to be used by the evaluator
                include_tfidf: bool, flag to indicate tfidf is included in the pipeline
                keras_params: dict, parameters necessary for model training outside of the regular hyperparams, e.g. input_shape, num_classes, num_features
        '''
        try:
            start_time = time.time()
            filepath = os.path.join(CONFIG.get('PATHS', 'BaseModelDirectory'),
                                    model + '.json')
            with open(filepath, 'r') as f:
                model_data = json.load(f, object_hook=cat_decoder)

            grid_params = {}
            default_params = model_data[model]

            for param_types, types in default_params.items():
                for t, params in types.items():
                    if params['tunable']:
                        param_name = model + '__' + t
                        if params['type'] == 'dropdown':
                            param_options = list(params['options'].values())
                        elif params['type'] == 'double':
                            param_options = scipy.stats.expon(
                                scale=params['step_size'])
                        elif params['type'] == 'int':
                            param_options = scipy.stats.randint(
                                params['min'], params['max'] + 1)
                        elif params['type'] == 'range':
                            param_options = [(1, 1), (1, 2), (1, 3), (1, 4)]
                        else:
                            # Mirror the tfidf loop below for unknown types.
                            param_options = None
                        grid_params.update({param_name: param_options})
                    else:
                        continue

            if include_tfidf:
                with open(CONFIG.get('PATHS', 'BaseTfidfDirectory'), 'r') as f:
                    model_data = json.load(f, object_hook=cat_decoder)
                model_class = model_data['model_class']
                default_params = model_data[model_class]

                for param_types, types in default_params.items():
                    for t, params in types.items():
                        if params['tunable']:
                            param_name = model_class + '__' + t
                            if params['type'] == 'dropdown':
                                param_options = list(
                                    params['options'].values())
                            elif params['type'] == 'double':
                                param_options = scipy.stats.expon(
                                    scale=params['step_size'])
                            elif params['type'] == 'int':
                                param_options = scipy.stats.randint(
                                    params['min'], params['max'] + 1)
                            elif params['type'] == 'range':
                                param_options = [(1, 1), (1, 2), (1, 3),
                                                 (1, 4)]
                            else:
                                param_options = None
                            grid_params.update({param_name: param_options})
                        else:
                            continue
            # Remnant from __TENSORFLOW work.
            # if keras_params:
            #     updated_key_dict = {f'{model}__{k}':
            #         [v] for k, v in keras_params.items()}
            #     grid_params.update(updated_key_dict)

            self._update_log(f'Beginning RandomizedSearchCV on {model}...')
            rscv = RandomizedSearchCV(
                pipeline,
                grid_params,
                n_jobs=tuning_params['gridsearch']['n_jobs']
                if tuning_params['gridsearch']['n_jobs'] != 0 else None,
                cv=tuning_params['gridsearch']['cv'],
                n_iter=n_iter,
                pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
                verbose=CONFIG.getint('VARIABLES',
                                      'RandomizedSearchVerbosity'),
                scoring=tuning_params['gridsearch']['scoring']
                if len(tuning_params['gridsearch']['scoring']) > 0 else None,
                refit='accuracy')
            #   refit='accuracy' if len(tuning_params['gridsearch']['scoring']) > 0 else None)  # ! FIXME: Should we allow other, non accuracy metrics here?
            with joblib.parallel_backend('dask'):
                rscv.fit(x, y)
            self.grid_search_time = time.time() - start_time
            self._update_log(
                f'RandomizedSearchCV on {model} completed in {self.grid_search_time:.2f}s'
            )
            self._update_log(f'Best score for {model}: {rscv.best_score_}',
                             False)
            return rscv

        except FileNotFoundError as fnfe:
            self.logger.debug(
                'ModelTrainer.grid_search {} not found'.format(filepath))
        except Exception as e:
            self.logger.error('ModelTrainer.grid_search {}:'.format(model),
                              exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb)
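
# The grid built above mixes explicit option lists ('dropdown') with scipy
# distributions ('double'/'int') that RandomizedSearchCV samples from.  A
# minimal illustration with hypothetical settings and no pipeline prefixes:
import scipy.stats
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

toy_grid = {
    'C': scipy.stats.expon(scale=1.0),         # 'double' -> exponential dist
    'max_iter': scipy.stats.randint(50, 201),  # 'int' -> uniform integers
    'solver': ['lbfgs', 'liblinear'],          # 'dropdown' -> options list
}
X_toy, y_toy = make_classification(n_samples=60, random_state=0)
RandomizedSearchCV(LogisticRegression(), toy_grid, n_iter=5, cv=3).fit(X_toy, y_toy)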
Example #13
    def run(self):
        self._update_log('Beginning ModelTrain run')
        # * Run through an enumeration of columns.  The second argument to
        # * enumerate tells Python where to begin the index count; here, 1 for our offset.
        try:
            for col_idx, col in enumerate(self.training_data.columns, 1):
                if col.endswith(self.tag_suffix):
                    self._update_log(f'Current classification task: {col}',
                                     False)
                    col_label = col.split(
                        CONFIG.get('VARIABLES', 'TagDelimiter'))[0]
                    col_path = os.path.join(self.version_directory, col_label)
                    # * Find and drop any samples missing an index
                    missing_idx_count = self.training_data.index.isna().sum()
                    if missing_idx_count > 0:
                        self._update_log(
                            f"<b>Found {missing_idx_count} samples missing a value for index</b> "
                            f"(index_col = {CONFIG.get('VARIABLES', 'IndexColumn')}).  "
                            "Removing those samples...")
                        valid_indexes = self.training_data.index.dropna()
                        self.training_data = self.training_data[
                            self.training_data.index.isin(valid_indexes)]
                        self._update_log(
                            f'Shape of dataset after removal: {self.training_data.shape}'
                        )
                    # * Fill na text samples with 'unanswered' and na labels with 0
                    label_col_name = self.training_data.columns[col_idx]
                    fill_dict = {col: 'unanswered', label_col_name: 0}
                    self.training_data.fillna(value=fill_dict, inplace=True)
                    x = self.training_data[col].copy()
                    y = self.training_data[
                        self.training_data.columns[col_idx]].copy().values

                    results = pd.DataFrame(index=self.training_data.index)
                    results[TRUTH_LABEL_SUFFIX] = y
                    preds = np.empty(y.shape)
                    probs = np.empty(shape=(y.shape[0], len(np.unique(y))))

                    # * Initialize sklearn evaluation parameters
                    sk_eval_type = self.training_eval_params['sklearn']['type']
                    sk_eval_value = self.training_eval_params['sklearn'][
                        'value']
                    # * SKLEARN
                    for model, selected in self.selected_models[
                            'sklearn'].items():
                        if not self._is_running:
                            self.signals.training_complete.emit(pd.DataFrame())
                            break
                        if selected:
                            try:
                                if self.tune_models:
                                    self._tune_model(x, y, model, col_path)
                                model_params = self.get_params_from_file(
                                    model, col_path)
                                self._update_log(f'Begin training {model}')
                                pipeline = Pipeline(
                                    self.get_pipeline(model_params['params']))
                                # Targets used for scoring; replaced by the
                                # hold-out labels in the test_split branch.
                                eval_y = y
                                try:
                                    if sk_eval_type == 'cv':
                                        skf = StratifiedKFold(
                                            n_splits=sk_eval_value,
                                            shuffle=True,
                                            random_state=RANDOM_SEED)
                                        for train, test in skf.split(x, y):
                                            with joblib.parallel_backend(
                                                    'dask'):
                                                preds[test] = pipeline.fit(
                                                    x.iloc[train],
                                                    y[train]).predict(
                                                        x.iloc[test])
                                            if self.use_proba and hasattr(
                                                    pipeline, 'predict_proba'):
                                                try:
                                                    probs[
                                                        test] = pipeline.predict_proba(
                                                            x.iloc[test])
                                                except AttributeError:
                                                    self.logger.debug(
                                                        '{} does not support predict_proba'
                                                        .format(model))
                                                    print(
                                                        model,
                                                        'does not support predict_proba'
                                                    )
                                            else:
                                                probs = np.array([])
                                    elif sk_eval_type == 'test_split':
                                        x_train, x_test, y_train, y_test = train_test_split(
                                            x,
                                            y,
                                            test_size=sk_eval_value,
                                            stratify=y,
                                            random_state=RANDOM_SEED)
                                        with joblib.parallel_backend('dask'):
                                            pipeline.fit(x_train, y_train)
                                        # Evaluate on the held-out split only.
                                        preds = pipeline.predict(x_test)
                                        eval_y = y_test
                                    else:
                                        self._update_log(
                                            'No evaluation type chosen.')
                                except (KeyboardInterrupt, SystemExit):
                                    raise
                                except Exception:
                                    self.logger.warning(
                                        '{} threw an exception during fit.  '
                                        'Possible error with joblib multithreading.'
                                        .format(model),
                                        exc_info=True)
                                    tb = traceback.format_exc()
                                    print(tb)
                                    self._update_log(
                                        '{} threw an exception during fit.  '
                                        'Possible error with joblib multithreading.'
                                        .format(model), True, False)
                                model_scores = self.get_model_scores(
                                    eval_y, preds)

                                self._update_log(
                                    f'Task completed on <b>{model}</b>.')
                                table_str = '''<table>
                                                    <thead>
                                                        <tr>
                                                            <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
                                                        </tr>
                                                    </thead>
                                                <tbody>
                                                    <tr>
                                            '''
                                for metric, score in model_scores.items():
                                    table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
                                table_str += '</tr></tbody></table><br>'
                                if sk_eval_type is not None:
                                    self._update_log(table_str, False, True)
                                self._update_log(
                                    f'Training {model} on full dataset')
                                with joblib.parallel_backend('dask'):
                                    pipeline.fit(x, y)

                                pred_col_name = col_label + TAG_DELIMITER + model + PRED_LABEL_SUFFIX
                                prob_col_name = col_label + TAG_DELIMITER + model + PROB_LABEL_SUFFIX
                                # * Hold-out predictions cover only the test split,
                                # * so per-sample columns are stored for CV runs only.
                                if len(preds) == len(results):
                                    results[pred_col_name] = preds.astype(int)
                                    # If predicting probabilities and the probability
                                    # array has values, use those values for the results.
                                    if self.use_proba and probs.size:
                                        results[prob_col_name] = np.amax(probs,
                                                                         axis=1)

                                save_path = os.path.join(col_path, model)
                                if not os.path.exists(save_path):
                                    os.makedirs(save_path)
                                self.save_model(model, pipeline, save_path,
                                                model_scores)
                            except (KeyboardInterrupt, SystemExit):
                                raise
                            except Exception as e:
                                self.logger.error(f'ModelTrainer.run {model}:',
                                                  exc_info=True)
                                tb = traceback.format_exc()
                                print(tb)
                                self._update_log(tb)
                    # * Tensorflow training would reside here.
                    try:
                        if self.train_stacking_algorithm and self._is_running:
                            self.train_stacker(
                                results.drop(TRUTH_LABEL_SUFFIX, axis=1),
                                results[TRUTH_LABEL_SUFFIX].values, col_path)
                        else:
                            self._update_log('Skipping Stacker training.')
                    except ValueError as ve:
                        self.signals.training_complete.emit(pd.DataFrame())
                        self._update_log(
                            f'Unable to train Stacking algorithm on {col_label}.'
                        )
                        tb = traceback.format_exc()
                        print(tb)
                    except Exception as e:
                        self.logger.error('ModelTrainer.run (Stacker):',
                                          exc_info=True)
                        tb = traceback.format_exc()
                        print(tb)
                        self._update_log(tb)
            self._is_running = False
            self.signals.training_complete.emit(self.all_predictions_df)

        except Exception as e:
            self.signals.training_complete.emit(pd.DataFrame())
            self.logger.error('ModelTrainer.run (General):', exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb)
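
# The cv branch above follows the standard out-of-fold prediction pattern:
# every sample is predicted exactly once, by a model that never saw it during
# fitting.  A compact standalone sketch with toy data:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X_toy, y_toy = make_classification(n_samples=100, random_state=1337)
oof_preds = np.empty(y_toy.shape)
for train, test in StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1337).split(X_toy, y_toy):
    oof_preds[test] = LogisticRegression().fit(
        X_toy[train], y_toy[train]).predict(X_toy[test])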
Example #14
import pandas as pd

from PyQt5.QtCore import QObject, pyqtSignal

import package.utils.training_utils as tu
from package.utils.catutils import CATEncoder, cat_decoder, exceptionWarning
from package.utils.config import CONFIG
import package.utils.embedding_utils as embed_utils
# import package.utils.keras_models as keras_models
# import package.utils.SequenceTransformer as seq_trans

RANDOM_SEED = 1337
TOP_K = 20000
MAX_SEQUENCE_LENGTH = 1500
BASE_MODEL_DIR = './package/data/base_models'
BASE_TFIDF_DIR = './package/data/feature_extractors/TfidfVectorizer.json'
INPUT_SHAPE = (0, 0)

TAG_DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
PRED_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'PredictedLabelSuffix')
PROB_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'ProbabilityLabelSuffix')
TRUTH_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')
STACKER_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'StackerLabelSuffix')
DISAGREEMENT_THRESHOLD = CONFIG.getfloat('VARIABLES', 'DisagreementThreshold')
BAMBOOZLED_THRESHOLD = CONFIG.getint('VARIABLES', 'BamboozledThreshold')


class ModelTrainerSignals(QObject):
    training_complete = pyqtSignal(pd.DataFrame)
    tuning_complete = pyqtSignal(bool, dict)
    update_progressbar = pyqtSignal(int, bool)
    update_training_logger = pyqtSignal(str, bool, bool)

Example #15
import logging

import pandas as pd

from PyQt5.QtCore import pyqtSignal
from PyQt5.QtWidgets import QWidget

from chardet.universaldetector import UniversalDetector

from package.utils.catutils import exceptionWarning, clearLayout
from package.utils.preprocess_text import processText, get_avg_words_per_sample
from package.utils.spellcheck import SpellCheck
from package.utils.DataframeTableModel import DataframeTableModel
from package.utils.AttributeTableModel import AttributeTableModel
from package.utils.GraphWidget import GraphWidget
from package.utils.config import CONFIG

"""DataLoader imports CSV file and returns a dataframe with the appropriate columns.
For training data, DI will consider the nth column as a training sample
and nth+1 as ground truth.
CSV files must be formatted accordingly.
"""
TAG_DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
DATA_COLUMN_SUFFIX = CONFIG.get('VARIABLES', 'TrainingDataColumnSuffix')
TRUTH_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')

class DataLoader(QWidget):
    """
    TODO: Refactor this monstrosity into functions to setup UI
    """
    data_load = pyqtSignal(pd.DataFrame)
    update_statusbar = pyqtSignal(str)
    update_progressbar = pyqtSignal(int, bool)

    def __init__(self, parent=None):
        super(DataLoader, self).__init__(parent)
        self.logger = logging.getLogger(__name__)
        # self.logger.setLevel(logging.DEBUG)
Example #16
import errno
import logging

import pandas as pd

from PyQt5.QtCore import (Qt, pyqtSlot, pyqtSignal)
from PyQt5.QtWidgets import (QApplication, QHBoxLayout, QDialog, QHeaderView, QAction,
                               QMainWindow, QSizePolicy, QProgressBar, QWidget,
                               QVBoxLayout, QFormLayout, QGroupBox, QLineEdit,
                               QLabel, QDialogButtonBox, QMessageBox, QComboBox, QPushButton)

from package.train.TrainWidget import TrainWidget
from package.utils.catutils import exceptionWarning
from package.utils.config import CONFIG


VERSION_BASE_DIR = CONFIG.get('PATHS', 'BaseVersionDirectory')
DEFAULT_QUESTION_LABELS = CONFIG.get('VARIABLES', 'DefaultQuestionLabels').split(',')


class CatTrain(QMainWindow):
    """ The central widget for the training component of CATScore
        Most of the functionality is contained in this class
    """
    def __init__(self, parent=None):
        super(CatTrain, self).__init__(parent)
        self.logger = logging.getLogger(__name__)
        self.title = 'CAT Train'
        self.setWindowTitle(self.title)
        geometry = QApplication.desktop().availableGeometry(self)
        parent_left = self.parent().geometry().left()
        parent_top = self.parent().geometry().top()