def run(self):
    # Apply text preprocessing to every column tagged with the '_text' suffix.
    apply_cols = [
        col for col in self.data.columns if col.endswith('_text')
    ]
    self.data[apply_cols] = self.data[apply_cols].applymap(
        lambda x: processText(str(x), **self.options)
    )
    if self.options['spell_correction']:
        # Tokenize each cell so SpellCheck can build its frequency model.
        sentences = self.data[apply_cols].applymap(
            lambda x: str(x).split()
        ).values
        sc = SpellCheck(sentences,
                        CONFIG.getint('VARIABLES', 'TopKSpellCheck'))
        self.data[apply_cols] = self.data[apply_cols].applymap(
            lambda x: sc.correct_spelling(x)
        )
    self.preprocessing_complete.emit(self.data)
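# Illustrative sketch (not part of the worker above): the '_text'-suffix
# column convention in miniature. The column names, toy DataFrame, and the
# whitespace-collapsing lambda are hypothetical stand-ins for
# processText/SpellCheck, shown only to clarify the applymap pattern.
def _demo_text_column_preprocessing():
    import pandas as pd

    df = pd.DataFrame({
        'subject__text': ['Hello  WORLD', 'SOME   text'],
        'label': [0, 1],
    })
    apply_cols = [c for c in df.columns if c.endswith('_text')]
    # Stand-in for processText: lower-case and collapse whitespace.
    df[apply_cols] = df[apply_cols].applymap(
        lambda x: ' '.join(str(x).lower().split())
    )
    return df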
def load_file(self, f_path):
    """
    Load data from a CSV file into the workspace.
    The index column is taken from the VARIABLES.IndexColumn config value.
    If the file is not utf-8, chardet attempts to determine the encoding.
        # Attributes
            f_path(String): the filename selected via open_file
    """
    # FIXME: Reset status bar when new data is loaded.
    try:
        self.graph.clear_graph()
        self.available_column_model.loadData([], include_labels=False)
        self.prediction_data = pd.read_csv(
            f_path,
            encoding='utf-8',
            index_col=CONFIG.getint(
                'VARIABLES', 'IndexColumn'))  # TODO: user-defined index column
    except UnicodeDecodeError:
        self.logger.warning("UnicodeDecode error opening file",
                            exc_info=True)
        self.comms.update_statusbar.emit(
            "Attempting to determine file encoding...")
        detector = UniversalDetector()
        try:
            # Feed the detector incrementally; stop as soon as it is confident.
            with open(f_path, 'rb') as raw_file:
                for line in raw_file:
                    detector.feed(line)
                    if detector.done:
                        break
            detector.close()
            print("chardet determined encoding type to be {}".format(
                detector.result['encoding']))
            self.prediction_data = pd.read_csv(
                f_path,
                encoding=detector.result['encoding'],
                index_col=CONFIG.getint('VARIABLES', 'IndexColumn'))
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Exception has occurred.", exception=e)
    except IOError as ioe:
        self.logger.error("IOError detecting encoding", exc_info=True)
        exceptionWarning("IO Exception occurred while opening file.",
                         exception=ioe)
    except Exception as e:
        self.logger.error("Error detecting encoding", exc_info=True)
        exceptionWarning("Error occurred opening file.", exception=e)

    try:
        columns = self.prediction_data.columns
        self.available_columns = []
        self.columns_with_truth = []
        # Columns with no missing values are treated as ground-truth columns.
        self.ground_truth_columns = self.prediction_data.columns[
            ~self.prediction_data.isna().any()].tolist()
        for column in columns:
            if column.lower().endswith("text"):
                self.available_columns.append(column)
                column_tag = column.split('__')[0]
                if column_tag + '__actual' in self.ground_truth_columns:
                    self.columns_with_truth.append(column)

        if self.available_columns:
            self.available_column_model.loadData(self.available_columns,
                                                 include_labels=False)
            if self.columns_with_truth:
                self.available_column_model.setTruthData(
                    self.columns_with_truth)
            self.comms.update_statusbar.emit("CSV loaded.")
    except pd.errors.EmptyDataError as ede:
        exceptionWarning('Empty Data Error.\n', exception=ede)
    except Exception as e:
        self.logger.error("Error loading dataframe", exc_info=True)
        exceptionWarning("Exception occurred in PredictWidget.load_file.",
                         exception=e)
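# Illustrative sketch (standalone): incremental encoding detection with
# chardet's UniversalDetector, as used in the fallback above. The function
# name and default path are hypothetical examples; only chardet is assumed.
def _demo_detect_encoding(path='some_file.csv'):
    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    with open(path, 'rb') as raw_file:
        for line in raw_file:
            detector.feed(line)
            if detector.done:  # detector is confident; stop reading early
                break
    detector.close()
    # result is a dict like {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}
    return detector.result['encoding']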
def train_stacker(self, x, y, col_path):
    def get_ratio(row):
        """
        Returns the ratio of agreement between column values (here,
        predictors) in a given row.
        """
        try:
            pred_value = row.iloc[-1]
            total_same = 0.0
            col_count = float(len(row.iloc[:-1]))
            for data in row.iloc[:-1]:
                if data == pred_value:
                    total_same += 1.0
            return total_same / col_count
        except ZeroDivisionError:
            return 0
        except Exception as e:
            self.logger.error("ModelTrainer.get_ratio", exc_info=True)
            exceptionWarning('Exception occurred in ModelTrainer.get_ratio.',
                             exception=e)

    def get_bamboozled_score(row):
        """
        Returns the difference between the number of models and the number
        of models that predicted incorrectly, i.e. the number of models
        that got the sample right. The lower this value, the more
        bamboozling the sample.
        """
        try:
            pred_value = row.iloc[-1]
            total_wrong = 0
            col_count = len(row.iloc[:-1])
            for data in row.iloc[:-1]:
                if data != pred_value:
                    total_wrong += 1
            return col_count - total_wrong
        except Exception as e:
            self.logger.error("ModelTrainer.get_bamboozled_score",
                              exc_info=True)
            exceptionWarning(
                'Exception occurred in ModelTrainer.get_bamboozled_score.',
                exception=e)

    # Resolve the stacking algorithm class from its dotted path in config.
    stacker_full_class = CONFIG.get(
        'VARIABLES', 'StackingAlgorithmCLassName').split('.')
    final_preds = np.empty(y.shape)
    stacker_module = '.'.join(stacker_full_class[:-1])
    inst_module = importlib.import_module(stacker_module)
    stacker_class = getattr(inst_module, stacker_full_class[-1])
    stacker = stacker_class()

    if self.tuning_params['gridsearch']['tune_stacker']:
        self._update_log(
            f'Beginning tuning run on Stacker <b>{".".join(stacker_full_class)}</b>...'
        )
        rscv = RandomizedSearchCV(
            estimator=stacker,
            n_jobs=self.tuning_params['gridsearch']['n_jobs']
            if self.tuning_params['gridsearch']['n_jobs'] != 0 else None,
            cv=self.tuning_params['gridsearch']['cv'],
            n_iter=self.tuning_params['gridsearch']['n_iter'],
            pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
            verbose=CONFIG.getint('VARIABLES', 'RandomizedSearchVerbosity'),
            scoring=self.tuning_params['gridsearch']['scoring']
            if len(self.tuning_params['gridsearch']['scoring']) > 0 else None,
            refit='accuracy')
        rscv.fit(x, y)
        best_params = rscv.best_params_
        stacker = stacker_class(**best_params)
        self._update_log('Stacker tuning completed! Re-evaluating...')
    self._update_log(
        f'Training Stacking algorithm <b>{".".join(stacker_full_class)}</b>'
    )
    # Generate out-of-fold predictions so the stacker is evaluated on
    # samples it was not trained on.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    for train, test in skf.split(x, y):
        with joblib.parallel_backend('dask'):
            stacker.fit(x.iloc[train], y[train])
        final_preds[test] = stacker.predict(x.iloc[test])
    self._update_log('Stacking training complete')
    stack_scores = self.get_model_scores(y, final_preds)

    table_str = '''<table>
        <thead>
            <tr>
                <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
            </tr>
        </thead>
        <tbody>
            <tr>
    '''
    for metric, score in stack_scores.items():
        table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
    table_str += '</tr></tbody></table><br>'
    self._update_log(table_str, False, True)

    self._update_log('Retraining Stacker on full dataset')
    stacker.fit(x, y)
    save_path = os.path.join(col_path, 'Stacker')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_file = os.path.join(save_path, 'Stacker.pkl')
    self._update_log(f'Saving Stacking algorithm to: {save_file}', False)
    joblib.dump(stacker, save_file, compress=1)
    with open(save_file, 'rb') as model_file:
        self.model_checksums['Stacker'] = hashlib.md5(
            model_file.read()).hexdigest()
    self._update_log(f'Stacking hash: {self.model_checksums["Stacker"]}')

    # Save particulars to file
    col_name = os.path.basename(col_path)
    stacker_info = {
        'column': col_name,
        'version_directory': self.version_directory,
        'last_train_date': time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime()),
        'train_eval_score': stack_scores,
        'model_checksums': self.model_checksums
    }
    stacker_json_save_file = os.path.join(save_path, 'Stacker.json')
    with open(stacker_json_save_file, 'w') as outfile:
        json.dump(stacker_info, outfile, indent=2)

    # Flag samples whose predictors disagree or were widely fooled.
    x[col_name + TRUTH_LABEL_SUFFIX] = y
    agreement_ratios = x.apply(get_ratio, axis=1)
    bamboozled = x.apply(get_bamboozled_score, axis=1)
    x[col_name + TAG_DELIMITER + 'agreement_ratio'] = agreement_ratios
    x[col_name + TAG_DELIMITER + 'bamboozled_score'] = bamboozled
    pc_len = len(x[x[col_name + TAG_DELIMITER + 'agreement_ratio'] <=
                   DISAGREEMENT_THRESHOLD])
    bamboozled_len = len(x[x[col_name + TAG_DELIMITER + 'bamboozled_score'] <=
                           BAMBOOZLED_THRESHOLD])
    self._update_log(
        f'Found {pc_len} samples for {col_name} that fall at or below the '
        f'{DISAGREEMENT_THRESHOLD} predictor agreement threshold.'
    )
    self._update_log(
        f'Found {bamboozled_len} samples for {col_name} that have a '
        f'bamboozled score of {BAMBOOZLED_THRESHOLD} or below.'
    )
    # x is a DataFrame; attach the stacker's out-of-fold predictions to it.
    x[col_name + STACKER_LABEL_SUFFIX] = final_preds
    self.all_predictions_df = pd.merge(self.all_predictions_df,
                                       x,
                                       how='outer',
                                       left_index=True,
                                       right_index=True)
    self._update_log('Run complete')
    self._update_log('<hr>', False, True)
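# Illustrative sketch (standalone): the out-of-fold prediction pattern used
# in train_stacker, with a toy dataset and LogisticRegression standing in
# for the configured stacking algorithm. Everything here is hypothetical
# demo scaffolding; only scikit-learn and numpy are assumed.
def _demo_out_of_fold_predictions():
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold

    x, y = make_classification(n_samples=200, random_state=1337)
    final_preds = np.empty(y.shape)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
    for train, test in skf.split(x, y):
        model = LogisticRegression()
        model.fit(x[train], y[train])
        # Each sample is predicted by a model that never saw it in training.
        final_preds[test] = model.predict(x[test])
    return final_preds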
def grid_search(self,
                model,
                x,
                y,
                pipeline,
                tuning_params,
                n_jobs=-1,
                n_iter=20,
                scoring=None,
                include_tfidf=False,
                keras_params=None):
    '''Performs randomized hyperparameter search on the selected pipeline.
        # Arguments
            model: string, name of classifier in pipeline
            x: pandas.DataFrame, training data
            y: numpy.array, training labels
            pipeline: sklearn.pipeline.Pipeline, pipeline object containing
                feature extractors, feature selectors, and the estimator
            tuning_params: dict, gridsearch options (n_jobs, cv, scoring, ...)
            n_jobs: int, number of jobs to run in parallel
            n_iter: int, number of parameter settings sampled by the search
            scoring: list, scoring metrics to be used by the evaluator
            include_tfidf: bool, flag to indicate tfidf is included in the
                pipeline
            keras_params: dict, parameters necessary for model training
                outside of the regular hyperparams, e.g. input_shape,
                num_classes, num_features
    '''
    try:
        start_time = time.time()
        filepath = os.path.join(CONFIG.get('PATHS', 'BaseModelDirectory'),
                                model + '.json')
        with open(filepath, 'r') as f:
            model_data = json.load(f, object_hook=cat_decoder)

        # Build the parameter distributions from the model's JSON description.
        grid_params = {}
        default_params = model_data[model]
        for param_types, types in default_params.items():
            for t, params in types.items():
                if not params['tunable']:
                    continue
                param_name = model + '__' + t
                if params['type'] == 'dropdown':
                    param_options = list(params['options'].values())
                elif params['type'] == 'double':
                    param_options = scipy.stats.expon(
                        scale=params['step_size'])
                elif params['type'] == 'int':
                    param_options = scipy.stats.randint(
                        params['min'], params['max'] + 1)
                elif params['type'] == 'range':
                    param_options = [(1, 1), (1, 2), (1, 3), (1, 4)]
                else:
                    continue
                grid_params.update({param_name: param_options})

        if include_tfidf:
            with open(CONFIG.get('PATHS', 'BaseTfidfDirectory'), 'r') as f:
                model_data = json.load(f, object_hook=cat_decoder)
            model_class = model_data['model_class']
            default_params = model_data[model_class]
            for param_types, types in default_params.items():
                for t, params in types.items():
                    if not params['tunable']:
                        continue
                    param_name = model_class + '__' + t
                    if params['type'] == 'dropdown':
                        param_options = list(params['options'].values())
                    elif params['type'] == 'double':
                        param_options = scipy.stats.expon(
                            scale=params['step_size'])
                    elif params['type'] == 'int':
                        param_options = scipy.stats.randint(
                            params['min'], params['max'] + 1)
                    elif params['type'] == 'range':
                        param_options = [(1, 1), (1, 2), (1, 3), (1, 4)]
                    else:
                        continue
                    grid_params.update({param_name: param_options})

        # Remnant from __TENSORFLOW work.
        # if keras_params:
        #     updated_key_dict = {f'{model}__{k}': [v]
        #                         for k, v in keras_params.items()}
        #     grid_params.update(updated_key_dict)

        self._update_log(f'Beginning RandomizedSearchCV on {model}...')
        rscv = RandomizedSearchCV(
            pipeline,
            grid_params,
            n_jobs=tuning_params['gridsearch']['n_jobs']
            if tuning_params['gridsearch']['n_jobs'] != 0 else None,
            cv=tuning_params['gridsearch']['cv'],
            n_iter=n_iter,
            pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
            verbose=CONFIG.getint('VARIABLES', 'RandomizedSearchVerbosity'),
            scoring=tuning_params['gridsearch']['scoring']
            if len(tuning_params['gridsearch']['scoring']) > 0 else None,
            refit='accuracy')
        # ! FIXME: Should we allow other, non-accuracy metrics for refit?
        with joblib.parallel_backend('dask'):
            rscv.fit(x, y)
        self.grid_search_time = time.time() - start_time
        self._update_log(
            f'RandomizedSearchCV on {model} completed in '
            f'{self.grid_search_time:.2f}s'
        )
        self._update_log(f'Best score for {model}: {rscv.best_score_}',
                         False)
        return rscv
    except FileNotFoundError:
        self.logger.debug(
            'ModelTrainer.grid_search: {} not found'.format(filepath))
    except Exception as e:
        self.logger.error('ModelTrainer.grid_search {}:'.format(model),
                          exc_info=True)
        tb = traceback.format_exc()
        print(tb)
        self._update_log(tb)
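# Illustrative sketch (standalone): how scipy.stats distributions plug into
# RandomizedSearchCV, mirroring the 'double' and 'int' parameter types
# handled above. The estimator and parameter names are toy examples, not
# values from the model JSON files.
def _demo_randomized_search():
    import scipy.stats
    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import RandomizedSearchCV

    x, y = make_classification(n_samples=200, random_state=1337)
    grid_params = {
        'alpha': scipy.stats.expon(scale=0.001),   # 'double' -> exponential
        'max_iter': scipy.stats.randint(5, 101),   # 'int' -> uniform integers
    }
    rscv = RandomizedSearchCV(SGDClassifier(), grid_params,
                              n_iter=10, cv=3, random_state=1337)
    rscv.fit(x, y)
    return rscv.best_params_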
# import package.utils.SequenceTransformer as seq_trans

RANDOM_SEED = 1337
TOP_K = 20000
MAX_SEQUENCE_LENGTH = 1500
BASE_MODEL_DIR = './package/data/base_models'
BASE_TFIDF_DIR = './package/data/feature_extractors/TfidfVectorizer.json'
INPUT_SHAPE = (0, 0)

TAG_DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
PRED_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'PredictedLabelSuffix')
PROB_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'ProbabilityLabelSuffix')
TRUTH_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')
STACKER_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'StackerLabelSuffix')
DISAGREEMENT_THRESHOLD = CONFIG.getfloat('VARIABLES', 'DisagreementThreshold')
BAMBOOZLED_THRESHOLD = CONFIG.getint('VARIABLES', 'BamboozledThreshold')


class ModelTrainerSignals(QObject):
    # Signals emitted by ModelTrainer so the GUI thread can react safely.
    training_complete = pyqtSignal(pd.DataFrame)
    tuning_complete = pyqtSignal(bool, dict)
    update_progressbar = pyqtSignal(int, bool)
    update_training_logger = pyqtSignal(str, bool, bool)


class ModelTrainer(QRunnable):
    '''
    QRunnable tasked with running all model training/tuning.
    This could potentially take days to complete.
    '''
    # Setting parallel_backend to threading allows for multi-threading from
    # a thread: the GUI will not freeze and remains responsive while
    # training runs in the background.