def test_make_results_for_models(self, model_dir, name):
    """Regenerate plots and printed results for one stored model.

    Loads the trained model and its cross-validation results from
    ``model_dir``, renders the log-loss/error, feature-importance and
    cross-validation plots into the same directory and finally runs the
    prediction check on the loaded data set.

    Args:
        model_dir: Directory holding ``trained_model.pkl`` and
            ``cv_results.pkl``; plots are saved here as well.
        name: Human-readable model name used in the output and the plots.
    """
    print(f'Model: {name}')

    trained = cache.load_obj(f'{model_dir}/trained_model.pkl')
    plot_logloss_and_error(trained, name=name, save_dir=model_dir)

    features, labels, feature_names = load_data()
    cv_results = cache.load_obj(f'{model_dir}/cv_results.pkl')

    plot_fi(trained, feature_names, scale_name, sort=True, save_dir=model_dir)
    plot_cross_validation(cv_results, name=name, save_dir=model_dir)
    print(results_to_print(cv_results))

    _test_prediction(trained, features, labels)
    # Smoke test: reaching this line without an exception is the success criterion.
    self.assertTrue(True)
def extract_features(csv_object):
    """Extract per-entry features of a CSV data object into an Excel file.

    Args:
        csv_object: The ``CsvData`` instance whose cached, preprocessed
            entries should be turned into a feature table.

    Returns:
        ``True`` when the features are available (already extracted or
        extracted now), ``False`` when ``csv_object`` is not a ``CsvData``
        or the cached data contain something other than ``DataEntry``.
    """
    start = datetime.now()

    if not isinstance(csv_object, CsvData):
        return False
    if csv_object.features_extracted:
        return True

    rows = []
    timestamps = []
    for item in load_obj(csv_object.cached_data_path):
        if not isinstance(item, DataEntry):
            logger.error(
                f'Preprocessed data for {csv_object} on path {csv_object.cached_data_path} are broken!'
            )
            return False
        # Processing stops at the first entry without accelerometer data.
        if not item.accelerometer:
            break
        row = item.get_features()
        if csv_object.training_data:
            # Training data additionally carry the sleep label.
            row['SLEEP'] = item.sleep
        rows.append(row)
        timestamps.append(item.time)

    DataFrame(rows, index=timestamps).to_excel(csv_object.features_data_path)
    csv_object.features_extracted = True
    csv_object.save()

    end = datetime.now()
    logger.info(
        f'Features for {csv_object.filename} extracted in {end - start}')
    return True
def create_structure():
    """Build the list of (subject, csv data, sleep diary day) triples.

    For every subject and each of its sleep diary days the matching CSV
    data file is looked up: a single file is used directly, otherwise the
    first file whose cached prediction has rows between ``t1`` and ``t4``
    of the diary day is chosen. Days without a matching file are skipped.

    Returns:
        list: Tuples ``(subject, matching_data, sleep_day)``.
    """
    structure = []
    for subject in Subject.objects.all():
        sleep_days = SleepDiaryDay.objects.filter(subject=subject)
        if not sleep_days.exists():
            continue

        # The CSV files depend on the subject only, so query them once per
        # subject instead of once per diary day.
        data = CsvData.objects.filter(subject=subject)

        for sleep_day in sleep_days:
            assert isinstance(sleep_day, SleepDiaryDay)

            if not data.exists():  # no CSV data
                logger.warning(
                    f'Missing csv data for subject {subject} with {len(sleep_days)} sleep diary days'
                )
                continue

            # Reset for every diary day; otherwise a match found for a
            # previous day would leak into a day without any match.
            matching_data = None
            if len(data) == 1:  # single CSV data file
                matching_data = data.first()
            else:  # data need to be found
                s = sleep_day.t1
                e = sleep_day.t4
                for d in data:
                    assert isinstance(d, CsvData)
                    pred = cache.load_obj(d.cached_prediction_path)
                    interval = pred[s:e]
                    if len(interval) > 0:  # matching data found
                        matching_data = d
                        break

            if matching_data is None:
                continue

            structure.append((subject, matching_data, sleep_day))
            # Log the file that was actually matched, not just the first one.
            logger.debug(
                f'{subject.code} - {matching_data.filename} - {sleep_day.date} added to validation structure '
            )
    return structure
def test_save(self):
    """Check that the trained model survives an export/import round trip.

    The cached model is exported with the XGBoost booster API, checked,
    then re-imported from the exported file into a fresh classifier which
    is checked with the same prediction test.
    """
    model = cache.load_obj(TRAINED_MODEL_PATH)
    x, y, _ = load_data()
    if isinstance(model, xgb.sklearn.XGBClassifier):
        model._Booster.save_model(TRAINED_MODEL_EXPORT_PATH)
        _test_prediction(model, x, y)

        import_model = xgb.sklearn.XGBClassifier()
        # Load what was just exported; TRAINED_MODEL_PATH is the object
        # read through cache.load_obj above, not the exported booster file.
        import_model.load_model(TRAINED_MODEL_EXPORT_PATH)
        _test_prediction(import_model, x, y)
def excel_prediction_url(self):
    """Return the storage URL of the Excel export of the prediction.

    When the Excel file does not exist yet but the cached prediction does,
    the Excel file is generated from the cache first.

    Returns:
        str: URL of the Excel file, or ``''`` when the file does not exist
        and cannot be generated from the cache.
    """
    excel_path = self.excel_prediction_path
    if not path.exists(excel_path) and path.exists(
            self.cached_prediction_path):
        df = cache.load_obj(self.cached_prediction_path)
        df.to_excel(excel_path)

    # Checked after the optional export so that a freshly generated file
    # yields its URL right away instead of falling through with None.
    if path.exists(excel_path):
        return self.data.storage.url(excel_path)
    return ''
def test_model_on_unknown_data(self, data_path):
    """Check that the model predicts both classes on a cached data set.

    Args:
        data_path: Path of the cached data frame to predict on; the column
            named by ``scale_name`` is excluded from the model input.
    """
    frame = cache.load_obj(data_path)
    feature_columns = [column for column in frame.columns
                       if column != scale_name]
    predictions = self.model.predict(frame[feature_columns].values)

    total = len(predictions)
    sleep_count = sum(1 for label in predictions if label == 1)
    wake_count = sum(1 for label in predictions if label == 0)
    r_sleep = sleep_count / total
    r_wake = wake_count / total

    print("Sleep: %.2f%%" % (r_sleep * 100.0))
    print("Wake: %.2f%%" % (r_wake * 100.0))

    # Both classes have to be present, none of them exclusively.
    self.assertTrue(1 > r_sleep > 0)
    self.assertTrue(1 > r_wake > 0)
def predict(csv_data, force=False):
    """Return the prediction data frame for a CSV data object.

    A cached prediction is returned when it exists and ``force`` is not
    set. Otherwise the data are preprocessed, features are extracted, the
    prediction is computed and stored both in the cache and as Excel.

    Args:
        csv_data: The ``CsvData`` instance to predict on.
        force: When true, ignore an existing cached prediction.

    Returns:
        The data frame including the prediction column, or ``None`` when
        ``csv_data`` is not a ``CsvData`` or a pipeline step failed.
    """
    if not isinstance(csv_data, CsvData):
        return None

    start = datetime.now()

    if os.path.exists(csv_data.cached_prediction_path) and not force:
        logger.info(
            f'Prediction features data for {csv_data.filename} will be loaded from cache'
        )
        return cache.load_obj(csv_data.cached_prediction_path)

    logger.info(f'Data {csv_data.filename} need to be preprocessed')
    if not preprocess_data(csv_data):
        logger.warning(
            f'Data {csv_data.filename} cannot be preprocessed')
        return None

    logger.info(
        f'Features for {csv_data.filename} need to be extracted')
    if not extract_features(csv_data):
        logger.warning(
            f'Features cannot be extracted for {csv_data.filename}')
        return None

    frame = pd.read_excel(csv_data.features_data_path, index_col=0)
    logger.info(f'Prediction need to be done for {csv_data.filename}')
    frame[prediction_name] = _predict(frame)

    # Persist the result in both formats and flag the record as cached.
    cache.save_obj(frame, csv_data.cached_prediction_path)
    frame.to_excel(csv_data.excel_prediction_path)
    csv_data.prediction_cached = True
    csv_data.save()

    end = datetime.now()
    logger.info(
        f'Prediction for {csv_data.filename} made in {end - start}')
    return frame
def setUpClass(cls):
    """Load the trained model once and store it on the class as ``model``."""
    trained_model = cache.load_obj(TRAINED_MODEL_PATH)
    cls.model = trained_model
def hilev():
    """Compute and store sleep-night statistics for the whole structure.

    For every (subject, data, day) triple from ``create_structure`` the
    cached prediction is loaded, smoothed with a 300 s rolling sum and
    thresholded. From the result the sleep onset/end, TST, WASO, SE, SF
    and SOL values are derived and saved on a ``SleepNight`` record (an
    existing record is reused, otherwise a new one is created).

    Returns:
        bool: ``True`` when every triple was processed, ``False`` when at
        least one had to be skipped.
    """
    structure = create_structure()
    res = True
    # Loop-invariant: the same timezone is used for every night.
    local_tz = pytz.timezone("Europe/Prague")

    for subject, data, day in structure:
        # Skip unless the data object is valid AND its cached prediction
        # exists; checking the type first also guards the attribute access.
        if not isinstance(data, CsvData) or not path.exists(
                data.cached_prediction_path):
            res = False
            continue
        if not isinstance(day, SleepDiaryDay):
            res = False
            continue

        df = cache.load_obj(data.cached_prediction_path)
        if not isinstance(df, DataFrame):
            res = False
            continue

        nights = SleepNight.objects.filter(diary_day=day).filter(
            data=data).filter(subject=subject)
        if not nights.exists():
            night = SleepNight()
            night.diary_day = day
            night.data = data
            night.subject = subject
        else:
            night = nights.first()

        # Look at the diary interval widened by 30 minutes on both sides.
        s = day.t1 - timedelta(minutes=30)
        e = day.t4 + timedelta(minutes=30)
        interval = df.loc[s:e, [prediction_name]]
        rolling_10 = interval.rolling('300s').sum()

        # Strict threshold: used only to locate the first and last sleep.
        rolling_10['strict'] = numpy.where(rolling_10[prediction_name] <= 5,
                                           'W', 'S')
        sleep = rolling_10.index[rolling_10['strict'] == 'S'].tolist()
        if not sleep:
            logger.warning(
                f'No sleep found for {night.subject.code} {night.diary_day.date} {night.data.filename}'
            )
            res = False
            continue

        night.sleep_onset = local_tz.localize(sleep[0])
        night.sleep_end = local_tz.localize(sleep[-1])

        # Looser threshold: classifies the epochs inside the sleep interval.
        rolling_10[hilev_prediction] = numpy.where(
            rolling_10[prediction_name] <= 2, 'W', 'S')
        pred = rolling_10.loc[sleep[0]:sleep[-1], [hilev_prediction]]
        if not isinstance(pred, DataFrame):
            res = False
            continue
        pred.to_excel(night.name)

        wake = pred.index[pred[hilev_prediction] == 'W'].tolist()
        night.tst = (night.sleep_end - night.sleep_onset).seconds
        if night.tst == 0:
            # A single sleep sample gives a zero-length interval, which
            # would divide by zero in the SE and SF computations below.
            logger.warning(
                f'Zero-length sleep interval for {night.subject.code} {night.diary_day.date} {night.data.filename}'
            )
            res = False
            continue

        # NOTE(review): 30 presumably is the epoch length in seconds — confirm.
        night.waso = len(wake) * 30
        night.se = ((night.tst - night.waso) / night.tst) * 100

        pred["number_prediction"] = numpy.where(pred[hilev_prediction] == 'S',
                                                1, 0)
        # A drop from 1 to 0 marks a transition from sleep to wake.
        wakes_counts = (pred["number_prediction"].diff() == -1).sum()
        night.sf = wakes_counts / (night.convert(night.tst).seconds / 3600)

        onset_latency = sleep[0] - day.t1 if sleep[0] > day.t1 else timedelta(
            seconds=0)
        night.sol = onset_latency.seconds

        logger.info(night)
        night.save()
    return res
def learn():
    """Train the XGBoost sleep classifier and report its performance.

    Steps: load the data, split them 60/40 into train/test parts, impute
    missing training values with K-nearest neighbours, balance the
    training classes with SMOTE, obtain a model (loaded from
    ``MODEL_PATH`` or built from stored/tuned hyper-parameters and
    cross-validated), train and save it, plot diagnostics and log the
    accuracy, F1 score and confusion matrix on the test part and on the
    whole data set.

    Returns:
        The trained model.
    """
    logger.info('Load the data')
    x, y, names = load_data()
    y = y.reshape((len(y), ))
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=17)
    y_train = y_train.reshape((len(y_train), ))
    logger.info('Original train data:')
    log_data_info(y_train)

    # Impute missing values (NaN) using K-nearest neighbours
    imputer = KNNImputer(n_neighbors=4, weights="uniform")
    x_train = imputer.fit_transform(x_train)

    # Add synthetic values to balance dataset.
    # fit_resample replaces the deprecated (later removed) fit_sample alias.
    sm = SMOTE(random_state=27)
    x_train, y_train = sm.fit_resample(x_train, y_train)
    logger.info('Data after SMOTE synthesis:')
    log_data_info(y_train)

    if os.path.exists(MODEL_PATH):
        logger.info('Load model')
        model = load_obj(MODEL_PATH)
    else:
        if os.path.exists(HYPER_PARAMS_PATH):
            params = load_obj(HYPER_PARAMS_PATH)
        else:
            logger.info('Hyper-parameters tuning')
            params = _search_best_hyper_parameters(x_train, y_train)

        logger.info('Cross-validation of params')
        y_train = y_train.ravel()
        model = xgb.sklearn.XGBClassifier(**params)
        cv_results = evaluate_cross_validation(model=model,
                                               x_train=x_train,
                                               y_train=y_train,
                                               save_path=CV_RESULTS_PATH)
        logger.info(results_to_print(cv_results))
        plot_cross_validation(cv_results, 'Model binary:logistic')

    # NOTE(review): training runs for a loaded model as well as for a
    # freshly built one — confirm this matches the intended flow.
    _train_model(model, x_test, x_train, y_test, y_train)
    save_obj(model, TRAINED_MODEL_PATH)

    # Plot the feature importances
    plot_fi(model, names, scale_name, sort=True, save_dir=ML_DIR)
    plot_logloss_and_error(model, model_name)

    test_predictions = model.predict(x_test)
    logger.info('After training results on test data: ')
    logger.info(
        f'ACC: {accuracy_score(y_test, test_predictions):.2f} | F1: {f1_score(y_test, test_predictions):.2f}'
    )
    logger.info('Confusion matrix: ')
    logger.info(confusion_matrix(y_test, test_predictions))

    all_predictions = model.predict(x)
    logger.info('After training results on whole dataset: ')
    logger.info(
        f'ACC: {accuracy_score(y, all_predictions):.2f} | F1: {f1_score(y, all_predictions):.2f}'
    )
    logger.info('Confusion matrix: ')
    logger.info(confusion_matrix(y, all_predictions))
    return model
def _predict(df):
    """Predict with the trained model on all columns except ``scale_name``.

    Args:
        df: Data frame with the feature columns.

    Returns:
        The value returned by the model's ``predict`` for the feature
        values of ``df``.
    """
    feature_columns = [column for column in df.columns
                       if column != scale_name]
    feature_values = df[feature_columns].values
    classifier = cache.load_obj(TRAINED_MODEL_PATH)
    return classifier.predict(feature_values)