def predictWithData(self, data, y, _featureDesp="all"):
    """Fit a linear regression on the leading split and evaluate the rest.

    :param data: feature rows, aligned one-to-one with ``y``.
    :param y: label records; each one is read through the keys
        ``'value'``, ``'time'`` and ``'index'``.
    :param _featureDesp: description of the feature set; copied verbatim
        into the result under ``'featureDesp'``.
    :return: dict with the keys ``groundtruth``, ``times``, ``indices``,
        ``predictions``, ``performance``, ``params`` and ``featureDesp``,
        or ``None`` when the test split is empty.
    """
    labels = y
    values = [label['value'] for label in labels]
    assert (len(data) == len(values))

    # split data
    # fix train_size, as we ignored the first value
    # NOTE(review): if int(len(data) * split_ratio) is 0 the boundary becomes
    # -1 and the slices below wrap around - confirm callers never hit that.
    train_size = int(len(data) * self.split_ratio) - 1

    train_data = data[0:train_size]
    train_y = values[0:train_size]
    test_data = data[train_size:]
    test_y = values[train_size:]
    # label records of the test split, kept for access to their metadata
    test_glucoseData = labels[train_size:]

    # Nothing to evaluate: leave before the estimator is touched. The check
    # used to sit after fit()/predict(), where it could no longer help.
    # len() is used instead of truthiness so array inputs work as well.
    if len(test_data) == 0:
        return None

    lr = linear_model.LinearRegression()
    lr.fit(train_data, train_y)
    predictions = lr.predict(test_data)

    timestamps = [item['time'] for item in test_glucoseData]

    results = dict()
    results['groundtruth'] = [item['value'] for item in test_glucoseData]
    results['times'] = timestamps
    results['indices'] = [int(item['index']) for item in test_glucoseData]
    results['predictions'] = predictions
    results['performance'] = compute_performance_time_binned(
        timestamps=timestamps,
        groundtruth=test_y,
        predictions=predictions)
    results['performance'].update(
        compute_performance_meals(timestamps=timestamps,
                                  groundtruth=test_y,
                                  predictions=predictions,
                                  carbdata=self.carb_data))
    results['params'] = self.save_params()
    results['featureDesp'] = _featureDesp
    self.plot_learned_model(results['predictions'],
                            results['groundtruth'],
                            results['times'])
    return results
def predict(self):
    """Run the ContextAVG baseline.

    Each reading in the test split is predicted by
    ``self.get_time_weighted_average`` from the readings that precede it.

    :return: dict with the keys ``groundtruth``, ``times``, ``indices``,
        ``predictions``, ``performance`` and ``params`` (always ``None``).
    """
    assert (self.glucose_data)

    # split the data
    num_groundtruth = len(self.glucose_data)
    train_size = int(num_groundtruth * self.split_ratio)
    test_size = num_groundtruth - train_size
    test_data = self.glucose_data[train_size:]
    assert (len(test_data) == test_size)

    # create prediction list using time weighted avg of previous values
    predictions = []
    for i, target in enumerate(test_data):
        # Clamp at 0: with train_size == 0 the stop index would be -1 for
        # the first target, and a negative stop wraps around so that the
        # "history" would contain the target itself and later readings.
        # NOTE(review): the history ends one reading before the target's
        # direct predecessor - presumably deliberate, confirm.
        history_end = max(train_size - 1 + i, 0)
        prev_glucose = list(self.glucose_data[:history_end])
        predictions.append(
            self.get_time_weighted_average(prev_glucose, target['time']))
    assert (len(predictions) == test_size)

    # return ground truth (test set) and predictions (as a dict)
    test_values = [item['value'] for item in test_data]
    timestamps = [item['time'] for item in test_data]

    results = dict()
    results['groundtruth'] = test_values
    results['times'] = timestamps
    results['indices'] = [item['index'] for item in test_data]
    results['predictions'] = predictions
    results['performance'] = compute_performance_time_binned(
        timestamps=timestamps,
        groundtruth=test_values,
        predictions=predictions)
    results['performance'].update(
        compute_performance_meals(timestamps=timestamps,
                                  groundtruth=test_values,
                                  predictions=predictions,
                                  carbdata=self.carb_data))
    results['params'] = None
    return results
def predict(self):
    """Run the last-value baseline.

    Every test reading is predicted as the reading observed right before
    it; the first test reading is predicted from the final training one.

    :return: dict with the keys ``groundtruth``, ``times``, ``indices``,
        ``predictions``, ``performance`` and ``params`` (always ``None``).
    """
    assert(self.glucose_data)

    # chronological train/test split
    readings = self.glucose_data
    total = len(readings)
    train_size = int(total * self.split_ratio)
    test_size = total - train_size
    train_part = readings[0:train_size]
    test_part = readings[train_size:]
    assert(len(test_part) == test_size)

    # value that seeds the very first prediction
    seed_value = train_part[-1]['value']

    # shift the test values right by one slot, seeding with the last
    # training value; the trailing slice keeps exactly test_size entries
    test_values = [entry['value'] for entry in test_part]
    predictions = ([seed_value] + test_values)[:test_size]
    assert(len(predictions) == test_size)

    timestamps = [entry['time'] for entry in test_part]
    indices = [entry['index'] for entry in test_part]

    performance = compute_performance_time_binned(
        timestamps=timestamps,
        groundtruth=test_values,
        predictions=predictions)
    performance.update(compute_performance_meals(
        timestamps=timestamps,
        groundtruth=test_values,
        predictions=predictions,
        carbdata=self.carb_data))

    # ground truth (test set) and predictions, bundled as a dict
    results = dict()
    results['groundtruth'] = test_values
    results['times'] = timestamps
    results['indices'] = indices
    results['predictions'] = predictions
    results['performance'] = performance
    results['params'] = None
    return results
def predict(self):
    """Walk-forward ARIMA forecast over the test split.

    ``self.discretized_data`` is split by ``self.split_ratio``, NaN entries
    are dropped from both parts, and for every remaining test observation
    an ``ARIMA_MODEL`` of order ``(self.p, self.d, self.q)`` is refitted on
    all values seen so far and asked for its next forecast.

    :return: dict with the keys ``groundtruth``, ``times``, ``indices``,
        ``predictions``, ``performance`` and ``params``.
    """
    series = self.discretized_data
    size = int(len(series) * self.split_ratio)
    train, test = series[0:size], series[size:len(series)]
    print("number of test instances including nan: {}".format(len(test)))

    # remove missing values
    train = train[numpy.logical_not(numpy.isnan(train))]
    test = test[numpy.logical_not(numpy.isnan(test))]
    print("number of test instances: {}".format(len(test)))

    history = list(train)
    predictions = list()
    for t in range(len(test)):
        model = ARIMA_MODEL(history, order=(self.p, self.d, self.q))
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        # the true observation joins the history before the next refit
        obs = test[t]
        history.append(obs)
        print('predicted=%f, expected=%f' % (yhat, obs))

    # load timestamps
    # NOTE(review): the last len(test) timestamps are paired with the
    # NaN-filtered test values; this assumes load_timestamps() returns only
    # timestamps of non-missing readings - confirm.
    ts_df = self.load_timestamps(self.con, self.patient_id)
    test_ts = ts_df.tail(len(test))
    timestamps = test_ts.index

    results = dict()
    results['groundtruth'] = test
    results['times'] = timestamps
    results['indices'] = test_ts['pos'].values
    results['predictions'] = predictions
    results['performance'] = compute_performance_time_binned(
        timestamps=timestamps, groundtruth=test, predictions=predictions)
    results['performance'].update(
        compute_performance_meals(timestamps=timestamps,
                                  groundtruth=test,
                                  predictions=predictions,
                                  carbdata=self.carb_data))
    results['params'] = self.save_params()
    return results
def to_results(self, test_glucoseData, carbData, test_y, predictions,
               classes, patientId):
    """Bundle the evaluation of a classification run into a result dict.

    :param test_glucoseData: test records, read through the keys
        ``'time'``, ``'value'`` and ``'index'``.
    :param carbData: passed to ``compute_performance_meals`` as ``carbdata``.
    :param test_y: true class labels of the test split.
    :param predictions: predicted class labels of the test split.
    :param classes: class names handed to the performance helpers.
    :param patientId: patient identifier handed to the performance helpers.
    :return: dict with the keys ``groundtruth``, ``times``, ``indices``,
        ``performance``, ``perClass``, ``params`` and ``report``.
    """
    times = [record['time'] for record in test_glucoseData]
    truth = [record['value'] for record in test_glucoseData]
    positions = [int(record['index']) for record in test_glucoseData]

    # keyword arguments common to both performance helpers
    shared = dict(timestamps=times,
                  regression=False,
                  plotConfusionMatrix=False,
                  classes=classes,
                  patientId=patientId,
                  model=self.modelName)
    performance, per_class = compute_performance_time_binned(
        test_y, predictions, **shared)
    meal_performance, meal_per_class = compute_performance_meals(
        test_y, predictions, carbdata=carbData, **shared)
    performance.update(meal_performance)
    per_class.update(meal_per_class)

    params = self.save_params()

    # Compute confusion matrix; the print options are set before str() so
    # that they apply to the rendered report.
    cnf_matrix = confusion_matrix(test_y, predictions)
    np.set_printoptions(precision=2)

    results = dict()
    results['groundtruth'] = truth
    results['times'] = times
    results['indices'] = positions
    results['performance'] = performance
    results['perClass'] = per_class
    results['params'] = params
    results["report"] = str(cnf_matrix)
    return results
def predict(self):
    """Train the configured recurrent model and predict the test split.

    Loads the series (``load_time_series`` when ``self.discretized`` is
    set, ``load_continuous_data`` otherwise), splits it in order by
    ``self.split_ratio``, min-max scales inputs and targets, fits a
    ``Sequential`` network made of the layer ``self.models[self.modelName]``
    followed by ``Dense(1)``, and evaluates the predictions.

    :return: dict with the keys ``groundtruth``, ``times``, ``indices``,
        ``predictions``, ``performance`` and ``params``.
    """
    if self.discretized:
        X, Y = self.load_time_series(self.con, self.patient_id)
    else:
        X, Y = self.load_continuous_data()
    print("modelname %s" % self.modelName)

    # TODO: fix split when not interpolating
    train_size = int(len(X) * self.split_ratio)
    trainX, testX = X[0:train_size], X[train_size:len(X)]
    trainY, testY = Y[0:train_size], Y[train_size:len(Y)]
    train_size = len(trainY)
    test_size = len(testY)

    # reshape into X=t and Y=t+1
    # TODO: remove the missing values here; unless we want to interpolate:
    # then do it before transforming the data

    # scale all the data
    xScaler = MinMaxScaler(feature_range=(0, 1))
    yScaler = MinMaxScaler(feature_range=(0, 1))

    # reshape to 2D for MinMaxScaler
    train_nsamples, train_nx, train_ny = trainX.shape
    d2_trainX = trainX.reshape((train_nsamples, train_nx * train_ny))
    test_nsamples, test_nx, test_ny = testX.shape
    d2_testX = testX.reshape((test_nsamples, test_nx * test_ny))

    trainX = xScaler.fit_transform(d2_trainX)
    testX = xScaler.transform(d2_testX)

    trainY = yScaler.fit_transform(trainY.reshape(-1, 1))
    # transform only: both scalers must be fitted on the training split.
    # Refitting yScaler on the test targets would make inverse_transform()
    # below map the predictions (produced in training scale) with test
    # statistics.
    testY = yScaler.transform(testY.reshape(-1, 1))

    # reshape input to be [samples, time steps, features]
    trainX = numpy.reshape(trainX, (train_nsamples, train_nx, train_ny))
    testX = numpy.reshape(testX, (test_nsamples, test_nx, test_ny))

    # actual values
    testYa = self.get_test_values(testY, train_size, test_size)

    # create and fit the network
    model = Sequential()
    # FIXED: we skip here the time period features
    # NOTE(review): this increments instance state on every call, so a
    # second predict() on the same object sees a larger no_features -
    # confirm predict() is only called once per instance.
    if self.addTimeofDay:
        self.no_features += 1
    # input shape: (look_back, nfeatures)
    model.add(self.models[self.modelName](
        self.num_units,
        input_shape=(self.look_back, self.no_features)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, epochs=self.epochs, batch_size=1, verbose=2)

    # make predictions
    testPredict = model.predict(testX)
    print(
        "#Test data points (w/ padding): {}; #instances: {}; # training instances: {};"
        .format(len(testY), len(X), len(trainX)))

    # trace back predictions at actual values
    testYp = self.get_test_values(testPredict, train_size, test_size)
    testYp = yScaler.inverse_transform(testYp)
    testYa = yScaler.inverse_transform(testYa)
    print("Test values: {}".format(testYa))
    print("predicted values: {}".format(testYp))

    # load timestamps
    ts_df = self.load_timestamps(self.con, self.patient_id)
    test_ts = ts_df.tail(len(testYa))
    timestamps = test_ts.index

    results = dict()
    results['groundtruth'] = testYa
    results['times'] = timestamps
    results['indices'] = test_ts['pos'].values
    results['predictions'] = testYp
    results['performance'] = compute_performance_time_binned(
        timestamps=timestamps, groundtruth=testYa, predictions=testYp)
    results['performance'].update(
        compute_performance_meals(timestamps=timestamps,
                                  groundtruth=testYa,
                                  predictions=testYp,
                                  carbdata=self.carb_data))
    results['params'] = self.save_params()
    assert (len(testYa) == len(testYp))
    assert (len(timestamps) == len(testYp))
    return results
def predict_with_data(self, data, Y, _feature_desp="all"):
    """Classify glucose levels with an SVM and evaluate on the test split.

    ``Y`` is turned into class labels by ``self.categorized_y`` using cut
    points taken from the sorted values at the ``self.threshold1`` (and,
    for multiclass, ``self.threshold2``) fractions.

    :param data: feature rows; one fewer than ``self.glucose_data`` (the
        asserts below rely on this).
    :param Y: numeric target values aligned with ``data``.
    :param _feature_desp: not used by this method.
    :return: dict with the keys ``performance``, ``perClass``, ``params``
        and ``report``.
    """
    # labeling
    sorted_Y = sorted(Y)  # sort ascending
    thresh1 = int(len(sorted_Y) * self.threshold1) - 1
    thresh2 = int(len(sorted_Y) * self.threshold2) - 1
    if self.multiclass:
        cat_Y = [
            self.categorized_y(y, [sorted_Y[thresh1], sorted_Y[thresh2]])
            for y in Y
        ]
        classes = ['low ' + str(self.threshold1 * 100) + '%',
                   'medium', 'high']
    else:
        cat_Y = [self.categorized_y(y, [sorted_Y[thresh1]]) for y in Y]
        classes = ['low ' + str(self.threshold1 * 100) + '%', 'high']
    assert (len(data) == len(cat_Y))

    # split data
    num_groundtruth = len(self.glucose_data)
    train_size = int(num_groundtruth * self.split_ratio)
    test_size = num_groundtruth - train_size
    # track test instances in original data for access to metadata
    test_glucose_data = self.glucose_data[train_size:]
    assert (len(test_glucose_data) == test_size)
    # fix train_size, as we ignored the first value
    train_size -= 1
    train_data = data[0:train_size]
    train_y = cat_Y[0:train_size]
    test_data = data[train_size:]
    test_y = cat_Y[train_size:]
    assert (len(test_y) == len(test_glucose_data))
    assert (len(train_y) + len(test_y) + 1 == num_groundtruth)

    if self.tune:
        grid = GridSearchCV(svm.SVC(),
                            param_grid=self.param_grid,
                            cv=5,
                            refit=True,
                            scoring='accuracy')
        grid.fit(train_data, train_y)
        # NOTE(review): this stores the fitted estimator, not a parameter
        # dict - confirm that is what save_params() expects.
        self.best_params = grid.best_estimator_
        self.log.info("Best parameters for patient {} {}".format(
            self.patient_id, self.best_params))
        predictions = grid.predict(test_data)
    else:
        clf = svm.SVC()
        clf.fit(train_data, train_y)
        # predict first: printing before this call only ever showed None
        predictions = clf.predict(test_data)
        print(train_y)
        print(predictions)

    print(precision_recall_fscore_support(test_y, predictions,
                                          average='weighted'))
    # normalize=True reports a fraction, matching the "accuracy" label
    # (normalize=False printed the raw number of correct predictions)
    print("accuracy: {}".format(
        accuracy_score(test_y, predictions, normalize=True)))

    timestamps = [item['time'] for item in test_glucose_data]
    results = dict()
    results['performance'], results['perClass'] = \
        compute_performance_time_binned(test_y,
                                        predictions,
                                        timestamps=timestamps,
                                        regression=False,
                                        plotConfusionMatrix=True,
                                        classes=classes,
                                        patientId=self.patient_id,
                                        model=self.modelName)
    r_meal, r_meal_perclass = compute_performance_meals(
        test_y,
        predictions,
        timestamps=timestamps,
        plotConfusionMatrix=True,
        classes=classes,
        patientId=self.patient_id,
        carbdata=self.carb_data,
        regression=False,
        model=self.modelName)
    results['performance'].update(r_meal)
    results['perClass'].update(r_meal_perclass)
    results['params'] = self.save_params()

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(test_y, predictions)
    np.set_printoptions(precision=2)
    # NOTE(review): the counts below describe the split at thresh1 only,
    # also in the multiclass case - confirm that is intended.
    results["report"] = "number of instance in {}: {} and in {}: {}".format(
        classes[0], thresh1 + 1, classes[1], len(Y) - thresh1 - 1)
    results["report"] += ";confusion matrix: " + str(cnf_matrix)

    # Plot non-normalized confusion matrix
    save_confusion_matrix(cnf_matrix,
                          classes=classes,
                          patientId=self.patient_id,
                          desc="all",
                          model=self.modelName)
    return results
def predict_with_data(self, data, Y, _feature_desp="all"):
    """Evaluate a ``DummyClassifier`` baseline on the test split.

    ``Y`` is turned into class labels by ``self.categorized_y``: against
    ``Constant.HYPERGLYCEMIA_THRESHOLD`` when ``self.hard_threshold`` is
    set, otherwise against cut points taken from the sorted values at the
    ``self.threshold1`` (and, for multiclass, ``self.threshold2``)
    fractions.

    :param data: feature rows; one fewer than ``self.glucose_data`` (the
        asserts below rely on this).
    :param Y: numeric target values aligned with ``data``.
    :param _feature_desp: not used by this method.
    :return: dict with the keys ``performance``, ``perClass``, ``params``
        and ``report``.
    """
    # labeling
    sorted_Y = sorted(Y)  # sort ascending
    thresh1 = int(len(sorted_Y) * self.threshold1) - 1
    thresh2 = int(len(sorted_Y) * self.threshold2) - 1

    # TODO: refactor for code reuse at class siblings
    if self.hard_threshold:
        cat_Y = [
            self.categorized_y(y, [Constant.HYPERGLYCEMIA_THRESHOLD])
            for y in Y
        ]
        classes = ["non", Constant.HYPER]
    elif self.multiclass:
        cat_Y = [
            self.categorized_y(y, [sorted_Y[thresh1], sorted_Y[thresh2]])
            for y in Y
        ]
        classes = ['low ' + str(self.threshold1 * 100) + '%',
                   'medium', 'high']
    else:
        cat_Y = [self.categorized_y(y, [sorted_Y[thresh1]]) for y in Y]
        classes = ['low ' + str(self.threshold1 * 100) + '%', 'high']
    assert (len(data) == len(cat_Y))

    # split data
    num_groundtruth = len(self.glucose_data)
    train_size = int(num_groundtruth * self.split_ratio)
    test_size = num_groundtruth - train_size
    # track test instances in original data for access to metadata
    test_glucose_data = self.glucose_data[train_size:]
    assert (len(test_glucose_data) == test_size)
    # fix train_size, as we ignored the first value
    train_size -= 1
    train_data = data[0:train_size]
    train_y = cat_Y[0:train_size]
    test_data = data[train_size:]
    test_y = cat_Y[train_size:]
    assert (len(test_y) == len(test_glucose_data))
    assert (len(train_y) + len(test_y) + 1 == num_groundtruth)

    clf = DummyClassifier(strategy=self.strategy,
                          random_state=self.random_state)
    clf.fit(train_data, train_y)  # ignore train_data
    predictions = clf.predict(test_data)

    print("prediction: {}".format(predictions))
    print("accuracy: {}".format(
        accuracy_score(test_y, predictions, normalize=True)))
    print(precision_recall_fscore_support(test_y, predictions))

    timestamps = [item['time'] for item in test_glucose_data]
    results = dict()
    results['performance'], results['perClass'] = \
        compute_performance_time_binned(test_y,
                                        predictions,
                                        timestamps=timestamps,
                                        regression=False,
                                        plotConfusionMatrix=True,
                                        classes=classes,
                                        patientId=self.patient_id,
                                        model=self.model_name)
    r_meal, r_meal_perclass = compute_performance_meals(
        test_y,
        predictions,
        timestamps=timestamps,
        plotConfusionMatrix=True,
        classes=classes,
        patientId=self.patient_id,
        carbdata=self.carb_data,
        regression=False,
        model=self.model_name)
    results['performance'].update(r_meal)
    results['perClass'].update(r_meal_perclass)
    results['params'] = self.save_params()

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(test_y, predictions)
    np.set_printoptions(precision=2)
    # NOTE(review): the counts below are derived from thresh1 even when the
    # hard threshold or the multiclass labelling is active - confirm.
    results["report"] = "number of instance in {}: {} and in {}: {}".format(
        classes[0], thresh1 + 1, classes[1], len(Y) - thresh1 - 1)
    results["report"] += ";confusion matrix: " + str(cnf_matrix)

    # Plot non-normalized confusion matrix
    save_confusion_matrix(cnf_matrix,
                          classes=classes,
                          patientId=self.patient_id,
                          desc="all",
                          model=self.model_name)
    return results
def predictWithData(self, data, y, _featureDesp="all"):
    """Fit the configured regressor and evaluate its confident predictions.

    The model class is looked up as ``self.models[self.modelName]``; with
    ``self.tune`` set it is wrapped in a ``GridSearchCV`` over
    ``self.param_grid``. Test instances whose ``V_IJ`` value returned by
    ``self.confidenceCal`` reaches the threshold are dropped before the
    evaluation.

    :param data: feature rows, aligned one-to-one with ``y``.
    :param y: label records; each one is read through the keys
        ``'value'``, ``'time'`` and ``'index'``.
    :param _featureDesp: description of the feature set; copied verbatim
        into the result under ``'featureDesp'``.
    :return: dict with the keys ``groundtruth``, ``times``, ``indices``,
        ``predictions``, ``performance``, ``params`` and ``featureDesp``,
        or ``None`` when no test instance is left to evaluate.
    """
    labels = y
    values = [label['value'] for label in labels]
    assert (len(data) == len(values))

    # split data
    # fix train_size, as we ignored the first value
    train_size = int(len(data) * self.split_ratio) - 1
    train_data = data[0:train_size]
    train_y = values[0:train_size]
    test_data = data[train_size:]
    test_y = values[train_size:]
    # label records of the test split, kept for access to their metadata
    test_glucoseData = labels[train_size:]

    # Nothing to evaluate: leave before any model is trained.
    if len(test_data) == 0:
        return None

    if self.tune:
        model = self.models[self.modelName](random_state=30)
        rf = GridSearchCV(model, self.param_grid, n_jobs=1, cv=2)
    else:
        rf = self.models[self.modelName](
            n_estimators=self.n_estimator,
            criterion=self.criterion,
            min_samples_leaf=self.min_samples_leaf)
    rf.fit(train_data, train_y)
    if self.tune:
        self.best_params = rf.best_estimator_
        self.log.info("Best parameters for patient {} {}".format(
            self.patient_id, self.best_params))

    predictions = rf.predict(test_data)
    V_IJ, _V_IJ_unbiased = self.confidenceCal(train_data, test_data,
                                              predictions, test_y, rf)

    confidence_thrsd = 0.4  # choose from the parameter tuning
    filtered = [
        i for i in range(len(test_data)) if V_IJ[i] >= confidence_thrsd
    ]
    # axis=0 removes whole instances. Without an axis numpy flattens the
    # input first, so the 2-D feature matrix lost single cells instead of
    # rows and the emptiness check below looked at the wrong length.
    test_data = np.delete(test_data, filtered, axis=0)
    test_y = np.delete(test_y, filtered, axis=0)
    test_glucoseData = np.delete(test_glucoseData, filtered, axis=0)
    predictions = np.delete(predictions, filtered, axis=0)
    if len(test_data) == 0:
        return None

    timestamps = [item['time'] for item in test_glucoseData]

    results = dict()
    results['groundtruth'] = [item['value'] for item in test_glucoseData]
    results['times'] = timestamps
    results['indices'] = [int(item['index']) for item in test_glucoseData]
    results['predictions'] = predictions
    results['performance'] = compute_performance_time_binned(
        timestamps=timestamps,
        groundtruth=test_y,
        predictions=predictions)
    results['performance'].update(
        compute_performance_meals(timestamps=timestamps,
                                  groundtruth=test_y,
                                  predictions=predictions,
                                  carbdata=self.carb_data))
    results['params'] = self.save_params()
    results['featureDesp'] = _featureDesp
    self.plot_learned_model(results['predictions'],
                            results['groundtruth'],
                            results['times'])
    return results
def predict_with_data(self, data, Y, _feature_desp="all"):
    """Classify glucose levels with a random forest and evaluate the result.

    ``Y`` is turned into class labels by ``self.categorized_y``: against
    ``Constant.HYPERGLYCEMIA_THRESHOLD`` when ``self.hard_threshold`` is
    set, otherwise against cut points taken from the sorted values at the
    ``self.threshold1`` (and, for multiclass, ``self.threshold2``)
    fractions. With ``self.tune`` set the forest is wrapped in a
    ``GridSearchCV`` over ``self.param_grid``.

    :param data: feature rows; one fewer than ``self.glucose_data`` (the
        asserts below rely on this).
    :param Y: numeric target values aligned with ``data``.
    :param _feature_desp: not used by this method.
    :return: dict with the keys ``groundtruth``, ``times``, ``indices``,
        ``performance``, ``perClass``, ``params`` and ``report``.
    """
    # labeling
    sorted_Y = sorted(Y)  # sort ascending
    thresh1 = int(len(sorted_Y) * self.threshold1) - 1
    thresh2 = int(len(sorted_Y) * self.threshold2) - 1

    # TODO: refactor for code reuse at class siblings
    if self.hard_threshold:
        cat_Y = [
            self.categorized_y(y, [Constant.HYPERGLYCEMIA_THRESHOLD])
            for y in Y
        ]
        classes = ["non", Constant.HYPER]
    elif self.multiclass:
        cat_Y = [
            self.categorized_y(y, [sorted_Y[thresh1], sorted_Y[thresh2]])
            for y in Y
        ]
        classes = ['low ' + str(self.threshold1 * 100) + '%',
                   'medium', 'high']
    else:
        cat_Y = [self.categorized_y(y, [sorted_Y[thresh1]]) for y in Y]
        classes = ['low ' + str(self.threshold1 * 100) + '%', 'high']
    assert (len(data) == len(cat_Y))

    # split data
    num_groundtruth = len(self.glucose_data)
    train_size = int(num_groundtruth * self.split_ratio)
    test_size = num_groundtruth - train_size
    # track test instances in original data for access to metadata
    test_glucose_data = self.glucose_data[train_size:]
    assert (len(test_glucose_data) == test_size)
    # fix train_size, as we ignored the first value
    train_size -= 1
    train_data = data[0:train_size]
    train_y = cat_Y[0:train_size]
    test_data = data[train_size:]
    test_y = cat_Y[train_size:]
    assert (len(test_y) == len(test_glucose_data))
    assert (len(train_y) + len(test_y) + 1 == num_groundtruth)

    if self.tune:
        model = ensemble.RandomForestClassifier(random_state=30,
                                                class_weight="balanced")
        rf = GridSearchCV(model, self.param_grid, n_jobs=-1, cv=2)
    else:
        rf = self.models[self.modelName](
            n_estimators=self.n_estimator,
            criterion=self.criterion,
            min_samples_leaf=self.min_samples_leaf,
            class_weight="balanced")

    # Sampling training set
    print("Original training set shape: {}".format(Counter(train_y)))
    print("Original test set shape: {}".format(Counter(test_y)))
    self.log.debug("Original training set shape: {}".format(
        Counter(train_y)))
    # (SMOTE oversampling of the training set was tried here and is
    # currently disabled.)

    rf.fit(train_data, train_y)
    if self.tune:
        # best_estimator_ only exists once the search has been fitted, so
        # it is read after fit(); the message is formatted before logging
        # (applying % to the return value of info() raised a TypeError).
        self.best_params = rf.best_estimator_
        self.log.info("Best parameters for patient {} {}".format(
            self.patient_id, self.best_params))

    predictions = rf.predict(test_data)
    print("prediction: {}".format(predictions))
    print("accuracy: {}".format(
        accuracy_score(test_y, predictions, normalize=True)))
    print(precision_recall_fscore_support(test_y, predictions))

    timestamps = [item['time'] for item in test_glucose_data]
    results = dict()
    results['groundtruth'] = [item['value'] for item in test_glucose_data]
    results['times'] = timestamps
    results['indices'] = [int(item['index']) for item in test_glucose_data]
    results['performance'], results['perClass'] = \
        compute_performance_time_binned(test_y,
                                        predictions,
                                        timestamps=timestamps,
                                        regression=False,
                                        plotConfusionMatrix=False,
                                        classes=classes,
                                        patientId=self.patient_id,
                                        model=self.modelName)
    r_meal, r_meal_perclass = compute_performance_meals(
        test_y,
        predictions,
        timestamps=timestamps,
        plotConfusionMatrix=False,
        classes=classes,
        patientId=self.patient_id,
        carbdata=self.carb_data,
        regression=False,
        model=self.modelName)
    results['performance'].update(r_meal)
    results['perClass'].update(r_meal_perclass)
    results['params'] = self.save_params()

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(test_y, predictions)
    np.set_printoptions(precision=2)
    # NOTE(review): the counts below are derived from thresh1 even when the
    # hard threshold or the multiclass labelling is active - confirm.
    results["report"] = "number of instance in {}: {} and in {}: {}".format(
        classes[0], thresh1 + 1, classes[1], len(Y) - thresh1 - 1)
    results["report"] += ";confusion matrix: " + str(cnf_matrix)

    # Plot non-normalized confusion matrix
    save_confusion_matrix(cnf_matrix,
                          classes=classes,
                          patientId=self.patient_id,
                          desc="all",
                          model=self.modelName)
    return results