def setup(self, X, y):
    '''
    Inspects data to build crossvalidation folds, initialize results lists
    and dicts, and build other helper lists and dicts
    IN:
        SeriesModel
        X - pd dataframe - trial data.  See data structures.  Usually passed in to fit
        y - pd dataframe - trial labels.  See data structures.  Usually passed in to fit
    OUT: None
    '''
    start = time.time()
    ptf('\n>> i. Setting-up TriggeredSeriesModel ...', self.logfile)

    # use parent class methods
    self.confusion_labels = self._build_confusion_labels(y)
    self.trial_lengths = self.find_trial_lengths(X)
    self.inspect_trial_shapes(X)
    self._build_results_dataframes(len(X))

    # methods new to TriggeredSeriesModel
    self._build_crossvalidation_folds(y)

    end = time.time()
    ptf('\n>> Set-up completed (%s seconds) <<' % (end - start), self.logfile)
def load_spots_files(x, root_folder, column_headers, columns_to_drop,
                     fname='spots.txt', verbose=False, LOGFILE=None):
    # csv file was created in a Windows environment; normalize path separators
    relpath = x['Folder'].replace(ntpath.sep, os.sep)
    data_file = os.path.join(root_folder, relpath, fname)
    if verbose:
        ptf(['Loading data from', data_file], LOGFILE)

    # load data file
    mini_df = pd.read_table(data_file, header=None)

    # some files have an extra tab, creating an extra data column; strip it
    if mini_df.values.shape[1] > 241:
        mini_df.drop(241, inplace=True, axis=1)

    # add column_headers to df
    mini_df.columns = column_headers

    # drop spots
    mini_df.drop(columns_to_drop, inplace=True, axis=1)

    # convert filenames to minutes (frames arrive every 20 minutes)
    # mini_df['time'] = mini_df['time'].apply(timestamp_interpretter)
    ntimes = len(mini_df)
    mini_df['time'] = np.arange(0, 20 * ntimes, 20)

    x['data'] = mini_df
    return x
def _trigger_score_one_fold(self, yt, yp, probas, t, testtrain='test', fold='all'):
    number_of_times = t
    fpr, tpr, thresholds = roc_curve(yt, probas[:, 1], pos_label=1)
    roc_auc = auc(fpr, tpr)

    if self.verbose and fold == 'all':
        ptf('%s results' % testtrain, self.logfile)
        ptf(mcm.classification_report_ovr(
            yt, yp, self.confusion_labels['detection']), self.logfile)

    scores = mcm.scores_binary(yt, yp)
    # builds confusion matrix of TP, FP, etc. for the detection case
    cm = mcm.confusion_matrix_binary(yt, yp)

    # detection - populate scores
    overall_acc = accuracy_score(yt, yp)
    score_dict = self._trigger_populate_score_dict(cm, scores, number_of_times,
                                                   fpr, tpr, thresholds, roc_auc,
                                                   overall_acc=overall_acc)
    score_dict['fold'] = fold
    if testtrain == 'train':
        self._append_row_to_df(self.trigger_scores, score_dict)
    else:
        self._append_row_to_df(self.trigger_scores_test, score_dict)
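# Hedged, self-contained sketch of the ROC bookkeeping used above -- just the
# sklearn.metrics calls; the toy labels and probabilities below are
# illustrative, not project data:
import numpy as np
from sklearn.metrics import roc_curve, auc

yt = np.array([0, 0, 1, 1])                     # true detection labels
probas = np.array([[0.9, 0.1], [0.6, 0.4],      # predict_proba-style output,
                   [0.65, 0.35], [0.2, 0.8]])   # column 1 = P(positive)
fpr, tpr, thresholds = roc_curve(yt, probas[:, 1], pos_label=1)
print auc(fpr, tpr)  # 0.75 for this toy example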
def reload_data(LOGFILE=None, PICKLE_DATA=True, root_folder='Shared Sepsis Data',
                csv_filename='Sepsis_JCM.csv'):
    '''
    Reloads raw_data from folders.
    IN:
        LOGFILE - fileobj - an open text file where logs are written
        PICKLE_DATA - bool - whether to pickle data once loaded
        root_folder - str - relative path to top level folder for all data and csv_file
        csv_filename - str - name of csv file containing the trial labels and locations
    OUT:
        X - pd Series - Series of features.  Each row is a trial (index) and a
            (number of features + 1) X (number of times) numpy array (data)
        y - pd DataFrame - labels data frame.  Each row is a trial (index) with
            the labels of each class in the columns
        used_column_headers - list of str - names of the data columns retained
            (kept for later use)
        df - pd DataFrame - DataFrame containing all trial data after
            elimination of extraneous spots, trials
        df_raw - pd DataFrame - DataFrame containing all trial data (before pruning)
    '''
    csv_file = os.path.join(root_folder, csv_filename)
    X, y, used_column_headers, df, df_raw = load_data(root_folder, csv_file,
                                                      verbose=False, LOGFILE=LOGFILE)

    # pickle data for later loading efficiency
    if PICKLE_DATA:
        start = time.time()
        ptf('\n>> Pickling data ...\n', LOGFILE)
        for z, zname in izip([X, y, used_column_headers], PICKLE_NAMES):
            my_pickle(z, zname)
        end = time.time()
        ptf('Data pickled in %d seconds (%d total trials)' % ((end - start), len(X)),
            LOGFILE)

    return X, y, used_column_headers, df, df_raw
def print_run_details(X, sm, LOGFILE=None):
    '''
    Prints other run details
    IN:
        X - pd DataSeries - raw feature data (used to report the number of trials)
        sm - SeriesModel - SeriesModel or TriggeredSeriesModel for this run
        LOGFILE - file obj - open logfile for outputting print statements
    OUT: None
    '''
    ptf('\n\n>> Run details <<', LOGFILE)
    ptf('\tntrials: %d' % len(X), LOGFILE)
    ptf('\tntimes: %d' % len(sm.times), LOGFILE)
    ptf('\n\n>> Other model details ', LOGFILE)
    ptf(sm, LOGFILE)
def trigger_predict(self, model_detection, X_test, fold, t):
    number_of_times = t
    if self.verbose:
        ptf('Predicting detection fold:%d, nt:%d ...' % (fold, number_of_times),
            self.logfile)
    y_predict_detection = model_detection.predict(X_test)
    y_probabilities_detection = model_detection.predict_proba(X_test)
    return y_predict_detection, y_probabilities_detection
def save_model(sm, RUNID, MODELFILENAME, LOGFILE=None):
    '''
    Saves model to file
    IN:
        sm - SeriesModel - SeriesModel or TriggeredSeriesModel for this run
        RUNID - str - name of the folder where the file will be saved
        MODELFILENAME - str - filename the SeriesModel will be saved to
        LOGFILE - file obj - open logfile for outputting print statements
    OUT: None
    '''
    model_file = open('./' + RUNID + '/' + MODELFILENAME, 'wb')
    ptf('\n>> Writing model results to %s' % MODELFILENAME, LOGFILE)
    pickle.dump(sm, model_file, -1)  # -1: highest pickle protocol
    model_file.close()
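# Round-trip sketch: reloading a model written by save_model.  The path below
# assumes the './RUNID/MODELFILENAME' convention used above with the default
# arguments ('run001', 'sm'):
import pickle

with open('./run001/sm', 'rb') as model_file:
    sm = pickle.load(model_file)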
def trigger_train(self, X, y, fold, t):
    '''
    Trains models for a timestep, fold
    IN:
        TriggeredSeriesModel
        X - nparrays - final features for this train fold, timestep (ntrials X nfeatures)
        y - dict of nparrays - labels for this train fold (ntrials) for each
            label class (key)
        fold - int - fold index
            NOTE: only the detection label is used in this implementation, but
            all are passed in for backward compatibility with series model
        t - int - time index
    OUT:
        model - model - trained model for this timestep, fold
        predictions - np array - train predictions nparray (ntrials in this fold)
        probabilities - np array - train probabilities nparray (ntrials in this fold X 2)
    '''
    number_of_times = t
    # (X_train_detection, X_train_gram, X_train_classification) = X
    np_X_detection = X
    if self.debug:
        print len(y)
    (y_train_detection, y_train_gram, y_train_classification) = y

    # fit detection
    if self.verbose:
        ptf('Training detection fold:%d, nt:%d ...' % (fold, number_of_times),
            self.logfile)
    model_detection = self._fit_class(np_X_detection, y_train_detection,
                                      self.detection_base_model,
                                      self.detection_base_model_arguments,
                                      step=('detection t=%d_%d' % (fold, number_of_times)))

    # store model, predict
    y_predict_detection = model_detection.predict(np_X_detection)
    y_probabilities_detection = model_detection.predict_proba(np_X_detection)

    if not self.on_disk:
        self.trigger_models[fold][number_of_times] = model_detection
    else:
        self.pickle_time_step(model_detection, 'trigger_model',
                              t=number_of_times, fold=fold)

    return model_detection, y_predict_detection, y_probabilities_detection
def resample(self, X, y, t, fold):
    if not self.resample_method:
        return X, y

    start = time.time()
    if self.verbose:
        ptf('> Resampling for timestep %d, fold %d' % (t, fold), self.logfile)

    # create resampler
    if self.resample_method == 'under':
        print 'UNDER SAMPLING is not implemented yet'
        return X, y
    elif self.resample_method == 'over':
        if self.oversample_method.lower() == 'smote':
            resampler = SMOTE(**self.oversample_arguments)
        else:
            print 'Your resampling method is not implemented yet'
            return X, y

    if self.debug:
        print type(X), type(y)
        print X.shape, y[0].shape

    # resample on the detection labels (y[0])
    Xsmote, ysmote = resampler.fit_transform(X, y[0])
    ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)
    # ysmote_df = self.build_smoted_label_df(ysmote, y, fold)
    # # find new folds
    # folds, ynewdf = self.find_new_folds(Xsmote, ysmote, y)

    if self.debug:
        print np.sum(y[0] == 0), np.sum(ysmote == 0)
        print np.sum(y[0] == 1), np.sum(ysmote == 1)

    if self.on_disk:
        self.pickle_time_step(ysmote_tuple, 'trigger_resample_labels', fold=fold, t=t)
        self.pickle_time_step(Xsmote, 'trigger_resample_features', fold=fold, t=t)
    else:
        self.trigger_resample_labels[fold][t] = ysmote_tuple
        self.trigger_resample_features[fold][t] = Xsmote

    end = time.time()
    if self.verbose:
        ptf('... %d s' % (end - start), self.logfile)

    return Xsmote, ysmote_tuple
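# The SMOTE object above is constructed from **self.oversample_arguments and
# exposes fit_transform, which matches an older, imbalanced-learn-style API
# (it may be a project-local implementation).  A sketch of the same
# oversampling step with the current imbalanced-learn interface:
import numpy as np
from imblearn.over_sampling import SMOTE

X = np.random.rand(100, 5)
y = np.array([0] * 90 + [1] * 10)                 # 9:1 class imbalance
Xsmote, ysmote = SMOTE(random_state=1).fit_resample(X, y)
print np.sum(ysmote == 0), np.sum(ysmote == 1)    # balanced: 90 90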
def _regress(self, X):
    start = time.time()
    number_of_spots = X.iloc[0].shape[1] - 1
    self.number_of_spots = number_of_spots
    coef_ = X.copy()
    scores_ = X.apply(lambda x: np.zeros(number_of_spots))

    for trial_index, x in enumerate(X):
        if trial_index % 100 == 0:
            if self.verbose:
                ptf('Featurizing trial %d' % trial_index, self.logfile)

        # regress coefficients are (poly order + 1) x (n_spots)
        coefficients = np.zeros((3, number_of_spots))
        scores = np.zeros(number_of_spots)

        # only fit data past the reference_time; other data is 0 from preprocessing
        t = x[:, 0]
        t = t[self.reference_time:]
        for column_index in np.arange(x.shape[1]):
            spot_index = column_index - 1
            if column_index == 0:
                continue
            popt, pcov = curve_fit(self.sigmoid, t,
                                   x[self.reference_time:, column_index],
                                   p0=self.p_init,
                                   ftol=self.ftol, xtol=self.xtol,
                                   gtol=self.gtol, maxfev=self.maxfev)
            coefficients[:, spot_index] = popt
            xpred = self.sigmoid(t, *popt)
            scores[spot_index] = r2_score(x[self.reference_time:, column_index], xpred)

        coef_.iloc[trial_index] = coefficients
        scores_.iloc[trial_index] = scores

    end = time.time()
    ptf('Regressed %d trials in %d seconds' % (len(X), (end - start)), self.logfile)
    return coef_, scores_
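# self.sigmoid and self.p_init are not defined in this section; the sketch
# below assumes a standard three-parameter logistic, which is one common
# choice for growth-curve fits like the one above:
import numpy as np
from scipy.optimize import curve_fit

def sigmoid(t, A, k, t0):
    # logistic curve: amplitude A, steepness k, midpoint t0
    return A / (1.0 + np.exp(-k * (t - t0)))

t = np.arange(0, 1200, 20, dtype=float)     # 20-minute frames, per load_spots_files
y = sigmoid(t, 500.0, 0.01, 600.0) + np.random.normal(0.0, 5.0, t.shape)
popt, pcov = curve_fit(sigmoid, t, y, p0=[400.0, 0.005, 500.0], maxfev=10000)
print popt  # fitted (A, k, t0), stored per spot in coefficients above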
def featurize_triggers(self, X, t):
    '''
    Extracts features for detection, gram, classification from pruned data
    using conditions passed to init.
    IN:
        SeriesModel
        X - pd dataframe - preprocessed trial data
        t - int - time index
    OUT:
        X - np_array - extracted features (ntrials X nfeatures) at this timestep
    '''
    start = time.time()
    number_of_times = t

    # featurize, storing featurizers at each timestep
    if self.verbose:
        ptf('> 1. Featurizing nt=%d ...' % number_of_times, self.logfile)

    X_train = self._subset_data(X, number_of_times)
    if self.debug:
        print t, X.iloc[0].shape

    (X_trigger, trigger_times), trigger_featurizer = self._featurize_class(
        X_train,
        self.detection_base_featurizer,
        self.detection_base_featurizer_arguments)
    if self.debug:
        print t, X_trigger.iloc[0].shape, trigger_times.iloc[0].shape

    # convert to numpy arrays
    np_X_trigger = self._pandas_to_numpy(X_trigger)
    np_trigger_times = self._pandas_to_numpy(trigger_times)
    if self.debug:
        print 'Checking featurized shapes', np_X_trigger.shape, np_trigger_times.shape

    # store features
    if not self.on_disk:
        self.trigger_features[t] = np_X_trigger
        self.trigger_feature_times[t] = np_trigger_times
        self.trigger_featurizers[t] = trigger_featurizer
    else:
        self.pickle_time_step(np_X_trigger, 'trigger_features', t)
        self.pickle_time_step(np_trigger_times, 'trigger_feature_times', t)
        self.pickle_time_step(trigger_featurizer, 'trigger_featurizer', t)

    # Append results to results df later, after scoring
    end = time.time()
    ptf('\n...(%s seconds) <' % (end - start), self.logfile)
    return np_X_trigger
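# _pandas_to_numpy is not shown in this section.  A common implementation for
# this data layout (a pd Series holding one feature array per trial) simply
# stacks the rows; this is an assumption, not the project's code:
import numpy as np
import pandas as pd

X_trigger = pd.Series([np.array([1.0, 2.0]), np.array([3.0, 4.0])])
np_X_trigger = np.vstack(X_trigger.values)   # (ntrials x nfeatures)
print np_X_trigger.shape                     # (2, 2)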
def _regress(self, X, model, poly):
    # timing of the regress step
    start = time.time()
    number_of_spots = X.iloc[0].shape[1] - 1
    self.number_of_spots = number_of_spots
    coef_ = X.copy()
    scores_ = X.apply(lambda x: np.zeros(number_of_spots))

    for trial_index, x in enumerate(X):
        if trial_index % 100 == 0:
            if self.verbose:
                ptf('Polynomial Featurizing trial %d' % trial_index, self.logfile)

        # regress coefficients are (poly order + 1) x (n_spots)
        coefficients = np.zeros(((self.n + 1), number_of_spots))
        scores = np.zeros(number_of_spots)

        # number of times is different for each observation
        # QUESTION - what about trials of different lengths?
        # -> maybe deal with this in the fit/predict steps?
        t = poly.fit_transform((x[:, 0]).reshape(-1, 1))
        # only regress on data past the reference time
        t = t[self.reference_time:]
        for column_index in np.arange(x.shape[1]):
            spot_index = column_index - 1
            if column_index == 0:
                continue
            # only fit data past the reference_time;
            # other data is 0 from preprocessing
            model.fit(t, x[self.reference_time:, column_index])
            coefficients[:, spot_index] = model.coef_
            scores[spot_index] = model.score(t, x[self.reference_time:, column_index])

        coef_.iloc[trial_index] = coefficients
        scores_.iloc[trial_index] = scores

    end = time.time()
    # ptf('Regressed %d trials, n=%d in %d seconds' % (len(X), self.n, (end - start)),
    #     self.logfile)
    if self.verbose:
        print 'PFR', coef_.iloc[0][:, 0], X.iloc[0].shape
    return coef_, scores_
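# Minimal sketch of the per-spot regression pattern above.  It assumes `poly`
# is an sklearn PolynomialFeatures(self.n) and `model` an sklearn
# LinearRegression with fit_intercept=False (so model.coef_ has self.n + 1
# entries, matching the coefficients array above); both objects are passed in,
# so their exact configuration is an assumption:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

t_raw = np.arange(0, 200, 20, dtype=float).reshape(-1, 1)
y = 3.0 + 0.5 * t_raw.ravel() + 0.01 * t_raw.ravel() ** 2
poly = PolynomialFeatures(2)
t = poly.fit_transform(t_raw)          # columns: [1, t, t^2]
model = LinearRegression(fit_intercept=False)
model.fit(t, y)
print model.coef_                      # ~[3.0, 0.5, 0.01]
print model.score(t, y)                # R^2, stored per spot in scores_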
def make_df(sm, y, runid, logfile=None):
    trial_hash = make_trial_hash(y)
    ch = make_ch(sm)

    # set-up data frames
    sm.logfile = logfile
    tf_collection = pd.DataFrame(columns=ch)
    tft_collection = pd.DataFrame(columns=ch)

    # append to dataframes
    index_list = []
    for t in sm.times:
        if t == 12:
            continue
        tf = sm.load_time_step('trigger_features', t=t)
        tft = sm.load_time_step('trigger_feature_times', t=t)

        # check for nans
        if np.sum(np.isnan(tf)):
            ptf('NANS in timestep %d for features' % t, logfile)
            ptf([i for i, k in enumerate(tf) if np.any(np.isnan(k))], logfile)
        if np.sum(np.isnan(tft)):
            ptf('NANS in timestep %d for feature_times' % t, logfile)
            ptf([i for i, k in enumerate(tft) if np.any(np.isnan(k))], logfile)

        df = pd.DataFrame(tf, columns=ch[:-2])
        df['timestep'] = t
        df['trial'] = df.index
        if len(index_list):
            df['trial'] = index_list
        else:
            df['trial'] = df['trial'].apply(lambda x: trial_hash[x])
        index_list = df['trial'].values
        tf_collection = tf_collection.append(df)

        df = pd.DataFrame(tft, columns=ch[:-2])
        df['timestep'] = t
        df['trial'] = index_list
        tft_collection = tft_collection.append(df)

    # drop nans
    tf_collection = tf_collection.dropna()
    tft_collection = tft_collection.dropna()

    return tf_collection, tft_collection
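# Portability note: DataFrame.append (used in the accumulation loop above) was
# removed in pandas 2.0.  A sketch of the equivalent pattern with pd.concat,
# using hypothetical timesteps:
import pandas as pd

frames = []
for t in [13, 14]:                     # hypothetical timesteps
    frames.append(pd.DataFrame({'timestep': [t, t], 'trial': ['a', 'b']}))
collection = pd.concat(frames, ignore_index=True)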
def main(RUNID='run001', START_DT_STR=None, MODELFILENAME='sm', PICKLE_DATA=False,
         DO_TESTS=False, PROFILE=False, verbose=False, debug=False, RELOAD=False,
         n_cpus=1, PICKLE_NAMES=['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl']):
    '''
    Runs our series model or triggered series model job based on the runtime
    conditions and run parameters.
    IN:
        RUNID - str - name of the folder where output will be stored and the
            name of the json (without extension) containing run parameters for
            seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile.
            Set in the header global params of capstone
        MODELFILENAME - str - filename of model (for pickling)
        PICKLE_DATA - bool - if the raw data should be pickled after loading
            into a data frame
        DO_TESTS - bool - if unittests should be run (True), or a job run (False)
        PROFILE - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output
        debug - bool - whether a full dataset should be used (False), or a
            smaller set of time points (True)
        RELOAD - bool - whether data should be loaded from pickle (False), or
            reloaded from raw data (True).  Set to True only for the first run
            on a new instance, then set to False for future runs to save load time.
        n_cpus - int - number of cpus to use for multiprocessing jobs
        PICKLE_NAMES - list of str - list of the X (features) dataframe,
            y (labels) data and spots_used file names.  When RELOAD is set to
            True, these are the filenames where this data will be saved.  When
            RELOAD is set to False, this is where the data will be loaded from.
    OUT: None
    '''
    RUNID = command_line_process(RUNID)

    # prepare to run job
    LOGFILENAME = 'log_%s_%s.txt' % (RUNID, START_DT_STR)
    LOGFILE = create_logfile(RUNID, LOGFILENAME)

    # get the run conditions for the runid from the json
    # NOTE: excludes verbose and debug flags - those are fit parameters -
    # and excludes runid since that is set up above
    with open((RUNID + '.json')) as f:
        run_params = json.load(f, object_hook=ascii_encode_dict)

    # to see if more ram is used for more cpus
    n_jobs = run_params['detection_model_arguments']['n_jobs']

    ### Unittests ###
    if DO_TESTS:
        start = time.time()
        ptf('\n>> Unpickling data ...\n', LOGFILE)
        X = my_unpickle(PICKLE_NAMES[0])
        y = my_unpickle(PICKLE_NAMES[1])
        used_column_headers = my_unpickle(PICKLE_NAMES[2])
        end = time.time()
        ptf('Data unpickled in %d seconds (%d total trials)' % ((end - start), len(X)),
            LOGFILE)
        tsm_unit = run_tsm_unittests(X, y, used_column_headers.values,
                                     verbose=verbose, logfile=LOGFILE)
        # sm_unit = run_unittests(X_test, y_test, verbose=False)
    else:
        # output run conditions to screen and logfile
        bigstart = time.time()

        # start memory profiling
        if PROFILE:
            tr, tr_sm = start_memory_profiling()

        if RUNTYPE == 'trigger':
            ptf('*** %s - TRIGGERED SERIES MODEL - ***' % RUNID, LOGFILE)
        elif RUNTYPE == 'series':
            ptf('*** %s - SERIES MODEL - ***' % RUNID, LOGFILE)

        print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR,
                       LOGFILE=LOGFILE, debug=debug, profile=PROFILE,
                       verbose=verbose, start=True)

        if RELOAD:
            X, y, used_column_headers, df, df_raw = reload_data(LOGFILE, PICKLE_DATA)
        else:
            start = time.time()
            ptf('\n>> Unpickling data ...\n', LOGFILE)
            X = my_unpickle(PICKLE_NAMES[0])
            y = my_unpickle(PICKLE_NAMES[1])
            used_column_headers = my_unpickle(PICKLE_NAMES[2])
            end = time.time()
            ptf('Data unpickled in %d seconds (%d total trials)' % ((end - start),
                                                                    len(X)), LOGFILE)

        run_params['logfile'] = LOGFILE
        run_params['runid'] = RUNID

        # create model
        if RUNTYPE == 'trigger':
            sm = TriggeredSeriesModel(used_column_headers.values, **run_params)
        elif RUNTYPE == 'series':
            sm = SeriesModel(**run_params)

        # Altogether now
        print('** DOING THE FIT **')
        sm.fit(X, y, verbose=verbose, debug=debug)

        bigend = time.time()
        ptf('====> %d seconds (%0.1f mins)' % ((bigend - bigstart),
                                               (bigend - bigstart) / 60.0), LOGFILE)
        print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR,
                       LOGFILE=LOGFILE, debug=debug, profile=PROFILE,
                       verbose=verbose, start=False)
        print_run_details(X, sm, LOGFILE)

        save_model(sm, RUNID, MODELFILENAME, LOGFILE=LOGFILE)

        ## VIEW RESULTS
        if RUNTYPE == 'trigger':
            make_trigger_plots(sm, y, RUNID, debug=debug)
        elif RUNTYPE == 'series':
            make_series_plots(sm)

        if PROFILE:
            print_memory_profiles(sm, tr, tr_sm, LOGFILE=None)

    LOGFILE.close()
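# Hypothetical shape of the '<RUNID>.json' run-parameters file read above.
# Only detection_model_arguments/n_jobs is referenced directly in main; the
# remaining keys are forwarded verbatim to SeriesModel/TriggeredSeriesModel
# as keyword arguments, so the key names below are illustrative, not the
# project's actual schema:
run_params_example = {
    "detection_model_arguments": {"n_jobs": 4},
    # ... model, featurizer, scaler, reducer settings, etc.
}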
def make_kde_plot(df, spot, runid, title=None, cmap='Greens', plotclass=None,
                  logfile=None, debug=False):
    plt.figure()
    ptf('Plot KDE %s - %s' % (title, spot), logfile)
    x, y = stack_rows(df, spot)
    ptf('%s, %s' % (x.shape, y.shape), logfile)
    ptf('Check for nans', logfile)
    ptf('%s, %s' % (np.sum(np.isnan(x)), np.sum(np.isnan(y))), logfile)

    ptf('computing kde...', logfile)
    sns.kdeplot(x, y, shade=True, cmap=cmap)

    plottitle = runid + '-' + spot + ' - KDE trigger vs t'
    if title:
        plottitle += ' - ' + title
    if plotclass:
        plottitle += ' - ' + plotclass
    plt.title(plottitle)
    plt.xlabel('t (hrs)')
    if title:
        plt.ylabel(title)
    else:
        plt.ylabel('trigger metric')

    filename = runid + '/' + runid + '-' + spot + ' - KDE trigger vs t'
    if title:
        filename += ' - ' + title
    if plotclass:
        filename += ' - ' + plotclass

    ptf('Saving plot %s' % filename, logfile)
    plt.savefig(filename, dpi=200)
    if debug:
        plt.show()
    else:
        plt.close()
def print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR, LOGFILE=None,
                   debug=False, profile=False, verbose=True, start=True):
    '''
    Outputs header, footer describing job conditions
    IN:
        run_params - dict - dictionary from the run-parameters json.  Contains
            the initialization conditions for the seriesmodel of this run.
        n_jobs - int - number of jobs to be used by parallelizable solvers in
            seriesmodel
        n_cpus - int - number of cpus available on this machine
        RUNID - str - name of the folder where output will be stored and the
            name of the json (without extension) containing run parameters for
            seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile.
            Set in the header global params of capstone
        debug - bool - whether a full dataset should be used (False), or a
            smaller set of time points (True).  Condition for main/seriesmodel.
        profile - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output.  Condition for
            main/seriesmodel.
        start - bool - whether this is the header (True) or footer (False) of
            the run output
    OUT: None
    '''
    if start:
        ptf('====> Starting job ID: %s_%s <====' % (RUNID, START_DT_STR), LOGFILE)
    else:
        ptf('====> Completed job ID: %s_%s <====' % (RUNID, START_DT_STR), LOGFILE)
    ptf('\tn_jobs: %d\tn_cpus: %d' % (n_jobs, n_cpus), LOGFILE)
    ptf('\tdebug: %s' % debug, LOGFILE)
    ptf('\tprofile: %s' % profile, LOGFILE)
    ptf('\tverbose: %s' % verbose, LOGFILE)
    for k, v in run_params.iteritems():
        ptf('\t%s: %s' % (k, v), LOGFILE)
def split_train_test(X, y, test_size=0.25, verbose=False, LOGFILE=None):
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, random_state=1, stratify=y['classification'].unique())
    sss = StratifiedShuffleSplit(y=y['classification'], n_iter=1,
                                 test_size=test_size, random_state=1)
    for train_index, test_index in sss:
        ptf(['Train: ', len(train_index)], LOGFILE)
        ptf(['Test: ', len(test_index)], LOGFILE)
        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

    if verbose:
        ptf('\nTEST summary:', LOGFILE)
        ptf(y_test.groupby('classification').count(), LOGFILE)
        ptf('\nTRAIN summary:', LOGFILE)
        ptf(y_train.groupby('classification').count(), LOGFILE)

    return X_train, X_test, y_train, y_test
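# split_train_test above uses the pre-0.18 scikit-learn API (y and n_iter
# passed to the constructor, and the splitter iterated directly).  A sketch of
# the equivalent split under scikit-learn >= 0.18, on toy frames:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

X = pd.DataFrame({'feature': np.arange(8)})
y = pd.DataFrame({'classification': ['a'] * 4 + ['b'] * 4})
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=1)
for train_index, test_index in sss.split(X, y['classification']):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]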
def print_memory_profiles(sm, tr, tr_sm, LOGFILE=None):
    '''
    Prints report on memory profiles
    IN:
        sm - SeriesModel - SeriesModel object for this run
        tr - SummaryTracker - SummaryTracker object for the whole run
        tr_sm - ClassTracker - ClassTracker object of SeriesModel
        LOGFILE - file obj - open logfile for print output
    OUT: None
    '''
    ptf('\nSERIESMODEL profiling', LOGFILE)
    ptf('Look at size of seriesmodel object', LOGFILE)
    ptf(asizeof.asizeof(sm), LOGFILE)
    ptf(asizeof.asized(sm, detail=1).format(), LOGFILE)

    ptf('Look at how the SeriesModel class is doing', LOGFILE)
    tr_sm.create_snapshot()
    # NOTE: pympler's print_summary/print_diff write to stdout only;
    # their output is not teed into LOGFILE here
    tr_sm.stats.print_summary()

    ptf('PROFILING', LOGFILE)
    ptf('Look at memory leaks up to this point', LOGFILE)
    tr.print_diff()
def fit(self, X, y, verbose=False, trigger_only=True, debug=False):
    self.trigger_only = trigger_only
    self.verbose = verbose
    self.debug = debug

    # start with the second time
    tmin = self.reference_time + self.min_time
    self.times = np.arange(tmin, self.max_time, 1)

    # i) SETUP #
    self.setup(X, y)

    # Check trial integrity
    self._check_trial_integrity()

    # 0) PREPROCESS All trials
    X_preprocessed = self.preprocess(X)
    X_pruned = self.prune_spots(X_preprocessed, self.trigger_spots,
                                self.column_headers)

    # check load state and run only needed times
    if self.load_state == 'featurize':
        ptf('\n>> 1. Computing triggers from timestep %d ...' % self.load_time,
            self.logfile)
        run_times = self.make_run_times(self.times, self.load_time)
    else:
        ptf('\n>> 1. Computing triggers from first timestep %d ...' % tmin,
            self.logfile)
        run_times = self.times

    if debug:
        self.times = [30, 40, 50]
        run_times = self.times

    for t in run_times:
        start = time.time()
        if self.verbose:
            ptf('\n\nTIMESTEP %d...' % t, self.logfile)

        # 1) trigger_featurize
        X_featurized = self.featurize_triggers(X_pruned, t)

        # results to accumulate for this timestep
        y_train_true_timestep = []
        y_train_predict_timestep = []
        y_test_true_timestep = []
        y_test_predict_timestep = []
        y_train_probabilities = []
        y_test_probabilities = []

        for i, (fold, fold_indexes) in enumerate(self.folds.iteritems()):
            (X_train, X_test) = self._subset_fold_triggers(X_featurized, fold)
            (y_train, y_test) = self._subset_fold_y(y, fold)

            # resample
            X_resampled, y_resampled = self.resample(X_train, y_train, t, fold)
            y_train_resampled = y_resampled

            # 1) scale and/or reduce the data
            if self.verbose:
                ptf('Scaling fold %d' % fold, self.logfile)
            X_scaled, scaler = self._scale_class(X_resampled,
                                                 self.detection_base_scaler,
                                                 self.detection_base_scaler_arguments)
            # apply the scaler fit on the training fold (do not re-fit on test)
            X_test_scaled = scaler.transform(X_test)

            # 1A) reduce
            if self.verbose:
                ptf('Reducing fold %d' % fold, self.logfile)
            X_reduced, reducer = self._reduce_class(X_scaled,
                                                    self.detection_base_reducer,
                                                    self.detection_base_reducer_arguments)
            X_test_reduced = reducer.transform(X_test_scaled)

            # 2) trigger_train
            if self.verbose:
                ptf('Training fold %d' % fold, self.logfile)
            model, train_predictions, train_probabilities = \
                self.trigger_train(X_reduced, y_train_resampled, fold, t)

            # 3) trigger_predict
            if self.verbose:
                ptf('Predicting fold %d' % fold, self.logfile)
            test_predictions, test_probabilities = \
                self.trigger_predict(model, X_test_reduced, fold, t)

            # 3A) store fold
            if self.verbose:
                ptf('Storing fold %d' % fold, self.logfile)
            self._trigger_store_one_fold(
                (train_predictions, train_probabilities),
                (test_predictions, test_probabilities),
                fold, t)

            # 3B) score one fold
            if self.verbose:
                ptf('Scoring fold %d' % fold, self.logfile)
            self._trigger_score_one_fold(y_train_resampled[0], train_predictions,
                                         train_probabilities, t,
                                         testtrain='train', fold=fold)
            self._trigger_score_one_fold(y_test[0], test_predictions,
                                         test_probabilities, t,
                                         testtrain='test', fold=fold)

            # stack probas
            if fold == 0:
                y_test_probabilities = test_probabilities
                y_train_probabilities = train_probabilities
            else:
                y_test_probabilities = np.vstack((y_test_probabilities,
                                                  test_probabilities))
                y_train_probabilities = np.vstack((y_train_probabilities,
                                                   train_probabilities))
            y_train_true_timestep.extend(y_train_resampled[0])
            y_train_predict_timestep.extend(train_predictions)
            y_test_true_timestep.extend(y_test[0])
            y_test_predict_timestep.extend(test_predictions)

        # 4) trigger_score
        if self.verbose:
            ptf('Scoring timestep %d' % t, self.logfile)
        self._trigger_score_one_fold(y_train_true_timestep,
                                     y_train_predict_timestep,
                                     y_train_probabilities, t, 'train')
        self._trigger_score_one_fold(y_test_true_timestep,
                                     y_test_predict_timestep,
                                     y_test_probabilities, t, 'test')

    # if self.trigger_only:
    #     return X_featurized
    # 4A) Write avg tau to file for each trial.  We will pass this into the
    #     triggered series model.
    ## UPDATES FOR V2 BEYOND THIS ##
    # 5) transform tau
    # for t in range(0, self.max_postdetection_time, 20):
    return X_featurized
def load_data(root_folder, csv_filename, verbose=False, LOGFILE=None):
    ptf('\n>> Loading csv...\n', LOGFILE)
    df_raw = pd.read_csv(csv_filename)

    # only work with "good" trials
    df_raw = df_raw[df_raw['Ignore'] != True]

    column_headers = create_column_headers()
    columns_to_drop = populate_columns_to_drop()

    start = time.time()
    ptf('\n>> Loading data files...\n', LOGFILE)
    df_raw = df_raw.apply(lambda x: load_spots_files(
        x, root_folder, column_headers, columns_to_drop, LOGFILE=LOGFILE), axis=1)
    end = time.time()
    ptf('Data loaded in %d seconds (%d total trials)' % ((end - start), len(df_raw)),
        LOGFILE)

    # DATA INSPECTION - finding values outside of 0, 4096
    # Reference_time must be greater than 1 if we use DII
    # see code and snippet below
    # All trials from the same day and at the same time (1: 20 minutes)
    # ==> instrumentation error at that time
    if verbose:
        start = time.time()
        ptf('Finding anomalous trials...', LOGFILE)
        an_df = find_data_anomalies(df_raw)
        end = time.time()
        ptf('Anomalous trials found in %d seconds (%d trials):' % ((end - start),
                                                                   len(an_df)), LOGFILE)
        ptf(an_df, LOGFILE)
    '''
    Finding anomalous trials...
    Anomalous trial found in 44 seconds (13 trials):
    372    ([1], 20120504\BCB\E. coli 25922 10 CFU\F1)
    373    ([1], 20120504\BCB\E. coli 25922 10 CFU\F16)
    374    ([1], 20120504\BCB\S. aureus 29213 10 CFU\F7)
    375    ([1], 20120504\BCB\S. aureus 29213 10 CFU\F12)
    376    ([1], 20120504\BCB\S. aureus 29213 10 CFU\F21)
    377    ([1], 20120504\BCB\S. maltophilia Clinical A\F4)
    378    ([1], 20120504\BCB\S. maltophilia Clinical A\F13)
    379    ([1], 20120504\BCB\S. maltophilia Clinical A\F18)
    380    ([1], 20120504\BCB\S. maltophilia Clinical A\F23)
    381    ([1], 20120504\BCB\S. maltophilia Clinical B\F5)
    382    ([1], 20120504\BCB\S. maltophilia Clinical B\F10)
    383    ([1], 20120504\BCB\S. maltophilia Clinical B\F15)
    384    ([1], 20120504\BCB\S. maltophilia Clinical B\F20)
    '''

    # re-order 'data' part of frame for convenience
    # currently exists as a data frame with named columns:
    #   time, 2R, 2G, 2B, ..., 79R, 79G, 79B
    # need to be able to manipulate data as numpy arrays;
    # keep column headers around for later use
    df, used_column_headers = prepare_data_frame(df_raw)

    ptf('Creating labels...', LOGFILE)
    label_dictionaries = create_labels_dictionaries()
    df = create_labels(df, label_dictionaries)

    # drop unwanted labels
    df = df[df['Ignore_label'] != True]

    X = df['data']
    y = df[['classification', 'gram', 'detection']]

    if verbose:
        ptf('\nSummary counts after cleaning:', LOGFILE)
        ptf(y.groupby('gram').count(), LOGFILE)
        ptf(y.groupby('detection').count(), LOGFILE)
        ptf(y.groupby('classification').count(), LOGFILE)

    return X, y, used_column_headers, df, df_raw
def _regress(self, X):
    start = time.time()
    number_of_spots = X.iloc[0].shape[1] - 1
    self.number_of_spots = number_of_spots
    Xp_ = X.copy()
    scores_ = X.apply(lambda x: np.zeros(number_of_spots))

    for trial_index, x in enumerate(X):
        if trial_index % 100 == 0:
            if self.verbose:
                ptf('Taking derivatives of trial %d' % trial_index, self.logfile)

        number_of_times = len(x)
        if self.maxmin:
            Xp = np.zeros((1, number_of_spots))
            if self.stacked:
                Xp = np.zeros((2, number_of_spots))
        else:
            Xp = np.zeros((number_of_times, number_of_spots))

        trigger_times = np.zeros(number_of_spots)
        if self.stacked:
            trigger_times = np.zeros((2, number_of_spots))

        for column_index in np.arange(x.shape[1]):
            spot_index = column_index - 1
            if column_index == 0:
                continue

            score = 0
            fp = x[:, column_index].reshape(-1, 1)

            if self.stacked:
                # want the first AND second derivative
                if self.order > 2:
                    print 'ERR - Not implemented for order >2'
                    return
                if not self.maxmin:
                    print 'ERR - only works with maxmin'
                    return
                fp, s = pade(fp, self.dx)
                if self.gauss:
                    fp = gaussian_filter1d(fp, self.sigma)
                fpp, s2 = pade(fp, self.dx)
                if self.gauss:
                    fpp = gaussian_filter1d(fpp, self.sigma)
                Xp[0, spot_index] = np.max(np.abs(fp))
                Xp[1, spot_index] = np.max(np.abs(fpp))
                trigger_times[0, spot_index] = x[np.argmax(np.abs(fp)), 0]
                trigger_times[1, spot_index] = x[np.argmax(np.abs(fpp)), 0]
            else:
                for dummy in range(self.order):
                    if np.sum(np.isnan(fp)):
                        print 'Incoming error T:%s, S:%s, O:%s' % (trial_index,
                                                                   spot_index, dummy)
                    fp, s = pade(fp, self.dx)
                    if np.sum(np.isnan(fp)):
                        print 'PADE error T:%s, S:%s, O:%s' % (trial_index,
                                                               spot_index, dummy)
                    if self.gauss:
                        fp = fp.T
                        fp = gaussian_filter1d(fp, self.sigma)
                        fp = fp.T
                        if np.sum(np.isnan(fp)):
                            print 'Gauss error T:%s, S:%s, O:%s' % (trial_index,
                                                                    spot_index, dummy)
                    # score += s

                # return the trigger time as the other feature instead of a score
                trigger_times[spot_index] = x[np.argmax(np.abs(fp)), 0]
                if self.maxmin:
                    Xp[0, spot_index] = np.max(np.abs(fp))
                else:
                    Xp[:, spot_index] = fp.flatten()
                    Xp[:self.reference_time, spot_index] = 0
                # scores[spot_index] = score

        Xp_.iloc[trial_index] = Xp
        scores_.iloc[trial_index] = trigger_times

    end = time.time()
    ptf('Regressed %d trials in %d seconds' % (len(X), (end - start)), self.logfile)
    return Xp_, scores_
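# pade() is a project-local derivative helper not shown in this section.  A
# rough analogue of the derive-then-smooth step using numpy/scipy primitives
# (np.gradient is a swapped-in stand-in, not the project's pade):
import numpy as np
from scipy.ndimage import gaussian_filter1d

dx = 20.0                                    # minutes between frames, per load_spots_files
signal = np.cumsum(np.random.rand(60))       # toy monotone-ish intensity trace
fp = np.gradient(signal, dx)                 # first derivative
fp = gaussian_filter1d(fp, sigma=2.0)        # smooth, as with self.gauss above
print np.max(np.abs(fp)), dx * np.argmax(np.abs(fp))  # trigger metric and time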