def test_extract_trial_features_real():
    """Smoke-test `extract_trial_features` on the two Insula RT csv files.

    Loads the fast/slow tables from ./data, extracts trial-level features
    over the window 0:10 and prints every feature family so the result can
    be inspected by eye.  Nothing is asserted.
    """
    data_files = [
        "./data/fh_pca_space_merge_Insula_rt_fast.csv",
        "./data/fh_pca_space_merge_Insula_rt_slow.csv",
    ]

    # Argument order of the loader is
    # load_dimreduce_data_fromcl(csvs, feature_cols, label_col, cv_col, trial_tr_col)
    feature_col, label_col, cv_col, trial_tr_col = range(0, 3), 5, 6, 7
    Xs, ys, indices, cvcodes = load_dimreduce_data_fromcl(
        data_files, feature_col, label_col, cv_col, trial_tr_col)

    X = np.concatenate(Xs)
    y = create_y(np.concatenate(ys))
    trials = np.concatenate(indices)

    # "start:stop" -> range(start, stop)
    windowstr = '0:10'
    bounds = [int(part) for part in windowstr.split(':')]
    window = range(*bounds)

    Xfea, indexfea, otherfea = extract_trial_features(
        X, trials, window, [trials], None)
    yea = otherfea[0]

    # Column layout printed below: Xmax, Xmin, Xdiff, Xmean, Xslope
    report = (
        ("Xmax\n{0}", slice(0, 3)),
        ("Xmin\n{0}", slice(3, 6)),
        ("Xdiff\n{0}", slice(6, 9)),
        ("Xmean\n{0}", slice(9, 12)),
        ("Xslope\n{0}", slice(12, 15)),
    )
    for template, cols in report:
        print(template.format(Xfea[:, cols]))
def simple(Xtrain, Xtest, labels_train, labels_test, clf, verbose):
    """Fit a classifier on one train/test split and predict the test set.

    Parameters
    ----------
    Xtrain - training data, converted with np.array
    Xtest - testing data, converted with np.array
    labels_train - class labels, one for each row in Xtrain
    labels_test - class labels, one for each row in Xtest
    clf - a sklearn classifier (that implements .fit() and .predict())
    verbose - Print out useful debugging/status info when true.
        Otherwise this function is silent.

    Returns
    -------
    truths - the test labels after conversion by create_y
    predictions - the classes clf predicted for the scaled test data
    """
    Xtest, Xtrain = np.array(Xtest), np.array(Xtrain)
    labels_train, labels_test = np.array(labels_train), np.array(labels_test)

    ## Labels as ints
    ytrain, ytest = create_y(labels_train), create_y(labels_test)

    if verbose:
        for header, labels in (("\tTrain labels:", labels_train),
                               ("\tTest labels:", labels_test)):
            print(header)
            print_label_counts(labels)

        print("\tShapes of Xtrain and ytrain: {0}, {1}".format(
            Xtrain.shape, ytrain.shape))
        print("\tNumber of Xtest and ytest: {0}, {1}".format(
            Xtest.shape, ytest.shape))

    # Train and test are each scaled independently before use.
    clf.fit(scale(Xtrain), ytrain)
    predictions = clf.predict(scale(Xtest))

    return ytest, predictions
def fit_transform(self, X, y, trial_index, window, tr):
    """Convert X into time-averaged trials and reduce each label's
    average-trial matrix separately.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array
        Sample labels for the data
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.
    window : int
        Trial length
    tr :
        Passed through, unchanged, to self.avgfn

    Return
    ------
    Xcs : a list of 2D arrays
        One reduced matrix per unique name returned by self.avgfn
    csnames : 1D array
        The names of the components matrices
    ti_cs : a list of 1D arrays
        For each entry of Xcs, a 0..n_row-1 counter standing in for
        the trial index

    Raises
    ------
    ValueError
        If self.mode is neither 'decompose' nor 'cluster'.
    """
    # Keep the top 20 percent of features, scored against integer labels.
    selector = fs.SelectPercentile(percentile=20)
    Xsel = selector.fit_transform(X, create_y(y))

    Xtrial, feature_names = self.avgfn(Xsel, y, trial_index, window, tr)
    unique_fn = sort_nanfirst(unique_nan(feature_names))

    # Split up by feature_names
    Xtrials = [Xtrial[:, feature_names == name] for name in unique_fn]

    # and decompose.
    if self.mode == 'decompose':
        reducer = self._ft
    elif self.mode == 'cluster':
        reducer = self._fp
    else:
        raise ValueError("mode not understood.")
    Xcs = [reducer(Xt) for Xt in Xtrials]

    ## In this case, Xcs[i] is only 1 trial long.
    ti_cs = [np.arange(xc.shape[0]) for xc in Xcs]

    return Xcs, unique_fn, ti_cs
def simpleCV(X, labels, cv, clf, verbose=True):
    """Cross-validated classification: fit and predict once per fold.

    Parameters
    ----------
    X - a 2d array, column oriented
    labels - a list of class labels, one for each row in X
    cv - an iterable of (train_index, test_index) pairs, e.g. a
        sklearn cross-validation object
    clf - a sklearn classifier (that implements .fit() and .predict())
    verbose - Print out useful debugging/status info (True).
        If False this function is silent.

    Returns
    -------
    truths - a list holding the correct classes of each test fold
    predictions - a list holding the predicted classes of each test fold
    """
    X = np.array(X)
    y = create_y(np.array(labels))  ## Labels as ints

    truths, predictions = [], []
    for train_index, test_index in cv:
        # ----
        # Partition the data
        Xtrain, Xtest = X[train_index, :], X[test_index, :]
        ytrain, ytest = y[train_index], y[test_index]

        if verbose:
            print("Next fold:")
            print("\tShapes of Xtrain and Xtest: {0}, {1}".format(
                Xtrain.shape, Xtest.shape))
            print("\tNumber of ytrain and ytest: {0}, {1}".format(
                ytrain.shape, ytest.shape))

        # ----
        # Class!  (train and test folds are each scaled on their own)
        clf.fit(scale(Xtrain), ytrain)
        fold_predictions = clf.predict(scale(Xtest))

        truths.append(ytest)
        predictions.append(fold_predictions)

    return truths, predictions
def _data(csvs, feature_index, window, label_col, trial_tr_col):
    """Load the csv tables and reduce them to trial-level features.

    Returns the pair (X, y): the feature matrix produced by
    `extract_trial_features` and the matching labels.
    """
    # label_col is handed over twice; the second copy comes back as
    # the cv codes, which are thrown away at the end.
    Xs, ys, indices, cvcodes = load_dimreduce_data_fromcl(
        csvs, feature_index, label_col, label_col, trial_tr_col)

    stacked_X = np.concatenate(Xs)
    coded_y = create_y(np.concatenate(ys))
    trial_ids = np.concatenate(indices)
    cv_ids = np.concatenate(cvcodes)

    features, _, othermeta = extract_trial_features(
        stacked_X, trial_ids, window, [coded_y, cv_ids], None)

    ## toss cvcode, not applicable
    trial_y, _ = othermeta

    return features, trial_y
def fit_transform(self, X, y, trial_index, window, tr):
    """Select features, reduce X in one pass, then split the result by label.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to reduce
    y : 1D array
        Sample labels for the data
    trial_index : 1D array (n_sample, )
        Trial membership of every sample; split alongside the reduced data
    window :
        Not used by this implementation
    tr :
        Not used by this implementation

    Return
    ------
    Xcs : a list of 2D arrays
        The rows of the reduced matrix belonging to each unique label
    unique_y : 1D array
        The unique labels, as ordered by sort_nanfirst
    ti_cs : a list of 1D arrays
        The entries of trial_index belonging to each unique label

    Raises
    ------
    ValueError
        If self.mode is neither 'decompose' nor 'cluster'.
    """
    # Keep the top 25 percent of features, scored against integer labels.
    selector = fs.SelectPercentile(percentile=25)
    Xsel = selector.fit_transform(X, create_y(y))

    # (A leftover ipdb breakpoint used to sit here; it stopped every call
    # and required ipdb to be installed.)
    if self.mode == 'decompose':
        Xc = self._ft(Xsel)
    elif self.mode == 'cluster':
        Xc = self._fp(Xsel)
    else:
        raise ValueError("mode not understood.")

    unique_y = sort_nanfirst(unique_nan(y))

    # One boolean row mask per label, shared by the data and the index.
    # NOTE(review): a float np.nan label never equals itself, so its mask
    # would be empty - confirm baseline labels arrive as the string 'nan'.
    masks = [y == uy for uy in unique_y]
    Xcs = [Xc[mask, :] for mask in masks]
    ti_cs = [trial_index[mask] for mask in masks]

    return Xcs, unique_y, ti_cs
# Parse the command line and create a fixed-seed random state.
args = parser.parse_args()
prng = np.random.RandomState(42)

# ----
# Load and preprocess data
# args.data is a "start:stop[:step]" string naming the feature columns.
feature_index = range(*[int(i) for i in args.data.split(':')])

csvs = args.t
# load_dimreduce_data_fromcl(csvs, feature_cols, label_col, cv_col, trial_tr_col)
Xs, ys, indices, cvcodes = load_dimreduce_data_fromcl(
        csvs, feature_index, args.labels,
        args.cv, args.trial_tr
        )

# Stack the per-file results; labels pass through create_y.
X = np.concatenate(Xs)
y = create_y(np.concatenate(ys))
index = np.concatenate(indices)
cvcode = np.concatenate(cvcodes)

# args.window uses the same "start:stop[:step]" format as args.data.
window = range(*[int(i) for i in args.window.split(':')])

# y and cvcode ride along as metadata and come back in othermeta,
# in the order they were given.
X, index, othermeta = extract_trial_features(
        X, index, window, [y, cvcode], None
        )
y, cvcode = othermeta

assert X.shape[0] == y.shape[0], "X and y length mismatch"

# ----
# Setup CV:
#
# Kfold splitting by label or by custom label
# NOTE(review): this fragment is cut at both ends in the reviewed view;
# `dftmp`, `X`, `y`, `index` and `feature_index` are set up by code that
# is not shown here.
# NOTE(review): `.ix` (removed in pandas 1.0) and `np.str` (removed in
# NumPy 1.24) only exist in older releases - confirm the pinned versions.

# First table: start the accumulators
if (X is None) and (y is None):
    X = np.asarray(dftmp.ix[:,feature_index])
    y = np.asarray(dftmp.ix[:,args.labels], dtype=np.str)
    index = np.asarray(dftmp.ix[:,args.index])
## Otherwise stack
else:
    X = np.vstack([X, np.asarray(dftmp.ix[:,feature_index])])
    y = np.concatenate([y, np.asarray(dftmp.ix[:,args.labels], dtype=np.str)])
    index = np.concatenate([index, np.asarray(dftmp.ix[:,args.index])])

# ----
# Preprocess
# ----
# Convert y to integer codes
y = create_y(y)

# Sane so far?
assert checkX(X)
assert X.shape[0] == y.shape[0], "X and y length mismatch"

# Use trial chunks of labels to CV?
# When the label column doubles as the index column, use a plain 5-fold
# split over the samples.
# NOTE(review): this KFold signature (n, n_folds=, indices=) looks like the
# old sklearn.cross_validation API - confirm against the file's imports.
if args.labels == args.index:
    cv = KFold(y.shape[0], n_folds=5, indices=True)
else:
    # ----
    # To split by trial
    # Convert the trial index
    # to trial/chunk counter
    #chunks = []
    #cnt = -1
def fir(X, y, trial_index, window, tr):
    """Average trials for each feature in X, using Burock's (2000) method.

    Every column of X is regressed on the design matrix built from y
    (least squares via the pseudo-inverse); the coefficients, reshaped
    to one row of length `window` per label, are the average trials.

    Parameters
    ----------
    X : 2D array (n_sample, n_feature)
        The data to decompose.  Each column is rescaled to [0, 1] first.
    y : 1D array
        Sample labels for the data.  In y, np.nan and 'nan' values are
        treated as baseline labels.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.  Not used by this
        implementation.
    window : int
        Trial length
    tr :
        Not used by this implementation.

    Return
    ------
    Xfir : a 2D array (window, n_feature * n_cond)
        The average trials, where n_cond is the number of label names
        left after dropping the first (nan/baseline) one.
    fir_names : 1D array (n_feature * n_cond, )
        The label name belonging to each column of Xfir.
    """
    # Norm then pad.
    # The builtin float replaces the np.float alias, which NumPy 1.24
    # removed; the two were always the same type.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X.astype(float))
    # NOTE(review): the `window` extra rows of ones presumably line up with
    # rows _create_dm adds for its dummy column - confirm against _create_dm.
    X = np.vstack([X, np.ones((window, X.shape[1]), dtype=float)])

    # Save the org y names
    ynames = sorted(np.unique(y))
    ynames = unique_sorted_with_nan(ynames)
    cond_names = ynames[1:]  ## Drop nan/baseline

    # y becomes integers
    y = create_y(y)

    # Make the design matrix.
    dm = np.matrix(_create_dm(y, window))

    # The least-squares operator depends only on dm, so build it once
    # instead of once per feature.
    solver = np.linalg.pinv(dm.T * dm) * dm.T

    # FIR!
    fir_names = []
    firs = []
    for j in range(X.shape[1]):
        x = np.matrix(X[:, j])
        coefs = np.array(solver * x.T)[0:-1]  ## Drop dummy
        firs.append(coefs.reshape(len(cond_names), window))
        fir_names.extend(cond_names)

    Xfir = np.vstack(firs).transpose()
    fir_names = np.asarray(fir_names)

    assert checkX(Xfir)
    assert Xfir.shape[0] == window, "After FIR rows not equal to window"
    assert Xfir.shape[1] == (len(cond_names) * X.shape[1]), (
        "After FIR wrong number of features")
    assert fir_names.shape[0] == Xfir.shape[1], (
        "fir_names and Xfir don't match")

    return Xfir, fir_names
def make_bold(cond, index, wheelerdata, cond_to_rt, filtfile=None,
              TR=2, trname="TR", n_features=10, n_univariate=None,
              n_accumulator=None, n_decision=None, n_noise=None,
              drift_noise=False, step_noise=False, z_noise=False,
              drift_noise_param=None, step_noise_param=None,
              z_noise_param=None, noise_f=white, hrf_f=None,
              hrf_params=None, prng=None):
    """Make BOLD timecourse features based on Wheelerdata

    Parameters
    ---------
    cond : str
        A condition name found in the wheelerdata objects metadata
    index : str
        A name of a trial index found in the wheelerdata object metadata
    wheelerdata : object, instance of Wheelerdata
        A Wheelerdata object
    cond_to_rt: dict
        A map of cond (key) to reaction time (item, (int, float))
    filtfile : str, None
        A name of json file designed for reprocessing Wheelerdata metadata
    TR : float, int
        The repetition time of the experiment
    trname : str
        The name of the index of TRs in the metadata
    n_features : int
        The number of features in total (other n_* arguments
        must sum to this value)
    n_univariate : int
        The number of univariate (boxcar) features.  None means
        whatever is left of n_features after the other n_* counts.
    n_accumulator : int
        The number of accumulator features (None means 0)
    n_decision : int
        The number of decision features (None means 0)
    n_noise : int
        The number of noise features (None means 0)
    drift_noise : boolean, optional
        Add noise to the drift rate of the accumulator features
    step_noise : boolean, optional
        Add Noise to each step accumulator features
    z_noise : boolean, optional
        Add noise to the start value of accumulator features
    drift_noise_param : None or dict, optional
        Parameters for drift_noise which is drawn from a
        Gaussian distribution. None defaults to:
        `{"loc": 0, "scale" : 0.5}`
    step_noise_param : None or dict, optional
        Parameters for step_noise which is drawn from a
        Gaussian distribution. None defaults to:
        `{"loc" : 0, "scale" : 0.2, "size" : 1}`
    z_noise_param : None or dict, optional
        Parameters for z_noise which is drawn from the uniform
        distribution. None defaults to:
        `{"low" : 0.01, "high" : 0.5, "size" : 1}`
    noise_f : function, optional
        Produces noise, must have signatures like
        `noise, prng = f(N, prng)`
    hrf_f : function, optional
        Returns a haemodynamic response, signature hrf_f(**hrf_params)
    hrf_params : dict
        Keyword parameters for hrf_f
    prng : None or RandomState object
        Allows for independent random draws, used for all
        random sampling

    Returns
    -------
    Xs : list of 2D arrays (n_sample, n_features)
        The simulated features, one array per metadata file
    ys : list
        The create_y() coded condition labels, one entry per metadata file
    yindices : list
        The trial index column, one entry per metadata file

    Raises
    ------
    ValueError
        If the n_* counts do not sum to n_features.
    KeyError
        If a condition found in the metadata is missing from cond_to_rt.
    """

    # ----
    # Feature composition
    if n_noise is None:
        n_noise = 0
    if n_accumulator is None:
        n_accumulator = 0
    if n_decision is None:
        n_decision = 0
    if n_univariate is None:
        n_univariate = (n_features - n_noise - n_accumulator - n_decision)

    if (n_features - n_univariate - n_accumulator - n_noise
            - n_decision) != 0:
        raise ValueError("The number of features don't add up.")

    # Load wheelerdata
    metas = wheelerdata.get_RT_metadata_paths()

    # Get to work simulating
    Xs, ys, yindices = [], [], []
    for meta in metas:
        # Get data, preprocess too,
        data = csv_to_targets(meta)
        data = tr_pad_targets(data, trname, data[trname].shape[0], pad=np.nan)

        if filtfile is not None:
            data = reprocess_targets(filtfile, data, np.nan,
                                     ("TR", "trialcount"))

        # Check cond_to_rt
        for c in unique_nan(data[cond]):
            try:
                cond_to_rt[c]
            except KeyError:
                raise KeyError("{0} not present in cond_to_rt".format(c))

        # use cond to create y
        y = create_y(data[cond])
        yindex = data[index]

        # make accumulator and decision traces
        if n_accumulator > 0:
            data["accumulator"] = _make_accumulator_array(
                y, yindex, cond_to_rt, drift_noise, step_noise, z_noise,
                drift_noise_param, step_noise_param, z_noise_param,
                prng=prng)
        if n_decision > 0:
            data["decision"] = _make_decision_array(y, yindex, cond_to_rt)

        # Populate Xmeta
        boldsim = Reproduce(y, data, noise_f=noise_f,
                            hrf_f=hrf_f, hrf_params=hrf_params,
                            TR=TR, prng=prng)
        boldsim.create_dm_from_y(convolve=False)

        n_sample_feature = boldsim.dm.shape[0]
        Xmeta = np.zeros((n_sample_feature, n_features))

        # Columns are filled in consecutive groups; each group starts
        # where the previous one stopped.

        # 1. univariate features
        start = 0
        stop = n_univariate
        for j in range(start, stop):
            # Boxcar: sum every dm column but the first
            boldsim.create_bold(np.sum(boldsim.dm[:, 1:], axis=1),
                                convolve=True)
            Xmeta[:, j] = boldsim.bold

        # 2. accumulator features
        start = stop
        stop = start + n_accumulator
        for j in range(start, stop):
            boldsim.create_bold(data["accumulator"], convolve=True)
            Xmeta[:, j] = boldsim.bold

        # 3. decision features
        start = stop
        stop = start + n_decision
        for j in range(start, stop):
            boldsim.create_bold(data["decision"], convolve=True)
            Xmeta[:, j] = boldsim.bold

        # 4. noise features:
        start = stop
        stop = start + n_noise
        for j in range(start, stop):
            # Drop baseline from noise
            # NOTE(review): `rand` is called without `prng`, so these
            # draws may not follow the supplied random state - confirm
            # where `rand` comes from.
            randbold = rand(boldsim.dm.shape[0])
            randbold[boldsim.y == 0] = 0.0
            boldsim.create_bold(randbold, convolve=True)
            Xmeta[:, j] = boldsim.bold

        Xs.append(Xmeta)
        ys.append(y)
        yindices.append(yindex)

    return Xs, ys, yindices
def fir(X, y, trial_index, window, tr):
    """Average trials for each feature in X, using Burock's (2000) method.

    Every column of X is regressed on the design matrix built from y
    (least squares via the pseudo-inverse); the coefficients, reshaped
    to one row of length `window` per label, are the average trials.

    Parameters
    ----------
    X : 2D array (n_sample, n_feature)
        The data to decompose.  Each column is rescaled to [0, 1] first.
    y : 1D array
        Sample labels for the data.  In y, np.nan and 'nan' values are
        treated as baseline labels.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.  Not used by this
        implementation.
    window : int
        Trial length
    tr :
        Not used by this implementation.

    Return
    ------
    Xfir : a 2D array (window, n_feature * n_cond)
        The average trials, where n_cond is the number of label names
        left after dropping the first (nan/baseline) one.
    fir_names : 1D array (n_feature * n_cond, )
        The label name belonging to each column of Xfir.
    """
    # Norm then pad.
    # The builtin float replaces the np.float alias, which NumPy 1.24
    # removed; the two were always the same type.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X.astype(float))
    # NOTE(review): the `window` extra rows of ones presumably line up with
    # rows _create_dm adds for its dummy column - confirm against _create_dm.
    X = np.vstack([X, np.ones((window, X.shape[1]), dtype=float)])

    # Save the org y names
    ynames = sorted(np.unique(y))
    ynames = unique_sorted_with_nan(ynames)
    cond_names = ynames[1:]  ## Drop nan/baseline

    # y becomes integers
    y = create_y(y)

    # Make the design matrix.
    dm = np.matrix(_create_dm(y, window))

    # The least-squares operator depends only on dm, so build it once
    # instead of once per feature.
    solver = np.linalg.pinv(dm.T * dm) * dm.T

    # FIR!
    fir_names = []
    firs = []
    for j in range(X.shape[1]):
        x = np.matrix(X[:, j])
        coefs = np.array(solver * x.T)[0:-1]  ## Drop dummy
        firs.append(coefs.reshape(len(cond_names), window))
        fir_names.extend(cond_names)

    Xfir = np.vstack(firs).transpose()
    fir_names = np.asarray(fir_names)

    assert checkX(Xfir)
    assert Xfir.shape[0] == window, "After FIR rows not equal to window"
    assert Xfir.shape[1] == (len(cond_names) * X.shape[1]), (
        "After FIR wrong number of features")
    assert fir_names.shape[0] == Xfir.shape[1], (
        "fir_names and Xfir don't match")

    return Xfir, fir_names