Example #1
# numpy is assumed below; create_y, extract_trial_features, and
# load_dimreduce_data_fromcl are this project's own helpers.
import numpy as np

def test_extract_trial_features_real():
    csvs = [
            "./data/fh_pca_space_merge_Insula_rt_fast.csv",
            "./data/fh_pca_space_merge_Insula_rt_slow.csv"
            ]

    # load_dimreduce_data_fromcl(csvs, feature_cols, label_col, cv_col, trial_tr_col)
    feature_col = range(0,3)
    label_col = 5
    cv_col = 6
    trial_tr_col = 7
    Xs, ys, indices, cvcodes = load_dimreduce_data_fromcl(
            csvs, feature_col, label_col, cv_col, trial_tr_col
            )
    
    X = np.concatenate(Xs)
    y = create_y(np.concatenate(ys)) 
    trials = np.concatenate(indices)
    windowstr = '0:10'
    window = range(*[int(i) for i in windowstr.split(':')])
        ## "0:10" -> range(0, 10)

    Xfea, indexfea, otherfea = extract_trial_features(
            X, trials, window, [trials], None
            )
    trialsfea = otherfea[0]  ## the windowed copy of trials
    
    # Xmax, Xmin, Xdiff, Xmean, Xslope
    print("Xmax\n{0}".format(Xfea[:,0:3]))
    print("Xmin\n{0}".format(Xfea[:,3:6]))
    print("Xdiff\n{0}".format(Xfea[:,6:9]))
    print("Xmean\n{0}".format(Xfea[:,9:12]))
    print("Xslope\n{0}".format(Xfea[:,12:15]))
Example #2
def simple(Xtrain, Xtest, labels_train, labels_test, clf, verbose=True):
    """Run a very simple classification exp. 
    
    Parameters
    ----------
    X - a 2d array, column oriented
    labels - a list of class labels, one for each row in X
    clf - a sklearn classifier (that implements .fit() and 
        .predict())
    verbose - Print out useful debugging/status info (True).  If False
        this function is silent.

    Returns 
    -------
    truths - a list of correct classes for each test set
    predictions - a list of the predicted classes for each test set.
    """

    Xtest = np.array(Xtest)
    Xtrain = np.array(Xtrain)
    
    labels_train = np.array(labels_train)
    labels_test = np.array(labels_test)    
    
    ytrain = create_y(labels_train)
    ytest = create_y(labels_test)
        ## Labels as ints
    
    if verbose:
        print("\tTrain labels:")
        print_label_counts(labels_train)
        print("\tTest labels:")
        print_label_counts(labels_test)
        
        print("\tShapes of Xtrain and ytrain: {0}, {1}".format(
                Xtrain.shape, ytrain.shape))
        print("\tNumber of Xtest and ytest: {0}, {1}".format(
                Xtest.shape, ytest.shape))
    
    clf.fit(scale(Xtrain), ytrain)

    predictions = clf.predict(scale(Xtest))
    truths = ytest
    
    return truths, predictions
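
A minimal usage sketch of simple (the data and classifier are hypothetical; assumes numpy, sklearn, and the helpers used above are importable):

import numpy as np
from sklearn.linear_model import LogisticRegression

Xtrain = np.random.rand(20, 3)
Xtest = np.random.rand(10, 3)
labels_train = ["left"] * 10 + ["right"] * 10
labels_test = ["left"] * 5 + ["right"] * 5

truths, predictions = simple(
        Xtrain, Xtest, labels_train, labels_test,
        LogisticRegression(), verbose=False
        )
accuracy = np.mean(truths == predictions)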
Example #3
    def fit_transform(self, X, y, trial_index, window, tr):
        """Converts X into time-avearage trials and decomposes  that
        matrix, possibly several times depending on y.

        Parameters
        ----------
        X : 2D array-like (n_sample, n_feature)
            The data to decompose

        y : 1D array, None by default
            Sample labels for the data

        trial_index : 1D array (n_sample, )
            Each unique entry should match a trial.

        window : int 
            Trial length

        norm : True
            A dummy argument

        Return
        ------
        Xcs : a list of 2D arrays (n_sample, n_components)
            The components for each unique y.

        csnames : 1D array
            The names of the components matrices
        """

        selector = fs.SelectPercentile(percentile=20)
        Xsel = selector.fit_transform(X, create_y(y))
        
        Xtrials = []
        Xcs = []
    
        Xtrial, feature_names = self.avgfn(Xsel, y, trial_index, window, tr)
        unique_fn = sort_nanfirst(unique_nan(feature_names))

        # Split up by feature_names
        for yi in unique_fn:
            Xtrials.append(Xtrial[:, feature_names == yi])
            
        # and decompose.
        if self.mode == 'decompose':
            Xcs = [self._ft(Xt) for Xt in Xtrials]
        elif self.mode == 'cluster':
            Xcs = [self._fp(Xt) for Xt in Xtrials]
        else:
            raise ValueError("mode not understood.")

        ti_cs = [np.arange(xc.shape[0]) for xc in Xcs]
            ## In this case, Xcs[i] is only 1 trial long.
    
        return Xcs, unique_fn, ti_cs
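
The "split up by feature_names" step above is plain boolean column masking. A tiny standalone illustration (hypothetical values, numpy only):

import numpy as np

Xtrial = np.arange(12).reshape(3, 4)
feature_names = np.array(["a", "a", "b", "b"])

# One sub-matrix per unique name: first the "a" columns, then the "b" columns.
Xtrials = [Xtrial[:, feature_names == fn] for fn in np.unique(feature_names)]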
Example #4
def simpleCV(X, labels, cv, clf, verbose=True):
    """Run a simple CV based classification exp. 
    
    Parameters
    ----------
    X - a 2d array, column oriented
    labels - a list of class labels, one for each row in X
    cv - a sklearn cross-validation object
    clf - a sklearn classifier (that implements .fit() and 
        .predict())
    verbose - Print out useful debugging/status info (True).  If False
        this function is silent.

    Returns 
    -------
    truths - a list of correct classes for each test set
    predictions - a list of the predicted classes for each test set.
    """
    
    X = np.array(X)
    labels = np.array(labels)
    
    y = create_y(labels)
        ## Labels as ints
    
    truths = []
    predictions = []
    for train_index, test_index in cv:
        # ----
        # Partition the data
        Xtrain = X[train_index,:]
        Xtest = X[test_index,:]        
        ytrain = y[train_index]
        ytest = y[test_index]
        
        if verbose:
            print("Next fold:")
            print("\tShapes of Xtrain and Xtest: {0}, {1}".format(
                    Xtrain.shape, Xtest.shape))
            print("\tNumber of ytrain and ytest: {0}, {1}".format(
                    ytrain.shape, ytest.shape))

        # ----
        # Classify!
        clf.fit(scale(Xtrain), ytrain)

        truths.append(ytest)
        predictions.append(clf.predict(scale(Xtest)))
    
    return truths, predictions
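
A minimal usage sketch, assuming the pre-0.18 sklearn API these examples use elsewhere (KFold(n, n_folds=...) iterates as (train, test) index pairs); the data are hypothetical:

import numpy as np
from sklearn.cross_validation import KFold
from sklearn.svm import LinearSVC

X = np.random.rand(30, 3)
labels = np.repeat(["left", "right"], 15)
cv = KFold(len(labels), n_folds=5)

truths, predictions = simpleCV(X, labels, cv, LinearSVC(), verbose=False)

# Overall accuracy, pooled across the 5 folds
accuracy = np.mean(np.concatenate(truths) == np.concatenate(predictions))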
Example #5
def _data(csvs, feature_index, window, label_col, trial_tr_col):
    """Data loading and feature selection helper."""
    # label_col is passed twice: it doubles as the cv column, and the
    # resulting cvcode is discarded below.
    Xs, ys, indices, cvcodes = load_dimreduce_data_fromcl(
            csvs, feature_index, label_col,
            label_col, trial_tr_col
            )

    X = np.concatenate(Xs)
    y = create_y(np.concatenate(ys)) 
    index = np.concatenate(indices)
    cvcode = np.concatenate(cvcodes)

    X, index, othermeta = extract_trial_features(X, index,
            window, [y, cvcode], None
            )
    y, _ = othermeta  ## toss cvcode, not applicable
        
    return X, y
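
A hypothetical call, reusing the paths and column indices from Example #1 (both are illustrative only):

X, y = _data(
        ["./data/fh_pca_space_merge_Insula_rt_fast.csv",
         "./data/fh_pca_space_merge_Insula_rt_slow.csv"],
        feature_index=range(0, 3),
        window=range(0, 10),
        label_col=5,
        trial_tr_col=7,
        )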
Example #6
    def fit_transform(self, X, y, trial_index, window, tr):
        selector = fs.SelectPercentile(percentile=25)
        Xsel = selector.fit_transform(X, create_y(y))

        if self.mode == 'decompose':
            Xc = self._ft(Xsel)
        elif self.mode == 'cluster':
            Xc = self._fp(Xsel)
        else:
            raise ValueError("mode not understood.")

        # Split the decomposed data and the trial index by unique y.
        unique_y = sort_nanfirst(unique_nan(y))
        Xcs = [Xc[y == uy, :] for uy in unique_y]
        ti_cs = [trial_index[y == uy] for uy in unique_y]

        return Xcs, unique_y, ti_cs
Example #7
args = parser.parse_args()
prng = np.random.RandomState(42)


# ----
# Load and preprocess data
feature_index = range(*[int(i) for i in args.data.split(':')])

csvs = args.t
# load_dimreduce_data_fromcl(csvs, feature_cols, label_col, cv_col, trial_tr_col)
Xs, ys, indices, cvcodes = load_dimreduce_data_fromcl(
        csvs, feature_index, args.labels, 
        args.cv, args.trial_tr
        )
X = np.concatenate(Xs)
y = create_y(np.concatenate(ys)) 
index = np.concatenate(indices)
cvcode = np.concatenate(cvcodes)

window = range(*[int(i) for i in args.window.split(':')])
X, index, othermeta = extract_trial_features(
        X, index,  window, [y, cvcode], None
        )
y, cvcode = othermeta

assert X.shape[0] == y.shape[0], "X and y length mismatch"

# ----
# Setup CV: 
#
# Kfold splitting by label or by custom label
Example #8
    if (X is None) and (y is None):
        X = np.asarray(dftmp.ix[:,feature_index])
        y = np.asarray(dftmp.ix[:,args.labels], dtype=np.str)
        index = np.asarray(dftmp.ix[:,args.index])
    ## Otherwise stack
    else:
        X = np.vstack([X, np.asarray(dftmp.ix[:,feature_index])])
        y = np.concatenate([y, np.asarray(dftmp.ix[:,args.labels], 
                dtype=np.str)])
        index = np.concatenate([index, np.asarray(dftmp.ix[:,args.index])])

# ----
# Preprocess
# ----
# Convert y to integer codes
y = create_y(y)

# Sane so far?
assert checkX(X)
assert X.shape[0] == y.shape[0], "X and y length mismatch"

# Use trial chunks of labels for CV?
if args.labels == args.index:
    cv = KFold(y.shape[0], n_folds=5, indices=True)
else:
    # ----
    # To split by trial
    # Convert the trial index 
    # to trial/chunk counter
    #chunks = []
    #cnt = -1
Example #9
def fir(X, y, trial_index, window, tr):
    """ Average trials for each feature in X, using Burock's 
    (2000) method.
    
    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array, None by default
         Sample labels for the data. In y, np.nan and 'nan' values 
         are treated as baseline labels.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.
    window : int 
        Trial length
    tr : (unused)
        Accepted for interface compatibility; not used here

    Returns
    -------
    Xfir : a 2D array (window, n_feature * n_unique_y)
        The average trials
    feature_names : 1D array
        A condition name for each column of Xfir
    """

    # Norm then pad.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X.astype(np.float))
    X = np.vstack([X, np.ones((window, X.shape[1]), dtype=np.float)])

    # Save the org y names
    ynames = sorted(np.unique(y))
    ynames = unique_sorted_with_nan(ynames)

    # y becomes integers
    y = create_y(y)

    # Make the design matrix.
    dm = _create_dm(y, window)
    dm = np.matrix(dm)

    # FIR!
    fir_names = []
    firs = []
    for j in range(X.shape[1]):
        x = np.matrix(X[:, j])
        fir = np.array(np.linalg.pinv(dm.T * dm) * dm.T * x.T)[0:-1]
        ## Drop dummy
        fir = fir.reshape(len(ynames) - 1, window)

        firs.append(fir)
        fir_names.extend(ynames[1:])  ## Drop nan/baseline

    Xfir = np.vstack(firs).transpose()
    fir_names = np.asarray(fir_names)

    assert checkX(Xfir)
    assert Xfir.shape[0] == window, "After FIR, rows not equal to window"
    assert Xfir.shape[1] == (len(ynames[1:]) *
                             X.shape[1]), ("After FIR, "
                                           "wrong number of features")
    assert fir_names.shape[0] == Xfir.shape[1], ("fir_names and Xfir "
                                                 "don't match")

    return Xfir, fir_names
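
The estimation inside the loop above is ordinary least squares, beta = pinv(dm.T * dm) * dm.T * x, with one design-matrix column per within-trial time point plus a dummy baseline column. A standalone sketch of just that step (toy onsets and a noise-free response; numpy only):

import numpy as np

window = 3
onsets = [0, 6]                  # two trials of a single condition
n_sample = 12

# One column per within-trial lag, plus a dummy baseline column.
dm = np.zeros((n_sample, window + 1))
for onset in onsets:
    for lag in range(window):
        dm[onset + lag, lag] = 1.0
dm[:, -1] = 1.0

# A fake evoked response to recover.
evoked = np.array([1.0, 2.0, 1.5])
x = dm[:, :-1].dot(evoked)

beta = np.linalg.pinv(dm.T.dot(dm)).dot(dm.T).dot(x)
print(beta[:-1])                 # ~[1.0, 2.0, 1.5]; beta[-1] is the dummy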
Example #10
def make_bold(cond,
              index,
              wheelerdata,
              cond_to_rt,
              filtfile=None,
              TR=2,
              trname="TR",
              n_features=10,
              n_univariate=None,
              n_accumulator=None,
              n_decision=None,
              n_noise=None,
              drift_noise=False,
              step_noise=False,
              z_noise=False,
              drift_noise_param=None,
              step_noise_param=None,
              z_noise_param=None,
              noise_f=white,
              hrf_f=None,
              hrf_params=None,
              prng=None):
    """Make BOLD timecourse features based on Wheelerdata

    Parameters
    ----------
    cond : str
        A condition name found in the wheelerdata object's metadata
    index : str
        A name of a trial index found in the wheelerdata object's metadata
    wheelerdata : object, instance of Wheelerdata
        A Wheelerdata object
    cond_to_rt : dict
        A map of cond (key) to reaction time (item, (int, float))
    filtfile : str, None
        A name of a json file designed for reprocessing Wheelerdata metadata
    TR : float, int
        The repetition time of the experiment
    trname : str
        The name of the index of TRs in the metadata
    n_features : int
        The number of features in total (the other n_* arguments
        must sum to this value)
    n_univariate : int
        The number of univariate (boxcar) features
    n_accumulator : int
        The number of accumulator features
    n_decision : int
        The number of decision features
    n_noise : int
        The number of noise features
    drift_noise : boolean, optional
        Add noise to the drift rate of the accumulator features
    step_noise : boolean, optional
        Add noise to each step of the accumulator features
    z_noise : boolean, optional
        Add noise to the start value of accumulator features
    drift_noise_param : None or dict, optional
        Parameters for drift_noise, which is drawn from a
        Gaussian distribution. None defaults to:
        `{"loc": 0, "scale" : 0.5}`
    step_noise_param : None or dict, optional
        Parameters for step_noise, which is drawn from a
        Gaussian distribution. None defaults to:
        `{"loc" : 0, "scale" : 0.2, "size" : 1}`
    z_noise_param : None or dict, optional
        Parameters for z_noise, which is drawn from the uniform
        distribution. None defaults to:
        `{"low" : 0.01, "high" : 0.5, "size" : 1}`
    noise_f : function, optional
        Produces noise; must have a signature like `noise, prng = f(N, prng)`
    hrf_f : function, optional
        Returns a haemodynamic response, signature hrf_f(**hrf_params)
    hrf_params : dict
        Keyword parameters for hrf_f
    prng : None or RandomState object
        Allows for independent random draws, used for all
        random sampling

    Returns
    -------
    Xs : list of 2D arrays
        One simulated BOLD feature matrix per metadata file
    ys : list of 1D arrays
        Integer-coded condition labels for each entry in Xs
    yindices : list of 1D arrays
        Trial indices for each entry in Xs
    """

    # ----
    # Feature composition
    if n_noise is None:
        n_noise = 0
    if n_accumulator is None:
        n_accumulator = 0
    if n_decision is None:
        n_decision = 0
    if n_univariate is None:
        n_univariate = (n_features - n_noise - n_accumulator - n_decision)

    if (n_features - n_univariate - n_accumulator - n_noise - n_decision) != 0:
        raise ValueError("The feature counts don't add up.")
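    # For example, with n_features=10, n_accumulator=2, n_decision=1, and
    # n_noise=2, n_univariate defaults to 10 - 2 - 1 - 2 = 5 and the check
    # above passes.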

    # Load wheelerdata
    metas = wheelerdata.get_RT_metadata_paths()

    # Get to work simulating
    Xs, ys, yindices = [], [], []
    for meta in metas:
        # Get data, preprocess too,
        data = csv_to_targets(meta)
        data = tr_pad_targets(data, trname, data[trname].shape[0], pad=np.nan)

        if filtfile is not None:
            data = reprocess_targets(filtfile, data, np.nan,
                                     ("TR", "trialcount"))

        # Check cond_to_rt
        for c in unique_nan(data[cond]):
            try:
                cond_to_rt[c]
            except KeyError:
                raise KeyError("{0} not present in cond_to_rt".format(c))

        # use cond to create y
        y = create_y(data[cond])
        yindex = data[index]

        # make accumulator and decision traces
        if n_accumulator > 0:
            data["accumulator"] = _make_accumulator_array(y,
                                                          yindex,
                                                          cond_to_rt,
                                                          drift_noise,
                                                          step_noise,
                                                          z_noise,
                                                          drift_noise_param,
                                                          step_noise_param,
                                                          z_noise_param,
                                                          prng=prng)
        if n_decision > 0:
            data["decision"] = _make_decision_array(y, yindex, cond_to_rt)

        # Populate Xmeta
        boldsim = Reproduce(y,
                            data,
                            noise_f=noise_f,
                            hrf_f=hrf_f,
                            hrf_params=hrf_params,
                            TR=TR,
                            prng=prng)
        boldsim.create_dm_from_y(convolve=False)

        n_sample_feature = boldsim.dm.shape[0]
        Xmeta = np.zeros((n_sample_feature, n_features))

        # 1. univariate features
        start = 0
        stop = n_univariate
        for j in range(start, stop):
            boldsim.create_bold(np.sum(boldsim.dm[:, 1:], axis=1),
                                convolve=True)
            Xmeta[:, j] = boldsim.bold

        # 2. accumulator features
        start = stop
        stop = start + n_accumulator
        for j in range(start, stop):
            boldsim.create_bold(data["accumulator"], convolve=True)
            Xmeta[:, j] = boldsim.bold

        # 3. decision features
        start = stop
        stop = start + n_decision
        for j in range(start, stop):
            boldsim.create_bold(data["decision"], convolve=True)
            Xmeta[:, j] = boldsim.bold

        # 4. noise features:
        start = stop
        stop = start + n_noise
        for j in range(start, stop):
            # Drop baseline from noise
            randbold = rand(boldsim.dm.shape[0])
            randbold[boldsim.y == 0] = 0.0
            boldsim.create_bold(randbold, convolve=True)
            Xmeta[:, j] = boldsim.bold

        Xs.append(Xmeta)
        ys.append(y)
        yindices.append(yindex)

    return Xs, ys, yindices