Пример #1
0
    def _fp(self, X):
        """Cluster the features of X, then average within each cluster.

        Parameters
        ----------
        X : 2D array-like (n_sample, n_feature)
            The data to decompose

        Return
        ------
        Xc - 2D array-like (n_sample, n_clusters)
            Column i is the mean of the columns of X assigned to the
            i-th unique cluster label.
        """

        n_samples = X.shape[0]

        # The estimator labels features, so it is given X transposed.
        labels = self.estimator.fit_predict(X.transpose())
        unique_labels = sort_nanfirst(unique_nan(labels))

        # One (initially zero) column per cluster, filled with the mean
        # of that cluster's member features.
        Xc = np.zeros((n_samples, len(unique_labels)))
        for col, label in enumerate(unique_labels):
            members = label == labels
            Xc[:, col] = X[:, members].mean(1)

        assert checkX(Xc)
        assert Xc.shape[0] == X.shape[0], "After transform wrong row number"
        assert Xc.shape[1] == len(unique_labels), (
            "Afer transform wrong col number")

        return Xc
Пример #2
0
    def _ft(self, X):
        """The decompose workhorse

        Fits ``self.estimator`` to X with ``fit_transform`` and returns
        the transformed data after a few sanity checks.

        Parameters
        ----------
        X : 2D array-like (n_sample, n_feature)
            The data to decompose

        Return
        ------
        Xc - 2D array-like (n_sample, n_components)
        """
        import numbers

        Xc = self.estimator.fit_transform(X)

        assert checkX(Xc)
        assert Xc.shape[0] == X.shape[0], ("After transform wrong row number")

        # The n_components attr is optional, and when present it may be
        # None or a non-count value (e.g. a fraction or a string).  Only an
        # integer count can bound the number of output columns; comparing
        # against None would raise TypeError on Python 3.
        n_components = getattr(self.estimator, "n_components", None)
        if isinstance(n_components, numbers.Integral):
            assert Xc.shape[1] <= n_components, "Too many components"

        return Xc
Пример #3
0
def eva(X, y, trial_index, window, tr):
    """Average trials for each feature in X

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array
        Sample labels for the data.  In y, np.nan and 'nan' values
        are ignored.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.
    window : int
        Trial length
    tr : unused
        Not read by this function; kept so the signature is unchanged.

    Return
    ------
    Xeva : a 2D array (window, n_feature*unique_y)
        The average trials
    eva_names : 1D array
        The names of the features (taken from y)
    """

    evas = []
    eva_names = []
    scaler = MinMaxScaler(feature_range=(0, 1))
    for j in range(X.shape[1]):
        xj = X[:, j][:, np.newaxis]  ## Need 2D

        # Each feature into trials, rescale too.  The builtin float is
        # used because the np.float alias was removed from NumPy.
        Xtrial, feature_names = by_trial(xj, trial_index, window, y)
        Xtrial = scaler.fit_transform(Xtrial.astype(float))
        unique_fn = sorted(np.unique(feature_names))
        unique_fn = unique_sorted_with_nan(unique_fn)

        # and again by unique_y/feature_names
        Xlabels, _ = by_labels(X=Xtrial.transpose(), y=feature_names)

        # Average the trials of each label, then name them.
        evas.extend(Xl.transpose().mean(axis=1) for Xl in Xlabels)
        eva_names.extend(unique_fn)

    # Reshape : (window, len(unique_y)*n_features)
    Xeva = np.vstack(evas).transpose()
    eva_names = np.asarray(eva_names)

    assert checkX(Xeva)
    assert Xeva.shape[0] == window, "After EVA rows not equal to window"
    assert Xeva.shape[1] == len(unique_fn) * X.shape[1], (
        "After EVA wrong number of features")
    assert eva_names.shape[0] == Xeva.shape[1], (
        "eva_names and Xeva don't match")

    return Xeva, eva_names
Пример #4
0
def load_nii(nifiti, clean=True, sparse=False, smooth=False, **kwargs):
    """Convert the nifti-1 file into a 2D array (n_sample x n_features).

    Parameters
    ----------
    nifiti - str
        The name of the data to load
    clean - boolean (True)
        Remove invariant features?  If used n_features will
        not match n_voxels in the original nifti-1 file.  This operation
        is not reversible.  If you clean there is probably little
        point in converting to a sparse representation.
    sparse - boolean (False)
        Use the (CSC) sparse format (True)?
    smooth - boolean (False)
        High/low pass filter the data?
    [, ...] - Optional parameters for smooth
        (defaults: tr=1.5, ub=0.06, lb=0.001).  Other keywords
        are ignored.

    Return
    ------
    X - 2D array (n_sample x n_features)
        The BOLD data
    """

    # Data is 4d (x,y,z,t) we want 2d, where each column is
    # a voxel and each row is one time point (t),
    # i.e. the final shape should be (t, x*y*z)
    nii = nb.nifti1.load(nifiti)

    numt = nii.shape[3]
    numxyz = nii.shape[0] * nii.shape[1] * nii.shape[2]
    dims = (numxyz, numt)

    # Flatten to (n_feature, n_sample), then transpose
    # to (n_sample, n_feature)
    X = nii.get_data().astype('int16').reshape(dims).transpose()
    if clean:
        X = remove_invariant_features(X, sparse=False)

    if smooth:
        # Setup smooth params.  "lb" must land in lb: it used to be
        # written into ub, clobbering the upper bound instead.
        tr = kwargs.get("tr", 1.5)
        ub = kwargs.get("ub", 0.06)
        lb = kwargs.get("lb", 0.001)

        X = smoothfn(X, tr=tr, ub=ub, lb=lb)

    assert checkX(X)

    if sparse:
        X = csc_matrix(X)

    return X
Пример #5
0
def load_nii(nifiti, clean=True, sparse=False, smooth=False, **kwargs):
    """Convert the nifti-1 file into a 2D array (n_sample x n_features).

    Parameters
    ----------
    nifiti - str
        The name of the data to load
    clean - boolean (True)
        Remove invariant features?  If used n_features will
        not match n_voxels in the original nifti-1 file.  This operation
        is not reversible.  If you clean there is probably little
        point in converting to a sparse representation.
    sparse - boolean (False)
        Use the (CSC) sparse format (True)?
    smooth - boolean (False)
        High/low pass filter the data?
    [, ...] - Optional parameters for smooth
        (defaults: tr=1.5, ub=0.06, lb=0.001).  Other keywords
        are ignored.

    Return
    ------
    X - 2D array (n_sample x n_features)
        The BOLD data
    """

    # Data is 4d (x,y,z,t) we want 2d, where each column is
    # a voxel and each row is one time point (t),
    # i.e. the final shape should be (t, x*y*z)
    nii = nb.nifti1.load(nifiti)

    numt = nii.shape[3]
    numxyz = nii.shape[0] * nii.shape[1] * nii.shape[2]
    dims = (numxyz, numt)

    # Flatten to (n_feature, n_sample), then transpose
    # to (n_sample, n_feature)
    X = nii.get_data().astype('int16').reshape(dims).transpose()
    if clean:
        X = remove_invariant_features(X, sparse=False)

    if smooth:
        # Setup smooth params.  "lb" must land in lb: it used to be
        # written into ub, clobbering the upper bound instead.
        tr = kwargs.get("tr", 1.5)
        ub = kwargs.get("ub", 0.06)
        lb = kwargs.get("lb", 0.001)

        X = smoothfn(X, tr=tr, ub=ub, lb=lb)

    assert checkX(X)

    if sparse:
        X = csc_matrix(X)

    return X
Пример #6
0
def correlateX(X, y, corr="spearman"):
    """Compute the correlation between y (some set of dummy coded
    labels) and every column of X.

    Parameters
    ----------
    X - a 2d column oriented array of features
    y - a 1d array of labels
    corr - name of the correlation function:
        'pearson' or 'spearman'

    Returns
    -------
    corrs - a 1d array of correlations, one per column of X
    ps - a 1d array of p-values, one per column of X

    Raises
    ------
    ValueError - if corr is neither 'pearson' nor 'spearman'

    Note
    ----
    Correlations are calculated using either Pearson's r (which
    assumes Gaussian errors) or Spearman's rho (a rank-based
    non-parametric method).
    """

    # Coerce both inputs to arrays, just in case
    X = np.array(X)
    y = np.array(y)

    # The return value of checkX is not inspected here.
    checkX(X)

    if corr not in ("pearson", "spearman"):
        raise ValueError("stat was not valid.")
    corrf = pearsonr if corr == "pearson" else spearmanr

    # One (r, p) pair per column of X
    results = [corrf(X[:, col], y) for col in range(X.shape[1])]
    corrs = [r for r, _ in results]
    ps = [p for _, p in results]

    return np.array(corrs), np.array(ps)
Пример #7
0
def correlateX(X, y, corr="spearman"):
    """Compute the correlation between y (some set of dummy coded
    labels) and every column of X.

    Parameters
    ----------
    X - a 2d column oriented array of features
    y - a 1d array of labels
    corr - name of the correlation function:
        'pearson' or 'spearman'

    Returns
    -------
    corrs - a 1d array of correlations, one per column of X
    ps - a 1d array of p-values, one per column of X

    Raises
    ------
    ValueError - if corr is neither 'pearson' nor 'spearman'

    Note
    ----
    Correlations are calculated using either Pearson's r (which
    assumes Gaussian errors) or Spearman's rho (a rank-based
    non-parametric method).
    """

    # Coerce both inputs to arrays, just in case
    X = np.array(X)
    y = np.array(y)

    # The return value of checkX is not inspected here.
    checkX(X)

    if corr not in ("pearson", "spearman"):
        raise ValueError("stat was not valid.")
    corrf = pearsonr if corr == "pearson" else spearmanr

    # One (r, p) pair per column of X
    results = [corrf(X[:, col], y) for col in range(X.shape[1])]
    corrs = [r for r, _ in results]
    ps = [p for _, p in results]

    return np.array(corrs), np.array(ps)
Пример #8
0
def by_trial(X, trial_index, window, y):
    """Reshapes X so each trial is a feature.

    Parameters
    ----------
    X : 2D array (n_sample, n_feature)
        The data to reshape
    trial_index : 1D array (n_sample, )
        Each unique (non-nan) entry marks the rows of one trial.
    window : int
        Number of leading rows kept from every trial.
    y : 1D array (n_sample, ) or None
        Sample labels; the label of a trial is the label of its first
        row.  If None every trial is labelled 0.

    Return
    ------
    Xtrial : 2D array (window, n_kept_trial*n_feature)
        The kept trials stacked side by side.
    feature_names : 1D array
        The trial label for every column of Xtrial.

    Note
    ----
    In y, np.nan and 'nan' values are ignored.
    """

    ncol = X.shape[1]

    # ----
    # Remove short trials from X.  The mask is built without popping
    # from the list returned by locate_short_trials, so it is not mutated.
    locations = locate_short_trials(trial_index, window)
    if len(locations) > 0:
        short_mask = np.zeros(trial_index.shape, dtype=bool)
        for loc in locations:
            short_mask = short_mask | (loc == trial_index)
        keep_mask = np.logical_not(short_mask)

        X = X[keep_mask, ]
        trial_index = trial_index[keep_mask]

    # ----
    # Find all the trials
    trial_masks = []
    for trial in np.unique(trial_index):
        if np.isnan(trial):
            continue
        trial_masks.append(trial == trial_index)

    # And split up X
    Xlist = []
    feature_names = []
    for mask in trial_masks:
        y0 = 0
        if y is not None:
            y0 = y[mask][0]

        # Builtin str is used because the np.str alias was removed from
        # NumPy; it catches both float nan and the string 'nan'.
        if str(y0) != 'nan':
            Xlist.append(X[mask, ][0:window, ])
            feature_names.append(np.repeat(y0, ncol))

    feature_names = np.hstack(feature_names)

    # Create Xtrial by horizontal stacking
    Xtrial = np.hstack(Xlist)

    # Sanity
    assert checkX(Xtrial)
    assert Xtrial.shape[1] == feature_names.shape[0], (
        "After reshape Xtrial and feature_names don't match")
    assert Xtrial.shape[0] == window, (
        "Number of samples in Xtrial doesn't match window")

    return Xtrial, feature_names
Пример #9
0
def by_trial(X, trial_index, window, y):
    """Reshapes X so each trial is a feature.

    Parameters
    ----------
    X : 2D array (n_sample, n_feature)
        The data to reshape
    trial_index : 1D array (n_sample, )
        Each unique (non-nan) entry marks the rows of one trial.
    window : int
        Number of leading rows kept from every trial.
    y : 1D array (n_sample, ) or None
        Sample labels; the label of a trial is the label of its first
        row.  If None every trial is labelled 0.

    Return
    ------
    Xtrial : 2D array (window, n_kept_trial*n_feature)
        The kept trials stacked side by side.
    feature_names : 1D array
        The trial label for every column of Xtrial.

    Note
    ----
    In y, np.nan and 'nan' values are ignored.
    """

    ncol = X.shape[1]

    # ----
    # Remove short trials from X.  The mask is built without popping
    # from the list returned by locate_short_trials, so it is not mutated.
    locations = locate_short_trials(trial_index, window)
    if len(locations) > 0:
        short_mask = np.zeros(trial_index.shape, dtype=bool)
        for loc in locations:
            short_mask = short_mask | (loc == trial_index)
        keep_mask = np.logical_not(short_mask)

        X = X[keep_mask, ]
        trial_index = trial_index[keep_mask]

    # ----
    # Find all the trials
    trial_masks = []
    for trial in np.unique(trial_index):
        if np.isnan(trial):
            continue
        trial_masks.append(trial == trial_index)

    # And split up X
    Xlist = []
    feature_names = []
    for mask in trial_masks:
        y0 = 0
        if y is not None:
            y0 = y[mask][0]

        # Builtin str is used because the np.str alias was removed from
        # NumPy; it catches both float nan and the string 'nan'.
        if str(y0) != 'nan':
            Xlist.append(X[mask, ][0:window, ])
            feature_names.append(np.repeat(y0, ncol))

    feature_names = np.hstack(feature_names)

    # Create Xtrial by horizontal stacking
    Xtrial = np.hstack(Xlist)

    # Sanity
    assert checkX(Xtrial)
    assert Xtrial.shape[1] == feature_names.shape[0], (
        "After reshape Xtrial and feature_names don't match")
    assert Xtrial.shape[0] == window, (
        "Number of samples in Xtrial doesn't match window")

    return Xtrial, feature_names
Пример #10
0
def restack(X, feature_names):
    """Reshape X into a stack of matrices based on feature names,
    one new 'layer' for each unique (sorted) entry in feature_names.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to restack
    feature_names : 1D array-like (n_feature, )
        One name per column of X.

    Return
    ------
    Xstack : 2D array
        The column groups of X, one per unique name, stacked
        vertically in np.unique(feature_names) order.
    fn_stack : 1D array
        Each unique name repeated n_sample times, one entry per row
        of Xstack.

    Raises
    ------
    ValueError
        If the number of columns in X differs from the number of
        feature_names.

    Note
    ----
    In order to ensure the layers are stackable, whichever of the
    stack or the new layer has fewer columns is widened with _addcol
    (presumably zero padding -- confirm against _addcol).
    """

    X = np.array(X)
    feature_names = np.array(feature_names)
    unique_names = np.unique(feature_names)

    nrow = X.shape[0]
    if X.shape[1] != feature_names.shape[0]:
        raise ValueError(
            "Number of features in X doesn't match feature_names.")

    # Init the reshaped X (Xstack) and the feature
    # mask, then loop over the rest
    mask = unique_names[0] == feature_names
    assert mask.shape[0] == feature_names.shape[0], (
        "The mask was the wrong shape")
    # NOTE(review): `> 1` also rejects a name that matches exactly one
    # column although the message says "empty" -- confirm this is intended.
    assert np.sum(mask) > 1, ("The mask was empty")

    Xstack = X[:, mask]
    for name in unique_names[1:]:
        mask = name == feature_names
        assert np.sum(mask) > 1, ("The mask was empty")

        Xname = X[:, mask]
        diff = Xstack.shape[1] - Xname.shape[1]
        if diff < 0:
            Xstack = _addcol(Xstack, np.abs(diff))
        elif diff > 0:
            # The widened layer must replace Xname; it used to be bound
            # to an unused name, leaving the widths mismatched for vstack.
            Xname = _addcol(Xname, np.abs(diff))
        Xstack = np.vstack([Xstack, Xname])

    fn_stack = []
    for name in unique_names:
        fn_stack.extend([name, ] * nrow)
    fn_stack = np.array(fn_stack)

    assert checkX(Xstack)
    assert fn_stack.shape[0] == Xstack.shape[0], (
        "After stacking X and feature_names did not match.")

    return Xstack, fn_stack
Пример #11
0
def decompose_tcdf(tcdf):
    """Decompose tcdf into its parts - X, cond, dataname, index.

    Parameters
    ----------
    tcdf : table (presumably a pandas DataFrame) with "index", "cond"
        and "dataname" columns; every other column is treated as data.

    Return
    ------
    X : 2D array
        All columns of tcdf except the three named above.
    cond, dataname, index : 1D arrays
        The contents of the matching columns.
    """

    index = np.array(tcdf["index"].tolist())
    cond = np.array(tcdf["cond"].tolist())
    dataname = np.array(tcdf["dataname"].tolist())

    tcdf = tcdf.drop(labels=["index", "cond", "dataname"], axis=1)
    # .values replaces DataFrame.as_matrix(), which newer pandas
    # releases no longer provide.
    X = np.array(tcdf.values)

    assert checkX(X)

    return X, cond, dataname, index
Пример #12
0
def decompose_tcdf(tcdf):
    """Decompose tcdf into its parts - X, cond, dataname, index.

    Parameters
    ----------
    tcdf : table (presumably a pandas DataFrame) with "index", "cond"
        and "dataname" columns; every other column is treated as data.

    Return
    ------
    X : 2D array
        All columns of tcdf except the three named above.
    cond, dataname, index : 1D arrays
        The contents of the matching columns.
    """

    index = np.array(tcdf["index"].tolist())
    cond = np.array(tcdf["cond"].tolist())
    dataname = np.array(tcdf["dataname"].tolist())

    tcdf = tcdf.drop(labels=["index", "cond", "dataname"], axis=1)
    # .values replaces DataFrame.as_matrix(), which newer pandas
    # releases no longer provide.
    X = np.array(tcdf.values)

    assert checkX(X)

    return X, cond, dataname, index
Пример #13
0
def restack(X, feature_names):
    """Reshape X into a stack of matrices based on feature names,
    one new 'layer' for each unique (sorted) entry in feature_names.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to restack
    feature_names : 1D array-like (n_feature, )
        One name per column of X.

    Return
    ------
    Xstack : 2D array
        The column groups of X, one per unique name, stacked
        vertically in np.unique(feature_names) order.
    fn_stack : 1D array
        Each unique name repeated n_sample times, one entry per row
        of Xstack.

    Raises
    ------
    ValueError
        If the number of columns in X differs from the number of
        feature_names.

    Note
    ----
    In order to ensure the layers are stackable, whichever of the
    stack or the new layer has fewer columns is widened with _addcol
    (presumably zero padding -- confirm against _addcol).
    """

    X = np.array(X)
    feature_names = np.array(feature_names)
    unique_names = np.unique(feature_names)

    nrow = X.shape[0]
    if X.shape[1] != feature_names.shape[0]:
        raise ValueError(
            "Number of features in X doesn't match feature_names.")

    # Init the reshaped X (Xstack) and the feature
    # mask, then loop over the rest
    mask = unique_names[0] == feature_names
    assert mask.shape[0] == feature_names.shape[0], (
        "The mask was the wrong shape")
    # NOTE(review): `> 1` also rejects a name that matches exactly one
    # column although the message says "empty" -- confirm this is intended.
    assert np.sum(mask) > 1, ("The mask was empty")

    Xstack = X[:, mask]
    for name in unique_names[1:]:
        mask = name == feature_names
        assert np.sum(mask) > 1, ("The mask was empty")

        Xname = X[:, mask]
        diff = Xstack.shape[1] - Xname.shape[1]
        if diff < 0:
            Xstack = _addcol(Xstack, np.abs(diff))
        elif diff > 0:
            # The widened layer must replace Xname; it used to be bound
            # to an unused name, leaving the widths mismatched for vstack.
            Xname = _addcol(Xname, np.abs(diff))
        Xstack = np.vstack([Xstack, Xname])

    fn_stack = []
    for name in unique_names:
        fn_stack.extend([name, ] * nrow)
    fn_stack = np.array(fn_stack)

    assert checkX(Xstack)
    assert fn_stack.shape[0] == Xstack.shape[0], (
        "After stacking X and feature_names did not match.")

    return Xstack, fn_stack
Пример #14
0
def eva(X, y, trial_index, window, tr):
    """Average trials for each feature in X

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array
        Sample labels for the data.  In y, np.nan and 'nan' values
        are ignored.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.
    window : int
        Trial length
    tr : unused
        Not read by this function; kept so the signature is unchanged.

    Return
    ------
    Xeva : a 2D array (window, n_feature*unique_y)
        The average trials
    eva_names : 1D array
        The names of the features (taken from y)
    """

    evas = []
    eva_names = []
    scaler = MinMaxScaler(feature_range=(0, 1))
    for j in range(X.shape[1]):
        xj = X[:, j][:, np.newaxis]  ## Need 2D

        # Each feature into trials, rescale too.  The builtin float is
        # used because the np.float alias was removed from NumPy.
        Xtrial, feature_names = by_trial(xj, trial_index, window, y)
        Xtrial = scaler.fit_transform(Xtrial.astype(float))
        unique_fn = sorted(np.unique(feature_names))
        unique_fn = unique_sorted_with_nan(unique_fn)

        # and again by unique_y/feature_names
        Xlabels, _ = by_labels(X=Xtrial.transpose(), y=feature_names)

        # Average the trials of each label, then name them.
        evas.extend(Xl.transpose().mean(axis=1) for Xl in Xlabels)
        eva_names.extend(unique_fn)

    # Reshape : (window, len(unique_y)*n_features)
    Xeva = np.vstack(evas).transpose()
    eva_names = np.asarray(eva_names)

    assert checkX(Xeva)
    assert Xeva.shape[0] == window, "After EVA rows not equal to window"
    assert Xeva.shape[1] == len(unique_fn) * X.shape[1], (
        "After EVA wrong number of features")
    assert eva_names.shape[0] == Xeva.shape[1], (
        "eva_names and Xeva don't match")

    return Xeva, eva_names
Пример #15
0
def fir(X, y, trial_index, window, tr):
    """ Average trials for each feature in X, using Burock's
    (2000) method.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array
        Sample labels for the data. In y, np.nan and 'nan' values
        are treated as baseline labels.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.  Not read by this
        function; kept so the signature is unchanged.
    window : int
        Trial length
    tr : unused
        Not read by this function; kept so the signature is unchanged.

    Return
    ------
    Xfir : a 2D array (window, n_feature*(unique_y - 1))
        The average trials; the first (nan/baseline) label is dropped.
    fir_names : 1D array
        The label name for every column of Xfir.
    """

    # Norm then pad.  The builtin float is used because the np.float
    # alias was removed from NumPy.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X.astype(float))
    X = np.vstack([X, np.ones((window, X.shape[1]), dtype=float)])

    # Save the org y names
    ynames = sorted(np.unique(y))
    ynames = unique_sorted_with_nan(ynames)

    # y becomes integers
    y = create_y(y)

    # Make the design matrix.
    dm = np.matrix(_create_dm(y, window))

    # The least-squares projector depends only on the design matrix,
    # so it is computed once rather than once per feature.
    projector = np.linalg.pinv(dm.T * dm) * dm.T

    # FIR!
    fir_names = []
    firs = []
    for j in range(X.shape[1]):
        x = np.matrix(X[:, j])
        coefs = np.array(projector * x.T)[0:-1]  ## Drop dummy
        coefs = coefs.reshape(len(ynames) - 1, window)

        firs.append(coefs)
        fir_names.extend(ynames[1:])  ## Drop nan/baseline

    Xfir = np.vstack(firs).transpose()
    fir_names = np.asarray(fir_names)

    assert checkX(Xfir)
    assert Xfir.shape[0] == window, "After FIR rows not equal to window"
    assert Xfir.shape[1] == (len(ynames[1:]) * X.shape[1]), (
        "After FIR wrong number of features")
    assert fir_names.shape[0] == Xfir.shape[1], (
        "fir_names and Xfir don't match")

    return Xfir, fir_names
Пример #16
0
        range(*[int(i) for i in args.train_data.split(':')]),
        range(*[int(i) for i in args.train_window.split(':')]),
        args.train_labels, 
        args.train_trial_tr
        )
# Load the test set.  The data and window arguments are ':'-separated
# integer strings that are expanded into range() objects.
Xtest, ytest = _data(
        args.test,
        range(*[int(i) for i in args.test_data.split(':')]),
        range(*[int(i) for i in args.test_window.split(':')]),
        args.test_labels, 
        args.test_trial_tr
        )
# Stack train on top of test so both live in one dataset; cvcode
# marks every train row with 0 and every test row with 1.
X = np.vstack([Xtrain, Xtest])
y = np.concatenate([ytrain, ytest])
cvcode = np.asarray([0]*ytrain.shape[0] + [1]*ytest.shape[0])
assert checkX(X)
assert X.shape[0] == y.shape[0], "X and y length mismatch"
assert X.shape[0] == cvcode.shape[0], "X and cvcode length mismatch"

# CV
# NOTE(review): stratifying two folds on cvcode presumably reproduces
# the train/test split -- confirm against the StratifiedKFold version
# in use.
cv = StratifiedKFold(cvcode, n_folds=2, indices=True)

# Classifier
if args.clf == "RandomForestClassifier":
    clf = RandomForestClassifier(
            n_estimators=500, max_features=None
            )
elif args.clf == "GradientBoostingClassifier":   
    clf = GradientBoostingClassifier(
            n_estimators=100, learning_rate=1.0, 
            max_depth=1, random_state=prng
Пример #17
0
def fir(X, y, trial_index, window, tr):
    """ Average trials for each feature in X, using Burock's
    (2000) method.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array
        Sample labels for the data. In y, np.nan and 'nan' values
        are treated as baseline labels.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.  Not read by this
        function; kept so the signature is unchanged.
    window : int
        Trial length
    tr : unused
        Not read by this function; kept so the signature is unchanged.

    Return
    ------
    Xfir : a 2D array (window, n_feature*(unique_y - 1))
        The average trials; the first (nan/baseline) label is dropped.
    fir_names : 1D array
        The label name for every column of Xfir.
    """

    # Norm then pad.  The builtin float is used because the np.float
    # alias was removed from NumPy.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X.astype(float))
    X = np.vstack([X, np.ones((window, X.shape[1]), dtype=float)])

    # Save the org y names
    ynames = sorted(np.unique(y))
    ynames = unique_sorted_with_nan(ynames)

    # y becomes integers
    y = create_y(y)

    # Make the design matrix.
    dm = np.matrix(_create_dm(y, window))

    # The least-squares projector depends only on the design matrix,
    # so it is computed once rather than once per feature.
    projector = np.linalg.pinv(dm.T * dm) * dm.T

    # FIR!
    fir_names = []
    firs = []
    for j in range(X.shape[1]):
        x = np.matrix(X[:, j])
        coefs = np.array(projector * x.T)[0:-1]  ## Drop dummy
        coefs = coefs.reshape(len(ynames) - 1, window)

        firs.append(coefs)
        fir_names.extend(ynames[1:])  ## Drop nan/baseline

    Xfir = np.vstack(firs).transpose()
    fir_names = np.asarray(fir_names)

    assert checkX(Xfir)
    assert Xfir.shape[0] == window, "After FIR rows not equal to window"
    assert Xfir.shape[1] == (len(ynames[1:]) * X.shape[1]), (
        "After FIR wrong number of features")
    assert fir_names.shape[0] == Xfir.shape[1], (
        "fir_names and Xfir don't match")

    return Xfir, fir_names