def test_sanity_checks():
    """Ensure that sanity checks are performed, and as expected."""

    ### -------------- as you add them to dataset --------------
    with raises(EmptyFeatureSetException):
        ds.add_samplet('empty_features', [], 'target')

    ### -------------- as you save them to disk --------------

    ds.add_samplet('all_zeros', np.zeros((ds.num_features, 1)), 'target')
    with raises(ConstantValuesException):
        ds.save(out_file)

    ds.del_samplet('all_zeros')

    # check that a random constant value is caught
    const_value = np.random.randint(10, 100)
    const_feat_set = np.full((ds.num_features, 1), const_value)
    ds.add_samplet('all_constant', const_feat_set, 'target')
    with raises(ConstantValuesException):
        ds.save(out_file)

    # now checking for constants across samplets,
    #   easily simulated by adding different samplets with the same features.
    #   Such a bug can occur when the user queries the wrong files
    #   for a given samplet ID.
    const_ds = ClfDataset()
    rand_feat_same_across_samplets = np.random.randn(10)
    for index in range(np.random.randint(10, 100)):
        const_ds.add_samplet(str(index), rand_feat_same_across_samplets, index)

    with raises(ConstantValuesException):
        const_ds.save(out_file)
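
# ---- minimal setup sketch for the fixtures the test above assumes ----
# (import paths and names here are assumptions, not part of the original)
import numpy as np
from pytest import raises
from pyradigm import ClassificationDataset as ClfDataset
from pyradigm.base import ConstantValuesException, EmptyFeatureSetException

out_file = 'sanity_test_ds.pkl'  # hypothetical output path
ds = ClfDataset()
ds.add_samplet('first', np.random.randn(10), 'target')  # sets ds.num_features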
Example #2
def make_random_Dataset(max_num_classes=20,
                        max_class_size=50,
                        max_dim=100,
                        stratified=True):
    "Generates a random Dataset for use in testing."

    smallest = 10
    max_class_size = max(smallest, max_class_size)
    largest = max(50, max_class_size)  # guaranteed to exceed smallest + 3

    if max_num_classes != 2:
        # without a size argument, randint returns a plain int
        num_classes = np.random.randint(2, max_num_classes)
    else:
        num_classes = 2
    if not stratified:
        # np.random.random_integers is deprecated in numpy;
        # randint's upper bound is exclusive, hence largest + 1
        class_sizes = np.random.randint(smallest, largest + 1,
                                        size=num_classes)
    else:
        class_sizes = np.repeat(np.random.randint(smallest, largest),
                                num_classes)

    # upper bound padded to avoid randint(3, 3) when max_dim <= 3
    num_features = np.random.randint(min(3, max_dim), max(4, max_dim))
    feat_names = [str(x) for x in range(num_features)]

    class_ids = ['class-{}'.format(cl) for cl in range(num_classes)]
    labels = list(range(num_classes))

    ds = ClfDataset()
    for cc, class_ in enumerate(class_ids):
        subids = [
            'sub{:03}-class{:03}'.format(ix, cc)
            for ix in range(class_sizes[cc])
        ]
        for sid in subids:
            # feat_generator: random feature-vector maker, assumed to be
            #   defined elsewhere in this test module
            ds.add_samplet(samplet_id=sid,
                           features=feat_generator(num_features),
                           target=class_,
                           feature_names=feat_names)

    return ds
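
# usage sketch: draw a small stratified dataset and inspect it
# (num_samplets / num_features are pyradigm properties; names assumed)
rand_ds = make_random_Dataset(max_num_classes=5, max_class_size=30, max_dim=20)
print(rand_ds.num_samplets, rand_ds.num_features)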
Example #3
def make_fully_separable_classes(max_class_size=50, max_dim=100):
    """Generates two well-separated Gaussian blobs as a ClfDataset."""
    from sklearn.datasets import make_blobs

    random_center = np.random.rand(max_dim)
    cluster_std = 1.5
    # centers six cluster_stds apart guarantee the two classes do not overlap
    centers = [random_center, random_center + cluster_std * 6]
    # note: make_blobs splits n_samples evenly across the two centers
    blobs_X, blobs_y = make_blobs(n_samples=max_class_size, n_features=max_dim,
                                  centers=centers, cluster_std=cluster_std)

    unique_labels = np.unique(blobs_y)
    class_ids = {lbl: str(lbl) for lbl in unique_labels}

    new_ds = ClfDataset()
    for index, row in enumerate(blobs_X):
        new_ds.add_samplet(samplet_id='sub{}'.format(index),
                           features=row,
                           target=class_ids[blobs_y[index]])

    return new_ds
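
# usage sketch: with centers six cluster_stds apart, even a linear
# classifier should reach near-perfect accuracy on this dataset
sep_ds = make_fully_separable_classes(max_class_size=40, max_dim=10)
print(sep_ds.target_set)  # assumed pyradigm property: the unique targets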
Example #4
def load_arff_dataset(ds_path):
    """Convenience utility to quickly load ARFF files into pyradigm format"""

    try:
        ds = ClassificationDataset.from_arff(ds_path)
    except Exception:
        try:
            ds = RegressionDataset.from_arff(ds_path)
        except Exception:
            try:
                ds = MLDataset(arff_path=ds_path)
            except Exception:
                raise TypeError(
                    'Error in loading the ARFF dataset @ path below!'
                    ' Ignoring {}'.format(ds_path))

    return ds
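
# usage sketch (path is hypothetical): the fallback chain above tries
# ClassificationDataset first, then RegressionDataset, then the
# deprecated MLDataset
# ds = load_arff_dataset('/tmp/dataset.arff')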
Example #5
def load_dataset(ds_path):
    """Convenience utility to quickly load any type of pyradigm dataset"""

    try:
        ds = ClassificationDataset(dataset_path=ds_path)
    except Exception:
        try:
            ds = RegressionDataset(dataset_path=ds_path)
        except Exception:
            try:
                warn(
                    'MLDataset is deprecated. Switch to the latest pyradigm '
                    'data structures such as ClassificationDataset or '
                    'RegressionDataset as soon as possible.')
                ds = MLDataset(filepath=ds_path)
            except Exception:
                raise TypeError('Dataset class @ path below not recognized!'
                                ' Must be a valid instance of one of '
                                'ClassificationDataset or '
                                'RegressionDataset or MLDataset.\n'
                                ' Ignoring {}'.format(ds_path))

    return ds
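
# usage sketch (path is hypothetical); same fallback order as the ARFF
# loader: classification, then regression, then the deprecated MLDataset
# ds = load_dataset('/tmp/some_pyradigm_ds.pkl')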
Example #6
def get_features(samplet_id_list,
                 classes,
                 featdir,
                 outdir,
                 outname,
                 get_method=None,
                 feature_type='dir_of_dirs'):
    """
    Populates the pyradigm data structure with features from a given method.

    Parameters
    ----------
    samplet_id_list : list or ndarray
        List of subject IDs
    classes : dict
        dict of class labels keyed in by subject id
    featdir : str
        Path to input directory to read the features from
    outdir : str
        Path to output directory to save the gathered features to.
    outname : str
        Name of the feature set
    get_method : callable
        Callable that takes in a path and returns a vectorized feature set
        e.g. set of subcortical volumes, with an optional array of names for each
        feature.
    feature_type : str
        Identifier of data organization for features.

    Returns
    -------
    saved_path : str
        Path where the features have been saved to as a pyradigm dataset

    """

    if not callable(get_method):
        raise ValueError('Supplied get_method is not callable! '
                         'It must take in a path and '
                         'return a vectorized feature set and labels.')

    # assign a unique numeric label to each class
    # (note: Python sets are unordered, so these labels need not follow
    #  the classes' order of appearance in the metadata file)
    class_labels = {cls: idx
                    for idx, cls in enumerate(set(classes.values()))}

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = ClassificationDataset()
    for row_index, samplet_id in enumerate(samplet_id_list):
        try:
            if feature_type == 'data_matrix':
                # enumerate avoids list.index(), which fails for ndarray input
                data = data_matrix[row_index, :]
                feat_names = None
            else:
                data, feat_names = get_method(featdir, samplet_id)

            ds.add_samplet(samplet_id=samplet_id,
                           features=data,
                           target=classes[samplet_id],
                           feature_names=feat_names)
        except Exception:
            ids_excluded.append(samplet_id)
            traceback.print_exc()
            warnings.warn(
                "Features for {} via {} method could not be read or added."
                " Excluding it.".format(samplet_id, get_method.__name__))

    # warn if feature extraction failed for even one subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samplets,
                                    len(samplet_id_list))

    # save the dataset to disk so multiple datasets can be passed around by path
    saved_path = realpath(pjoin(outdir, outname))
    try:
        ds.save(saved_path)
    except IOError as ioe:
        print('Unable to save {} features to disk in folder:\n{}'
              ''.format(outname, outdir))
        raise ioe

    return saved_path
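
# sketch of a get_method callable returning (features, feature_names);
# all paths, file layout and names below are hypothetical
def read_subcortical_volumes(featdir, samplet_id):
    """Loads one subject's vectorized features from a per-subject CSV."""
    csv_path = pjoin(featdir, samplet_id, 'volumes.csv')
    data = np.loadtxt(csv_path, delimiter=',')
    return data, None  # None lets the dataset auto-name the features

# saved = get_features(id_list, class_dict, '/in/features', '/out/dir',
#                      'volumes', get_method=read_subcortical_volumes)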
Example #7
estimator = 'randomforestclassifier'  # alternative: 'svm'
dr_method = 'isomap'  # alternatives: 'selectkbest_f_classif', 'variancethreshold'
dr_size = 'tenth'
gs_level = 'none'  # alternative: 'light'

random.seed(42)  # fixed seed keeps local test runs fast and reproducible

covar_list = ('age', 'gender', 'dummy')
covar_types = ('age', 'gender', 'float')
covar_arg = ' '.join(['age', 'gender'])
deconf_method = 'residualize'
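
# sketch of what 'residualize' deconfounding amounts to conceptually
# (illustrative only; the actual pipeline implements its own version):
# regress the covariates out of each feature and keep the residuals
from sklearn.linear_model import LinearRegression

def residualize(X, confounds):
    """Removes the linear effect of confounds from each column of X."""
    lin = LinearRegression().fit(confounds, X)
    return X - lin.predict(confounds)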

out_path1 = os.path.join(out_dir, 'random_clf_ds1.pkl')
out_path2 = os.path.join(out_dir, 'random_clf_ds2.pkl')
if pexists(out_path1) and pexists(out_path2):
    ds_one = ClassificationDataset(dataset_path=out_path1)
    ds_two = ClassificationDataset(dataset_path=out_path2)
else:
    ds_one = make_random_ClfDataset(max_num_classes=max_num_classes,
                                    stratified=True,
                                    max_class_size=max_class_size,
                                    max_dim=max_dim,
                                    min_num_classes=min_num_classes,
                                    attr_names=covar_list,
                                    attr_types=covar_types)
    ds_one.save(out_path1)

    ds_two = dataset_with_new_features_same_everything_else(ds_one, max_dim)
    ds_two.save(out_path2)
