import os.path

from matplotlib import image
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.utils import Bunch

from sw_path import WORK_ROOT

TEST = True
LOGGING = True

print("start")

minst_training = Bunch()
minst_test = Bunch()

folder_path = WORK_ROOT + "RES/MNIST/10k"
if not os.path.isdir(folder_path):
    raise Exception("dir not found")

test_path = folder_path + "/test"
training_path = folder_path + "/training"

minst_training.data = []
minst_training.target = []
minst_training.target_names = []
minst_test.data = []
minst_test.target = []
def set_args_2():
    # options for model
    args = Bunch()
    args.mask_mode = 'cross-wise'  # in ['row_wise', 'col_wise', 'cross_wise', 'cross_and_hier_wise']
    args.additional_ban = 0
    # args.pooling = 'avg-token'
    args.pooling = 'avg-cell-seg'
    args.table_object = 'first-column'
    args.noise_num = 2
    args.seq_len = 100
    args.row_wise_fill = True
    args.pretrained_model_path = "./models/bert_model.bin-000"
    args.vocab_path = 'models/google_uncased_en_vocab.txt'
    args.vocab = Vocab()
    args.vocab.load(args.vocab_path)
    args.emb_size = 768
    args.embedding = 'tab'  # before: bert
    args.encoder = 'bertTab'
    args.subword_type = 'none'
    args.tokenizer = 'bert'
    args.feedforward_size = 3072
    args.hidden_size = 768
    args.heads_num = 12
    args.layers_num = 12
    args.learning_rate = 2e-5
    args.warmup = 0.1
    args.batch_size = 32
    args.dropout = 0.1
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.train_path = './data/aida/IO/train_samples'
    args.t2d_path = './data/aida/IO/test_samples_t2d'
    args.limaye_path = './data/aida/IO/test_samples_limaye'
    args.wiki_path = './data/aida/IO/test_samples_wikipedia'

    # other options
    args.report_steps = 100
    args.labels_map = get_labels_map_from_aida_file_2(args.train_path)
    args.labels_num = len(args.labels_map)
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
    return args
def fetch_octane(
    return_X_y: bool = False,
    as_frame: bool = False,
) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]:
    """Load near infrared spectra of gasoline samples.

    This function fetches the octane dataset from the R package 'mrfDepth'
    from CRAN.
    """
    descr = _octane_descr

    # octane file from mrfDepth R package
    raw_dataset = fetch_cran("octane", "mrfDepth", version="1.0.11")
    data = raw_dataset['octane'][..., 0].T

    # The R package only stores the values of the curves, but the paper
    # describes the rest of the data. According to [RDEH2006], Section 5.4:
    # "wavelengths ranging from 1102nm to 1552nm with measurements every
    # two nm."
    wavelength_start = 1102
    wavelength_end = 1552
    wavelength_count = 226

    grid_points = np.linspace(
        wavelength_start,
        wavelength_end,
        wavelength_count,
    )

    # "The octane data set contains six outliers (25, 26, 36–39) to which
    # alcohol was added".
    target = np.zeros(len(data), dtype=np.bool_)
    target[24:26] = 1  # noqa: WPS432
    target[35:39] = 1  # noqa: WPS432

    target_name = "is outlier"
    curve_name = "absorbances"

    curves = FDataGrid(
        data,
        grid_points=grid_points,
        dataset_name="octane",
        argument_names=("wavelength (nm)", ),
        coordinate_names=("absorbances", ),
    )

    frame = None
    if as_frame:
        frame = pd.DataFrame({
            curve_name: curves,
            target_name: target,
        })
        curves = frame.iloc[:, [0]]
        target = frame.iloc[:, 1]

    if return_X_y:
        return curves, target

    return Bunch(
        data=curves,
        target=target,
        frame=frame,
        categories={},
        feature_names=[curve_name],
        target_names=[target_name],
        DESCR=descr,
    )
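# A minimal usage sketch (added for illustration; it assumes the module
# context above where fetch_octane is defined): the returned Bunch exposes
# the curves and the outlier indicator both as attributes and as keys.
octane = fetch_octane()
print(octane.data.n_samples)          # number of spectra
print(octane["target"].sum())         # number of flagged outliers
X, y = fetch_octane(return_X_y=True)  # plain (curves, target) tuple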
def fetch_connectome(dataset, data_dir=None, url=None, resume=True,
                     verbose=1):
    """
    Downloads files from multi-species connectomes

    Parameters
    ----------
    dataset : str
        Specifies which dataset to download; must be one of the datasets
        listed in :func:`netneurotools.datasets.available_connectomes()`.
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    data : :class:`sklearn.utils.Bunch`
        Dictionary-like object with, at a minimum, keys ['conn', 'labels',
        'ref'] providing connectivity / correlation matrix, region labels,
        and relevant reference. Other possible keys include 'dist' (an array
        of Euclidean distances between regions of 'conn'), 'coords' (an
        array of xyz coordinates for regions of 'conn'), 'acronyms' (an
        array of acronyms for regions of 'conn'), and 'networks' (an array
        of network affiliations for regions of 'conn')

    References
    ----------
    See `ref` key of returned dictionary object for relevant dataset
    reference
    """
    if dataset not in available_connectomes():
        raise ValueError(
            'Provided dataset {} not available; must be one of {}'.format(
                dataset, available_connectomes()))

    dataset_name = 'ds-connectomes'

    data_dir = op.join(_get_data_dir(data_dir=data_dir), dataset_name)
    info = _get_dataset_info(dataset_name)[dataset]
    if url is None:
        url = info['url']
    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset)
    }

    filenames = [
        op.join(dataset, '{}.csv'.format(fn)) for fn in info['keys']
    ] + [op.join(dataset, 'ref.txt')]

    data = _fetch_files(data_dir, files=[(f, url, opts) for f in filenames],
                        resume=resume, verbose=verbose)

    # load data
    for n, arr in enumerate(data[:-1]):
        try:
            data[n] = np.loadtxt(arr, delimiter=',')
        except ValueError:
            data[n] = np.loadtxt(arr, delimiter=',', dtype=str)
    with open(data[-1]) as src:
        data[-1] = src.read().strip()

    return Bunch(**dict(zip(info['keys'] + ['ref'], data)))
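# A hedged usage sketch (the dataset name below is only an example and is
# assumed to appear in available_connectomes()): the Bunch keys mirror the
# CSV files shipped with the chosen connectome.
connectome = fetch_connectome('human_func_scale033')  # hypothetical name
print(connectome.conn.shape)     # connectivity matrix
print(connectome['labels'][:5])  # region labels
print(connectome.ref)            # reference to credit for the data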
def get_fmri_sessions(topdir: Union[str, PathLike, PosixPath],
                      events_dir: Union[str, PathLike, PosixPath] = None,
                      task: str = 'memory',
                      space: str = 'MNI152NLin2009cAsym',
                      output_type: str = 'preproc',
                      extension: str = '.nii.gz',
                      modality: str = 'bold',
                      sub_id: str = '*',
                      ses_id: str = '*',
                      **kwargs) -> list:
    """
    Return a sorted list of the desired BOLD fMRI nifti file paths.

    Args:
        topdir: str, PathLike, or PosixPath
            Database top-level directory path.

        events_dir: str, PathLike, or PosixPath (Default = None)
            Directory where the events and/or behavioural files are stored.
            If None is provided, it is assumed to be identical to ``topdir``.

        task: str (Default = 'memory')
            Name of the experimental task ('rest' is also valid).

        space: str (Default = 'MNI152NLin2009cAsym')
            Name of the template used during resampling.
            Most likely corresponds to a valid TemplateFlow name.

        output_type: str (Default = 'preproc')
            Name of the desired FMRIPrep output type.
            Most likely corresponds to a valid FMRIPrep output type name.

        extension: str (Default = '.nii.gz')
            Nifti file extension. The leading '.' is required.

        modality: str (Default = 'bold')
            Scanning modality used during the experiment.

        sub_id: str (Default = '*')
            Participant identifier. By default, returns all participants.
            The leading 'sub-' must be omitted.
            If the identifier is numeric, it should be quoted.

        ses_id: str (Default = '*')
            Session identifier. By default, returns all sessions.
            If the identifier is numeric, it should be quoted.
            The leading 'ses-' must be omitted.

    Returns:
        list
            Sorted list of the desired nifti file paths.

    Notes:
        All parameters except ``topdir`` and ``events_dir`` can be replaced
        by '*' (Default), which is equivalent to a UNIX ``find`` pattern.
    """
    from decoding.cimaq_decoding_params import _params

    # Generate regex and glob patterns
    bold_pattern = '_'.join([
        f'sub-{sub_id}', f'ses-{ses_id}', f'task-{task}', f'space-{space}',
        f'desc-{output_type}', f'{modality}{extension}'
    ])
    ev_pattern = f'sub-{sub_id}_ses-{ses_id}_task-{task}_events.tsv'

    # Load fMRI and events file paths into lists
    bold_paths = sorted(map(str, Path(topdir).rglob(f'*{bold_pattern}')))
    event_paths = sorted(map(str, Path(events_dir).rglob(f'*{ev_pattern}')))

    # Keep only the intersection of these lists
    valid_bold_paths = sorted(
        boldpath for boldpath in bold_paths
        if get_sub_ses_key(boldpath) in
        [get_sub_ses_key(apath) for apath in event_paths])
    valid_event_paths = sorted(
        evpath for evpath in event_paths
        if get_sub_ses_key(evpath) in
        [get_sub_ses_key(apath) for apath in valid_bold_paths])

    # Load corresponding anatomical T1w, brain mask and behavioural file paths
    valid_anat_paths = [
        get_fmriprep_anat(v_boldpath) for v_boldpath in valid_bold_paths
    ]
    valid_mask_paths = [
        get_fmriprep_mask(v_boldpath) for v_boldpath in valid_bold_paths
    ]
    valid_behav_paths = [
        get_behav(v_boldpath, events_dir) for v_boldpath in valid_bold_paths
    ]

    # Zip them together
    zipped_paths = sorted(
        zip(valid_bold_paths, valid_anat_paths, valid_mask_paths,
            valid_event_paths, valid_behav_paths))

    # Create sklearn.utils.Bunch objects
    sessions = [
        Bunch(**dict(
            zip([
                'fmri_path', 'anat_path', 'mask_path', 'events_path',
                'behav_path', 'sub_id', 'ses_id', 'task', 'space'
            ], (item + Path(item[0]).parts[-4:-2] +
                ('task-' + task, 'space-' + space)))))
        for item in zipped_paths
    ]

    # Set default keyword arguments and parameters
    [session.update(**_params) for session in sessions]

    return sessions
def __init__(self, opt, phase="train"):
    # TODO split the dataset of val and test
    if phase == "val":
        phase = "test"
        opt.load_dataset_mode = 'reader'
    super(Cifar100Dataset, self).__init__(opt, phase)
    self.data_dir = opt.cifar100_dataset_dir
    self.data_name = CIFAR100
    self.x_transforms_train = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.24705882352941178),
        transforms.Resize((opt.imsize, opt.imsize)),
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408),
                             (0.2675, 0.2565, 0.2761))
    ])
    self.x_transforms_test = transforms.Compose([
        transforms.Resize((opt.imsize, opt.imsize)),
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408),
                             (0.2675, 0.2565, 0.2761))
    ])
    self.y_transforms = None

    if self.opt.load_dataset_mode == 'dir':
        self.data = []  # image_paths, targets
        self.label2Indices = defaultdict(list)
        image_dir = os.path.join(self.data_dir, phase)
        self._labels = os.listdir(image_dir)
        # get label to targets, dict type
        self.label2target = dict([
            (label, target) for target, label in enumerate(self.labels)
        ])
        self.target2label = dict([
            (target, label) for target, label in enumerate(self.labels)
        ])
        if not os.path.exists(image_dir):
            raise FileNotFoundError(
                f"Image Dir {image_dir} not exists, please check it")
        for root, label_dirs, files in os.walk(image_dir):
            for file in files:
                label = os.path.basename(root)
                image_path = os.path.join(root, file)
                target = self.label2target[label]
                self.label2Indices[label].append(len(self.data))
                self.data.append(
                    Bunch(image_path=image_path, target=target))
    elif self.opt.load_dataset_mode == 'reader':
        dataset = datasets.CIFAR100(root=os.path.join(self.data_dir,
                                                      'raw_data'),
                                    train=self.isTrain,
                                    download=True)
        self.data, self._labels, self.label2Indices, self.label2target, \
            self.target2label = prepare_datas_by_standard_data(dataset)
    else:
        raise ValueError(
            f"Expected load_dataset_mode in [dir,reader], "
            f"but got {self.opt.load_dataset_mode}")
def fetch_cammoun2012(version='MNI152NLin2009aSym', data_dir=None, url=None,
                      resume=True, verbose=1):
    """
    Downloads files for Cammoun et al., 2012 multiscale parcellation

    Parameters
    ----------
    version : str, optional
        Specifies which version of the dataset to download, where
        'MNI152NLin2009aSym' will return .nii.gz atlas files defined in
        MNI152 space, 'fsaverageX' will return .annot files defined in
        fsaverageX space (FreeSurfer 6.0.1), 'fslr32k' will return
        .label.gii files in fs_LR_32k HCP space, and 'gcs' will return
        FreeSurfer-style .gcs probabilistic atlas files for generating new,
        subject-specific parcellations. Default: 'MNI152NLin2009aSym'
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['scale033', 'scale060', 'scale125',
        'scale250', 'scale500'], where corresponding values are lists of
        filepaths to downloaded parcellation files.

    References
    ----------
    Cammoun, L., Gigandet, X., Meskaldji, D., Thiran, J. P., Sporns, O.,
    Do, K. Q., Maeder, P., Meuli, R., & Hagmann, P. (2012). Mapping the human
    connectome at multiple scales with diffusion spectrum MRI. Journal of
    Neuroscience Methods, 203(2), 386-397.

    Notes
    -----
    License: https://raw.githubusercontent.com/LTS5/cmp/master/COPYRIGHT
    """
    if version == 'surface':
        warnings.warn(
            'Providing `version="surface"` is deprecated and will '
            'be removed in a future release. For consistent '
            'behavior please use `version="fsaverage"` instead.',
            DeprecationWarning, stacklevel=2)
        version = 'fsaverage'
    elif version == 'volume':
        warnings.warn(
            'Providing `version="volume"` is deprecated and will '
            'be removed in a future release. For consistent '
            'behavior please use `version="MNI152NLin2009aSym"` '
            'instead.', DeprecationWarning, stacklevel=2)
        version = 'MNI152NLin2009aSym'

    versions = [
        'gcs', 'fsaverage', 'fsaverage5', 'fsaverage6', 'fslr32k',
        'MNI152NLin2009aSym'
    ]
    if version not in versions:
        raise ValueError(
            'The version of Cammoun et al., 2012 parcellation '
            'requested "{}" does not exist. Must be one of {}'.format(
                version, versions))

    dataset_name = 'atl-cammoun2012'
    keys = ['scale033', 'scale060', 'scale125', 'scale250', 'scale500']

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)[version]
    if url is None:
        url = info['url']
    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    # filenames differ based on selected version of dataset
    if version == 'MNI152NLin2009aSym':
        filenames = [
            'atl-Cammoun2012_space-MNI152NLin2009aSym_res-{}_deterministic{}'
            .format(res[-3:], suff) for res in keys for suff in ['.nii.gz']
        ] + ['atl-Cammoun2012_space-MNI152NLin2009aSym_info.csv']
    elif version == 'fslr32k':
        filenames = [
            'atl-Cammoun2012_space-fslr32k_res-{}_hemi-{}_deterministic{}'
            .format(res[-3:], hemi, suff) for res in keys
            for hemi in ['L', 'R'] for suff in ['.label.gii']
        ]
    elif version in ('fsaverage', 'fsaverage5', 'fsaverage6'):
        filenames = [
            'atl-Cammoun2012_space-{}_res-{}_hemi-{}_deterministic{}'.format(
                version, res[-3:], hemi, suff) for res in keys
            for hemi in ['L', 'R'] for suff in ['.annot']
        ]
    else:
        filenames = [
            'atl-Cammoun2012_res-{}_hemi-{}_probabilistic{}'.format(
                res[5:], hemi, suff)
            for res in keys[:-1] + ['scale500v1', 'scale500v2', 'scale500v3']
            for hemi in ['L', 'R'] for suff in ['.gcs', '.ctab']
        ]

    files = [(op.join(dataset_name, version, f), url, opts)
             for f in filenames]
    data = _fetch_files(data_dir, files=files, resume=resume, verbose=verbose)

    if version == 'MNI152NLin2009aSym':
        keys += ['info']
    elif version in ('fslr32k', 'fsaverage', 'fsaverage5', 'fsaverage6'):
        data = [ANNOT(*data[i:i + 2]) for i in range(0, len(data), 2)]
    else:
        data = [data[::2][i:i + 2] for i in range(0, len(data) // 2, 2)]
        # deal with the fact that last scale is split into three files :sigh:
        data = data[:-3] + [list(itertools.chain.from_iterable(data[-3:]))]

    return Bunch(**dict(zip(keys, data)))
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import brier_score_loss
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.utils import Bunch

bunch_train = Bunch()
bunch_test = Bunch()

# Load the pickled word-bag data from local disk into memory
# (the paths below are local file paths to the serialized Bunch objects).
with open(r'G:\AI\Bunch_1w\Bunch_train', 'rb') as file_obj:
    bunch_train = pickle.load(file_obj)
with open(r'G:\AI\Bunch_1w\Bunch_test', 'rb') as file_obj_2:
    bunch_test = pickle.load(file_obj_2)


def metrics_result(actual, predict):
    print("Precision: {0:.3f}".format(
        metrics.precision_score(actual, predict, average='micro')))
    print("Recall: {0:0.3f}".format(
        metrics.recall_score(actual, predict, average='micro')))
def _load_ucr_dataset(dataset, path):
    """Load a UCR data set from a local folder.

    Parameters
    ----------
    dataset : str
        Name of the dataset.

    path : str
        The path of the folder containing the cached data set.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array
            The classification labels in the training set.
        target_test : array
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    Notes
    -----
    Padded values are represented as NaN's.

    """
    new_path = path + dataset + '/'

    try:
        with open(new_path + dataset + '.txt', encoding='utf-8') as f:
            description = f.read()
    except UnicodeDecodeError:
        with open(new_path + dataset + '.txt', encoding='ISO-8859-1') as f:
            description = f.read()

    try:
        data_train = np.genfromtxt(new_path + dataset + '_TRAIN.txt')
        data_test = np.genfromtxt(new_path + dataset + '_TEST.txt')

        X_train, y_train = data_train[:, 1:], data_train[:, 0]
        X_test, y_test = data_test[:, 1:], data_test[:, 0]
    except IndexError:
        train = loadarff(new_path + dataset + '_TRAIN.arff')
        test = loadarff(new_path + dataset + '_TEST.arff')

        data_train = np.asarray([train[0][name] for name in train[1].names()])
        X_train = data_train[:-1].T.astype('float64')
        y_train = data_train[-1]

        data_test = np.asarray([test[0][name] for name in test[1].names()])
        X_test = data_test[:-1].T.astype('float64')
        y_test = data_test[-1]

    try:
        y_train = y_train.astype('float64').astype('int64')
        y_test = y_test.astype('float64').astype('int64')
    except ValueError:
        pass

    bunch = Bunch(
        data_train=X_train, target_train=y_train,
        data_test=X_test, target_test=y_test,
        DESCR=description,
        url=("http://www.timeseriesclassification.com/"
             "description.php?Dataset={}".format(dataset))
    )

    return bunch
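# A small usage sketch (the dataset name and local cache path below are
# assumptions for illustration): train/test splits come back as separate
# Bunch attributes rather than a single data/target pair.
gunpoint = _load_ucr_dataset('GunPoint', path='./ucr_cache/')
X_train, y_train = gunpoint.data_train, gunpoint.target_train
X_test, y_test = gunpoint.data_test, gunpoint.target_test
print(X_train.shape, X_test.shape, gunpoint.url)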
def load_fon_w_campana():
    datos = np.load(ruta + 'fon_w_campana_ccas.npy')
    return Bunch(data=datos[:, :-1], target=datos[:, -1])
# we extract the values from the dataframe df_max to merge with the global df
df_to_merge = pd.DataFrame(df_max['encoded'].values.tolist())
df_to_merge['label'] = 1
df_enc = df_enc.append(df_to_merge, ignore_index=True)
df_enc['name'] = 'na'
df_enc.loc[df_enc['label'] == 1, 'name'] = df_max['name'].values
df_enc

# save the encoded results as 4x23 images
# we put the results in a bunch
encoded4x23 = Bunch(
    target_names=df_enc['name'].values,
    target=df_enc['label'].values,
    images=(df_enc.iloc[:, 0:92].values * 254).reshape(-1, 4, 23, order='F')
)

plt.imshow(encoded4x23.images[0], cmap='Greys')

ispickle = False
if ispickle is True:
    # we create the pkl file for later use
    pickle_out = open("encoded4x23withoutTsai.pkl", "wb")
    pkl.dump(encoded4x23, pickle_out)
    pickle_out.close()

# A -> [1, 0, 0, 0]
# G -> [0, 1, 0, 0]
# C -> [0, 0, 1, 0]
# T -> [0, 0, 0, 1]
def load_fon_w_braso():
    datos = np.load(ruta + 'fon_w_braso_ccas.npy')
    return Bunch(data=datos[:, :-1], target=datos[:, -1])


def load_art_w_petaka():
    datos = np.load(ruta + 'art_w_petaka_ccas.npy')
    return Bunch(data=datos[:, :-1], target=datos[:, -1])


def load_prs_rt():
    datos = np.load(ruta + 'prs_rt_ccas.npy')
    return Bunch(data=datos[:, :-1], target=datos[:, -1])
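# A hedged usage sketch (it assumes `ruta` points at a folder containing the
# .npy feature files loaded above): these loaders follow the scikit-learn
# convention of a Bunch with `data` (features) and `target` (labels), so
# they drop straight into an estimator.
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

prs = load_prs_rt()
scores = cross_val_score(SVC(gamma='scale'), prs.data, prs.target, cv=5)
print(scores.mean())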
def set_args(predefined_dict_groups):
    # options for model
    args = Bunch()
    args.mask_mode = 'cross-wise'  # in ['row_wise', 'col_wise', 'cross_wise', 'cross_and_hier_wise']
    args.additional_ban = 0
    # args.pooling = 'avg-token'
    args.pooling = 'avg-cell-seg'
    args.table_object = 'first-column'
    args.noise_num = 2
    args.seq_len = 100
    args.row_wise_fill = True
    args.pretrained_model_path = "./models/bert_model.bin-000"
    args.vocab_path = 'models/google_uncased_en_vocab.txt'
    args.vocab = Vocab()
    args.vocab.load(args.vocab_path)
    args.emb_size = 768
    args.embedding = 'tab'  # before: bert
    args.encoder = 'bertTab'
    args.subword_type = 'none'
    args.tokenizer = 'bert'
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
    args.feedforward_size = 3072
    args.hidden_size = 768
    args.heads_num = 12
    args.layers_num = 12
    args.learning_rate = 2e-5
    # args.learning_rate = 1e-4
    args.warmup = 0.1
    args.batch_size = 32
    args.dropout = 0.1
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # other options
    args.shuffle_rows = True
    args.report_steps = 100

    for predefined_dict_group in predefined_dict_groups.values():
        for k, v in predefined_dict_group.items():
            args[k] = v

    args.labels_map = get_labels_map_from_aida_file_2(args.train_path)
    args.labels_num = len(args.labels_map)

    # logger and tensorboard writer
    if args.tx_logger_dir_name:
        rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
        args.summary_writer = SummaryWriter(logdir=os.path.join(
            args.tx_logger_dir_name, '-'.join([args.exp_name, rq])))
    else:
        args.summary_writer = None
    if args.logger_dir_name is not None:
        args.logger_name = 'detail'
        args.logger = get_logger(logger_name=args.logger_name,
                                 dir_name=args.logger_dir_name,
                                 file_name=args.logger_file_name)
    else:
        args.logger = None

    return args
# In[794]:

# Directly from the songs
for f in listdir("data/raw"):
    arquivo = "data/raw/" + f
    genero_df, genero = carregar_arquivo(arquivo)
    # stemm = nltk.stem.RSLPStemmer()
    for l in genero_df.values:
        normalized = pre_process(l[0])
        data.append(normalized)
        target.append(genero)

dataset = Bunch(data=data, target=target)


# In[733]:

X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                     dataset.target,
                                                     test_size=0.30,
                                                     random_state=42)


# In[ ]:


# In[734]:
def get_result(bunch):
    time_started = time()
    x_train, x_test, y_train, y_test = train_test_split(bunch.data,
                                                        bunch.target,
                                                        test_size=0.5,
                                                        random_state=10)
    clf = svm.SVC(gamma='scale')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    time_finished = time()
    return accuracy_score(y_test, y_pred), time_finished - time_started


print(f'our accuracy b is {get_result(iris)[0]}\n')

# There is a better way to create a bunch object (see new_cancer)
new_iris = Bunch()
new_iris.DESCR = DESCR
new_iris.data = d
new_iris.target = labels
new_iris.target_names = labels_names
new_iris.feature_names = feature_names

result, time_taken = get_result(new_iris)
print(
    f'Our accuracy for new_iris is {result}\nCompleted in {1000*time_taken:.3f}\n'
)

cancer = datasets.load_breast_cancer()
result, time_taken = get_result(cancer)
print(
    f'Our accuracy for cancer dataset is {result}\nCompleted in {1000*time_taken:.3f}\n'
)
def compute_score(X, G, P, S, GWprior=None, score_method='bic'):
    """Compute score function of P."""
    n_samples, n_dim = X.shape

    d0 = GWprior.d0
    S0 = GWprior.S0

    # check prior size violations
    if S0.shape[0] != n_dim or S0.shape[1] != n_dim:
        raise ValueError('GWprior.S0 must be p-by-p, with p dimensions X')

    dn = n_samples + d0
    # C = (S + S0) / (dn - 2)

    # % need logdetP and invP
    # es, Q = np.linalg.eigh(x)
    # Inv = np.linalg.multi_dot((Q, np.diag(1. / es), Q.T))
    U, s, Vh = linalg.svd(P)  # check
    invP = np.linalg.multi_dot((Vh.T, np.diag(1. / s), U.T))
    logdetP = np.sum(np.log(s))

    # % compute loglik
    loglik = n_samples * log_likelihood(S / n_samples, P)

    num_edges = np.triu(G, 1).sum()
    dof = num_edges + n_dim

    # pcor = cov2cor(P);

    # % the posterior Sn parameter
    # Sn = (dn - 2) * invP
    logh = (dn - 2) / 2. * (n_dim + logdetP)

    # find full param set V
    Vi, Vj = np.nonzero(np.triu(G))
    # to be the same as matlab
    idx = np.argsort(Vj)
    Vi, Vj = Vi[idx], Vj[idx]

    GWpost = Bunch()
    GWpost.Sn = S + S0
    # GWpost.C = C
    GWpost.dn = dn
    GWpost.P = P
    GWpost.num_edges = num_edges
    GWpost.dof = dof
    GWpost.logdetP = logdetP
    GWpost.loglik = loglik

    if score_method == 'bic':
        score = loglik - dof * np.log(n_samples) / 2 if n_samples > 0 else 0

    elif score_method == 'diaglaplace':
        # Diagonal hessian laplace approximation
        diagH = np.zeros(dof)
        for e1 in range(dof):
            # e2 = e1
            M1 = np.zeros((n_dim, n_dim))
            # M2 = M1.copy()

            nz1 = [Vi[e1], Vj[e1]]
            # nz2 = [Vi[e2], Vj[e2]]

            M1[:, nz1] = invP[:, [Vj[e1], Vi[e1]]]
            # M2[:, nz2] = invP[:, [Vj[e2], Vi[e2]]]

            # A = M1[nz2][:, nz1]
            # B = M2[nz1][:, nz2]
            A = M1[nz1][:, nz1]
            B = A

            tmp2 = A[0, :].dot(B[:, 0]) + A[1, :].dot(B[:, 1])
            diagH[e1] = -(dn - 2) * tmp2 / 2
            # diagH(e1) = -(dn-2) * trace(M1(nz2,nz1)*M2(nz1,nz2))/2;

        logdetHdiag = sum(np.log(-diagH))
        lognormconst = dof * np.log(2 * np.pi) / 2 + logh - logdetHdiag / 2.
        score = lognormconst - GWprior.lognormconst - \
            n_samples * n_dim * np.log(2 * np.pi) / 2
        GWpost.lognormconst = lognormconst

    elif score_method == 'laplace':
        # Full laplace approximation
        H = np.empty((dof, dof))
        for e1 in range(dof):
            # nz1 = [Vi[e1], Vj[e1]]
            i, j = Vi[e1], Vj[e1]
            for e2 in range(e1, dof):
                # nz2 = [Vi[e2], Vj[e2]]
                l, m = Vi[e2], Vj[e2]

                # A = invP[nz2][:, [Vj[e1], Vi[e1]]]
                # B = invP[nz1][:, [Vj[e2], Vi[e2]]]
                A = invP[[l, m]][:, [j, i]]
                B = invP[[i, j]][:, [m, l]]

                # tmp2 = A[0, :].dot(B[:, 0]) + A[1, :].dot(B[:, 1])
                # tmp2 = np.trace(A.dot(B))
                tmp2 = (A * B.T).sum()

                H[e2, e1] = H[e1, e2] = -(dn - 2) * tmp2 / 2.

        # neg Hessian will be posdef
        logdetH = 2 * sum(np.log(np.diag(linalg.cholesky(-H))))

        lognormconst = dof * np.log(2 * np.pi) / 2 + logh - logdetH / 2.
        score = lognormconst - GWprior.lognormconst - \
            n_samples * n_dim * np.log(2 * np.pi) / 2
        GWpost.lognormconst = lognormconst

    GWpost.score = score
    return GWpost
def fetch_conte69(data_dir=None, url=None, resume=True, verbose=1):
    """
    Downloads files for Van Essen et al., 2012 Conte69 template

    Parameters
    ----------
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['midthickness', 'inflated',
        'vinflated'], where corresponding values are lists of filepaths to
        downloaded template files.

    References
    ----------
    http://brainvis.wustl.edu/wiki/index.php//Caret:Atlases/Conte69_Atlas

    Van Essen, D. C., Glasser, M. F., Dierker, D. L., Harwell, J., &
    Coalson, T. (2011). Parcellations and hemispheric asymmetries of human
    cerebral cortex analyzed on surface-based atlases. Cerebral cortex,
    22(10), 2241-2262.

    Notes
    -----
    License: ???
    """
    dataset_name = 'tpl-conte69'
    keys = ['midthickness', 'inflated', 'vinflated']

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)
    if url is None:
        url = info['url']

    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    filenames = [
        'tpl-conte69/tpl-conte69_space-MNI305_variant-fsLR32k_{}.{}.surf.gii'
        .format(res, hemi) for res in keys for hemi in ['L', 'R']
    ] + ['tpl-conte69/template_description.json']

    data = _fetch_files(data_dir, files=[(f, url, opts) for f in filenames],
                        resume=resume, verbose=verbose)

    with open(data[-1], 'r') as src:
        data[-1] = json.load(src)

    # bundle hemispheres together
    data = [ANNOT(*data[:-1][i:i + 2]) for i in range(0, 6, 2)] + [data[-1]]

    return Bunch(**dict(zip(keys + ['info'], data)))
def fetch_census_dataset():
    """Fetch the Adult Census Dataset.

    This uses a particular URL for the Adult Census dataset. The code is a
    simplified version of fetch_openml() in sklearn.

    The data are copied from:
    https://openml.org/data/v1/download/1595261.gz
    (as of 2021-03-31)
    """
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    filename = "1595261.gz"
    data_url = "https://rainotebookscdn.blob.core.windows.net/datasets/"

    remaining_attempts = 5
    sleep_duration = 10
    while remaining_attempts > 0:
        try:
            urlretrieve(data_url + filename, filename)

            http_stream = gzip.GzipFile(filename=filename, mode='rb')

            with closing(http_stream):

                def _stream_generator(response):
                    for line in response:
                        yield line.decode('utf-8')

                stream = _stream_generator(http_stream)
                data = arff.load(stream)
        except Exception as exc:  # noqa: B902
            remaining_attempts -= 1
            print("Error downloading dataset from {} ({} attempt(s) remaining)"
                  .format(data_url, remaining_attempts))
            print(exc)
            time.sleep(sleep_duration)
            sleep_duration *= 2
            continue
        else:
            # dataset successfully downloaded
            break
    else:
        raise Exception("Could not retrieve dataset from {}.".format(data_url))

    attributes = OrderedDict(data['attributes'])
    arff_columns = list(attributes)

    raw_df = pd.DataFrame(data=data['data'], columns=arff_columns)

    target_column_name = 'class'
    target = raw_df.pop(target_column_name)
    for col_name in _categorical_columns:
        dtype = pd.api.types.CategoricalDtype(attributes[col_name])
        raw_df[col_name] = raw_df[col_name].astype(dtype, copy=False)

    result = Bunch()
    result.data = raw_df
    result.target = target

    return result
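# A brief usage sketch (not part of the original source; the split
# parameters are illustrative): the census Bunch holds a pandas DataFrame
# in `data` and the 'class' column in `target`.
from sklearn.model_selection import train_test_split

census = fetch_census_dataset()
print(census.data.shape, census.target.value_counts())
X_train, X_test, y_train, y_test = train_test_split(
    census.data, census.target, test_size=0.2, random_state=0)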
def fetch_fsaverage(version='fsaverage', data_dir=None, url=None, resume=True,
                    verbose=1):
    """
    Downloads files for fsaverage FreeSurfer template

    Parameters
    ----------
    version : str, optional
        One of {'fsaverage', 'fsaverage3', 'fsaverage4', 'fsaverage5',
        'fsaverage6'}. Default: 'fsaverage'
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['surf'] where corresponding values
        are length-2 lists of downloaded template files (each list composed
        of files for the left and right hemisphere).

    References
    ----------

    """
    versions = [
        'fsaverage', 'fsaverage3', 'fsaverage4', 'fsaverage5', 'fsaverage6'
    ]
    if version not in versions:
        raise ValueError('The version of fsaverage requested "{}" does not '
                         'exist. Must be one of {}'.format(version, versions))

    dataset_name = 'tpl-fsaverage'
    keys = ['orig', 'white', 'smoothwm', 'pial', 'inflated', 'sphere']

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)[version]
    if url is None:
        url = info['url']

    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    filenames = [
        op.join(version, 'surf', '{}.{}'.format(hemi, surf))
        for surf in keys for hemi in ['lh', 'rh']
    ]

    try:
        data_dir = check_fs_subjid(version)[1]
        data = [op.join(data_dir, f) for f in filenames]
    except FileNotFoundError:
        data = _fetch_files(data_dir, resume=resume, verbose=verbose,
                            files=[(op.join(dataset_name, f), url, opts)
                                   for f in filenames])

    data = [ANNOT(*data[i:i + 2]) for i in range(0, len(keys) * 2, 2)]

    return Bunch(**dict(zip(keys, data)))
def fetch_icbm152_2009(data_dir=None, url=None, resume=True, verbose=1):
    """Download and load the ICBM152 template (dated 2009)

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a non-
        standard location. Default: None (meaning: default)
    url: string, optional
        Download URL of the dataset. Overwrite the default URL.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, interest keys are:
        "t1", "t2", "t2_relax", "pd": anatomical images obtained with the
        given modality (resp. T1, T2, T2 relaxometry and proton
        density weighted). Values are file paths.
        "gm", "wm", "csf": segmented images, giving resp. gray matter,
        white matter and cerebrospinal fluid. Values are file paths.
        "eye_mask", "face_mask", "mask": use these images to mask out
        parts of mri images. Values are file paths.

    References
    ----------
    VS Fonov, AC Evans, K Botteron, CR Almli, RC McKinstry, DL Collins and
    BDCG, "Unbiased average age-appropriate atlases for pediatric studies",
    NeuroImage, Volume 54, Issue 1, January 2011

    VS Fonov, AC Evans, RC McKinstry, CR Almli and DL Collins,
    "Unbiased nonlinear average age-appropriate brain templates from birth
    to adulthood", NeuroImage, Volume 47, Supplement 1, July 2009, Page S102
    Organization for Human Brain Mapping 2009 Annual Meeting.

    DL Collins, AP Zijdenbos, WFC Baare and AC Evans,
    "ANIMAL+INSECT: Improved Cortical Structure Segmentation",
    IPMI Lecture Notes in Computer Science, 1999, Volume 1613/1999, 210-223

    Notes
    -----
    For more information about this dataset's structure:
    http://www.bic.mni.mcgill.ca/ServicesAtlases/ICBM152NLin2009

    The original download URL is
    http://www.bic.mni.mcgill.ca/~vfonov/icbm/2009/mni_icbm152_nlin_sym_09a_nifti.zip
    """
    if url is None:
        # The URL can be retrieved from the nilearn account on OSF (Open
        # Science Framework), https://osf.io/4r3jt/quickfiles/
        # Clicking on the "share" button gives the root of the URL.
        url = "https://osf.io/7pj92/download"
    opts = {'uncompress': True}

    keys = ("csf", "gm", "wm",
            "pd", "t1", "t2", "t2_relax",
            "eye_mask", "face_mask", "mask")
    filenames = [
        (os.path.join("mni_icbm152_nlin_sym_09a", name), url, opts)
        for name in ("mni_icbm152_csf_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_gm_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_wm_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_pd_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_t1_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_t2_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_t2_relx_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_t1_tal_nlin_sym_09a_eye_mask.nii.gz",
                     "mni_icbm152_t1_tal_nlin_sym_09a_face_mask.nii.gz",
                     "mni_icbm152_t1_tal_nlin_sym_09a_mask.nii.gz")
    ]

    dataset_name = 'icbm152_2009'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    sub_files = _fetch_files(data_dir, filenames, resume=resume,
                             verbose=verbose)

    fdescr = _get_dataset_descr(dataset_name)

    params = dict([('description', fdescr)] + list(zip(keys, sub_files)))
    return Bunch(**params)
def fetch_schaefer2018(version='fsaverage', data_dir=None, url=None,
                       resume=True, verbose=1):
    """
    Downloads FreeSurfer .annot files for Schaefer et al., 2018 parcellation

    Parameters
    ----------
    version : {'fsaverage', 'fsaverage5', 'fsaverage6', 'fslr32k'}
        Specifies which surface annotation files should be matched to.
        Default: 'fsaverage'
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys of format '{}Parcels{}Networks'
        where corresponding values are the left/right hemisphere annotation
        files

    References
    ----------
    Schaefer, A., Kong, R., Gordon, E. M., Laumann, T. O., Zuo, X. N.,
    Holmes, A. J., ... & Yeo, B. T. (2017). Local-global parcellation of the
    human cerebral cortex from intrinsic functional connectivity MRI.
    Cerebral Cortex, 28(9), 3095-3114.

    Notes
    -----
    License: https://github.com/ThomasYeoLab/CBIG/blob/master/LICENSE.md
    """
    versions = ['fsaverage', 'fsaverage5', 'fsaverage6', 'fslr32k']
    if version not in versions:
        raise ValueError(
            'The version of Schaefer et al., 2018 parcellation '
            'requested "{}" does not exist. Must be one of {}'.format(
                version, versions))

    dataset_name = 'atl-schaefer2018'
    keys = [
        '{}Parcels{}Networks'.format(p, n)
        for p in range(100, 1001, 100) for n in [7, 17]
    ]

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)[version]
    if url is None:
        url = info['url']

    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    if version == 'fslr32k':
        hemispheres, suffix = ['LR'], 'dlabel.nii'
    else:
        hemispheres, suffix = ['L', 'R'], 'annot'
    filenames = [
        'atl-Schaefer2018_space-{}_hemi-{}_desc-{}_deterministic.{}'.format(
            version, hemi, desc, suffix)
        for desc in keys for hemi in hemispheres
    ]

    files = [(op.join(dataset_name, version, f), url, opts)
             for f in filenames]
    data = _fetch_files(data_dir, files=files, resume=resume, verbose=verbose)

    if suffix == 'annot':
        data = [ANNOT(*data[i:i + 2]) for i in range(0, len(keys) * 2, 2)]

    return Bunch(**dict(zip(keys, data)))
def fetch_oasis_vbm(n_subjects=None, dartel_version=True, data_dir=None,
                    url=None, resume=True, verbose=1):
    """Download and load Oasis "cross-sectional MRI" dataset (416 subjects).

    Parameters
    ----------
    n_subjects: int, optional
        The number of subjects to load. If None is given, all the
        subjects are used.

    dartel_version: boolean,
        Whether or not to use data normalized with DARTEL instead of standard
        SPM8 normalization.

    data_dir: string, optional
        Path of the data directory. Used to force data storage in a specified
        location. Default: None

    url: string, optional
        Override download URL. Used for test only (or if you setup a mirror
        of the data).

    resume: bool, optional
        If true, try resuming download if possible

    verbose: int, optional
        verbosity level (0 means no message).

    Returns
    -------
    data: Bunch
        Dictionary-like object, the interest attributes are :

        - 'gray_matter_maps': string list
          Paths to nifti gray matter density probability maps
        - 'white_matter_maps': string list
          Paths to nifti white matter density probability maps
        - 'ext_vars': np.recarray
          Data from the .csv file with information about selected subjects
        - 'data_usage_agreement': string
          Path to the .txt file containing the data usage agreement.

    References
    ----------
    * http://www.oasis-brains.org/

    * Open Access Series of Imaging Studies (OASIS): Cross-sectional MRI
      Data in Young, Middle Aged, Nondemented, and Demented Older Adults.
      Marcus, D. S and al., 2007, Journal of Cognitive Neuroscience.

    Notes
    -----
    In the DARTEL version, original Oasis data have been preprocessed
    with the following steps:

      1. Dimension swapping (technically required for subsequent steps)
      2. Brain Extraction
      3. Segmentation with SPM8
      4. Normalization using DARTEL algorithm
      5. Modulation
      6. Replacement of NaN values with 0 in gray/white matter density maps.
      7. Resampling to reduce shape and make it correspond to the shape of
         the non-DARTEL data (fetched with dartel_version=False).
      8. Replacement of values < 1e-4 with zeros to reduce the file size.

    In the non-DARTEL version, the following steps have been performed
    instead:

      1. Dimension swapping (technically required for subsequent steps)
      2. Brain Extraction
      3. Segmentation and normalization to a template with SPM8
      4. Modulation
      5. Replacement of NaN values with 0 in gray/white matter density maps.

    An archive containing the gray and white matter density probability maps
    for the 416 available subjects is provided. Gross outliers are removed
    and filtered by this data fetcher (DARTEL: 13 outliers; non-DARTEL: 1
    outlier). Externals variates (age, gender, estimated intracranial volume,
    years of education, socioeconomic status, dementia score) are provided
    in a CSV file that is a copy of the original Oasis CSV file. The current
    downloader loads the CSV file and keeps only the lines corresponding to
    the subjects that are actually demanded.

    The Open Access Structural Imaging Series (OASIS) is a project
    dedicated to making brain imaging data openly available to the public.
    Using data available through the OASIS project requires agreeing with
    the Data Usage Agreement that can be found at
    http://www.oasis-brains.org/app/template/UsageAgreement.vm
    """
    # check number of subjects
    if n_subjects is None:
        n_subjects = 403 if dartel_version else 415
    if dartel_version:  # DARTEL version has 13 identified outliers
        if n_subjects > 403:
            warnings.warn('Only 403 subjects are available in the '
                          'DARTEL-normalized version of the dataset. '
                          'All of them will be used instead of the wanted %d'
                          % n_subjects)
            n_subjects = 403
    else:  # all subjects except one are available with non-DARTEL version
        if n_subjects > 415:
            warnings.warn('Only 415 subjects are available in the '
                          'non-DARTEL-normalized version of the dataset. '
                          'All of them will be used instead of the wanted %d'
                          % n_subjects)
            n_subjects = 415
    if n_subjects < 1:
        raise ValueError("Incorrect number of subjects (%d)" % n_subjects)

    # pick the archive corresponding to preprocessings type
    if url is None:
        if dartel_version:
            url_images = ('https://www.nitrc.org/frs/download.php/'
                          '6364/archive_dartel.tgz?i_agree=1&download_now=1')
        else:
            url_images = ('https://www.nitrc.org/frs/download.php/'
                          '6359/archive.tgz?i_agree=1&download_now=1')
        # covariates and license are in separate files on NITRC
        url_csv = ('https://www.nitrc.org/frs/download.php/'
                   '6348/oasis_cross-sectional.csv?i_agree=1&download_now=1')
        url_dua = ('https://www.nitrc.org/frs/download.php/'
                   '6349/data_usage_agreement.txt?i_agree=1&download_now=1')
    else:  # local URL used in tests
        url_csv = url + "/oasis_cross-sectional.csv"
        url_dua = url + "/data_usage_agreement.txt"
        if dartel_version:
            url_images = url + "/archive_dartel.tgz"
        else:
            url_images = url + "/archive.tgz"

    opts = {'uncompress': True}

    # missing subjects create shifts in subjects ids
    missing_subjects = [
        8, 24, 36, 48, 89, 93, 100, 118, 128, 149, 154, 171, 172, 175, 187,
        194, 196, 215, 219, 225, 242, 245, 248, 251, 252, 257, 276, 297, 306,
        320, 324, 334, 347, 360, 364, 391, 393, 412, 414, 427, 436
    ]

    if dartel_version:
        # DARTEL produces outliers that are hidden by nilearn API
        removed_outliers = [
            27, 57, 66, 83, 122, 157, 222, 269, 282, 287, 309, 428
        ]
        missing_subjects = sorted(missing_subjects + removed_outliers)
        file_names_gm = [(
            os.path.join("OAS1_%04d_MR1",
                         "mwrc1OAS1_%04d_MR1_mpr_anon_fslswapdim_bet.nii.gz")
            % (s, s), url_images, opts)
            for s in range(1, 457) if s not in missing_subjects][:n_subjects]
        file_names_wm = [(
            os.path.join("OAS1_%04d_MR1",
                         "mwrc2OAS1_%04d_MR1_mpr_anon_fslswapdim_bet.nii.gz")
            % (s, s), url_images, opts)
            for s in range(1, 457) if s not in missing_subjects]
    else:
        # only one gross outlier produced, hidden by nilearn API
        removed_outliers = [390]
        missing_subjects = sorted(missing_subjects + removed_outliers)
        file_names_gm = [(
            os.path.join("OAS1_%04d_MR1",
                         "mwc1OAS1_%04d_MR1_mpr_anon_fslswapdim_bet.nii.gz")
            % (s, s), url_images, opts)
            for s in range(1, 457) if s not in missing_subjects][:n_subjects]
        file_names_wm = [(
            os.path.join("OAS1_%04d_MR1",
                         "mwc2OAS1_%04d_MR1_mpr_anon_fslswapdim_bet.nii.gz")
            % (s, s), url_images, opts)
            for s in range(1, 457) if s not in missing_subjects]

    file_names_extvars = [("oasis_cross-sectional.csv", url_csv, {})]
    file_names_dua = [("data_usage_agreement.txt", url_dua, {})]

    # restrict to user-specified number of subjects
    file_names_gm = file_names_gm[:n_subjects]
    file_names_wm = file_names_wm[:n_subjects]

    file_names = (file_names_gm + file_names_wm + file_names_extvars +
                  file_names_dua)

    dataset_name = 'oasis1'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files = _fetch_files(data_dir, file_names, resume=resume, verbose=verbose)

    # Build Bunch
    gm_maps = files[:n_subjects]
    wm_maps = files[n_subjects:(2 * n_subjects)]
    ext_vars_file = files[-2]
    data_usage_agreement = files[-1]

    # Keep CSV information only for selected subjects
    csv_data = np.recfromcsv(ext_vars_file)
    # Comparisons to recfromcsv data must be bytes.
    actual_subjects_ids = [
        ("OAS1" + str.split(os.path.basename(x), "OAS1")[1][:9]).encode()
        for x in gm_maps
    ]
    subject_mask = np.asarray(
        [subject_id in actual_subjects_ids for subject_id in csv_data['id']])
    csv_data = csv_data[subject_mask]

    fdescr = _get_dataset_descr(dataset_name)

    return Bunch(gray_matter_maps=gm_maps,
                 white_matter_maps=wm_maps,
                 ext_vars=csv_data,
                 data_usage_agreement=data_usage_agreement,
                 description=fdescr)
def get_args_aida_task():
    args = Bunch()
    args.seq_len = 64
    args.row_wise_fill = True
    args.mask_mode = 'cross-wise'
    args.additional_ban = 2
    args.table_object = 'first-column'
    args.pooling = 'avg-token'
    args.pretrained_model_path = "./models/bert_model.bin-000"
    args.vocab_path = 'models/google_uncased_en_vocab.txt'
    args.vocab = Vocab()
    args.vocab.load(args.vocab_path)
    args.emb_size = 768
    args.embedding = 'tab'  # before: bert
    args.encoder = 'bertTab'
    args.subword_type = 'none'
    args.tokenizer = 'bert'
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
    args.feedforward_size = 3072
    args.hidden_size = 768
    args.heads_num = 12
    args.layers_num = 12
    args.learning_rate = 2e-5
    args.batch_size = 4
    args.dropout = 0.1
    # args.target = 'bert'
    return args
def fit(self, X, return_pi_T=False):
    """See Algorithm 1 in Li & Yu '14.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_experts)

    return_pi_T : boolean
        Whether or not to return (accuracies, labels) as a tuple instead of
        a Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'accuracies', the estimated expert accuracies, 'labels', the
        best-guess labels, 'class_names', the name (and ordering) of each
        unique class this estimator observed, and 'probs', the probability
        of each possible label for each sample (None if not available).
        The ordering of columns in probs corresponds to that in class_names.
    """
    Z = np.array(X)
    n_samples, n_experts = np.shape(Z)

    # Workaround for not getting NaNs in the list of classes,
    # since NaN == NaN evaluates to False
    classes = np.sort(pd.Series(Z.flatten()).dropna().unique())
    L = len(classes)

    # Initialize equal weights for all experts
    v = np.array([1 for i in range(n_experts)])

    # Indicator matrix, response or no-response
    T = ~pd.isnull(Z)
    T = np.array(T).astype(int)

    s = 0  # Keep track of iterations
    converged = False

    # Initialize 'best-guess' with all one class
    y_prev = np.full(n_samples, classes[0])

    while (s < self.n_iter and not converged):
        # Estimate best-guess labels
        all_votes = np.array(
            [np.sum(v * (Z == k).astype(int), axis=1) for k in classes])
        y_hat = np.array([classes[i] for i in np.argmax(all_votes, axis=0)])

        # Calculate expert accuracies (according to the updated best-guess
        # labels)
        w_hat = np.sum((Z.T == y_hat).astype(int), axis=1)
        w_hat = w_hat / np.sum(T, axis=0)

        # Calculate new expert weights (how much their vote counts)
        if self.mode == 'log':
            MIN_INT = np.iinfo(np.int16).min
            v = np.array([MIN_INT if w_i == 0
                          else math.log((L - 1) * w_i) / (1 - w_i)
                          for w_i in w_hat])
        else:
            # Derived in eq. 33 in the Li & Yu paper
            v = L * w_hat - 1

        # If the labels haven't changed since last time, it's converged!
        if (y_hat == y_prev).all():
            converged = True

        # Update number of iterations completed
        s += 1
        y_prev = y_hat

    if return_pi_T:
        return w_hat, y_hat
    else:
        return Bunch(accuracies=w_hat, labels=y_hat, probs=None,
                     class_names=classes)
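# A hypothetical usage sketch. The enclosing class name `IWMV` and its
# constructor are assumptions (the source only shows the fit method); only
# the fit() return values below follow from the method above. Rows are
# samples, columns are expert votes, and NaN marks a missing response.
import numpy as np

votes = np.array([[1, 1, 0],
                  [0, np.nan, 0],
                  [1, 1, 1]])
est = IWMV()  # hypothetical estimator exposing n_iter and mode
result = est.fit(votes)
print(result.labels)      # best-guess label per sample
print(result.accuracies)  # estimated accuracy per expert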
def read_data(name,
              with_classes=True,
              prefer_attr_nodes=False,
              prefer_attr_edges=False,
              produce_labels_nodes=False,
              as_graphs=False,
              is_symmetric=symmetric_dataset):
    """Create a dataset iterable for GraphKernel.

    Parameters
    ----------
    name : str
        The dataset name.

    with_classes : bool, default=True
        Return an iterable of class labels based on the enumeration.

    produce_labels_nodes : bool, default=False
        Produce labels for nodes if not found.
        Currently this means labeling each node by its degree inside the
        Graph. This operation is applied only if node labels are
        non-existent.

    prefer_attr_nodes : bool, default=False
        If a dataset has both *node* labels and *node* attributes,
        set the attributes as labels of the graph object's *nodes*.

    prefer_attr_edges : bool, default=False
        If a dataset has both *edge* labels and *edge* attributes,
        set the attributes as labels of the graph object's *edges*.

    as_graphs : bool, default=False
        Return data as a list of Graph Objects.

    is_symmetric : bool, default=False
        Defines if the graph data describe a symmetric graph.

    Returns
    -------
    Gs : iterable
        An iterable of graphs consisting of a dictionary, node labels and
        edge labels for each graph.

    classes : np.array, case_of_appearance=with_classes==True
        A one-dimensional array of graph classes aligned with the lines of
        the `Gs` iterable. Useful for classification.

    """
    indicator_path = "./" + str(name) + "/" + str(name) + \
        "_graph_indicator.txt"
    edges_path = "./" + str(name) + "/" + str(name) + "_A.txt"
    node_labels_path = "./" + str(name) + "/" + str(name) + "_node_labels.txt"
    node_attributes_path = "./" + str(name) + "/" + str(name) + \
        "_node_attributes.txt"
    edge_labels_path = "./" + str(name) + "/" + str(name) + "_edge_labels.txt"
    edge_attributes_path = \
        "./" + str(name) + "/" + str(name) + "_edge_attributes.txt"
    graph_classes_path = \
        "./" + str(name) + "/" + str(name) + "_graph_labels.txt"

    # node graph correspondence
    ngc = dict()
    # edge line correspondence
    elc = dict()
    # dictionary that keeps sets of edges
    Graphs = dict()
    # dictionary of labels for nodes
    node_labels = dict()
    # dictionary of labels for edges
    edge_labels = dict()

    # Associate graphs nodes with indexes
    with open(indicator_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            ngc[i] = int(line[:-1])
            if int(line[:-1]) not in Graphs:
                Graphs[int(line[:-1])] = set()
            if int(line[:-1]) not in node_labels:
                node_labels[int(line[:-1])] = dict()
            if int(line[:-1]) not in edge_labels:
                edge_labels[int(line[:-1])] = dict()

    # Extract graph edges
    with open(edges_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            edge = line[:-1].replace(' ', '').split(",")
            elc[i] = (int(edge[0]), int(edge[1]))
            Graphs[ngc[int(edge[0])]].add((int(edge[0]), int(edge[1])))
            if is_symmetric:
                Graphs[ngc[int(edge[1])]].add((int(edge[1]), int(edge[0])))

    # Extract node attributes
    if (prefer_attr_nodes and
            dataset_metadata[name].get(
                "na", os.path.exists(node_attributes_path))):
        with open(node_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = \
                    [float(num) for num in
                     line[:-1].replace(' ', '').split(",")]
    # Extract node labels
    elif dataset_metadata[name].get("nl", os.path.exists(node_labels_path)):
        with open(node_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = int(line[:-1])
    elif produce_labels_nodes:
        for i in range(1, len(Graphs) + 1):
            node_labels[i] = dict(Counter(s for (s, d) in Graphs[i] if s != d))

    # Extract edge attributes
    if (prefer_attr_edges and
            dataset_metadata[name].get(
                "ea", os.path.exists(edge_attributes_path))):
        with open(edge_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                attrs = [float(num) for num in
                         line[:-1].replace(' ', '').split(",")]
                edge_labels[ngc[elc[i][0]]][elc[i]] = attrs
                if is_symmetric:
                    edge_labels[ngc[elc[i][1]]][(elc[i][1], elc[i][0])] = attrs
    # Extract edge labels
    elif dataset_metadata[name].get("el", os.path.exists(edge_labels_path)):
        with open(edge_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                edge_labels[ngc[elc[i][0]]][elc[i]] = int(line[:-1])
                if is_symmetric:
                    edge_labels[ngc[elc[i][1]]][(elc[i][1], elc[i][0])] = \
                        int(line[:-1])

    Gs = list()
    if as_graphs:
        for i in range(1, len(Graphs) + 1):
            Gs.append(Graph(Graphs[i], node_labels[i], edge_labels[i]))
    else:
        for i in range(1, len(Graphs) + 1):
            Gs.append([Graphs[i], node_labels[i], edge_labels[i]])

    if with_classes:
        classes = []
        with open(graph_classes_path, "r") as f:
            for line in f:
                classes.append(int(line[:-1]))

        classes = np.array(classes, dtype=int)
        return Bunch(data=Gs, target=classes)
    else:
        return Bunch(data=Gs)
def fetch(
    collection: str,
    name: str,
    data_home: Optional[str] = None,
    nfolds: Literal[None, 1, 5, 10] = None,
    dobscv: bool = False,
    *,
    return_X_y: bool = False,
) -> Union[Bunch, Tuple[np.typing.NDArray[float],
                        np.typing.NDArray[Union[int, float]]], ]:
    """
    Fetch Keel dataset.

    Fetch a Keel dataset by collection and name. More info at
    http://sci2s.ugr.es/keel.

    Parameters
    ----------
    collection : string
        Collection name.
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.
    nfolds : int, default=None
        Number of folds. Depending on the dataset, valid values are
        {None, 1, 5, 10}.
    dobscv : bool, default=False
        If folds are in {5, 10}, indicates that the cv folds are
        distribution optimally balanced stratified. Only available for some
        datasets.
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
    kwargs : dict
        Optional key-value arguments.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    (data, target) : tuple if ``return_X_y`` is True

    """
    if collection not in COLLECTIONS:
        raise ValueError('Available collections are '
                         + str(list(COLLECTIONS)))
    nattrs, DESCR = _load_descr(collection, name, data_home=data_home)

    X, y, cv = _load_folds(
        collection,
        name,
        nfolds,
        dobscv,
        nattrs,
        data_home=data_home,
    )

    if return_X_y:
        return X, y

    return Bunch(
        data=X,
        target=y,
        train_indices=[],
        validation_indices=[],
        test_indices=[],
        inner_cv=None,
        outer_cv=cv,
        DESCR=DESCR,
    )
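# A hedged usage sketch (the collection and dataset names are examples and
# are assumed to exist in the Keel repository used by this fetcher): the
# Bunch bundles the data with its cross-validation splitter.
iris_keel = fetch('classification', 'iris', nfolds=5)
print(iris_keel.data.shape)
print(iris_keel.DESCR[:60])
X, y = fetch('classification', 'iris', return_X_y=True)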
def fetch_phoneme(
    *,
    return_X_y: bool = False,
    as_frame: bool = False,
) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]:
    """
    Load the phoneme dataset.

    The data is obtained from the R package 'ElemStatLearn', which takes it
    from the dataset in `https://web.stanford.edu/~hastie/ElemStatLearn/`.
    """
    descr = _phoneme_descr

    raw_dataset = _fetch_elem_stat_learn("phoneme")

    data = raw_dataset["phoneme"]

    n_points = 256

    curve_data = data.iloc[:, 0:n_points]
    sound = data["g"].values
    speaker = data["speaker"].values

    curves = FDataGrid(
        data_matrix=curve_data.values,
        grid_points=np.linspace(0, 8, n_points),
        domain_range=[0, 8],
        dataset_name="Phoneme",
        argument_names=("frequency (kHz)", ),
        coordinate_names=("log-periodogram", ),
    )

    curve_name = "log-periodogram"
    target_name = "phoneme"
    frame = None

    if as_frame:
        frame = pd.DataFrame({
            curve_name: curves,
            target_name: sound,
        })
        curves = frame.iloc[:, [0]]
        target = frame.iloc[:, 1]
        meta = pd.Series(speaker, name="speaker")
    else:
        target = sound.codes
        meta = np.array([speaker]).T

    if return_X_y:
        return curves, target

    return Bunch(
        data=curves,
        target=target,
        frame=frame,
        categories={target_name: sound.categories.tolist()},
        feature_names=[curve_name],
        target_names=[target_name],
        meta=meta,
        meta_names=["speaker"],
        DESCR=descr,
    )
def test_loads_dumps_bunch():
    bunch = Bunch(x="x")
    bunch_from_pkl = loads(dumps(bunch))
    bunch_from_pkl.x = "y"
    assert bunch_from_pkl["x"] == bunch_from_pkl.x
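# A short companion sketch (not part of the original test module) showing
# the attribute/key duality that the pickling test above relies on: a Bunch
# is a dict whose items are also reachable as attributes, and both views
# stay in sync after round-tripping through pickle.
from pickle import dumps, loads

from sklearn.utils import Bunch

b = Bunch(x="x")
b.y = "y"            # attribute assignment also adds a dict key
assert b["y"] == "y"
restored = loads(dumps(b))
assert restored.x == restored["x"] == "x"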