import os.path

from matplotlib import image
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.utils import Bunch

from sw_path import WORK_ROOT

TEST = True
LOGGING = True

print("start")

minst_training = Bunch()
minst_test = Bunch()

folder_path = WORK_ROOT + "RES/MNIST/10k"
if not os.path.isdir(folder_path):
    raise Exception("dir not found")

test_path = folder_path + "/test"
training_path = folder_path + "/training"

minst_training.data = []
minst_training.target = []
minst_training.target_names = []
minst_test.data = []
minst_test.target = []
def set_args_2():
    # options for model
    args = Bunch()
    args.mask_mode = 'cross-wise'  # in ['row_wise', 'col_wise', 'cross_wise', 'cross_and_hier_wise']
    args.additional_ban = 0
    # args.pooling = 'avg-token'
    args.pooling = 'avg-cell-seg'
    args.table_object = 'first-column'
    args.noise_num = 2
    args.seq_len = 100
    args.row_wise_fill = True
    args.pretrained_model_path = "./models/bert_model.bin-000"
    args.vocab_path = 'models/google_uncased_en_vocab.txt'
    args.vocab = Vocab()
    args.vocab.load(args.vocab_path)
    args.emb_size = 768
    args.embedding = 'tab'  # before: bert
    args.encoder = 'bertTab'
    args.subword_type = 'none'
    args.tokenizer = 'bert'
    args.feedforward_size = 3072
    args.hidden_size = 768
    args.heads_num = 12
    args.layers_num = 12
    args.learning_rate = 2e-5
    args.warmup = 0.1
    args.batch_size = 32
    args.dropout = 0.1
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.train_path = './data/aida/IO/train_samples'
    args.t2d_path = './data/aida/IO/test_samples_t2d'
    args.limaye_path = './data/aida/IO/test_samples_limaye'
    args.wiki_path = './data/aida/IO/test_samples_wikipedia'

    # other options
    args.report_steps = 100
    args.labels_map = get_labels_map_from_aida_file_2(args.train_path)
    args.labels_num = len(args.labels_map)
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
    return args
def fetch_octane(
    return_X_y: bool = False,
    as_frame: bool = False,
) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]:
    """Load near infrared spectra of gasoline samples.

    This function fetches the octane dataset from the R package 'mrfDepth'
    from CRAN.
    """
    descr = _octane_descr

    # octane file from mrfDepth R package
    raw_dataset = fetch_cran("octane", "mrfDepth", version="1.0.11")
    data = raw_dataset['octane'][..., 0].T

    # The R package only stores the values of the curves, but the paper
    # describes the rest of the data. According to [RDEH2006], Section 5.4:
    # "wavelengths ranging from 1102nm to 1552nm with measurements every
    # two nm."
    wavelength_start = 1102
    wavelength_end = 1552
    wavelength_count = 226

    grid_points = np.linspace(
        wavelength_start,
        wavelength_end,
        wavelength_count,
    )

    # "The octane data set contains six outliers (25, 26, 36–39) to which
    # alcohol was added".
    target = np.zeros(len(data), dtype=np.bool_)
    target[24:26] = 1  # noqa: WPS432
    target[35:39] = 1  # noqa: WPS432

    target_name = "is outlier"
    curve_name = "absorbances"

    curves = FDataGrid(
        data,
        grid_points=grid_points,
        dataset_name="octane",
        argument_names=("wavelength (nm)", ),
        coordinate_names=("absorbances", ),
    )

    frame = None
    if as_frame:
        frame = pd.DataFrame({
            curve_name: curves,
            target_name: target,
        })
        curves = frame.iloc[:, [0]]
        target = frame.iloc[:, 1]

    if return_X_y:
        return curves, target

    return Bunch(
        data=curves,
        target=target,
        frame=frame,
        categories={},
        feature_names=[curve_name],
        target_names=[target_name],
        DESCR=descr,
    )
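# A minimal usage sketch (added for illustration; it assumes the module
# context above where fetch_octane is defined): the returned Bunch exposes
# the curves and the outlier indicator both as attributes and as keys.
octane = fetch_octane()
print(octane.data.n_samples)          # number of spectra
print(octane["target"].sum())         # number of flagged outliers
X, y = fetch_octane(return_X_y=True)  # plain (curves, target) tuple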
def fetch_connectome(dataset, data_dir=None, url=None, resume=True,
                     verbose=1):
    """
    Downloads files from multi-species connectomes

    Parameters
    ----------
    dataset : str
        Specifies which dataset to download; must be one of the datasets
        listed in :func:`netneurotools.datasets.available_connectomes()`.
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    data : :class:`sklearn.utils.Bunch`
        Dictionary-like object with, at a minimum, keys ['conn', 'labels',
        'ref'] providing connectivity / correlation matrix, region labels,
        and relevant reference. Other possible keys include 'dist' (an array
        of Euclidean distances between regions of 'conn'), 'coords' (an
        array of xyz coordinates for regions of 'conn'), 'acronyms' (an
        array of acronyms for regions of 'conn'), and 'networks' (an array
        of network affiliations for regions of 'conn')

    References
    ----------
    See `ref` key of returned dictionary object for relevant dataset
    reference
    """
    if dataset not in available_connectomes():
        raise ValueError(
            'Provided dataset {} not available; must be one of {}'.format(
                dataset, available_connectomes()))

    dataset_name = 'ds-connectomes'

    data_dir = op.join(_get_data_dir(data_dir=data_dir), dataset_name)
    info = _get_dataset_info(dataset_name)[dataset]
    if url is None:
        url = info['url']
    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset)
    }

    filenames = [
        op.join(dataset, '{}.csv'.format(fn)) for fn in info['keys']
    ] + [op.join(dataset, 'ref.txt')]

    data = _fetch_files(data_dir, files=[(f, url, opts) for f in filenames],
                        resume=resume, verbose=verbose)

    # load data
    for n, arr in enumerate(data[:-1]):
        try:
            data[n] = np.loadtxt(arr, delimiter=',')
        except ValueError:
            data[n] = np.loadtxt(arr, delimiter=',', dtype=str)
    with open(data[-1]) as src:
        data[-1] = src.read().strip()

    return Bunch(**dict(zip(info['keys'] + ['ref'], data)))
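# A hedged usage sketch (the dataset name below is only an example and is
# assumed to appear in available_connectomes()): the Bunch keys mirror the
# CSV files shipped with the chosen connectome.
connectome = fetch_connectome('human_func_scale033')  # hypothetical name
print(connectome.conn.shape)     # connectivity matrix
print(connectome['labels'][:5])  # region labels
print(connectome.ref)            # reference to credit for the data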
def get_fmri_sessions(topdir: Union[str, PathLike, PosixPath],
                      events_dir: Union[str, PathLike, PosixPath] = None,
                      task: str = 'memory',
                      space: str = 'MNI152NLin2009cAsym',
                      output_type: str = 'preproc',
                      extension: str = '.nii.gz',
                      modality: str = 'bold',
                      sub_id: str = '*',
                      ses_id: str = '*',
                      **kwargs) -> list:
    """
    Return a sorted list of the desired BOLD fMRI nifti file paths.

    Args:
        topdir: str, PathLike, or PosixPath
            Database top-level directory path.

        events_dir: str, PathLike, or PosixPath (Default = None)
            Directory where the events and/or behavioural files are stored.
            If None is provided, it is assumed to be identical to ``topdir``.

        task: str (Default = 'memory')
            Name of the experimental task ('rest' is also valid).

        space: str (Default = 'MNI152NLin2009cAsym')
            Name of the template used during resampling.
            Most likely corresponds to a valid TemplateFlow name.

        output_type: str (Default = 'preproc')
            Name of the desired FMRIPrep output type.
            Most likely corresponds to a valid FMRIPrep output type name.

        extension: str (Default = '.nii.gz')
            Nifti file extension. The leading '.' is required.

        modality: str (Default = 'bold')
            Scanning modality used during the experiment.

        sub_id: str (Default = '*')
            Participant identifier. By default, returns all participants.
            The leading 'sub-' must be omitted.
            If the identifier is numeric, it should be quoted.

        ses_id: str (Default = '*')
            Session identifier. By default, returns all sessions.
            If the identifier is numeric, it should be quoted.
            The leading 'ses-' must be omitted.

    Returns:
        list
            Sorted list of the desired nifti file paths.

    Notes:
        All parameters except ``topdir`` and ``events_dir`` can be replaced
        by '*' (Default), which is equivalent to a UNIX ``find`` pattern.
    """
    from decoding.cimaq_decoding_params import _params

    # Generate regex and glob patterns
    bold_pattern = '_'.join([
        f'sub-{sub_id}', f'ses-{ses_id}', f'task-{task}', f'space-{space}',
        f'desc-{output_type}', f'{modality}{extension}'
    ])
    ev_pattern = f'sub-{sub_id}_ses-{ses_id}_task-{task}_events.tsv'

    # Load fMRI and events file paths into lists
    bold_paths = sorted(map(str, Path(topdir).rglob(f'*{bold_pattern}')))
    event_paths = sorted(map(str, Path(events_dir).rglob(f'*{ev_pattern}')))

    # Keep only the intersection of these lists
    valid_bold_paths = sorted(
        boldpath for boldpath in bold_paths
        if get_sub_ses_key(boldpath) in
        [get_sub_ses_key(apath) for apath in event_paths])
    valid_event_paths = sorted(
        evpath for evpath in event_paths
        if get_sub_ses_key(evpath) in
        [get_sub_ses_key(apath) for apath in valid_bold_paths])

    # Load corresponding anatomical T1w, brain mask and behavioural file paths
    valid_anat_paths = [
        get_fmriprep_anat(v_boldpath) for v_boldpath in valid_bold_paths
    ]
    valid_mask_paths = [
        get_fmriprep_mask(v_boldpath) for v_boldpath in valid_bold_paths
    ]
    valid_behav_paths = [
        get_behav(v_boldpath, events_dir) for v_boldpath in valid_bold_paths
    ]

    # Zip them together
    zipped_paths = sorted(
        zip(valid_bold_paths, valid_anat_paths, valid_mask_paths,
            valid_event_paths, valid_behav_paths))

    # Create sklearn.utils.Bunch objects
    sessions = [
        Bunch(**dict(
            zip([
                'fmri_path', 'anat_path', 'mask_path', 'events_path',
                'behav_path', 'sub_id', 'ses_id', 'task', 'space'
            ], (item + Path(item[0]).parts[-4:-2] +
                ('task-' + task, 'space-' + space)))))
        for item in zipped_paths
    ]

    # Set default keyword arguments and parameters
    [session.update(**_params) for session in sessions]

    return sessions
def __init__(self, opt, phase="train"):
    # TODO split the dataset of val and test
    if phase == "val":
        phase = "test"
        opt.load_dataset_mode = 'reader'
    super(Cifar100Dataset, self).__init__(opt, phase)
    self.data_dir = opt.cifar100_dataset_dir
    self.data_name = CIFAR100
    self.x_transforms_train = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.24705882352941178),
        transforms.Resize((opt.imsize, opt.imsize)),
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408),
                             (0.2675, 0.2565, 0.2761))
    ])
    self.x_transforms_test = transforms.Compose([
        transforms.Resize((opt.imsize, opt.imsize)),
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408),
                             (0.2675, 0.2565, 0.2761))
    ])
    self.y_transforms = None

    if self.opt.load_dataset_mode == 'dir':
        self.data = []  # image_paths, targets
        self.label2Indices = defaultdict(list)
        image_dir = os.path.join(self.data_dir, phase)
        self._labels = os.listdir(image_dir)
        # get label to targets, dict type
        self.label2target = dict([
            (label, target) for target, label in enumerate(self.labels)
        ])
        self.target2label = dict([
            (target, label) for target, label in enumerate(self.labels)
        ])
        if not os.path.exists(image_dir):
            raise FileNotFoundError(
                f"Image Dir {image_dir} not exists, please check it")
        for root, label_dirs, files in os.walk(image_dir):
            for file in files:
                label = os.path.basename(root)
                image_path = os.path.join(root, file)
                target = self.label2target[label]
                self.label2Indices[label].append(len(self.data))
                self.data.append(
                    Bunch(image_path=image_path, target=target))
    elif self.opt.load_dataset_mode == 'reader':
        dataset = datasets.CIFAR100(root=os.path.join(self.data_dir,
                                                      'raw_data'),
                                    train=self.isTrain,
                                    download=True)
        self.data, self._labels, self.label2Indices, self.label2target, \
            self.target2label = prepare_datas_by_standard_data(dataset)
    else:
        raise ValueError(
            f"Expected load_dataset_mode in [dir,reader], "
            f"but got {self.opt.load_dataset_mode}")
def fetch_cammoun2012(version='MNI152NLin2009aSym', data_dir=None, url=None,
                      resume=True, verbose=1):
    """
    Downloads files for Cammoun et al., 2012 multiscale parcellation

    Parameters
    ----------
    version : str, optional
        Specifies which version of the dataset to download, where
        'MNI152NLin2009aSym' will return .nii.gz atlas files defined in
        MNI152 space, 'fsaverageX' will return .annot files defined in
        fsaverageX space (FreeSurfer 6.0.1), 'fslr32k' will return
        .label.gii files in fs_LR_32k HCP space, and 'gcs' will return
        FreeSurfer-style .gcs probabilistic atlas files for generating new,
        subject-specific parcellations. Default: 'MNI152NLin2009aSym'
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['scale033', 'scale060', 'scale125',
        'scale250', 'scale500'], where corresponding values are lists of
        filepaths to downloaded parcellation files.

    References
    ----------
    Cammoun, L., Gigandet, X., Meskaldji, D., Thiran, J. P., Sporns, O.,
    Do, K. Q., Maeder, P., Meuli, R., & Hagmann, P. (2012). Mapping the human
    connectome at multiple scales with diffusion spectrum MRI. Journal of
    Neuroscience Methods, 203(2), 386-397.

    Notes
    -----
    License: https://raw.githubusercontent.com/LTS5/cmp/master/COPYRIGHT
    """
    if version == 'surface':
        warnings.warn(
            'Providing `version="surface"` is deprecated and will '
            'be removed in a future release. For consistent '
            'behavior please use `version="fsaverage"` instead.',
            DeprecationWarning, stacklevel=2)
        version = 'fsaverage'
    elif version == 'volume':
        warnings.warn(
            'Providing `version="volume"` is deprecated and will '
            'be removed in a future release. For consistent '
            'behavior please use `version="MNI152NLin2009aSym"` '
            'instead.', DeprecationWarning, stacklevel=2)
        version = 'MNI152NLin2009aSym'

    versions = [
        'gcs', 'fsaverage', 'fsaverage5', 'fsaverage6', 'fslr32k',
        'MNI152NLin2009aSym'
    ]
    if version not in versions:
        raise ValueError(
            'The version of Cammoun et al., 2012 parcellation '
            'requested "{}" does not exist. Must be one of {}'.format(
                version, versions))

    dataset_name = 'atl-cammoun2012'
    keys = ['scale033', 'scale060', 'scale125', 'scale250', 'scale500']

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)[version]
    if url is None:
        url = info['url']
    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    # filenames differ based on selected version of dataset
    if version == 'MNI152NLin2009aSym':
        filenames = [
            'atl-Cammoun2012_space-MNI152NLin2009aSym_res-{}_deterministic{}'
            .format(res[-3:], suff) for res in keys for suff in ['.nii.gz']
        ] + ['atl-Cammoun2012_space-MNI152NLin2009aSym_info.csv']
    elif version == 'fslr32k':
        filenames = [
            'atl-Cammoun2012_space-fslr32k_res-{}_hemi-{}_deterministic{}'
            .format(res[-3:], hemi, suff) for res in keys
            for hemi in ['L', 'R'] for suff in ['.label.gii']
        ]
    elif version in ('fsaverage', 'fsaverage5', 'fsaverage6'):
        filenames = [
            'atl-Cammoun2012_space-{}_res-{}_hemi-{}_deterministic{}'.format(
                version, res[-3:], hemi, suff) for res in keys
            for hemi in ['L', 'R'] for suff in ['.annot']
        ]
    else:
        filenames = [
            'atl-Cammoun2012_res-{}_hemi-{}_probabilistic{}'.format(
                res[5:], hemi, suff)
            for res in keys[:-1] + ['scale500v1', 'scale500v2', 'scale500v3']
            for hemi in ['L', 'R'] for suff in ['.gcs', '.ctab']
        ]

    files = [(op.join(dataset_name, version, f), url, opts)
             for f in filenames]
    data = _fetch_files(data_dir, files=files, resume=resume, verbose=verbose)

    if version == 'MNI152NLin2009aSym':
        keys += ['info']
    elif version in ('fslr32k', 'fsaverage', 'fsaverage5', 'fsaverage6'):
        data = [ANNOT(*data[i:i + 2]) for i in range(0, len(data), 2)]
    else:
        data = [data[::2][i:i + 2] for i in range(0, len(data) // 2, 2)]
        # deal with the fact that last scale is split into three files :sigh:
        data = data[:-3] + [list(itertools.chain.from_iterable(data[-3:]))]

    return Bunch(**dict(zip(keys, data)))
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import brier_score_loss
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.utils import Bunch

bunch_train = Bunch()
bunch_test = Bunch()

# Load the pickled word-bag data from local disk into memory
# (the paths below are local file paths to the serialized Bunch objects).
with open(r'G:\AI\Bunch_1w\Bunch_train', 'rb') as file_obj:
    bunch_train = pickle.load(file_obj)
with open(r'G:\AI\Bunch_1w\Bunch_test', 'rb') as file_obj_2:
    bunch_test = pickle.load(file_obj_2)


def metrics_result(actual, predict):
    print("Precision: {0:.3f}".format(
        metrics.precision_score(actual, predict, average='micro')))
    print("Recall: {0:0.3f}".format(
        metrics.recall_score(actual, predict, average='micro')))
def _load_ucr_dataset(dataset, path):
    """Load a UCR data set from a local folder.

    Parameters
    ----------
    dataset : str
        Name of the dataset.

    path : str
        The path of the folder containing the cached data set.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array
            The classification labels in the training set.
        target_test : array
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    Notes
    -----
    Padded values are represented as NaN's.

    """
    new_path = path + dataset + '/'

    try:
        with open(new_path + dataset + '.txt', encoding='utf-8') as f:
            description = f.read()
    except UnicodeDecodeError:
        with open(new_path + dataset + '.txt', encoding='ISO-8859-1') as f:
            description = f.read()

    try:
        data_train = np.genfromtxt(new_path + dataset + '_TRAIN.txt')
        data_test = np.genfromtxt(new_path + dataset + '_TEST.txt')

        X_train, y_train = data_train[:, 1:], data_train[:, 0]
        X_test, y_test = data_test[:, 1:], data_test[:, 0]
    except IndexError:
        train = loadarff(new_path + dataset + '_TRAIN.arff')
        test = loadarff(new_path + dataset + '_TEST.arff')

        data_train = np.asarray([train[0][name] for name in train[1].names()])
        X_train = data_train[:-1].T.astype('float64')
        y_train = data_train[-1]

        data_test = np.asarray([test[0][name] for name in test[1].names()])
        X_test = data_test[:-1].T.astype('float64')
        y_test = data_test[-1]

    try:
        y_train = y_train.astype('float64').astype('int64')
        y_test = y_test.astype('float64').astype('int64')
    except ValueError:
        pass

    bunch = Bunch(
        data_train=X_train, target_train=y_train,
        data_test=X_test, target_test=y_test,
        DESCR=description,
        url=("http://www.timeseriesclassification.com/"
             "description.php?Dataset={}".format(dataset))
    )

    return bunch
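# A small usage sketch (the dataset name and local cache path below are
# assumptions for illustration): train/test splits come back as separate
# Bunch attributes rather than a single data/target pair.
gunpoint = _load_ucr_dataset('GunPoint', path='./ucr_cache/')
X_train, y_train = gunpoint.data_train, gunpoint.target_train
X_test, y_test = gunpoint.data_test, gunpoint.target_test
print(X_train.shape, X_test.shape, gunpoint.url)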
def load_fon_w_campana():
    datos = np.load(ruta + 'fon_w_campana_ccas.npy')
    return Bunch(data=datos[:, :-1], target=datos[:, -1])
# we extract the values from the dataframe df_max to merge with the global df
df_to_merge = pd.DataFrame(df_max['encoded'].values.tolist())
df_to_merge['label'] = 1
df_enc = df_enc.append(df_to_merge, ignore_index=True)
df_enc['name'] = 'na'
df_enc.loc[df_enc['label'] == 1, 'name'] = df_max['name'].values
df_enc

# save the encoded results as 4x23 images
# we put the results in a bunch
encoded4x23 = Bunch(
    target_names=df_enc['name'].values,
    target=df_enc['label'].values,
    images=(df_enc.iloc[:, 0:92].values * 254).reshape(-1, 4, 23, order='F')
)

plt.imshow(encoded4x23.images[0], cmap='Greys')

ispickle = False
if ispickle is True:
    # we create the pkl file for later use
    pickle_out = open("encoded4x23withoutTsai.pkl", "wb")
    pkl.dump(encoded4x23, pickle_out)
    pickle_out.close()

# A -> [1, 0, 0, 0]
# G -> [0, 1, 0, 0]
# C -> [0, 0, 1, 0]
# T -> [0, 0, 0, 1]
def load_fon_w_braso():
    datos = np.load(ruta + 'fon_w_braso_ccas.npy')
    return Bunch(data=datos[:, :-1], target=datos[:, -1])


def load_art_w_petaka():
    datos = np.load(ruta + 'art_w_petaka_ccas.npy')
    return Bunch(data=datos[:, :-1], target=datos[:, -1])


def load_prs_rt():
    datos = np.load(ruta + 'prs_rt_ccas.npy')
    return Bunch(data=datos[:, :-1], target=datos[:, -1])
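# A hedged usage sketch (it assumes `ruta` points at a folder containing the
# .npy feature files loaded above): these loaders follow the scikit-learn
# convention of a Bunch with `data` (features) and `target` (labels), so
# they drop straight into an estimator.
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

prs = load_prs_rt()
scores = cross_val_score(SVC(gamma='scale'), prs.data, prs.target, cv=5)
print(scores.mean())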
def set_args(predefined_dict_groups):
    # options for model
    args = Bunch()
    args.mask_mode = 'cross-wise'  # in ['row_wise', 'col_wise', 'cross_wise', 'cross_and_hier_wise']
    args.additional_ban = 0
    # args.pooling = 'avg-token'
    args.pooling = 'avg-cell-seg'
    args.table_object = 'first-column'
    args.noise_num = 2
    args.seq_len = 100
    args.row_wise_fill = True
    args.pretrained_model_path = "./models/bert_model.bin-000"
    args.vocab_path = 'models/google_uncased_en_vocab.txt'
    args.vocab = Vocab()
    args.vocab.load(args.vocab_path)
    args.emb_size = 768
    args.embedding = 'tab'  # before: bert
    args.encoder = 'bertTab'
    args.subword_type = 'none'
    args.tokenizer = 'bert'
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
    args.feedforward_size = 3072
    args.hidden_size = 768
    args.heads_num = 12
    args.layers_num = 12
    args.learning_rate = 2e-5
    # args.learning_rate = 1e-4
    args.warmup = 0.1
    args.batch_size = 32
    args.dropout = 0.1
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # other options
    args.shuffle_rows = True
    args.report_steps = 100

    for predefined_dict_group in predefined_dict_groups.values():
        for k, v in predefined_dict_group.items():
            args[k] = v

    args.labels_map = get_labels_map_from_aida_file_2(args.train_path)
    args.labels_num = len(args.labels_map)

    # logger and tensorboard writer
    if args.tx_logger_dir_name:
        rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
        args.summary_writer = SummaryWriter(logdir=os.path.join(
            args.tx_logger_dir_name, '-'.join([args.exp_name, rq])))
    else:
        args.summary_writer = None
    if args.logger_dir_name is not None:
        args.logger_name = 'detail'
        args.logger = get_logger(logger_name=args.logger_name,
                                 dir_name=args.logger_dir_name,
                                 file_name=args.logger_file_name)
    else:
        args.logger = None

    return args
# In[794]:

# Directly from the songs
for f in listdir("data/raw"):
    arquivo = "data/raw/" + f
    genero_df, genero = carregar_arquivo(arquivo)
    # stemm = nltk.stem.RSLPStemmer()
    for l in genero_df.values:
        normalized = pre_process(l[0])
        data.append(normalized)
        target.append(genero)

dataset = Bunch(data=data, target=target)


# In[733]:

X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                     dataset.target,
                                                     test_size=0.30,
                                                     random_state=42)


# In[ ]:


# In[734]:
def get_result(bunch):
    time_started = time()
    x_train, x_test, y_train, y_test = train_test_split(bunch.data,
                                                        bunch.target,
                                                        test_size=0.5,
                                                        random_state=10)
    clf = svm.SVC(gamma='scale')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    time_finished = time()
    return accuracy_score(y_test, y_pred), time_finished - time_started


print(f'our accuracy b is {get_result(iris)[0]}\n')

# There is a better way to create a bunch object (see new_cancer)
new_iris = Bunch()
new_iris.DESCR = DESCR
new_iris.data = d
new_iris.target = labels
new_iris.target_names = labels_names
new_iris.feature_names = feature_names

result, time_taken = get_result(new_iris)
print(
    f'Our accuracy for new_iris is {result}\nCompleted in {1000*time_taken:.3f}\n'
)

cancer = datasets.load_breast_cancer()
result, time_taken = get_result(cancer)
print(
    f'Our accuracy for cancer dataset is {result}\nCompleted in {1000*time_taken:.3f}\n'
)
def compute_score(X, G, P, S, GWprior=None, score_method='bic'):
    """Compute score function of P."""
    n_samples, n_dim = X.shape

    d0 = GWprior.d0
    S0 = GWprior.S0

    # check prior size violations
    if S0.shape[0] != n_dim or S0.shape[1] != n_dim:
        raise ValueError('GWprior.S0 must be p-by-p, with p dimensions X')

    dn = n_samples + d0
    # C = (S + S0) / (dn - 2)

    # % need logdetP and invP
    # es, Q = np.linalg.eigh(x)
    # Inv = np.linalg.multi_dot((Q, np.diag(1. / es), Q.T))
    U, s, Vh = linalg.svd(P)  # check
    invP = np.linalg.multi_dot((Vh.T, np.diag(1. / s), U.T))
    logdetP = np.sum(np.log(s))

    # % compute loglik
    loglik = n_samples * log_likelihood(S / n_samples, P)

    num_edges = np.triu(G, 1).sum()
    dof = num_edges + n_dim

    # pcor = cov2cor(P);

    # % the posterior Sn parameter
    # Sn = (dn - 2) * invP
    logh = (dn - 2) / 2. * (n_dim + logdetP)

    # find full param set V
    Vi, Vj = np.nonzero(np.triu(G))
    # to be the same as matlab
    idx = np.argsort(Vj)
    Vi, Vj = Vi[idx], Vj[idx]

    GWpost = Bunch()
    GWpost.Sn = S + S0
    # GWpost.C = C
    GWpost.dn = dn
    GWpost.P = P
    GWpost.num_edges = num_edges
    GWpost.dof = dof
    GWpost.logdetP = logdetP
    GWpost.loglik = loglik

    if score_method == 'bic':
        score = loglik - dof * np.log(n_samples) / 2 if n_samples > 0 else 0

    elif score_method == 'diaglaplace':
        # Diagonal hessian laplace approximation
        diagH = np.zeros(dof)
        for e1 in range(dof):
            # e2 = e1
            M1 = np.zeros((n_dim, n_dim))
            # M2 = M1.copy()

            nz1 = [Vi[e1], Vj[e1]]
            # nz2 = [Vi[e2], Vj[e2]]

            M1[:, nz1] = invP[:, [Vj[e1], Vi[e1]]]
            # M2[:, nz2] = invP[:, [Vj[e2], Vi[e2]]]

            # A = M1[nz2][:, nz1]
            # B = M2[nz1][:, nz2]
            A = M1[nz1][:, nz1]
            B = A

            tmp2 = A[0, :].dot(B[:, 0]) + A[1, :].dot(B[:, 1])
            diagH[e1] = -(dn - 2) * tmp2 / 2
            # diagH(e1) = -(dn-2) * trace(M1(nz2,nz1)*M2(nz1,nz2))/2;

        logdetHdiag = sum(np.log(-diagH))
        lognormconst = dof * np.log(2 * np.pi) / 2 + logh - logdetHdiag / 2.
        score = lognormconst - GWprior.lognormconst - \
            n_samples * n_dim * np.log(2 * np.pi) / 2
        GWpost.lognormconst = lognormconst

    elif score_method == 'laplace':
        # Full laplace approximation
        H = np.empty((dof, dof))
        for e1 in range(dof):
            # nz1 = [Vi[e1], Vj[e1]]
            i, j = Vi[e1], Vj[e1]
            for e2 in range(e1, dof):
                # nz2 = [Vi[e2], Vj[e2]]
                l, m = Vi[e2], Vj[e2]

                # A = invP[nz2][:, [Vj[e1], Vi[e1]]]
                # B = invP[nz1][:, [Vj[e2], Vi[e2]]]
                A = invP[[l, m]][:, [j, i]]
                B = invP[[i, j]][:, [m, l]]

                # tmp2 = A[0, :].dot(B[:, 0]) + A[1, :].dot(B[:, 1])
                # tmp2 = np.trace(A.dot(B))
                tmp2 = (A * B.T).sum()

                H[e2, e1] = H[e1, e2] = -(dn - 2) * tmp2 / 2.

        # neg Hessian will be posdef
        logdetH = 2 * sum(np.log(np.diag(linalg.cholesky(-H))))

        lognormconst = dof * np.log(2 * np.pi) / 2 + logh - logdetH / 2.
        score = lognormconst - GWprior.lognormconst - \
            n_samples * n_dim * np.log(2 * np.pi) / 2
        GWpost.lognormconst = lognormconst

    GWpost.score = score
    return GWpost
def fetch_conte69(data_dir=None, url=None, resume=True, verbose=1):
    """
    Downloads files for Van Essen et al., 2012 Conte69 template

    Parameters
    ----------
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['midthickness', 'inflated',
        'vinflated'], where corresponding values are lists of filepaths to
        downloaded template files.

    References
    ----------
    http://brainvis.wustl.edu/wiki/index.php//Caret:Atlases/Conte69_Atlas

    Van Essen, D. C., Glasser, M. F., Dierker, D. L., Harwell, J., &
    Coalson, T. (2011). Parcellations and hemispheric asymmetries of human
    cerebral cortex analyzed on surface-based atlases. Cerebral cortex,
    22(10), 2241-2262.

    Notes
    -----
    License: ???
    """
    dataset_name = 'tpl-conte69'
    keys = ['midthickness', 'inflated', 'vinflated']

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)
    if url is None:
        url = info['url']

    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    filenames = [
        'tpl-conte69/tpl-conte69_space-MNI305_variant-fsLR32k_{}.{}.surf.gii'
        .format(res, hemi) for res in keys for hemi in ['L', 'R']
    ] + ['tpl-conte69/template_description.json']

    data = _fetch_files(data_dir, files=[(f, url, opts) for f in filenames],
                        resume=resume, verbose=verbose)

    with open(data[-1], 'r') as src:
        data[-1] = json.load(src)

    # bundle hemispheres together
    data = [ANNOT(*data[:-1][i:i + 2]) for i in range(0, 6, 2)] + [data[-1]]

    return Bunch(**dict(zip(keys + ['info'], data)))
def fetch_census_dataset():
    """Fetch the Adult Census Dataset.

    This uses a particular URL for the Adult Census dataset. The code is a
    simplified version of fetch_openml() in sklearn.

    The data are copied from:
    https://openml.org/data/v1/download/1595261.gz
    (as of 2021-03-31)
    """
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    filename = "1595261.gz"
    data_url = "https://rainotebookscdn.blob.core.windows.net/datasets/"

    remaining_attempts = 5
    sleep_duration = 10
    while remaining_attempts > 0:
        try:
            urlretrieve(data_url + filename, filename)

            http_stream = gzip.GzipFile(filename=filename, mode='rb')

            with closing(http_stream):

                def _stream_generator(response):
                    for line in response:
                        yield line.decode('utf-8')

                stream = _stream_generator(http_stream)
                data = arff.load(stream)
        except Exception as exc:  # noqa: B902
            remaining_attempts -= 1
            print("Error downloading dataset from {} ({} attempt(s) remaining)"
                  .format(data_url, remaining_attempts))
            print(exc)
            time.sleep(sleep_duration)
            sleep_duration *= 2
            continue
        else:
            # dataset successfully downloaded
            break
    else:
        raise Exception("Could not retrieve dataset from {}.".format(data_url))

    attributes = OrderedDict(data['attributes'])
    arff_columns = list(attributes)

    raw_df = pd.DataFrame(data=data['data'], columns=arff_columns)

    target_column_name = 'class'
    target = raw_df.pop(target_column_name)
    for col_name in _categorical_columns:
        dtype = pd.api.types.CategoricalDtype(attributes[col_name])
        raw_df[col_name] = raw_df[col_name].astype(dtype, copy=False)

    result = Bunch()
    result.data = raw_df
    result.target = target

    return result
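# A brief usage sketch (not part of the original source; the split
# parameters are illustrative): the census Bunch holds a pandas DataFrame
# in `data` and the 'class' column in `target`.
from sklearn.model_selection import train_test_split

census = fetch_census_dataset()
print(census.data.shape, census.target.value_counts())
X_train, X_test, y_train, y_test = train_test_split(
    census.data, census.target, test_size=0.2, random_state=0)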
def fetch_fsaverage(version='fsaverage', data_dir=None, url=None, resume=True,
                    verbose=1):
    """
    Downloads files for fsaverage FreeSurfer template

    Parameters
    ----------
    version : str, optional
        One of {'fsaverage', 'fsaverage3', 'fsaverage4', 'fsaverage5',
        'fsaverage6'}. Default: 'fsaverage'
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['surf'] where corresponding values
        are length-2 lists of downloaded template files (each list composed
        of files for the left and right hemisphere).

    References
    ----------

    """
    versions = [
        'fsaverage', 'fsaverage3', 'fsaverage4', 'fsaverage5', 'fsaverage6'
    ]
    if version not in versions:
        raise ValueError('The version of fsaverage requested "{}" does not '
                         'exist. Must be one of {}'.format(version, versions))

    dataset_name = 'tpl-fsaverage'
    keys = ['orig', 'white', 'smoothwm', 'pial', 'inflated', 'sphere']

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)[version]
    if url is None:
        url = info['url']

    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    filenames = [
        op.join(version, 'surf', '{}.{}'.format(hemi, surf))
        for surf in keys for hemi in ['lh', 'rh']
    ]

    try:
        data_dir = check_fs_subjid(version)[1]
        data = [op.join(data_dir, f) for f in filenames]
    except FileNotFoundError:
        data = _fetch_files(data_dir, resume=resume, verbose=verbose,
                            files=[(op.join(dataset_name, f), url, opts)
                                   for f in filenames])

    data = [ANNOT(*data[i:i + 2]) for i in range(0, len(keys) * 2, 2)]

    return Bunch(**dict(zip(keys, data)))
def fetch_icbm152_2009(data_dir=None, url=None, resume=True, verbose=1):
    """Download and load the ICBM152 template (dated 2009)

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a non-
        standard location. Default: None (meaning: default)
    url: string, optional
        Download URL of the dataset. Overwrite the default URL.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, interest keys are:
        "t1", "t2", "t2_relax", "pd": anatomical images obtained with the
        given modality (resp. T1, T2, T2 relaxometry and proton
        density weighted). Values are file paths.
        "gm", "wm", "csf": segmented images, giving resp. gray matter,
        white matter and cerebrospinal fluid. Values are file paths.
        "eye_mask", "face_mask", "mask": use these images to mask out
        parts of mri images. Values are file paths.

    References
    ----------
    VS Fonov, AC Evans, K Botteron, CR Almli, RC McKinstry, DL Collins and
    BDCG, "Unbiased average age-appropriate atlases for pediatric studies",
    NeuroImage, Volume 54, Issue 1, January 2011

    VS Fonov, AC Evans, RC McKinstry, CR Almli and DL Collins,
    "Unbiased nonlinear average age-appropriate brain templates from birth
    to adulthood", NeuroImage, Volume 47, Supplement 1, July 2009, Page S102
    Organization for Human Brain Mapping 2009 Annual Meeting.

    DL Collins, AP Zijdenbos, WFC Baare and AC Evans,
    "ANIMAL+INSECT: Improved Cortical Structure Segmentation",
    IPMI Lecture Notes in Computer Science, 1999, Volume 1613/1999, 210-223

    Notes
    -----
    For more information about this dataset's structure:
    http://www.bic.mni.mcgill.ca/ServicesAtlases/ICBM152NLin2009

    The original download URL is
    http://www.bic.mni.mcgill.ca/~vfonov/icbm/2009/mni_icbm152_nlin_sym_09a_nifti.zip
    """
    if url is None:
        # The URL can be retrieved from the nilearn account on OSF (Open
        # Science Framework), https://osf.io/4r3jt/quickfiles/
        # Clicking on the "share" button gives the root of the URL.
        url = "https://osf.io/7pj92/download"
    opts = {'uncompress': True}

    keys = ("csf", "gm", "wm",
            "pd", "t1", "t2", "t2_relax",
            "eye_mask", "face_mask", "mask")
    filenames = [
        (os.path.join("mni_icbm152_nlin_sym_09a", name), url, opts)
        for name in ("mni_icbm152_csf_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_gm_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_wm_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_pd_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_t1_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_t2_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_t2_relx_tal_nlin_sym_09a.nii.gz",
                     "mni_icbm152_t1_tal_nlin_sym_09a_eye_mask.nii.gz",
                     "mni_icbm152_t1_tal_nlin_sym_09a_face_mask.nii.gz",
                     "mni_icbm152_t1_tal_nlin_sym_09a_mask.nii.gz")
    ]

    dataset_name = 'icbm152_2009'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    sub_files = _fetch_files(data_dir, filenames, resume=resume,
                             verbose=verbose)

    fdescr = _get_dataset_descr(dataset_name)

    params = dict([('description', fdescr)] + list(zip(keys, sub_files)))
    return Bunch(**params)
def fetch_schaefer2018(version='fsaverage', data_dir=None, url=None,
                       resume=True, verbose=1):
    """
    Downloads FreeSurfer .annot files for Schaefer et al., 2018 parcellation

    Parameters
    ----------
    version : {'fsaverage', 'fsaverage5', 'fsaverage6', 'fslr32k'}
        Specifies which surface annotation files should be matched to.
        Default: 'fsaverage'
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys of format '{}Parcels{}Networks'
        where corresponding values are the left/right hemisphere annotation
        files

    References
    ----------
    Schaefer, A., Kong, R., Gordon, E. M., Laumann, T. O., Zuo, X. N.,
    Holmes, A. J., ... & Yeo, B. T. (2017). Local-global parcellation of the
    human cerebral cortex from intrinsic functional connectivity MRI.
    Cerebral Cortex, 28(9), 3095-3114.

    Notes
    -----
    License: https://github.com/ThomasYeoLab/CBIG/blob/master/LICENSE.md
    """
    versions = ['fsaverage', 'fsaverage5', 'fsaverage6', 'fslr32k']
    if version not in versions:
        raise ValueError(
            'The version of Schaefer et al., 2018 parcellation '
            'requested "{}" does not exist. Must be one of {}'.format(
                version, versions))

    dataset_name = 'atl-schaefer2018'
    keys = [
        '{}Parcels{}Networks'.format(p, n)
        for p in range(100, 1001, 100) for n in [7, 17]
    ]

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)[version]
    if url is None:
        url = info['url']

    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    if version == 'fslr32k':
        hemispheres, suffix = ['LR'], 'dlabel.nii'
    else:
        hemispheres, suffix = ['L', 'R'], 'annot'
    filenames = [
        'atl-Schaefer2018_space-{}_hemi-{}_desc-{}_deterministic.{}'.format(
            version, hemi, desc, suffix)
        for desc in keys for hemi in hemispheres
    ]

    files = [(op.join(dataset_name, version, f), url, opts)
             for f in filenames]
    data = _fetch_files(data_dir, files=files, resume=resume, verbose=verbose)

    if suffix == 'annot':
        data = [ANNOT(*data[i:i + 2]) for i in range(0, len(keys) * 2, 2)]

    return Bunch(**dict(zip(keys, data)))
def fetch_oasis_vbm(n_subjects=None, dartel_version=True, data_dir=None,
                    url=None, resume=True, verbose=1):
    """Download and load Oasis "cross-sectional MRI" dataset (416 subjects).

    Parameters
    ----------
    n_subjects: int, optional
        The number of subjects to load. If None is given, all the
        subjects are used.

    dartel_version: boolean,
        Whether or not to use data normalized with DARTEL instead of standard
        SPM8 normalization.

    data_dir: string, optional
        Path of the data directory. Used to force data storage in a specified
        location. Default: None

    url: string, optional
        Override download URL. Used for test only (or if you setup a mirror
        of the data).

    resume: bool, optional
        If true, try resuming download if possible

    verbose: int, optional
        verbosity level (0 means no message).

    Returns
    -------
    data: Bunch
        Dictionary-like object, the interest attributes are :

        - 'gray_matter_maps': string list
          Paths to nifti gray matter density probability maps
        - 'white_matter_maps': string list
          Paths to nifti white matter density probability maps
        - 'ext_vars': np.recarray
          Data from the .csv file with information about selected subjects
        - 'data_usage_agreement': string
          Path to the .txt file containing the data usage agreement.

    References
    ----------
    * http://www.oasis-brains.org/

    * Open Access Series of Imaging Studies (OASIS): Cross-sectional MRI
      Data in Young, Middle Aged, Nondemented, and Demented Older Adults.
      Marcus, D. S and al., 2007, Journal of Cognitive Neuroscience.

    Notes
    -----
    In the DARTEL version, original Oasis data have been preprocessed
    with the following steps:

      1. Dimension swapping (technically required for subsequent steps)
      2. Brain Extraction
      3. Segmentation with SPM8
      4. Normalization using DARTEL algorithm
      5. Modulation
      6. Replacement of NaN values with 0 in gray/white matter density maps.
      7. Resampling to reduce shape and make it correspond to the shape of
         the non-DARTEL data (fetched with dartel_version=False).
      8. Replacement of values < 1e-4 with zeros to reduce the file size.

    In the non-DARTEL version, the following steps have been performed
    instead:

      1. Dimension swapping (technically required for subsequent steps)
      2. Brain Extraction
      3. Segmentation and normalization to a template with SPM8
      4. Modulation
      5. Replacement of NaN values with 0 in gray/white matter density maps.

    An archive containing the gray and white matter density probability maps
    for the 416 available subjects is provided. Gross outliers are removed
    and filtered by this data fetcher (DARTEL: 13 outliers; non-DARTEL: 1
    outlier). Externals variates (age, gender, estimated intracranial volume,
    years of education, socioeconomic status, dementia score) are provided
    in a CSV file that is a copy of the original Oasis CSV file. The current
    downloader loads the CSV file and keeps only the lines corresponding to
    the subjects that are actually demanded.

    The Open Access Structural Imaging Series (OASIS) is a project
    dedicated to making brain imaging data openly available to the public.
    Using data available through the OASIS project requires agreeing with
    the Data Usage Agreement that can be found at
    http://www.oasis-brains.org/app/template/UsageAgreement.vm
    """
    # check number of subjects
    if n_subjects is None:
        n_subjects = 403 if dartel_version else 415
    if dartel_version:  # DARTEL version has 13 identified outliers
        if n_subjects > 403:
            warnings.warn('Only 403 subjects are available in the '
                          'DARTEL-normalized version of the dataset. '
                          'All of them will be used instead of the wanted %d'
                          % n_subjects)
            n_subjects = 403
    else:  # all subjects except one are available with non-DARTEL version
        if n_subjects > 415:
            warnings.warn('Only 415 subjects are available in the '
                          'non-DARTEL-normalized version of the dataset. '
                          'All of them will be used instead of the wanted %d'
                          % n_subjects)
            n_subjects = 415
    if n_subjects < 1:
        raise ValueError("Incorrect number of subjects (%d)" % n_subjects)

    # pick the archive corresponding to preprocessings type
    if url is None:
        if dartel_version:
            url_images = ('https://www.nitrc.org/frs/download.php/'
                          '6364/archive_dartel.tgz?i_agree=1&download_now=1')
        else:
            url_images = ('https://www.nitrc.org/frs/download.php/'
                          '6359/archive.tgz?i_agree=1&download_now=1')
        # covariates and license are in separate files on NITRC
        url_csv = ('https://www.nitrc.org/frs/download.php/'
                   '6348/oasis_cross-sectional.csv?i_agree=1&download_now=1')
        url_dua = ('https://www.nitrc.org/frs/download.php/'
                   '6349/data_usage_agreement.txt?i_agree=1&download_now=1')
    else:  # local URL used in tests
        url_csv = url + "/oasis_cross-sectional.csv"
        url_dua = url + "/data_usage_agreement.txt"
        if dartel_version:
            url_images = url + "/archive_dartel.tgz"
        else:
            url_images = url + "/archive.tgz"

    opts = {'uncompress': True}

    # missing subjects create shifts in subjects ids
    missing_subjects = [
        8, 24, 36, 48, 89, 93, 100, 118, 128, 149, 154, 171, 172, 175, 187,
        194, 196, 215, 219, 225, 242, 245, 248, 251, 252, 257, 276, 297, 306,
        320, 324, 334, 347, 360, 364, 391, 393, 412, 414, 427, 436
    ]

    if dartel_version:
        # DARTEL produces outliers that are hidden by nilearn API
        removed_outliers = [
            27, 57, 66, 83, 122, 157, 222, 269, 282, 287, 309, 428
        ]
        missing_subjects = sorted(missing_subjects + removed_outliers)
        file_names_gm = [(
            os.path.join("OAS1_%04d_MR1",
                         "mwrc1OAS1_%04d_MR1_mpr_anon_fslswapdim_bet.nii.gz")
            % (s, s), url_images, opts)
            for s in range(1, 457) if s not in missing_subjects][:n_subjects]
        file_names_wm = [(
            os.path.join("OAS1_%04d_MR1",
                         "mwrc2OAS1_%04d_MR1_mpr_anon_fslswapdim_bet.nii.gz")
            % (s, s), url_images, opts)
            for s in range(1, 457) if s not in missing_subjects]
    else:
        # only one gross outlier produced, hidden by nilearn API
        removed_outliers = [390]
        missing_subjects = sorted(missing_subjects + removed_outliers)
        file_names_gm = [(
            os.path.join("OAS1_%04d_MR1",
                         "mwc1OAS1_%04d_MR1_mpr_anon_fslswapdim_bet.nii.gz")
            % (s, s), url_images, opts)
            for s in range(1, 457) if s not in missing_subjects][:n_subjects]
        file_names_wm = [(
            os.path.join("OAS1_%04d_MR1",
                         "mwc2OAS1_%04d_MR1_mpr_anon_fslswapdim_bet.nii.gz")
            % (s, s), url_images, opts)
            for s in range(1, 457) if s not in missing_subjects]

    file_names_extvars = [("oasis_cross-sectional.csv", url_csv, {})]
    file_names_dua = [("data_usage_agreement.txt", url_dua, {})]

    # restrict to user-specified number of subjects
    file_names_gm = file_names_gm[:n_subjects]
    file_names_wm = file_names_wm[:n_subjects]

    file_names = (file_names_gm + file_names_wm + file_names_extvars +
                  file_names_dua)

    dataset_name = 'oasis1'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files = _fetch_files(data_dir, file_names, resume=resume, verbose=verbose)

    # Build Bunch
    gm_maps = files[:n_subjects]
    wm_maps = files[n_subjects:(2 * n_subjects)]
    ext_vars_file = files[-2]
    data_usage_agreement = files[-1]

    # Keep CSV information only for selected subjects
    csv_data = np.recfromcsv(ext_vars_file)
    # Comparisons to recfromcsv data must be bytes.
    actual_subjects_ids = [
        ("OAS1" + str.split(os.path.basename(x), "OAS1")[1][:9]).encode()
        for x in gm_maps
    ]
    subject_mask = np.asarray(
        [subject_id in actual_subjects_ids for subject_id in csv_data['id']])
    csv_data = csv_data[subject_mask]

    fdescr = _get_dataset_descr(dataset_name)

    return Bunch(gray_matter_maps=gm_maps,
                 white_matter_maps=wm_maps,
                 ext_vars=csv_data,
                 data_usage_agreement=data_usage_agreement,
                 description=fdescr)
def get_args_aida_task():
    args = Bunch()
    args.seq_len = 64
    args.row_wise_fill = True
    args.mask_mode = 'cross-wise'
    args.additional_ban = 2
    args.table_object = 'first-column'
    args.pooling = 'avg-token'
    args.pretrained_model_path = "./models/bert_model.bin-000"
    args.vocab_path = 'models/google_uncased_en_vocab.txt'
    args.vocab = Vocab()
    args.vocab.load(args.vocab_path)
    args.emb_size = 768
    args.embedding = 'tab'  # before: bert
    args.encoder = 'bertTab'
    args.subword_type = 'none'
    args.tokenizer = 'bert'
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
    args.feedforward_size = 3072
    args.hidden_size = 768
    args.heads_num = 12
    args.layers_num = 12
    args.learning_rate = 2e-5
    args.batch_size = 4
    args.dropout = 0.1
    # args.target = 'bert'
    return args
def fit(self, X, return_pi_T=False):
    """See Algorithm 1 in Li & Yu '14.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_experts)

    return_pi_T : boolean
        Whether or not to return (accuracies, labels) as a tuple instead of
        a Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'accuracies', the estimated expert accuracies, 'labels', the
        best-guess labels, 'class_names', the name (and ordering) of each
        unique class this estimator observed, and 'probs', the probability
        of each possible label for each sample (None if not available).
        The ordering of columns in probs corresponds to that in class_names.
    """
    Z = np.array(X)
    n_samples, n_experts = np.shape(Z)

    # Workaround for not getting NaNs in the list of classes,
    # since NaN == NaN evaluates to False
    classes = np.sort(pd.Series(Z.flatten()).dropna().unique())
    L = len(classes)

    # Initialize equal weights for all experts
    v = np.array([1 for i in range(n_experts)])

    # Indicator matrix, response or no-response
    T = ~pd.isnull(Z)
    T = np.array(T).astype(int)

    s = 0  # Keep track of iterations
    converged = False

    # Initialize 'best-guess' with all one class
    y_prev = np.full(n_samples, classes[0])

    while (s < self.n_iter and not converged):
        # Estimate best-guess labels
        all_votes = np.array(
            [np.sum(v * (Z == k).astype(int), axis=1) for k in classes])
        y_hat = np.array([classes[i] for i in np.argmax(all_votes, axis=0)])

        # Calculate expert accuracies (according to the updated best-guess
        # labels)
        w_hat = np.sum((Z.T == y_hat).astype(int), axis=1)
        w_hat = w_hat / np.sum(T, axis=0)

        # Calculate new expert weights (how much their vote counts)
        if self.mode == 'log':
            MIN_INT = np.iinfo(np.int16).min
            v = np.array([MIN_INT if w_i == 0
                          else math.log((L - 1) * w_i) / (1 - w_i)
                          for w_i in w_hat])
        else:
            # Derived in eq. 33 in the Li & Yu paper
            v = L * w_hat - 1

        # If the labels haven't changed since last time, it's converged!
        if (y_hat == y_prev).all():
            converged = True

        # Update number of iterations completed
        s += 1
        y_prev = y_hat

    if return_pi_T:
        return w_hat, y_hat
    else:
        return Bunch(accuracies=w_hat, labels=y_hat, probs=None,
                     class_names=classes)
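# A hypothetical usage sketch. The enclosing class name `IWMV` and its
# constructor are assumptions (the source only shows the fit method); only
# the fit() return values below follow from the method above. Rows are
# samples, columns are expert votes, and NaN marks a missing response.
import numpy as np

votes = np.array([[1, 1, 0],
                  [0, np.nan, 0],
                  [1, 1, 1]])
est = IWMV()  # hypothetical estimator exposing n_iter and mode
result = est.fit(votes)
print(result.labels)      # best-guess label per sample
print(result.accuracies)  # estimated accuracy per expert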
def read_data(name,
              with_classes=True,
              prefer_attr_nodes=False,
              prefer_attr_edges=False,
              produce_labels_nodes=False,
              as_graphs=False,
              is_symmetric=symmetric_dataset):
    """Create a dataset iterable for GraphKernel.

    Parameters
    ----------
    name : str
        The dataset name.

    with_classes : bool, default=True
        Return an iterable of class labels based on the enumeration.

    produce_labels_nodes : bool, default=False
        Produce labels for nodes if not found.
        Currently this means labeling each node by its degree inside the
        Graph. This operation is applied only if node labels are
        non-existent.

    prefer_attr_nodes : bool, default=False
        If a dataset has both *node* labels and *node* attributes,
        set the attributes as labels of the graph object's *nodes*.

    prefer_attr_edges : bool, default=False
        If a dataset has both *edge* labels and *edge* attributes,
        set the attributes as labels of the graph object's *edges*.

    as_graphs : bool, default=False
        Return data as a list of Graph Objects.

    is_symmetric : bool, default=False
        Defines if the graph data describe a symmetric graph.

    Returns
    -------
    Gs : iterable
        An iterable of graphs consisting of a dictionary, node labels and
        edge labels for each graph.

    classes : np.array, case_of_appearance=with_classes==True
        A one-dimensional array of graph classes aligned with the lines of
        the `Gs` iterable. Useful for classification.

    """
    indicator_path = "./" + str(name) + "/" + str(name) + \
        "_graph_indicator.txt"
    edges_path = "./" + str(name) + "/" + str(name) + "_A.txt"
    node_labels_path = "./" + str(name) + "/" + str(name) + "_node_labels.txt"
    node_attributes_path = "./" + str(name) + "/" + str(name) + \
        "_node_attributes.txt"
    edge_labels_path = "./" + str(name) + "/" + str(name) + "_edge_labels.txt"
    edge_attributes_path = \
        "./" + str(name) + "/" + str(name) + "_edge_attributes.txt"
    graph_classes_path = \
        "./" + str(name) + "/" + str(name) + "_graph_labels.txt"

    # node graph correspondence
    ngc = dict()
    # edge line correspondence
    elc = dict()
    # dictionary that keeps sets of edges
    Graphs = dict()
    # dictionary of labels for nodes
    node_labels = dict()
    # dictionary of labels for edges
    edge_labels = dict()

    # Associate graphs nodes with indexes
    with open(indicator_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            ngc[i] = int(line[:-1])
            if int(line[:-1]) not in Graphs:
                Graphs[int(line[:-1])] = set()
            if int(line[:-1]) not in node_labels:
                node_labels[int(line[:-1])] = dict()
            if int(line[:-1]) not in edge_labels:
                edge_labels[int(line[:-1])] = dict()

    # Extract graph edges
    with open(edges_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            edge = line[:-1].replace(' ', '').split(",")
            elc[i] = (int(edge[0]), int(edge[1]))
            Graphs[ngc[int(edge[0])]].add((int(edge[0]), int(edge[1])))
            if is_symmetric:
                Graphs[ngc[int(edge[1])]].add((int(edge[1]), int(edge[0])))

    # Extract node attributes
    if (prefer_attr_nodes and
            dataset_metadata[name].get(
                "na", os.path.exists(node_attributes_path))):
        with open(node_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = \
                    [float(num) for num in
                     line[:-1].replace(' ', '').split(",")]
    # Extract node labels
    elif dataset_metadata[name].get("nl", os.path.exists(node_labels_path)):
        with open(node_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = int(line[:-1])
    elif produce_labels_nodes:
        for i in range(1, len(Graphs) + 1):
            node_labels[i] = dict(Counter(s for (s, d) in Graphs[i] if s != d))

    # Extract edge attributes
    if (prefer_attr_edges and
            dataset_metadata[name].get(
                "ea", os.path.exists(edge_attributes_path))):
        with open(edge_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                attrs = [float(num) for num in
                         line[:-1].replace(' ', '').split(",")]
                edge_labels[ngc[elc[i][0]]][elc[i]] = attrs
                if is_symmetric:
                    edge_labels[ngc[elc[i][1]]][(elc[i][1], elc[i][0])] = attrs
    # Extract edge labels
    elif dataset_metadata[name].get("el", os.path.exists(edge_labels_path)):
        with open(edge_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                edge_labels[ngc[elc[i][0]]][elc[i]] = int(line[:-1])
                if is_symmetric:
                    edge_labels[ngc[elc[i][1]]][(elc[i][1], elc[i][0])] = \
                        int(line[:-1])

    Gs = list()
    if as_graphs:
        for i in range(1, len(Graphs) + 1):
            Gs.append(Graph(Graphs[i], node_labels[i], edge_labels[i]))
    else:
        for i in range(1, len(Graphs) + 1):
            Gs.append([Graphs[i], node_labels[i], edge_labels[i]])

    if with_classes:
        classes = []
        with open(graph_classes_path, "r") as f:
            for line in f:
                classes.append(int(line[:-1]))

        classes = np.array(classes, dtype=int)
        return Bunch(data=Gs, target=classes)
    else:
        return Bunch(data=Gs)
def fetch(
    collection: str,
    name: str,
    data_home: Optional[str] = None,
    nfolds: Literal[None, 1, 5, 10] = None,
    dobscv: bool = False,
    *,
    return_X_y: bool = False,
) -> Union[Bunch, Tuple[np.typing.NDArray[float],
                        np.typing.NDArray[Union[int, float]]], ]:
    """
    Fetch Keel dataset.

    Fetch a Keel dataset by collection and name. More info at
    http://sci2s.ugr.es/keel.

    Parameters
    ----------
    collection : string
        Collection name.
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.
    nfolds : int, default=None
        Number of folds. Depending on the dataset, valid values are
        {None, 1, 5, 10}.
    dobscv : bool, default=False
        If folds are in {5, 10}, indicates that the cv folds are
        distribution optimally balanced stratified. Only available for some
        datasets.
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
    kwargs : dict
        Optional key-value arguments.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    (data, target) : tuple if ``return_X_y`` is True

    """
    if collection not in COLLECTIONS:
        raise ValueError('Available collections are '
                         + str(list(COLLECTIONS)))
    nattrs, DESCR = _load_descr(collection, name, data_home=data_home)

    X, y, cv = _load_folds(
        collection,
        name,
        nfolds,
        dobscv,
        nattrs,
        data_home=data_home,
    )

    if return_X_y:
        return X, y

    return Bunch(
        data=X,
        target=y,
        train_indices=[],
        validation_indices=[],
        test_indices=[],
        inner_cv=None,
        outer_cv=cv,
        DESCR=DESCR,
    )
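# A hedged usage sketch (the collection and dataset names are examples and
# are assumed to exist in the Keel repository used by this fetcher): the
# Bunch bundles the data with its cross-validation splitter.
iris_keel = fetch('classification', 'iris', nfolds=5)
print(iris_keel.data.shape)
print(iris_keel.DESCR[:60])
X, y = fetch('classification', 'iris', return_X_y=True)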
def fetch_phoneme(
    *,
    return_X_y: bool = False,
    as_frame: bool = False,
) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]:
    """
    Load the phoneme dataset.

    The data is obtained from the R package 'ElemStatLearn', which takes it
    from the dataset in `https://web.stanford.edu/~hastie/ElemStatLearn/`.
    """
    descr = _phoneme_descr

    raw_dataset = _fetch_elem_stat_learn("phoneme")

    data = raw_dataset["phoneme"]

    n_points = 256

    curve_data = data.iloc[:, 0:n_points]
    sound = data["g"].values
    speaker = data["speaker"].values

    curves = FDataGrid(
        data_matrix=curve_data.values,
        grid_points=np.linspace(0, 8, n_points),
        domain_range=[0, 8],
        dataset_name="Phoneme",
        argument_names=("frequency (kHz)", ),
        coordinate_names=("log-periodogram", ),
    )

    curve_name = "log-periodogram"
    target_name = "phoneme"
    frame = None

    if as_frame:
        frame = pd.DataFrame({
            curve_name: curves,
            target_name: sound,
        })
        curves = frame.iloc[:, [0]]
        target = frame.iloc[:, 1]
        meta = pd.Series(speaker, name="speaker")
    else:
        target = sound.codes
        meta = np.array([speaker]).T

    if return_X_y:
        return curves, target

    return Bunch(
        data=curves,
        target=target,
        frame=frame,
        categories={target_name: sound.categories.tolist()},
        feature_names=[curve_name],
        target_names=[target_name],
        meta=meta,
        meta_names=["speaker"],
        DESCR=descr,
    )
def test_loads_dumps_bunch():
    bunch = Bunch(x="x")
    bunch_from_pkl = loads(dumps(bunch))
    bunch_from_pkl.x = "y"
    assert bunch_from_pkl["x"] == bunch_from_pkl.x
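# A short companion sketch (not part of the original test module) showing
# the attribute/key duality that the pickling test above relies on: a Bunch
# is a dict whose items are also reachable as attributes, and both views
# stay in sync after round-tripping through pickle.
from pickle import dumps, loads

from sklearn.utils import Bunch

b = Bunch(x="x")
b.y = "y"            # attribute assignment also adds a dict key
assert b["y"] == "y"
restored = loads(dumps(b))
assert restored.x == restored["x"] == "x"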