def setup_working_with_text_data():
    """
    Doctest setup hook for the "working with text data" tutorial.

    Raises
    ------
    SkipTest
        If the interpreter is PyPy and the ``CI`` environment variable is set
        to a non-empty value, or if the pickled dataset cache file
        (``CACHE_NAME`` inside the data home folder) does not exist.
        ``check_skip_network()`` is called in between and may skip as well
        (presumably when network access is disabled -- confirm against its
        definition).
    """
    if IS_PYPY:
        if os.environ.get('CI', None):
            raise SkipTest('Skipping too slow test with PyPy on CI')

    check_skip_network()

    pickle_file = _pkl_filepath(get_data_home(), CACHE_NAME)
    if exists(pickle_file):
        # Cached dataset is available, the doctests can run.
        return
    raise SkipTest("Skipping dataset loading doctests")
def _maps_load_split(data_origin, split_files, preprocessor):
    """
    Extract features and read the raw label tables of one MAPS split.

    Parameters
    ----------
    data_origin : str
        Root folder of the original MAPS dataset.
    split_files : sequence of str
        File names (relative to ``data_origin``, without extension) that
        belong to the split.
    preprocessor : object
        Object whose ``transform`` method is called with the path of every
        ".wav" file.

    Returns
    -------
    (X, y) : tuple(np.ndarray, np.ndarray)
        Object arrays with one transformed feature object and one label table
        (read from the tab-separated ".txt" file) per entry of
        ``split_files``.
    """
    X = np.empty(shape=(len(split_files), ), dtype=object)
    y = np.empty(shape=(len(split_files), ), dtype=object)
    for k, f in enumerate(split_files):
        X[k] = preprocessor.transform(join(data_origin, Path(f + ".wav")))
        y[k] = pd.read_csv(join(data_origin, Path(f + ".txt")), sep="\t")
    return X, y


def fetch_maps_piano_dataset(*, data_origin: Optional[str] = None,
                             data_home: Optional[str] = None,
                             preprocessor: Optional[BaseEstimator] = None,
                             force_preprocessing: bool = False,
                             label_type: Literal["pitch", "onset",
                                                 "offset"] = "pitch") \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load the MAPS piano dataset from Telecom Paris (classification).

    =================   =====================
    Classes                              TODO
    Samples total                        TODO
    Dimensionality                       TODO
    Features                             TODO
    =================   =====================

    Parameters
    ----------
    data_origin : Optional[str], default=None
        Folder in which the original dataset can be found. It is only used,
        and then required, when the dataset has to be preprocessed, i.e. when
        no cache file exists yet or ``force_preprocessing`` is True.
    data_home : Optional[str], default=None
        Specify another download and cache folder for the datasets. The value
        is resolved with ``get_data_home``; the preprocessed dataset is cached
        there as 'maps.pkz'.
    preprocessor : Optional[BaseEstimator], default=None
        Estimator for preprocessing the dataset (create features from the
        audio files). Its ``transform`` method is called with the path of
        each ".wav" file, so it must be given whenever preprocessing runs.
    force_preprocessing : bool, default=False
        Force preprocessing (label computation and feature extraction) even
        if a cache file exists.
    label_type : Literal["pitch", "onset", "offset"], default="pitch"
        Type of labels to return. Only "pitch" is currently implemented;
        every other value raises a TypeError.

    Returns
    -------
    (X_train, X_test, y_train, y_test) : tuple(np.ndarray, np.ndarray,
    np.ndarray, np.ndarray)
        Object arrays with one feature sequence / label sequence per file.

    Raises
    ------
    TypeError
        If ``label_type`` is not "pitch", if ``data_origin`` is missing
        although preprocessing is required, or if the sequences do not share
        exactly one common dimension.
    """
    # Validate up front so that an unsupported label type does not trigger the
    # expensive preprocessing first (same exception type and message as the
    # former per-sample check).
    if label_type != "pitch":
        raise TypeError("Invalid label type.")

    data_home = get_data_home(data_home=data_home)
    # exist_ok avoids the check-then-create race of "if not exists: makedirs".
    makedirs(data_home, exist_ok=True)
    filepath = _pkl_filepath(data_home, 'maps.pkz')

    if not exists(filepath) or force_preprocessing:
        if data_origin is None:
            raise TypeError("data_origin must be given when the MAPS dataset "
                            "needs to be preprocessed.")
        print('preprocessing MAPS dataset from %s to %s'
              % (data_origin, data_home))
        train_files = np.loadtxt(join(
            data_origin, Path("mapsSplits/sigtia-conf3-splits/train")),
            dtype=object)
        test_files = np.loadtxt(join(
            data_origin, Path("mapsSplits/sigtia-conf3-splits/test")),
            dtype=object)
        X_train, y_train = _maps_load_split(data_origin, train_files,
                                            preprocessor)
        X_test, y_test = _maps_load_split(data_origin, test_files,
                                          preprocessor)
        joblib.dump([X_train, X_test, y_train, y_test], filepath, compress=6)
    else:
        # NOTE(review): joblib.load unpickles the cache file; only point
        # data_home at folders you trust.
        X_train, X_test, y_train, y_test = joblib.load(filepath)

    # Determine which axis is shared by all sequences. If it is axis 0, the
    # sequences are transposed so that the varying axis comes first.
    x_shape_zero = np.unique([X.shape[0] for X in X_train]
                             + [X.shape[0] for X in X_test])
    x_shape_one = np.unique([X.shape[1] for X in X_train]
                            + [X.shape[1] for X in X_test])
    if len(x_shape_zero) == 1 and len(x_shape_one) > 1:
        for X in (X_train, X_test):
            for k in range(len(X)):
                X[k] = X[k].T
    elif len(x_shape_zero) > 1 and len(x_shape_one) == 1:
        pass  # already in the expected orientation
    else:
        raise TypeError("Invalid dataformat. Expected at least one equal "
                        "dimension of all sequences.")

    # Replace the raw label tables by the pitch labels of each sequence.
    for X, y in ((X_train, y_train), (X_test, y_test)):
        for k in range(len(X)):
            y[k] = _get_pitch_labels(X[k], y[k])
    return X_train, X_test, y_train, y_test
def setup_twenty_newsgroups():
    """
    Doctest setup hook for the twenty newsgroups dataset.

    Raises
    ------
    SkipTest
        If the pickled dataset cache file (``CACHE_NAME`` inside the data
        home folder) does not exist.
    """
    # The data home folder is looked up once; the former extra call only
    # filled a local variable that was never used.
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
def _ptdb_read_reference(mic_file):
    """
    Read the reference table that belongs to a PTDB-TUG microphone file.

    The path of the reference file is derived from ``mic_file`` by replacing
    "MIC" with "REF", "mic" with "ref" and ".wav" with ".f0" (the replacement
    is applied to the whole path, folders included). The file is read as a
    space-separated table without header row.
    """
    ref_file = mic_file.replace("MIC", "REF").replace(
        "mic", "ref").replace(".wav", ".f0")
    return pd.read_csv(ref_file, sep=" ", header=None)


def fetch_ptdb_tug_dataset(*, data_origin: Union[str, bytes],
                           data_home: Optional[Union[str, bytes]] = None,
                           preprocessor: Optional[BaseEstimator] = None,
                           augment: Union[int, np.integer] = 0,
                           force_preprocessing: bool = False) \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load the PTDB-TUG: Pitch Tracking Database from Graz University of
    Technology. (classification and regression)

    =================   =====================
    Outputs                                 2
    Samples total                        TODO
    Dimensionality                       TODO
    Features                             TODO
    =================   =====================

    Parameters
    ----------
    data_origin : Union[str, bytes]
        Folder in which the original dataset can be found. It is searched
        recursively when the dataset has to be preprocessed.
    data_home : Optional[Union[str, bytes]], default=None
        Specify another download and cache folder for the datasets. The value
        is resolved with ``get_data_home``; the preprocessed dataset is cached
        there as 'ptdb_tug.pkz'.
    preprocessor : Optional[BaseEstimator], default=None
        Estimator for preprocessing the dataset (create features from the
        audio files). Its ``transform`` method is called with the path of
        each ".wav" file, so it must be given whenever preprocessing runs.
    augment : Union[int, np.integer], default=0
        Semitone range used for data augmentation. For ``augment > 0`` every
        non-zero shift ``st`` in ``[-augment, augment]`` adds one copy of the
        training set: the features are computed from the file
        "<name>_<st>.wav" and column 0 of the reference table is scaled by
        ``2**(st / 12)``. The shifted audio files are not created here
        (presumably they have to exist in advance -- confirm).
    force_preprocessing : bool, default=False
        Force preprocessing (label computation and feature extraction) even
        if a cache file exists.

    Returns
    -------
    (X_train, X_test, y_train, y_test) : tuple(np.ndarray, np.ndarray,
    np.ndarray, np.ndarray)
        Object arrays with one feature sequence / label sequence per file.
        Files whose name contains "F09", "F10", "M09" or "M10" form the test
        set, all other files the training set.

    Raises
    ------
    TypeError
        If the sequences do not share exactly one common dimension.
    """
    data_home = get_data_home(data_home=data_home)
    # exist_ok avoids the check-then-create race of "if not exists: makedirs".
    makedirs(data_home, exist_ok=True)
    filepath = _pkl_filepath(data_home, 'ptdb_tug.pkz')

    if not exists(filepath) or force_preprocessing:
        print('preprocessing PTDB-TUG database from {0} to {1}'.format(
            data_origin, data_home))

        # Pitch-shifted copies are named "<name>_<st>.wav" with a positive or
        # negative integer st. They must not be collected as originals; the
        # pattern also covers shifts with more than one digit.
        shifted_copy = re.compile(r'_-?[0-9]+\.wav$')
        test_speakers = ("F09", "F10", "M09", "M10")
        all_training_files = []
        all_test_files = []
        for root, _, files in walk(data_origin):
            for f in files:
                if not (f.startswith("mic") and f.endswith(".wav")):
                    continue
                if shifted_copy.search(f):
                    continue
                if any(speaker in f for speaker in test_speakers):
                    all_test_files.append(join(root, f))
                else:
                    all_training_files.append(join(root, f))

        # Non-zero semitone shifts; empty when augmentation is disabled.
        if augment > 0:
            semitone_shifts = [st for st in range(-augment, augment + 1)
                               if st != 0]
        else:
            semitone_shifts = []

        n_train = len(all_training_files)
        n_total = (1 + len(semitone_shifts)) * n_train
        X_train = np.empty(shape=(n_total, ), dtype=object)
        y_train = np.empty(shape=(n_total, ), dtype=object)
        X_test = np.empty(shape=(len(all_test_files), ), dtype=object)
        y_test = np.empty(shape=(len(all_test_files), ), dtype=object)

        # The unmodified recordings always fill the first n_train slots.
        for k, f in enumerate(all_training_files):
            X_train[k] = preprocessor.transform(f)
            y_train[k] = _ptdb_read_reference(f)

        # Each shift fills one further block of n_train slots.
        for m, st in enumerate(semitone_shifts):
            offset = (m + 1) * n_train
            for k, f in enumerate(all_training_files):
                X_train[offset + k] = preprocessor.transform(
                    f.replace(".wav", "_" + str(st) + ".wav"))
                df = _ptdb_read_reference(f)
                # Scale the first column according to the semitone shift.
                df[[0]] = df[[0]] * 2**(st / 12)
                y_train[offset + k] = df

        for k, f in enumerate(all_test_files):
            X_test[k] = preprocessor.transform(f)
            y_test[k] = _ptdb_read_reference(f)

        joblib.dump([X_train, X_test, y_train, y_test], filepath, compress=6)
    else:
        # NOTE(review): joblib.load unpickles the cache file; only point
        # data_home at folders you trust.
        X_train, X_test, y_train, y_test = joblib.load(filepath)

    # Determine which axis is shared by all sequences. If it is axis 0, the
    # sequences are transposed so that the varying axis comes first.
    x_shape_zero = np.unique([x.shape[0] for x in X_train]
                             + [x.shape[0] for x in X_test])
    x_shape_one = np.unique([x.shape[1] for x in X_train]
                            + [x.shape[1] for x in X_test])
    if len(x_shape_zero) == 1 and len(x_shape_one) > 1:
        transpose = True
    elif len(x_shape_zero) > 1 and len(x_shape_one) == 1:
        transpose = False
    else:
        raise TypeError("Invalid dataformat. Expected at least one equal "
                        "dimension of all sequences.")

    # Replace the raw reference tables by the labels of each sequence.
    for X, y in ((X_train, y_train), (X_test, y_test)):
        for k in range(len(X)):
            if transpose:
                X[k] = X[k].T
            y[k] = _get_labels(X[k], y[k])
    return X_train, X_test, y_train, y_test