def fetch_fashion_mnist(data_target = True, custom_path = None):
    """Download (if needed) and load the Fashion-MNIST dataset.

    Args:
        data_target (bool): if True return one combined DataSet of data
            and targets; otherwise return the raw train/test split.
        custom_path (str): base directory for the local dataset cache.
            Defaults to the current working directory, resolved at call
            time (the original default evaluated ``os.getcwd()`` once at
            import time, freezing whatever directory the module was
            imported from).

    Returns:
        DataSet of all samples when ``data_target`` is True, else the
        tuple ``(train_data, test_data, train_label, test_label)``.
    """
    if custom_path is None:
        custom_path = os.getcwd()

    def _load_split(files):
        # Download each gzip archive, then read labels (header is 8
        # bytes) followed by images (header is 16 bytes, 784 = 28*28
        # pixels per flattened sample).
        fetched = {}
        for file_key, file_value in files.items():
            fetched.update({file_key : maybe_download(custom_path + '/../../ztlearn/datasets/fashion/', URL + file_value)})

        with gzip.open(list(fetched.values())[0], 'rb') as label_path:
            labels = np.frombuffer(label_path.read(), dtype = np.uint8, offset = 8)

        with gzip.open(list(fetched.values())[1], 'rb') as data_path:
            data = np.frombuffer(data_path.read(), dtype = np.uint8, offset = 16).reshape(len(labels), 784)

        return data, labels

    train_data, train_label = _load_split(train_files)
    test_data, test_label = _load_split(test_files)

    if data_target:
        return DataSet(np.concatenate((train_data, test_data), axis = 0),
                       np.concatenate((train_label, test_label), axis = 0))
    else:
        return train_data, test_data, train_label, test_label
def fetch_digits(data_target=True):
    """Download (if needed) and load the handwritten-digits dataset.

    The gzipped file is a comma-separated matrix whose last column is
    the digit label.

    Args:
        data_target (bool): if True return a DataSet of data and
            targets; otherwise return a train/test split.

    Returns:
        DataSet when ``data_target`` is True, else the result of
        ``train_test_split`` with a 33% test fraction.
    """
    file_path = maybe_download('../../ztlearn/datasets/digits/', URL)
    with gzip.open(file_path, 'rb') as digits_path:
        digits_data = np.loadtxt(digits_path, delimiter=',')

    # np.int was removed in NumPy 1.24; the builtin int is the
    # documented replacement and has always been equivalent here.
    data, target = digits_data[:, :-1], digits_data[:, -1].astype(int)

    if data_target:
        return DataSet(data, target)
    else:
        return train_test_split(data, target, test_size=0.33, random_seed=5)
def fetch_pima_indians(data_target=True):
    """Download (if needed) and load the Pima Indians diabetes dataset.

    Args:
        data_target (bool): if True return a DataSet of data, targets
            and column names; otherwise return a train/test split.

    Returns:
        DataSet when ``data_target`` is True, else the result of
        ``train_test_split`` with a 20% test fraction.
    """
    file_path = maybe_download('../../ztlearn/datasets/pima/', URL)

    # NOTE(review): this ordering differs from the canonical UCI header
    # (which lists Insulin and BMI before DiabetesPedigreeFunction and
    # Age) — confirm against the file actually served by URL.
    describe = [
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'DiabetesPedigreeFunction',
        'Age',
        'Insulin',
        'BMI',
        'Outcome (0 or 1)'
    ]

    dataframe = pd.read_csv(file_path, names=describe)

    # First eight columns are the features, last column is the outcome.
    values = dataframe.values
    data = values[:, 0:8]
    target = values[:, 8]

    if data_target:
        return DataSet(data, target, describe)
    return train_test_split(data, target, test_size=0.2, random_seed=2)
def fetch_boston(data_target=True):
    """Download (if needed) and load the Boston housing dataset.

    Args:
        data_target (bool): if True return a DataSet of data, targets
            and column names; otherwise return a train/test split.

    Returns:
        DataSet when ``data_target`` is True, else the result of
        ``train_test_split`` with a 20% test fraction.
    """
    file_path = maybe_download('../../ztlearn/datasets/boston/', URL)
    describe = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'
    ]

    # sep=r"\s+" replaces delim_whitespace=True, which is deprecated
    # since pandas 2.2; the two are documented as equivalent.
    dataframe = pd.read_csv(file_path, sep=r"\s+", names=describe)

    # First thirteen columns are the features; MEDV is the target.
    data, target = dataframe.values[:, 0:13], dataframe.values[:, 13]

    if data_target:
        return DataSet(data, target, describe)
    else:
        return train_test_split(data, target, test_size=0.2, random_seed=2)
def fetch_steel_plates_faults(data_target=True, custom_path=None):
    """Download (if needed) and load the steel plates faults dataset.

    The data file is tab-separated and headerless; its column names are
    shipped in a second file (URL_2). The seven fault-indicator columns
    become the (multi-label) target, everything else the features.

    Args:
        data_target (bool): if True return a DataSet of data, targets
            and target column names; otherwise return a train/test split.
        custom_path (str): base directory for the local dataset cache.
            Defaults to the current working directory, resolved at call
            time (the original default evaluated ``os.getcwd()`` at
            import time).

    Returns:
        DataSet when ``data_target`` is True, else the result of
        ``train_test_split`` with a 20% test fraction.
    """
    if custom_path is None:
        custom_path = os.getcwd()

    file_path = maybe_download(custom_path + '/../../ztlearn/datasets/steel/', URL)
    file_path_2 = maybe_download(custom_path + '/../../ztlearn/datasets/steel/', URL_2)

    describe = [
        'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
        'Dirtiness', 'Bumps', 'Other_Faults'
    ]

    header_frame = pd.read_csv(file_path_2, header=None)
    input_data = pd.read_csv(file_path, header=None, sep="\t")

    # set_axis(..., inplace=True) was removed in pandas 2.0; assigning
    # the returned frame is the supported, version-independent form.
    input_data = input_data.set_axis(header_frame.values.flatten(), axis=1)

    dataframe = input_data.drop(describe, axis=1)
    targetframe = input_data[describe].copy()

    data, target = dataframe.values, targetframe.values

    if data_target:
        return DataSet(data, target, describe)
    else:
        return train_test_split(data, target, test_size=0.2, random_seed=2)
def fetch_cifar_100(data_target=True, custom_path=None):
    """Download (if needed), extract and load the CIFAR-100 dataset.

    Args:
        data_target (bool): if True return one combined DataSet of
            images and fine labels; otherwise return the raw train/test
            split.
        custom_path (str): base directory for the local dataset cache.
            Defaults to the current working directory, resolved at call
            time (the original default evaluated ``os.getcwd()`` at
            import time).

    Returns:
        DataSet of all samples when ``data_target`` is True, else the
        tuple ``(train_data, test_data, train_label, test_label)``.

    Raises:
        FileNotFoundError: if an expected batch file is missing after
            extraction.
    """
    if custom_path is None:
        custom_path = os.getcwd()

    base_path = custom_path + CIFAR_100_BASE_PATH
    extract_files(base_path, maybe_download(base_path, URL))

    def _load_batch(file_name):
        # Each pickled batch holds a flat 'data' array (one row per
        # image, unpacked to NCHW 3x32x32) and per-sample 'fine_labels'.
        batch_path = os.path.join(base_path, CIFAR_100_BATCHES_FOLDER, file_name)
        if not os.path.exists(batch_path):
            raise FileNotFoundError('{} File Not Found'.format(file_name))  # dont continue

        with open(batch_path, 'rb') as file:
            data = cPickle.load(file, encoding='latin1')

        images = np.reshape(data['data'], (data['data'].shape[0], 3, 32, 32))
        labels = np.reshape(data['fine_labels'], len(data['fine_labels']))
        return images, labels

    train_data, train_label = _load_batch(train_files[0])
    test_data, test_label = _load_batch(test_files[0])

    if data_target:
        return DataSet(np.concatenate((train_data, test_data), axis=0),
                       np.concatenate((train_label, test_label), axis=0))
    else:
        return train_data, test_data, train_label, test_label
def fetch_iris(data_target = True):
    """Download (if needed) and load the iris flowers dataset.

    Args:
        data_target (bool): if True return a DataSet of data, targets
            and column names; otherwise return a train/test split.

    Returns:
        DataSet when ``data_target`` is True, else the result of
        ``train_test_split`` with a 20% test fraction.
    """
    file_path = maybe_download('../../ztlearn/datasets/iris/', URL)

    describe = [
        'sepal-length (cm)',
        'sepal-width (cm)',
        'petal-length (cm)',
        'petal-width (cm)',
        'petal_type'
    ]

    dataframe = pd.read_csv(file_path, names = describe)

    # Encode the species strings as categorical integer codes, i.e.
    # {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'}
    dataframe.petal_type = pd.Categorical(dataframe.petal_type)
    dataframe['petal_type'] = dataframe.petal_type.cat.codes

    # First four columns are the measurements; the last is the species.
    data = dataframe.values[:, 0:4]
    target = dataframe.values[:, 4].astype('int')

    if data_target:
        return DataSet(data, target, describe)
    return train_test_split(data, target, test_size = 0.2, random_seed = 2)
def fetch_cifar_10(data_target=True, custom_path=None):
    """Download (if needed), extract and load the CIFAR-10 dataset.

    The training set is assembled from five pickled batches of 10,000
    images each (50,000 total); the test set is a single batch.

    Args:
        data_target (bool): if True return one combined DataSet of
            images and labels; otherwise return the raw train/test
            split.
        custom_path (str): base directory for the local dataset cache.
            Defaults to the current working directory, resolved at call
            time (the original default evaluated ``os.getcwd()`` at
            import time).

    Returns:
        DataSet of all samples when ``data_target`` is True, else the
        tuple ``(train_data, test_data, train_label, test_label)``.

    Raises:
        FileNotFoundError: if an expected batch file is missing after
            extraction.
    """
    if custom_path is None:
        custom_path = os.getcwd()

    base_path = custom_path + CIFAR_10_BASE_PATH
    extract_files(base_path, maybe_download(base_path, URL))

    # Fail fast if any training batch is missing before allocating or
    # reading anything.
    for train_file in train_files:
        if not os.path.exists(os.path.join(base_path, CIFAR_10_BATCHES_FOLDER, train_file)):
            raise FileNotFoundError('{} File Not Found'.format(train_file))  # dont continue

    train_data = np.zeros((50000, 3, 32, 32), dtype='uint8')
    train_label = np.zeros((50000, ), dtype='uint8')

    for idx, train_file in enumerate(train_files):
        with open(os.path.join(base_path, CIFAR_10_BATCHES_FOLDER, train_file), 'rb') as file:
            data = cPickle.load(file, encoding='latin1')

        # Each batch stores flat rows; unpack to NCHW 3x32x32 and copy
        # into this batch's 10,000-sample slice.
        batch_data = data['data'].reshape((-1, 3, 32, 32)).astype('uint8')
        batch_label = np.reshape(data['labels'], len(data['labels']))

        train_data[idx * 10000:(idx + 1) * 10000, ...] = batch_data
        train_label[idx * 10000:(idx + 1) * 10000] = batch_label

    with open(os.path.join(base_path, CIFAR_10_BATCHES_FOLDER, test_files[0]), 'rb') as file:
        data = cPickle.load(file, encoding='latin1')

    test_data = data['data'].reshape((-1, 3, 32, 32)).astype('uint8')
    test_label = np.reshape(data['labels'], len(data['labels']))

    if data_target:
        return DataSet(np.concatenate((train_data, test_data), axis=0),
                       np.concatenate((train_label, test_label), axis=0))
    else:
        return train_data, test_data, train_label, test_label