def test_CRUD_dataset(capsys):
    datasets.create_dataset(
        service_account_json, api_key, project_id, cloud_region, dataset_id)

    datasets.get_dataset(
        service_account_json, api_key, project_id, cloud_region, dataset_id)

    datasets.list_datasets(
        service_account_json, api_key, project_id, cloud_region)

    # Test and also clean up
    datasets.delete_dataset(
        service_account_json, api_key, project_id, cloud_region, dataset_id)

    out, _ = capsys.readouterr()

    # Check that create/get/list/delete worked
    assert 'Created dataset' in out
    assert 'Time zone' in out
    assert 'Dataset' in out
    assert 'Deleted dataset' in out
def test_CRUD_dataset(capsys, crud_dataset_id):
    datasets.create_dataset(
        project_id, cloud_region, crud_dataset_id)

    datasets.get_dataset(
        project_id, cloud_region, crud_dataset_id)

    datasets.list_datasets(
        project_id, cloud_region)

    datasets.delete_dataset(
        project_id, cloud_region, crud_dataset_id)

    out, _ = capsys.readouterr()

    # Check that create/get/list/delete worked
    assert 'Created dataset' in out
    assert 'Time zone' in out
    assert 'Dataset' in out
    assert 'Deleted dataset' in out
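# A minimal sketch of the crud_dataset_id fixture the test above relies on
# (hypothetical implementation, not the original conftest): yield a unique
# dataset ID so concurrent test runs do not collide on the same name.
import uuid

import pytest


@pytest.fixture(scope='module')
def crud_dataset_id():
    yield 'test_dataset_{}'.format(uuid.uuid4().hex[:8])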
def test_CRUD_dataset(capsys, crud_dataset_id):
    datasets.create_dataset(project_id, cloud_region, crud_dataset_id)

    @retry(
        wait_exponential_multiplier=1000,
        wait_exponential_max=10000,
        stop_max_attempt_number=10,
        retry_on_exception=retry_if_server_exception)
    def get_dataset():
        datasets.get_dataset(project_id, cloud_region, crud_dataset_id)

    get_dataset()

    datasets.list_datasets(project_id, cloud_region)

    datasets.delete_dataset(project_id, cloud_region, crud_dataset_id)

    out, _ = capsys.readouterr()

    # Check that create/get/list/delete worked
    assert 'Created dataset' in out
    assert 'Time zone' in out
    assert 'Dataset' in out
    assert 'Deleted dataset' in out
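# A minimal sketch of the retry_if_server_exception predicate used above (an
# assumption, not necessarily the canonical helper): tell the `retrying`
# decorator to retry only on server-side (5xx) errors from the API client.
from googleapiclient.errors import HttpError


def retry_if_server_exception(exception):
    return isinstance(exception, HttpError) and exception.resp.status >= 500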
def test_CRUD_dataset(capsys):
    datasets.create_dataset(
        service_account_json, project_id, cloud_region, dataset_id)

    datasets.get_dataset(
        service_account_json, project_id, cloud_region, dataset_id)

    datasets.list_datasets(
        service_account_json, project_id, cloud_region)

    # Test and also clean up
    datasets.delete_dataset(
        service_account_json, project_id, cloud_region, dataset_id)

    out, _ = capsys.readouterr()

    # Check that create/get/list/delete worked
    assert 'Created dataset' in out
    assert 'Time zone' in out
    assert 'Dataset' in out
    assert 'Deleted dataset' in out
def setup(self, stage):
    logger.info("Loading raw data...")
    if self.name in list_datasets():
        logger.info("Loading HuggingFace dataset...")
        self.dataset_setup_fn = hugging_face_load_dataset
    else:
        logger.info("Loading local dataset...")
        self.dataset_setup_fn = file_load_dataset
        if not os.path.isfile(self.name):
            raise FileNotFoundError(
                f"Passed in path `{self.name}` for dataset, but no such file found."
            )

    if stage == 'train':
        self.train = self.dataset_setup_fn(self.name, split="train")
        self.val = self.dataset_setup_fn(self.name, split="valid")
    elif stage == 'test':
        # DSYITF - don't shoot yourself in the foot. Comment this out when
        # doing pre-prod testing.
        self.val = self.dataset_setup_fn(self.name, split="valid")
        # self.test = self.dataset_setup_fn(self.name, split="test")
    else:
        raise NotImplementedError()
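# Hedged sketches of the two loader callables referenced in setup() above
# (both signatures are assumptions about the surrounding module): thin
# wrappers so hub datasets and local files share one call shape.
from datasets import load_dataset


def hugging_face_load_dataset(name, split):
    return load_dataset(name, split=split)


def file_load_dataset(path, split):
    # Assumes a single CSV file holding the requested split.
    return load_dataset('csv', data_files={split: path}, split=split)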
class Dataset:
    '''
    Loads datasets and tokenizes them
    '''
    HF_DATASETS = list_datasets()
    DATA_PATH = '../data/'
    TRAIN_STR = 'train'
    TEST_STR = 'test'
    VALIDATION_STR = 'validation'

    def __init__(self, name, split):
        '''
        Initializes dataset

        :param name: name of dataset
        :param split: train/validation/test split
        '''
        self.name = name
        self.split = split
        if self.name not in self.HF_DATASETS:
            self.type = 'csv'
        else:
            self.type = 'hf'
        self.data = self.get_dataset()

    def get_num_classes(self, label_column='label'):
        '''
        Fetches number of classes in dataset

        :return: number of classes in dataset
        '''
        return self.data.features[label_column].num_classes

    def get_dataset(self):
        '''
        Loads dataset from Huggingface repository
        '''
        if self.type == 'hf':
            if self.split == self.VALIDATION_STR:
                try:
                    return load_dataset(self.name, split=self.VALIDATION_STR)
                except ValueError:
                    pass
                try:
                    return load_dataset(self.name, split=self.TEST_STR)
                except ValueError:
                    raise RuntimeError(
                        'Invalid dataset. No validation set found.')
            else:
                return load_dataset(self.name, split=self.split)
        else:
            filename = os.path.join(self.DATA_PATH, self.name,
                                    str(self.split) + '.' + str(self.type))
            return load_dataset(self.type, data_files=filename)

    def student_dataset_encoder(self, soft_labels, batch_size,
                                text_column='text', label_column='label'):
        '''
        Creates student dataset in tf.Dataset format along with student model
        encoder

        :param soft_labels: soft labels from teacher model
        :param batch_size: batch_size
        :param text_column: column name for text in dataset
        :param label_column: column name for label in dataset
        :return: student dataset and student model encoder
        '''
        dataset = copy.deepcopy(self.data)
        dataset.set_format(type='tensorflow', columns=[text_column])
        features = dataset[text_column]
        hard_labels = tf.keras.utils.to_categorical(
            dataset[label_column],
            num_classes=self.get_num_classes(label_column=label_column))
        labels = {'soft': soft_labels, 'hard': hard_labels}
        tfdataset = tf.data.Dataset.from_tensor_slices(
            (features, labels)).shuffle(self.data.num_rows).batch(batch_size)

        VOCAB_SIZE = 30522
        encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=VOCAB_SIZE)
        encoder.adapt(tfdataset.map(lambda text, label: text))
        return tfdataset, encoder

    def classification_tokenize(self, tokenizer, batch_size, max_seq_len,
                                model_name, text_column='text',
                                label_column='label'):
        '''
        Tokenizes data for classification task

        :param tokenizer: tokenizer class
        :param batch_size: batch_size
        :param max_seq_len: maximum sequence length
        :param model_name: model name
        :return: tokenized data
        '''
        def encode(example):
            return tokenizer(example[text_column], padding='max_length',
                             truncation=True)

        dataset = self.data.map(encode)
        dataset.set_format(
            type='tensorflow',
            columns=Model.MODEL_INPUTS[model_name] + [label_column])
        features = {
            x: dataset[x].to_tensor(default_value=0, shape=(None, max_seq_len))
            for x in Model.MODEL_INPUTS[model_name]
        }
        labels = tf.keras.utils.to_categorical(
            dataset[label_column],
            num_classes=self.get_num_classes(label_column=label_column))
        tfdataset = tf.data.Dataset.from_tensor_slices(
            (features, labels)).shuffle(self.data.num_rows).batch(batch_size)
        return tfdataset
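# A hedged usage sketch for the Dataset class above (the 'bert' key is an
# assumption about Model.MODEL_INPUTS; the tokenizer would come from the
# transformers library):
#
#     dataset = Dataset('ag_news', 'train')
#     tfdata = dataset.classification_tokenize(
#         tokenizer, batch_size=32, max_seq_len=128, model_name='bert')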
def list_datasets(cls) -> List[str]:
    """List datasets on Huggingface datasets.

    Returns:
        list of datasets
    """
    return datasets.list_datasets()
def get_dataset_list():
    return datasets.list_datasets()
    st.error("Unable to load the templates file!\n\n"
             "We expect the file templates.yaml to be in the working directory. "
             "You might need to restart the app in the root directory of the repo.")
    st.stop()


def save_data(message="Done!"):
    with open("./templates.yaml", 'w') as f:
        templates.write_to_file(f)
    st.success(message)


#
# Loads dataset information
#
dataset_list = datasets.list_datasets()

#
# Initializes state
#
session_state = get_session_state(example_index=0, dataset=dataset_list[0])

#
# Select a dataset
#
# TODO: Currently raises an error if you select a dataset that requires a
# TODO: configuration. Not clear how to query for these options.
dataset_key = st.sidebar.selectbox(
    'Dataset',
    dataset_list,
    key='dataset_select',
    help='Select the dataset to work on. Number in parens '
         'is the number of prompts created.')
st.sidebar.write("HINT: Try ag_news or trec for examples.")
        'f1': f1_score(test_y, pred, average='macro')
    }
    print('SVM1 -- ACC:', svm1_scores['accuracy'], 'F1:', svm1_scores['f1'])

    svm_clf.fit(codings_train, train_y)
    pred = svm_clf.predict(codings_test)
    svm2_scores = {
        'accuracy': accuracy_score(test_y, pred),
        'f1': f1_score(test_y, pred, average='macro')
    }
    print('SVM2 -- ACC:', svm2_scores['accuracy'], 'F1:', svm2_scores['f1'])

    birnn_results.append({
        'model': 'BiGRUx2/relu/mse/adam',
        'dataset': data_name,
        'RF1-ACC': rf1_scores['accuracy'],
        'RF1-F1': rf1_scores['f1'],
        'RF2-ACC': rf2_scores['accuracy'],
        'RF2-F1': rf2_scores['f1'],
        'SVM1-ACC': svm1_scores['accuracy'],
        'SVM1-F1': svm1_scores['f1'],
        'SVM2-ACC': svm2_scores['accuracy'],
        'SVM2-F1': svm2_scores['f1']
    })


for dataset in list_datasets()[0]:
    evaluate(dataset)

pd.DataFrame(rnn_results).to_csv('./uni_rnn_results.csv', index=False)
pd.DataFrame(birnn_results).to_csv('./uni_birnn_results.csv', index=False)
            and builder_instance.info.size_in_bytes < MAX_SIZE):
        builder_instance.download_and_prepare()
        dts = builder_instance.as_dataset()
        dataset = dts
    else:
        dataset = builder_instance
        fail = True
    return dataset, fail


# Dataset select box.
dataset_names = []
selection = None

import glob

if path_to_datasets is None:
    list_of_datasets = datasets.list_datasets(
        with_community_datasets=False)
else:
    list_of_datasets = sorted(glob.glob(path_to_datasets + "*"))
print(list_of_datasets)

for i, dataset in enumerate(list_of_datasets):
    dataset = dataset.split("/")[-1]
    if INITIAL_SELECTION and dataset == INITIAL_SELECTION:
        selection = i
    dataset_names.append(dataset)

if selection is not None:
    option = st.sidebar.selectbox("Dataset",
                                  dataset_names,
                                  index=selection,
                                  format_func=lambda a: a)
else:
- Thrive on large datasets: frees you from RAM limitations; by default all
  datasets are memory-mapped on the drive.
- Smart caching with an intelligent `tf.data`-like cache: never wait for
  your data to be processed several times.

🤗 Datasets originated as a fork of the awesome TensorFlow Datasets, and the
HuggingFace team would like to deeply thank the team behind this amazing
library and user API. We have tried to keep compatibility with tfds, and a
conversion can provide translation from one format to the other.
"""

# pip install datasets
# Let's import the library. We typically only need at most four methods:
from datasets import list_datasets, list_metrics, load_dataset, load_metric
from pprint import pprint

# Currently available datasets and metrics
datasets = list_datasets()
metrics = list_metrics()

print(f"🤩 Currently {len(datasets)} datasets are available on the hub:")
pprint(datasets, compact=True)
print(f"🤩 Currently {len(metrics)} metrics are available on the hub:")
pprint(metrics, compact=True)

# You can access various attributes of the datasets before downloading them
squad_dataset = list_datasets(with_details=True)[datasets.index('squad')]
pprint(squad_dataset.__dict__)  # It's a simple python dataclass

# Samples from SQuAD
# Download and load the dataset
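# A minimal sketch of the step announced by the two comments above, assuming
# only the standard `datasets` API: download SQuAD and inspect one sample.
squad = load_dataset('squad', split='train')
pprint(squad[0])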
        'f1': f1_score(test_y, pred, average='macro')
    }
    print('SVM1 -- ACC:', svm1_scores['accuracy'], 'F1:', svm1_scores['f1'])

    svm_clf.fit(codings_train, train_y)
    pred = svm_clf.predict(codings_test)
    svm2_scores = {
        'accuracy': accuracy_score(test_y, pred),
        'f1': f1_score(test_y, pred, average='macro')
    }
    print('SVM2 -- ACC:', svm2_scores['accuracy'], 'F1:', svm2_scores['f1'])

    birnn_results.append({
        'model': 'BiGRUx2/relu/mse/adam',
        'dataset': data_name,
        'RF1-ACC': rf1_scores['accuracy'],
        'RF1-F1': rf1_scores['f1'],
        'RF2-ACC': rf2_scores['accuracy'],
        'RF2-F1': rf2_scores['f1'],
        'SVM1-ACC': svm1_scores['accuracy'],
        'SVM1-F1': svm1_scores['f1'],
        'SVM2-ACC': svm2_scores['accuracy'],
        'SVM2-F1': svm2_scores['f1']
    })


for dataset in list_datasets()[1]:
    evaluate(dataset)

pd.DataFrame(rnn_results).to_csv('./mul_rnn_results.csv', index=False)
pd.DataFrame(birnn_results).to_csv('./mul_birnn_results.csv', index=False)