def test_download_data(self):
	"""Download a remote zip twice; the second call must be served from the cache.

	Checks that the downloaded file is stored under its sha256-hex name inside
	``cache_dir``, that a sibling ``<hash>.json`` metadata file exists, and that
	the metadata records the resolved local path. Removes the cache afterwards.
	"""
	cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
	url = 'https://cotk-data.s3-ap-northeast-1.amazonaws.com/test.zip'
	digest = 'f1043836933af4b8b28973d259c0c77f5049de2dff8d0d1f305c65f3c497b3b1'
	# First call downloads; the repeated call must resolve via the cache
	# and return the same path.
	res_path = get_resource_file_path(url, cache_dir=cache_dir)
	res_path = get_resource_file_path(url, cache_dir=cache_dir)
	assert res_path == os.path.join(cache_dir, digest)
	assert os.path.exists(res_path)
	meta_path = os.path.join(cache_dir, digest + '.json')
	assert os.path.exists(meta_path)
	with open(meta_path, 'r', encoding='utf-8') as meta_file:
		meta = json.load(meta_file)
	assert meta['local_path'] == res_path
	shutil.rmtree(cache_dir)
def test_download_resource(self):
	"""Resolve the ``resources://test`` resource, pinned and unpinned.

	The first call names the ``@amazon`` source explicitly; the second uses the
	default source and must resolve to the same cached file. Verifies the
	sha256-named cache path, the ``.json`` metadata sidecar, and the recorded
	local path, then removes the cache directory.
	"""
	cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
	config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
	digest = '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'
	res_path = get_resource_file_path('resources://test@amazon', cache_dir=cache_dir, config_dir=config_dir)
	res_path = get_resource_file_path('resources://test', cache_dir=cache_dir, config_dir=config_dir)
	assert res_path == os.path.join(cache_dir, digest)
	assert os.path.exists(res_path)
	meta_path = os.path.join(cache_dir, digest + '.json')
	assert os.path.exists(meta_path)
	with open(meta_path, 'r', encoding='utf-8') as meta_file:
		meta = json.load(meta_file)
	assert meta['local_path'] == res_path
	shutil.rmtree(cache_dir)
def test_MSCOCO_resource(self):
	"""Extract a local MSCOCO zip via its ``#MSCOCO`` type fragment.

	Expects the extracted directory to contain ``train.txt``, ``test.txt`` and
	``dev.txt``. Cleans up the ``*.zip_unzip`` directory and the cache dir
	(the latter only if the helper created it).
	"""
	cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
	config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
	res_path = get_resource_file_path(
		str(pathlib.Path('./tests/file_utils/data/mscoco.zip#MSCOCO')),
		cache_dir, config_dir)
	assert os.path.isdir(res_path)
	for key in ['train', 'test', 'dev']:
		assert os.path.isfile(os.path.join(res_path, key + '.txt'))
	shutil.rmtree(str(pathlib.Path('./tests/file_utils/data/mscoco.zip_unzip')))
	if os.path.exists(cache_dir):
		shutil.rmtree(cache_dir)
def __init__(self):
	"""Build a dummy language-processing dataloader over a fixed 11-word vocab.

	Loads the dummy dataset, creates one ``sent`` field per set
	(train/dev/test), fills field contents, collects vocabs/tokenizers,
	builds vocabularies and hashes, and finally selects ``("train", "sent")``
	as the default field.
	"""
	self.file_id = './tests/dataloader/dummy_languageprocessing'
	# Resolve the dataset id to a local path (may download/extract).
	self.file_path = get_resource_file_path(self.file_id)
	# Full vocabulary: 4 special tokens + 7 ordinary words; the "8" below is
	# presumably the valid-vocab cutoff index — TODO confirm against
	# GeneralVocab.from_predefined.
	all_vocab_list = ['<pad>', '<unk>', '<go>', '<eos>', \
		'what', 'how', 'here', 'do', 'as', 'can', 'to']
	set_names = ["train", "dev", "test"]
	vocab = GeneralVocab.from_predefined(all_vocab_list, 8)
	toker = SimpleTokenizer('space', ['<pad>', '<unk>', '<go>', '<eos>'])
	sent = SentenceDefault(toker, vocab, convert_to_lower_letter=True)
	# One 'sent' field per set.
	fields = {set_name: [('sent', sent)] for set_name in set_names}
	# weak=True: the context vocab only applies where a field has none of its own.
	with FieldContext.set_parameters(vocab=GeneralVocab.from_predefined(
			all_vocab_list, 8), weak=True) as field_context:
		fieldcontents: Dict[str, OrderedDictType[str, _FieldContent]] = {}
		self.fields: "OrderedDict[str, OrderedDictType[str, Field]]" = {}
		# An OrderedDict of fields means "same fields for every set".
		if isinstance(fields, OrderedDict):
			fields = {
				set_name: fields for set_name in ["train", "dev", "test"]
			}
		if isinstance(fields, dict):
			for set_name, fields_in_one_set in fields.items():
				one_fields, one_fieldcontents = self._fill_field_and_create_content(
					set_name, fields_in_one_set)
				self.fields[set_name] = one_fields
				fieldcontents[set_name] = one_fieldcontents
		else:
			raise TypeError("Unknown type for fields")
		# Order matters below: data must be loaded before vocabs/hashes are built.
		self._load_data(fieldcontents)
		self.vocabs = self._collect_vocabs_from_fields(self.fields)
		self.tokenizers = self._collect_tokenizers_from_fields(self.fields)
		self.default_field_set_name: Optional[str] = None
		self.default_field_name: Optional[str] = None
		self._build_vocabs()
		self._setting_hash = self._create_setting_hash()
		self._vocab_hash = self._create_vocab_hash()
		self.data = self._get_data(fieldcontents)
		self._raw_data_hash, self._data_hash = self._create_data_hash(
			fieldcontents)
		self.index, self.batch_id, self.batch_size = self._init_batch(
			fieldcontents)
		self.set_default_field("train", "sent")
def test_SwitchboardCorpus_resource(self):
	"""Extract a local Switchboard zip via its ``#SwitchboardCorpus`` fragment.

	Expects ``train.txt``, ``test.txt``, ``dev.txt`` and ``multi_ref.txt`` in
	the extracted directory. Cleans up the ``*.zip_unzip`` directory and the
	cache dir (the latter only if the helper created it).
	"""
	cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
	config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
	res_path = get_resource_file_path(
		str(pathlib.Path('./tests/file_utils/data/switchboard_corpus.zip#SwitchboardCorpus')),
		cache_dir, config_dir)
	assert os.path.isdir(res_path)
	for key in ['train', 'test', 'dev', 'multi_ref']:
		assert os.path.isfile(os.path.join(res_path, key + '.txt'))
	shutil.rmtree(str(pathlib.Path('./tests/file_utils/data/switchboard_corpus.zip_unzip')))
	if os.path.exists(cache_dir):
		shutil.rmtree(cache_dir)
def test_glove50d_resource(self):
	"""Extract a local GloVe-50d zip and compare it file-by-file with the fixture.

	The extracted directory must be ``glove.6B.50d.zip_unzip/50d``, list the
	same files as the reference ``data/glove/50d`` directory, and each file
	must pass ``check`` against its reference copy. Cleans up afterwards.
	"""
	cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
	config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
	data_dir = str(pathlib.Path('./tests/file_utils/data'))
	res_path = get_resource_file_path(
		str(pathlib.Path('./tests/file_utils/data/glove.6B.50d.zip#Glove50d')),
		cache_dir, config_dir)
	assert res_path == str(pathlib.Path('./tests/file_utils/data/glove.6B.50d.zip_unzip/50d'))
	reference_dir = os.path.join(data_dir, 'glove', '50d')
	filenames = os.listdir(res_path)
	assert sorted(filenames) == sorted(os.listdir(reference_dir))
	for filename in filenames:
		check(os.path.join(res_path, filename), os.path.join(reference_dir, filename))
	shutil.rmtree(str(pathlib.Path('./tests/file_utils/data/glove.6B.50d.zip_unzip')))
	if os.path.exists(cache_dir):
		shutil.rmtree(cache_dir)
def test_get_resource(self, r_mock):
	"""Exercise error paths and the happy path of ``resources://`` resolution.

	With the download mocked to return ``coai``: a missing config dir raises
	``FileNotFoundError``, an unknown ``#type`` raises ``RuntimeError``, an
	unknown ``@source`` raises ``ValueError``. A valid request stores the file
	under its sha256-hex name, whose digest must match the file content, with
	a ``.json`` metadata sidecar recording the local path.
	"""
	r_mock.get('http://coai.cs.tsinghua.edu.cn/', text='coai')
	cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
	config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
	# Error paths.
	with pytest.raises(FileNotFoundError) as excinfo:
		get_resource_file_path('resources://coai', cache_dir=cache_dir, config_dir='wrongpath')
	assert "not found" in str(excinfo.value)
	with pytest.raises(RuntimeError) as excinfo:
		get_resource_file_path('resources://coai#wrongtype', cache_dir=cache_dir, config_dir=config_dir)
	assert "No resources type" in str(excinfo.value)
	with pytest.raises(ValueError) as excinfo:
		get_resource_file_path('resources://coai@wronglink', cache_dir=cache_dir, config_dir=config_dir)
	assert "source wronglink wrong" in str(excinfo.value)
	# Happy path.
	expected_digest = '146ce545f2ed0a8767aadae8f2921f7951df817b39b8f7d0db48bce87e3eaf69'
	res_path = get_resource_file_path('resources://coai', cache_dir=cache_dir, config_dir=config_dir)
	assert res_path == os.path.join(cache_dir, expected_digest)
	assert os.path.exists(res_path)
	# The cached file's content must hash to the name it was stored under.
	sha = hashlib.sha256()
	with open(res_path, "rb") as fin:
		while True:
			chunk = fin.read(4096)
			if not chunk:
				break
			sha.update(chunk)
	assert sha.hexdigest() == expected_digest
	meta_path = res_path + '.json'
	assert os.path.exists(meta_path)
	with open(meta_path, 'r', encoding='utf-8') as meta_file:
		meta = json.load(meta_file)
	assert meta['local_path'] == res_path
	shutil.rmtree(cache_dir)