def test_download_resource(self):
    """Fetch the ``test`` resource and verify the cached file and its metadata.

    The resource is requested twice (once with the ``@amazon`` source suffix
    and once without); only the path returned by the second call is checked.
    The cache directory is removed afterwards, even if an assertion fails.
    """
    cache_dir = './tests/_utils/dataset_cache'
    config_dir = './tests/_utils/dummy_coai'
    cache_name = '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'

    try:
        res_path = get_resource_file_path(
            'resources://test@amazon', cache_dir=cache_dir, config_dir=config_dir)
        res_path = get_resource_file_path(
            'resources://test', cache_dir=cache_dir, config_dir=config_dir)

        assert res_path == os.path.join(cache_dir, cache_name)
        assert os.path.exists(res_path)

        # A JSON metadata file is expected next to the cached resource.
        meta_path = os.path.join(cache_dir, cache_name + '.json')
        assert os.path.exists(meta_path)
        with open(meta_path, 'r') as meta_file:
            meta = json.load(meta_file)
        assert meta['local_path'] == res_path
    finally:
        # Clean up on failure too, so a broken run does not leave a stale
        # cache behind; the guard avoids masking the original error when
        # the directory was never created.
        if os.path.isdir(cache_dir):
            shutil.rmtree(cache_dir)
def test_download_data(self):
    """Fetch ``test.zip`` by URL and verify the cached file and its metadata.

    The same URL is requested twice; only the path returned by the second
    call is checked. The cache directory is removed afterwards, even if an
    assertion fails.
    """
    cache_dir = './tests/_utils/dataset_cache'
    url = 'https://cotk-data.s3-ap-northeast-1.amazonaws.com/test.zip'
    cache_name = 'f1043836933af4b8b28973d259c0c77f5049de2dff8d0d1f305c65f3c497b3b1'

    try:
        res_path = get_resource_file_path(url, cache_dir=cache_dir)
        res_path = get_resource_file_path(url, cache_dir=cache_dir)

        assert res_path == os.path.join(cache_dir, cache_name)
        assert os.path.exists(res_path)

        # A JSON metadata file is expected next to the cached download.
        meta_path = os.path.join(cache_dir, cache_name + '.json')
        assert os.path.exists(meta_path)
        with open(meta_path, 'r') as meta_file:
            meta = json.load(meta_file)
        assert meta['local_path'] == res_path
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when the directory was never created.
        if os.path.isdir(cache_dir):
            shutil.rmtree(cache_dir)
def test_SwitchboardCorpus_resource(self):
    """Resolve the local SwitchboardCorpus zip and compare it with reference data.

    The returned directory must be the ``switchboard_corpus`` folder inside
    the ``.zip_unzip`` directory, list the same file names as the reference
    ``switchboard_corpus`` data folder, and pass ``check`` for every file.
    The unzip directory is removed afterwards, even if an assertion fails.
    """
    data_dir = str(pathlib.Path('./tests/_utils/data'))
    unzip_dir = str(
        pathlib.Path('./tests/_utils/data/switchboard_corpus.zip_unzip'))

    try:
        res_path = get_resource_file_path(
            str(pathlib.Path(
                './tests/_utils/data/switchboard_corpus.zip#SwitchboardCorpus')))
        filenames = os.listdir(res_path)

        assert res_path == str(
            pathlib.Path(
                './tests/_utils/data/switchboard_corpus.zip_unzip/switchboard_corpus'))

        reference_dir = os.path.join(data_dir, 'switchboard_corpus')
        assert sorted(filenames) == sorted(os.listdir(reference_dir))
        for filename in filenames:
            check(os.path.join(res_path, filename),
                  os.path.join(data_dir, 'switchboard_corpus', filename))
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when nothing was extracted.
        if os.path.isdir(unzip_dir):
            shutil.rmtree(unzip_dir)
def test_download_resource(self):
    """Fetch the ``MSCOCO`` resource and verify its path, content hash and metadata."""
    cache_dir = './tests/_utils/dataset_cache'
    config_dir = './tests/_utils/dummy_coai'

    res_path = get_resource_file_path(
        'resources://MSCOCO',
        'MSCOCO',
        cache_dir=cache_dir,
        config_dir=config_dir,
    )

    # The resource is expected inside the ``_unzip`` folder of the cache.
    expected_path = os.path.join(
        cache_dir,
        'f2c79c204e083627ea6c166061b45ba536813058caf178d21ca58daf5abe8a01_unzip/mscoco',
    )
    assert res_path == expected_path
    assert os.path.exists(res_path)

    # The directory content must match the known sha256 directory hash.
    content_hash = dirhash(res_path, 'sha256')
    assert content_hash == 'f8ece190272864935f1849d784cb67d36b970c54aceadbcd7e845bdeefc23544'

    # The metadata file must contain exactly the local path.
    meta_path = os.path.join(
        cache_dir,
        'f2c79c204e083627ea6c166061b45ba536813058caf178d21ca58daf5abe8a01.json',
    )
    assert os.path.exists(meta_path)
    with open(meta_path, 'r') as meta_file:
        meta = json.load(meta_file)
    assert meta == {'local_path': res_path}
def test_get_resource(self, r_mock):
    """Fetch the mocked ``coai`` resource and verify its path, content and metadata."""
    # Serve a fixed body for the URL so no real network request is needed.
    r_mock.get('http://coai.cs.tsinghua.edu.cn/', text='coai')

    cache_dir = './tests/_utils/dataset_cache'
    config_dir = './tests/_utils/dummy_coai'

    res_path = get_resource_file_path(
        'resources://coai',
        'Default',
        cache_dir=cache_dir,
        config_dir=config_dir,
    )

    expected_path = os.path.join(
        cache_dir,
        '6bd9bfb20a5159d1848a203ece33886690b15d785b0c5d632eed63d70442c58b',
    )
    assert res_path == expected_path
    assert os.path.exists(res_path)

    # Hash the downloaded file in 4096-byte chunks.
    digest = hashlib.sha256()
    with open(res_path, "rb") as fin:
        while True:
            chunk = fin.read(4096)
            if chunk == b"":
                break
            digest.update(chunk)
    assert digest.hexdigest() == "146ce545f2ed0a8767aadae8f2921f7951df817b39b8f7d0db48bce87e3eaf69"

    # The metadata file sits next to the resource and holds only its path.
    meta_path = res_path + '.json'
    assert os.path.exists(meta_path)
    with open(meta_path, 'r') as meta_file:
        meta = json.load(meta_file)
    assert meta == {'local_path': res_path}
def __init__(
        self,
        file_id="../data/film",
        min_vocab_times=0,
        max_sent_length=10086,
        invalid_vocab_times=0):
    """Store the dataloader settings and resolve ``file_id`` to a local path.

    Args:
        file_id: Resource identifier passed to ``get_resource_file_path``.
        min_vocab_times: Stored as ``_min_vocab_times``.
        max_sent_length: Stored as ``_max_sent_length``.
        invalid_vocab_times: Stored as ``_invalid_vocab_times``.
    """
    self._file_id = file_id
    # Resolve the identifier before the parent initializer runs.
    resolved_path = get_resource_file_path(file_id)
    self._file_path = resolved_path

    self._min_vocab_times = min_vocab_times
    self._max_sent_length = max_sent_length
    self._invalid_vocab_times = invalid_vocab_times

    super(MyLM, self).__init__()
def test_MSCOCO_resource(self):
    """Resolve the local MSCOCO zip and check the expected split files exist.

    The returned path must be a directory containing ``train.txt``,
    ``test.txt`` and ``dev.txt``. The unzip directory is removed afterwards,
    even if an assertion fails.
    """
    unzip_dir = str(pathlib.Path('./tests/_utils/data/mscoco.zip_unzip'))

    try:
        res_path = get_resource_file_path(
            str(pathlib.Path('./tests/_utils/data/mscoco.zip#MSCOCO')))
        assert os.path.isdir(res_path)
        for key in ['train', 'test', 'dev']:
            assert os.path.isfile(os.path.join(res_path, key + '.txt'))
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when nothing was extracted.
        if os.path.isdir(unzip_dir):
            shutil.rmtree(unzip_dir)
def __init__(
        self,
        file_id="../data/film",
        min_vocab_times=0,
        max_sent_length=10086,
        invalid_vocab_times=0,
        num_turns=8,
        max_know_length=100):
    """Store the dataloader settings and resolve ``file_id`` to a local path.

    Args:
        file_id: Resource identifier passed to ``get_resource_file_path``.
        min_vocab_times: Stored as ``_min_vocab_times``.
        max_sent_length: Stored as ``_max_sent_length``.
        invalid_vocab_times: Stored as ``_invalid_vocab_times``.
        num_turns: Stored as ``_num_turns``.
        max_know_length: Stored as ``_max_know_length``.
    """
    self._file_id = file_id
    # Resolve the identifier before the parent initializer runs.
    resolved_path = get_resource_file_path(file_id)
    self._file_path = resolved_path

    self._min_vocab_times = min_vocab_times
    self._max_sent_length = max_sent_length
    self._invalid_vocab_times = invalid_vocab_times

    self._num_turns = num_turns
    self._max_know_length = max_know_length

    super(MyMemSeq2Seq, self).__init__()
def test_SwitchboardCorpus_resource(self):
    """Resolve the local SwitchboardCorpus zip and check the expected files exist.

    The returned path must be a directory containing ``train.txt``,
    ``test.txt``, ``dev.txt`` and ``multi_ref.txt``. The unzip directory is
    removed afterwards, even if an assertion fails.
    """
    unzip_dir = str(
        pathlib.Path('./tests/_utils/data/switchboard_corpus.zip_unzip'))

    try:
        res_path = get_resource_file_path(
            str(pathlib.Path(
                './tests/_utils/data/switchboard_corpus.zip#SwitchboardCorpus')))
        assert os.path.isdir(res_path)
        for key in ['train', 'test', 'dev', 'multi_ref']:
            assert os.path.isfile(os.path.join(res_path, key + '.txt'))
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when nothing was extracted.
        if os.path.isdir(unzip_dir):
            shutil.rmtree(unzip_dir)
def test_glove50d_resource(self):
    """Resolve the local Glove50d zip and compare it with reference data.

    The returned directory must be the ``50d`` folder inside the
    ``.zip_unzip`` directory, list the same file names as the reference
    ``glove/50d`` data folder, and pass ``check`` for every file. The unzip
    directory is removed afterwards, even if an assertion fails.
    """
    data_dir = str(pathlib.Path('./tests/_utils/data'))
    unzip_dir = str(pathlib.Path('./tests/_utils/data/glove.6B.50d.zip_unzip'))

    try:
        res_path = get_resource_file_path(
            str(pathlib.Path('./tests/_utils/data/glove.6B.50d.zip#Glove50d')))
        filenames = os.listdir(res_path)

        assert res_path == str(
            pathlib.Path('./tests/_utils/data/glove.6B.50d.zip_unzip/50d'))

        reference_dir = os.path.join(data_dir, 'glove', '50d')
        assert sorted(filenames) == sorted(os.listdir(reference_dir))
        for filename in filenames:
            check(os.path.join(res_path, filename),
                  os.path.join(data_dir, 'glove', '50d', filename))
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when nothing was extracted.
        if os.path.isdir(unzip_dir):
            shutil.rmtree(unzip_dir)
def test_MSCOCO_resource(self):
    """Resolve the local MSCOCO zip and compare it with reference data.

    The returned directory must be the ``mscoco`` folder inside the
    ``.zip_unzip`` directory, list the same file names as the reference
    ``mscoco`` data folder, and pass ``check`` for every file. The unzip
    directory is removed afterwards, even if an assertion fails.
    """
    data_dir = './tests/_utils/data'
    unzip_dir = './tests/_utils/data/mscoco.zip_unzip'

    try:
        res_path = get_resource_file_path(
            './tests/_utils/data/mscoco.zip#MSCOCO')
        filenames = os.listdir(res_path)

        assert res_path == './tests/_utils/data/mscoco.zip_unzip/mscoco'

        reference_dir = os.path.join(data_dir, 'mscoco')
        assert sorted(filenames) == sorted(os.listdir(reference_dir))
        for filename in filenames:
            check(os.path.join(res_path, filename),
                  os.path.join(data_dir, 'mscoco', filename))
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when nothing was extracted.
        if os.path.isdir(unzip_dir):
            shutil.rmtree(unzip_dir)
def test_get_resource(self, r_mock):
    """Check error handling and the success path for the mocked ``coai`` resource.

    Three failing calls are verified first (wrong config directory, wrong
    resource type suffix, wrong source suffix), each by exception type and
    message fragment. A valid request is then checked for its cache path,
    sha256 content hash and metadata file. The cache directory is removed
    afterwards, even if an assertion fails.
    """
    # Serve a fixed body for the URL so no real network request is needed.
    r_mock.get('http://coai.cs.tsinghua.edu.cn/', text='coai')

    cache_dir = './tests/_utils/dataset_cache'
    config_dir = './tests/_utils/dummy_coai'
    content_sha256 = '146ce545f2ed0a8767aadae8f2921f7951df817b39b8f7d0db48bce87e3eaf69'

    try:
        with pytest.raises(FileNotFoundError) as excinfo:
            get_resource_file_path(
                'resources://coai', cache_dir=cache_dir, config_dir='wrongpath')
        assert "not found" in str(excinfo.value)

        with pytest.raises(ValueError) as excinfo:
            get_resource_file_path(
                'resources://coai#wrongtype', cache_dir=cache_dir, config_dir=config_dir)
        assert "differs with res_type" in str(excinfo.value)

        with pytest.raises(ValueError) as excinfo:
            get_resource_file_path(
                'resources://coai@wronglink', cache_dir=cache_dir, config_dir=config_dir)
        assert "source wronglink wrong" in str(excinfo.value)

        res_path = get_resource_file_path(
            'resources://coai', cache_dir=cache_dir, config_dir=config_dir)
        assert res_path == os.path.join(cache_dir, content_sha256)
        assert os.path.exists(res_path)

        # Hash the downloaded file in 4096-byte chunks.
        hash_sha256 = hashlib.sha256()
        with open(res_path, "rb") as fin:
            for chunk in iter(lambda: fin.read(4096), b""):
                hash_sha256.update(chunk)
        assert hash_sha256.hexdigest() == content_sha256

        # The metadata file sits next to the resource.
        meta_path = res_path + '.json'
        assert os.path.exists(meta_path)
        with open(meta_path, 'r') as meta_file:
            meta = json.load(meta_file)
        assert meta['local_path'] == res_path
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when the directory was never created.
        if os.path.isdir(cache_dir):
            shutil.rmtree(cache_dir)
def test_OpenSubtitles_resource(self):
    """Resolve the local OpenSubtitles zip and compare it with reference data.

    The returned directory must be the ``opensubtitles`` folder inside the
    ``.zip_unzip`` directory, list the same file names as the reference
    ``opensubtitles`` data folder, and pass ``check`` for every file. The
    unzip directory is removed afterwards, even if an assertion fails.
    """
    data_dir = './tests/_utils/data'
    unzip_dir = './tests/_utils/data/opensubtitles.zip_unzip'

    try:
        res_path = get_resource_file_path(
            './tests/_utils/data/opensubtitles.zip#OpenSubtitles')
        filenames = os.listdir(res_path)

        assert res_path == './tests/_utils/data/opensubtitles.zip_unzip/opensubtitles'

        reference_dir = os.path.join(data_dir, 'opensubtitles')
        assert sorted(filenames) == sorted(os.listdir(reference_dir))
        for filename in filenames:
            check(os.path.join(res_path, filename),
                  os.path.join(data_dir, 'opensubtitles', filename))
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when nothing was extracted.
        if os.path.isdir(unzip_dir):
            shutil.rmtree(unzip_dir)
def test_Ubuntu_resource(self):
    """Resolve the local Ubuntu zip and compare it with reference data.

    The returned directory must be the ``ubuntu_dataset`` folder inside the
    ``.zip_unzip`` directory, list the same file names as the reference
    ``ubuntu_dataset`` data folder, and pass ``check`` for every file. The
    unzip directory is removed afterwards, even if an assertion fails.
    """
    data_dir = str(pathlib.Path('./tests/_utils/data'))
    unzip_dir = str(pathlib.Path('./tests/_utils/data/ubuntu_dataset.zip_unzip'))

    try:
        res_path = get_resource_file_path(
            str(pathlib.Path('./tests/_utils/data/ubuntu_dataset.zip#Ubuntu')))
        filenames = os.listdir(res_path)

        assert res_path == str(
            pathlib.Path(
                './tests/_utils/data/ubuntu_dataset.zip_unzip/ubuntu_dataset'))

        reference_dir = os.path.join(data_dir, 'ubuntu_dataset')
        assert sorted(filenames) == sorted(os.listdir(reference_dir))
        for filename in filenames:
            check(os.path.join(res_path, filename),
                  os.path.join(data_dir, 'ubuntu_dataset', filename))
    finally:
        # Clean up on failure too; the guard avoids masking the original
        # error when nothing was extracted.
        if os.path.isdir(unzip_dir):
            shutil.rmtree(unzip_dir)
def __init__(self, file_id, min_vocab_times=10, max_sent_length=50,
             invalid_vocab_times=0, num_samples=10,
             raml_file="samples_iwslt14.txt", tau=0.4, raml=True):
    """Store the dataloader and RAML settings, then load the RAML sample file.

    Args:
        file_id: Resource identifier passed to ``get_resource_file_path``
            and forwarded to the parent initializer.
        min_vocab_times: Stored as ``_min_vocab_times``.
        max_sent_length: Stored as ``_max_sent_length``.
        invalid_vocab_times: Stored as ``_invalid_vocab_times``.
        num_samples: Stored as ``n_samples``.
        raml_file: File name joined onto the resolved resource path to
            form ``raml_path``.
        tau: Stored as ``tau``.
        raml: Stored as ``raml_mode``.
    """
    self._file_id = file_id
    self._file_path = get_resource_file_path(file_id)
    self._min_vocab_times = min_vocab_times
    self._max_sent_length = max_sent_length
    self._invalid_vocab_times = invalid_vocab_times

    # RAML specific
    self.raml_mode = raml
    self.n_samples = num_samples
    self.raml_path = os.path.join(self._file_path, raml_file)
    self.tau = tau

    super(IWSLT14, self).__init__(file_id=file_id)
    # Runs after the parent initializer, as in the original ordering.
    self.raml_data = self.read_raml_sample_file()