Exemplo n.º 1
0
    def _download(self, target_path):
        """
        Download the selected subsets and unpack them into ``target_path``.

        If ``self.subsets`` is ``None`` or empty, every entry of ``SUBSETS``
        is downloaded. Names that are not keys of ``SUBSETS`` are skipped.

        Args:
            target_path (str): Folder that receives the extracted content.
                It is created if it does not exist.
        """
        if not self.subsets:
            to_download = SUBSETS.keys()
        else:
            to_download = self.subsets

        os.makedirs(target_path, exist_ok=True)

        for subset_name in to_download:
            if subset_name not in SUBSETS:
                continue

            tmp_file = os.path.join(target_path,
                                    '{}.tar.gz'.format(subset_name))
            download.download_file(SUBSETS[subset_name],
                                   tmp_file,
                                   num_threads=self.num_threads)
            download.extract_tar(tmp_file, target_path)

            # The content is looked up in a ``LibriSpeech`` subfolder after
            # extraction; move every entry one level up into target_path.
            extract_sub_path = os.path.join(target_path, 'LibriSpeech')

            for item in os.listdir(extract_sub_path):
                item_path = os.path.join(extract_sub_path, item)
                item_target_path = os.path.join(target_path, item)
                shutil.move(item_path, item_target_path)

            # Remove the emptied wrapper folder. This must be the variable,
            # not the string literal 'extract_sub_path', which would point
            # to an unrelated path relative to the working directory.
            shutil.rmtree(extract_sub_path, ignore_errors=True)

            os.remove(tmp_file)
Exemplo n.º 2
0
    def _download(self, target_path):
        """
        Write the meta file and fetch the audio files into ``target_path``.

        The sentence and audio list archives are downloaded and extracted
        into a ``temp`` subfolder of ``target_path``, which is removed again
        once the audio files have been downloaded.
        """
        work_dir = os.path.join(target_path, 'temp')
        os.makedirs(work_dir, exist_ok=True)

        # Archive files and the list files read after extraction.
        sentence_ark = os.path.join(work_dir, 'sentences.tar.bz2')
        audio_ark = os.path.join(work_dir, 'sentences_with_audio.tar.bz2')
        sentence_list = os.path.join(work_dir, 'sentences.csv')
        audio_list = os.path.join(work_dir, 'sentences_with_audio.csv')

        # Fetch both archives first, then unpack them into the same folder.
        download.download_file(SENTENCE_LIST_URL, sentence_ark)
        download.download_file(AUDIO_LIST_URL, audio_ark)
        download.extract_tar(sentence_ark, work_dir)
        download.extract_tar(audio_ark, work_dir)

        audio_entries = self._load_audio_list(audio_list)
        sentences = self._load_sentence_list(sentence_list)

        # Only ids that occur in both lists are kept.
        shared_ids = set(audio_entries.keys()) & set(sentences.keys())

        # One record per id: (sent-id, username, lang, transcript)
        all_records = []
        for sent_id in shared_ids:
            username = audio_entries[sent_id][0]
            sentence = sentences[sent_id]
            all_records.append((sent_id, username, sentence[0], sentence[1]))

        textfile.write_separated_lines(
            os.path.join(target_path, META_FILENAME),
            all_records,
            separator='\t',
            sort_by_column=0)

        self._download_audio_files(all_records, target_path)

        shutil.rmtree(work_dir, ignore_errors=True)
Exemplo n.º 3
0
    def _download(self, target_path):
        """
        Download the zip archive at ``self.url`` and unpack it into
        ``target_path``.

        After extraction ``files.move_all_files_from_subfolders_to_top`` is
        run on ``target_path`` and the downloaded archive is deleted.
        """
        os.makedirs(target_path, exist_ok=True)

        archive_path = os.path.join(target_path, 'tmp_ark.zip')
        download.download_file(self.url, archive_path)
        download.extract_zip(archive_path, target_path)
        files.move_all_files_from_subfolders_to_top(target_path)

        # The archive is only needed for extraction.
        os.remove(archive_path)
Exemplo n.º 4
0
    def _download_audio_files(self, records, target_path):
        """
        Download one mp3 file per record.

        For every record the file is stored at
        ``<target_path>/audio/<record[2]>/<record[0]>.mp3``.
        """
        url_template = 'https://audio.tatoeba.org/sentences/{}/{}.mp3'

        for record in logger.progress(records):
            record_id = record[0]
            folder_name = record[2]

            # Make sure the destination folder exists before downloading.
            destination_dir = os.path.join(target_path, 'audio', folder_name)
            destination = os.path.join(destination_dir,
                                       '{}.mp3'.format(record_id))
            os.makedirs(destination_dir, exist_ok=True)

            download.download_file(
                url_template.format(folder_name, record_id), destination)
Exemplo n.º 5
0
    def _download(self, target_path):
        """
        Download and extract the archives listed in ``DOWNLOAD_URLS``.

        When ``self.tags`` is ``None`` every archive is processed, otherwise
        only those whose tag is contained in ``self.tags``. Each archive is
        deleted after it has been extracted into ``target_path``.
        """
        os.makedirs(target_path, exist_ok=True)

        for tag, download_url in DOWNLOAD_URLS.items():
            # Skip archives that were not requested.
            if self.tags is not None and tag not in self.tags:
                continue

            archive_path = os.path.join(target_path, 'tmp_{}.tgz'.format(tag))
            download.download_file(download_url, archive_path)
            download.extract_tar(archive_path, target_path)
            os.remove(archive_path)
Exemplo n.º 6
0
    def _download(self, target_path):
        """
        Download the archive at ``self.url`` and extract it into
        ``target_path`` using ``self._extract_file``.

        If ``self.move_files_up`` is set,
        ``files.move_all_files_from_subfolders_to_top`` is run on
        ``target_path`` with ``delete_subfolders=True``. The downloaded
        archive is deleted at the end.
        """
        os.makedirs(target_path, exist_ok=True)

        archive_path = os.path.join(target_path, 'tmp_ark')
        download.download_file(self.url, archive_path)
        self._extract_file(archive_path, target_path)

        if self.move_files_up:
            files.move_all_files_from_subfolders_to_top(
                target_path, delete_subfolders=True)

        # The archive is only needed for extraction.
        os.remove(archive_path)
Exemplo n.º 7
0
    def _download(self, target_path):
        """
        Download the tar archive at ``self.url`` and extract it into
        ``target_path``.

        Afterwards ``files.move_all_files_from_subfolders_to_top`` is run on
        ``target_path`` and the downloaded archive is deleted.
        """
        os.makedirs(target_path, exist_ok=True)

        archive_path = os.path.join(target_path, 'tmp_ark.tar.gz')
        download.download_file(self.url, archive_path)
        download.extract_tar(archive_path, target_path)

        # Copy instead of move: the subfolders in the archive are read-only,
        # so moving out of them fails with a permission error.
        files.move_all_files_from_subfolders_to_top(
            target_path, delete_subfolders=True, copy=True)

        os.remove(archive_path)
Exemplo n.º 8
0
def test_download_file(sample_zip_data, tmpdir):
    """download_file stores the mocked response body at the target path."""
    source_url = 'http://some.url/thezipfile.zip'
    destination = os.path.join(tmpdir.strpath, 'target.zip')

    with requests_mock.Mocker() as mocker:
        mocker.get(source_url, content=sample_zip_data)
        download.download_file(source_url, destination)

    assert os.path.isfile(destination)

    with open(destination, 'rb') as handle:
        written = handle.read()

    assert written == sample_zip_data
Exemplo n.º 9
0
def test_download_file(sample_zip_data, tmpdir):
    """download_file stores the mocked response body at the target path."""
    source_url = 'http://some.url/thezipfile.zip'
    destination = os.path.join(tmpdir.strpath, 'target.zip')

    with requests_mock.Mocker() as mocker:
        # Answer HEAD requests with an arbitrary size; the value does not
        # matter, it is only used for prints.
        mocker.head(requests_mock.ANY, headers={'Content-Length': '100'})
        mocker.get(source_url, content=sample_zip_data)

        download.download_file(source_url, destination)

    assert os.path.isfile(destination)

    with open(destination, 'rb') as handle:
        written = handle.read()

    assert written == sample_zip_data