Exemplo n.º 1
0
    def __init__(self,
                 root: str,
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False) -> None:
        """Locate the dataset under ``root`` and load its metadata rows.

        Args:
            root: Directory that holds the archive and its extracted contents.
            url: Location of the archive; its file name determines the local
                archive name and the name of the extracted top-level folder.
            folder_in_archive: Sub-folder, relative to the extracted top-level
                folder, that ``self._path`` points to.
            download: When true, download and/or extract the archive if
                ``self._path`` is not yet a directory.
        """
        archive_name = os.path.basename(url)
        archive_path = os.path.join(root, archive_name)

        # Whatever precedes the archive extension names the extracted folder.
        dataset_name = archive_name.split(self._ext_archive)[0]
        relative_path = os.path.join(dataset_name, folder_in_archive)

        self._path = os.path.join(root, relative_path)
        self._metadata_path = os.path.join(root, dataset_name, 'metadata.csv')

        if download and not os.path.isdir(self._path):
            # Only hit the network when the archive is not already on disk.
            if not os.path.isfile(archive_path):
                expected_hash = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=expected_hash)
            extract_archive(archive_path)

        # Read every row eagerly so the file can be closed right away.
        with open(self._metadata_path, "r", newline='') as metadata_file:
            self._walker = list(
                unicode_csv_reader(metadata_file,
                                   delimiter="|",
                                   quoting=csv.QUOTE_NONE))
Exemplo n.º 2
0
    def __init__(
            self, root, params, url=URL, download=False):
        """Prepare the dataset folders under ``root`` and load the metadata rows.

        Args:
            root: Directory that holds the archive and its extracted contents.
            params: Forwarded to ``precompute_spectrograms`` when the ``mels``
                folder is missing.
            url: Location of the archive; its file name determines the local
                archive name and the name of the extracted folder.
            download: When true, download and/or extract the archive if the
                ``wavs`` folder is not yet a directory.
        """
        archive_name = os.path.basename(url)
        archive_path = os.path.join(root, archive_name)

        # Whatever precedes the archive extension names the extracted folder.
        dataset_dir = os.path.join(root, archive_name.split(self._ext_archive)[0])

        self._wav_path = os.path.join(dataset_dir, 'wavs')
        self._mel_path = os.path.join(dataset_dir, 'mels')
        self._char_path = os.path.join(dataset_dir, 'chars')
        self._phone_path = os.path.join(dataset_dir, 'phones')
        self._metadata_path = os.path.join(dataset_dir, 'metadata.csv')

        if download and not os.path.isdir(self._wav_path):
            # Only hit the network when the archive is not already on disk.
            if not os.path.isfile(archive_path):
                download_url(url, root)
            extract_archive(archive_path)

        # Derived features are generated once and reused on later runs.
        if not os.path.isdir(self._mel_path):
            precompute_spectrograms(dataset_dir, params)

        if not (os.path.isdir(self._char_path) and os.path.isdir(self._phone_path)):
            precompute_char_phone(dataset_dir)

        # Read every row eagerly so the file can be closed right away.
        with open(self._metadata_path, "r") as metadata_file:
            self._walker = list(
                unicode_csv_reader(metadata_file, delimiter="|", quoting=csv.QUOTE_NONE))
Exemplo n.º 3
0
    def __init__(self, root, tsv=TSV, url=URL, download=False):
        """Resolve the archive location and load the rows of the ``tsv`` file.

        Args:
            root: Directory the archive is downloaded to; the TSV file is
                looked up directly inside it.
            tsv: Name of the tab-separated file to read, relative to ``root``.
            url: Either a full archive URL or one of the language names listed
                in ``languages`` below, which is expanded to the matching
                ``cv-corpus-3`` archive URL.
            download: When true, download and/or extract the archive if
                ``root`` is not yet a directory.
        """

        languages = {
            "tatar": "tt",
            "english": "en",
            "german": "de",
            "french": "fr",
            "welsh": "cy",
            "breton": "br",
            "chuvash": "cv",
            "turkish": "tr",
            "kyrgyz": "ky",
            "irish": "ga-IE",
            "kabyle": "kab",
            "catalan": "ca",
            "taiwanese": "zh-TW",
            "slovenian": "sl",
            "italian": "it",
            "dutch": "nl",
            "hakha chin": "cnh",
            "esperanto": "eo",
            "estonian": "et",
            "persian": "fa",
            "basque": "eu",
            "spanish": "es",
            "chinese": "zh-CN",
            "mongolian": "mn",
            "sakha": "sah",
            "dhivehi": "dv",
            "kinyarwanda": "rw",
            "swedish": "sv-SE",
            "russian": "ru",
        }

        # Membership test, not identity: `url is languages` compared a string
        # with the dict object and therefore never expanded a language name.
        if url in languages:
            ext_archive = ".tar.gz"
            language = languages[url]

            base_url = (
                "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4"
                + ".s3.amazonaws.com/cv-corpus-3/"
            )
            url = base_url + language + ext_archive

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = root

        if download:
            # NOTE(review): self._path is `root` itself, so nothing is fetched
            # when `root` already exists -- confirm this is intended.
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    download_url(url, root)
                extract_archive(archive)

        self._tsv = os.path.join(root, tsv)

        # Use a separate name for the file handle so the `tsv` argument is not
        # rebound; the first row is kept apart as the header.
        with open(self._tsv, "r") as tsv_file:
            walker = unicode_csv_reader(tsv_file, delimiter="\t")
            self._header = next(walker)
            self._walker = list(walker)
Exemplo n.º 4
0
    def __init__(self,
                 root: Union[str, Path],
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False) -> None:
        """Locate the speaker archive under ``root`` and load its text rows.

        Args:
            root: Base directory; ``folder_in_archive`` is created inside it
                and used as the working directory for the archive.
            url: Either a full archive URL or one of the short speaker codes
                listed below, which is expanded to the matching
                ``cmu_us_<code>_arctic`` archive URL.
            folder_in_archive: Folder, relative to ``root``, that holds the
                archive and its extracted contents.
            download: When true, download and/or extract the archive if the
                extracted folder is not yet a directory.
        """

        speaker_codes = (
            "aew", "ahw", "aup", "awb", "axb", "bdl", "clb", "eey", "fem",
            "gka", "jmk", "ksp", "ljm", "lnh", "rms", "rxr", "slp", "slt",
        )
        if url in speaker_codes:
            ext_archive = ".tar.bz2"
            base_url = "http://www.festvox.org/cmu_arctic/packed/"

            # Plain concatenation: URLs are not filesystem paths, and
            # base_url already ends with "/".
            url = base_url + "cmu_us_" + url + "_arctic" + ext_archive

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        basename = os.path.basename(url)
        root = os.path.join(root, folder_in_archive)
        # Create the working directory together with any missing parents;
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(root, exist_ok=True)
        archive = os.path.join(root, basename)

        # The extracted folder is named after the archive up to its first dot.
        basename = basename.split(".")[0]

        self._path = os.path.join(root, basename)

        if download:
            if not os.path.isdir(self._path):
                # Only hit the network when the archive is not on disk yet.
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url,
                                 root,
                                 hash_value=checksum,
                                 hash_type="md5")
                extract_archive(archive)

        self._text = os.path.join(self._path, self._folder_text,
                                  self._file_text)

        # Read every row eagerly so the file can be closed right away.
        with open(self._text, "r") as text:
            walker = unicode_csv_reader(text, delimiter="\n")
            self._walker = list(walker)
Exemplo n.º 5
0
    def __init__(
            self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE, download=False
    ):
        """Locate the dataset under ``root`` and load its metadata rows.

        Args:
            root: Directory that holds the archive and its extracted contents.
            url: Location of the archive; its file name determines the local
                archive name and the name of the extracted top-level folder.
            folder_in_archive: Sub-folder, relative to the extracted top-level
                folder, that ``self._path`` points to.
            download: When true, download and/or extract the archive if
                ``self._path`` is not yet a directory.
        """
        archive_name = os.path.basename(url)
        archive_path = os.path.join(root, archive_name)

        # Whatever precedes the archive extension names the extracted folder.
        dataset_name = archive_name.split(self._ext_archive)[0]
        relative_path = os.path.join(dataset_name, folder_in_archive)

        self._path = os.path.join(root, relative_path)
        self._metadata_path = os.path.join(root, dataset_name, 'metadata.csv')

        if download and not os.path.isdir(self._path):
            # Only hit the network when the archive is not already on disk.
            if not os.path.isfile(archive_path):
                download_url(url, root)
            extract_archive(archive_path)

        # Read every row eagerly so the file can be closed right away.
        with open(self._metadata_path, "r") as metadata_file:
            self._walker = list(
                unicode_csv_reader(metadata_file, delimiter="|", quoting=csv.QUOTE_NONE))
Exemplo n.º 6
0
    def __init__(self, root: str, audio_folder: str, text_file: str):
        """
        Index the audio files found under ``root`` and load their transcriptions.

        Args:
            root (str): root folder of the dataset
            audio_folder (str): folder with the audio files inside root folder
            text_file (str): path, relative to the root folder, of the file with
                the text transcriptions of the audio files
        """
        self._root = root
        self._audio_folder = audio_folder
        self._walker = list(
            walk_files(root, suffix=self._ext_audio, prefix=False, remove_suffix=True))

        transcript_path = os.path.join(root, text_file)
        with open(transcript_path, "r") as transcript_file:
            rows = unicode_csv_reader(transcript_file, delimiter="|", quoting=csv.QUOTE_NONE)
            self._text = list(rows)

        # The first csv row only describes the columns, so it is discarded.
        self._text.pop(0)

        audio_count = len(self._walker)
        text_count = len(self._text)
        assert audio_count == text_count, \
            "Number of audiofiles is different from number of texts"
Exemplo n.º 7
0
    def __init__(self, root: str, audio_folder: str, text_file: str):
        """
        Index the audio files found under ``root`` and load their transcriptions.

        Args:
            root (str): root folder of the dataset
            audio_folder (str): stored as ``self._audio_folder``
            text_file (str): path, relative to the root folder, of the
                "|"-delimited file holding the transcriptions
        """
        self._root = root
        self._audio_folder = audio_folder

        # Use the first CUDA device when one is available, the CPU otherwise.
        if torch.cuda.is_available():
            self.device = torch.device('cuda:0')
        else:
            self.device = torch.device('cpu')

        self._walker = list(
            walk_files(root,
                       suffix=self._ext_audio,
                       prefix=False,
                       remove_suffix=True))

        transcript_path = os.path.join(root, text_file)
        with open(transcript_path, "r") as transcript_file:
            rows = unicode_csv_reader(transcript_file,
                                      delimiter="|",
                                      quoting=csv.QUOTE_NONE)
            self._text = list(rows)

        # The first csv row only describes the columns, so it is discarded.
        self._text.pop(0)

        audio_count = len(self._walker)
        text_count = len(self._text)
        assert audio_count == text_count, \
            "Number of audiofiles is different from number of texts"
Exemplo n.º 8
0
    def __init__(self,
                 root: str,
                 tsv: str = TSV,
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 version: str = VERSION,
                 download: bool = False) -> None:
        """Resolve the archive location and load the rows of the ``tsv`` file.

        Args:
            root: Directory the archive is downloaded to and extracted under.
            tsv: Name of the tab-separated file to read inside the dataset
                folder.
            url: Either a full archive URL or one of the language names listed
                in ``languages`` below, which is expanded to the archive URL
                for ``version``.
            folder_in_archive: Leading part of the dataset folder path, which
                is ``<root>/<folder_in_archive>/<version>/<archive stem>``.
            version: Corpus version, used both in the URL and in the dataset
                folder path.
            download: When true, download and/or extract the archive if the
                dataset folder is not yet a directory.
        """

        languages = {
            "tatar": "tt",
            "english": "en",
            "german": "de",
            "french": "fr",
            "welsh": "cy",
            "breton": "br",
            "chuvash": "cv",
            "turkish": "tr",
            "kyrgyz": "ky",
            "irish": "ga-IE",
            "kabyle": "kab",
            "catalan": "ca",
            "taiwanese": "zh-TW",
            "slovenian": "sl",
            "italian": "it",
            "dutch": "nl",
            "hakha chin": "cnh",
            "esperanto": "eo",
            "estonian": "et",
            "persian": "fa",
            "portuguese": "pt",
            "basque": "eu",
            "spanish": "es",
            "chinese": "zh-CN",
            "mongolian": "mn",
            "sakha": "sah",
            "dhivehi": "dv",
            "kinyarwanda": "rw",
            "swedish": "sv-SE",
            "russian": "ru",
            "indonesian": "id",
            "arabic": "ar",
            "tamil": "ta",
            "interlingua": "ia",
            "latvian": "lv",
            "japanese": "ja",
            "votic": "vot",
            "abkhaz": "ab",
            "cantonese": "zh-HK",
            "romansh sursilvan": "rm-sursilv"
        }

        if url in languages:
            ext_archive = ".tar.gz"
            language = languages[url]

            base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
            # URLs always use "/"; os.path.join would insert the platform
            # separator (a backslash on Windows) and yield an invalid URL.
            url = "/".join((base_url, version, language + ext_archive))

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        # Drop the two-part ".tar.gz" style extension to get the folder name.
        basename = basename.rsplit(".", 2)[0]
        folder_in_archive = os.path.join(folder_in_archive, version, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                # Only hit the network when the archive is not on disk yet.
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        self._tsv = os.path.join(root, folder_in_archive, tsv)

        # Use a separate name for the file handle so the `tsv` argument is not
        # rebound; the first row is kept apart as the header.
        with open(self._tsv, "r") as tsv_file:
            walker = unicode_csv_reader(tsv_file, delimiter="\t")
            self._header = next(walker)
            self._walker = list(walker)
Exemplo n.º 9
0
    def __init__(self,
                 root: str,
                 tsv: str = TSV,
                 language: str = LANGUAGE,
                 version: str = VERSION,
                 download: bool = False) -> None:
        """Resolve the archive location and load the rows of the ``tsv`` file.

        Args:
            root: Directory the archive is downloaded to and extracted under.
            tsv: Name of the tab-separated file to read inside the dataset
                folder.
            language: A language name listed in ``languages`` below, or any
                other string, which is then used unchanged as the language
                code.
            version: Corpus version, used both in the URL and in the dataset
                folder path ``<root>/<version>/<language code>``.
            download: When true, create ``root`` if needed, then download
                and/or extract the archive if the dataset folder is not yet a
                directory; the archive is removed after extraction.
        """

        languages = {
            "tatar": "tt",
            "english": "en",
            "german": "de",
            "french": "fr",
            "welsh": "cy",
            "breton": "br",
            "chuvash": "cv",
            "turkish": "tr",
            "kyrgyz": "ky",
            "irish": "ga-IE",
            "kabyle": "kab",
            "catalan": "ca",
            "taiwanese": "zh-TW",
            "slovenian": "sl",
            "italian": "it",
            "dutch": "nl",
            "hakha chin": "cnh",
            "esperanto": "eo",
            "estonian": "et",
            "persian": "fa",
            "portuguese": "pt",
            "basque": "eu",
            "spanish": "es",
            "chinese": "zh-CN",
            "mongolian": "mn",
            "sakha": "sah",
            "dhivehi": "dv",
            "kinyarwanda": "rw",
            "swedish": "sv-SE",
            "russian": "ru",
            "indonesian": "id",
            "arabic": "ar",
            "tamil": "ta",
            "interlingua": "ia",
            "latvian": "lv",
            "japanese": "ja",
            "votic": "vot",
            "abkhaz": "ab",
            "cantonese": "zh-HK",
            "romansh sursilvan": "rm-sursilv"
        }

        # Unknown names pass through unchanged and are treated as codes.
        language = languages.get(language, language)
        ext_archive = ".tar.gz"
        base_url = "https://cdn.commonvoice.mozilla.org"
        # URLs always use "/"; os.path.join would insert the platform
        # separator (a backslash on Windows) and yield an invalid URL.
        url = "/".join((base_url, version, language + ext_archive))

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        folder_in_archive = os.path.join(version, language)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            pathlib.Path(root).mkdir(parents=True, exist_ok=True)
            if not os.path.isdir(self._path):
                # Only hit the network when the archive is not on disk yet.
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)
                # The archive is no longer needed once it has been extracted.
                os.remove(archive)

        self._tsv = os.path.join(self._path, tsv)

        # Use a separate name for the file handle so the `tsv` argument is not
        # rebound; the first row is kept apart as the header.
        with open(self._tsv, "r") as tsv_file:
            walker = unicode_csv_reader(tsv_file, delimiter="\t")
            self._header = next(walker)
            self._walker = list(walker)
Exemplo n.º 10
0
    def __init__(self,
                 root: Union[str, Path],
                 tsv: str = TSV,
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 version: str = VERSION,
                 download: bool = False) -> None:
        """Locate a manually extracted dataset and load the rows of ``tsv``.

        Args:
            root: Directory under which the dataset was extracted.
            tsv: Name of the tab-separated file to read inside the dataset
                folder.
            url: One of the language names listed in ``languages`` below.
            folder_in_archive: Leading part of the dataset folder path, which
                is ``<root>/<folder_in_archive>/<version>/<language code>``.
            version: Corpus version, used in the dataset folder path.
            download: Must be false; downloading is not supported.

        Raises:
            RuntimeError: If ``download`` is true.
            ValueError: If ``url`` is not one of the known language names.
        """

        languages = {
            "tatar": "tt",
            "english": "en",
            "german": "de",
            "french": "fr",
            "welsh": "cy",
            "breton": "br",
            "chuvash": "cv",
            "turkish": "tr",
            "kyrgyz": "ky",
            "irish": "ga-IE",
            "kabyle": "kab",
            "catalan": "ca",
            "taiwanese": "zh-TW",
            "slovenian": "sl",
            "italian": "it",
            "dutch": "nl",
            "hakha chin": "cnh",
            "esperanto": "eo",
            "estonian": "et",
            "persian": "fa",
            "portuguese": "pt",
            "basque": "eu",
            "spanish": "es",
            "chinese": "zh-CN",
            "mongolian": "mn",
            "sakha": "sah",
            "dhivehi": "dv",
            "kinyarwanda": "rw",
            "swedish": "sv-SE",
            "russian": "ru",
            "indonesian": "id",
            "arabic": "ar",
            "tamil": "ta",
            "interlingua": "ia",
            "latvian": "lv",
            "japanese": "ja",
            "votic": "vot",
            "abkhaz": "ab",
            "cantonese": "zh-HK",
            "romansh sursilvan": "rm-sursilv"
        }

        if download:
            raise RuntimeError(
                "Common Voice dataset requires user agreement on the usage term, "
                "and torchaudio no longer provides the download feature. "
                "Please download the dataset manually and extract it in the root directory, "
                "then provide the target language to `url` argument.")
        if url not in languages:
            raise ValueError(
                f"`url` must be one of available languages: {languages.keys()}"
            )

        # Past the two guards above, `url` is always a known language name and
        # `download` is always false, so no URL or archive path is needed: the
        # dataset folder is named after the language code.
        language = languages[url]

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        folder_in_archive = os.path.join(folder_in_archive, version, language)

        self._path = os.path.join(root, folder_in_archive)

        self._tsv = os.path.join(root, folder_in_archive, tsv)

        # Use a separate name for the file handle so the `tsv` argument is not
        # rebound; the first row is kept apart as the header.
        with open(self._tsv, "r") as tsv_file:
            walker = unicode_csv_reader(tsv_file, delimiter="\t")
            self._header = next(walker)
            self._walker = list(walker)
Exemplo n.º 11
0
    def __init__(self,
                 root: Union[str, Path],
                 tsv: str = TSV,
                 url: Optional[str] = None,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 version: str = VERSION,
                 language: str = LANGUAGE,
                 download: Optional[bool] = False) -> None:
        """Locate (or extract) a manually downloaded archive and load ``tsv``.

        Args:
            root: Directory that holds the archive and its extracted contents.
            tsv: Name of the tab-separated file to read inside the dataset
                folder.
            url: Deprecated; any value other than ``None`` only triggers a
                warning and is otherwise ignored.
            folder_in_archive: Leading part of the dataset folder path, which
                is ``<root>/<folder_in_archive>/<version>/<language code>``.
            version: Corpus version, used in the dataset folder path.
            language: One of the language names listed in ``languages`` below.
            download: Deprecated; ``True`` raises, ``False`` warns.

        Raises:
            RuntimeError: If ``download`` is ``True``, if the dataset folder
                is missing and no archive is found in ``root``, or if the
                archive fails checksum validation.
            ValueError: If ``language`` is not one of the known names.
        """

        if download is True:
            raise RuntimeError(
                "The dataset is no longer publicly accessible. You need to "
                "download the archives externally and place them in the root "
                "directory."
            )
        elif download is False:
            # NOTE(review): False is also the default, so this warning fires
            # whenever the caller does not pass `download` at all.
            warnings.warn(
                "The use of the download flag is deprecated, since the dataset "
                "is no longer directly accessible.", RuntimeWarning
            )

        if url is not None:
            warnings.warn(
                "The use of the url flag is deprecated, since the dataset "
                "is no longer publicly accessible. To specify the language of the dataset, "
                "please use the language parameter instead.", RuntimeWarning
            )

        languages = {
            "tatar": "tt",
            "english": "en",
            "german": "de",
            "french": "fr",
            "welsh": "cy",
            "breton": "br",
            "chuvash": "cv",
            "turkish": "tr",
            "kyrgyz": "ky",
            "irish": "ga-IE",
            "kabyle": "kab",
            "catalan": "ca",
            "taiwanese": "zh-TW",
            "slovenian": "sl",
            "italian": "it",
            "dutch": "nl",
            "hakha chin": "cnh",
            "esperanto": "eo",
            "estonian": "et",
            "persian": "fa",
            "portuguese": "pt",
            "basque": "eu",
            "spanish": "es",
            "chinese": "zh-CN",
            "mongolian": "mn",
            "sakha": "sah",
            "dhivehi": "dv",
            "kinyarwanda": "rw",
            "swedish": "sv-SE",
            "russian": "ru",
            "indonesian": "id",
            "arabic": "ar",
            "tamil": "ta",
            "interlingua": "ia",
            "latvian": "lv",
            "japanese": "ja",
            "votic": "vot",
            "abkhaz": "ab",
            "cantonese": "zh-HK",
            "romansh sursilvan": "rm-sursilv"
        }

        if language not in languages:
            # Build the list from the dict so it cannot drift out of sync and
            # every entry is properly separated.
            allowed = ", ".join(f'"{name}"' for name in languages)
            raise ValueError(f"Allowed language values are {allowed}.")

        ext_archive = ".tar.gz"
        language = languages[language]
        # Relative archive name; also the key looked up in _CHECKSUMS below.
        url = os.path.join(version, language + ext_archive)

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        # Drop the two-part ".tar.gz" extension to get the folder name.
        basename = basename.rsplit(".", 2)[0]
        folder_in_archive = os.path.join(folder_in_archive, version, basename)

        self._path = os.path.join(root, folder_in_archive)

        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                raise RuntimeError(
                    "The dataset is no longer publicly accessible. You need to "
                    "download the archives externally and place them in the root "
                    "directory."
                )
            checksum = _CHECKSUMS.get(url, None)
            if checksum:
                # Validate the archive inside `root`, not a file of the same
                # name in the current working directory.
                with open(archive, "rb") as file_obj:
                    if not validate_file(file_obj, checksum, "sha256"):
                        raise RuntimeError(
                            f"The hash of {archive} does not match. Delete the file manually and retry."
                        )
            extract_archive(archive)

        self._tsv = os.path.join(root, folder_in_archive, tsv)

        # Use a separate name for the file handle so the `tsv` argument is not
        # rebound; the first row is kept apart as the header.
        with open(self._tsv, "r") as tsv_file:
            walker = unicode_csv_reader(tsv_file, delimiter="\t")
            self._header = next(walker)
            self._walker = list(walker)