Example #1
import os
import subprocess
import sys

# file_ops, log, DATA_URL, TAR_FILE, FILE_SIZE, and
# DEFAULT_CODEGOLF_DATASET_PATH are provided by the enclosing module.
def ensure_downloads(url=DATA_URL, target_dir=DEFAULT_CODEGOLF_DATASET_PATH):
    """Ensure that all of the given files have been downloaded and/or unpacked"""
    file_ops.mkdir_p(target_dir)
    expected = os.path.join(target_dir, 'train', 'yes0.wav')
    if not os.path.exists(expected):
        archive = os.path.join(target_dir, TAR_FILE)
        if not os.path.exists(archive) or os.stat(archive).st_size != FILE_SIZE:
            log.info("Downloading codegolf dataset to %s", target_dir)
            # Use the url parameter (the original referenced DATA_URL directly,
            # which silently ignored any caller-supplied url)
            if not file_ops.download_file(
                url,
                archive,
            ):
                raise RuntimeError("Unable to download %s to %s" % (
                    url,
                    archive,
                ))
        if sys.version_info.major == 3:
            log.info("Using Python 3.x lzma support to unpack")
            file_ops.untar(archive, target_dir, mode='r:xz')
        else:
            log.warning("Attempting decompression/unpacking via tar command")
            # Extract into target_dir rather than the current working directory
            subprocess.check_call(['tar', '-xJf', archive], cwd=target_dir)
        if not os.path.exists(expected):
            raise RuntimeError("Untarring the source file did not create %s" % (expected,))
    log.info("CodeGolf Yes/No dataset is installed in %s", target_dir)
    return True
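A minimal usage sketch, assuming the module-level names above are defined; the entry-point wrapper here is an illustration, not part of the original module:

# Hypothetical driver: fetch the dataset once, then point training code
# at the unpacked directory. Re-running is a cheap no-op because the
# function checks for 'train/yes0.wav' before downloading anything.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    ensure_downloads()
    wav_path = os.path.join(DEFAULT_CODEGOLF_DATASET_PATH, 'train', 'yes0.wav')
    print('Sample file ready:', wav_path)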
Example #2
import glob
import os
import shutil
import tempfile

# file_ops, log, BASE_URL, FILE_SIZES, DIRECTORY_NAMES, and
# DEFAULT_LIBRISPEECH_DATASET_PATH are provided by the enclosing module.
def ensure_downloads(files, base_url=BASE_URL, target_dir=DEFAULT_LIBRISPEECH_DATASET_PATH):
    """Ensure that all of the given files have been downloaded and/or unpacked"""
    log.info("Downloading librispeech to %s", target_dir)
    file_ops.mkdir_p(target_dir)
    for filename in files:
        final_filename = os.path.join(target_dir, filename)
        log.info("Ensuring download: %s", final_filename)
        filesize = FILE_SIZES.get(filename, 'Unknown Size')
        # Python 3 has no `long` type; plain int covers all integer sizes
        size_desc = file_ops.human_bytes(filesize) if isinstance(filesize, int) else filesize
        if filename in DIRECTORY_NAMES:
            without_extension = os.path.join(target_dir, DIRECTORY_NAMES[filename])
        else:
            # Strip the '.tar.gz' suffix to get the expected unpacked directory
            without_extension = final_filename[:-7]

        if not os.path.exists(without_extension):
            if not os.path.exists(final_filename) or os.stat(final_filename).st_size != filesize:
                final_url = base_url + filename
                log.info("Need to download %s (%s)", final_url, size_desc)
                if not file_ops.download_file(
                    final_url,
                    final_filename,
                ):
                    raise RuntimeError("Unable to download %s to %s" % (
                        final_url, final_filename,
                    ))
            # Unpack into a temporary directory inside target_dir so a failed
            # extraction never leaves a half-populated dataset directory behind
            working = tempfile.mkdtemp(dir=target_dir, prefix="unpack-", suffix="-tmp")
            try:
                file_ops.untar(final_filename, working)
                text_files = []
                for name in glob.glob(os.path.join(working, 'LibriSpeech', '*')):
                    if os.path.basename(name) == os.path.basename(without_extension):
                        os.rename(name, without_extension)
                    elif os.path.splitext(name)[1].upper() == '.TXT':
                        text_files.append(name)
                    else:
                        log.warning("Unexpected directory in %s: %r", final_filename, name)
                for text_file in text_files:
                    os.rename(text_file, os.path.join(without_extension, os.path.basename(text_file)))
                if not os.path.exists(without_extension):
                    raise RuntimeError(
                        "Unable to find the directory %s expected from %s" % (
                            without_extension,
                            final_filename,
                        )
                    )
            finally:
                shutil.rmtree(working)
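A usage sketch, again assuming the surrounding module's constants; the archive names follow the official LibriSpeech naming, but which keys actually exist in FILE_SIZES and DIRECTORY_NAMES is an assumption:

# Hypothetical call: fetch two LibriSpeech archives and unpack each into
# a sibling directory of the same name (minus the '.tar.gz' suffix).
ensure_downloads([
    'dev-clean.tar.gz',
    'train-clean-100.tar.gz',
])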
Example #3
# math, os, file_ops, tedlium, log, inputs_and_targets, and the module
# constants (LIUM_BASE, TEDLIUM_DOWNLOAD_URL, ...) come from the enclosing module.
def __init__(
    self,
    path=DEFAULT_TEDLIUM_DATASET_PATH,
    window_duration=0.01,
    skip_count=1,
    max_speeches=None,
):
    """Initialize the Dataset with a given storage for TEDLIUM

    path -- target path for the TED LIUM data storage
    window_duration -- duration of the audio window in seconds
    skip_count -- step size across the segments in the repo,
                  used to do a very small subset of the dataset
                  when doing testing iterations. This allows you
                  to test an "epoch" across a small subset of the
                  40GB data-file
    max_speeches -- if given, truncate each of the train/test/dev
                    speech lists to this many speeches
    """
    # Round the window up to the next power of two samples at a 16kHz sample rate
    self.window_size = 2 ** int(math.ceil(math.log(int(window_duration * 16000), 2)))
    source_filename = path + '.tar.gz'
    if not os.path.exists(path):
        if os.path.exists(source_filename):
            # Note: this could, in theory, overwrite anything on disk, as the
            # Python tarfile module doesn't prevent writing outside the root
            # directory (according to its docs).
            file_ops.untar(source_filename, destination_dir=os.path.dirname(path))
    if not os.path.exists(path):
        raise RuntimeError(
            "You need to download the TEDLIUM corpus (v2) from %(url)s and save it to %(path)s" % {
                'url': LIUM_BASE + TEDLIUM_DOWNLOAD_URL,
                'path': source_filename,
            }
        )
    path = os.path.realpath(path)
    log.info("Searching for speeches")
    self.train_speeches = [
        tedlium.Speech(sph, window_size=self.window_size)
        for sph in file_ops.find_files(
            path, '.*[/]train[/]sph[/].*[.]sph',
        )
    ]
    if max_speeches:
        self.train_speeches = self.train_speeches[:max_speeches]
    self.test_speeches = [
        tedlium.Speech(sph, window_size=self.window_size)
        for sph in file_ops.find_files(
            path, '.*[/]test[/]sph[/].*[.]sph',
        )
    ]
    if max_speeches:
        self.test_speeches = self.test_speeches[:max_speeches]
    self.valid_speeches = [
        tedlium.Speech(sph, window_size=self.window_size)
        for sph in file_ops.find_files(
            path, '.*[/]dev[/]sph[/].*[.]sph',
        )
    ]
    if max_speeches:
        self.valid_speeches = self.valid_speeches[:max_speeches]
    log.info(
        "Creating speech segments (utterance records using 1/%s of the utterances)",
        skip_count,
    )
    train_inputs, train_targets = inputs_and_targets(self.train_speeches)
    valid_inputs, valid_targets = inputs_and_targets(self.valid_speeches)
    test_inputs, test_targets = inputs_and_targets(self.test_speeches)
    log.info("Initializing the OpenDeep dataset")
    super(TEDLIUMDataset, self).__init__(
        train_inputs=train_inputs, train_targets=train_targets,
        valid_inputs=valid_inputs, valid_targets=valid_targets,
        test_inputs=test_inputs, test_targets=test_targets,
    )
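A construction sketch under the same assumptions (TEDLIUMDataset is the class this __init__ belongs to, and the corpus is already unpacked at the default path):

# Hypothetical usage: load a small slice of the corpus for a smoke test.
dataset = TEDLIUMDataset(
    window_duration=0.01,  # ~10ms windows, rounded up to 256 samples at 16kHz
    skip_count=100,        # iterate over roughly 1/100 of the utterances
    max_speeches=5,        # cap each of train/test/dev at five speeches
)
print(len(dataset.train_speeches), 'training speeches loaded')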