def compress(self, filename=None, path=None, file_list=None, size_limit=None, overwrite=False):
    """Compress the package. Supports Zip and Tar packages.

    Parameters
    ----------
    filename : str
        Filename for the package. If None given, one given to class
        initializer is used.
        Default value None

    path : str
        Path to collect files from (recursively) when file_list is not set.
        Default value None

    file_list : list of dict
        List of files to be included to the package. Item format
        {'source': 'file1.txt', 'target': 'folder1/file1.txt'}.
        Default value None

    size_limit : int
        Size limit in bytes (uncompressed); when set, output is split into
        numbered packages.
        Default value None

    overwrite : bool
        Overwrite existing package. Default value False

    Returns
    -------
    self

    """
    if filename is not None:
        # Override package filename and re-resolve/validate its format.
        self.filename = filename
        self.detect_file_format()
        self.validate_format()

    if path is not None and file_list is None:
        # Collect all files under path recursively.
        files = Path(path=path).file_list(recursive=True)
        file_list = []
        for file in files:
            file_list.append({
                'source': file,
                'target': os.path.relpath(file)
            })

    if size_limit is None:
        # Single output package
        package = None
        if self.format == FileFormat.ZIP:
            package = zipfile.ZipFile(file=self.filename, mode='w')

        elif self.format == FileFormat.TAR:
            package = tarfile.open(name=self.filename, mode='w:gz')

        size_uncompressed = 0
        for item in file_list:
            if os.path.exists(item['source']):
                if self.format == FileFormat.ZIP:
                    package.write(
                        filename=item['source'],
                        arcname=os.path.relpath(item['target']),
                        compress_type=zipfile.ZIP_DEFLATED
                    )
                    file_info = package.getinfo(os.path.relpath(item['target']))
                    size_uncompressed += file_info.file_size

                elif self.format == FileFormat.TAR:
                    package.add(
                        name=item['source'],
                        arcname=os.path.relpath(item['target'])
                    )
                    file_info = package.gettarinfo(
                        name=item['source'],
                        arcname=os.path.relpath(item['target'])
                    )
                    size_uncompressed += file_info.size

            else:
                package.close()
                # FIX: template previously lacked the {filename} placeholder
                # even though filename was passed to format(), so the
                # offending file never appeared in the error message.
                message = '{name}: Non-existing file [{filename}] detected while compressing a package [{package}]'.format(
                    name=self.__class__.__name__,
                    filename=item['source'],
                    package=self.filename
                )
                if self.logger:
                    self.logger.exception(message)

                raise IOError(message)

        package.close()

    else:
        # Split output into multiple numbered packages, each kept below
        # size_limit (measured on uncompressed content).
        base, extension = os.path.splitext(self.filename)
        filename_template = base + '.{package_id}' + extension

        # Initialize first package
        package_id = 1
        size_uncompressed = 0
        if self.format == FileFormat.ZIP:
            package = zipfile.ZipFile(
                file=filename_template.format(package_id=package_id),
                mode='w'
            )

        elif self.format == FileFormat.TAR:
            package = tarfile.open(
                name=filename_template.format(package_id=package_id),
                mode='w:gz'
            )

        progress = tqdm(
            file_list,
            desc="{0: <25s}".format('Compress'),
            file=sys.stdout,
            leave=False,
            disable=self.disable_progress_bar,
            ascii=self.use_ascii_progress_bar
        )

        for item_id, item in enumerate(progress):
            if self.disable_progress_bar:
                self.logger.info(
                    ' {title:<15s} [{item_id:d}/{total:d}] {file:<30s}'.format(
                        title='Compress ',
                        item_id=item_id,
                        total=len(progress),
                        file=item['source']
                    )
                )

            if os.path.exists(item['source']):
                current_size_uncompressed = os.path.getsize(item['source'])
                if size_uncompressed + current_size_uncompressed > size_limit:
                    # Size limit met, close current package and open a new one.
                    package.close()
                    package_id += 1
                    if self.format == FileFormat.ZIP:
                        package = zipfile.ZipFile(
                            file=filename_template.format(package_id=package_id),
                            mode='w'
                        )

                    elif self.format == FileFormat.TAR:
                        package = tarfile.open(
                            name=filename_template.format(package_id=package_id),
                            mode='w:gz'
                        )

                    size_uncompressed = 0

                if self.format == FileFormat.ZIP:
                    package.write(
                        filename=item['source'],
                        arcname=os.path.relpath(item['target']),
                        compress_type=zipfile.ZIP_DEFLATED
                    )
                    file_info = package.getinfo(os.path.relpath(item['target']))
                    size_uncompressed += file_info.file_size

                elif self.format == FileFormat.TAR:
                    package.add(
                        name=item['source'],
                        arcname=os.path.relpath(item['target'])
                    )
                    file_info = package.gettarinfo(
                        name=item['source'],
                        arcname=os.path.relpath(item['target'])
                    )
                    size_uncompressed += file_info.size

            else:
                package.close()
                # FIX: restored {filename} placeholder (see above).
                message = '{name}: Non-existing file [{filename}] detected while compressing a package [{package}]'.format(
                    name=self.__class__.__name__,
                    filename=item['source'],
                    package=filename_template.format(package_id=package_id)
                )
                if self.logger:
                    self.logger.exception(message)

                raise IOError(message)

        package.close()

    # FIX: docstring promised self, but nothing was returned.
    return self
def compress(self, filename=None, path=None, file_list=None, size_limit=None):
    """Compress the package. Supports Zip and Tar packages.

    Parameters
    ----------
    filename : str
        Filename for the package. If None given, one given to class
        initializer is used.
        Default value None

    path : str
        Path get files if file_list is not set. Files are collected
        recursively.
        Default value None

    file_list : list of dict
        List of files to be included to the package. Item format
        {'source': 'file1.txt', 'target': 'folder1/file1.txt'}.
        Default value None

    size_limit : int
        Size limit in bytes.
        Default value None

    Returns
    -------
    list of str
        Filenames of created packages

    """
    if is_jupyter():
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    if filename is not None:
        self.filename = filename
        self.detect_file_format()
        self.validate_format()

    if path is not None and file_list is None:
        files = Path(path=path).file_list(recursive=True)
        file_list = []
        # FIX: loop variable renamed from 'filename' — it shadowed the
        # 'filename' parameter consumed above.
        for collected_file in files:
            file_list.append({
                'source': collected_file,
                'target': os.path.relpath(collected_file)
            })

    package_filenames = []

    # Total uncompressed size decides whether splitting is needed at all.
    total_uncompressed_size = 0
    for item in file_list:
        total_uncompressed_size += os.path.getsize(item['source'])

    if size_limit is None or total_uncompressed_size < size_limit:
        # Single output package
        package = None
        if self.format == FileFormat.ZIP:
            package = zipfile.ZipFile(file=self.filename, mode='w')

        elif self.format == FileFormat.TAR:
            package = tarfile.open(name=self.filename, mode='w:gz')

        package_filenames.append(self.filename)

        size_uncompressed = 0
        for item in file_list:
            if os.path.exists(item['source']):
                if self.format == FileFormat.ZIP:
                    package.write(
                        filename=item['source'],
                        arcname=os.path.relpath(item['target']),
                        compress_type=zipfile.ZIP_DEFLATED
                    )
                    file_info = package.getinfo(os.path.relpath(item['target']))
                    size_uncompressed += file_info.file_size

                elif self.format == FileFormat.TAR:
                    package.add(
                        name=item['source'],
                        arcname=os.path.relpath(item['target'])
                    )
                    file_info = package.gettarinfo(
                        name=item['source'],
                        arcname=os.path.relpath(item['target'])
                    )
                    size_uncompressed += file_info.size

            else:
                package.close()
                # FIX: template previously lacked the {filename} placeholder
                # even though filename was passed to format().
                message = '{name}: Non-existing file [{filename}] detected while compressing a package [{package}]'.format(
                    name=self.__class__.__name__,
                    filename=item['source'],
                    package=self.filename
                )
                if self.logger:
                    self.logger.exception(message)

                raise IOError(message)

        package.close()

    else:
        # Split output into multiple numbered packages.
        base, extension = os.path.splitext(self.filename)
        filename_template = base + '.{package_id}' + extension
        package = None

        # Initialize first package
        package_id = 1
        size_uncompressed = 0
        if self.format == FileFormat.ZIP:
            package = zipfile.ZipFile(
                file=filename_template.format(package_id=package_id),
                mode='w'
            )

        elif self.format == FileFormat.TAR:
            package = tarfile.open(
                name=filename_template.format(package_id=package_id),
                mode='w:gz'
            )

        package_filenames.append(filename_template.format(package_id=package_id))

        progress = tqdm(
            file_list,
            desc="{0: <25s}".format('Compress'),
            file=sys.stdout,
            leave=False,
            disable=self.disable_progress_bar,
            ascii=self.use_ascii_progress_bar
        )

        for item_id, item in enumerate(progress):
            if self.disable_progress_bar:
                self.logger.info(
                    ' {title:<15s} [{item_id:d}/{total:d}] {file:<30s}'.format(
                        title='Compress ',
                        item_id=item_id,
                        total=len(progress),
                        file=item['source']
                    )
                )

            if os.path.exists(item['source']):
                current_size_uncompressed = os.path.getsize(item['source'])
                if size_uncompressed + current_size_uncompressed > size_limit:
                    # Size limit met, close current package and open a new one.
                    package.close()
                    package_id += 1
                    if self.format == FileFormat.ZIP:
                        package = zipfile.ZipFile(
                            file=filename_template.format(package_id=package_id),
                            mode='w'
                        )

                    elif self.format == FileFormat.TAR:
                        package = tarfile.open(
                            name=filename_template.format(package_id=package_id),
                            mode='w:gz'
                        )

                    package_filenames.append(
                        filename_template.format(package_id=package_id))

                    size_uncompressed = 0

                if self.format == FileFormat.ZIP:
                    package.write(
                        filename=item['source'],
                        arcname=os.path.relpath(item['target']),
                        compress_type=zipfile.ZIP_DEFLATED
                    )
                    file_info = package.getinfo(os.path.relpath(item['target']))
                    size_uncompressed += file_info.file_size

                elif self.format == FileFormat.TAR:
                    package.add(
                        name=item['source'],
                        arcname=os.path.relpath(item['target'])
                    )
                    file_info = package.gettarinfo(
                        name=item['source'],
                        arcname=os.path.relpath(item['target'])
                    )
                    size_uncompressed += file_info.size

            else:
                package.close()
                # FIX: restored {filename} placeholder (see above).
                message = '{name}: Non-existing file [{filename}] detected while compressing a package [{package}]'.format(
                    name=self.__class__.__name__,
                    filename=item['source'],
                    package=filename_template.format(package_id=package_id)
                )
                if self.logger:
                    self.logger.exception(message)

                raise IOError(message)

        package.close()

    return package_filenames
def extract(self, target_path=None, overwrite=False, omit_first_level=False):
    """Extract the package. Supports Zip and Tar packages.

    Parameters
    ----------
    target_path : str
        Path to extract the package content. If none given, package is
        extracted in the same path than package.
        Default value None

    overwrite : bool
        Overwrite existing files.
        Default value False

    omit_first_level : bool
        Omit first directory level.
        Default value False

    Returns
    -------
    self

    """
    if target_path is None:
        # Extract next to the package itself.
        target_path = os.path.split(self.filename)[0]

    Path(target_path).create()

    if self.format == FileFormat.ZIP:
        with zipfile.ZipFile(self.filename, "r") as z:
            if omit_first_level:
                # Determine the common leading directory of all file entries
                # (directory entries, ending with '/', are skipped) so it can
                # be stripped from every member name below.
                parts = []
                for name in z.namelist():
                    if not name.endswith('/'):
                        parts.append(name.split('/')[:-1])

                prefix = os.path.commonprefix(parts) or ''

                if prefix:
                    if len(prefix) > 1:
                        # Keep only the first path component of the common
                        # prefix — only one directory level is omitted.
                        prefix_ = list()
                        prefix_.append(prefix[0])
                        prefix = prefix_

                    prefix = '/'.join(prefix) + '/'

                # Number of characters to strip from each member name.
                offset = len(prefix)

            # Start extraction
            members = z.infolist()
            file_count = 1
            progress = tqdm(members,
                            desc="{0: <25s}".format('Extract'),
                            file=sys.stdout,
                            leave=False,
                            disable=self.disable_progress_bar,
                            ascii=self.use_ascii_progress_bar)

            for i, member in enumerate(progress):
                if self.disable_progress_bar:
                    self.logger.info(
                        ' {title:<15s} [{item_id:d}/{total:d}] {file:<30s}'.format(
                            title='Extract ',
                            item_id=i,
                            total=len(progress),
                            file=member.filename))

                # Entries shorter than the prefix (e.g. the stripped top
                # directory itself) are skipped when omitting first level.
                if not omit_first_level or len(member.filename) > offset:
                    if omit_first_level:
                        # Rewrite member name in place so extract() writes it
                        # without the leading directory.
                        member.filename = member.filename[offset:]

                    progress.set_description("{0: >35s}".format(
                        member.filename.split('/')[-1]))
                    progress.update()

                    if not os.path.isfile(
                            os.path.join(target_path, member.filename)) or overwrite:
                        try:
                            # Use package password when one is set on self.
                            if hasattr(self, 'package_password') and self.package_password:
                                z.extract(member=member,
                                          path=target_path,
                                          pwd=self.package_password)
                            else:
                                z.extract(member=member, path=target_path)

                        except KeyboardInterrupt:
                            # Delete latest file, since most likely it was not extracted fully
                            os.remove(os.path.join(target_path, member.filename))

                            # Quit
                            sys.exit()

                    file_count += 1

    elif self.format == FileFormat.TAR:
        tar = tarfile.open(self.filename, "r:gz")
        progress = tqdm(tar,
                        desc="{0: <25s}".format('Extract'),
                        file=sys.stdout,
                        leave=False,
                        disable=self.disable_progress_bar,
                        ascii=self.use_ascii_progress_bar)

        for i, tar_info in enumerate(progress):
            if self.disable_progress_bar:
                self.logger.info(
                    ' {title:<15s} [{item_id:d}/{total:d}] {file:<30s}'.format(
                        title='Extract ',
                        item_id=i,
                        total=len(progress),
                        file=tar_info.name))

            if not os.path.isfile(os.path.join(target_path, tar_info.name)) or overwrite:
                tar.extract(tar_info, target_path)

            # NOTE(review): resetting tar.members each iteration — presumably
            # to keep tarfile's internal member cache (and memory use) from
            # growing while streaming; confirm before changing.
            tar.members = []

        tar.close()

    return self
def prepare(self):
    """Prepare dataset for the usage.

    Builds the meta data container from the CHiME-Home development
    cross-validation chunk list when it does not exist yet, and generates
    the train/test/evaluate evaluation-setup files per fold when missing.

    Returns
    -------
    self

    """
    if not self.meta_container.exists():
        scene_label = 'home'

        dcase_cross_val_data = ListDictContainer(
            filename=os.path.join(
                self.local_path, 'chime_home',
                'development_chunks_refined_crossval_dcase2016.csv')
        ).load(fields=['id', 'filename', 'set_id'])

        # Map each audio chunk to its annotation file (deduplicated).
        audio_files = {}
        for item in dcase_cross_val_data:
            audio_filename = os.path.join(
                'chime_home', 'chunks',
                item['filename'] + self.sample_mode + '.wav')
            annotation_filename = os.path.join(
                'chime_home', 'chunks', item['filename'] + '.csv')

            if audio_filename not in audio_files:
                audio_files[audio_filename] = {
                    'audio': audio_filename,
                    'meta': annotation_filename
                }

        meta_data = MetaDataContainer()
        for audio_filename, data in iteritems(audio_files):
            current_meta_data = DictContainer(
                filename=os.path.join(self.local_path, data['meta'])).load()

            # Translate majority-vote tag codes into tag labels;
            # codes 'S' and 'U' are excluded.
            tags = []
            for tag in current_meta_data['majorityvote']:
                if tag != 'S' and tag != 'U':
                    tags.append(self.tagcode_to_taglabel(tag))

            # Segment name (portion before '_chunk') is used as identifier.
            name = os.path.split(audio_filename)[1]
            segment_name = name[0:name.find('_chunk')]

            item = MetaDataItem({
                'filename': audio_filename,
                'scene_label': scene_label,
                'tags': ';'.join(tags) + ';',
                'identifier': segment_name
            })
            self.process_meta_item(item=item, absolute_path=False)
            meta_data.append(item)

        # Save meta
        meta_data.save(filename=self.meta_file)

        # Load meta and cross validation
        self.load()

    # Check that every fold's evaluation setup files exist.
    all_folds_found = True
    for fold in range(1, self.crossvalidation_folds + 1):
        train_filename = self.evaluation_setup_filename(setup_part='train', fold=fold)
        test_filename = self.evaluation_setup_filename(setup_part='test', fold=fold)
        eval_filename = self.evaluation_setup_filename(setup_part='evaluate', fold=fold)

        if not os.path.isfile(train_filename):
            all_folds_found = False

        if not os.path.isfile(test_filename):
            all_folds_found = False

        if not os.path.isfile(eval_filename):
            all_folds_found = False

    if not all_folds_found:
        Path().makedirs(path=self.evaluation_setup_path)

        dcase_crossval = {
            1: [],
            2: [],
            3: [],
            4: [],
            5: [],
        }

        dcase_cross_val_data = ListDictContainer(
            filename=os.path.join(
                self.local_path, 'chime_home',
                'development_chunks_refined_crossval_dcase2016.csv')
        ).load(fields=['id', 'filename', 'set_id'])

        # Group absolute audio paths by cross-validation set
        # (set_id is 0-based in the CSV, folds are 1-based here).
        for item in dcase_cross_val_data:
            dcase_crossval[int(item['set_id']) + 1].append(
                self.relative_to_absolute_path(
                    os.path.join(
                        'chime_home', 'chunks',
                        item['filename'] + self.sample_mode + '.wav')))

        for fold in range(1, self.crossvalidation_folds + 1):
            # Collect training and testing files
            train_files = []
            for f in range(1, self.crossvalidation_folds + 1):
                # FIX: was 'f is not fold' — identity comparison on ints only
                # works by CPython small-int caching; use value inequality.
                if f != fold:
                    train_files += dcase_crossval[f]

            test_files = dcase_crossval[fold]

            # Create meta containers and save them

            # Train
            train_filename = self.evaluation_setup_filename(
                setup_part='train', fold=fold)
            train_meta = MetaDataContainer(filename=train_filename)
            for filename in train_files:
                item = self.file_meta(filename)[0]
                self.process_meta_item(item=item, absolute_path=False)
                train_meta.append(item)
            train_meta.save()

            # Test (filenames only, no labels)
            test_filename = self.evaluation_setup_filename(
                setup_part='test', fold=fold)
            test_meta = MetaDataContainer(filename=test_filename)
            for filename in test_files:
                item = MetaDataItem(
                    {'filename': self.absolute_to_relative_path(filename)})
                test_meta.append(item)
            test_meta.save()

            # Evaluate (test files with full meta)
            eval_filename = self.evaluation_setup_filename(
                setup_part='evaluate', fold=fold)
            eval_meta = MetaDataContainer(filename=eval_filename)
            for filename in test_files:
                item = self.file_meta(filename)[0]
                self.process_meta_item(item=item, absolute_path=False)
                eval_meta.append(item)
            eval_meta.save()

        # Load meta and cross validation
        self.load()

    return self
def prepare(self):
    """Prepare dataset for the usage.

    Builds the meta data container from the CHiME-Home evaluation chunk
    list when it does not exist yet, and generates the train/test/evaluate
    evaluation-setup files when missing.

    Returns
    -------
    self

    """
    if not self.meta_container.exists():
        scene_label = 'home'

        evaluation_chunks = ListDictContainer(
            filename=os.path.join(self.local_path, 'chime_home',
                                  'evaluation_chunks_refined.csv')
        ).load(fields=['id', 'filename', 'set_id'])

        # Map each audio chunk to its annotation file (deduplicated).
        audio_files = {}
        # FIX: loop previously iterated the undefined name
        # 'dcase_cross_val_data' (NameError at runtime); the container
        # loaded above is 'evaluation_chunks'.
        for item in evaluation_chunks:
            audio_filename = os.path.join(
                'chime_home', 'chunks',
                item['filename'] + self.sample_mode + '.wav')
            annotation_filename = os.path.join(
                'chime_home', 'chunks', item['filename'] + '.csv')

            if audio_filename not in audio_files:
                audio_files[audio_filename] = {
                    'audio': audio_filename,
                    'meta': annotation_filename
                }

        meta_data = MetaDataContainer()
        for audio_filename, data in iteritems(audio_files):
            current_meta_data = DictContainer(
                filename=os.path.join(self.local_path, data['meta'])).load()

            # Translate majority-vote tag codes into tag labels;
            # codes 'S' and 'U' are excluded.
            tags = []
            for tag in current_meta_data['majorityvote']:
                if tag != 'S' and tag != 'U':
                    tags.append(self.tagcode_to_taglabel(tag))

            # Segment name (portion before '_chunk') is used as identifier.
            name = os.path.split(audio_filename)[1]
            segment_name = name[0:name.find('_chunk')]

            item = MetaDataItem({
                'filename': audio_filename,
                'scene_label': scene_label,
                'tags': ';'.join(tags) + ';',
                'identifier': segment_name
            })
            self.process_meta_item(item=item, absolute_path=False)
            meta_data.append(item)

        # Save meta
        meta_data.save(filename=self.meta_file)

        # Load meta and cross validation
        self.load()

    # Check that the (single-fold) evaluation setup files exist.
    all_folds_found = True

    train_filename = self.evaluation_setup_filename(setup_part='train')
    test_filename = self.evaluation_setup_filename(setup_part='test')
    eval_filename = self.evaluation_setup_filename(setup_part='evaluate')

    if not os.path.isfile(train_filename):
        all_folds_found = False

    if not os.path.isfile(test_filename):
        all_folds_found = False

    if not os.path.isfile(eval_filename):
        all_folds_found = False

    if not all_folds_found:
        Path().makedirs(path=self.evaluation_setup_path)

        # Train
        train_filename = self.evaluation_setup_filename(setup_part='train')
        train_meta = MetaDataContainer(filename=train_filename)
        for filename in self.train_files():
            train_meta.append(self.file_meta(filename)[0])
        train_meta.save()

        # Test (filenames only, no labels)
        test_filename = self.evaluation_setup_filename(setup_part='test')
        test_meta = MetaDataContainer(filename=test_filename)
        for filename in self.test_files():
            test_meta.append(
                MetaDataItem(
                    {'filename': self.absolute_to_relative_path(filename)}))
        test_meta.save()

        # Evaluate (test files with full meta)
        eval_filename = self.evaluation_setup_filename(setup_part='evaluate')
        eval_meta = MetaDataContainer(filename=eval_filename)
        for filename in self.test_files():
            eval_meta.append(self.file_meta(filename)[0])
        eval_meta.save()

        # Load meta and cross validation
        self.load()

    return self
def prepare(self):
    """Prepare dataset for the usage.

    Returns
    -------
    self

    """
    if not self.meta_container.exists():
        # Build meta data directly from audio filenames: the scene label is
        # the filename stem with its last two characters dropped —
        # presumably a two-character index suffix; confirm against the
        # dataset's file naming.
        meta_data = MetaDataContainer()
        for filename in self.audio_files:
            raw_path, raw_filename = os.path.split(filename)
            relative_path = self.absolute_to_relative_path(raw_path)
            meta_data.append(
                MetaDataItem({
                    'filename': os.path.join(relative_path, raw_filename),
                    'scene_label': os.path.splitext(os.path.split(filename)[1])[0][:-2],
                }))

        meta_data.save(filename=self.meta_file)
        self.load_meta()

    # Check that every fold's evaluation setup files exist.
    all_folds_found = True
    for fold in self.folds():
        train_filename = self.evaluation_setup_filename(setup_part='train', fold=fold)
        test_filename = self.evaluation_setup_filename(setup_part='test', fold=fold)

        if not os.path.isfile(train_filename):
            all_folds_found = False

        if not os.path.isfile(test_filename):
            all_folds_found = False

    if not all_folds_found:
        Path().makedirs(path=self.evaluation_setup_path)

        # Collect per-item scene labels and filenames for stratified splits.
        classes = []
        files = []
        for item in self.meta:
            classes.append(item.scene_label)
            files.append(item.filename)

        files = numpy.array(files)

        from sklearn.model_selection import StratifiedShuffleSplit
        # Fixed random_state keeps the generated folds reproducible.
        sss = StratifiedShuffleSplit(n_splits=self.crossvalidation_folds,
                                     test_size=0.3,
                                     random_state=0)

        fold = 1
        # Stratification only needs y; X is a dummy zero vector.
        for train_index, test_index in sss.split(X=numpy.zeros(
                len(classes)), y=classes):
            train_files = files[train_index]
            test_files = files[test_index]

            train_filename = self.evaluation_setup_filename(
                setup_part='train', fold=fold)

            test_filename = self.evaluation_setup_filename(
                setup_part='test', fold=fold)

            eval_filename = self.evaluation_setup_filename(
                setup_part='evaluate', fold=fold)

            # Create meta containers and save them

            # Train
            train_meta = MetaDataContainer(filename=train_filename)
            for filename in train_files:
                train_meta += self.meta_container.filter(filename=filename)
            train_meta.save()

            # Test (filenames only, no labels)
            test_meta = MetaDataContainer(filename=test_filename)
            for filename in test_files:
                test_meta.append(
                    MetaDataItem({
                        'filename': self.absolute_to_relative_path(filename)
                    }))
            test_meta.save()

            # Evaluate (test files with full meta)
            eval_meta = MetaDataContainer(filename=eval_filename)
            for filename in test_files:
                eval_meta += self.meta_container.filter(filename=filename)
            eval_meta.save()

            fold += 1

    # Load meta and cross validation
    self.load()

    return self
def extract_packages(self):
    """Extract the dataset packages

    Downloads the audio segments listed in the meta packages (when audio
    content is requested), records items that could not be fetched, and
    generates the two-fold evaluation setup plus the dataset meta file
    when they do not exist yet.

    Raises
    ------
    IOError
        Local package was not found.

    Returns
    -------
    self

    """
    # Make sure evaluation_setup directory exists
    Path().makedirs(
        path=os.path.join(self.local_path, self.evaluation_setup_folder))

    log = FancyLogger()
    item_access_log_filename = os.path.join(self.local_path,
                                            'item_access_error.log.csv')

    if 'audio' in self.included_content_types or self.included_content_types == ['all']:
        # mean process audio
        log.title("Download_data")
        log.info(
            "Once database is downloaded, do not forget to check your missing_files"
        )

        non_existing_videos = pandas.DataFrame(columns=["filename", "error"])

        log.line("check files exist or download data")

        # Collect file ids
        for package in self.package_list:
            if package.get('content_type') == "meta":
                base_filepath = os.path.splitext(
                    package.get('filename').split('/')[-1])[0]

                if 'train' in package.get('filename'):
                    result_audio_directory = os.path.join(
                        self.local_path, 'dataset/audio/train', base_filepath)
                else:
                    result_audio_directory = os.path.join(
                        self.local_path, 'dataset/audio/test')

                missing_files = download(package.get('filename'),
                                         result_audio_directory,
                                         n_jobs=3)

                if not missing_files.empty:
                    # FIX: DataFrame.append was deprecated in pandas 1.4 and
                    # removed in 2.0; pandas.concat is the equivalent.
                    non_existing_videos = pandas.concat(
                        [non_existing_videos, missing_files],
                        ignore_index=True)

        # Save list of non-accessible videos
        ListDictContainer(non_existing_videos.to_dict(orient="records"),
                          filename=item_access_log_filename).save(
                              fields=['filename', 'error'])

    # Evaluation setup filenames
    train_filename_fold1 = self.evaluation_setup_filename(
        setup_part='train', fold=1, file_extension='csv')
    test_filename_fold1 = self.evaluation_setup_filename(
        setup_part='test', fold=1, file_extension='csv')
    train_filename_fold2 = self.evaluation_setup_filename(
        setup_part='train', fold=2, file_extension='csv')
    test_filename_fold2 = self.evaluation_setup_filename(
        setup_part='test', fold=2, file_extension='csv')
    evaluate_filename = self.evaluation_setup_filename(
        setup_part='evaluate', fold=2, file_extension='csv')

    # Check that evaluation setup exists
    evaluation_setup_exists = True
    if not os.path.isfile(train_filename_fold1) or not os.path.isfile(test_filename_fold1) \
            or not os.path.isfile(train_filename_fold2) or not os.path.isfile(test_filename_fold2) \
            or not os.path.isfile(evaluate_filename) or not self.meta_container.exists():
        evaluation_setup_exists = False

    if not evaluation_setup_exists:
        # Evaluation setup was not found, generate one
        item_access_log_filename = os.path.join(self.local_path,
                                                'item_access_error.log.csv')
        non_existing_videos = ListDictContainer().load(
            filename=item_access_log_filename,
            delimiter=',').get_field_unique('filename')

        # Fold 1 train: weak labels
        train_meta_weak_fold1 = MetaDataContainer()
        audio_path = 'dataset/audio/train/weak'
        for item in MetaDataContainer().load(
                os.path.join(self.local_path,
                             'dataset/metadata/train/weak.csv'),
                fields=["filename", "tags"],
                csv_header=True):
            if item.filename not in non_existing_videos:
                # If not the right extension, change it
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        audio_path,
                        os.path.splitext(item.filename)[0] + '.' +
                        self.default_audio_extension)
                else:
                    item.filename = Path(path=item.filename).modify(
                        path_base=audio_path)

                # Only collect items which exists if audio present
                if 'audio' in self.included_content_types or 'all' in self.included_content_types:
                    if os.path.isfile(
                            os.path.join(self.local_path, item.filename)):
                        train_meta_weak_fold1.append(item)
                else:
                    train_meta_weak_fold1.append(item)

        train_meta_weak_fold1.save(filename=train_filename_fold1,
                                   csv_header=True,
                                   file_format="CSV")

        # Fold 1 test: unlabeled in-domain data
        test_meta_unlabel_fold1 = MetaDataContainer()
        audio_path = 'dataset/audio/train/unlabel_in_domain'
        for item in MetaDataContainer().load(
                os.path.join(self.local_path,
                             'dataset/metadata/train/unlabel_in_domain.csv'),
                csv_header=True):
            if item.filename not in non_existing_videos:
                # If not the right extension, change it
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        audio_path,
                        os.path.splitext(item.filename)[0] + '.' +
                        self.default_audio_extension)
                else:
                    item.filename = Path(path=item.filename).modify(
                        path_base=audio_path)

                # Only collect items which exists if audio present
                if 'audio' in self.included_content_types or 'all' in self.included_content_types:
                    if os.path.isfile(
                            os.path.join(self.local_path, item.filename)):
                        test_meta_unlabel_fold1.append(item)
                else:
                    test_meta_unlabel_fold1.append(item)

        test_meta_unlabel_fold1.save(filename=test_filename_fold1,
                                     csv_header=True,
                                     file_format="CSV")

        # Fold 2 train is all the data used in fold 1
        train_meta_weak_fold2 = MetaDataContainer()
        train_meta_weak_fold2 += MetaDataContainer().load(
            train_filename_fold1, csv_header=True, file_format="CSV")
        for item in MetaDataContainer().load(test_filename_fold1,
                                             csv_header=True,
                                             file_format="CSV"):
            # Unlabeled data carries no tags in the fold 2 training set.
            item.tags = []
            train_meta_weak_fold2.append(item)

        train_meta_weak_fold2.save(filename=train_filename_fold2,
                                   csv_header=True)

        # Evaluate meta is the groundtruth file with test annotations test.csv
        evaluate_meta = MetaDataContainer()
        audio_path = 'dataset/audio/test'
        for item in MetaDataContainer().load(
                os.path.join(self.local_path,
                             'dataset/metadata/test/test.csv'),
                csv_header=True):
            if item.filename not in non_existing_videos:
                # If not the right extension, change it
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        audio_path,
                        os.path.splitext(item.filename)[0] + '.' +
                        self.default_audio_extension)
                else:
                    item.filename = Path(path=item.filename).modify(
                        path_base=audio_path)

                # Only collect items which exists
                if 'audio' in self.included_content_types or 'all' in self.included_content_types:
                    if os.path.isfile(
                            os.path.join(self.local_path, item.filename)):
                        evaluate_meta.append(item)
                else:
                    evaluate_meta.append(item)

        evaluate_meta.save(filename=evaluate_filename,
                           csv_header=True,
                           file_format="CSV")

        # Test meta is filenames of evaluation, labels will be predicted
        test_meta_strong_fold2 = MetaDataContainer()
        for filename in evaluate_meta.unique_files:
            test_meta_strong_fold2.append(MetaDataItem({'filename': filename}))

        test_meta_strong_fold2.save(filename=test_filename_fold2,
                                    csv_header=True,
                                    file_format="CSV")

        # meta_data is the default meta container containing all files of the dataset
        meta_data = MetaDataContainer()
        meta_data += MetaDataContainer().load(train_filename_fold1,
                                              csv_header=True,
                                              file_format="CSV")
        meta_data += MetaDataContainer().load(test_filename_fold1,
                                              csv_header=True,
                                              file_format="CSV")
        meta_data += MetaDataContainer().load(test_filename_fold2,
                                              csv_header=True,
                                              file_format="CSV")

        # Save meta
        meta_data.save(filename=self.meta_file)

    log.foot()

    return self
def prepare(self):
    """Prepare dataset for the usage.

    Returns
    -------
    self

    """
    # Make sure evaluation_setup directory exists
    Path().makedirs(
        path=os.path.join(self.local_path, self.evaluation_setup_folder))

    reference_data_file = os.path.join(
        self.local_path, 'groundtruth_strong_label_evaluation_set.csv')

    if not self.meta_container.exists() and os.path.exists(
            reference_data_file):
        # Reference data is present and but meta data is empty
        meta_data = MetaDataContainer()
        ref_data = MetaDataContainer().load(filename=reference_data_file)
        for item in ref_data:
            # Modify audio file path: local files are prefixed with 'Y' and
            # use the dataset's default audio extension.
            item.filename = os.path.join(
                'Y' + os.path.splitext(item.filename)[0] + '.' +
                self.default_audio_extension)

            # Set scene label
            item.scene_label = 'youtube'

            # Only collect items which exists
            if os.path.isfile(os.path.join(self.local_path, item.filename)):
                meta_data.append(item)

        # Save meta data
        meta_data.save(filename=self.meta_container.filename)

        # Load meta and cross validation
        self.load()

    test_filename = self.evaluation_setup_filename(setup_part='test',
                                                   scene_label='youtube',
                                                   file_extension='txt')

    evaluate_filename = self.evaluation_setup_filename(
        setup_part='evaluate', scene_label='youtube', file_extension='txt')

    # Check that evaluation setup exists
    evaluation_setup_exists = True
    if not os.path.isfile(test_filename) or not os.path.isfile(
            evaluate_filename):
        evaluation_setup_exists = False

    if not evaluation_setup_exists:
        # Evaluate setup is built from the strong-label reference data.
        if os.path.exists(reference_data_file):
            ref_data = MetaDataContainer().load(
                filename=reference_data_file)
            evaluate_meta = MetaDataContainer()
            for item in ref_data:
                # Modify audio file path
                if not item.filename.endswith(
                        self.default_audio_extension):
                    item.filename = os.path.join(
                        'audio',
                        'Y' + os.path.splitext(item.filename)[0] + '.' +
                        self.default_audio_extension)

                # Set scene label
                item.scene_label = 'youtube'

                self.process_meta_item(item=item, absolute_path=False)

                evaluate_meta.append(item)

            evaluate_meta.save(filename=self.evaluation_setup_filename(
                setup_part='evaluate',
                scene_label='youtube',
                file_extension='txt'))

        # Test setup lists all local audio files (filenames only,
        # labels will be predicted).
        audio_files = Path().file_list(path=self.local_path,
                                       extensions=self.audio_extensions)
        test_meta = MetaDataContainer()
        for audio_file in audio_files:
            item = MetaDataItem({
                'filename': os.path.split(audio_file)[1],
                'scene_label': 'youtube'
            })
            self.process_meta_item(item=item, absolute_path=False)
            test_meta.append(item)

        test_meta.save(filename=self.evaluation_setup_filename(
            setup_part='test',
            scene_label='youtube',
            file_extension='txt'))

    # Load meta and cross validation
    self.load()

    return self
def prepare(self):
    """Prepare dataset for the usage.

    Downloads missing audio segments from YouTube (when 'audio' is among
    the included content types), keeping a persistent log of videos that
    could not be accessed, then generates fold-1 train/test/evaluate
    setup files and the meta container when they are missing.

    Returns
    -------
    self

    """
    if is_jupyter():
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    # Make sure audio directory exists
    Path().makedirs(path=os.path.join(self.local_path, 'audio'))

    # Make sure evaluation_setup directory exists
    Path().makedirs(
        path=os.path.join(self.local_path, self.evaluation_setup_folder))

    if 'audio' in self.included_content_types:
        # Collect file ids
        files = []
        files += ListDictContainer(filename=os.path.join(
            self.local_path, 'testing_set.csv')).load(
                fields=['query_id', 'segment_start', 'segment_end'])

        files += ListDictContainer(filename=os.path.join(
            self.local_path, 'training_set.csv')).load(
                fields=['query_id', 'segment_start', 'segment_end'])

        file_progress = tqdm(files,
                             desc="{0: <25s}".format('Files'),
                             file=sys.stdout,
                             leave=False,
                             disable=self.disable_progress_bar,
                             ascii=self.use_ascii_progress_bar)

        non_existing_videos = {}

        # Load list of already identified non-accessible videos
        item_access_log_filename = os.path.join(
            self.local_path, 'item_access_error.log.csv')
        if os.path.isfile(item_access_log_filename):
            for item in ListDictContainer(
                    filename=item_access_log_filename).load(
                        fields=['query_id', 'error']):
                non_existing_videos[item['query_id']] = item

        # Check that audio files exists
        for file_data in file_progress:
            audio_filename = os.path.join(
                self.local_path, 'audio',
                'Y{query_id}_{segment_start}_{segment_end}.{extension}'.format(
                    query_id=file_data['query_id'],
                    segment_start=file_data['segment_start'],
                    segment_end=file_data['segment_end'],
                    extension=self.default_audio_extension))

            # Download segment if it does not exists
            if not os.path.isfile(audio_filename) and file_data[
                    'query_id'] not in non_existing_videos:
                try:
                    AudioContainer().load_from_youtube(
                        query_id=file_data['query_id'],
                        start=file_data['segment_start'],
                        stop=file_data['segment_end']).save(
                            filename=audio_filename)

                except IOError as e:
                    # BUGFIX: exceptions have no `.message` attribute in
                    # Python 3 (PEP 352); accessing it raised AttributeError
                    # here instead of logging the inaccessible video.
                    non_existing_videos[file_data['query_id']] = {
                        'error': str(e).replace('\n', ' '),
                        'query_id': file_data['query_id']
                    }

        # Save list of non-accessible videos
        ListDictContainer(list(non_existing_videos.values()),
                          filename=item_access_log_filename).save(
                              fields=['query_id', 'error'])

    # Evaluation setup filenames
    train_filename = self.evaluation_setup_filename(setup_part='train',
                                                    fold=1,
                                                    scene_label='youtube',
                                                    file_extension='txt')

    test_filename = self.evaluation_setup_filename(setup_part='test',
                                                   fold=1,
                                                   scene_label='youtube',
                                                   file_extension='txt')

    evaluate_filename = self.evaluation_setup_filename(
        setup_part='evaluate',
        fold=1,
        scene_label='youtube',
        file_extension='txt')

    # Check that evaluation setup exists
    evaluation_setup_exists = True
    if not os.path.isfile(train_filename) or not os.path.isfile(
            test_filename) or not os.path.isfile(evaluate_filename):
        evaluation_setup_exists = False

    if not evaluation_setup_exists:
        # Evaluation setup was not found, generate one
        fold = 1

        train_meta = MetaDataContainer()
        for item in MetaDataContainer().load(
                os.path.join(self.local_path,
                             'groundtruth_weak_label_training_set.csv')):
            if not item.filename.endswith(self.default_audio_extension):
                item.filename = os.path.join(
                    'audio',
                    'Y' + os.path.splitext(item.filename)[0] + '.'
                    + self.default_audio_extension)

            # Set scene label
            item.scene_label = 'youtube'

            # Translate event onset and offset, weak labels
            item.offset -= item.onset
            item.onset -= item.onset

            # Only collect items which exists if audio present
            if 'audio' in self.included_content_types:
                if os.path.isfile(
                        os.path.join(self.local_path, item.filename)):
                    train_meta.append(item)
            else:
                train_meta.append(item)

        train_meta.save(
            filename=self.evaluation_setup_filename(setup_part='train',
                                                    fold=fold,
                                                    scene_label='youtube',
                                                    file_extension='txt'))

        evaluate_meta = MetaDataContainer()
        for item in MetaDataContainer().load(
                os.path.join(self.local_path,
                             'groundtruth_strong_label_testing_set.csv')):
            if not item.filename.endswith(self.default_audio_extension):
                item.filename = os.path.join(
                    'audio',
                    'Y' + os.path.splitext(item.filename)[0] + '.'
                    + self.default_audio_extension)

            # Set scene label
            item.scene_label = 'youtube'

            # Only collect items which exists
            if 'audio' in self.included_content_types:
                if os.path.isfile(
                        os.path.join(self.local_path, item.filename)):
                    evaluate_meta.append(item)
            else:
                evaluate_meta.append(item)

        evaluate_meta.save(
            filename=self.evaluation_setup_filename(setup_part='evaluate',
                                                    fold=fold,
                                                    scene_label='youtube',
                                                    file_extension='txt'))

        # The 'test' list mirrors 'evaluate' but carries filenames only
        test_meta = MetaDataContainer()
        for item in evaluate_meta:
            test_meta.append(MetaDataItem({'filename': item.filename}))

        test_meta.save(
            filename=self.evaluation_setup_filename(setup_part='test',
                                                    fold=fold,
                                                    scene_label='youtube',
                                                    file_extension='txt'))

        # Load meta and cross validation
        self.load()

    if not self.meta_container.exists():
        fold = 1
        meta_data = MetaDataContainer()
        meta_data += MetaDataContainer().load(
            self.evaluation_setup_filename(setup_part='train',
                                           fold=fold,
                                           scene_label='youtube',
                                           file_extension='txt'))

        meta_data += MetaDataContainer().load(
            self.evaluation_setup_filename(setup_part='evaluate',
                                           fold=fold,
                                           scene_label='youtube',
                                           file_extension='txt'))

        # Save meta
        meta_data.save(filename=self.meta_file)

        # Load meta and cross validation
        self.load()

    return self
def pack(self,
         dataset_name='dcase-dataset',
         content=None,
         output_path=None,
         base_path=None,
         overwrite=False,
         verbose=True):
    """Pack dataset.

    Parameters
    ----------
    dataset_name : str
        Dataset name
        Default value 'dcase-dataset'

    content : list of dict
        List of packages to be packed. Package item dict should have format
        {'data_name': 'doc', 'file_list': [{'source': 'file1.txt'}]}.
        Default value None

    output_path : str
        Path to which packages are saved.
        Default value None

    base_path : str
        Base path of the data. If per item package paths are not given
        ('target' field), this parameter is used to create one from source
        path.
        Default value None

    overwrite : bool
        Overwrite existing packages.
        Default value False

    verbose : bool
        Show information during the packing.
        Default value True

    Returns
    -------
    nothing

    """
    # NOTE(review): content, output_path and base_path default to None but
    # are used unguarded below (iteration, startswith, str.replace) — the
    # method effectively requires all three; confirm callers always pass them.
    if verbose:
        log = FancyLogger()
        log.section_header('Packing dataset [{dataset_name}]'.format(
            dataset_name=dataset_name))

    # Normalize base_path to end with a path separator so prefix stripping
    # below removes it cleanly.
    if base_path is not None and not base_path.endswith(os.path.sep):
        base_path += os.path.sep

    for group in content:
        if verbose:
            log.line('[{data_name}]'.format(data_name=group['data_name']))

        package_filename = os.path.join(
            output_path,
            self.filename_template.format(
                dataset_name=dataset_name,
                data_name=group['data_name'],
                extension=self.package_extension))

        # Validate sources, fill in missing 'target' fields and find the
        # newest source modification time for staleness comparison.
        newest_source = 0
        for item in group['file_list']:
            if not os.path.exists(item['source']):
                message = '{name}: File not found [{source_file}].'.format(
                    name=self.__class__.__name__,
                    source_file=item['source'])
                self.logger.exception(message)
                raise IOError(message)

            if 'target' not in item:
                # Derive target path by stripping base_path from the source
                if item['source'].startswith(base_path):
                    item['target'] = item['source'][len(base_path):]
                else:
                    item['target'] = item['source']

            timestamp = os.path.getmtime(item['source'])
            if newest_source < timestamp:
                newest_source = timestamp

        # Get newest package, take care of split packages
        all_packages = Path().file_list(
            path=os.path.split(os.path.abspath(package_filename))[0],
            extensions=os.path.splitext(package_filename)[1][1:])
        newest_package = 0
        for package in all_packages:
            base_name = os.path.splitext(os.path.split(package)[-1])[0]
            if base_name[-1].isdigit():
                # Split packages carry a trailing '.<id>' before the
                # extension; strip it to compare against the base name.
                base_name = os.path.splitext(base_name)[0]

            if base_name == os.path.splitext(
                    os.path.split(package_filename)[-1])[0]:
                timestamp = os.path.getmtime(package)
                if newest_package < timestamp:
                    newest_package = timestamp

    # Repack only when a source is newer than every existing package,
    # or when overwrite is requested.
        if newest_package < newest_source or overwrite:
            if self.convert_md_to_html:
                # Check for markdown content
                new_files = []
                for item in group['file_list']:
                    if os.path.splitext(item['source'])[-1] == '.md':
                        # Convert when the .html twin is missing, stale,
                        # or overwrite is forced.
                        if not os.path.exists(
                                os.path.splitext(item['source'])[0] + '.html'
                        ) or (os.path.exists(
                                os.path.splitext(item['source'])[0] + '.html')
                              and os.path.getmtime(
                                  item['source']) > os.path.getmtime(
                                      os.path.splitext(item['source'])[0]
                                      + '.html')) or overwrite:
                            # Convert
                            self.convert_markdown(
                                source_filename=item['source'],
                                target_filename=os.path.splitext(
                                    item['source'])[0] + '.html')

                        new_files.append({
                            'source':
                            os.path.splitext(item['source'])[0] + '.html',
                            'target':
                            os.path.splitext(item['target'])[0] + '.html'
                        })

                # Add new html files to the file_list
                group['file_list'] += new_files

            # Create packages
            package = Package(filename=package_filename)
            package_filenames = package.compress(
                file_list=group['file_list'],
                size_limit=self.package_size_limit)

            if verbose:
                log.line('Saved', indent=2)
                for i in package_filenames:
                    log.line('[{file}] [{size}]'.format(
                        file=i.replace(base_path, ''),
                        size=get_byte_string(os.path.getsize(i),
                                             show_bytes=False)),
                        indent=4)

    if verbose:
        log.foot()