def convert(args): # TODO docstring # Unpack the arguments. infname = args.infname outdir = args.outdir lang = args.language verbose = args.verbose trs_only = args.only_transcriptions ignore_list_file = args.ignore # Read in the ignore list. ignore_paths = set() ignore_globs = set() if ignore_list_file: for path_or_glob in ignore_list_file: path_or_glob = path_or_glob.rstrip('\n') # For lines that list absolute paths, if os.path.abspath(path_or_glob) == os.path.normpath(path_or_glob): # add them to the list of paths to ignore. ignore_paths.add(path_or_glob) # For other lines, treat them as basename globs. else: ignore_globs.add(path_or_glob) ignore_list_file.close() # Get all but the ignored transcriptions. if os.path.isdir(infname): trs_paths = find(infname, '*.trs', ignore_globs=ignore_globs, ignore_paths=ignore_paths) else: trs_paths = list() with open(infname, 'r') as inlist: for line in inlist: trs_paths.extend(find(line.strip(), '*.trs', mindepth=1, maxdepth=1, ignore_globs=ignore_globs, ignore_paths=ignore_paths)) size = 0 n_overwrites = 0 n_missing_wav = 0 n_missing_trs = 0 for trs_path in trs_paths: if verbose: print u"Processing transcription file: ", trs_path cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \ extract_wavs_trns(trs_path, outdir, trs_only, lang, verbose) size += cursize n_overwrites += cur_n_overwrites n_missing_wav += cur_n_missing_wav n_missing_trs += cur_n_missing_trs print u"Size of copied audio data:", size sec = size / (2*16000) hour = sec / 3600.0 print u"Length of audio data in hours (for 16kHz 16bit WAVs output):", hour # Return the number of file collisions and overwrites. return n_overwrites, n_missing_wav, n_missing_trs
def get_call_data_from_fs(rootdir):
    """
    Gathers per-phone statistics from the call-log directories (`voip-*')
    that `find' reports for `rootdir'.

    Call-log directory basenames are assumed to look like
        voip-<phone>-<YYMMDD_HHMMSS>
    and the call-log files inside them like
        jurcic-<num>-<YYMMDD_HHMMSS>_<ms-start>_<ms-end>.wav
    where num =~ /\d\d\d/, ms-start =~ /\d{7}/ and ms-end =~ /\d{7}/
    (ms-start and ms-end actually denote hundredths of seconds, not
    milliseconds).

    Returns a pair (call_size, call_timestamps):
        call_size -- maps a phone number to the summed size in bytes of
            the matching wavs in all of its call-log directories
        call_timestamps -- maps a phone number to the list of timestamps
            of its calls, one per call-log directory

    """
    wav_basename_glob = 'jurcic-???-??????_??????_???????_???????.wav'

    voip_dirs = find(rootdir, 'voip-*', mindepth=0, prune=True)
    # Each record is the dash-split basename with item 0 overwritten by
    # the full directory path (presumably what `set_and_ret' does --
    # verify against its definition): [path, phone, date string, ...].
    records = [set_and_ret(basename(voip_dir).split('-'), 0, voip_dir)
               for voip_dir in voip_dirs]

    call_size = {}
    call_timestamps = {}
    for record in records:
        voip_path, phone, date_str = record[0], record[1], record[2]

        # Turn the YYMMDD_HHMMSS string into a timestamp; the character at
        # index 6 (the separator) is skipped.
        started = datetime(year=2000 + int(date_str[:2]),
                           month=int(date_str[2:4]),
                           day=int(date_str[4:6]),
                           hour=int(date_str[7:9]),
                           minute=int(date_str[9:11]),
                           second=int(date_str[11:13]))
        timestamp = get_timestamp(started)

        # Total size in bytes of the wavs logged for this call.
        wav_pattern = os.path.join(voip_path, wav_basename_glob)
        total = sum(getsize(wav) for wav in iglob(wav_pattern))

        # Record the timestamp and add the size to the phone's total.
        call_timestamps.setdefault(phone, []).append(timestamp)
        call_size[phone] = call_size.get(phone, 0) + total

    return call_size, call_timestamps
def get_call_data_from_fs(rootdir):
    """
    Gathers per-phone statistics from the call-log directories (`voip-*')
    that `find' reports for `rootdir'.

    Call-log directory basenames are assumed to look like
        voip-<phone>-<YYMMDD_HHMMSS>
    and the call-log files inside them like
        jurcic-<num>-<YYMMDD_HHMMSS>_<ms-start>_<ms-end>.wav
    where num =~ /\d\d\d/, ms-start =~ /\d{7}/ and ms-end =~ /\d{7}/
    (ms-start and ms-end actually denote hundredths of seconds, not
    milliseconds).

    Returns a pair (call_size, call_timestamps):
        call_size -- maps a phone number to the summed size in bytes of
            the matching wavs in all of its call-log directories
        call_timestamps -- maps a phone number to the list of timestamps
            of its calls, one per call-log directory

    """
    wav_basename_glob = 'jurcic-???-??????_??????_???????_???????.wav'

    voip_dirs = find(rootdir, 'voip-*', mindepth=0, prune=True)
    # Each record is the dash-split basename with item 0 overwritten by
    # the full directory path (presumably what `set_and_ret' does --
    # verify against its definition): [path, phone, date string, ...].
    records = [set_and_ret(basename(voip_dir).split('-'), 0, voip_dir)
               for voip_dir in voip_dirs]

    call_size = {}
    call_timestamps = {}
    for record in records:
        voip_path, phone, date_str = record[0], record[1], record[2]

        # Turn the YYMMDD_HHMMSS string into a timestamp; the character at
        # index 6 (the separator) is skipped.
        started = datetime(year=2000 + int(date_str[:2]),
                           month=int(date_str[2:4]),
                           day=int(date_str[4:6]),
                           hour=int(date_str[7:9]),
                           minute=int(date_str[9:11]),
                           second=int(date_str[11:13]))
        timestamp = get_timestamp(started)

        # Total size in bytes of the wavs logged for this call.
        wav_pattern = os.path.join(voip_path, wav_basename_glob)
        total = sum(getsize(wav) for wav in iglob(wav_pattern))

        # Record the timestamp and add the size to the phone's total.
        call_timestamps.setdefault(phone, []).append(timestamp)
        call_size[phone] = call_size.get(phone, 0) + total

    return call_size, call_timestamps
# Command-line options: input directory, output directory, verbosity.
# (`parser' is created before this point.)
parser.add_argument('-i', '--input', action="store",
                    help='an input directory with all wav files')
parser.add_argument('-o', '--output', action="store",
                    help='an output directory for the converted wav')
parser.add_argument('-v', action="store_true", dest="verbose",
                    help='set verbose output')
args = parser.parse_args()

# Collect the transcription (*.trn) files found for the input directory.
trn_files = find(args.input, '*.trn', mindepth=1, maxdepth=5)
for fn in trn_files:
    if args.verbose:
        print "Processing file:", fn
    # Resolve symlinks so that the wav is looked up next to the real file.
    real_fn = os.path.realpath(fn)
    base_fn = os.path.basename(fn)
    # The wav path is the transcription path without '.trn'.
    # NOTE(review): str.replace removes every '.trn' occurrence in the
    # path, not only the trailing extension -- confirm this is intended.
    wav_fn = real_fn.replace('.trn', '')
    trn_fn = real_fn
    # Skip the pair unless both the wav and its '<wav>.trn' exist.
    if not os.path.exists(wav_fn) or not os.path.exists(wav_fn+'.trn'):
        print "Does not exists {fn} or {fnt}".format(fn=wav_fn, fnt=wav_fn+'.trn')
        continue
help='a list of test wav files') parser.add_argument('-v', action="store_true", dest="verbose", help='set verbose output') args = parser.parse_args() with open(args.devlist, 'r') as f: dev_files = set([os.path.basename(fn.strip()) for fn in f.readlines()]) with open(args.testlist, 'r') as f: test_files = set([os.path.basename(fn.strip()) for fn in f.readlines()]) all_files = find(args.all, '*.wav', mindepth=1, maxdepth=5) for fn in all_files: if args.verbose: print "Processing file:", fn base_fn = os.path.basename(fn) if base_fn in dev_files: os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn), tgt = os.path.join(args.dev, base_fn))) os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn+'.trn'), tgt = os.path.join(args.dev, base_fn+'.trn'))) elif base_fn in test_files: os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn), tgt = os.path.join(args.test, base_fn))) os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn+'.trn'), tgt = os.path.join(args.test, base_fn+'.trn'))) else: os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn), tgt = os.path.join(args.train, base_fn)))
# Command-line options: input directory, output directory, verbosity.
# (`parser' is created before this point.)
parser.add_argument('-i', '--input', action="store",
                    help='an input directory with all wav files')
parser.add_argument('-o', '--output', action="store",
                    help='an output directory for the converted wav')
parser.add_argument('-v', action="store_true", dest="verbose",
                    help='set verbose output')
args = parser.parse_args()

# Collect the transcription (*.trn) files found for the input directory.
trn_files = find(args.input, '*.trn', mindepth=1, maxdepth=5)
for fn in trn_files:
    if args.verbose:
        print "Processing file:", fn
    # Resolve symlinks so that the wav is looked up next to the real file.
    real_fn = os.path.realpath(fn)
    base_fn = os.path.basename(fn)
    # The wav path is the transcription path without '.trn'.
    # NOTE(review): str.replace removes every '.trn' occurrence in the
    # path, not only the trailing extension -- confirm this is intended.
    wav_fn = real_fn.replace('.trn', '')
    trn_fn = real_fn
    # Skip the pair unless both the wav and its '<wav>.trn' exist.
    if not os.path.exists(wav_fn) or not os.path.exists(wav_fn + '.trn'):
        print "Does not exists {fn} or {fnt}".format(fn=wav_fn, fnt=wav_fn + '.trn')
        continue
def convert(args): # TODO docstring # Unpack the arguments. infname = args.infname outdir = args.outdir lang = args.language verbose = args.verbose trs_only = args.only_transcriptions ignore_list_file = args.ignore # Read in the ignore list. ignore_paths = set() ignore_globs = set() if ignore_list_file: for path_or_glob in ignore_list_file: path_or_glob = path_or_glob.rstrip('\n') # For lines that list absolute paths, if os.path.abspath(path_or_glob) == os.path.normpath(path_or_glob): # add them to the list of paths to ignore. ignore_paths.add(path_or_glob) # For other lines, treat them as basename globs. else: ignore_globs.add(path_or_glob) ignore_list_file.close() # Get all but the ignored transcriptions. if os.path.isdir(infname): trs_paths = find(infname, '*.trs', mindepth=1, ignore_globs=ignore_globs, ignore_paths=ignore_paths) else: trs_paths = list() with open(infname, 'r') as inlist: for line in inlist: trs_paths.extend( find(line.strip(), '*.trs', mindepth=1, maxdepth=1, ignore_globs=ignore_globs, ignore_paths=ignore_paths)) size = 0 n_overwrites = 0 n_missing_wav = 0 n_missing_trs = 0 for trs_path in trs_paths: if verbose: print u"Processing transcription file: ", trs_path cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \ extract_wavs_trns(trs_path, outdir, trs_only, lang, verbose) size += cursize n_overwrites += cur_n_overwrites n_missing_wav += cur_n_missing_wav n_missing_trs += cur_n_missing_trs print u"Size of copied audio data:", size sec = size / (2 * 16000) hour = sec / 3600.0 print u"Length of audio data in hours (for 16kHz 16bit WAVs output):", hour # Return the number of file collisions and overwrites. return n_overwrites, n_missing_wav, n_missing_trs
def find_with_ignorelist(infname, pat, ignore_list_file=None,
                         find_kwargs=None):
    """
    Finds specific files below the paths specified and returns their
    filenames.

    Arguments:
        pat -- globbing pattern specifying the files to look for
        infname -- either a directory, or a file.  In the first case, wavs
            are looked for below that directory.  In the latter case, the
            file is read line by line, each line specifying a directory or
            a glob determining the wav to include.
        ignore_list_file -- a file of absolute paths or globs (can be
            mixed) specifying wavs that should be excluded from the
            results; it is closed once it has been read
        find_kwargs -- if provided, this dictionary is used as additional
            keyword arguments for the function `utils.fs.find' for finding
            positive examples of files (not the ignored ones); the
            caller's dictionary is copied, never modified

    Returns a set of paths to files satisfying the criteria.

    """
    # Work on a private copy: a shared default dictionary or the caller's
    # own one must not pick up the 'mindepth' key written further below.
    find_kwargs = {} if find_kwargs is None else dict(find_kwargs)

    # Read in the ignore list.
    ignore_paths = set()
    ignore_globs = set()
    if ignore_list_file:
        # Close the file even if reading it fails half-way.
        try:
            for path_or_glob in ignore_list_file:
                path_or_glob = path_or_glob.rstrip("\n")
                # For lines that list absolute paths,
                if (os.path.abspath(path_or_glob)
                        == os.path.normpath(path_or_glob)):
                    # add them to the list of paths to ignore.
                    ignore_paths.add(path_or_glob)
                # For other lines, treat them as basename globs.
                else:
                    ignore_globs.add(path_or_glob)
        finally:
            ignore_list_file.close()

    # Get all files matching `pat', skipping ignore globs and ignore paths.
    #
    # First option: the infile is actually a directory.  Then, take all
    # matching files from below that directory.
    if os.path.isdir(infname):
        find_kwargs = _build_find_kwargs(find_kwargs,
                                         ignore_globs=ignore_globs,
                                         ignore_paths=ignore_paths)
        if "mindepth" not in find_kwargs:
            find_kwargs["mindepth"] = 1
        file_paths = set(find(infname, pat, **find_kwargs))
    # Second option: the infile is a file listing all paths to check for
    # matching files.
    else:
        file_paths = set()
        find_kwargs = _build_find_kwargs(find_kwargs,
                                         mindepth=1,
                                         maxdepth=1,
                                         ignore_globs=ignore_globs,
                                         ignore_paths=ignore_paths)
        with open(infname, "r") as inlist:
            for line in inlist:
                line = line.rstrip("\n")
                # If the line contains directories:
                if os.path.isdir(line):
                    file_paths.update(find(line, pat, **find_kwargs))
                # If it is not a directory name, treat the line as a file
                # glob.
                else:
                    file_paths.update(
                        os.path.abspath(fname) for fname in glob.glob(line))

    # Find all files in ignore paths and remove them from the returned
    # files, to be sure that symlinks from other, not ignored paths did not
    # add them.
    for ignore_path in ignore_paths:
        file_paths.difference_update(
            find(ignore_path, pat, mindepth=1, maxdepth=1))
    for ignore_glob in ignore_globs:
        file_paths.difference_update(
            os.path.abspath(fname) for fname in glob.glob(ignore_glob))
    return file_paths
action="store", help='a list of test wav files') parser.add_argument('-v', action="store_true", dest="verbose", help='set verbose output') args = parser.parse_args() with open(args.devlist, 'r') as f: dev_files = set([os.path.basename(fn.strip()) for fn in f.readlines()]) with open(args.testlist, 'r') as f: test_files = set( [os.path.basename(fn.strip()) for fn in f.readlines()]) all_files = find(args.all, '*.wav', mindepth=1, maxdepth=5) for fn in all_files: if args.verbose: print "Processing file:", fn base_fn = os.path.basename(fn) if base_fn in dev_files: os.system("ln -s {src} {tgt}".format( src=os.path.join('..', args.all, base_fn), tgt=os.path.join(args.dev, base_fn))) os.system("ln -s {src} {tgt}".format( src=os.path.join('..', args.all, base_fn + '.trn'), tgt=os.path.join(args.dev, base_fn + '.trn'))) elif base_fn in test_files:
def find_with_ignorelist(infname, pat, ignore_list_file=None,
                         find_kwargs=None):
    """
    Finds specific files below the paths specified and returns their
    filenames.

    Arguments:
        pat -- globbing pattern specifying the files to look for
        infname -- either a directory, or a file.  In the first case, wavs
            are looked for below that directory.  In the latter case, the
            file is read line by line, each line specifying a directory or
            a glob determining the wav to include.
        ignore_list_file -- a file of absolute paths or globs (can be
            mixed) specifying wavs that should be excluded from the
            results; it is closed once it has been read
        find_kwargs -- if provided, this dictionary is used as additional
            keyword arguments for the function `utils.fs.find' for finding
            positive examples of files (not the ignored ones); the
            caller's dictionary is copied, never modified

    Returns a set of paths to files satisfying the criteria.

    """
    # Work on a private copy: a shared default dictionary or the caller's
    # own one must not pick up the 'mindepth' key written further below.
    find_kwargs = {} if find_kwargs is None else dict(find_kwargs)

    # Read in the ignore list.
    ignore_paths = set()
    ignore_globs = set()
    if ignore_list_file:
        # Close the file even if reading it fails half-way.
        try:
            for path_or_glob in ignore_list_file:
                path_or_glob = path_or_glob.rstrip('\n')
                # For lines that list absolute paths,
                if (os.path.abspath(path_or_glob)
                        == os.path.normpath(path_or_glob)):
                    # add them to the list of paths to ignore.
                    ignore_paths.add(path_or_glob)
                # For other lines, treat them as basename globs.
                else:
                    ignore_globs.add(path_or_glob)
        finally:
            ignore_list_file.close()

    # Get all files matching `pat', skipping ignore globs and ignore paths.
    #
    # First option: the infile is actually a directory.  Then, take all
    # matching files from below that directory.
    if os.path.isdir(infname):
        find_kwargs = _build_find_kwargs(find_kwargs,
                                         ignore_globs=ignore_globs,
                                         ignore_paths=ignore_paths)
        if 'mindepth' not in find_kwargs:
            find_kwargs['mindepth'] = 1
        file_paths = set(find(infname, pat, **find_kwargs))
    # Second option: the infile is a file listing all paths to check for
    # matching files.
    else:
        file_paths = set()
        find_kwargs = _build_find_kwargs(find_kwargs,
                                         mindepth=1,
                                         maxdepth=1,
                                         ignore_globs=ignore_globs,
                                         ignore_paths=ignore_paths)
        with open(infname, 'r') as inlist:
            for line in inlist:
                line = line.rstrip('\n')
                # If the line contains directories:
                if os.path.isdir(line):
                    file_paths.update(find(line, pat, **find_kwargs))
                # If it is not a directory name, treat the line as a file
                # glob.
                else:
                    file_paths.update(
                        os.path.abspath(fname) for fname in glob.glob(line))

    # Find all files in ignore paths and remove them from the returned
    # files, to be sure that symlinks from other, not ignored paths did not
    # add them.
    for ignore_path in ignore_paths:
        file_paths.difference_update(
            find(ignore_path, pat, mindepth=1, maxdepth=1))
    for ignore_glob in ignore_globs:
        file_paths.difference_update(
            os.path.abspath(fname) for fname in glob.glob(ignore_glob))
    return file_paths