def read_sents(self, filename, filter_ids=None):
  if self.vocab is None:
    self.vocab = Vocab()

  def convert(line, segmentation):
    # Map characters to vocab ids, append the sentence-end token, and attach
    # the integer segment boundaries as a "segment" annotation.
    line = line.strip().split()
    ret = AnnotatedSentenceInput(list(map(self.vocab.convert, line)) +
                                 [self.vocab.convert(Vocab.ES_STR)])
    ret.annotate("segment", list(map(int, segmentation.strip().split())))
    return ret

  if not isinstance(filename, list):
    # A single string may still encode a two-element list literal; if it does not,
    # fall back to plain text reading.
    try:
      filename = ast.literal_eval(filename)
    except (ValueError, SyntaxError):
      logger.debug("Reading %s with a PlainTextReader instead..." % filename)
      return super(SegmentationTextReader, self).read_sents(filename)

  max_id = None
  if filter_ids is not None:
    max_id = max(filter_ids)
    filter_ids = set(filter_ids)
  data = []
  with open(filename[0], encoding='utf-8') as char_inp, \
       open(filename[1], encoding='utf-8') as seg_inp:
    for sent_count, (char_line, seg_line) in enumerate(zip(char_inp, seg_inp)):
      if filter_ids is None or sent_count in filter_ids:
        data.append(convert(char_line, seg_line))
      if max_id is not None and sent_count > max_id:
        break
  return data
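# Illustration only (hypothetical file names): read_sents expects a two-element list
# [character_file, segmentation_file], holding one sentence of space-separated
# characters per line and the corresponding integer segment boundaries per line.
with open("toy.chars", "w", encoding="utf-8") as f:
  f.write("h e l l o w o r l d\n")
with open("toy.segs", "w", encoding="utf-8") as f:
  f.write("4 9\n")  # boundaries after "hello" (index 4) and "world" (index 9)
# reader.read_sents(["toy.chars", "toy.segs"]) would then return a single
# AnnotatedSentenceInput carrying the "segment" annotation [4, 9].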
def _augment_data_initial(self):
  """Called before loading the corpus for the first time, if reload_command is given."""
  augment_command = self.reload_command
  logger.debug('initial augmentation')
  if self._augmentation_handle is None:
    # first run: launch the augmentation command and block until it finishes
    self._augmentation_handle = Popen(augment_command + " --epoch 0", shell=True)
    self._augmentation_handle.wait()
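# Minimal sketch of the same pattern (hypothetical command string): the configured
# reload_command is run once through the shell with an " --epoch 0" suffix appended,
# and the caller waits for it to finish before the corpus is loaded.
from subprocess import Popen
handle = Popen("echo augmenting-data" + " --epoch 0", shell=True)
handle.wait()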
def extract_to(self, in_file: str, out_file: str) -> None:
  """
  Args:
    in_file: yaml file that contains a list of dictionaries.
             Each dictionary contains:
             - wav (str): path to wav file
             - offset (float): start time stamp (optional)
             - duration (float): stop time stamp (optional)
             - speaker: speaker id for normalization (optional; if not given, the
               file name is used as speaker id)
    out_file: a file name ending in ".h5"
  """
  import librosa

  if not out_file.endswith(".h5"):
    raise ValueError(f"out_file must end in '.h5', was '{out_file}'")

  start_time = time.time()

  with open(in_file) as in_stream, \
       h5py.File(out_file, "w") as hf:
    db = yaml.load(in_stream, Loader=yaml.Loader)

    # Group utterances by speaker so that normalization is done per speaker.
    db_by_speaker = defaultdict(list)
    for db_index, db_item in enumerate(db):
      speaker_id = db_item.get("speaker", db_item["wav"].split("/")[-1])
      db_item["index"] = db_index
      db_by_speaker[speaker_id].append(db_item)

    for speaker_id in db_by_speaker.keys():
      data = []
      for db_item in db_by_speaker[speaker_id]:
        y, sr = librosa.load(db_item["wav"], sr=16000,
                             offset=db_item.get("offset", 0.0),
                             duration=db_item.get("duration", None))
        if len(y) * 40 < sr:
          logger.warning(f"Encountered a short audio with only {len(y)} values. "
                         f"Filling up with zeros to extract filterbank features..")
          missing_len = sr - len(y) * 40
          y = np.pad(y, (0, missing_len), mode='constant')
          # raise ValueError(f"encountered an empty or out of bounds segment: {db_item}")
        logmel = speech_features.logfbank(y, samplerate=sr, nfilt=self.nfilt)
        if self.delta:
          delta = speech_features.calculate_delta(logmel)
          features = np.concatenate([logmel, delta], axis=1)
        else:
          features = logmel
        data.append(features)

      # Normalize with the speaker-level mean and std, then store one HDF5 dataset
      # per utterance, keyed by its original index in the yaml list.
      mean, std = speech_features.get_mean_std(np.concatenate(data))
      for features, db_item in zip(data, db_by_speaker[speaker_id]):
        features = speech_features.normalize(features, mean, std)
        hf.create_dataset(str(db_item["index"]), data=features)

  logger.debug(f"feature extraction took {time.time()-start_time:.3f} seconds")
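# Illustration only (hypothetical paths): the yaml list of dictionaries that
# extract_to() expects as in_file, written out with PyYAML.
import yaml

db = [
  {"wav": "audio/utt1.wav", "speaker": "spk1"},
  {"wav": "audio/utt2.wav", "offset": 0.5, "duration": 2.0, "speaker": "spk1"},
  {"wav": "audio/utt3.wav"},  # no speaker given -> "utt3.wav" is used as speaker id
]
with open("features.yaml", "w") as f:
  yaml.dump(db, f)
# extractor.extract_to("features.yaml", "features.h5") would then write one HDF5
# dataset per utterance, keyed by its position in the list.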
def tokenize_stream(self, stream):
  """
  Tokenize a file-like text stream.

  Args:
    stream: A file-like stream of untokenized text
  Returns:
    A generator over the tokenized lines of the stream
  """
  logger.debug("****** calling tokenize_stream {}".format(self.__class__))
  for line in stream:
    yield self.tokenize(line.strip())
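# Usage sketch (assumes `tokenizer` is an instance of a tokenizer class defining
# tokenize(); the file name is hypothetical). Because tokenize_stream is a generator,
# the file is tokenized lazily, line by line, without loading it into memory:
#
#   with open("corpus.txt", encoding="utf-8") as f:
#     for tokenized_line in tokenizer.tokenize_stream(f):
#       print(tokenized_line)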
def main(overwrite_args=None):

  with tee.Tee(), tee.Tee(error=True):
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--dynet-mem", type=str)
    argparser.add_argument("--dynet-seed", type=int)
    argparser.add_argument("--dynet-autobatch", type=int)
    argparser.add_argument("--dynet-devices", type=str)
    argparser.add_argument("--dynet-viz", action='store_true', help="use visualization")
    argparser.add_argument("--dynet-gpu", action='store_true', help="use GPU acceleration")
    argparser.add_argument("--dynet-gpu-ids", type=int)
    argparser.add_argument("--dynet-gpus", type=int)
    argparser.add_argument("--dynet-weight-decay", type=float)
    argparser.add_argument("--dynet-profiling", type=int)
    argparser.add_argument("--settings", type=str, default="standard",
                           help="settings (standard, debug, or unittest); "
                                "must be given in '=' syntax, e.g. --settings=standard")
    argparser.add_argument("experiments_file")
    argparser.add_argument("experiment_name", nargs='*', help="Run only the specified experiments")
    argparser.set_defaults(generate_doc=False)
    args = argparser.parse_args(overwrite_args)

    if args.dynet_seed:
      random.seed(args.dynet_seed)
      np.random.seed(args.dynet_seed)

    if args.dynet_gpu:
      if settings.CHECK_VALIDITY:
        settings.CHECK_VALIDITY = False
        logger.warning("disabling CHECK_VALIDITY because it is not supported on GPU currently")

    config_experiment_names = YamlPreloader.experiment_names_from_file(args.experiments_file)

    results = []

    # Check ahead of time that all experiments exist, to avoid bad surprises
    experiment_names = args.experiment_name or config_experiment_names
    if args.experiment_name:
      nonexistent = set(experiment_names).difference(config_experiment_names)
      if len(nonexistent) != 0:
        raise Exception("Experiments {} do not exist".format(",".join(list(nonexistent))))

    for experiment_name in experiment_names:
      ParamManager.init_param_col()

      uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(args.experiments_file,
                                                                          experiment_name)

      logger.info(f"=> Running {experiment_name}")
      logger.debug(f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} "
                   f"on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

      glob_args = uninitialized_exp_args.data.exp_global
      log_file = glob_args.log_file

      if os.path.isfile(log_file) and not settings.OVERWRITE_LOG:
        logger.warning(f"log file {log_file} already exists; please delete by hand if you want to "
                       f"overwrite it (or use --settings debug or otherwise set OVERWRITE_LOG=True); "
                       f"skipping experiment..")
        continue
      tee.set_out_file(log_file)

      model_file = glob_args.model_file

      uninitialized_exp_args.data.exp_global.commandline_args = args

      # Create the model
      experiment = initialize_if_needed(uninitialized_exp_args)
      ParamManager.param_col.model_file = experiment.exp_global.model_file
      ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
      ParamManager.populate()

      # Run the experiment
      eval_scores = experiment(save_fct=lambda: save_to_file(model_file, experiment,
                                                             ParamManager.param_col))
      results.append((experiment_name, eval_scores))
      print_results(results)

      tee.unset_out_file()
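# Invocation sketch (hypothetical experiment file name): the runner can be driven
# programmatically by passing the argument list that would otherwise come from sys.argv.
if __name__ == "__main__":
  main(["--settings=debug", "my_experiments.yaml"])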