def load_data(options):
    structure = Structure(options.configdir / "structure.xml")
    coderconf = read_config(options.configdir / "coder.cfg")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    dataconf = read_config(options.configdir / "database.cfg")

    features = {
        uttid: feats
        for spkr in dataconf.sections()
        for uttid, feats in np.load(dataconf[spkr].get("features")).items()
    }
    labels = {
        uttid: coder.encode(read_task(task))
        for spkr in dataconf.sections()
        for uttid, task in zip(*load_tasks(Path(dataconf[spkr].get("tasks"))))
    }

    # Utterances present in only one of the two mappings
    errors = set(features) ^ set(labels)
    if errors:
        msg = f"{len(errors)} mismatches ({len(features)} features and {len(labels)} labels)"
        if options.errors == "raise":
            raise ValueError(msg)
        elif options.errors == "warn":
            warnings.warn(msg)

    # Keep only utterances that have both features and a label
    uttids = [uttid for uttid in features if uttid in labels]
    features = [features[uttid] for uttid in uttids]
    labels = [labels[uttid] for uttid in uttids]
    return SequenceDataset(features, labels)
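# Hedged usage sketch for load_data above. `options` is assumed to be an
# argparse-style namespace carrying a `configdir` Path and an `errors`
# policy ("raise" or "warn"); the directory name below is illustrative.
from pathlib import Path
from types import SimpleNamespace

options = SimpleNamespace(
    configdir=Path("config/FluentSpeechCommands"),  # hypothetical recipe dir
    errors="warn",
)
dataset = load_data(options)  # SequenceDataset of (features, encoded labels)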
def train(expdir, cuda=False, do_eval=True):
    logger.info(f"Train {expdir}")
    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get("acquisition", "name"))
    model = Model(acquisitionconf, coder, expdir)
    model.display(logger.info)

    trainfeats = FeatLoader(expdir / "trainfeats").to_dict()
    with open(expdir / "traintasks") as f:
        traintasks = dict(map(parse_line, f))
    train_set = {
        utt: (trainfeats[utt], traintasks[utt])
        for utt in traintasks if utt in trainfeats
    }
    if not train_set:
        raise ValueError("No training examples")

    test_feats = FeatLoader(expdir / "testfeats").to_dict()
    with open(expdir / "testtasks") as testtasks:
        test_tasks = dict(map(parse_line, testtasks))
    test_set = {
        utt: (test_feats[utt], test_tasks[utt])
        for utt in test_tasks if utt in test_feats
    }

    # Resume from a previous checkpoint if one exists
    if (expdir / "model").exists():
        model.load(expdir / "model")
    model.train(train_set, test_set)
    model.save(expdir / "model")

    if do_eval:
        evaluate(expdir, cuda=cuda)
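# Assumed expdir layout for train()/evaluate(), inferred from the reads and
# writes above; exact file contents depend on the prepare scripts, which are
# not shown here:
#
#   expdir/
#       acquisition.cfg   # model/acquisition hyperparameters
#       coder.cfg         # label coder configuration
#       structure.xml     # task ontology used by the coder
#       trainfeats        # "<uttid> <featspec>" lines consumed by FeatLoader
#       traintasks        # "<uttid> <task string>" lines
#       testfeats, testtasks
#       model             # written by model.save(), reloaded if present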
def main(expdir, cuda):
    expdir = Path(expdir)

    # Check if this experiment has been completed
    if (expdir / "model").exists():
        logger.warning(f"Found trained model in {expdir}.")
        return

    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get("acquisition", "name"))
    model = Model(acquisitionconf, coder, expdir)

    trainconf = dict(
        read_config(
            expdir / "train.cfg",
            default=Path(__file__).parent / "defaults/train.cfg"
        ).items("train"))

    with open(expdir / "trainfeats") as trainfeats:
        features = {
            uttid: np.load(featsfile)
            for uttid, featsfile in map(parse_line, trainfeats)
        }
    with open(expdir / "traintasks") as traintasks:
        taskstrings = dict(map(parse_line, traintasks))

    examples = {
        utt: (features[utt], taskstrings[utt])
        for utt in taskstrings if utt in features
    }
    model.train(examples)
    model.save(expdir / "model")
def prepare_cross_validation(expdir, recipe):
    os.makedirs(expdir, exist_ok=True)
    for filename in ("acquisition.cfg", "coder.cfg", "structure.xml"):
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir / filename)

    expconf = dict(
        read_config(
            recipe / "cross_validation.cfg",
            default=Path(__file__).parent / "defaults/cross_validation.cfg"
        ).items("cross_validation"))

    random_seed = int(expconf.get("random_seed", 3105))
    logger.debug(f"Setting random seed to {random_seed}")
    random.seed(random_seed)

    dataconf = read_config(recipe / "database.cfg")
    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)

    speakers = list(dataconf.sections())
    if expconf.get("speakers"):
        # Config values are strings: split into names to avoid substring matches
        selected = set(expconf["speakers"].split())
        speakers = [spkr for spkr in speakers if spkr in selected]
    logger.info(f"{len(speakers)} speakers selected for cross-validation")

    option_list = [
        dict(expdir=expdir, speaker=speaker, coder=coder,
             dataconf=dataconf, expconf=expconf)
        for speaker in speakers
    ]
    for opts in option_list:
        map_prepare_filesystem(opts)
def gs_learning_curve(expdir, recipe, cuda=True, n_jobs=1):
    logger.info(f"GridSearch {expdir}")

    with open(recipe / "param_grid.json") as jsonfile:
        param_grid = json.load(jsonfile)
    logger.debug(str(param_grid))
    total_params = np.prod(list(map(len, param_grid.values())))
    logger.warning(
        f"Searching {len(param_grid)} parameters, "
        f"totalling {total_params} possible combinations.")

    gsconf = read_config(expdir / "gridsearch.cfg")
    default_config = dict(gsconf["acquisition"].items())
    default_config["device"] = "cuda" if cuda else "cpu"
    gsconf = dict(gsconf["gridsearch"].items())
    logger.debug(" ".join(f"{k}={v}" for k, v in gsconf.items()))

    train_sizes = np.linspace(
        float(gsconf["nmin"]), float(gsconf["nmax"]), int(gsconf["num_trains"]))
    gs_params = {
        "train_sizes": train_sizes,
        "cv": int(gsconf["cv_splits"]),
        "scoring": (make_scorer(accuracy)
                    if gsconf["scoring"] == "accuracy"
                    else gsconf["scoring"]),
        "n_jobs": n_jobs,
    }
    logger.debug(gs_params)

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    default_config["output_dim"] = coder.numlabels

    features = FeatLoader(expdir / "gridsearchfeats").to_dict()
    with open(expdir / "gridsearchtasks") as traintasks:
        taskstrings = dict(map(parse_line, traintasks))

    indices = sorted(set(features).intersection(taskstrings))
    X = [features[uttid] for uttid in indices]
    y = [coder.encode(read_task(taskstrings[uttid])) for uttid in indices]

    gs_results = defaultdict(list)
    start = time()
    best_score = 0
    best_params, best_index = None, -1
    for i, param_values in enumerate(product(*param_grid.values())):
        t0 = time()
        params = dict(zip(param_grid.keys(), param_values))
        config = deepcopy(default_config)
        config.update(params)
        logger.debug(config)
        model = RNNClassifier(**config)
        train_sizes, train_scores, valid_scores = learning_curve(
            model, X, y, **gs_params)
        # Summarise each learning curve by its area under the curve
        train_score = auc(train_sizes, train_scores.mean(-1))
        test_score = auc(train_sizes, valid_scores.mean(-1))
        t1 = time()
        logger.info(
            f"model {i + 1}/{total_params}: train={train_score:.3%} "
            f"test={test_score:.3%} time={t1 - t0:.1f}s "
            f"elapsed={t1 - start:.1f}s {params}")
        gs_results["auc_test_score"].append(test_score)
        gs_results["auc_train_score"].append(train_score)
        gs_results["params"].append(params)
        gs_results["train_sizes"].append(train_sizes)
        gs_results["train_scores"].append(train_scores)
        gs_results["test_scores"].append(valid_scores)
        if test_score > best_score:
            best_params, best_score, best_index = params, test_score, i

    logger.warning(
        f"Search completed in {time() - start:.2f}s. "
        f"Best model: {best_params} ({best_score:.2%})")
    logger.warning(
        f"Test scores: {gs_results['test_scores'][best_index].mean(-1)}")

    with open(expdir / "gs_results.json", "w") as result_file:
        json.dump({
            "best_params": best_params,
            "best_score": best_score,
            "cv_results": serialise(gs_results),
        }, result_file)
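# A minimal sketch of what `param_grid.json` is assumed to contain: a JSON
# object mapping RNNClassifier constructor arguments to lists of candidate
# values. The parameter names below are illustrative, not the actual grid.
example_param_grid = {
    "hidden_dim": [64, 128, 256],
    "num_layers": [1, 2],
    "learning_rate": [1e-3, 1e-4],
}
# itertools.product over the value lists enumerates every combination, so
# this grid yields 3 * 2 * 2 = 12 candidate configurations (total_params).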
def load_data(options):
    logger.info("Loading data")
    structure = Structure(options.expdir / "structure.xml")
    coderconf = read_config(options.expdir / "coder.cfg")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    featconf = read_config(options.expdir / "features.cfg")
    dataconf = read_config(options.expdir / "database.cfg")

    def load_tasks(filename):
        # Prefix every utterance id with the speaker inferred from the path
        with open(filename) as f:
            spkr = filename.stem
            if spkr == "tasks":
                spkr = filename.parent.name
            uttids, tasks = map(list, zip(*(
                line.strip().split(maxsplit=1) for line in f)))
        return [f"{spkr}_{uttid}" for uttid in uttids], tasks

    feat_loader = kaldiio.load_scp(featconf["features"].get("fbanks"))
    features = {uttid: feat_loader[uttid] for uttid in feat_loader}
    labels = {
        uttid: coder.encode(read_task(task))
        for spkr in dataconf.sections()
        for uttid, task in zip(*load_tasks(Path(dataconf[spkr].get("tasks"))))
    }

    errors = set(features) ^ set(labels)
    if errors:
        msg = f"{len(errors)} mismatches ({len(features)} features and {len(labels)} labels)"
        if options.errors == "raise":
            raise ValueError(msg)
        elif options.errors == "warn":
            logger.warning(msg)

    # Keep only utterances that have both features and a label
    uttids = np.array([uttid for uttid in features if uttid in labels])
    features = np.array([features[uttid] for uttid in uttids], dtype="object")
    labels = np.array([labels[uttid] for uttid in uttids])

    if any(subset in uttids[0] for subset in ("train", "valid", "test")):
        train_mask = np.array(["_train_" in uttid for uttid in uttids])
        valid_mask = np.array(["_valid_" in uttid for uttid in uttids])
    elif (options.expdir / "train.cfg").exists():
        train_sections = set(
            read_config(options.expdir / "train.cfg")
            .get("train", "datasections").split())
        test_sections = set(
            read_config(options.expdir / "test.cfg")
            .get("test", "datasections").split())
        train_mask = np.array(
            [uttid.split("_")[0] in train_sections for uttid in uttids])
        valid_mask = np.array(
            [uttid.split("_")[0] in test_sections for uttid in uttids])
    else:
        train_ids, valid_ids = train_test_split(uttids, test_size=0.1)
        train_ids, valid_ids = set(train_ids), set(valid_ids)
        train_mask = np.array([uttid in train_ids for uttid in uttids])
        valid_mask = np.array([uttid in valid_ids for uttid in uttids])

    train_set = SequenceDataset(
        features[train_mask], labels[train_mask], uttids[train_mask])
    valid_set = SequenceDataset(
        features[valid_mask], labels[valid_mask], uttids[valid_mask])
    return train_set, valid_set
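# Toy illustration of the task-file format load_tasks parses (assumed from
# the split(maxsplit=1) above): one "<uttid> <task string>" pair per line,
# with the speaker inferred from the file path, e.g. data/spk1/tasks. The
# ids and task strings below are hypothetical.
#
#   utt001 <task name="SetAlarm" time="7am"/>
#   utt002 <task name="PlayMusic"/>
#
# load_tasks would return ["spk1_utt001", "spk1_utt002"] plus the raw task
# strings, so utterance ids stay unique across speakers.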
def load_data(options):
    if options.expdir is None:
        options.expdir = options.outdir
    dataconf = read_config(options.expdir / "database.cfg")
    structure = Structure(options.expdir / "structure.xml")
    coderconf = read_config(options.expdir / "coder.cfg")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)

    if options.expdir == options.outdir:
        # Precomputed train/test lists already exist in outdir
        trainfeats = FeatLoader(options.outdir / "trainfeats").to_dict()
        testfeats = FeatLoader(options.outdir / "testfeats").to_dict()
        with open(options.outdir / "traintasks") as traintasks:
            trainlabels = {
                uttid: coder.encode(read_task(task))
                for uttid, task in map(parse_line, traintasks)
            }
        with open(options.outdir / "testtasks") as testtasks:
            testlabels = {
                uttid: coder.encode(read_task(task))
                for uttid, task in map(parse_line, testtasks)
            }
        features = set(trainfeats).union(testfeats)
        labels = set(trainlabels).union(testlabels)
    else:
        def load_tasks(filename):
            with open(filename) as f:
                spkr = filename.stem
                if spkr == "tasks":
                    spkr = filename.parent.name
                uttids, tasks = map(list, zip(*(
                    line.strip().split(maxsplit=1) for line in f)))
            if not uttids[0].startswith(spkr):
                uttids = [f"{spkr}_{uttid}" for uttid in uttids]
            return uttids, tasks

        features = {
            uttid: feats
            for spkr in dataconf.sections()
            for uttid, feats in np.load(dataconf[spkr].get("features")).items()
        }
        labels = {
            uttid: coder.encode(read_task(task))
            for spkr in dataconf.sections()
            for uttid, task in zip(*load_tasks(Path(dataconf[spkr].get("tasks"))))
        }

    errors = set(features) ^ set(labels)
    if errors:
        msg = f"{len(errors)} mismatches ({len(features)} features and {len(labels)} labels)"
        if options.errors == "raise":
            raise ValueError(msg)
        elif options.errors == "warn":
            warnings.warn(msg)
        else:
            features = {k: v for k, v in features.items() if k not in errors}
            labels = {k: v for k, v in labels.items() if k not in errors}
            if not (features and labels):
                raise ValueError("No examples left after removing errors")

    if options.expdir == options.outdir:
        trainuttids = list(trainfeats)
        trainfeats = np.array([trainfeats[uttid] for uttid in trainuttids], dtype="object")
        trainlabels = np.array([trainlabels[uttid] for uttid in trainuttids])
        train_set = SequenceDataset(trainfeats, trainlabels)
        testuttids = list(testfeats)
        testfeats = np.array([testfeats[uttid] for uttid in testuttids], dtype="object")
        testlabels = np.array([testlabels[uttid] for uttid in testuttids])
        valid_set = SequenceDataset(testfeats, testlabels)
        return train_set, valid_set

    # Keep only utterances that have both features and a label
    uttids = np.array([uttid for uttid in features if uttid in labels])
    features = np.array([features[uttid] for uttid in uttids], dtype="object")
    labels = np.array([labels[uttid] for uttid in uttids])

    if options.method == "10-fold":
        return SequenceDataset(features, labels, indices=uttids)

    # 1. Fluent Speech Commands: splits are encoded in the utterance ids
    if any(subset in uttids[0] for subset in ("train", "valid", "test")):
        logger.info("Fluent Speech Commands dataset splits")
        train_mask = np.array(["_train_" in uttid for uttid in uttids])
        valid_mask = np.array(["_valid_" in uttid for uttid in uttids])
    # 2. Train/test split exists in expdir
    elif (options.expdir / "train.cfg").exists():
        logger.info(
            f"Loading dataset splits from spec {options.expdir}/{{train,test}}.cfg")
        train_sections = set(
            read_config(options.expdir / "train.cfg")
            .get("train", "datasections").split())
        test_sections = set(
            read_config(options.expdir / "test.cfg")
            .get("test", "datasections").split())

        def make_filter(sections):
            def _filter(uttid):
                return any(uttid.startswith(spkr) for spkr in sections)
            return _filter

        train_mask = np.array(list(map(make_filter(train_sections), uttids)))
        valid_mask = np.array(list(map(make_filter(test_sections), uttids)))
    # 3. Random train/test split
    else:
        logger.info("Random train/test split")
        train_ids, valid_ids = train_test_split(uttids, test_size=0.1)
        train_ids, valid_ids = set(train_ids), set(valid_ids)
        train_mask = np.array([uttid in train_ids for uttid in uttids])
        valid_mask = np.array([uttid in valid_ids for uttid in uttids])

    if options.method in ("10%", "1%"):
        # Subsample the training set, stratified on the labels
        sz = .1 if options.method == "10%" else .01
        train_ids = np.arange(len(features))[train_mask]
        train_ids, _ = train_test_split(
            train_ids, train_size=sz, stratify=labels[train_mask])
        train_ids = set(train_ids)
        train_mask = np.array([idx in train_ids for idx in np.arange(len(features))])

    train_set = SequenceDataset(features[train_mask], labels[train_mask])
    valid_set = SequenceDataset(features[valid_mask], labels[valid_mask])
    logger.info(
        f"Dataset loaded: train_size={len(train_set):,} valid_size={len(valid_set):,}")

    # Write the selected training examples back out for later runs
    labels_by_uttid = dict(zip(uttids, labels))
    with open(options.outdir / "trainfeats", "w") as featfile, \
            open(options.outdir / "traintasks", "w") as taskfile:
        for uttid in uttids[train_mask]:
            speaker = "_".join(uttid.split("_")[:-1])
            featpath = dataconf[speaker].get("features")
            taskstring = coder.decode(labels_by_uttid[uttid])
            featfile.write(f"{uttid} {featpath}:{uttid}\n")
            taskfile.write(f"{uttid} {taskstring}\n")

    return train_set, valid_set
def load_structure_and_coder(expdir):
    structure = Structure(expdir / "structure.xml")
    coderconf = read_config(expdir / "coder.cfg")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    return structure, coder
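# Minimal usage sketch: the entry points above all repeat this
# structure/coder boilerplate, which the helper factors out. The expdir path
# is illustrative; it only needs to contain structure.xml and coder.cfg.
structure, coder = load_structure_and_coder(Path("exp/fluent/lstm_128"))
logger.info(f"Coder with {coder.numlabels} labels")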
@dataclass
class Task:
    name: str
    args: dict


# Setup
confdir = Path("config/FluentSpeechCommands/lstm_128")
outdir = Path("exp/fluent/finetune_gru_enc_upd_2")
dataconf = read_config(confdir / "database.cfg")
coderconf = read_config(confdir / "coder.cfg")
structure = Structure(confdir / "structure.xml")
Coder = coder_factory(coderconf.get("coder", "name"))
coder = Coder(structure, coderconf)

# Model: load the pretrained encoder/decoder and freeze their weights
encoder = torch.load(outdir / "encoder.pt", map_location="cuda")
decoder = torch.load(outdir / "decoder.pt", map_location="cuda")
for module in (encoder, decoder):
    for line in str(module).split("\n"):
        logger.info(line)
    for p in module.parameters():
        p.requires_grad = False

# Target
speakers = list(dataconf.sections())
taskfiles = [dataconf[spkr].get("tasks") for spkr in speakers]
taskstrings = dict(sum(map(load_tasks, taskfiles), []))
def evaluate(expdir, cuda=False, clean=False):
    logger.info(f"Evaluate {expdir}")
    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get("acquisition", "name"))
    model = Model(acquisitionconf, coder, expdir)
    model.load(expdir / "model")
    model.display(logger.info)

    features = FeatLoader(expdir / "testfeats").to_dict()
    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(parse_line, testtasks)
            if key in features
        }
    assert len(features) == len(references), set(features) - set(references)

    # Drop utterances whose features contain NaN or inf
    to_remove = [
        uttid for uttid, feat in features.items() if not np.isfinite(feat).all()
    ]
    if to_remove:
        logger.warning(f"Found {len(to_remove)} utterances with non-finite features.")
    for uttid in to_remove:
        del features[uttid]
        del references[uttid]

    decoded = model.decode(features)
    assert not (set(decoded) - set(references))

    # Encode references and predictions in the same utterance order
    uttids = list(decoded)
    y_true = np.array([coder.encode(references[uttid]) for uttid in uttids])
    y_pred = np.array([coder.encode(decoded[uttid]) for uttid in uttids])
    TP = (y_pred == 1) & (y_true == 1)
    TN = (y_pred == 0) & (y_true == 0)
    FP = (y_pred == 1) & (y_true == 0)
    FN = (y_pred == 0) & (y_true == 1)
    error_rate = 1 - (TP | TN).all(-1).mean()
    precision = TP.sum() / (TP | FP).sum()
    recall = TP.sum() / (TP | FN).sum()
    f1_score = 2 * precision * recall / (precision + recall)
    logger.info(f"TP={TP.sum()} TN={TN.sum()} FP={FP.sum()} FN={FN.sum()}")
    logger.info(
        f"P={precision:.2%} R={recall:.2%} F={f1_score:.2%} E={error_rate:.2%}")
    for line in classification_report(y_true, y_pred).split("\n"):
        logger.info(line)

    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            f"{name} {to_string(task)}\n" for name, task in decoded.items())

    metrics, scores = score(decoded, references)
    for metric_name, metric in metrics.items():
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))
    write_scores(scores, expdir)

    if clean:
        logger.info(f"Remove {expdir}/model")
        os.remove(expdir / "model")
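# A tiny self-contained check of the boolean-mask metric computation used in
# evaluate(), on hypothetical 2-utterance / 3-label encodings. Micro
# precision/recall are ratios of summed counts, matching the code above.
import numpy as np

y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 1]])
TP = (y_pred == 1) & (y_true == 1)  # 2 hits
FP = (y_pred == 1) & (y_true == 0)  # 1 false alarm
FN = (y_pred == 0) & (y_true == 1)  # 1 miss
precision = TP.sum() / (TP | FP).sum()  # 2/3
recall = TP.sum() / (TP | FN).sum()     # 2/3
# error_rate counts utterances with any wrong label: here both rows, so 1.0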
def main(expdir, cuda):
    expdir = Path(expdir)
    if (expdir / "f1").exists():
        logger.info(f"Results found at {expdir}")
        return
    logger.info(f"Evaluate {expdir}")

    acquisitionconf = tools.read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = tools.read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get("acquisition", "name"))
    model = Model(acquisitionconf, coder, expdir)
    logger.debug(f"Loading model at {expdir}/model")
    model.load(expdir / "model")

    with open(expdir / "testfeats") as testfeats:
        features = {
            uttid: np.load(featsfile)
            for uttid, featsfile in map(tools.parse_line, testfeats)
        }
    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(tools.parse_line, testtasks)
            if key in features
        }
    assert len(features) == len(references)

    # Remove test utterances with non-finite features before decoding
    total = len(features)
    to_remove = [
        uttid for uttid, feat in features.items() if not np.isfinite(feat).all()
    ]
    for uttid in to_remove:
        logger.debug(f"Removing {uttid}")
        del features[uttid]
        del references[uttid]
    if to_remove:
        logger.warning(f"{len(to_remove)}/{total} utts removed (contained NaN)")

    decoded = model.decode(features)
    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            f"{name} {to_string(task)}\n" for name, task in decoded.items())

    metric_names = [
        "precision", "recall", "f1",
        "macro precision", "macro recall", "macro f1",
    ]
    metrics, scores = score(decoded, references)
    for metric_name, metric in zip(metric_names, metrics):
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))
    write_scores(scores, expdir)