def prepare_traindir(expdir, recipe):
    """Create a training directory from a recipe.

    Copies the recipe's configuration files into ``expdir`` and concatenates
    the feature and task lists of every data section named in ``train.cfg``
    into ``expdir/trainfeats`` and ``expdir/traintasks``.

    Args:
        expdir: Directory to create (path-like supporting ``/``).
            ``os.makedirs`` raises if it already exists.
        recipe: Directory holding the ``.cfg`` files and ``structure.xml``.
    """
    os.makedirs(expdir)
    for filename in ("acquisition.cfg", "coder.cfg", "train.cfg",
                     "structure.xml"):
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir)

    trainconf = read_config(expdir / "train.cfg")
    dataconf = read_config(recipe / "database.cfg")

    # Experiments whose path contains "fluent" join speaker and utterance id
    # without a separator; all others use an underscore.
    separator = "" if "fluent" in str(expdir).lower() else "_"

    def convert_uttid(spkr, uttid):
        """Prefix ``uttid`` with ``spkr`` unless it already starts with it."""
        if uttid.startswith(spkr):
            return uttid
        return spkr + separator + uttid

    logger.debug("Create trainfeats and traintasks files")
    with open(expdir / "trainfeats", "w") as feats, \
            open(expdir / "traintasks", "w") as tasks:
        for section in trainconf.get("train", "datasections").split():
            with open(Path(dataconf.get(section, "features")) / "feats") as f:
                feats.writelines(convert_uttid(section, line) for line in f)
            with open(Path(dataconf.get(section, "tasks"))) as f:
                tasks.writelines(convert_uttid(section, line) for line in f)

    # Pass the command as an argument list: splitting a formatted string
    # breaks as soon as the experiment path contains whitespace.
    nfeats, ntasks = (
        subprocess.check_output(
            ["wc", "-l", f"{expdir}/{filename}"]).decode("utf-8").split()[0]
        for filename in ("trainfeats", "traintasks"))
    logger.info(f"Written {nfeats} features and {ntasks} tasks to {expdir}")
def train(self, train_set, valid_set, test_set=None):
    """Run the training loop and write history/results to the checkpoint dir.

    For every epoch the model is trained on ``train_set`` and evaluated on
    ``valid_set``; the per-epoch log values are collected per key and dumped
    to ``history.json``. When ``test_set`` is given, it is evaluated after
    training and the accumulated metrics are written to ``results.json``.

    Args:
        train_set: Training dataset (must support ``len``).
        valid_set: Validation dataset passed to ``self.evaluate``.
        test_set: Optional held-out dataset evaluated after training.
    """
    self.initialize()
    epoch_bar = trange(self.max_epochs, desc="epochs")
    logger.debug(
        f"Training for {self.max_epochs} epochs on {len(train_set)} examples"
    )
    history = defaultdict(list)
    for self._epoch in epoch_bar:
        log = self.train_one_epoch(train_set)
        log = self.evaluate(valid_set, log)
        # Record each value under its own key. The previous code appended the
        # whole log dict to every key's list.
        for key, value in log.items():
            history[key].append(value)
        self.on_epoch_end(log, progress_bar=epoch_bar)
        if self.early_stopping and self.early_stopping.should_stop(
                self._epoch):
            logger.info("Early stopping reached")
            break
        for scheduler in self.schedulers:
            # Only plateau schedulers are stepped here, on the validation loss.
            if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                scheduler.step(log["val_loss"])

    with open(f"{self.checkpoint_dir}/history.json", "w") as f:
        json.dump(history, f)

    if test_set is not None:
        log = self.evaluate(test_set)
        with open(f"{self.checkpoint_dir}/results.json", "w") as f:
            json.dump(self.accumulate_metrics(log), f, indent=4)
def train_loop(self):
    """Run one pass over the training loader and return loss and error rate.

    NOTE(review): apart from ``self.encoder``/``self.decoder`` being switched
    to train mode, this body reads ``train_lr``, ``train_set``, ``encoder``,
    ``decoder``, ``device`` and both optimizers as free (non-``self``) names;
    confirm they exist in the enclosing scope.

    Returns:
        dict with ``"loss"`` (summed batch loss divided by ``len(train_set)``)
        and ``"error_rate"`` (``compute_error_rate`` over all batches).
    """
    self.encoder.train()
    self.decoder.train()

    running_loss = 0
    predictions = None
    target = None
    batches = tqdm(train_lr, total=len(train_set)//train_lr.batch_size)
    for inputs, input_lengths, labels in batches:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        inputs = inputs.to(device)
        labels = labels.to(device)
        encodings, encoding_lengths = encoder.encode(inputs, input_lengths)
        loss, preds = decoder(encodings, encoding_lengths, labels=labels)
        loss.backward()
        logger.debug(f"loss={loss.item()}")

        encoder_optimizer.step()
        decoder_optimizer.step()
        running_loss += loss.item()

        # Accumulate predictions and labels on the CPU for the error rate.
        preds = preds.detach().cpu().numpy()
        if predictions is None:
            predictions = preds
        else:
            predictions = np.concatenate([predictions, preds], axis=0)

        labels = labels.detach().cpu().numpy()
        if target is None:
            target = labels
        else:
            target = np.concatenate([target, labels], axis=0)

    train_loss = float(running_loss/len(train_set))
    train_error_rate = compute_error_rate(target, predictions)
    return {"loss": train_loss, "error_rate": train_error_rate}
def encode_feats(expdir, featconf, dataconf, aggfunc=None):
    """Encode a speaker's transcriptions with a pretrained transformer.

    Lines of the text file named in ``featconf`` that start with the speaker
    name (``expdir.name``) are tokenised and passed through the encoder; the
    per-utterance outputs, truncated to the attention-mask length, are saved
    with ``save_features``.

    Args:
        expdir: Speaker directory; its ``name`` selects the lines to encode.
        featconf: Config with a ``features`` section (``storage``,
            ``encoder``, ``file``, ``agg``).
        dataconf: Mapping whose ``"features"`` entry is the output path.
        aggfunc: Optional callable applied to every utterance array.
    """
    speaker = expdir.name
    target_file = Path(dataconf["features"])
    storage_type = featconf.get("features", "storage")
    model_string = featconf.get("features", "encoder")

    import torch
    from transformers import AutoTokenizer, AutoModel

    logger.info(f"Loading tokenizer and model: {model_string}")
    tokenizer = AutoTokenizer.from_pretrained(model_string)
    encoder = AutoModel.from_pretrained(model_string)
    encoder.eval()

    # HACK: Small workaround but would be better to use --cuda flag
    use_cuda = os.environ.get("CUDA_VISIBLE_DEVICES", None)
    device = torch.device("cuda" if use_cuda else "cpu")
    logger.debug(f"Device: {device}")
    encoder.to(device)
    for weight in encoder.parameters():
        weight.requires_grad = False

    with open(featconf.get("features", "file"), "r") as textfile:
        parsed = [
            parse_line(line)
            for line in textfile.readlines()
            if line.startswith(speaker)
        ]
    uttids, texts = map(list, zip(*parsed))

    inputs = tokenizer(texts,
                       add_special_tokens=False,
                       return_tensors="pt",
                       padding=True)
    input_lengths = inputs["attention_mask"].sum(-1)

    model_inputs = {}
    for key, value in inputs.items():
        if isinstance(value, torch.Tensor):
            value = value.to(device)
        model_inputs[key] = value
    out = encoder(**model_inputs)

    if len(out) == 2:
        # Two outputs: index 1 when the config asks for "pooled", else 0.
        use_pooled = featconf.get("features", "agg") == "pooled"
        data = out[1] if use_pooled else out[0]
    else:
        (data,) = out

    data = {
        uttid: data[idx, :length].cpu().numpy()
        for idx, (uttid, length) in enumerate(zip(uttids, input_lengths))
    }
    if aggfunc is not None:
        data = {uttid: aggfunc(array) for uttid, array in data.items()}

    save_features(data, target_file, storage_type)
def prepare_subset(expdir, subset, dataconf):
    """Write ``<subset>feats`` and ``<subset>tasks`` for an experiment.

    Reads ``expdir/<subset>.cfg`` and, for every data section it lists,
    appends that section's ``.scp`` feature list and task list to the two
    output files. Utterance ids are prefixed with ``<section>_`` when the
    first id of a file does not already start with the section name.

    Args:
        expdir: Experiment directory (path-like supporting ``/``).
        subset: Subset name, e.g. ``"train"`` or ``"test"``.
        dataconf: Database config with ``features`` and ``tasks`` options per
            section.
    """
    conf = read_config(expdir / f"{subset}.cfg")
    logger.debug(f"Create {subset}feats and {subset}tasks files")

    with open(expdir / f"{subset}feats", "w") as feats, \
            open(expdir / f"{subset}tasks", "w") as tasks:
        for section in conf.get(subset, "datasections").split():
            featfile = Path(dataconf.get(section, "features"))
            taskfile = Path(dataconf.get(section, "tasks"))
            # Swap only the final suffix. str.replace would rewrite every
            # occurrence of the suffix in the path, and would corrupt a path
            # that has no suffix at all.
            scpfile = featfile.with_suffix(".scp")

            for filepath, outfile in ((scpfile, feats), (taskfile, tasks)):
                with open(filepath) as f:
                    entries = [parse_line(line) for line in f]
                if not entries:
                    logger.warning(f"No entries found in {filepath}")
                    continue
                uttids, values = zip(*entries)
                if not uttids[0].startswith(section):
                    # Make sure that uttid has format $speaker_$uttid
                    uttids = [f"{section}_{uttid}" for uttid in uttids]
                outfile.writelines(
                    f"{uttid} {value}\n"
                    for uttid, value in zip(uttids, values))

    # Argument list instead of a split string, so paths containing
    # whitespace are handled correctly.
    nfeats, ntasks = (
        subprocess.check_output(
            ["wc", "-l", f"{expdir}/{filename}"]).decode("utf-8").split()[0]
        for filename in (f"{subset}feats", f"{subset}tasks"))
    logger.info(
        f"Written {nfeats} features and {ntasks} tasks to {expdir} ({subset})")
def run_prepare_database(expdir, recipe, backend="local", njobs=-1,
                         overwrite=False):
    """Create the database directory and extract features for every speaker.

    One speaker directory is prepared per section of ``database.cfg``; the
    feature preparation is then either submitted through ``condor_submit``
    or run locally with ``mp_map``.

    Args:
        expdir: Directory to create; ``os.makedirs`` raises if it exists.
        recipe: Recipe directory with ``database.cfg`` and ``features.cfg``.
        backend: ``"condor"`` submits jobs, anything else runs locally.
        njobs: Number of parallel jobs for the local feature preparation.
        overwrite: Accepted but not used by this function.
    """
    logger.debug(f"Create {expdir}")
    os.makedirs(expdir)

    dataconf = read_config(recipe / "database.cfg")
    shutil.copy(recipe / "features.cfg", expdir / "features.cfg")

    speakers = dataconf.sections()
    speaker_options = [dict(dataconf[name].items()) for name in speakers]
    mp_map(map_prepare_spkrdir,
           [expdir for _ in speakers],
           speakers,
           speaker_options,
           njobs=len(speakers))

    spkrdirs = [expdir / name for name in speakers]
    if backend != "condor":
        mp_map(map_prepare_features, spkrdirs, njobs=njobs)
    else:
        condor_submit(expdir, "prepare_dataset", spkrdirs)
def train(self, examples, test_examples=None):
    """Fit the model on ``examples``, optionally monitoring ``test_examples``.

    Args:
        examples: Mapping whose values are ``(features, task)`` pairs.
        test_examples: Optional mapping of the same shape; when given, its
            features and encoded targets are passed to ``self.fit`` as a
            third argument.

    Returns:
        Whatever ``self.fit`` returns.
    """
    logger.debug(f"{len(examples)} training examples")
    features, tasks = zip(*examples.values())
    target = [self.encode_target(task) for task in tasks]

    if test_examples is None:
        return self.fit(features, target)

    # Unpack the *test* examples here. The previous code re-read the training
    # examples, so the "test" data was a copy of the training set.
    test_feats, test_tasks = zip(*test_examples.values())
    test_target = [self.encode_target(task) for task in test_tasks]
    return self.fit(features, target, (test_feats, test_target))
def train_one_step(self, inputs, labels, optimizer, lr_scheduler=None):
    """Run one optimisation step and return the scalar loss.

    Args:
        inputs: Iterable of model inputs, unpacked as positional arguments.
        labels: Targets passed to the model as ``labels=``.
        optimizer: Optimizer whose gradients are reset and stepped.
        lr_scheduler: Optional scheduler stepped once per call.

    Returns:
        ``loss.item()`` of the loss returned by ``self.model``.
    """
    optimizer.zero_grad()
    *inputs, labels = self._recursive_to(*inputs, labels)
    loss, _ = self.model(*inputs, labels=labels)
    loss.backward()
    optimizer.step()

    if lr_scheduler is not None:
        # Step the scheduler after the optimizer: since PyTorch 1.1, stepping
        # it first skips the first value of the schedule.
        lr_scheduler.step()
        # get_last_lr is the supported accessor; fall back to get_lr for
        # schedulers that do not provide it.
        current_lr = getattr(lr_scheduler, "get_last_lr",
                             None) or lr_scheduler.get_lr
        logger.debug(f"LR: {current_lr()}")

    return loss.item()
def prepare_gridsearch(expdir, recipe):
    """Set up a grid-search directory from a recipe.

    Copies the grid-search configuration files into ``expdir`` (created if
    missing) and builds the ``gridsearch`` subset via ``prepare_subset``.

    Args:
        expdir: Target directory (path-like supporting ``/``).
        recipe: Recipe directory holding the files to copy and
            ``database.cfg``.
    """
    os.makedirs(expdir, exist_ok=True)

    recipe_files = ("param_grid.json", "gridsearch.cfg", "coder.cfg",
                    "structure.xml")
    for filename in recipe_files:
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir / filename)

    prepare_subset(expdir, "gridsearch",
                   read_config(recipe / "database.cfg"))
def build_encoder(model_dir, freeze=None):
    """Load a trained encoder and optionally freeze some of its submodules.

    Args:
        model_dir: Directory passed to ``load_args``; the resulting options'
            ``resume`` attribute is handed to ``load_trained_model``.
        freeze: Optional iterable of attribute names on the model whose
            parameters get ``requires_grad = False``.

    Returns:
        The loaded model with ``teacher_model`` set to ``None``.
    """
    options = load_args(model_dir)
    model, train_args = load_trained_model(options.resume)
    model.teacher_model = None

    for module_name in (freeze or ()):
        logger.info(f"Freeze {module_name} in encoder")
        submodule = getattr(model, module_name)
        for param in submodule.parameters():
            param.requires_grad = False

    display_model(model, logger.info)
    logger.debug(train_args)
    return model
def prepare_train(expdir, recipe):
    """Set up a train/test experiment directory from a recipe.

    Creates ``expdir`` (``os.makedirs`` raises if it already exists), copies
    the configuration files and builds the ``train`` and ``test`` subsets.

    Args:
        expdir: Directory to create (path-like supporting ``/``).
        recipe: Recipe directory holding the files to copy and
            ``database.cfg``.
    """
    os.makedirs(expdir)

    recipe_files = ("acquisition.cfg", "coder.cfg", "train.cfg", "test.cfg",
                    "structure.xml")
    for filename in recipe_files:
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir / filename)

    dataconf = read_config(recipe / "database.cfg")
    prepare_subset(expdir, "train", dataconf)
    prepare_subset(expdir, "test", dataconf)
def on_epoch_end(self, log, progress_bar=None):
    """Report epoch metrics and save periodic / best checkpoints.

    Args:
        log: Raw epoch log, reduced with ``self.accumulate_metrics``.
        progress_bar: Optional bar; when truthy the metrics are shown as its
            postfix, otherwise they are written to the debug log.

    Returns:
        The accumulated metrics.
    """
    log = self.accumulate_metrics(log)
    if progress_bar:
        progress_bar.set_postfix({k: f"{v:.4f}" for k, v in log.items()})
    else:
        logger.debug(f"[{self._epoch+1}] " +
                     " ".join(f"[{k}={v:.4f}]" for k, v in log.items()))

    # Parenthesise the epoch count: `%` binds tighter than `+`, so the old
    # `self._epoch + 1 % save_interval` almost never evaluated to zero.
    # A falsy interval disables periodic checkpoints.
    if self.save_interval and (self._epoch + 1) % self.save_interval == 0:
        self.save_checkpoint(f"{self._epoch:04d}")

    # The early-stopping callable returning a truthy value triggers a "best"
    # checkpoint.
    if self.early_stopping and self.early_stopping(self._epoch, log,
                                                   self.model):
        self.save_checkpoint("best")
    return log
def __init__(self, config=None, coder=None, expdir=None):
    """Store configuration, coder and experiment dir, then build the model.

    Args:
        config: Either a ``ConfigParser`` (its ``acquisition`` section is
            converted to a dict) or a mapping used as-is.
        coder: Object providing ``numlabels``, used when the config has no
            truthy ``output_dim``.
        expdir: Experiment directory, stored unchanged.
    """
    if isinstance(config, ConfigParser):
        config = dict(config["acquisition"].items())
    self.config = config
    self.coder = coder
    self.expdir = expdir

    # Backward compat: fall back to the coder when output_dim is absent/falsy.
    n_classes = self.config.get("output_dim", None)
    if not n_classes:
        n_classes = coder.numlabels
    self.n_classes = n_classes

    self.model = self.build()
    logger.debug(f"Num classes: {self.n_classes}")
def prepare_cross_validation(expdir, recipe):
    """Set up a cross-validation experiment for the selected speakers.

    Copies the configuration files, seeds ``random`` from the experiment
    config, builds the coder and calls ``map_prepare_filesystem`` once per
    selected speaker.

    Args:
        expdir: Experiment directory, created if missing.
        recipe: Recipe directory with the configuration files.
    """
    os.makedirs(expdir, exist_ok=True)
    for filename in ("acquisition.cfg", "coder.cfg", "structure.xml"):
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir / filename)

    fallback_cfg = Path(__file__).parent / "defaults/cross_validation.cfg"
    cv_config = read_config(recipe / "cross_validation.cfg",
                            default=fallback_cfg)
    expconf = dict(cv_config.items("cross_validation"))

    random_seed = int(expconf.get("random_seed", 3105))
    logger.debug(f"Setting random seed to {random_seed}")
    random.seed(random_seed)

    dataconf = read_config(recipe / "database.cfg")
    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    coder_class = coder_factory(coderconf.get("coder", "name"))
    coder = coder_class(structure, coderconf)

    speakers = list(dataconf.sections())
    selection = expconf.get("speakers")
    if selection:
        # NOTE(review): `selection` is the raw config string, so this is a
        # substring test (e.g. "pp1" also matches "pp10") - confirm intended.
        speakers = [spkr for spkr in speakers if spkr in selection]
    logger.info(f"{len(speakers)} speakers selected for cross-validation")

    for speaker in speakers:
        map_prepare_filesystem(
            dict(expdir=expdir,
                 speaker=speaker,
                 coder=coder,
                 dataconf=dataconf,
                 expconf=expconf))
def evaluate(self, dataset, log=None):
    """Evaluate the model on ``dataset`` and accumulate metrics into ``log``.

    Args:
        dataset: Dataset to evaluate (must support ``len``).
        log: Optional mapping to accumulate into; a falsy value is replaced
            by a fresh ``defaultdict(int)``.

    Returns:
        The log, with ``"val_loss"`` divided by ``len(dataset)`` and the
        other metrics summed over the batches.
    """
    batch_size = 32
    self.model.eval()
    log = log or defaultdict(int)

    loader = self.make_dataloader(dataset, batch_size, train=False)
    iter_bar = tqdm(loader,
                    desc="eval",
                    total=len(dataset) // batch_size,
                    leave=False)
    logger.debug(f"Evaluate {len(dataset)} examples")

    for batch in iter_bar:
        val_loss, metrics = self.eval_one_step(*batch)
        log["val_loss"] += float(val_loss)
        for name, value in metrics.items():
            log[name] += value

        # Show this batch's loss and metrics on the progress bar.
        batch_summary = {
            k: f"{v:.3f}"
            for k, v in self.accumulate_metrics(metrics).items()
        }
        iter_bar.set_postfix(
            dict(val_loss=f"{val_loss/batch_size:.3f}", **batch_summary))

    log["val_loss"] /= len(dataset)
    return log
def train_loop(self, dataset):
    """Train for a fixed number of iterations with an Adam optimizer.

    The iteration budget is ``self.iterations`` when positive (``self.epochs``
    is then reset to 0), otherwise it is derived from ``self.epochs``, the
    dataset size and ``self.batch_size``.

    Args:
        dataset: Training data passed to ``self.batch_iterator``.

    Raises:
        AssertionError: If both ``self.epochs`` and ``self.iterations`` are
            falsy.
    """
    if self.iterations > 0:
        self.max_iter = self.iterations
        self.epochs = 0
    else:
        self.max_iter = int(
            np.ceil(self.epochs * len(dataset) / self.batch_size))
    assert any([self.epochs, self.iterations])
    # Log the budget actually used; self.iterations is 0 in epoch mode.
    logger.debug(f"Number of iterations: {self.max_iter}")

    optimizer = torch.optim.Adam(self.model.parameters())
    progress_bar = tqdm(
        total=self.max_iter,
        bar_format=
        "{postfix[1][iter]}/{postfix[0]} loss={postfix[1][loss]:.4f}",
        postfix=[self.max_iter, {
            "iter": 0,
            "loss": float('inf')
        }])

    with progress_bar:
        # Create the iterator once and restart it only when exhausted.
        # Rebuilding it on every step meant only the first batch of a fresh
        # iterator was ever consumed.
        train_iter = iter(self.batch_iterator(dataset, is_train=True))
        for iteration in range(1, self.max_iter + 1):
            try:
                batch = next(train_iter)
            except StopIteration:
                train_iter = iter(self.batch_iterator(dataset, is_train=True))
                batch = next(train_iter)
            *inputs, labels = batch
            train_loss = self.train_one_step(inputs, labels,
                                             optimizer) / len(labels)
            progress_bar.postfix[1].update({
                "iter": iteration,
                "loss": train_loss
            })
            progress_bar.update()
def gs_learning_curve(expdir, recipe, cuda=True, n_jobs=1):
    """Grid search scored by the area under the learning curve.

    Every combination of ``param_grid.json`` is used to build an
    ``RNNClassifier``; ``learning_curve`` is run on the grid-search subset and
    the AUC of the mean validation score ranks the combinations. All results
    are written to ``expdir/gs_results.json``.

    Args:
        expdir: Experiment directory prepared for grid search.
        recipe: Recipe directory containing ``param_grid.json``.
        cuda: Whether the models are configured for ``"cuda"`` or ``"cpu"``.
        n_jobs: Parallel jobs forwarded to ``learning_curve``.
    """
    logger.info(f"GridSearch {expdir}")

    with open(recipe / "param_grid.json") as jsonfile:
        param_grid = json.load(jsonfile)
    logger.debug(str(param_grid))
    total_params = np.prod(list(map(len, param_grid.values())))
    logger.warning(
        f"Searching {len(param_grid)} parameters, totalling {total_params} possible values."
    )

    gsconf = read_config(expdir / "gridsearch.cfg")
    default_config = dict(gsconf["acquisition"].items())
    default_config["device"] = "cuda" if cuda else "cpu"
    gsconf = dict(gsconf["gridsearch"].items())
    logger.debug(" ".join(f"{k}={v}" for k, v in gsconf.items()))

    train_sizes = np.linspace(float(gsconf["nmin"]), float(gsconf["nmax"]),
                              int(gsconf["num_trains"]))
    if gsconf["scoring"] == "accuracy":
        scoring = make_scorer(accuracy)
    else:
        scoring = gsconf["scoring"]
    gs_params = {
        "train_sizes": train_sizes,
        "cv": int(gsconf["cv_splits"]),
        "scoring": scoring,
        "n_jobs": n_jobs
    }
    logger.debug(gs_params)

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    default_config["output_dim"] = coder.numlabels

    features = FeatLoader(expdir / "gridsearchfeats").to_dict()
    with open(expdir / "gridsearchtasks") as traintasks:
        taskstrings = {
            uttid: task
            for uttid, task in map(parse_line, traintasks.readlines())
        }

    # Keep only utterances that have both features and a task.
    indices = sorted(set(features).intersection(set(taskstrings)))
    X = [features[uttid] for uttid in indices]
    y = [coder.encode(read_task(taskstrings[uttid])) for uttid in indices]

    gs_results = defaultdict(list)
    start = time()
    # Start from -inf so the first valid score always becomes the incumbent.
    # Starting from 0 left best_params unbound when no score was positive.
    best_params, best_score, best_index = None, float("-inf"), None

    for i, param_values in enumerate(product(*param_grid.values())):
        t0 = time()
        params = dict(zip(param_grid.keys(), param_values))
        config = deepcopy(default_config)
        config.update(params)
        logger.debug(config)

        model = RNNClassifier(**config)
        train_sizes, train_scores, valid_scores = learning_curve(
            model, X, y, **gs_params)
        train_score = auc(train_sizes, train_scores.mean(-1))
        test_score = auc(train_sizes, valid_scores.mean(-1))
        t1 = time()
        logger.info(
            f"model {i+1}/{total_params}: train={train_score:.3%} test={test_score:.3%} "
            f"time={t1 - t0:.1f}s elapsed={t1-start:.1f}s {params}")

        gs_results["auc_test_score"].append(test_score)
        gs_results["auc_train_score"].append(train_score)
        gs_results["params"].append(params)
        gs_results["train_sizes"].append(train_sizes)
        gs_results["train_scores"].append(train_scores)
        gs_results["test_scores"].append(valid_scores)

        if test_score > best_score:
            best_params, best_score, best_index = params, test_score, i

    if best_index is None:
        # Empty grid or only NaN scores: report it instead of crashing.
        logger.warning(
            f"Search completed in {time() - start:.2f}s. No model produced a valid score."
        )
        best_score = None
    else:
        logger.warning(
            f"Search completed in {time() - start:.2f}s. Best model: {best_params} ({best_score:.2%})"
        )
        logger.warning(
            f"Test scores: {gs_results['test_scores'][best_index].mean(-1)}")

    with open(expdir / "gs_results.json", "w") as result_file:
        json.dump(
            {
                "best_params": best_params,
                "best_score": best_score,
                "cv_results": serialise(gs_results)
            }, result_file)
def main(expdir, cuda):
    """Decode the test set of an experiment and write its scores.

    Does nothing when ``expdir/f1`` already exists. Otherwise the model in
    ``expdir/model`` is loaded, utterances with non-finite features are
    dropped, the rest are decoded to ``expdir/dectasks`` and every metric is
    written to its own file in ``expdir``.

    Args:
        expdir: Experiment directory (converted to ``Path``).
        cuda: Whether the model device is set to ``"cuda"`` or ``"cpu"``.
    """
    expdir = Path(expdir)
    if (expdir / "f1").exists():
        logger.info(f"Results found at {expdir}")
        return
    logger.info(f"Evaluate {expdir}")

    acquisitionconf = tools.read_config(expdir / "acquisition.cfg")
    device_name = "cuda" if cuda else "cpu"
    acquisitionconf.set("acquisition", "device", device_name)
    coderconf = tools.read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")

    coder_class = coder_factory(coderconf.get('coder', 'name'))
    coder = coder_class(structure, coderconf)
    model_class = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = model_class(acquisitionconf, coder, expdir)
    logger.debug(f"Loading model at {expdir}/model")
    model.load(expdir / 'model')

    with open(expdir / "testfeats") as testfeats:
        features = {
            entry[0]: np.load(entry[1])
            for entry in map(tools.parse_line, testfeats.readlines())
        }
    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(tools.parse_line, testtasks.readlines())
            if key in features
        }
    assert len(features) == len(references)

    # Decode the test utterances. Iterate over a copy so entries can be
    # deleted from `features` and `references` while scanning.
    feats = deepcopy(features)
    # The minimum-length filter is disabled, so too_small stays at 0.
    errors, nans, too_small = 0, 0, 0
    for uttid, feat in feats.items():
        if np.isfinite(feat).all():
            continue
        nans += 1
        logger.debug(f"Removing {uttid}")
        errors += 1
        del features[uttid]
        del references[uttid]

    if errors > 0:
        logger.warning(
            f"{errors}/{len(feats)} utts removed ({too_small} too small and {nans} contained NaN)"
        )

    decoded = model.decode(features)
    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            [f"{name} {to_string(task)}\n" for name, task in decoded.items()])

    # These names are also the output file names once spaces are removed.
    metric_names = [
        "precision", "recal", "f1", "macro precision", "macro recal",
        "macro f1"
    ]
    metrics, scores = score(decoded, references)
    for metric_name, metric in zip(metric_names, metrics):
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))

    write_scores(scores, expdir)
def train(
    encoder,
    decoder,
    train_set,
    valid_set,
    enc_ckpt,
    dec_ckpt,
    max_epochs=20,
    batch_size=64,
    enc_lr=1e-5,
    dec_lr=1e-2,
    es_patience=5,
    es_criterion="val_loss",
    es_lower_is_better=True,
    device="cuda",
    enc_update_interval=1
):
    """Jointly train an encoder/decoder pair with early stopping.

    Each epoch runs one pass over ``train_set`` and one over ``valid_set``.
    Whenever the monitored validation metric improves, both modules are saved
    with ``torch.save``. Training stops after ``max_epochs`` epochs or once
    more than ``es_patience`` epochs have passed since the best epoch.

    Args:
        encoder: Module exposing ``encode(inputs, input_lengths)``.
        decoder: Module called as ``decoder(encodings, lengths, labels=...)``
            and returning ``(loss, predictions)``.
        train_set: Training dataset providing ``data_collator``.
        valid_set: Validation dataset providing ``data_collator``.
        enc_ckpt: Path where the best encoder is saved.
        dec_ckpt: Path where the best decoder is saved.
        max_epochs: Maximum number of epochs.
        batch_size: Batch size of both data loaders.
        enc_lr: Adam learning rate of the encoder.
        dec_lr: Adam learning rate of the decoder.
        es_patience: Epochs without improvement tolerated before stopping.
        es_criterion: ``"val_loss"`` monitors the validation loss; any other
            value monitors the validation error rate.
        es_lower_is_better: Whether a lower monitored value is an improvement.
        device: Device both modules and all batches are moved to.
        enc_update_interval: The encoder optimizer steps only on batches
            whose 1-based index is a multiple of this value.
    """
    encoder.to(device)
    decoder.to(device)

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=enc_lr)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=dec_lr)

    best_error_rate = float("inf")
    best_loss = float("inf")
    # Incumbent value of the monitored metric. It starts at the worst value
    # for the chosen direction; the old code always started at +inf, so
    # nothing could ever improve with es_lower_is_better=False.
    best_metric = float("inf") if es_lower_is_better else float("-inf")
    best_epoch = 0
    epoch = 0
    t0 = time()

    def should_stop(epoch):
        return (epoch - best_epoch) > es_patience or epoch >= max_epochs

    def check_early_stopping(epoch, loss, error_rate):
        nonlocal best_error_rate, best_loss, best_metric, best_epoch
        metric = loss if es_criterion == "val_loss" else error_rate
        if es_lower_is_better:
            improved = metric < best_metric
        else:
            improved = metric > best_metric
        if improved:
            best_metric = metric
            best_epoch = epoch
            best_loss = loss
            best_error_rate = error_rate
            torch.save(encoder, enc_ckpt)
            torch.save(decoder, dec_ckpt)

    def stack(accumulated, batch):
        """Append a batch array to the running array (``None`` if empty)."""
        if accumulated is None:
            return batch
        return np.concatenate([accumulated, batch], axis=0)

    # The loaders do not change between epochs, so build them once. Only the
    # training loader shuffles.
    train_lr = DataLoader(train_set,
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=train_set.data_collator)
    val_lr = DataLoader(valid_set,
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=valid_set.data_collator)

    while True:
        train_loss, val_loss = 0, 0

        encoder.train()
        decoder.train()
        predictions, target = None, None
        train_batches = tqdm(train_lr,
                             total=len(train_set)//train_lr.batch_size)
        for i, (inputs, input_lengths, labels) in enumerate(train_batches, 1):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            inputs = inputs.to(device)
            labels = labels.to(device)
            encodings, encoding_lengths = encoder.encode(inputs,
                                                         input_lengths)
            loss, preds = decoder(encodings, encoding_lengths, labels=labels)
            loss.backward()
            logger.debug(f"loss={loss.item()}")

            # Gradients are zeroed on every batch, so a skipped encoder step
            # is dropped rather than accumulated.
            if i % enc_update_interval == 0:
                encoder_optimizer.step()
            decoder_optimizer.step()

            train_loss += loss.item()
            predictions = stack(predictions, preds.detach().cpu().numpy())
            target = stack(target, labels.detach().cpu().numpy())

        train_loss = float(train_loss/len(train_set))
        train_error_rate = compute_error_rate(target, predictions)

        predictions, target = None, None
        with torch.no_grad():
            encoder.eval()
            decoder.eval()
            for inputs, input_lengths, labels in val_lr:
                inputs = inputs.to(device)
                labels = labels.to(device)
                encodings, encoding_lengths = encoder.encode(
                    inputs, input_lengths)
                loss, preds = decoder(encodings, encoding_lengths,
                                      labels=labels)
                val_loss += loss.item()
                predictions = stack(predictions,
                                    preds.detach().cpu().numpy())
                target = stack(target, labels.detach().cpu().numpy())

        val_loss = float(val_loss/len(valid_set))
        val_error_rate = compute_error_rate(target, predictions)

        epoch += 1
        logger.info(f"Epoch {epoch} [ TRAIN {train_loss:.4f} {train_error_rate:.4f} ] "
                    f"[ VALID {val_loss:.4f} {val_error_rate:.4f} ]")

        check_early_stopping(epoch, val_loss, val_error_rate)
        if should_stop(epoch):
            break

    logger.info(f"Training completed in {time() - t0:.2f} s. "
                f"Best epoch: {best_epoch} Loss: {best_loss:.3f} ER: {best_error_rate:.4%}")
def prepare_filesystem(expdir, speaker, coder, dataconf, expconf):
    """Create the per-speaker cross-validation experiment directories.

    The speaker's utterances are divided into blocks (loaded from
    ``<expdir>/<speaker>/blocks.pkl`` or created with ``make_blocks``). For a
    growing number of training blocks, ``numexp`` random train/test splits
    are drawn and written to ``<n>blocks_exp<k>`` sub-directories, each with
    ``trainfeats``, ``traintasks``, ``testfeats`` and ``testtasks``.

    Args:
        expdir: Root experiment directory (path-like supporting ``/``).
        speaker: Speaker name, i.e. a section of ``dataconf``.
        coder: Coder whose ``encode`` turns a task into a label vector.
        dataconf: Database config providing ``features`` and ``tasks``.
        expconf: Mapping with ``numexp``, ``startblocks``, ``scale`` and
            ``increment`` plus the options read by ``make_blocks``.

    Returns:
        ``[]`` when the speaker is skipped because of an error; otherwise
        ``None``.
    """
    speaker_dir = expdir / speaker
    os.makedirs(speaker_dir, exist_ok=True)

    feature_file = Path(dataconf.get(speaker, 'features'))
    # Swap only the final suffix. str.replace would rewrite every occurrence
    # of the suffix anywhere in the path.
    with open(feature_file.with_suffix(".scp")) as featfile:
        features = dict(map(parse_line, featfile.readlines()))

    with open(dataconf.get(speaker, "tasks")) as taskfile:
        task_strings = {
            f"{speaker}_{uttid}": task
            for uttid, task in map(parse_line, taskfile.readlines())
        }

    # Drop tasks that have no matching feature entry.
    for uttid in list(task_strings):
        if uttid not in features:
            logger.warning(f"Missing utterance speaker {speaker}: {uttid}")
            del task_strings[uttid]

    tasks = [
        coder.encode(read_task(task)) for task in task_strings.values()
    ]
    if not tasks:
        logger.error(f"Error with speaker {speaker}: no tasks")
        return []
    tasks = np.array(tasks)

    blocks_path = speaker_dir / "blocks.pkl"
    if blocks_path.exists():
        # NOTE: pickle must only be loaded from trusted experiment dirs.
        with open(blocks_path, "rb") as blockfile:
            blocks = pickle.load(blockfile)
    else:
        try:
            blocks = make_blocks(tasks, expconf, feature_file.parent)
        except Exception as err:
            # Best effort: a speaker that cannot be blocked is skipped.
            logger.error(f"Error with speaker {speaker}: {err}")
            return []
        with open(blocks_path, "wb") as blockfile:
            pickle.dump(blocks, blockfile)

    num_exp = int(expconf["numexp"])
    num_utts = len(tasks)

    # train_ids[b][e] / test_ids[b][e] hold the utterance indices of
    # experiment `e` trained on `b + 1` randomly chosen blocks.
    train_ids, test_ids = [], []
    for block_id in range(len(blocks) - 1):
        train_ids.append([])
        test_ids.append([])
        for exp_id in range(num_exp):
            chosen = list(
                itertools.chain.from_iterable(
                    random.sample(blocks, block_id + 1)))
            train_ids[-1].append(chosen)
            # Set membership keeps the complement linear in the number of
            # utterances.
            chosen_set = set(chosen)
            test_ids[-1].append(
                [i for i in range(num_utts) if i not in chosen_set])

    if not (train_ids and test_ids):
        logger.error(f"Error with speaker {speaker}: no utterances")
        return []

    uttids = list(task_strings)
    block_id = int(expconf['startblocks']) - 1
    while True:
        dirname = f"{block_id + 1}blocks_exp"
        num_exp = int(expconf['numexp'])
        for exp_id in range(num_exp):
            subexpdir = expdir / speaker / (dirname + str(exp_id))
            logger.debug(f"Experiment {subexpdir.name}")

            # An existing f1 file marks a finished experiment.
            if (subexpdir / "f1").exists():
                logger.info(f"Skipping {subexpdir}")
                continue

            os.makedirs(subexpdir, exist_ok=True)
            for filename in ("acquisition.cfg", "coder.cfg", "structure.xml"):
                symlink(f"../../{filename}",
                        subexpdir / filename,
                        relative=True)

            if not (subexpdir / "trainfeats").exists():
                for subset, ids in [("train", train_ids), ("test", test_ids)]:
                    subset_ids = ids[block_id][exp_id]
                    # Indices beyond the utterance list (possible with a
                    # stale blocks file) are dropped and reported.
                    utts = [
                        uttids[idx] for idx in subset_ids
                        if idx < len(uttids)
                    ]
                    if len(utts) != len(subset_ids):
                        num_lost = len(subset_ids) - len(utts)
                        logger.warning(f"Lost {num_lost} {subset} utterances")
                    logger.debug(f"Number of {subset} examples: {len(utts):,}")
                    writefile(subexpdir / f"{subset}feats",
                              {utt: features[utt] for utt in utts})
                    writefile(subexpdir / f"{subset}tasks",
                              {utt: task_strings[utt] for utt in utts})

        # Grow the number of training blocks, capped so that at least one
        # block is left for testing.
        next_block_id = (block_id + 1) * int(expconf['scale']) + int(
            expconf['increment']) - 1
        next_block_id = min(next_block_id, len(blocks) - 2)
        if block_id == next_block_id:
            break
        block_id = next_block_id
def _kl_to_target(counts, target):
    """Return KL(counts / sum(counts) || target) over the non-zero entries."""
    dist = counts / np.sum(counts)
    nonzero = np.nonzero(dist)
    return np.sum(dist[nonzero] * np.log(dist[nonzero] / target[nonzero]))


def _best_move(remove_gains, swap_gains, blocks, nblocks):
    """Return ``(utterance, target_block, gain)`` of the most useful move."""
    # Total gain of a move: gain of leaving the current block plus gain of
    # joining the target block.
    gains = remove_gains[:, np.newaxis] + swap_gains
    # Moves that keep an utterance in its own block are worth nothing.
    for b in range(nblocks):
        gains[blocks[b], b] = 0
    best = np.argmax(gains)
    utt = best // nblocks
    target_block = best % nblocks
    return utt, target_block, gains[utt, target_block]


def make_blocks(labelmat, conf, blocksdir):
    '''
    Divide the data into blocks of similar content by greedily minimising the
    KL divergence between each block's label distribution and the overall
    label distribution.

    args:
        labelmat: the label matrix of shape [numutt x numlabels]
        conf: the experiment configuration (reads "numblocks", "minblocks",
            "alllabels" and "balancedblocks")
        blocksdir: the directory where blocks are stored

    returns:
        - the data blocks as a list containing lists of utterance indices

    raises:
        ValueError: if fewer than "minblocks" blocks can be created
    '''
    req_blocks = int(conf["numblocks"])
    min_blocks = int(conf["minblocks"])
    all_labels = conf['alllabels'].lower() == "true"
    balanced_blocks = conf['balancedblocks'].lower() == 'true'
    blocksdir = Path(blocksdir)
    os.makedirs(blocksdir, exist_ok=True)

    numutt = labelmat.shape[0]
    # initialise nblocks as the requested number of blocks
    nblocks = min(req_blocks, numutt)
    labelmat = labelmat.astype(int)

    if all_labels:
        # ignore labels that occur fewer than the minimum number of times
        to_count, = np.where(labelmat.sum(0) >= min_blocks)
        nblocks = int(min(nblocks, np.min(labelmat[:, to_count].sum(0))))

    while True:
        # check if the minimum number of blocks has been reached
        if nblocks < min_blocks:
            raise ValueError(f'Failed to create {min_blocks} blocks')

        blocksfile = blocksdir / f'{nblocks}blocks.pkl'
        if os.path.exists(blocksfile):
            # Reuse a previously computed division.
            # NOTE: pickle must only be loaded from trusted directories.
            with open(blocksfile, 'rb') as fid:
                return pickle.load(fid)

        # compute the average distribution of labels
        Tdist = np.sum(labelmat, 0) / np.sum(labelmat)

        # make a random initialisation for the blocks
        ind = list(np.random.permutation(range(numutt)))
        blocks = [
            ind[int(i * numutt / nblocks):int((i + 1) * numutt / nblocks)]
            for i in range(nblocks)
        ]

        # compute the label counts and the initial KLD to the mean for all
        # blocks
        clab = [np.sum(labelmat[blocks[b], :], 0) for b in range(nblocks)]
        KLD = [_kl_to_target(clab[b], Tdist) for b in range(nblocks)]

        # compute the gains for removing an utterance from its block and for
        # adding it to every other block
        remove_gains = np.zeros(numutt)
        swap_gains = np.zeros([numutt, nblocks])
        for b1 in range(nblocks):
            for u in blocks[b1]:
                remove_gains[u] = KLD[b1] - _kl_to_target(
                    clab[b1] - labelmat[u, :], Tdist)
                for b2 in range(nblocks):
                    if b1 != b2:
                        swap_gains[u, b2] = KLD[b2] - _kl_to_target(
                            clab[b2] + labelmat[u, :], Tdist)

        # find the best swap
        uc, bt, gain = _best_move(remove_gains, swap_gains, blocks, nblocks)

        while gain > 0 and balanced_blocks:
            # find the originating block
            bo = [uc in b for b in blocks].index(True)

            # apply the change
            blocks[bt].append(uc)
            blocks[bo].remove(uc)

            # update the counts and the costs for the relevant blocks
            clab[bo] = clab[bo] - labelmat[uc, :]
            clab[bt] = clab[bt] + labelmat[uc, :]
            KLD[bo] = _kl_to_target(clab[bo], Tdist)
            KLD[bt] = _kl_to_target(clab[bt], Tdist)

            # update the remove gains for the utterances in the relevant
            # blocks
            for b in [bo, bt]:
                for u in blocks[b]:
                    remove_gains[u] = KLD[b] - _kl_to_target(
                        clab[b] - labelmat[u, :], Tdist)

            # update the swap gains for all the utterances to the relevant
            # blocks
            swap_gains[uc, bt] = 0
            for b1 in range(nblocks):
                for b2 in [bt, bo]:
                    if b1 != b2:
                        for u in blocks[b1]:
                            swap_gains[u, b2] = KLD[b2] - _kl_to_target(
                                clab[b2] + labelmat[u, :], Tdist)

            # find the next best swap
            uc, bt, gain = _best_move(remove_gains, swap_gains, blocks,
                                      nblocks)

        # There are no more changes with gain. Accept the division unless all
        # labels are required and some block misses one. `all_labels` is
        # tested first because `to_count` only exists when it is set.
        if not all_labels or not any(
                any(clab[b][to_count] == 0) for b in range(nblocks)):
            with open(blocksfile, 'wb') as fid:
                pickle.dump(blocks, fid)
            break

        # if there are blocks that don't have all labels, decrement the
        # number of blocks and start over
        nblocks -= 1

    logger.debug(f"Created {nblocks} blocks (requested: {req_blocks})")
    return blocks
def save_checkpoint(self, suffix):
    """Save ``self.model`` to ``<checkpoint_dir>/ckpt-<suffix>.pt``.

    Args:
        suffix: Text placed between ``ckpt-`` and ``.pt`` in the file name.
    """
    target = "{}/ckpt-{}.pt".format(self.checkpoint_dir, suffix)
    logger.debug(f"Saving model to {target}")
    torch.save(self.model, target)