def reformat_KRSL():
    np.random.seed(0)
    krsl_video_dir = os.path.join(KRSL_DIR, "videos")
    videos = list(glob.glob(os.sep.join([krsl_video_dir, "**", "*.mp4"])))
    fps_out = 25
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    np.random.shuffle(videos)
    pp = ProgressPrinter(len(videos), 15)
    print("Reformatting KRSL")
    not_images = 0
    for idx, video_path in enumerate(videos):
        out_video_path = os.sep.join([VIDEOS_DIR] + video_path.split(os.sep)[-2:])
        video_dir = os.path.split(out_video_path)[0]
        if os.path.exists(out_video_path):
            pp.omit()
            continue
        images, fps = get_images(video_path)
        if not images:
            not_images += 1
            pp.omit()
            continue
        images = resize_images(images)
        # resample the clip from its source fps to fps_out by picking evenly spaced frames
        L = len(images)
        L_out = round(L * fps_out / fps)
        images = np.array(images)
        idxs = np.linspace(0, L, L_out, endpoint=False)
        hw = images.shape[1:3]
        assert hw == (200, 360) or hw == (200, 200) or hw == (360, 200)
        images = [images[round(i)] for i in idxs]
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        # cv2.VideoWriter expects (width, height), hence the reversed (h, w)
        out = cv2.VideoWriter(out_video_path, fourcc, 25.0, hw[::-1])
        for frame in images:
            out.write(frame)
        out.release()
        pp.show(idx)
    pp.end()
    clean_anno_KRSL("train", save=True)
    clean_anno_KRSL("test", save=True)
    clean_anno_KRSL("dev", save=True)

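# Illustrative sketch only (assumption, not called by the pipeline): shows how the 25-fps
# resampling in reformat_KRSL() picks frames. L_out indices are spread evenly over the L
# source frames with np.linspace and rounded to the nearest source frame, e.g. a 90-frame
# clip at 30 fps keeps round(90 * 25 / 30) = 75 frames. Relies on the module-level numpy import.
def _demo_fps_resampling(L=90, fps=30.0, fps_out=25):
    L_out = round(L * fps_out / fps)                 # 75 output frames for the defaults
    idxs = np.linspace(0, L, L_out, endpoint=False)  # evenly spaced fractional positions
    return [round(i) for i in idxs]                  # nearest source-frame index per output frame
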
def generate_openpose_features_split(pose_estimator, split):
    with torch.no_grad():
        df = get_split_df(split)
        print(SOURCE, "Feature extraction:", STF_MODEL, split, "split")
        L = df.shape[0]
        pp = ProgressPrinter(L, 1)
        for idx in range(L):
            row = df.iloc[idx]
            video_dir, feat_path = get_video_path(row, split, feat_ext=".npy")
            if os.path.exists(feat_path):
                pp.omit()
                continue
            feat_dir = os.path.split(feat_path)[0]
            feats = pose_estimator.estimate_video_pose(video_dir)
            if not os.path.exists(feat_dir):
                os.makedirs(feat_dir)
            np.save(feat_path, feats)
            if SHOW_PROGRESS:
                pp.show(idx)
        if SHOW_PROGRESS:
            pp.end()
    print()

def convert_phoenix_to_videos():
    ph_images_dir = os.sep.join([PH_DIR, "features", "fullFrame-210x260px"])
    video_dirs = list(glob.glob(os.sep.join([ph_images_dir, '*', '*', '1'])))
    pp = ProgressPrinter(len(video_dirs), 5)
    print("Converting Images into Videos")
    for idx, video_dir in enumerate(video_dirs):
        image_paths = sorted(list(glob.glob(os.path.join(video_dir, "*.png"))))
        video_path = os.path.split(video_dir)[0] + ".mp4"
        video_path = os.sep.join([VIDEOS_DIR] + video_path.split(os.sep)[-2:])
        if os.path.exists(video_path):
            pp.omit()
            continue
        video_dir = os.path.split(video_path)[0]
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        # (width, height) matching the 210x260 full-frame PNGs
        shape = (210, 260)
        out = cv2.VideoWriter(video_path, fourcc, 25.0, shape)
        for im in image_paths:
            frame = cv2.imread(im)
            out.write(frame)
        out.release()
        pp.show(idx)
    pp.end()
    print()

def eval_split_by_lev(model, vocab, split):
    df = get_split_df(split)
    pp = ProgressPrinter(df.shape[0], 5)
    hypes = []
    gts = []
    with torch.no_grad():
        for idx in range(df.shape[0]):
            row = df.iloc[idx]
            gt = vocab.encode(row.annotation)
            video_path, feat_path = get_video_path(row, split)
            tensor_video = torch.load(feat_path).unsqueeze(0).to(DEVICE)
            pred = model(tensor_video).squeeze(1).log_softmax(dim=1).argmax(dim=1).cpu().numpy()
            # greedy CTC decode: drop blanks (index 0) and collapse consecutive repeats
            hypo = []
            for i in range(len(pred)):
                if pred[i] == 0 or (i > 0 and pred[i] == pred[i - 1]):
                    continue
                hypo.append(pred[i])
            gts += gt
            hypes += hypo
            pp.show(idx)
        pp.end()
    # map gloss IDs to single characters so the string edit distance counts token-level edits
    hypes = "".join([chr(x) for x in hypes])
    gts = "".join([chr(x) for x in gts])
    wer = Lev.distance(hypes, gts) / len(gts) * 100
    print(wer)

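# Hedged note on the chr() trick above: encoding each gloss ID as a single character lets
# Lev.distance (python-Levenshtein) count token-level insertions/deletions/substitutions,
# which is the WER numerator. A tiny made-up check (helper name is hypothetical):
def _demo_wer_via_chr():
    hypo = [5, 7, 9]     # predicted gloss IDs
    gt = [5, 8, 9, 2]    # reference gloss IDs
    dist = Lev.distance("".join(chr(x) for x in hypo), "".join(chr(x) for x in gt))
    return dist / len(gt) * 100  # one substitution + one deletion -> 50.0
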
def _build_dataset(self):
    dataset_dir = os.sep.join([END2END_DATASETS_DIR, self._get_ffm()])
    X_path = os.sep.join([dataset_dir, "X_" + self.split + ".pkl"])
    Y_path = os.sep.join([dataset_dir, "Y_" + self.split + ".pkl"])
    X_lens_path = os.sep.join([dataset_dir, "X_lens_" + self.split + ".pkl"])
    if (os.path.exists(X_path) and os.path.exists(Y_path)
            and os.path.exists(X_lens_path) and self.load):
        with open(X_path, 'rb') as f:
            self.X = pickle.load(f)
        with open(Y_path, 'rb') as f:
            self.Y = pickle.load(f)
        with open(X_lens_path, 'rb') as f:
            self.X_lens = pickle.load(f)
        print(self.split[0].upper() + self.split[1:], "dataset loaded")
    else:
        print("Building", self.split, "dataset")
        df = get_split_df(self.split)
        self.X = []
        self.Y = []
        self.X_lens = []
        pp = ProgressPrinter(df.shape[0], 5)
        for idx in range(df.shape[0]):
            row = df.iloc[idx]
            glosses = self.vocab.encode(row.annotation)
            feat_path, feat, feat_len = self._get_feat(row, glosses)
            if feat is None:
                continue
            self.X.append(feat_path)
            self.Y.append(glosses)
            self.X_lens.append(feat_len)
            if self._show_progress():
                pp.show(idx)
        if self._show_progress():
            pp.end()
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
        with open(X_path, 'wb') as f:
            pickle.dump(self.X, f)
        with open(Y_path, 'wb') as f:
            pickle.dump(self.Y, f)
        with open(X_lens_path, 'wb') as f:
            pickle.dump(self.X_lens, f)
    self.length = len(self.X)

def gen_img_feat_split(model, preprocess, split):
    if SOURCE == "KRSL" and split == "dev":
        split = "val"
    df = get_split_df(split)
    print(SOURCE, STF_MODEL, "feature extraction:", split, "split")
    L = df.shape[0]
    pp = ProgressPrinter(L, 10)
    for idx in range(L):
        row = df.iloc[idx]
        video_path, feat_path = get_video_path(row, split, stf_feat=False)
        if os.path.exists(feat_path) and not FEAT_OVERRIDE:
            pp.omit()
            continue
        feat_dir = os.path.split(feat_path)[0]
        images = get_images(video_path)
        if len(images) < 4:
            continue
        tensor_video = get_tensor_video(images, preprocess, "2D")
        inp = tensor_video.to(DEVICE)
        feat = model(inp).cpu()
        if not os.path.exists(feat_dir):
            os.makedirs(feat_dir)
        torch.save(feat, feat_path)
        if SHOW_PROGRESS:
            pp.show(idx)
    if SHOW_PROGRESS:
        pp.end()

def generate_gloss_dataset(vocab, stf_type=STF_TYPE, use_feat=USE_ST_FEAT):
    print("Generation of the Gloss-Recognition Dataset")
    model, loaded = get_end2end_model(vocab, True, stf_type, use_feat)
    mode = "3D" if stf_type else "2D"
    if not loaded:
        print("STF or SEQ2SEQ model doesn't exist")
        exit(0)
    model.eval()
    temp_stride = 4
    rerun_out_dir = os.path.join(GR_DATASET_DIR, "STF_RERUN")
    rerun_out_path = os.path.join(rerun_out_dir, STF_MODEL + ".bin")
    stf_rerun = use_feat and os.path.exists(rerun_out_path)
    if stf_rerun:
        with open(rerun_out_path, 'rb') as f:
            feats_rerun_data = pickle.load(f)
    else:
        feats_rerun_data = {"frame_n": [], "gloss_paths": [], "gloss_lens": []}
    df = get_split_df("train")
    Y = []
    X = []
    X_lens = []
    pp = ProgressPrinter(df.shape[0], 5)
    cur_n_gloss = 0
    for idx in range(df.shape[0]):
        row = df.iloc[idx]
        video_path, feat_path = get_video_path(row, "train")
        if stf_rerun:
            # reuse cached frame counts and gloss clip paths from a previous run
            frame_n = feats_rerun_data["frame_n"][idx]
            if frame_n < temp_stride:
                pp.omit()
                continue
            gloss_paths = feats_rerun_data["gloss_paths"][idx]
            gloss_lens = feats_rerun_data["gloss_lens"][idx]
            with torch.no_grad():
                tensor_video = torch.load(feat_path).unsqueeze(0).to(DEVICE)
        else:
            images = get_images(video_path)
            frame_n = len(images)
            feats_rerun_data["frame_n"].append(frame_n)
            if frame_n < temp_stride:
                pp.omit()
                feats_rerun_data["gloss_paths"].append("")
                feats_rerun_data["gloss_lens"].append(0)
                continue
            gloss_paths, gloss_lens = get_gloss_paths(images, cur_n_gloss, temp_stride, mode)
            feats_rerun_data["gloss_paths"].append(gloss_paths)
            feats_rerun_data["gloss_lens"].append(gloss_lens)
            with torch.no_grad():
                if use_feat:
                    tensor_video = torch.load(feat_path).unsqueeze(0).to(DEVICE)
                else:
                    tensor_video = get_tensor_video(images, preprocess_3d, mode).unsqueeze(0).to(DEVICE)
        X += gloss_paths
        X_lens += gloss_lens
        Y += get_decoded_prediction(model, tensor_video, vocab.encode(row.annotation))
        assert len(Y) == len(X) == len(X_lens)
        cur_n_gloss = len(X)
        if SHOW_PROGRESS:
            pp.show(idx)
    shuffle_and_save_dataset(X, X_lens, Y)
    if use_feat and not stf_rerun:
        if not os.path.exists(rerun_out_dir):
            os.makedirs(rerun_out_dir)
        with open(rerun_out_path, 'wb') as f:
            pickle.dump(feats_rerun_data, f)
    if SHOW_PROGRESS:
        pp.end()

def train_end2end(model, vocab, datasets, use_feat):
    print("END2END model training...")
    print("Features:", STF_MODEL)
    print("Save Model path:", STF_MODEL_PATH)
    print("WER path:", END2END_WER_PATH)
    optimizer = Adam(model.parameters(), lr=END2END_LR)
    loss_fn = nn.CTCLoss(zero_infinity=True)
    lr_scheduler = ReduceLROnPlateau(optimizer, factor=0.2, patience=4)
    best_wer = get_best_wer()
    curve = {"train": [], "val": []}
    current_best_wer = float("inf")
    trained = False
    # number of epochs since the validation WER last improved
    since_wer_update = 0
    try:
        for epoch in range(1, END2END_N_EPOCHS + 1):
            print("Epoch", epoch)
            for phase in ["train", "val"]:
                if phase == "train":
                    model.train()  # Set model to training mode
                else:
                    model.eval()
                dataset = datasets[phase]
                n_batches = dataset.start_epoch()
                losses = []
                hypes = []
                gts = []
                with torch.set_grad_enabled(phase == "train"):
                    pp = ProgressPrinter(n_batches, 25 if USE_ST_FEAT else 1)
                    for i in range(n_batches):
                        optimizer.zero_grad()
                        X_batch, Y_batch, Y_lens = dataset.get_batch(i)
                        X_batch = X_batch.to(DEVICE)
                        Y_batch = Y_batch.to(DEVICE)
                        preds = model(X_batch).log_softmax(dim=2)
                        T, N, V = preds.shape
                        X_lens = torch.full(size=(N,), fill_value=T, dtype=torch.int32)
                        loss = loss_fn(preds, Y_batch, X_lens, Y_lens)
                        losses.append(loss.item())
                        if phase == "train":
                            loss.backward()
                            optimizer.step()
                        out_sentences = predict_glosses(preds, decoder=None)
                        gts += [y for y in Y_batch.view(-1).tolist() if y != 0]
                        for sentence in out_sentences:
                            hypes += sentence
                        if i == 0 and SHOW_EXAMPLE:
                            pred = " ".join(vocab.decode(out_sentences[0]))
                            gt = Y_batch[0][:Y_lens[0]].tolist()
                            gt = " ".join(vocab.decode(gt))
                            print(" ", phase, 'Ex. [' + pred + ']', '[' + gt + ']')
                        if SHOW_PROGRESS:
                            pp.show(i, " ")
                    if SHOW_PROGRESS:
                        pp.end(" ")
                # corpus-level WER: gloss IDs mapped to characters so Levenshtein counts token edits
                hypes = "".join([chr(x) for x in hypes])
                gts = "".join([chr(x) for x in gts])
                phase_wer = Lev.distance(hypes, gts) / len(gts) * 100
                if phase == "train":
                    lr_scheduler.step(phase_wer)
                curve[phase].append(phase_wer)
                phase_loss = np.mean(losses)
                print(" ", phase.upper(), "WER:", phase_wer, "Loss:", phase_loss)
                if phase_wer < best_wer[phase]:
                    best_wer[phase] = phase_wer
                    save_end2end_model(model, phase, best_wer[phase])
                if phase == "val":
                    if phase_wer < current_best_wer:
                        current_best_wer = phase_wer
                        since_wer_update = 0
                    else:
                        since_wer_update += 1
                    if since_wer_update >= END2END_STOP_LIMIT and not use_feat:
                        # early stopping: reuse the KeyboardInterrupt handler to exit both loops
                        trained = True
                        raise KeyboardInterrupt
    except KeyboardInterrupt:
        pass
    if epoch >= END2END_N_EPOCHS:
        trained = True
    with open(os.path.join(VARS_DIR, "curve.pkl"), 'wb') as f:
        pickle.dump(curve, f)
    return best_wer, trained

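# Hedged sketch (assumption, not the project's actual predict_glosses implementation): the
# greedy path the training loop above relies on when decoder=None, mirroring the decode in
# eval_split_by_lev: argmax per time step, collapse consecutive repeats, drop the CTC blank (0).
def _greedy_ctc_decode_sketch(preds):
    # preds: (T, N, V) log-probabilities
    best = preds.argmax(dim=2).t().cpu().numpy()  # (N, T) best class index per time step
    sentences = []
    for path in best:
        sentences.append([int(p) for i, p in enumerate(path)
                          if p != 0 and (i == 0 or p != path[i - 1])])
    return sentences
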
def train_gloss_recog(model, datasets):
    print("GR model training...")
    print("Features:", STF_MODEL)
    best_loss = float("inf")
    optimizer = Adam(model.parameters(), lr=GR_LR)
    loss_fn = nn.CrossEntropyLoss()
    best_acc = 0
    trained = False
    for epoch in range(1, GR_N_EPOCHS + 1):
        print("Epoch", epoch)
        for phase in ['Train', 'Val']:
            if phase == 'Train':
                model.train()
            else:
                model.eval()
            dataset = datasets[phase]
            n_batches = dataset.start_epoch()
            losses = []
            correct = []
            with torch.set_grad_enabled(phase == "Train"):
                pp = ProgressPrinter(n_batches, 25)
                for i in range(n_batches):
                    if phase == "Train":
                        optimizer.zero_grad()
                    X_batch, Y_batch = dataset.get_batch(i)
                    # the 2D STF variant expects clips of exactly 8 frames; skip other batches
                    if X_batch.size(1) != 8 and STF_TYPE == 0:
                        continue
                    X_batch = X_batch.to(DEVICE)
                    Y_batch = Y_batch.to(DEVICE)
                    preds = model(X_batch)
                    loss = loss_fn(preds, Y_batch)
                    correct.append(torch.sum(preds.argmax(dim=1) == Y_batch).item())
                    losses.append(loss.item())
                    if phase == "Train":
                        loss.backward()
                        optimizer.step()
                    if SHOW_PROGRESS:
                        pp.show(i, " Loss: %.3f" % np.mean(losses))
                if SHOW_PROGRESS:
                    pp.end(" ")
            phase_loss = np.mean(losses)
            # accuracy over all processed samples, assuming full batches of GR_BATCH_SIZE
            phase_acc = sum(correct) / (len(correct) * GR_BATCH_SIZE) * 100
            print(" ", phase, "loss:", phase_loss, "phase ACC:", phase_acc)
            if phase == "Val" and phase_loss < best_loss:
                best_loss = phase_loss
                save_model(model, best_loss)
            if phase == "Val":
                best_acc = max(best_acc, phase_acc)
    if epoch >= 5:
        trained = True
    return best_acc, trained