def make_reference_pipe(args: argparse.Namespace) -> Pipeline:
    """Build the pipeline that produces speech segments from the reference.

    Dispatches on the reference's format: a subtitle file is parsed into
    synthetic speech, a serialized ``npy``/``npz`` array is deserialized,
    and anything else is treated as a video whose audio is run through VAD.
    """
    fmt = _ref_format(args.reference)
    is_movie = fmt not in SUBTITLE_EXTENSIONS and fmt not in ("npy", "npz")
    # A VAD choice only makes sense when we actually decode audio.
    if not is_movie and args.vad is not None:
        logger.warning("Vad specified, but reference was not a movie")
    if fmt in SUBTITLE_EXTENSIONS:
        subtitle_pipe = make_subtitle_speech_pipeline(
            fmt=fmt,
            **override(args, encoding=args.reference_encoding or DEFAULT_ENCODING),
        )
        return cast(Pipeline, subtitle_pipe)
    if fmt in ("npy", "npz"):
        deserializer = DeserializeSpeechTransformer(args.non_speech_label)
        return Pipeline([("deserialize", deserializer)])
    # Video reference: extract speech via voice-activity detection.
    if args.reference_encoding is not None:
        logger.warning(
            "Reference srt encoding specified, but reference was a video file"
        )
    stream = args.reference_stream
    # ffmpeg stream specifiers need the leading input index ("0:").
    if stream is not None and not stream.startswith("0:"):
        stream = "0:" + stream
    extractor = VideoSpeechTransformer(
        vad=args.vad or DEFAULT_VAD,
        sample_rate=SAMPLE_RATE,
        frame_rate=args.frame_rate,
        non_speech_label=args.non_speech_label,
        start_seconds=args.start_seconds,
        ffmpeg_path=args.ffmpeg_path,
        ref_stream=stream,
        vlc_mode=args.vlc_mode,
        gui_mode=args.gui_mode,
    )
    return Pipeline([("speech_extract", extractor)])
def subpipe_maker(framerate_ratio):
    """Assemble a parse→scale→speech-extract pipeline for one framerate ratio.

    NOTE(review): `parser` and `start_seconds` are free variables captured
    from the enclosing scope — confirm against the surrounding definition.
    """
    steps = [
        ('parse', parser),
        ('scale', SubtitleScaler(framerate_ratio)),
    ]
    extractor = SubtitleSpeechTransformer(
        sample_rate=SAMPLE_RATE,
        start_seconds=start_seconds,
        framerate_ratio=framerate_ratio,
    )
    steps.append(('speech_extract', extractor))
    return Pipeline(steps)
def subpipe_maker(framerate_ratio):
    """Build a parse→scale→speech-extract pipeline for the given ratio."""
    # NOTE(review): `parser` and `start_seconds` are free variables captured
    # from the enclosing scope (not visible in this chunk) — verify there.
    return Pipeline(
        [
            ("parse", parser),
            # Rescale subtitle timestamps by the candidate framerate ratio.
            ("scale", SubtitleScaler(framerate_ratio)),
            (
                "speech_extract",
                SubtitleSpeechTransformer(
                    sample_rate=SAMPLE_RATE,
                    start_seconds=start_seconds,
                    framerate_ratio=framerate_ratio,
                ),
            ),
        ]
    )
def try_sync(args: argparse.Namespace, reference_pipe: Optional[Pipeline], result: Dict[str, Any]) -> bool:
    """Attempt to synchronize each input subtitle file against the reference.

    Populates `result` with 'offset_seconds', 'framerate_scale_factor', and
    'sync_was_successful'; returns True on success. An unexpected exception is
    captured and re-raised from the `finally` block so that the result dict is
    still updated first.
    """
    sync_was_successful = True
    exc = None  # holds an unexpected exception to re-raise after bookkeeping
    try:
        logger.info('extracting speech segments from %s...', 'stdin' if not args.srtin else 'subtitles file(s) {}'.format(args.srtin))
        # Empty srtin means "read from stdin"; normalize to a one-element list.
        if not args.srtin:
            args.srtin = [None]
        for srtin in args.srtin:
            # With no reference pipeline we can only shift/merge, not align.
            skip_sync = args.skip_sync or reference_pipe is None
            skip_infer_framerate_ratio = args.skip_infer_framerate_ratio or reference_pipe is None
            srtout = srtin if args.overwrite_input else args.srtout
            srt_pipe_maker = get_srt_pipe_maker(args, srtin)
            framerate_ratios = get_framerate_ratios_to_try(args)
            # Candidate pipelines: the identity ratio plus each framerate guess.
            srt_pipes = [srt_pipe_maker(1.)] + [srt_pipe_maker(rat) for rat in framerate_ratios]
            for srt_pipe in srt_pipes:
                # A callable entry is a lazy pipeline factory; only fit
                # already-constructed pipelines here.
                if callable(srt_pipe):
                    continue
                else:
                    srt_pipe.fit(srtin)
            if not skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
                # Estimate the framerate ratio from the relative lengths of the
                # reference and the first (unscaled) subtitle pipeline.
                inferred_framerate_ratio_from_length = float(reference_pipe[-1].num_frames) / cast(Pipeline, srt_pipes[0])[-1].num_frames
                logger.info('inferred frameratio ratio: %.3f' % inferred_framerate_ratio_from_length)
                srt_pipes.append(cast(Pipeline, srt_pipe_maker(inferred_framerate_ratio_from_length)).fit(srtin))
            logger.info('...done')
            logger.info('computing alignments...')
            if skip_sync:
                # No alignment: keep the unscaled pipe with zero offset.
                best_score = 0.
                best_srt_pipe = cast(Pipeline, srt_pipes[0])
                offset_samples = 0
            else:
                (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
                    FFTAligner, srtin, SAMPLE_RATE, args.max_offset_seconds
                ).fit_transform(
                    reference_pipe.transform(args.reference),
                    srt_pipes,
                )
            logger.info('...done')
            # Convert the sample-domain offset to seconds and apply any extra
            # user-requested shift.
            offset_seconds = offset_samples / float(SAMPLE_RATE) + args.apply_offset_seconds
            scale_step = best_srt_pipe.named_steps['scale']
            logger.info('score: %.3f', best_score)
            logger.info('offset seconds: %.3f', offset_seconds)
            logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
            output_steps: List[Tuple[str, TransformerMixin]] = [('shift', SubtitleShifter(offset_seconds))]
            if args.merge_with_reference:
                output_steps.append(
                    ('merge', SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
                )
            output_pipe = Pipeline(output_steps)
            out_subs = output_pipe.fit_transform(scale_step.subs_)
            if args.output_encoding != 'same':
                out_subs = out_subs.set_encoding(args.output_encoding)
            suppress_output_thresh = args.suppress_output_if_offset_less_than
            # Write output unless the user asked to suppress small offsets;
            # note suppression also triggers whenever the framerate was scaled.
            if (
                suppress_output_thresh is None or (
                    scale_step.scale_factor == 1.0 and
                    offset_seconds >= suppress_output_thresh
                )
            ):
                logger.info('writing output to {}'.format(srtout or 'stdout'))
                out_subs.write_file(srtout)
            else:
                logger.warning('suppressing output because offset %s was less than suppression threshold %s', offset_seconds, args.suppress_output_if_offset_less_than)
    except FailedToFindAlignmentException as e:
        # Expected failure mode: report it but do not propagate.
        sync_was_successful = False
        logger.error(e)
    except Exception as e:
        # Unexpected failure: record it, finish bookkeeping, then re-raise.
        exc = e
        sync_was_successful = False
        logger.error(e)
    else:
        # NOTE(review): these reflect the last iteration of the srtin loop.
        result['offset_seconds'] = offset_seconds
        result['framerate_scale_factor'] = scale_step.scale_factor
    finally:
        if exc is not None:
            raise exc
        result['sync_was_successful'] = sync_was_successful
        return sync_was_successful
def try_sync(args: argparse.Namespace, reference_pipe: Optional[Pipeline], result: Dict[str, Any]) -> bool:
    """Attempt to synchronize each input subtitle file against the reference.

    Fills `result` with 'offset_seconds', 'framerate_scale_factor', and
    'sync_was_successful'; returns True on success. Unexpected exceptions are
    stashed and re-raised from `finally` after the result dict is updated.
    """
    sync_was_successful = True
    exc = None  # unexpected exception, re-raised in `finally`
    try:
        logger.info(
            "extracting speech segments from %s...",
            "stdin" if not args.srtin else "subtitles file(s) {}".format(args.srtin),
        )
        # Empty srtin means "read from stdin"; normalize to a one-element list.
        if not args.srtin:
            args.srtin = [None]
        for srtin in args.srtin:
            # Without a reference pipeline we can only shift/merge, not align.
            skip_sync = args.skip_sync or reference_pipe is None
            skip_infer_framerate_ratio = (args.skip_infer_framerate_ratio or reference_pipe is None)
            srtout = srtin if args.overwrite_input else args.srtout
            srt_pipe_maker = get_srt_pipe_maker(args, srtin)
            framerate_ratios = get_framerate_ratios_to_try(args)
            # Candidate pipelines: identity ratio plus each framerate guess.
            srt_pipes = [srt_pipe_maker(1.0)
                         ] + [srt_pipe_maker(rat) for rat in framerate_ratios]
            for srt_pipe in srt_pipes:
                # Callable entries are lazy factories; fit only concrete pipes.
                if callable(srt_pipe):
                    continue
                else:
                    srt_pipe.fit(srtin)
            if not skip_infer_framerate_ratio and hasattr(
                    reference_pipe[-1], "num_frames"):
                # Estimate the framerate ratio from the relative lengths of
                # the reference and the first (unscaled) subtitle pipeline.
                inferred_framerate_ratio_from_length = (
                    float(reference_pipe[-1].num_frames) /
                    cast(Pipeline, srt_pipes[0])[-1].num_frames)
                logger.info("inferred frameratio ratio: %.3f" %
                            inferred_framerate_ratio_from_length)
                srt_pipes.append(
                    cast(Pipeline, srt_pipe_maker(
                        inferred_framerate_ratio_from_length)).fit(srtin))
            logger.info("...done")
            logger.info("computing alignments...")
            if skip_sync:
                # No alignment: keep the unscaled pipe with zero offset.
                best_score = 0.0
                best_srt_pipe = cast(Pipeline, srt_pipes[0])
                offset_samples = 0
            else:
                (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
                    FFTAligner, srtin, SAMPLE_RATE,
                    args.max_offset_seconds).fit_transform(
                        reference_pipe.transform(args.reference),
                        srt_pipes,
                    )
            logger.info("...done")
            # Convert the sample-domain offset to seconds, plus any extra
            # user-requested shift.
            offset_seconds = (offset_samples / float(SAMPLE_RATE) +
                              args.apply_offset_seconds)
            scale_step = best_srt_pipe.named_steps["scale"]
            logger.info("score: %.3f", best_score)
            logger.info("offset seconds: %.3f", offset_seconds)
            logger.info("framerate scale factor: %.3f", scale_step.scale_factor)
            output_steps: List[Tuple[str, TransformerMixin]] = [
                ("shift", SubtitleShifter(offset_seconds))
            ]
            if args.merge_with_reference:
                output_steps.append(
                    ("merge", SubtitleMerger(
                        reference_pipe.named_steps["parse"].subs_)))
            output_pipe = Pipeline(output_steps)
            out_subs = output_pipe.fit_transform(scale_step.subs_)
            if args.output_encoding != "same":
                out_subs = out_subs.set_encoding(args.output_encoding)
            suppress_output_thresh = args.suppress_output_if_offset_less_than
            # Write output unless the user asked to suppress small offsets;
            # note suppression also triggers whenever the framerate was scaled.
            if suppress_output_thresh is None or (
                    scale_step.scale_factor == 1.0
                    and offset_seconds >= suppress_output_thresh):
                logger.info("writing output to {}".format(srtout or "stdout"))
                out_subs.write_file(srtout)
            else:
                logger.warning(
                    "suppressing output because offset %s was less than suppression threshold %s",
                    offset_seconds,
                    args.suppress_output_if_offset_less_than,
                )
    except FailedToFindAlignmentException:
        # Expected failure mode: log with traceback but do not propagate.
        sync_was_successful = False
        logger.exception("failed to find alignment")
    except Exception as e:
        # Unexpected failure: remember it, finish bookkeeping, then re-raise.
        exc = e
        sync_was_successful = False
    else:
        # NOTE(review): these reflect the last iteration of the srtin loop.
        result["offset_seconds"] = offset_seconds
        result["framerate_scale_factor"] = scale_step.scale_factor
    finally:
        if exc is not None:
            raise exc
        result["sync_was_successful"] = sync_was_successful
        return sync_was_successful