Example #1
0
def make_reference_pipe(args: argparse.Namespace) -> Pipeline:
    """Build the pipeline that extracts speech from the reference input.

    The reference may be a subtitle file, a serialized speech array
    (npy/npz), or anything else — which is treated as a video file.
    Each kind gets its own single-purpose pipeline.
    """
    fmt = _ref_format(args.reference)

    if fmt in SUBTITLE_EXTENSIONS:
        # VAD only makes sense when speech is extracted from audio/video.
        if args.vad is not None:
            logger.warning("Vad specified, but reference was not a movie")
        encoding = args.reference_encoding or DEFAULT_ENCODING
        pipe = make_subtitle_speech_pipeline(
            fmt=fmt,
            **override(args, encoding=encoding),
        )
        return cast(Pipeline, pipe)

    if fmt in ("npy", "npz"):
        if args.vad is not None:
            logger.warning("Vad specified, but reference was not a movie")
        deserializer = DeserializeSpeechTransformer(args.non_speech_label)
        return Pipeline([("deserialize", deserializer)])

    # Fallback: treat the reference as a video file and run VAD on it.
    if args.reference_encoding is not None:
        logger.warning(
            "Reference srt encoding specified, but reference was a video file"
        )
    stream = args.reference_stream
    if stream is not None and not stream.startswith("0:"):
        # ffmpeg stream specifiers are addressed relative to input 0.
        stream = "0:" + stream
    extractor = VideoSpeechTransformer(
        vad=args.vad or DEFAULT_VAD,
        sample_rate=SAMPLE_RATE,
        frame_rate=args.frame_rate,
        non_speech_label=args.non_speech_label,
        start_seconds=args.start_seconds,
        ffmpeg_path=args.ffmpeg_path,
        ref_stream=stream,
        vlc_mode=args.vlc_mode,
        gui_mode=args.gui_mode,
    )
    return Pipeline([("speech_extract", extractor)])
Example #2
0
 def subpipe_maker(framerate_ratio):
     """Build a parse → scale → speech-extract pipeline for one ratio."""
     speech_step = SubtitleSpeechTransformer(
         sample_rate=SAMPLE_RATE,
         start_seconds=start_seconds,
         framerate_ratio=framerate_ratio,
     )
     steps = [
         ('parse', parser),
         ('scale', SubtitleScaler(framerate_ratio)),
         ('speech_extract', speech_step),
     ]
     return Pipeline(steps)
Example #3
0
 def subpipe_maker(framerate_ratio):
     """Return a pipeline that parses, scales, and extracts subtitle speech."""
     scaler = SubtitleScaler(framerate_ratio)
     extractor = SubtitleSpeechTransformer(
         sample_rate=SAMPLE_RATE,
         start_seconds=start_seconds,
         framerate_ratio=framerate_ratio,
     )
     return Pipeline(
         [("parse", parser), ("scale", scaler), ("speech_extract", extractor)]
     )
Example #4
0
def try_sync(args: argparse.Namespace, reference_pipe: Optional[Pipeline], result: Dict[str, Any]) -> bool:
    """Synchronize each input subtitle file against *reference_pipe*.

    Fills *result* with 'offset_seconds' and 'framerate_scale_factor' on
    success and always sets 'sync_was_successful'.  Returns True iff no
    error occurred.  FailedToFindAlignmentException is logged and reported
    as failure; any other exception is logged, recorded as failure, and
    re-raised from the finally block after the result bookkeeping.
    """
    sync_was_successful = True
    exc = None
    try:
        logger.info('extracting speech segments from %s...',
                    'stdin' if not args.srtin else 'subtitles file(s) {}'.format(args.srtin))
        if not args.srtin:
            # A single None entry stands for "read subtitles from stdin".
            args.srtin = [None]
        for srtin in args.srtin:
            # Without a reference pipeline there is nothing to align against.
            skip_sync = args.skip_sync or reference_pipe is None
            skip_infer_framerate_ratio = args.skip_infer_framerate_ratio or reference_pipe is None
            srtout = srtin if args.overwrite_input else args.srtout
            srt_pipe_maker = get_srt_pipe_maker(args, srtin)
            framerate_ratios = get_framerate_ratios_to_try(args)
            # One candidate pipeline per framerate ratio; ratio 1.0 first.
            srt_pipes = [srt_pipe_maker(1.)] + [srt_pipe_maker(rat) for rat in framerate_ratios]
            for srt_pipe in srt_pipes:
                if callable(srt_pipe):
                    # NOTE(review): callable entries appear to be deferred
                    # pipeline factories fitted elsewhere — confirm against
                    # get_srt_pipe_maker.
                    continue
                else:
                    srt_pipe.fit(srtin)
            if not skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
                # Guess the framerate ratio from the relative stream lengths
                # and append it as one more candidate pipeline.
                inferred_framerate_ratio_from_length = float(reference_pipe[-1].num_frames) / cast(Pipeline, srt_pipes[0])[-1].num_frames
                logger.info('inferred frameratio ratio: %.3f' % inferred_framerate_ratio_from_length)
                srt_pipes.append(cast(Pipeline, srt_pipe_maker(inferred_framerate_ratio_from_length)).fit(srtin))
                logger.info('...done')
            logger.info('computing alignments...')
            if skip_sync:
                # No alignment: keep the unscaled pipeline and a zero offset.
                best_score = 0.
                best_srt_pipe = cast(Pipeline, srt_pipes[0])
                offset_samples = 0
            else:
                (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
                    FFTAligner, srtin, SAMPLE_RATE, args.max_offset_seconds
                ).fit_transform(
                    reference_pipe.transform(args.reference),
                    srt_pipes,
                )
            logger.info('...done')
            offset_seconds = offset_samples / float(SAMPLE_RATE) + args.apply_offset_seconds
            scale_step = best_srt_pipe.named_steps['scale']
            logger.info('score: %.3f', best_score)
            logger.info('offset seconds: %.3f', offset_seconds)
            logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
            output_steps: List[Tuple[str, TransformerMixin]] = [('shift', SubtitleShifter(offset_seconds))]
            if args.merge_with_reference:
                output_steps.append(
                    ('merge', SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
                )
            output_pipe = Pipeline(output_steps)
            out_subs = output_pipe.fit_transform(scale_step.subs_)
            if args.output_encoding != 'same':
                out_subs = out_subs.set_encoding(args.output_encoding)
            suppress_output_thresh = args.suppress_output_if_offset_less_than
            # Write output only when no threshold is set, or the framerate was
            # unchanged and the offset is at least the threshold.
            # NOTE(review): negative offsets always fall below a positive
            # threshold here — confirm whether abs(offset_seconds) was intended.
            if (
                suppress_output_thresh is None
                or (
                    scale_step.scale_factor == 1.0
                    and offset_seconds >= suppress_output_thresh
                )
            ):
                logger.info('writing output to {}'.format(srtout or 'stdout'))
                out_subs.write_file(srtout)
            else:
                logger.warning('suppressing output because offset %s was less than suppression threshold %s',
                               offset_seconds, args.suppress_output_if_offset_less_than)
    except FailedToFindAlignmentException as e:
        # Expected failure mode: report it but do not propagate.
        sync_was_successful = False
        logger.error(e)
    except Exception as e:
        # Unexpected failure: remember it so the finally block can re-raise
        # after the result bookkeeping.
        exc = e
        sync_was_successful = False
        logger.error(e)
    else:
        result['offset_seconds'] = offset_seconds
        result['framerate_scale_factor'] = scale_step.scale_factor
    finally:
        if exc is not None:
            # Re-raising here aborts the finally block; the return below
            # is skipped and the exception propagates to the caller.
            raise exc
        result['sync_was_successful'] = sync_was_successful
        return sync_was_successful
Example #5
0
def try_sync(args: argparse.Namespace, reference_pipe: Optional[Pipeline],
             result: Dict[str, Any]) -> bool:
    """Synchronize each input subtitle file against *reference_pipe*.

    Fills *result* with 'offset_seconds' and 'framerate_scale_factor' on
    success and always sets 'sync_was_successful'.  Returns True iff no
    error occurred.  FailedToFindAlignmentException is logged and reported
    as failure; any other exception is recorded as failure and re-raised
    from the finally block after the result bookkeeping.
    """
    sync_was_successful = True
    exc = None
    try:
        logger.info(
            "extracting speech segments from %s...",
            "stdin"
            if not args.srtin else "subtitles file(s) {}".format(args.srtin),
        )
        if not args.srtin:
            # A single None entry stands for "read subtitles from stdin".
            args.srtin = [None]
        for srtin in args.srtin:
            # Without a reference pipeline there is nothing to align against.
            skip_sync = args.skip_sync or reference_pipe is None
            skip_infer_framerate_ratio = (args.skip_infer_framerate_ratio
                                          or reference_pipe is None)
            srtout = srtin if args.overwrite_input else args.srtout
            srt_pipe_maker = get_srt_pipe_maker(args, srtin)
            framerate_ratios = get_framerate_ratios_to_try(args)
            # One candidate pipeline per framerate ratio; ratio 1.0 first.
            srt_pipes = [srt_pipe_maker(1.0)
                         ] + [srt_pipe_maker(rat) for rat in framerate_ratios]
            for srt_pipe in srt_pipes:
                if callable(srt_pipe):
                    # NOTE(review): callable entries appear to be deferred
                    # pipeline factories fitted elsewhere — confirm against
                    # get_srt_pipe_maker.
                    continue
                else:
                    srt_pipe.fit(srtin)
            if not skip_infer_framerate_ratio and hasattr(
                    reference_pipe[-1], "num_frames"):
                # Guess the framerate ratio from the relative stream lengths
                # and append it as one more candidate pipeline.
                inferred_framerate_ratio_from_length = (
                    float(reference_pipe[-1].num_frames) /
                    cast(Pipeline, srt_pipes[0])[-1].num_frames)
                logger.info("inferred frameratio ratio: %.3f" %
                            inferred_framerate_ratio_from_length)
                srt_pipes.append(
                    cast(Pipeline,
                         srt_pipe_maker(
                             inferred_framerate_ratio_from_length)).fit(srtin))
                logger.info("...done")
            logger.info("computing alignments...")
            if skip_sync:
                # No alignment: keep the unscaled pipeline and a zero offset.
                best_score = 0.0
                best_srt_pipe = cast(Pipeline, srt_pipes[0])
                offset_samples = 0
            else:
                (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
                    FFTAligner, srtin, SAMPLE_RATE,
                    args.max_offset_seconds).fit_transform(
                        reference_pipe.transform(args.reference),
                        srt_pipes,
                    )
            logger.info("...done")
            offset_seconds = (offset_samples / float(SAMPLE_RATE) +
                              args.apply_offset_seconds)
            scale_step = best_srt_pipe.named_steps["scale"]
            logger.info("score: %.3f", best_score)
            logger.info("offset seconds: %.3f", offset_seconds)
            logger.info("framerate scale factor: %.3f",
                        scale_step.scale_factor)
            output_steps: List[Tuple[str, TransformerMixin]] = [
                ("shift", SubtitleShifter(offset_seconds))
            ]
            if args.merge_with_reference:
                output_steps.append(
                    ("merge",
                     SubtitleMerger(
                         reference_pipe.named_steps["parse"].subs_)))
            output_pipe = Pipeline(output_steps)
            out_subs = output_pipe.fit_transform(scale_step.subs_)
            if args.output_encoding != "same":
                out_subs = out_subs.set_encoding(args.output_encoding)
            suppress_output_thresh = args.suppress_output_if_offset_less_than
            # Write output only when no threshold is set, or the framerate was
            # unchanged and the offset is at least the threshold.
            # NOTE(review): negative offsets always fall below a positive
            # threshold here — confirm whether abs(offset_seconds) was intended.
            if suppress_output_thresh is None or (
                    scale_step.scale_factor == 1.0
                    and offset_seconds >= suppress_output_thresh):
                logger.info("writing output to {}".format(srtout or "stdout"))
                out_subs.write_file(srtout)
            else:
                logger.warning(
                    "suppressing output because offset %s was less than suppression threshold %s",
                    offset_seconds,
                    args.suppress_output_if_offset_less_than,
                )
    except FailedToFindAlignmentException:
        # Expected failure mode: log with traceback, do not propagate.
        sync_was_successful = False
        logger.exception("failed to find alignment")
    except Exception as e:
        # Unexpected failure: remember it so the finally block can re-raise
        # after the result bookkeeping.
        exc = e
        sync_was_successful = False
    else:
        result["offset_seconds"] = offset_seconds
        result["framerate_scale_factor"] = scale_step.scale_factor
    finally:
        if exc is not None:
            # Re-raising here aborts the finally block; the return below
            # is skipped and the exception propagates to the caller.
            raise exc
        result["sync_was_successful"] = sync_was_successful
        return sync_was_successful