Example #1
def main(cfg: ServerConfig):
    global model, spect_parser, decoder, config, device, model2, model3
    global commo_model, dict_data, word_dict, char_dict
    commo_model, dict_data, word_dict, char_dict = transcribe_comma.loadModel()
    config = cfg
    model1Path = '/work/Source/deepspeech.pytorch/models/deepspeech_50_1600_gru_fpt.pth'
    logging.info('Setting up server...')
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device,
                       model_path=model1Path,
                       use_half=cfg.model.use_half)
    logging.info('Loaded model 1')
    model2Path = '/work/Source/deepspeech.pytorch/models/deepspeech_1600_lstm_16_50_vin.pth'
    model2 = load_model(device=device,
                        model_path=model2Path,
                        use_half=cfg.model.use_half)

    logging.info('Loaded model 2')
    model3Path = '/work/Source/deepspeech.pytorch/models/deepspeech_1600_vinfpt_25_50.pth'
    model3 = load_model(device=device,
                        model_path=model3Path,
                        use_half=cfg.model.use_half)
    logging.info('Loaded model 3')
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    spect_parser = SpectrogramParser(audio_conf=model.audio_conf,
                                     normalize=True)
    logging.info('Server initialised')
    app.run(host=cfg.host, port=cfg.port, debug=False, use_reloader=False)
Example #2
    def __init__(self,
                 model_path,
                 gpus=None,
                 batch_size=1,
                 lr_stage1=100,
                 lr_stage2=0.1,
                 num_iter_stage1=1000,
                 num_iter_stage2=4000,
                 labels_path='labels.json'):

        # handle attacked model
        self.device = torch.device("cuda" if gpus is None else gpus)
        self.model = load_model(device=self.device,
                                model_path=model_path,
                                use_half=False)
        self.model.eval()

        # handle training parameters
        self.num_iter_stage1 = num_iter_stage1
        self.num_iter_stage2 = num_iter_stage2
        self.batch_size = batch_size
        self.lr_stage1 = lr_stage1

        with open(labels_path) as label_file:
            label = json.load(label_file)
            self.text2label = {x: idx for idx, x in enumerate(label)}
            self.label2text = {idx: x for idx, x in enumerate(label)}

        self.ctc_loss = torch.nn.CTCLoss(blank=len(self.text2label) - 1)
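
The text2label mapping built above is what converts a transcript string into the integer targets that self.ctc_loss expects. A minimal, hypothetical helper (not part of the snippet above, shown only to illustrate the mapping) could look like:

def encode_transcript(text2label, transcript):
    # Look up each character's index, skipping characters absent from labels.json.
    return [text2label[ch] for ch in transcript if ch in text2label]

# e.g. targets = torch.IntTensor(encode_transcript(text2label, "HELLO WORLD"))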
Example #3
def transcribe(cfg: TranscribeConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)

    decoder = load_decoder(
        "beam" if cfg.lm.decoder_type == DecoderType.beam else "greedy",
        model.labels, cfg.lm.lm_path, cfg.lm.alpha, cfg.lm.beta,
        cfg.lm.cutoff_top_n, cfg.lm.cutoff_prob, cfg.lm.beam_width,
        cfg.lm.lm_workers)

    spect_parser = SpectrogramParser(audio_conf=model.audio_conf,
                                     normalize=True)

    start = time.time()
    decoded_output, decoded_offsets = run_transcribe(
        audio_path=cfg.audio_path,
        spect_parser=spect_parser,
        model=model,
        decoder=decoder,
        device=device,
        use_half=cfg.model.use_half)
    results = decode_results(decoded_output=decoded_output,
                             decoded_offsets=decoded_offsets,
                             cfg=cfg)
    end = time.time()

    print("Time taken: {}".format(end - start))
    print(json.dumps(results, ensure_ascii=False))
Example #4
def main(cfg: ServerConfig):
    global model, spect_parser, decoder, config, device
    config = cfg
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)

    decoder = load_decoder(
        "beam" if cfg.lm.decoder_type == DecoderType.beam else "greedy",
        model.labels, cfg.lm.lm_path, cfg.lm.alpha, cfg.lm.beta,
        cfg.lm.cutoff_top_n, cfg.lm.cutoff_prob, cfg.lm.beam_width,
        cfg.lm.lm_workers)

    spect_parser = SpectrogramParser(audio_conf=model.audio_conf,
                                     normalize=True)
    logging.info('Server initialised')

    serve(app, host=cfg.host, port=cfg.port)
Example #5
    def __init__(self, cfg):
        self.cfg = cfg

        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.model = load_model(
            self.device, hydra.utils.to_absolute_path(self.cfg.model_path))
        self.ckpt = torch.load(hydra.utils.to_absolute_path(
            self.cfg.model_path),
                               map_location=self.device)
        self.labels = self.ckpt['hyper_parameters']['labels']

        self.decoder = BeamCTCDecoder(labels=self.labels,
                                      lm_path=hydra.utils.to_absolute_path(
                                          self.cfg.lm_path),
                                      beam_width=self.cfg.beam_width,
                                      num_processes=self.cfg.num_workers,
                                      blank_index=self.labels.index('_'))
        self.target_decoder = GreedyDecoder(labels=self.labels,
                                            blank_index=self.labels.index('_'))

        test_dataset = SpectrogramDataset(
            audio_conf=self.cfg.spect_cfg,
            input_path=hydra.utils.to_absolute_path(cfg.test_path),
            labels=self.labels,
            normalize=True)
        self.test_loader = AudioDataLoader(test_dataset,
                                           batch_size=self.cfg.batch_size,
                                           num_workers=self.cfg.num_workers)
Example #6
def evaluate(cfg: EvalConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=hydra.utils.to_absolute_path(cfg.test_manifest),
        labels=model.labels,
        normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=cfg.batch_size,
                                  num_workers=cfg.num_workers)
    wer, cer, output_data = run_evaluation(test_loader=test_loader,
                                           device=device,
                                           model=model,
                                           decoder=decoder,
                                           target_decoder=target_decoder,
                                           save_output=cfg.save_output,
                                           verbose=cfg.verbose,
                                           use_half=cfg.model.use_half)

    print('Test Summary \t'
          'Average WER {wer:.3f}\t'
          'Average CER {cer:.3f}\t'.format(wer=wer, cer=cer))
    if cfg.save_output:
        torch.save(output_data, hydra.utils.to_absolute_path(cfg.save_output))
Example #7
def evaluate(cfg: EvalConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device, model_path=cfg.model.model_path)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    target_decoder = GreedyDecoder(labels=model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=model.spect_cfg,
                                      input_path=hydra.utils.to_absolute_path(
                                          cfg.test_path),
                                      labels=model.labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=cfg.batch_size,
                                  num_workers=cfg.num_workers)
    wer, cer = run_evaluation_print(test_loader=test_loader,
                                    device=device,
                                    model=model,
                                    decoder=decoder,
                                    target_decoder=target_decoder,
                                    precision=cfg.model.precision)

    print('Test Summary \t'
          'Average WER {wer:.3f}\t'
          'Average CER {cer:.3f}\t'.format(wer=wer, cer=cer))
Example #8
def transcribe(cfg: TranscribeConfig):
    commo_model, dict_data, word_dict, char_dict = transcribe_comma.loadModel()
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)

    spect_parser = SpectrogramParser(audio_conf=model.audio_conf,
                                     normalize=True)

    # For beam search, decoded_output is an array of shape (1 x beam_width) whose
    # elements are the candidate transcriptions, e.g.
    # [["toi đi hộc", "tôi di hoc", "tôi đi ho", ...]] with 512 elements (beam_width=512).
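    # Illustrative note (not part of the original snippet): the top-scoring hypothesis
    # for the first utterance would be decoded_output[0][0]; this snippet instead keeps
    # the full beam and passes it to decode_results() below.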

    tim1 = time.time()
    decoded_output, decoded_outputGreedy, decoded_offsets, decoded_offsets2 = run_transcribe(
        audio_path=cfg.audio_path,
        spect_parser=spect_parser,
        model=model,
        decoder=decoder,
        device=device,
        use_half=cfg.model.use_half)
    results = decode_results(decoded_output=decoded_output,
                             decoded_offsets=decoded_offsets,
                             cfg=cfg)
    results2 = decode_results(decoded_output=decoded_outputGreedy,
                              decoded_offsets=decoded_offsets2,
                              cfg=cfg)
    resp = json.dumps(results, ensure_ascii=False)

    tim2 = time.time()
    print("Audio transcribe cost : " + str(tim2 - tim1))
    results['output'][0]['transcription'] = transcribe_comma.runTranscribe(
        commo_model, dict_data, word_dict, char_dict,
        results['output'][0]['transcription'])
    results2['output'][0]['transcription'] = transcribe_comma.runTranscribe(
        commo_model, dict_data, word_dict, char_dict,
        results2['output'][0]['transcription'])

    #print("DEBUG : ", resp)
    return results['output'][0]['transcription'], results2['output'][0][
        'transcription'], results['_meta']
Example #9
def main(cfg: ServerConfig):
    global model, spect_parser, decoder, config, device
    config = cfg
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device, model_path=cfg.model.model_path)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)

    spect_parser = SpectrogramParser(audio_conf=model.spect_cfg,
                                     normalize=True)
    logging.info('Server initialised')
    app.run(host=cfg.host, port=cfg.port, debug=True, use_reloader=False)
Example #10
File: server.py  Project: mjurkus/deep_lt
def main():
    import argparse
    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(
        description='DeepSpeech transcription server')
    parser.add_argument('--host',
                        type=str,
                        default='0.0.0.0',
                        help='Host to be used by the server')
    parser.add_argument('--port',
                        type=int,
                        default=8888,
                        help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from deepspeech_pytorch.decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
Example #11
def transcribe(cfg: TranscribeConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device, model_path=cfg.model.model_path)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)

    spect_parser = SpectrogramParser(audio_conf=model.spect_cfg,
                                     normalize=True)

    decoded_output, decoded_offsets = run_transcribe(
        audio_path=hydra.utils.to_absolute_path(cfg.audio_path),
        spect_parser=spect_parser,
        model=model,
        decoder=decoder,
        device=device,
        precision=cfg.model.precision)
    results = decode_results(decoded_output=decoded_output,
                             decoded_offsets=decoded_offsets,
                             cfg=cfg)
    print(json.dumps(results))
Example #12
 def __init__(self,
              *args,
              voting_kwargs,
              niters_forward=1,
              niters_backward=1,
              batch_backward=0,
              batch_forward=0,
              load_weights_file=None,
              use_half=False,
              random_init=False,
              **kwargs):
     filename = load_weights_file if load_weights_file else "librispeech_pretrained_v2.pth"
     saved_model_dir = paths.runtime_paths().saved_model_dir
     model_path = os.path.join(saved_model_dir, filename)
     model = load_model(device="cpu",
                        model_path=model_path,
                        use_half=use_half)
     optimizer = torch.optim.AdamW(model.parameters(),
                                   lr=1e-4,
                                   weight_decay=1e-5,
                                   amsgrad=False)
     super(SmoothedDeepSpeech, self).__init__(model,
                                              *args,
                                              optimizer=optimizer,
                                              **kwargs)
     self.model_path = model_path
     self.use_half = use_half
     self.niters_forward = niters_forward
     self.niters_backward = niters_backward
     if random_init:
         for p in self._model.parameters():
             if p.dim() > 1:
                 torch.nn.init.xavier_uniform_(p)
             else:
                 torch.nn.init.zeros_(p)
     self.decoder = load_decoder_with_scores(self.decoder)
     self.set_voting_module(**voting_kwargs, **kwargs)
     self.batch_backward = batch_backward
     self.batch_forward = batch_forward
Example #13
def transcribe(cfg: TranscribeConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)

    decoder = load_decoder(labels=model.labels,
                           cfg=cfg.lm)

    spect_parser = SpectrogramParser(audio_conf=model.audio_conf,
                                     normalize=True)

    decoded_output, decoded_offsets = run_transcribe(audio_path=cfg.audio_path,
                                                     spect_parser=spect_parser,
                                                     model=model,
                                                     decoder=decoder,
                                                     device=device,
                                                     use_half=cfg.model.use_half)
    results = decode_results(decoded_output=decoded_output,
                             decoded_offsets=decoded_offsets,
                             cfg=cfg)
    print(json.dumps(results))
Example #14
File: test.py  Project: mjurkus/deep_lt
                    default=8,
                    type=int,
                    help='Number of workers used in dataloading')
parser.add_argument('--verbose',
                    action="store_true",
                    help="print out decoded output and error of each sample")
parser.add_argument('--save-output',
                    default=None,
                    help="Saves output of model from test to this file_path")
parser = add_decoder_args(parser)

if __name__ == '__main__':
    args = parser.parse_args()
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path)

    with open('labels.json') as label_file:
        labels = json.load(label_file)

    decoder = load_decoder(decoder_type=args.decoder,
                           labels=labels,
                           lm_path=args.lm_path,
                           alpha=args.alpha,
                           beta=args.beta,
                           cutoff_top_n=args.cutoff_top_n,
                           cutoff_prob=args.cutoff_prob,
                           beam_width=args.beam_width,
                           lm_workers=args.lm_workers)

    target_decoder = GreedyDecoder(labels)
Example #15
def run_quantsim_evaluation(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    import deepspeech_pytorch.model

    def wrapped_forward_function(self, x, lengths=None):
        if lengths is None:
            lengths = torch.IntTensor([_x.shape[0] for _x in x])
        return self.infer(x, lengths)

    deepspeech_pytorch.model.DeepSpeech.infer = deepspeech_pytorch.model.DeepSpeech.forward
    deepspeech_pytorch.model.DeepSpeech.forward = wrapped_forward_function

    model = load_model(device=device,
                       model_path=args.model_path,
                       use_half=False)

    decoder = load_decoder(labels=model.labels, cfg=LMConfig)

    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))

    def eval_func(model, iterations=None, device=device):
        test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                          manifest_filepath=args.test_manifest,
                                          labels=model.labels,
                                          normalize=True)

        if iterations is not None:
            test_dataset.size = iterations

        test_loader = AudioDataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers)

        wer, cer, output_data = run_evaluation(test_loader=test_loader,
                                               device=device,
                                               model=model,
                                               decoder=decoder,
                                               target_decoder=target_decoder,
                                               save_output=False,
                                               verbose=True,
                                               use_half=False)
        return wer, cer, output_data

    quant_scheme = QuantScheme.post_training_tf_enhanced

    sim = QuantizationSimModel(model.cpu(),
                               input_shapes=tuple([1, 1, 161, 500]),
                               quant_scheme=quant_scheme,
                               default_param_bw=args.default_param_bw,
                               default_output_bw=args.default_output_bw,
                               config_file=args.quantsim_config_file)

    manually_configure_quant_ops(sim)

    sim.model.to(device)
    sim.compute_encodings(eval_func,
                          forward_pass_callback_args=args.encodings_iterations)

    wer, cer, output_data = eval_func(sim.model, None)
    print('Average WER {:.4f}'.format(wer))
Example #16
parser.add_argument('--lm-alpha-to', default=3.0, type=float, help='Language model weight end tuning')
parser.add_argument('--lm-beta-from', default=0.0, type=float,
                    help='Language model word bonus (all words) start tuning')
parser.add_argument('--lm-beta-to', default=0.5, type=float,
                    help='Language model word bonus (all words) end tuning')
parser.add_argument('--lm-num-alphas', default=45, type=float, help='Number of alpha candidates for tuning')
parser.add_argument('--lm-num-betas', default=8, type=float, help='Number of beta candidates for tuning')
parser = add_decoder_args(parser)
args = parser.parse_args()

if args.lm_path is None:
    print("error: LM must be provided for tuning")
    sys.exit(1)

model = load_model(model_path=args.model_path,
                   device='cpu',
                   use_half=False)

saved_output = torch.load(args.saved_output)


def init(beam_width, blank_index, lm_path):
    global decoder
    decoder = BeamCTCDecoder(model.labels, lm_path=lm_path, beam_width=beam_width, num_processes=args.lm_workers,
                             blank_index=blank_index)


def decode_dataset(params):
    lm_alpha, lm_beta = params
    global decoder
    decoder._decoder.reset_params(lm_alpha, lm_beta)
Example #17
                    default=45,
                    type=float,
                    help='Number of alpha candidates for tuning')
parser.add_argument('--lm-num-betas',
                    default=8,
                    type=float,
                    help='Number of beta candidates for tuning')
parser = add_decoder_args(parser)
args = parser.parse_args()

if args.lm_path is None:
    print("error: LM must be provided for tuning")
    sys.exit(1)

model = load_model(
    model_path=args.model_path,
    device='cpu',
)

saved_output = torch.load(args.saved_output)

with open('labels.json') as label_file:
    labels = json.load(label_file)


def init(beam_width, blank_index, lm_path):
    global decoder
    decoder = BeamCTCDecoder(labels,
                             lm_path=lm_path,
                             beam_width=beam_width,
                             num_processes=args.lm_workers,
                             blank_index=blank_index)
Example #18
    def __init__(
        self,
        model: Optional["DeepSpeech"] = None,
        pretrained_model: Optional[str] = None,
        filename: Optional[str] = None,
        url: Optional[str] = None,
        use_half: bool = False,
        optimizer: Optional["torch.optim.Optimizer"] = None,  # type: ignore
        use_amp: bool = False,
        opt_level: str = "O1",
        decoder_type: str = "greedy",
        lm_path: str = "",
        top_paths: int = 1,
        alpha: float = 0.0,
        beta: float = 0.0,
        cutoff_top_n: int = 40,
        cutoff_prob: float = 1.0,
        beam_width: int = 10,
        lm_workers: int = 4,
        clip_values: Optional["CLIP_VALUES_TYPE"] = None,
        preprocessing_defences: Union["Preprocessor", List["Preprocessor"],
                                      None] = None,
        postprocessing_defences: Union["Postprocessor", List["Postprocessor"],
                                       None] = None,
        preprocessing: "PREPROCESSING_TYPE" = None,
        device_type: str = "gpu",
        verbose: bool = True,
    ):
        """
        Initialization of an instance PyTorchDeepSpeech.

        :param model: DeepSpeech model.
        :param pretrained_model: The choice of pretrained model if a pretrained model is required. Currently this
                                 estimator supports 3 different pretrained models consisting of `an4`, `librispeech`
                                 and `tedlium`.
        :param filename: Name of the file.
        :param url: Download URL.
        :param use_half: Whether to use FP16 for pretrained model.
        :param optimizer: The optimizer used to train the estimator.
        :param use_amp: Whether to use the automatic mixed precision tool to enable mixed precision training or
                        gradient computation, e.g. with loss gradient computation. When set to True, this option is
                        only triggered if there are GPUs available.
        :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                          values are `O0`, `O1`, `O2`, and `O3`.
        :param decoder_type: Decoder type. Either `greedy` or `beam`. This parameter is only used when users want
                             transcription outputs.
        :param lm_path: Path to an (optional) kenlm language model for use with beam search. This parameter is only
                        used when users want transcription outputs.
        :param top_paths: Number of beams to be returned. This parameter is only used when users want transcription
                          outputs.
        :param alpha: The weight used for the language model. This parameter is only used when users want transcription
                      outputs.
        :param beta: Language model word bonus (all words). This parameter is only used when users want transcription
                     outputs.
        :param cutoff_top_n: Cutoff_top_n characters with highest probs in vocabulary will be used in beam search. This
                             parameter is only used when users want transcription outputs.
        :param cutoff_prob: Cutoff probability in pruning. This parameter is only used when users want transcription
                            outputs.
        :param beam_width: The width of beam to be used. This parameter is only used when users want transcription
                           outputs.
        :param lm_workers: Number of language model processes to use. This parameter is only used when users want
                           transcription outputs.
        :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
               maximum values allowed for features. If floats are provided, these will be used as the range of all
               features. If arrays are provided, each value will be considered the bound for a feature, thus
               the shape of clip values needs to match the total number of features.
        :param preprocessing_defences: Preprocessing defence(s) to be applied by the estimator.
        :param postprocessing_defences: Postprocessing defence(s) to be applied by the estimator.
        :param preprocessing: Tuple of the form `(subtrahend, divisor)` of floats or `np.ndarray` of values to be
               used for data preprocessing. The first value will be subtracted from the input. The input will then
               be divided by the second one.
        :param device_type: Type of device to be used for model and tensors, if `cpu` run on CPU, if `gpu` run on GPU
                            if available otherwise run on CPU.
        """
        import torch  # lgtm [py/repeated-import]

        from deepspeech_pytorch.configs.inference_config import LMConfig
        from deepspeech_pytorch.enums import DecoderType
        from deepspeech_pytorch.utils import load_decoder, load_model

        # Super initialization
        super().__init__(
            model=None,
            clip_values=clip_values,
            channels_first=None,
            preprocessing_defences=preprocessing_defences,
            postprocessing_defences=postprocessing_defences,
            preprocessing=preprocessing,
        )

        self.verbose = verbose

        # Check clip values
        if self.clip_values is not None:
            if not np.all(self.clip_values[0] == -1):
                raise ValueError(
                    "This estimator requires normalized input audios with clip_vales=(-1, 1)."
                )
            if not np.all(self.clip_values[1] == 1):
                raise ValueError(
                    "This estimator requires normalized input audios with clip_vales=(-1, 1)."
                )

        # Check postprocessing defences
        if self.postprocessing_defences is not None:
            raise ValueError(
                "This estimator does not support `postprocessing_defences`.")

        # Set cpu/gpu device
        self._device: torch.device
        if device_type == "cpu" or not torch.cuda.is_available():
            self._device = torch.device("cpu")
        else:
            cuda_idx = torch.cuda.current_device()
            self._device = torch.device("cuda:{}".format(cuda_idx))

        self._input_shape = None

        # Load model
        if model is None:
            if pretrained_model == "an4":
                filename, url = (
                    "an4_pretrained_v2.pth",
                    "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/an4_pretrained_v2.pth",
                )

            elif pretrained_model == "librispeech":
                filename, url = (
                    "librispeech_pretrained_v2.pth",
                    "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/"
                    "librispeech_pretrained_v2.pth",
                )

            elif pretrained_model == "tedlium":
                filename, url = (
                    "ted_pretrained_v2.pth",
                    "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/ted_pretrained_v2.pth",
                )

            elif pretrained_model is None:
                # If model is None and no pretrained model is selected, then we need to have parameters filename and
                # url to download, extract and load the automatic speech recognition model
                if filename is None or url is None:
                    filename, url = (
                        "librispeech_pretrained_v2.pth",
                        "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/"
                        "librispeech_pretrained_v2.pth",
                    )

            else:
                raise ValueError(
                    "The input pretrained model %s is not supported." %
                    pretrained_model)

            # Download model
            model_path = get_file(filename=filename,
                                  path=config.ART_DATA_PATH,
                                  url=url,
                                  extract=False,
                                  verbose=self.verbose)

            # Then load model
            self._model = load_model(device=self._device,
                                     model_path=model_path,
                                     use_half=use_half)

        else:
            self._model = model

            # Push model to the corresponding device
            self._model.to(self._device)

        # Save first version of the optimizer
        self._optimizer = optimizer
        self._use_amp = use_amp

        # Now create a decoder
        # Create the language model config first
        lm_config = LMConfig()

        # Then setup the config
        if decoder_type == "greedy":
            lm_config.decoder_type = DecoderType.greedy
        elif decoder_type == "beam":
            lm_config.decoder_type = DecoderType.beam
        else:
            raise ValueError("Decoder type %s currently not supported." %
                             decoder_type)

        lm_config.lm_path = lm_path
        lm_config.top_paths = top_paths
        lm_config.alpha = alpha
        lm_config.beta = beta
        lm_config.cutoff_top_n = cutoff_top_n
        lm_config.cutoff_prob = cutoff_prob
        lm_config.beam_width = beam_width
        lm_config.lm_workers = lm_workers

        # Create the decoder with the lm config
        self.decoder = load_decoder(labels=self._model.labels, cfg=lm_config)

        # Setup for AMP use
        if self._use_amp:
            from apex import amp

            if self._optimizer is None:
                logger.warning(
                    "An optimizer is needed to use the automatic mixed precision tool, but none for provided. "
                    "A default optimizer is used.")

                # Create the optimizers
                parameters = self._model.parameters()
                self._optimizer = torch.optim.SGD(parameters, lr=0.01)

            if self._device.type == "cpu":
                enabled = False
            else:
                enabled = True

            self._model, self._optimizer = amp.initialize(
                models=self._model,
                optimizers=self._optimizer,
                enabled=enabled,
                opt_level=opt_level,
                loss_scale=1.0,
            )
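
A minimal usage sketch of the estimator above, assuming it is ART's PyTorchDeepSpeech class (the import path, the transcription_output flag, and the random input below are assumptions for illustration; real audio should be mono waveforms normalized to [-1, 1]):

import numpy as np
from art.estimators.speech_recognition import PyTorchDeepSpeech  # assumed import path

# Wrap the pretrained LibriSpeech checkpoint with a greedy decoder.
asr = PyTorchDeepSpeech(pretrained_model="librispeech", decoder_type="greedy")

# One second of (random) 16 kHz audio in [-1, 1], just to exercise the API.
x = np.random.uniform(-1.0, 1.0, size=(1, 16000)).astype(np.float32)

# Ask predict() for transcriptions rather than per-frame probabilities.
print(asr.predict(x, transcription_output=True))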
Example #19
                        help='PGD iteration times')

    # plot parameters
    parser.add_argument('--plot_ori_spec',
                        type=str,
                        default="None",
                        help='Path to save the original spectrogram')
    parser.add_argument('--plot_adv_spec',
                        type=str,
                        default="None",
                        help='Path to save the adversarial spectrogram')
    args = parser.parse_args()

    cfg = TranscribeConfig
    model = load_model(device="cpu",
                       model_path=args.model_path,
                       use_half=False)
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)

    sound, sample_rate = torchaudio.load(args.input_wav)
    target_sentence = args.target_sentence.upper()
    if args.output_wav == "None":
        args.output_wav = None
    attacker = Attacker(model=model,
                        sound=sound,
                        target=target_sentence,
                        decoder=decoder,
                        device=args.device,
                        save=args.output_wav)

    attacker.attack(epsilon=args.epsilon,
Example #20
    print(transcription)


if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(
        description='DeepSpeech transcription')
    arg_parser = add_inference_args(arg_parser)
    arg_parser.add_argument('--offsets',
                            dest='offsets',
                            action='store_true',
                            help='Returns time offset information')
    arg_parser = add_decoder_args(arg_parser)
    args = arg_parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    decoder = load_decoder(decoder_type=args.decoder,
                           labels=model.labels,
                           lm_path=args.lm_path,
                           alpha=args.alpha,
                           beta=args.beta,
                           cutoff_top_n=args.cutoff_top_n,
                           cutoff_prob=args.cutoff_prob,
                           beam_width=args.beam_width,
                           lm_workers=args.lm_workers)

    spect_parser = SpectrogramParser(audio_conf=model.audio_conf,
                                     normalize=True)

    vad = webrtcvad.Vad()
Example #21
    def reload_model(self):
        model = load_model(device="cpu",
                           model_path=self.model_path,
                           use_half=self.use_half)
        self._model = model.to(self._device)