Example #1
def main():
    import argparse
    global model, spect_parser, decoder, args
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
Example #2
def define_rnn(rnn_options, audio_conf):

    rnn = DeepSpeech(rnn_hidden_size=800,
                     nb_layers=5,
                     labels=rnn_options['labels'],
                     rnn_type=rnn_options['rnn_type'],
                     audio_conf=audio_conf,
                     bidirectional=True)

    parameters = rnn.parameters()

    return (rnn, parameters)
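
A minimal sketch of how define_rnn might be called. The supported_rnns mapping, the labels string, and the audio settings below are illustrative assumptions that mirror the other examples on this page, not values taken from this project.

import torch
from model import supported_rnns  # assumed deepspeech.pytorch-style module

# Hypothetical inputs; adjust to your own label set and audio settings.
audio_conf = dict(sample_rate=16000, window_size=.02,
                  window_stride=.01, window='hamming')
rnn_options = {'labels': "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ ",
               'rnn_type': supported_rnns['lstm']}

rnn, parameters = define_rnn(rnn_options, audio_conf)
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True)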
Example #3
    def __init__(self, model_path):
        """

        :param model_path:
        """
        assert os.path.exists(model_path), "Cannot find model here {}".format(
            model_path)
        self.deep_speech_model = DeepSpeech.load_model(model_path)
        self.deep_speech_model.eval()
        labels = DeepSpeech.get_labels(self.deep_speech_model)
        self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
        self.decoder = GreedyDecoder(labels)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)
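
The constructor above only wires up the model, decoder, and spectrogram parser; the class's transcription method is not shown. Below is a minimal sketch of one, modeled on the transcribe helper in Example #25, assuming the same forward signature (spect, input_sizes) and a CPU-resident model; the method name is an assumption, not part of the original project.

    def transcribe(self, audio_path):
        # Parse the audio file into a normalized spectrogram of shape (freq, time).
        spect = self.parser.parse_audio(audio_path).contiguous()
        # Add batch and channel dimensions: (1, 1, freq, time).
        spect = spect.view(1, 1, spect.size(0), spect.size(1))
        input_sizes = torch.IntTensor([spect.size(3)]).int()
        with torch.no_grad():
            out, output_sizes = self.deep_speech_model(spect, input_sizes)
        decoded_output, _ = self.decoder.decode(out, output_sizes)
        # Best hypothesis for the single utterance in the batch.
        return decoded_output[0][0]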
Example #4
def decode_results(model, decoded_output, decoded_offsets):
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
                "name": os.path.basename(args.lm_path) if args.lm_path else None,
            },
            "decoder": {
                "lm": args.lm_path is not None,
                "alpha": args.alpha if args.lm_path is not None else None,
                "beta": args.beta if args.lm_path is not None else None,
                "type": args.decoder,
            }
        }
    }
    results['_meta']['acoustic_model'].update(DeepSpeech.get_meta(model))

    for b in range(len(decoded_output)):
        for pi in range(min(args.top_paths, len(decoded_output[b]))):
            result = {'transcription': decoded_output[b][pi]}
            if args.offsets:
                result['offsets'] = decoded_offsets[b][pi]
            results['output'].append(result)
    return results
Example #5
def load_model(device, model_path, use_half):
    model = DeepSpeech.load_model(model_path)
    model.eval()
    model = model.to(device)
    if use_half:
        model = model.half()
    return model
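
A minimal way to call this helper; the device selection and the checkpoint path below are illustrative assumptions, not part of the original example.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_model(device, "models/deepspeech_final.pth", use_half=False)  # hypothetical path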
Example #6
File: utils.py, Project: ssb2920/SEM-5
def load_model(device, model_path, is_cuda):
    model = DeepSpeech.load_model(model_path)
    model.eval()
    model = model.to(device)
    if is_cuda and model.mixed_precision:
        model = convert_model_to_half(model)
    return model
Example #7
def build_deepspeech_model():
    sample_rate = 16000
    window_size = .02
    window_stride = .01
    window = 'hamming'
    noise_dir = None
    noise_prob = 0.4
    noise_min = 0.0
    noise_max = 0.5
    audio_conf = dict(sample_rate=sample_rate,
                      window_size=window_size,
                      window_stride=window_stride,
                      window=window,
                      noise_dir=noise_dir,
                      noise_prob=noise_prob,
                      noise_levels=(noise_min, noise_max))

    hidden_size = 100
    hidden_layers = 5
    labels_path = 'labels.json'
    # Load the label set referenced above so that it is actually passed to the model.
    with open(labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    rnn_type = 'gru'
    bidirectional = True
    model = DeepSpeech(rnn_hidden_size=hidden_size,
                       nb_layers=hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=bidirectional)
    return model
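
A short usage sketch for the builder above: a randomly initialised model like this is only a starting point for training, so the sketch just reports its size with DeepSpeech.get_param_size, the same helper used in several of the other examples on this page. The checkpoint path in the comment is hypothetical.

model = build_deepspeech_model()
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))
# For inference, load a trained checkpoint instead, e.g.
# model = DeepSpeech.load_model('path/to/deepspeech_final.pth')  # hypothetical path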
Example #8
    def load_state(cls, state_path):
        print("Loading state from model %s" % state_path)
        state = torch.load(state_path,
                           map_location=lambda storage, loc: storage)
        model = DeepSpeech.load_model_package(state)
        optim_state = state['optim_dict']
        amp_state = state['amp']
        epoch = int(state.get('epoch', 1)) - 1  # Index start at 0 for training
        training_step = state.get('iteration', None)
        if training_step is None:
            epoch += 1  # We saved model after epoch finished, start at the next epoch.
            training_step = 0
        else:
            training_step += 1
        avg_loss = int(state.get('avg_loss', 0))
        loss_results = state['loss_results']
        cer_results = state['cer_results']
        wer_results = state['wer_results']
        best_wer = state.get('best_wer')

        result_state = ResultState(loss_results=loss_results,
                                   cer_results=cer_results,
                                   wer_results=wer_results)
        return cls(optim_state=optim_state,
                   amp_state=amp_state,
                   model=model,
                   result_state=result_state,
                   best_wer=best_wer,
                   avg_loss=avg_loss,
                   epoch=epoch,
                   training_step=training_step)
Example #9
File: utils.py, Project: abduld/inference
def get_model(params):
    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print(
            "ERROR: GRU does not currently support activations other than tanh"
        )
        sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    return model
Example #10
def decode_results(model, decoded_output, decoded_offsets):
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
                "name": os.path.basename(args.lm_path) if args.lm_path else None,
            },
            "decoder": {
                "lm": args.lm_path is not None,
                "alpha": args.alpha if args.lm_path is not None else None,
                "beta": args.beta if args.lm_path is not None else None,
                "type": args.decoder,
            }
        }
    }
    results['_meta']['acoustic_model'].update(DeepSpeech.get_meta(model))

    for b in range(len(decoded_output)):
        for pi in range(min(args.top_paths, len(decoded_output[b]))):
            result = {'transcription': decoded_output[b][pi]}
            if args.offsets:
                result['offsets'] = decoded_offsets[b][pi].tolist()
            results['output'].append(result)
    return results
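
The dictionary returned by decode_results is typically serialized straight to JSON, e.g. as the body of a transcription response. A minimal sketch, assuming decoded_output and decoded_offsets come from decoder.decode(out, output_sizes) as in the surrounding examples:

import json

response = decode_results(model, decoded_output, decoded_offsets)
print(json.dumps(response, indent=2))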
Example #11
def load_model(device, model_path, model_name, use_half):
    if model_name == 'DeepSpeech':
        model = DeepSpeech.load_model(model_path)
    elif model_name == 'DFCNN':
        model = DFCNN.load_model(model_path)
    else:
        raise ValueError('Unsupported model name: {}'.format(model_name))
    model.eval()
    model = model.to(device)
    if use_half:
        model = model.half()
    return model
Example #12
    def on_epoch_end(self, model, optimizer, epoch, loss_results, wer_results,
                     cer_results):
        self.logger.debug("Saving checkpoint {}".format(epoch + 1))
        file_path = '%s/deepspeech_%d.pth' % (self.save_folder, epoch + 1)
        torch.save(
            DeepSpeech.serialize(model,
                                 optimizer=optimizer,
                                 epoch=epoch,
                                 loss_results=loss_results,
                                 wer_results=wer_results,
                                 cer_results=cer_results), file_path)
Example #13
    def on_batch_end(self, model, optimizer, epoch, batch_no, loss_results,
                     wer_results, cer_results, avg_loss):
        if batch_no > 0 and (batch_no + 1) % self.checkpoint_per_batch == 0:
            file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (
                self.save_folder, epoch + 1, batch_no + 1)
            self.logger.debug("Saving checkpoint model to %s" % file_path)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     iteration=batch_no,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results,
                                     avg_loss=avg_loss), file_path)
Example #14
    def __init__(self, package, input_size):
        super(M_Noise_Deepspeech, self).__init__()
        self.small_const = 1e-6
        self.T = input_size[3]
        self.K = input_size[2]
        self.m = torch.nn.Parameter(torch.Tensor(
            np.array([args.m] * self.T * self.K,
                     dtype=np.float32).reshape(self.K, self.T, 1)).cuda(),
                                    requires_grad=True)
        self.range1 = torch.Tensor(
            np.array(list(range(self.K)) * self.K * self.T).reshape(
                (self.K, self.T, self.K))).cuda()
        self.range2 = torch.Tensor(
            np.array(list(range(self.K)) * self.K * self.T).reshape(
                (self.K, self.T, self.K)).transpose()).cuda()
        self.relu = torch.nn.ReLU()
        self.deepspeech_net = DeepSpeech.load_model_package(package)
Example #15
def load_model(device, model_path, use_half):

    # use load_model method from DeepSpeech class
    model = DeepSpeech.load_model(model_path)

    # set model to eval
    model.eval()

    # put model on device (GPU/CPU)
    model = model.to(device)

    # if requested, convert the model's weights to half precision (FP16)
    if use_half:
        model = model.half()

    # return the model
    return model
Example #16
    def __init__(self, package, input_size):
        super(M_Noise_Deepspeech, self).__init__()
        small_const = 1e-6
        self.T = input_size[3]
        self.K = input_size[2]
        self.m = torch.nn.Parameter(torch.Tensor(
            np.array([0.1] * self.T * self.K,
                     dtype=np.float32).reshape(self.K, self.T, 1)),
                                    requires_grad=True)
        self.m_tile = self.m.repeat([1, 1, self.K]).cuda()
        self.range1 = torch.Tensor(
            np.array(list(range(self.K)) * self.K * self.T).reshape(
                (self.K, self.T, self.K))).cuda()
        self.range2 = torch.Tensor(
            np.array(list(range(self.K)) * self.K * self.T).reshape(
                (self.K, self.T, self.K)).transpose()).cuda()
        self.relu = torch.nn.ReLU()
        out = self.relu(self.m_tile - torch.abs(self.range1 - self.range2)) / (
            torch.pow(self.m_tile, 2) + small_const)
        self.blar = torch.mul(out, (self.m_tile > 1).float()) + torch.mul(
            (self.m_tile < 1).float(), (self.range1 == self.range2).float())
        self.deepspeech_net = DeepSpeech.load_model_package(package)
Example #17
def decode_results(model, decoded_output, decoded_offsets):
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
                "name":
                os.path.basename(args.lm_path) if args.lm_path else None,
            },
            "decoder": {
                "lm": args.lm_path is not None,
                "alpha": args.alpha if args.lm_path is not None else None,
                "beta": args.beta if args.lm_path is not None else None,
                "type": args.decoder,
            }
        }
    }
    results['_meta']['acoustic_model'].update(DeepSpeech.get_meta(model))
    transcript = ''
    print("len is : ", len(decoded_output))
    for b in range(len(decoded_output)):
        utterance = ''
        for pi in range(min(args.top_paths, len(decoded_output[b]))):
            result = {'transcription': decoded_output[b][pi]}
            #if(decoded_output[b][pi]!=" "):
            utterance += decoded_output[b][pi]
            if args.offsets:
                result['offsets'] = decoded_offsets[b][pi]
            results['output'].append(result)
        #transcript += ','
        #transcript += removerepeat(utterance)
        transcript += utterance
    transcript = removerepeat(transcript)
    transcript = transcript.lower()
    print(transcript)
    #return results
    return transcript
Example #18
input_data = torch.randn(args.num_samples, 1, 161, args.seconds * 100)
input_data = input_data.to(device)
input_data = torch.chunk(input_data, int(len(input_data) / args.batch_size))

rnn_type = args.rnn_type.lower()
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

with open(args.labels_path) as label_file:
    labels = str(''.join(json.load(label_file)))

audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size)

model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   audio_conf=audio_conf,
                   labels=labels,
                   rnn_type=supported_rnns[rnn_type],
                   mixed_precision=args.mixed_precision)
model = model.to(device)
if args.mixed_precision:
    model = convert_model_to_half(model)
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True, weight_decay=1e-5)
if args.distributed:
    model = DistributedDataParallel(model)
if args.mixed_precision:
    optimizer = FP16_Optimizer(optimizer,
                               static_loss_scale=args.static_loss_scale,
                               dynamic_loss_scale=args.dynamic_loss_scale)
Example #19
                    default=16000,
                    type=int,
                    help='Sample rate')
parser.add_argument('--window_size',
                    default=.02,
                    type=float,
                    help='Window size for spectrogram in seconds')
args = parser.parse_args()

input = torch.randn(args.batch_size, 1, 161, args.seconds * 100).cuda()

rnn_type = args.rnn_type.lower()
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   num_classes=29,
                   rnn_type=supported_rnns[rnn_type],
                   sample_rate=args.sample_rate,
                   window_size=args.window_size)

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True)
model = torch.nn.DataParallel(model).cuda()
criterion = CTCLoss()

seconds = int(args.seconds)
batch_size = int(args.batch_size)


def iteration(input_data):
    target = torch.IntTensor(int(batch_size * ((seconds * 100) / 2))).fill_(
        1)  # targets, align half of the audio
Example #20
    save_folder = args.save_folder
    os.makedirs(save_folder, exist_ok=True)  # Ensure save folder exists

    loss_results, cer_results, wer_results = torch.Tensor(args.epochs), torch.Tensor(args.epochs), torch.Tensor(
        args.epochs)
    best_wer = None
    if main_proc and args.visdom:
        visdom_logger = VisdomLogger(args.id, args.epochs)
    if main_proc and args.tensorboard:
        tensorboard_logger = TensorBoardLogger(args.id, args.log_dir, args.log_params)

    avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
        model = DeepSpeech.load_model_package(package)
        labels = model.labels 
        audio_conf = model.audio_conf
        if not args.finetune:  # Don't want to restart training
            optim_state = package['optim_dict']
            start_epoch = int(package.get('epoch', 1)) - 1  # Index start at 0 for training
            start_iter = package.get('iteration', None)
            if start_iter is None:
                start_epoch += 1  # We saved the model after the epoch finished, so start at the next epoch.
                start_iter = 0
            else:
                start_iter += 1
            avg_loss = int(package.get('avg_loss', 0))
            loss_results, cer_results, wer_results = package['loss_results'], package['cer_results'], \
                                                     package['wer_results']
            best_wer = wer_results[start_epoch]
Example #21
            cer += cer_inst
        total_cer += cer
        total_wer += wer

    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)

    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]


if __name__ == '__main__':
    if args.lm_path is None:
        print("error: LM must be provided for tuning")
        sys.exit(1)

    model = DeepSpeech.load_model(args.model_path)

    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=model.labels,
                                      normalize=True)

    logits = np.load(args.logits)
    batch_size = logits[0][0].shape[0]

    results = []

    def result_callback(result):
        results.append(result)

    p = Pool(args.num_workers)
Example #22
    def attack2(self, init_delta, target, model_path):
        self.delta2 = torch.FloatTensor(init_delta).cuda()
        self.delta2.requires_grad = True
        self.rescale = torch.ones((self.batch_size, 1)).cuda()
        self.final_deltas = [None] * self.batch_size
        self.alpha = torch.ones((self.batch_size, )).cuda() * 1
        #self.alpha = 1

        model = DeepSpeech.load_model(model_path)
        model = model.cuda()

        self.optim21 = torch.optim.Adam([self.delta2], lr=2)
        self.optim22 = torch.optim.Adam([self.delta2], lr=self.lr2)

        criterion = CTCLoss()

        th_batch = []
        psd_max_batch = []
        for ii in range(self.batch_size):
            th, _, psd_max = generate_th(self.original[ii].cpu().numpy(),
                                         fs=16000,
                                         window_size=2048)
            th_batch.append(th)
            psd_max_batch.append(psd_max)
        th_batch = np.array(th_batch)
        psd_max_batch = np.array(psd_max_batch)
        th_batch = torch.FloatTensor(th_batch).cuda()
        psd_max_batch = torch.FloatTensor(psd_max_batch).cuda()

        MAX = self.num_iterations2
        model.train()
        loss_th = [np.inf] * self.batch_size
        for i in range(MAX):
            # print out some debug information every 10 iterations
            #print(self.delta)
            apply_delta = torch.clamp(
                self.delta2, -2000,
                2000) * self.rescale  #[batch_size * max_audio_len]
            new_input = apply_delta * self.mask + self.original  #[batch_size * max_audio_len]
            #pass_in = torch.clamp(new_input + self.noise, -2**15, 2**15-1) #[batch_size * max_audio_len]
            pass_in = torch.clamp(new_input, -2**15, 2**15 - 1)
            pass_in = torch.div(pass_in, 2**15)  #[batch_size * max_audio_len]
            logits, logits_sizes = get_logits(pass_in, self.lengths.int(),
                                              model)  #[batch_size * T * H]
            logits_ = logits.transpose(0, 1)
            # loss

            loss2 = criterion(logits_, self.target_phrase, logits_sizes,
                              self.target_phrase_lengths).cuda()
            loss_value_2 = loss2.item()
            self.optim21.zero_grad()
            loss2.backward(retain_graph=True)
            self.delta2.grad = torch.sign(self.delta2.grad)
            self.optim21.step()

            loss1 = 0
            loss1_each = []
            for ii in range(self.batch_size):
                psd = psd_transform(apply_delta[ii],
                                    psd_max_batch[ii],
                                    win_length=2048,
                                    win_step=512)
                loss1 += self.alpha[ii] * torch.mean(
                    torch.relu(psd - th_batch[ii]))
                loss1_each.append(
                    torch.mean(torch.relu(psd - th_batch[ii])).item())
                #psd_num = psd.cpu().detach().numpy()
                #th_ = th_batch[ii].cpu().detach().numpy()

            loss1 = loss1 / self.batch_size
            loss_value_1 = np.mean(loss1_each)
            self.optim22.zero_grad()
            loss1.backward()
            for ii in range(self.batch_size):
                self.delta2.grad[ii] = self.alpha[ii] * torch.sign(
                    self.delta2.grad[ii])

            #grad = np.sum(self.delta2.grad.cpu().numpy())
            #if grad != grad:
            #    print("NaN")

            self.optim22.step()

            apply_delta_ = torch.clamp(self.delta2, -2000, 2000) * self.rescale

            print('loss: ', loss_value_1, loss_value_2)

            if i + 1 == 2000:
                param_groups = self.optim21.param_groups
                for g in param_groups:
                    g['lr'] = 0.1
                param_groups = self.optim22.param_groups
                for g in param_groups:
                    g['lr'] = 0.1
            if i + 1 == 3200:
                param_groups = self.optim21.param_groups
                for g in param_groups:
                    g['lr'] = 0.01
                param_groups = self.optim22.param_groups
                for g in param_groups:
                    g['lr'] = 0.01

            if (i + 1) % 10 == 0:
                decode_out, _ = self.decoder.decode(logits, logits_sizes)
                print(i + 1, decode_out[0], [target[0]])

            for ii in range(self.batch_size):
                if ((i + 1) % 50 == 0
                        and decode_out[ii] == [target[ii].upper()]) or (
                            i == MAX - 1 and self.final_deltas[ii] is None):
                    self.alpha[ii] = 1.2 * self.alpha[ii]
                    if self.alpha[ii] > 1000:
                        self.alpha[ii] = 1000
                    # Adjust the best solution found so far
                    if loss1_each[ii] < loss_th[ii]:
                        loss_th[ii] = loss1_each[ii]
                        self.final_deltas[ii] = new_input[ii][
                            0:self.lengths[ii].int()].cpu().detach().numpy()
                    print("up alpha=%f" % (self.alpha[ii]))

                if ((i + 1) % 100 == 0
                        and decode_out[ii] != [target[ii].upper()]):
                    self.alpha[ii] = 0.6 * self.alpha[ii]
                    '''
                    if self.alpha <= 100:
                        self.alpha = 100
                    else:
                        # Adjust the best solution found so far
                        print("down alpha=%f" % (self.alpha))
                    '''
                    print("down alpha=%f" % (self.alpha[ii]))
        return self.final_deltas
Example #23
    def attack1(self, audios, lengths, max_audio_len, targets, model_path):
        self.max_audio_len = max_audio_len
        self.original = torch.FloatTensor(audios).cuda()
        self.lengths = torch.FloatTensor(lengths)
        #define some variables
        self.delta1 = torch.zeros((self.batch_size, self.max_audio_len)).cuda()
        self.delta1.requires_grad = True
        self.rescale = torch.ones((self.batch_size, 1)).cuda()
        self.mask = torch.FloatTensor(
            np.array([[1 if i < l else 0 for i in range(self.max_audio_len)]
                      for l in self.lengths])).cuda()
        self.final_deltas = [None] * self.batch_size

        self.target_phrase_lengths = torch.IntTensor(self.batch_size)
        self.target_phrase = []
        for x in range(self.batch_size):
            phrase = list(
                filter(
                    None,
                    [self.labels_map.get(x)
                     for x in list(targets[x].upper())]))
            self.target_phrase_lengths[x] = len(phrase)
            self.target_phrase.extend(phrase)
        self.target_phrase = torch.IntTensor(self.target_phrase)
        #print(self.target_phrase.size(), self.target_phrase_lengths)
        model = DeepSpeech.load_model(model_path)
        model = model.cuda()
        self.optim1 = torch.optim.Adam([self.delta1], lr=self.lr1)

        criterion = CTCLoss()

        MAX = self.num_iterations1
        model.train()
        #self.noise = torch.randn(self.delta1.shape).cuda()  #[batch_size * max_audio_len]
        for i in range(MAX):

            # print out some debug information every 10 iterations
            apply_delta = torch.clamp(
                self.delta1, -2000,
                2000) * self.rescale  #[batch_size * max_audio_len]
            new_input = apply_delta * self.mask + self.original  #[batch_size * max_audio_len]
            #pass_in = torch.clamp(new_input + self.noise, -2**15, 2**15-1) #[batch_size * max_audio_len]
            pass_in = torch.clamp(new_input, -2**15, 2**15 - 1)
            pass_in = torch.div(pass_in, 2**15)  #[batch_size * max_audio_len]
            logits, logits_sizes = get_logits(pass_in, self.lengths.int(),
                                              model)  #[batch_size * T * H]
            logits_ = logits.transpose(0, 1)
            # loss
            if not np.isinf(self.l2penalty):
                loss = torch.mean(
                    (new_input - self.original)
                    **2) + self.l2penalty * criterion(
                        logits_, self.target_phrase, logits_sizes,
                        self.target_phrase_lengths).cuda()
            else:
                loss = criterion(logits_, self.target_phrase, logits_sizes,
                                 self.target_phrase_lengths).cuda()
            loss_value = loss.item()
            # optimize
            self.optim1.zero_grad()
            loss.backward()
            # grad sign
            self.delta1.grad = torch.sign(self.delta1.grad)
            self.optim1.step()

            print('loss: ', loss_value)
            if (i + 1) % 10 == 0:
                decode_out, _ = self.decoder.decode(logits, logits_sizes)
                #print(decode_out, targets)

            for ii in range(self.batch_size):
                if ((i + 1) % 10 == 0
                        and decode_out[ii] == [targets[ii].upper()]) or (
                            i == MAX - 1 and self.final_deltas[ii] is None):
                    bound_tmp = torch.max(torch.abs(self.delta1[ii])).item()
                    if self.rescale[ii][0] * 2000 > bound_tmp:
                        print("It's way over", bound_tmp / 2000.0)
                        self.rescale[ii][0] = bound_tmp / 2000.0

                    self.rescale[ii][0] *= .8

                    # Adjust the best solution found so far
                    self.final_deltas[ii] = new_input[ii].cpu().detach().numpy(
                    )
                    print("bound=%f" % (2000 * self.rescale[ii][0]))

        return self.final_deltas
Example #24
else:
    input_data = torch.randn(args.num_samples, 1, 161, args.seconds * 100).cuda()
input_data = torch.chunk(input_data, int(len(input_data) / args.batch_size))

rnn_type = args.rnn_type.lower()
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

with open(args.labels_path) as label_file:
    labels = str(''.join(json.load(label_file)))

audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size)

model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   audio_conf=audio_conf,
                   labels=labels,
                   rnn_type=supported_rnns[rnn_type])

print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4,
                            momentum=0.9, nesterov=True)
model.cuda()
if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(model)

criterion = CTCLoss()

seconds = int(args.seconds)
Example #25

def transcribe(audio_path, parser, model, decoder, cuda=False):
    spect = parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    if cuda:
        spect = spect.cuda()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets


if __name__ == '__main__':
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
Example #26
def main():
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
      print("ERROR: GRU does not currently support activations other than tanh")
      sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
      print("ERROR: We should be using ReLU RNNs")
      sys.exit()

    print("=======================================================")
    for arg in vars(args):
      print("***%s = %s " %  (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(params.epochs), torch.Tensor(params.epochs), torch.Tensor(params.epochs)
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.train_manifest, labels=labels,
                                       normalize=True, augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    train_loader = AudioDataLoader(train_dataset, batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset, batch_size=params.batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size = params.hidden_size,
                       nb_layers       = params.hidden_layers,
                       labels          = labels,
                       rnn_type        = supported_rnns[rnn_type],
                       audio_conf      = audio_conf,
                       bidirectional   = True,
                       rnn_activation  = params.rnn_act_type,
                       bias            = params.bias)

    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=params.lr,
                                momentum=params.momentum, nesterov=True,
                                weight_decay = params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))

        if args.start_epoch != -1:
          start_epoch = args.start_epoch

        loss_results[:start_epoch], cer_results[:start_epoch], wer_results[:start_epoch] = \
            package['loss_results'][:start_epoch], package['cer_results'][:start_epoch], package['wer_results'][:start_epoch]
        print(loss_results)
        epoch = start_epoch

    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
        avg_training_loss = 0
    if params.cuda:
        model         = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)

            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                (epoch + 1), (i + 1), len(train_loader), batch_time=batch_time,
                data_time=data_time, ctc_time=ctc_time, loss=losses))

            del loss
            del out

        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()

        wer, cer = eval_model( model, test_loader, decoder)

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(
            epoch + 1, wer=wer, cer=cer))

        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                            wer_results=wer_results, cer_results=cer_results),
                       file_path)
        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / params.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr']))

        if best_wer is None or best_wer > wer:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                            wer_results=wer_results, cer_results=cer_results)
                       , args.model_path)
            best_wer = wer

        avg_loss = 0

        #If set to exit at a given accuracy, exit
        if params.exit_at_acc and (best_wer <= args.acc):
            break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
      print("***%s = %s " %  (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")
Example #27
                    default=.01,
                    type=float,
                    help='Window stride for spectrogram in seconds')
parser.add_argument('--window',
                    default='hamming',
                    help='Window type for spectrogram generation')
parser.add_argument('--cuda',
                    default=True,
                    type=bool,
                    help='Use cuda to train model')
args = parser.parse_args()

if __name__ == '__main__':
    package = torch.load(args.model_path)
    model = DeepSpeech(rnn_hidden_size=package['hidden_size'],
                       nb_layers=package['hidden_layers'],
                       num_classes=package['nout'])
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(package['state_dict'])
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)
    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    decoder = ArgMaxDecoder(labels)
    parser = SpectrogramParser(audio_conf, normalize=True)
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect))
Example #28
def convert(parser):
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
      print("ERROR: GRU does not currently support activations other than tanh")
      sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
      print("ERROR: We should be using ReLU RNNs")
      sys.exit()

    print("=======================================================")
    for arg in vars(args):
      print("***%s = %s " %  (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    val_batch_size = min(8,params.batch_size_val)
    print("Using bs={} for validation. Parameter found was {}".format(val_batch_size,params.batch_size_val))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.train_manifest, labels=labels,
                                       normalize=True, augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    train_loader = AudioDataLoader(train_dataset, batch_size=params.batch_size,
                                   num_workers=(1 if params.cuda else 1))
    test_loader = AudioDataLoader(test_dataset, batch_size=val_batch_size,
                                  num_workers=(1 if params.cuda else 1))

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size = params.hidden_size,
                       nb_layers       = params.hidden_layers,
                       labels          = labels,
                       rnn_type        = supported_rnns[rnn_type],
                       audio_conf      = audio_conf,
                       bidirectional   = False,
                       rnn_activation  = params.rnn_act_type,
                       bias            = params.bias)

    parameters = model.parameters()

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        if params.cuda:
            model = model.cuda()

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    ####################################################
    #  Begin ONNX conversion
    ####################################################
    model.train(False)
    # Input to the model
    data = next(iter(train_loader))
    inputs, targets, input_percentages, target_sizes = data
    inputs = Variable(inputs, requires_grad=False)
    target_sizes = Variable(target_sizes, requires_grad=False)
    targets = Variable(targets, requires_grad=False)

    if params.cuda:
        inputs = inputs.cuda()

    x = inputs
    print(x.size())

    # Export the model
    onnx_file_path = osp.join(osp.dirname(args.continue_from),osp.basename(args.continue_from).split('.')[0]+".onnx")
    print("Saving new ONNX model to: {}".format(onnx_file_path))
    torch.onnx.export(model,                   # model being run
                      inputs,                  # model input (or a tuple for multiple inputs)
                      onnx_file_path,          # where to save the model (can be a file or file-like object)
                      export_params=True,      # store the trained parameter weights inside the model file
                      verbose=False)
Example #29
        with open(args.labels_path) as label_file:
            labels = json.load(label_file)

        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=args.bidirectional)

        state = TrainingState(model=model)
        state.init_results_tracking(epochs=args.epochs)

    # Data setup
    evaluation_decoder = GreedyDecoder(model.labels)  # Decoder used for validation
    train_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=model.labels,
                                       normalize=True,
                                       speed_volume_perturb=args.speed_volume_perturb,
                                       spec_augment=args.spec_augment)
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
Example #30
        viz = Visdom()
        opts = dict(title=args.id, ylabel='', xlabel='Epoch', legend=['Loss', 'WER', 'CER'])
        viz_window = None
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard and main_proc:
        os.makedirs(args.log_dir, exist_ok=True)
        from tensorboardX import SummaryWriter

        tensorboard_writer = SummaryWriter(args.log_dir)
    os.makedirs(save_folder, exist_ok=True)

    avg_loss, start_epoch, start_iter = 0, 0, 0
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
        model = DeepSpeech.load_model_package(package)
        labels = DeepSpeech.get_labels(model)
        audio_conf = DeepSpeech.get_audio_conf(model)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)
        if not args.finetune:  # Don't want to restart training
            if args.cuda:
                model.cuda()
            optimizer.load_state_dict(package['optim_dict'])
            start_epoch = int(package.get('epoch', 1)) - 1  # Index start at 0 for training
            start_iter = package.get('iteration', None)
            if start_iter is None:
                start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
                start_iter = 0
            else:
Example #31
    loss_results, cer_results, wer_results = torch.Tensor(
        args.epochs), torch.Tensor(args.epochs), torch.Tensor(args.epochs)
    best_wer = None
    if main_proc and args.visdom:
        visdom_logger = VisdomLogger(args.id, args.epochs)
    if main_proc and args.tensorboard:
        tensorboard_logger = TensorBoardLogger(args.id, args.log_dir,
                                               args.log_params)

    avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from,
                             map_location=lambda storage, loc: storage)
        model = DeepSpeech.load_model_package(package)
        labels = model.labels
        audio_conf = model.audio_conf
        if not args.finetune:  # Don't want to restart training
            optim_state = package['optim_dict']
            start_epoch = int(package.get(
                'epoch', 1)) - 1  # Index start at 0 for training
            start_iter = package.get('iteration', None)
            if start_iter is None:
                start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
                start_iter = 0
            else:
                start_iter += 1
            avg_loss = int(package.get('avg_loss', 0))
            loss_results, cer_results, wer_results = package['loss_results'], package['cer_results'], \
                                                     package['wer_results']
Example #32
                    legend=['Loss', 'WER', 'CER'])
        viz_window = None
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard and main_proc:
        os.makedirs(args.log_dir, exist_ok=True)
        from tensorboardX import SummaryWriter

        tensorboard_writer = SummaryWriter(args.log_dir)
    os.makedirs(save_folder, exist_ok=True)

    avg_loss, start_epoch, start_iter = 0, 0, 0
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from,
                             map_location=lambda storage, loc: storage)
        model = DeepSpeech.load_model_package(package)
        labels = DeepSpeech.get_labels(model)
        audio_conf = DeepSpeech.get_audio_conf(model)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    nesterov=True)
        if not args.finetune:  # Don't want to restart training
            optimizer.load_state_dict(package['optim_dict'])
            start_epoch = int(package.get(
                'epoch', 1)) - 1  # Index start at 0 for training
            start_iter = package.get('iteration', None)
            if start_iter is None:
                start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
                start_iter = 0
Example #33
def main():
    args = parser.parse_args()
    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(
        args.epochs), torch.Tensor(args.epochs), torch.Tensor(args.epochs)
    if args.visdom:
        from visdom import Visdom
        viz = Visdom()

        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch')
        ]

        viz_windows = [None, None, None]
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard:
        from logger import TensorBoardLogger
        try:
            os.makedirs(args.log_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                print('Directory already exists.')
                for file in os.listdir(args.log_dir):
                    file_path = os.path.join(args.log_dir, file)
                    try:
                        if os.path.isfile(file_path):
                            os.unlink(file_path)
                    except Exception as e:
                        raise
            else:
                raise
        logger = TensorBoardLogger(args.log_dir)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get(
            'epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))
        loss_results, cer_results, wer_results = package[
            'loss_results'], package['cer_results'], package['wer_results']
        if args.visdom and \
                        package['loss_results'] is not None and start_epoch > 0:  # Add previous scores to visdom graph
            x_axis = epochs[0:start_epoch]
            y_axis = [
                loss_results[0:start_epoch], wer_results[0:start_epoch],
                cer_results[0:start_epoch]
            ]
            for x in range(len(viz_windows)):
                viz_windows[x] = viz.line(
                    X=x_axis,
                    Y=y_axis[x],
                    opts=opts[x],
                )
        if args.tensorboard and \
                        package['loss_results'] is not None and start_epoch > 0:  # Previous scores to tensorboard logs
            for i in range(start_epoch):
                info = {
                    'Avg Train Loss': loss_results[i],
                    'Avg WER': wer_results[i],
                    'Avg CER': cer_results[i]
                }
                for tag, val in info.items():
                    logger.scalar_summary(tag, val, i + 1)
        if not args.no_bucketing:
            print("Using bucketing sampler for the following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                labels=labels,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            if args.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1),
                          len(train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses))
            if args.checkpoint_per_batch > 0 and i > 0 and (
                    i + 1) % args.checkpoint_per_batch == 0:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth.tar' % (
                    save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(
                    DeepSpeech.serialize(model,
                                         optimizer=optimizer,
                                         epoch=epoch,
                                         iteration=i,
                                         loss_results=loss_results,
                                         wer_results=wer_results,
                                         cer_results=cer_results,
                                         avg_loss=avg_loss), file_path)
            del loss
            del out
        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()
        for i, (data) in enumerate(test_loader):  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs, volatile=True)

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = input_percentages.mul_(int(seq_length)).int()

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(
                decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x]))
            total_cer += cer
            total_wer += wer

            if args.cuda:
                torch.cuda.synchronize()
            del out
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100
        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom:
            # epoch += 1
            x_axis = epochs[0:epoch + 1]
            y_axis = [
                loss_results[0:epoch + 1], wer_results[0:epoch + 1],
                cer_results[0:epoch + 1]
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.tensorboard:
            info = {'Avg Train Loss': avg_loss, 'Avg WER': wer, 'Avg CER': cer}
            for tag, val in info.items():
                logger.scalar_summary(tag, val, epoch + 1)
            if args.log_params:
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    logger.histo_summary(tag, to_np(value), epoch + 1)
                    logger.histo_summary(tag + '/grad', to_np(value.grad),
                                         epoch + 1)
        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results), file_path)
        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0][
            'lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))
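        # Illustrative arithmetic: with lr = 3e-4 and learning_anneal = 1.1 the
        # annealed rate is 3e-4 / 1.1 ≈ 2.73e-4; the rate thus decays geometrically
        # by a factor of learning_anneal after every epoch.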

        avg_loss = 0
        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                labels=labels,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler

    torch.save(DeepSpeech.serialize(model, optimizer=optimizer),
               args.final_model_path)
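
For reference, the sizes tensor handed to CTCLoss in the loops above is built by scaling each utterance's padded-length fraction (input_percentages) by the model's output time dimension. A minimal, self-contained sketch of that arithmetic with made-up values (the shapes and numbers below are illustrative, not taken from the script):

import torch

# Suppose the model produced T = 100 output time steps for a batch of three
# utterances whose original lengths were 100%, 80% and 50% of the padded width.
seq_length = 100
input_percentages = torch.tensor([1.0, 0.8, 0.5])

# Same arithmetic as in the training loop: fraction * T, truncated to int.
sizes = input_percentages.mul_(int(seq_length)).int()
print(sizes)  # tensor([100, 80, 50], dtype=torch.int32)
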
Example #34
parser.add_argument('--verbose',
                    action="store_true",
                    help="print out decoded output and error of each sample")
no_decoder_args = parser.add_argument_group(
    "No Decoder Options", "Configuration options for when no decoder is "
    "specified")
no_decoder_args.add_argument('--output-path',
                             default=None,
                             type=str,
                             help="Where to save raw acoustic output")
parser = add_decoder_args(parser)
args = parser.parse_args()

if __name__ == '__main__':
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = model.to(device)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
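
Example #34 breaks off inside the BeamCTCDecoder call, but the --verbose flag added above already shows what the script reports per sample: its word and character error rates. In this codebase those are edit distances normalized by the reference length and then averaged over the dataset, as in the validation loop of the long training example. A toy, self-contained sketch of that normalization (the levenshtein helper is an illustrative stand-in for decoder.wer and decoder.cer, not the project's implementation):

def levenshtein(a, b):
    # Plain dynamic-programming edit distance between two sequences.
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        curr = [i]
        for j, y in enumerate(b, 1):
            curr.append(min(prev[j] + 1,              # deletion
                            curr[j - 1] + 1,          # insertion
                            prev[j - 1] + (x != y)))  # substitution
        prev = curr
    return prev[-1]

hypothesis = "the cat sat on mat"
reference = "the cat sat on the mat"

# Word error rate: word-level edit distance / number of reference words.
wer = levenshtein(hypothesis.split(), reference.split()) / len(reference.split())
# Character error rate: character-level edit distance / reference length.
cer = levenshtein(hypothesis, reference) / len(reference)
print(wer, cer)  # roughly 0.167 and 0.182
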
Example #35
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Model save directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    avg_loss, start_epoch, start_iter = 0, 0, 0
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from,
                             map_location=lambda storage, loc: storage)
        model_teacher = DeepSpeech.load_model_package(package)
        labels = DeepSpeech.get_labels(model_teacher)
        audio_conf = DeepSpeech.get_audio_conf(model_teacher)
        parameters_teacher = model_teacher.parameters()
        optimizer_teacher = torch.optim.SGD(parameters_teacher,
                                            lr=args.lr,
                                            momentum=args.momentum,
                                            nesterov=True)

        # load student model with pretrained model
        '''
        model_student = DeepSpeech.load_model_package(package)
        parameters_student = model_student.parameters()
        optimizer_student = torch.optim.SGD(parameters_student, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)
        '''
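
The triple-quoted block above sketches the student half of a teacher-student (knowledge-distillation) setup: the teacher is restored from the checkpoint, and a second DeepSpeech model would be trained against it. The commented-out lines initialize the student from the same pretrained package; an alternative is to build a fresh, typically smaller student with the same constructor used in the training example. A hedged sketch of that variant, reusing names defined earlier in the document (supported_rnns, args, labels, audio_conf); the hidden size and layer count are illustrative assumptions, not values from the original script:

        # Hypothetical smaller student (illustrative sizes, not from the source).
        model_student = DeepSpeech(rnn_hidden_size=400,
                                   nb_layers=3,
                                   labels=labels,
                                   rnn_type=supported_rnns[args.rnn_type.lower()],
                                   audio_conf=audio_conf,
                                   bidirectional=True)
        optimizer_student = torch.optim.SGD(model_student.parameters(),
                                            lr=args.lr,
                                            momentum=args.momentum,
                                            nesterov=True)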