Example #1
# Imports assume the deepspeech.pytorch project layout; adjust to your setup.
import torch
from torch.cuda.amp import autocast
from tqdm import tqdm

from deepspeech_pytorch.decoder import Decoder
from deepspeech_pytorch.validation import CharErrorRate, WordErrorRate


@torch.no_grad()  # evaluation only; no gradients are needed
def run_evaluation(
    test_loader,
    model,
    decoder: Decoder,
    device: torch.device,
    target_decoder: Decoder,
    precision: int,
):
    model.eval()
    wer = WordErrorRate(decoder=decoder, target_decoder=target_decoder)
    cer = CharErrorRate(decoder=decoder, target_decoder=target_decoder)
    for batch in tqdm(test_loader, total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = batch
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        inputs = inputs.to(device)
        # Autocast to half precision only when 16-bit inference is requested;
        # this restores the commented-out precision gating, using the integer
        # convention of the later examples (16 = half).
        with autocast(enabled=precision == 16):
            out, output_sizes = model(inputs, input_sizes)
        # WordErrorRate/CharErrorRate decode internally, so no explicit
        # decoder.decode() call is needed here.
        wer.update(preds=out,
                   preds_sizes=output_sizes,
                   targets=targets,
                   target_sizes=target_sizes)
        cer.update(preds=out,
                   preds_sizes=output_sizes,
                   targets=targets,
                   target_sizes=target_sizes)
    return wer.compute(), cer.compute()
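
A minimal invocation sketch, assuming `loader` is a deepspeech.pytorch-style
data loader that yields (inputs, targets, input_percentages, target_sizes)
batches and `model` is an already loaded acoustic model; the GreedyDecoder
construction mirrors Example #2:

# Sketch only: `loader` and `model` are assumed to exist already.
from deepspeech_pytorch.decoder import GreedyDecoder

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
greedy = GreedyDecoder(labels=model.labels,
                       blank_index=model.labels.index('_'))
wer, cer = run_evaluation(test_loader=loader, model=model, decoder=greedy,
                          device=device, target_decoder=greedy, precision=16)
print(f"WER: {wer:.2f}  CER: {cer:.2f}")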
Example #2
# Additional imports, under the same deepspeech.pytorch layout assumption.
from deepspeech_pytorch.decoder import GreedyDecoder
from deepspeech_pytorch.loader.data_loader import SpectrogramParser
from deepspeech_pytorch.model import DeepSpeech


def run_transcribe(audio_path: str, spect_parser: SpectrogramParser,
                   model: DeepSpeech, decoder: Decoder, device: torch.device,
                   use_half: bool):
    spect = spect_parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    if use_half:
        spect = spect.half()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)

    # Also decode with a plain greedy decoder, so the primary decoder's
    # transcript can be compared against a greedy baseline.
    greedy_decoder = GreedyDecoder(labels=model.labels,
                                   blank_index=model.labels.index('_'))
    decoded_output2, decoded_offsets2 = greedy_decoder.decode(out, output_sizes)

    return decoded_output, decoded_output2, decoded_offsets, decoded_offsets2
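
A sketch of calling this variant for a single file; `spect_parser`, `model`,
`decoder`, and `device` are assumed to be constructed elsewhere (e.g. via the
deepspeech.pytorch loading utilities), and 'sample.wav' is a placeholder path:

beam_out, greedy_out, _, _ = run_transcribe(audio_path='sample.wav',
                                            spect_parser=spect_parser,
                                            model=model,
                                            decoder=decoder,
                                            device=device,
                                            use_half=False)
print('primary:', beam_out[0][0])    # best hypothesis from the main decoder
print('greedy :', greedy_out[0][0])  # greedy baseline for comparison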
Example #3
def run_transcribe(audio_path: str, spect_parser: SpectrogramParser,
                   model: DeepSpeech, decoder: Decoder, device: torch.device,
                   precision: int):
    spect = spect_parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    with autocast(enabled=precision == 16):
        out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets
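
The examples mix two precision conventions: a use_half boolean (Examples #2
and #4) and an integer precision (Examples #1 and #3). A tiny illustrative
helper, not part of the original code, that bridges the two:

def use_half_to_precision(use_half: bool) -> int:
    # Map the boolean flag of the use_half variants onto the integer
    # convention used here: 16 selects half precision, 32 full precision.
    return 16 if use_half else 32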
Example #4
def run_transcribe(audio_path: str, spect_parser: SpectrogramParser,
                   model: DeepSpeech, decoder: Decoder, device: torch.device,
                   use_half: bool):
    spect = spect_parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    if use_half:
        spect = spect.half()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets
import json
import time


@torch.no_grad()  # evaluation only; no gradients are needed
def run_evaluation_print(test_loader,
                         model,
                         decoder: Decoder,
                         device: torch.device,
                         target_decoder: Decoder,
                         precision: int):
    # track time to complete
    start_time = time.time()

    model.eval()
    wer = WordErrorRate(
        decoder=decoder,
        target_decoder=target_decoder
    )
    cer = CharErrorRate(
        decoder=decoder,
        target_decoder=target_decoder
    )
    
    output_dict = {}
    pred, gt = [], []

    for batch in tqdm(test_loader, total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = batch
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        inputs = inputs.to(device)
        with autocast(enabled=precision == 16):
            out, output_sizes = model(inputs, input_sizes)
        decoded_output, _ = decoder.decode(out, output_sizes)
        # Unflatten the concatenated target tensor back into one label
        # sequence per utterance, then convert those to reference strings.
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        target_strings = target_decoder.convert_to_strings(split_targets)
        
        # Collect the best hypothesis and its reference for the JSON export.
        for decoded, target in zip(decoded_output, target_strings):
            transcript, reference = decoded[0], target[0]
            pred.append(transcript)
            gt.append(reference)

        wer.update(
            preds=out,
            preds_sizes=output_sizes,
            targets=targets,
            target_sizes=target_sizes
        )
        cer.update(
            preds=out,
            preds_sizes=output_sizes,
            targets=targets,
            target_sizes=target_sizes
        )
    output_dict['pred'] = pred
    output_dict['gt'] = gt

    print("--- Time taken to infer %s seconds ---" % (time.time() - start_time))
    
    # Export predictions and references to JSON for offline inspection.
    # Note: the output path is hardcoded.
    output_file_path = '/root/deepspeech/outputs/valid_output.json'
    with open(output_file_path, 'w') as fp:
        json.dump(output_dict, fp)

    return wer.compute(), cer.compute()
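
A usage sketch under the same assumptions as Example #1 (`loader` and
`greedy` already constructed); note that the JSON output path inside the
function is hardcoded:

wer, cer = run_evaluation_print(test_loader=loader, model=model,
                                decoder=greedy, device=device,
                                target_decoder=greedy, precision=16)
print(f"WER: {wer:.2f}  CER: {cer:.2f}")
# Transcripts land in /root/deepspeech/outputs/valid_output.json
# as {'pred': [...], 'gt': [...]}.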