Exemplos de tokenize em Python, exemplos de src.utils.tokenizer.tokenize em Python

Exemplo n.º 1

0

Exibir arquivo

 def validation_step(self, batch, batch_nb):
     """Evaluate one validation batch and collect its losses and error rate.

     Args:
         batch: Indexable whose first four items are the feature, the
             feature length, the target and the target length.
         batch_nb: Batch index; not used by this method.

     Returns:
         OrderedDict with 'loss', 'ce', 'switch' and 'mer', plus
         'progress_bar' and 'log' entries that both refer to one shared
         dict holding the same metrics and ``self.lr``.
     """
     skipped_tokens = ['[B]', '[S]', '[N]', '[T]', '[P]']

     def clean(text):
         # Re-join the tokens of `text`, leaving out the skipped markers.
         kept = [tok for tok in tokenize(text) if tok not in skipped_tokens]
         return ' '.join(kept)

     feature = batch[0]
     feature_length = batch[1]
     target = batch[2]
     target_length = batch[3]

     # forward() hands back its own feature_length; that value replaces the
     # one taken from the batch for the inference and CTC calls below.
     forward_result = self.forward(feature, feature_length, target,
                                   target_length, True)
     (_model_output, output_token, spec_output, feature_length, ori_token,
      ori_token_length, ce_loss, switch_loss) = forward_result

     result_string_list = [
         clean(decoded)
         for decoded in self.transformer.inference(feature, feature_length)
     ]
     target_string_list = []
     for token_ids in output_token:
         target_text = self.transformer.vocab.id2string(token_ids.tolist())
         target_string_list.append(clean(target_text))

     # Show the first result/target pair of the batch for eyeballing.
     print(result_string_list[0])
     print(target_string_list[0])

     mers = [
         cal_wer(reference, hypothesis)
         for reference, hypothesis in zip(target_string_list,
                                          result_string_list)
     ]
     mer = np.mean(mers)

     ctc_loss = self.transformer.cal_ctc_loss(spec_output, feature_length,
                                              ori_token, ori_token_length)
     # loss_lambda weights CE against CTC; half the switch loss is added.
     loss_lambda = self.hparams.loss_lambda
     loss = (loss_lambda * ce_loss + (1 - loss_lambda) * ctc_loss +
             switch_loss / 2)

     tqdm_dict = dict(loss=loss,
                      ce=ce_loss,
                      switch=switch_loss,
                      mer=mer,
                      lr=self.lr)
     # ctc_loss only contributes through `loss`; it is not reported itself.
     output = OrderedDict([
         ('loss', loss),
         ('ce', ce_loss),
         ('switch', switch_loss),
         ('mer', mer),
         ('progress_bar', tqdm_dict),
         ('log', tqdm_dict),
     ])
     return output

Exemplo n.º 2

0

Exibir arquivo

def merge(wav_list, target_dict, extract_name_fn, manifest_csv_path):
    """Join wav paths with their targets and save the result as a CSV manifest.

    Args:
        wav_list: Wav file paths; they become the ``wav_file`` column.
        target_dict: Mapping from name to target text. Its keys are matched
            against the names derived from the wav paths.
        extract_name_fn: Callable applied to each wav path to produce the
            index key used for the join.
        manifest_csv_path: Destination handed to ``DataFrame.to_csv``.

    Returns:
        The string ``'done'``.
    """
    wav_df = pd.DataFrame(wav_list, columns=['wav_file'])
    wav_df.index = wav_df['wav_file'].apply(extract_name_fn)
    target_df = pd.DataFrame.from_dict(target_dict,
                                       orient='index',
                                       columns=['target'])
    # Index-on-index join with pandas' default `how`, so only keys present
    # on both sides end up in the manifest.
    merged_df = pd.merge(left=wav_df,
                         right=target_df,
                         left_index=True,
                         right_index=True)
    merged_df['duration'] = merged_df['wav_file'].apply(cal_duration)
    merged_df['target'] = merged_df['target'].apply(
        lambda x: combine(tokenize(x)))
    try:
        merged_df.to_csv(manifest_csv_path, encoding='utf8')
    except Exception:
        # Best-effort fallback: retry with the default encoding. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate instead of triggering a second write.
        merged_df.to_csv(manifest_csv_path)
    print(f'manifest saved to {manifest_csv_path}')
    return 'done'

Exemplo n.º 3

0

Exibir arquivo

def extract_corpus_from_target_dict(target_dict, write_to):
    """Write every target of ``target_dict`` to ``write_to``, one per line.

    Each value is stripped, passed through ``tokenize`` and ``combine``, and
    written followed by a newline to a UTF-8 text file. The keys are ignored.
    Prints ``done`` once the file has been closed.
    """
    with open(write_to, 'w', encoding='utf8') as corpus_file:
        corpus_file.writelines(
            combine(tokenize(text.strip())) + '\n'
            for _key, text in target_dict.items())
    print('done')