Python tokenize 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: src.utils.tokenizer

메소드/함수: tokenize

hotexamples.com의 예제 수: 3

Python tokenize - 3개의 예제를 찾았습니다. 아래는 오픈 소스 프로젝트에서 추출한 Python src.utils.tokenizer.tokenize의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

 def validation_step(self, batch, batch_nb):
     """Run one validation batch and report its losses and error rate.

     Runs the forward pass, decodes the batch with
     ``self.transformer.inference``, compares each decoded string with the
     string rebuilt from the forward pass's output tokens via ``cal_wer``,
     and combines the CE, CTC and switch losses into one weighted loss.
     The first decoded string and the first target string are printed.

     Args:
         batch: Indexable whose items 0-3 are the feature, feature length,
             target and target length.
         batch_nb: Batch index; not used in this method.

     Returns:
         OrderedDict with 'loss', 'ce', 'switch' and 'mer', plus
         'progress_bar' and 'log' entries that both refer to one dict
         holding those values together with the current 'lr'.
     """
     special_tokens = ('[B]', '[S]', '[N]', '[T]', '[P]')

     def strip_special(text):
         # Tokenize, drop the marker tokens, and re-join with single spaces.
         kept = [tok for tok in tokenize(text) if tok not in special_tokens]
         return ' '.join(kept)

     feature = batch[0]
     feature_length = batch[1]
     target = batch[2]
     target_length = batch[3]

     # forward() also returns a feature length; it replaces the batch value
     # for the inference and CTC calls below.
     forward_out = self.forward(feature, feature_length, target,
                                target_length, True)
     (_model_output, output_token, spec_output, feature_length, ori_token,
      ori_token_length, ce_loss, switch_loss) = forward_out

     hypotheses = []
     for decoded in self.transformer.inference(feature, feature_length):
         hypotheses.append(strip_special(decoded))

     references = []
     for token_ids in output_token:
         text = self.transformer.vocab.id2string(token_ids.tolist())
         references.append(strip_special(text))

     print(hypotheses[0])
     print(references[0])

     error_rates = [
         cal_wer(ref, hyp) for ref, hyp in zip(references, hypotheses)
     ]
     mer = np.mean(error_rates)

     ctc_loss = self.transformer.cal_ctc_loss(spec_output, feature_length,
                                              ori_token, ori_token_length)
     weight = self.hparams.loss_lambda
     loss = weight * ce_loss + (1 - weight) * ctc_loss + switch_loss / 2

     metrics = {
         'loss': loss,
         'ce': ce_loss,
         'switch': switch_loss,
         'mer': mer,
         'lr': self.lr
     }
     # The same metrics dict is deliberately shared by both reporting keys.
     return OrderedDict([
         ('loss', loss),
         ('ce', ce_loss),
         ('switch', switch_loss),
         ('mer', mer),
         ('progress_bar', metrics),
         ('log', metrics),
     ])

예제 #2

파일 보기

def merge(wav_list, target_dict, extract_name_fn, manifest_csv_path):
    """Join wav files with their targets and save the result as a CSV manifest.

    Each wav path is keyed by ``extract_name_fn(path)`` and joined on that
    key with the entries of ``target_dict`` (using ``pd.merge`` on both
    indexes with its default join type). A ``duration`` column is computed
    with ``cal_duration`` and every target is replaced by
    ``combine(tokenize(target))`` before the frame is written out.

    Args:
        wav_list: Iterable of wav file paths.
        target_dict: Mapping from name to target text.
        extract_name_fn: Callable mapping a wav path to the name used as
            the join key.
        manifest_csv_path: Destination path of the CSV manifest.

    Returns:
        The string ``'done'``.
    """
    wav_df = pd.DataFrame(wav_list, columns=['wav_file'])
    wav_df.index = wav_df.wav_file.apply(extract_name_fn)
    target_df = pd.DataFrame.from_dict(target_dict,
                                       orient='index',
                                       columns=['target'])
    merged_df = pd.merge(left=wav_df,
                         right=target_df,
                         left_index=True,
                         right_index=True)
    merged_df['duration'] = merged_df['wav_file'].apply(cal_duration)
    merged_df['target'] = merged_df['target'].apply(
        lambda x: combine(tokenize(x)))
    try:
        merged_df.to_csv(manifest_csv_path, encoding='utf8')
    except Exception:
        # Best-effort fallback: retry with the default encoding. Catching
        # Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate instead of triggering
        # a second write.
        merged_df.to_csv(manifest_csv_path)
    print(f'manifest saved to {manifest_csv_path}')
    return 'done'

예제 #3

파일 보기

def extract_corpus_from_target_dict(target_dict, write_to):
    """Write one tokenized line per target in ``target_dict`` to a text file.

    Each target is stripped of surrounding whitespace, passed through
    ``tokenize`` and ``combine``, and written followed by a newline. The
    output file is opened in write mode with UTF-8 encoding, so any
    existing content is replaced. Prints ``done`` when finished.

    Args:
        target_dict: Mapping whose values are the target strings; the keys
            are not used.
        write_to: Path of the corpus file to create.
    """
    with open(write_to, 'w', encoding='utf8') as writer:
        # Only the values are needed, so iterate them directly.
        for target in target_dict.values():
            writer.write(combine(tokenize(target.strip())) + '\n')
    print('done')