Python Tokenizer.find_offsets 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: fairseq.tokenizer

클래스/타입: Tokenizer

메소드/함수: find_offsets

hotexamples.com에서의 예제들: 3

Python Tokenizer.find_offsets - 3개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 fairseq.tokenizer.Tokenizer.find_offsets에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

tokenize(22)

binarize(9)

add_file_to_dictionary(7)

build_dictionary(3)

find_offsets(3)

build_tokenizer(2)

segment(1)

예제 #1

파일 보기

파일: preprocess_universal.py 프로젝트: sugeeth14/multilingual-kd-pytorch

    def make_binary_dataset(input_prefix, output_prefix, lng_pair, lang,
                            num_workers):
        if not args.joined_dictionary and lang != 'en':
            dict = dictionary.Dictionary.load(tgt_dict_path)
        else:
            dict = dictionary.Dictionary.load(dict_path)

        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = f'{input_prefix}.{lng_pair}.{lang}.tok.bpe'
        if not os.path.exists(input_file):
            input_file = f'{input_prefix}.{lng_pair}.{lang}'
            if not os.path.exists(input_file):
                print("| {} not found".format(input_file))
                return
        if args.expert:
            input_file = input_file + '.e'
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                fn_without_ext = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
                pool.apply_async(binarize,
                                 (input_file, dict, fn_without_ext,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            f"{output_prefix}.{lng_pair}.{lang}.bin")
        merge_result(
            Tokenizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(f"{output_prefix}.{lng_pair}.{lang}.idx")

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))

예제 #2

파일 보기

    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        print("offsets", offsets)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, dict, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, 'bin'))
        merge_result(
            Tokenizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))

예제 #3

파일 보기

def get_offsets(input_file, num_workers):
    return Tokenizer.find_offsets(input_file, num_workers)