Example #1
File: binarizer.py  Project: tma15/fairseq
    def _binarize_file_chunk(
        binarizer: Binarizer,
        filename: str,
        offset_start: int,
        offset_end: int,
        output_prefix: str,
        dataset_impl: str,
        vocab_size=None,
    ) -> tp.Tuple[tp.Any, BinarizeSummary]:  # (dataset builder, BinarizeSummary)
        """
        Creates a dataset builder and appends binarized items to it. This function does not
        finalize the builder, which is useful if you want to do other things with your bin file,
        like appending/merging other files.
        """
        bin_file = indexed_dataset.data_file_path(output_prefix)
        ds = indexed_dataset.make_builder(
            bin_file,
            impl=dataset_impl,
            vocab_size=vocab_size,
        )
        summary = BinarizeSummary()

        with Chunker(
            PathManager.get_local_path(filename), offset_start, offset_end
        ) as line_iterator:
            for line in line_iterator:
                ds.add_item(binarizer.binarize_line(line, summary))

        return ds, summary
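
Because the builder is returned unfinalized, a caller still has to write the index (and, if the file was split, merge the per-chunk .bin files first). A minimal sketch of that follow-up step, assuming fairseq's indexed_dataset builder API (merge_file_ / finalize / index_file_path); the chunk prefixes are hypothetical:

from fairseq.data import indexed_dataset

def merge_and_finalize(output_prefix, chunk_prefixes, dataset_impl, vocab_size=None):
    # Create the final .bin builder, fold every per-chunk file into it,
    # then write the .idx index (the step _binarize_file_chunk skips).
    builder = indexed_dataset.make_builder(
        indexed_dataset.data_file_path(output_prefix),
        impl=dataset_impl,
        vocab_size=vocab_size,
    )
    for prefix in chunk_prefixes:  # e.g. hypothetical "train.0", "train.1" prefixes
        builder.merge_file_(prefix)
    builder.finalize(indexed_dataset.index_file_path(output_prefix))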
Example #2
    def _add_file_to_dictionary_single_worker(
        filename,
        tokenize,
        eos_word,
        start_offset,
        end_offset,
    ):
        counter = Counter()
        with Chunker(filename, start_offset, end_offset) as line_iterator:
            for line in line_iterator:
                for word in tokenize(line):
                    counter.update([word])
                counter.update([eos_word])
        return counter
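
This worker only counts words in one byte range of the file; the ranges come from find_offsets, and the per-worker Counters are merged afterwards. A minimal sketch of such a driver, assuming the helper above is importable by the worker processes (in fairseq it is a staticmethod) and using fairseq's tokenize_line for illustration:

from collections import Counter
from multiprocessing import Pool

from fairseq.file_chunker_utils import find_offsets
from fairseq.tokenizer import tokenize_line

def count_words_parallel(filename, num_workers=4, eos_word="</s>"):
    # find_offsets returns num_workers + 1 byte offsets; consecutive pairs
    # delimit the chunk each worker reads.
    offsets = find_offsets(filename, num_workers)
    with Pool(num_workers) as pool:
        partials = pool.starmap(
            _add_file_to_dictionary_single_worker,
            [
                (filename, tokenize_line, eos_word, start, end)
                for start, end in zip(offsets, offsets[1:])
            ],
        )
    # Merge the per-chunk counts into one frequency table.
    total = Counter()
    for partial in partials:
        total.update(partial)
    return total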
Example #3
    def test_readchunks(self):
        from fairseq.file_chunker_utils import Chunker, find_offsets

        offsets = find_offsets(self._tmpfile, self._num_splits)
        for start, end in zip(offsets, offsets[1:]):
            with Chunker(self._tmpfile, start, end) as lines:
                all_lines = list(lines)
                num_lines = self._num_lines / self._num_splits
                self.assertAlmostEqual(
                    len(all_lines), num_lines, delta=1
                )  # because we split on bytes, we might end up with one more/less line in a chunk
                self.assertListEqual(
                    all_lines,
                    [self._line_content for _ in range(len(all_lines))])
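
A small self-contained variant of the same check, useful for seeing what Chunker and find_offsets do outside the test class; the file contents and chunk count here are made up for illustration:

import tempfile

from fairseq.file_chunker_utils import Chunker, find_offsets

# Write a throwaway file with a known number of identical lines.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    for _ in range(100):
        f.write("hello world\n")
    path = f.name

# Split the file into 4 byte ranges and read each one back.
offsets = find_offsets(path, 4)
for start, end in zip(offsets, offsets[1:]):
    with Chunker(path, start, end) as lines:
        chunk = list(lines)
        # Each chunk holds roughly 100 / 4 lines; the count can be off by
        # one because the split points are byte offsets, not line breaks.
        print(start, end, len(chunk))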
Example #4
File: binarizer.py  Project: sdadas/fairseq
    def binarize_alignments(filename,
                            alignment_parser,
                            consumer,
                            offset=0,
                            end=-1) -> Dict[str, int]:
        nseq = 0

        with Chunker(PathManager.get_local_path(filename), offset,
                     end) as line_iterator:
            for line in line_iterator:
                ids = alignment_parser(line)
                nseq += 1
                consumer(ids)
        return {"nseq": nseq}
Example #5
File: binarizer.py  Project: sdadas/fairseq
    def binarize(
        filename,
        dict,
        consumer,
        tokenize=tokenize_line,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        already_numberized=False,
    ) -> Dict[str, int]:
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with Chunker(PathManager.get_local_path(filename), offset,
                     end) as line_iterator:
            for line in line_iterator:
                if already_numberized:
                    id_strings = line.strip().split()
                    id_list = [int(id_string) for id_string in id_strings]
                    if reverse_order:
                        id_list.reverse()
                    if append_eos:
                        id_list.append(dict.eos())
                    ids = torch.IntTensor(id_list)
                else:
                    ids = dict.encode_line(
                        line=line,
                        line_tokenizer=tokenize,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                    )
                nseq += 1
                ntok += len(ids)
                consumer(ids)
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }
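
In preprocessing, the offset/end pair typically comes from find_offsets so several workers can binarize disjoint slices of one corpus. A minimal single-process sketch, assuming the staticmethod form shown above, with a hypothetical vocabulary file and corpus path and a consumer that just collects the tensors:

from fairseq.data import Dictionary
from fairseq.file_chunker_utils import find_offsets

vocab = Dictionary.load("dict.en.txt")  # hypothetical vocabulary file
corpus = "train.en"                     # hypothetical corpus file

tensors = []
offsets = find_offsets(corpus, 2)       # one (offset, end) pair per worker
for start, end in zip(offsets, offsets[1:]):
    stats = Binarizer.binarize(
        corpus,
        vocab,
        tensors.append,   # consumer receives one IntTensor per line
        offset=start,
        end=end,
    )
    print(stats["nseq"], "sequences,", stats["ntok"], "tokens,", stats["nunk"], "unknowns")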
Example #6
File: binarizer.py  Project: tma15/fairseq
    def _consume_file(
        filename: str,
        binarizer: Binarizer,
        consumer: tp.Callable[[torch.IntTensor], None],
        offset_start: int,
        offset_end: int,
    ) -> tp.Dict[str, int]:
        summary = BinarizeSummary()

        with Chunker(
            PathManager.get_local_path(filename), offset_start, offset_end
        ) as line_iterator:
            for line in line_iterator:
                consumer(binarizer.binarize_line(line, summary))

        return {
            "nseq": summary.num_seq,
            "nunk": summary.num_replaced,
            "ntok": summary.num_tok,
            "replaced": summary.replaced,
        }
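
All _consume_file needs from its binarizer argument is the binarize_line hook. A toy sketch that calls the helper directly, with a hypothetical binarizer that encodes each whitespace token as its character length and a consumer that just collects tensors (purely illustrative; fairseq ships real vocabulary-based binarizers):

import typing as tp

import torch

from fairseq.binarizer import Binarizer, BinarizeSummary

class ToyBinarizer(Binarizer):
    # Hypothetical binarizer: one integer per token, its character length.
    def binarize_line(self, line: str, summary: BinarizeSummary) -> torch.IntTensor:
        ids = [len(tok) for tok in line.split()]
        summary.num_seq += 1
        summary.num_tok += len(ids)
        return torch.IntTensor(ids)

collected: tp.List[torch.IntTensor] = []
stats = _consume_file(
    "train.txt",          # hypothetical input file
    ToyBinarizer(),
    collected.append,     # consumer receives one tensor per line
    offset_start=0,
    offset_end=-1,
)
print(stats)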