Example #1
File: binarizer.py  Project: tma15/fairseq
    def _binarize_file_chunk(
        binarizer: Binarizer,
        filename: str,
        offset_start: int,
        offset_end: int,
        output_prefix: str,
        dataset_impl: str,
        vocab_size=None,
    ) -> tp.Tuple[tp.Any, BinarizeSummary]:  # (dataset builder, BinarizeSummary)
        """
        Creates a dataset builder and appends binarized items to it. This function does not
        finalize the builder, which is useful if you want to do other things with your bin file,
        like appending/merging other files.
        """
        bin_file = indexed_dataset.data_file_path(output_prefix)
        ds = indexed_dataset.make_builder(
            bin_file,
            impl=dataset_impl,
            vocab_size=vocab_size,
        )
        summary = BinarizeSummary()

        with Chunker(
            PathManager.get_local_path(filename), offset_start, offset_end
        ) as line_iterator:
            for line in line_iterator:
                ds.add_item(binarizer.binarize_line(line, summary))

        return ds, summary
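
Because the builder is returned unfinalized, a caller still has to write the index (and, if the file was split, merge the per-chunk .bin files first). A minimal sketch of that follow-up step, assuming fairseq's indexed_dataset builder API (merge_file_ / finalize / index_file_path); the chunk prefixes are hypothetical:

from fairseq.data import indexed_dataset

def merge_and_finalize(output_prefix, chunk_prefixes, dataset_impl, vocab_size=None):
    # Create the final .bin builder, fold every per-chunk file into it,
    # then write the .idx index (the step _binarize_file_chunk skips).
    builder = indexed_dataset.make_builder(
        indexed_dataset.data_file_path(output_prefix),
        impl=dataset_impl,
        vocab_size=vocab_size,
    )
    for prefix in chunk_prefixes:  # e.g. hypothetical "train.0", "train.1" prefixes
        builder.merge_file_(prefix)
    builder.finalize(indexed_dataset.index_file_path(output_prefix))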
Example #2
    def _add_file_to_dictionary_single_worker(
        filename,
        tokenize,
        eos_word,
        start_offset,
        end_offset,
    ):
        counter = Counter()
        with Chunker(filename, start_offset, end_offset) as line_iterator:
            for line in line_iterator:
                for word in tokenize(line):
                    counter.update([word])
                counter.update([eos_word])
        return counter
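
This worker only counts words in one byte range of the file; the ranges come from find_offsets, and the per-worker Counters are merged afterwards. A minimal sketch of such a driver, assuming the helper above is importable by the worker processes (in fairseq it is a staticmethod) and using fairseq's tokenize_line for illustration:

from collections import Counter
from multiprocessing import Pool

from fairseq.file_chunker_utils import find_offsets
from fairseq.tokenizer import tokenize_line

def count_words_parallel(filename, num_workers=4, eos_word="</s>"):
    # find_offsets returns num_workers + 1 byte offsets; consecutive pairs
    # delimit the chunk each worker reads.
    offsets = find_offsets(filename, num_workers)
    with Pool(num_workers) as pool:
        partials = pool.starmap(
            _add_file_to_dictionary_single_worker,
            [
                (filename, tokenize_line, eos_word, start, end)
                for start, end in zip(offsets, offsets[1:])
            ],
        )
    # Merge the per-chunk counts into one frequency table.
    total = Counter()
    for partial in partials:
        total.update(partial)
    return total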
Example #3
    def test_readchunks(self):
        from fairseq.file_chunker_utils import Chunker, find_offsets

        offsets = find_offsets(self._tmpfile, self._num_splits)
        for start, end in zip(offsets, offsets[1:]):
            with Chunker(self._tmpfile, start, end) as lines:
                all_lines = list(lines)
                num_lines = self._num_lines / self._num_splits
                self.assertAlmostEqual(
                    len(all_lines), num_lines, delta=1
                )  # because we split on bytes, we might end up with one more/less line in a chunk
                self.assertListEqual(
                    all_lines,
                    [self._line_content for _ in range(len(all_lines))])
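
A small self-contained variant of the same check, useful for seeing what Chunker and find_offsets do outside the test class; the file contents and chunk count here are made up for illustration:

import tempfile

from fairseq.file_chunker_utils import Chunker, find_offsets

# Write a throwaway file with a known number of identical lines.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    for _ in range(100):
        f.write("hello world\n")
    path = f.name

# Split the file into 4 byte ranges and read each one back.
offsets = find_offsets(path, 4)
for start, end in zip(offsets, offsets[1:]):
    with Chunker(path, start, end) as lines:
        chunk = list(lines)
        # Each chunk holds roughly 100 / 4 lines; the count can be off by
        # one because the split points are byte offsets, not line breaks.
        print(start, end, len(chunk))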
Example #4
File: binarizer.py  Project: sdadas/fairseq
    def binarize_alignments(filename,
                            alignment_parser,
                            consumer,
                            offset=0,
                            end=-1) -> Dict[str, int]:
        nseq = 0

        with Chunker(PathManager.get_local_path(filename), offset,
                     end) as line_iterator:
            for line in line_iterator:
                ids = alignment_parser(line)
                nseq += 1
                consumer(ids)
        return {"nseq": nseq}
Example #5
File: binarizer.py  Project: sdadas/fairseq
    def binarize(
        filename,
        dict,
        consumer,
        tokenize=tokenize_line,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        already_numberized=False,
    ) -> Dict[str, int]:
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with Chunker(PathManager.get_local_path(filename), offset,
                     end) as line_iterator:
            for line in line_iterator:
                if already_numberized:
                    id_strings = line.strip().split()
                    id_list = [int(id_string) for id_string in id_strings]
                    if reverse_order:
                        id_list.reverse()
                    if append_eos:
                        id_list.append(dict.eos())
                    ids = torch.IntTensor(id_list)
                else:
                    ids = dict.encode_line(
                        line=line,
                        line_tokenizer=tokenize,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                    )
                nseq += 1
                ntok += len(ids)
                consumer(ids)
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }
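
In preprocessing, the offset/end pair typically comes from find_offsets so several workers can binarize disjoint slices of one corpus. A minimal single-process sketch, assuming the staticmethod form shown above, with a hypothetical vocabulary file and corpus path and a consumer that just collects the tensors:

from fairseq.data import Dictionary
from fairseq.file_chunker_utils import find_offsets

vocab = Dictionary.load("dict.en.txt")  # hypothetical vocabulary file
corpus = "train.en"                     # hypothetical corpus file

tensors = []
offsets = find_offsets(corpus, 2)       # one (offset, end) pair per worker
for start, end in zip(offsets, offsets[1:]):
    stats = Binarizer.binarize(
        corpus,
        vocab,
        tensors.append,   # consumer receives one IntTensor per line
        offset=start,
        end=end,
    )
    print(stats["nseq"], "sequences,", stats["ntok"], "tokens,", stats["nunk"], "unknowns")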
Example #6
File: binarizer.py  Project: tma15/fairseq
    def _consume_file(
        filename: str,
        binarizer: Binarizer,
        consumer: tp.Callable[[torch.IntTensor], None],
        offset_start: int,
        offset_end: int,
    ) -> tp.Dict[str, int]:
        summary = BinarizeSummary()

        with Chunker(
            PathManager.get_local_path(filename), offset_start, offset_end
        ) as line_iterator:
            for line in line_iterator:
                consumer(binarizer.binarize_line(line, summary))

        return {
            "nseq": summary.num_seq,
            "nunk": summary.num_replaced,
            "ntok": summary.num_tok,
            "replaced": summary.replaced,
        }
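
All _consume_file needs from its binarizer argument is the binarize_line hook. A toy sketch that calls the helper directly, with a hypothetical binarizer that encodes each whitespace token as its character length and a consumer that just collects tensors (purely illustrative; fairseq ships real vocabulary-based binarizers):

import typing as tp

import torch

from fairseq.binarizer import Binarizer, BinarizeSummary

class ToyBinarizer(Binarizer):
    # Hypothetical binarizer: one integer per token, its character length.
    def binarize_line(self, line: str, summary: BinarizeSummary) -> torch.IntTensor:
        ids = [len(tok) for tok in line.split()]
        summary.num_seq += 1
        summary.num_tok += len(ids)
        return torch.IntTensor(ids)

collected: tp.List[torch.IntTensor] = []
stats = _consume_file(
    "train.txt",          # hypothetical input file
    ToyBinarizer(),
    collected.append,     # consumer receives one tensor per line
    offset_start=0,
    offset_end=-1,
)
print(stats)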