Example #1
0
 def _add_file_to_dictionary_single_worker(
     filename, tokenize, eos_word, worker_id=0, num_workers=1
 ):
     counter = Counter()
     with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
         size = os.fstat(f.fileno()).st_size
         chunk_size = size // num_workers
         offset = worker_id * chunk_size
         end = offset + chunk_size
         f.seek(offset)
         if offset > 0:
             safe_readline(f)  # drop first incomplete line
         line = f.readline()
         while line:
             for word in tokenize(line):
                 counter.update([word])
             counter.update([eos_word])
             # f.tell() returns an opaque number that can be passed back
             # to f.seek() to return to this position, but it does not
             # necessarily correspond to a byte offset in the file.
             # In practice, however, f.tell() is faithful to the byte
             # position _most of the time_, so we additionally check it
             # against the file size to prevent an early exit.
             if f.tell() > end and f.tell() < size:
                 break
             line = f.readline()
     return counter
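For context, a single-worker counter like this is typically fanned out over several processes and the per-worker counts merged afterwards. Below is a minimal sketch of such a driver, assuming the worker above is importable at module level and that tokenize is picklable; the helper name build_counter_parallel is illustrative, not fairseq's API.

    from collections import Counter
    from multiprocessing import Pool

    def build_counter_parallel(filename, tokenize, eos_word, num_workers=4):
        # each worker counts its own byte chunk of the file; results are summed
        with Pool(num_workers) as pool:
            async_results = [
                pool.apply_async(
                    _add_file_to_dictionary_single_worker,
                    (filename, tokenize, eos_word, worker_id, num_workers),
                )
                for worker_id in range(num_workers)
            ]
            merged = Counter()
            for r in async_results:
                merged.update(r.get())
        return merged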
Example #2
    def _add_file_to_dictionary_single_worker(filename,
                                              tokenize,
                                              eos_word,
                                              worker_id=0,
                                              num_workers=1,
                                              L=None):
        # This variant is modified to tokenize only the first L characters of
        # each line (line[:L] slices characters, not whole words).

        counter = Counter()
        with open(PathManager.get_local_path(filename), "r",
                  encoding="utf-8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = size // num_workers
            offset = worker_id * chunk_size
            end = offset + chunk_size
            f.seek(offset)
            if offset > 0:
                safe_readline(f)  # drop first incomplete line
            line = f.readline()
            while line:
                for word in tokenize(line[:L]):
                    counter.update([word])
                counter.update([eos_word])
                if f.tell() > end:
                    break
                line = f.readline()
        return counter
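Note that line[:L] above is a character-level prefix rather than a word count; a quick illustration with made-up values:

    line = "the quick brown fox\n"
    L = 9
    print(repr(line[:L]))        # 'the quick' -- the first 9 characters
    print(line[:L].split())      # ['the', 'quick'] -- tokens surviving the prefix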
Example #3
0
    def _add_file_to_dictionary_single_worker(filename,
                                              tokenize,
                                              eos_word,
                                              worker_id=0,
                                              num_workers=1):
        # this implementation is disabled; everything below the raise is unreachable
        raise NotImplementedError

        counter = Counter()
        with open(filename, 'r', encoding='utf-8') as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = size // num_workers
            offset = worker_id * chunk_size
            end = offset + chunk_size
            f.seek(offset)
            if offset > 0:
                safe_readline(f)  # drop first incomplete line
            line = f.readline()
            while line:
                for word in tokenize(line):
                    counter.update([word])
                counter.update([eos_word])
                if f.tell() > end:
                    break
                line = f.readline()
        return counter
Example #4
0
def reduce_words(filename, save_path, drop_filename):
    # drop_filename lists the 0-based line numbers to remove, one per line
    with open(drop_filename, 'r', encoding='utf-8') as f:
        line = safe_readline(f)
        drop_list = []
        while line:
            drop_list.append(line)
            line = f.readline()
    # every parallel file suffix is filtered with the same list of drop indices
    name_list = ['.en', '.de', '.bert.en', '.bert.de', '.bart.en', '.bart.de']
    for name in name_list:
        remove_lines(filename + name, save_path, drop_list)
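remove_lines (Example #6 below) applies int() to each raw drop_list entry, so the drop file is expected to hold one 0-based line index per line, in ascending order. A hypothetical helper that writes such a file:

    def write_drop_file(drop_indices, drop_filename):
        # drop_indices: iterable of 0-based line numbers to remove; one index per
        # line, sorted ascending so remove_lines can walk the list front to back
        with open(drop_filename, 'w', encoding='utf-8') as f:
            for idx in sorted(set(drop_indices)):
                f.write('{}\n'.format(idx))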
Example #5
0
 def _add_file_to_dictionary_single_worker(filename, tokenize, eos_word, worker_id=0, num_workers=1):
     counter = Counter()
     char_n = 4
     with open(filename, 'r', encoding='utf-8') as f:
         size = os.fstat(f.fileno()).st_size
         chunk_size = size // num_workers
         offset = worker_id * chunk_size
         end = offset + chunk_size
         f.seek(offset)
         if offset > 0:
             safe_readline(f)  # drop first incomplete line
         line = f.readline()
         while line:
             for word in tokenize(line):
                 #counter.update([word])
                 # count every character n-gram of length 1..char_n
                 # instead of the whole word
                 for i in range(len(word)):
                     for j in range(i+1, min(i+char_n, len(word))+1):
                         char = word[i:j]
                         counter.update([char])
             counter.update([eos_word])
             if f.tell() > end:
                 break
             line = f.readline()
     return counter
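The nested loops above emit every substring of a word with length between 1 and char_n. For example, with char_n = 4:

    word = "hello"
    char_n = 4
    ngrams = [word[i:j]
              for i in range(len(word))
              for j in range(i + 1, min(i + char_n, len(word)) + 1)]
    # ['h', 'he', 'hel', 'hell', 'e', 'el', 'ell', 'ello',
    #  'l', 'll', 'llo', 'l', 'lo', 'o']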
Example #6
0
def remove_lines(filename, save_path, drop_list):
    with open(filename, 'r', encoding='utf-8') as f:
        line = safe_readline(f)
        reduce_line = []
        num = 0
        j = 0
        print(len(drop_list))
        while line:
            # drop this line only when its number matches the next entry in drop_list
            if j < len(drop_list) and num == int(drop_list[j]):
                j += 1
            else:
                reduce_line.append(line)
            num += 1
            if num % 100000 == 0:
                print(num)
            line = f.readline()
    filename_reduce = save_path + filename.split('/')[-1]
    with open(filename_reduce, 'w', encoding='utf-8') as f:
        for line in reduce_line:
            f.write(line)
Example #7
0
def remove_lines(filename, drop_list):
    with open(filename, 'r', encoding='utf-8') as f:
        line = safe_readline(f)
        reduce_line = []
        num = 0
        j = 0
        print(len(drop_list))
        while line:
            #import pdb; pdb.set_trace()
            if j < len(drop_list) and num == int(drop_list[j].split(',')[0]):
                j += 1
            else:
                reduce_line.append(line)
            num += 1
            if num % 100000 == 0:
                print(num)
            line = f.readline()
    # filename_reduce = save_path + filename.split('/')[-1]
    print('filter j is {}'.format(j))
    save_path = filename + '.filter'
    with open(save_path, 'w', encoding='utf-8') as f:
        for line in reduce_line:
            f.write(line)
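This variant treats each drop_list entry as a comma-separated record and compares only the first field, so entries can carry extra information such as a reason. A hypothetical call with made-up file names:

    # entries look like "line_index,reason"; only the index before the first
    # comma is compared against the running line number
    with open('drop.csv', 'r', encoding='utf-8') as f:     # assumed drop file
        drop_list = f.readlines()                          # e.g. ["3,too long\n", "17,dup\n"]
    remove_lines('train.en', drop_list)                    # writes train.en.filter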
def str_to_bin(filename,
               dict,
               consumer,
               char_vocab,
               append_eos=True,
               reverse_order=False,
               offset=0,
               end=-1):
    nseq, ntok = 0, 0
    replaced = Counter()

    def replaced_consumer(word, idx):
        if idx == dict.unk_index and word != dict.unk_word:
            replaced.update([word])

    def collate_char_tokens(values,
                            pad_idx,
                            sentence_length,
                            word_max_length,
                            eos_idx=None,
                            left_pad=False,
                            move_eos_to_beginning=False):
        """Convert a list of 1d tensors into a padded 2d tensor."""
        size = word_max_length
        res = values[0][0].new(len(values), sentence_length,
                               size).fill_(pad_idx)

        def copy_tensor(src, dst):
            assert dst.numel() == src.numel(), "{} != {}".format(
                dst.numel(), src.numel())
            if move_eos_to_beginning:
                assert src[-1] == eos_idx
                dst[0] = eos_idx
                dst[1:] = src[:-1]
            else:
                dst.copy_(src)

        for i, line in enumerate(values):
            for j, v in enumerate(line):
                if len(v) > word_max_length:
                    v = v[-word_max_length:]
                dst_word = res[i][sentence_length - len(line) + j]
                copy_tensor(v, dst_word[size - len(v):] if left_pad
                            else dst_word[:len(v)])
        return res

    with open(filename, 'r', encoding='utf-8') as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            ids = dict.encode_line(
                line=line,
                line_tokenizer=tokenize_line_word,
                add_if_not_exist=False,
                consumer=replaced_consumer,
                append_eos=append_eos,
                reverse_order=reverse_order,
            )
            if char_vocab is not None:
                # encode each word (plus an extra '' covering the sentence-final
                # eos position) as a sequence of character ids
                ids_char = [
                    char_vocab.encode_line(
                        line=word,
                        line_tokenizer=tokenize_line_char,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                    ) for word in line.split() + ['']
                ]

                ids_char_pad = collate_char_tokens(values=[ids_char],
                                                   pad_idx=char_vocab.pad(),
                                                   word_max_length=15,
                                                   sentence_length=len(ids),
                                                   eos_idx=char_vocab.eos(),
                                                   left_pad=True,
                                                   move_eos_to_beginning=False)
                ids = torch.cat([ids, ids_char_pad.view(-1)])
            nseq += 1
            ntok += len(ids)
            consumer(ids)
            line = f.readline()
    return {
        'nseq': nseq,
        'nunk': sum(replaced.values()),
        'ntok': ntok,
        'replaced': replaced
    }
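Because collate_char_tokens is nested inside str_to_bin it cannot be exercised on its own; as a standalone illustration of the layout it produces, here is the left-padding case reduced to its core, with dummy ids and no eos handling:

    import torch

    def pad_chars(values, pad_idx, sentence_length, word_max_length):
        # same layout as collate_char_tokens with left_pad=True: a tensor of shape
        # (batch, sentence_length, word_max_length); each word's character ids are
        # right-aligned, and over-long words keep only their last characters
        res = values[0][0].new(len(values), sentence_length, word_max_length).fill_(pad_idx)
        for i, line in enumerate(values):
            for j, v in enumerate(line):
                v = v[-word_max_length:]
                res[i][sentence_length - len(line) + j][word_max_length - len(v):].copy_(v)
        return res

    batch = [[torch.tensor([7, 8, 9]), torch.tensor([4])]]
    print(pad_chars(batch, pad_idx=0, sentence_length=2, word_max_length=5)[0])
    # tensor([[0, 0, 7, 8, 9],
    #         [0, 0, 0, 0, 4]])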