def _add_file_to_dictionary_single_worker(
    filename, tokenize, eos_word, worker_id=0, num_workers=1
):
    counter = Counter()
    with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            safe_readline(f)  # drop first incomplete line
        line = f.readline()
        while line:
            for word in tokenize(line):
                counter.update([word])
            counter.update([eos_word])
            # f.tell() returns only an opaque number which can
            # return to the position in the file via f.seek()
            # and does not necessarily represent a byte position
            # in the file. However, f.tell() is faithful to the
            # byte position _most of the time_. Thus we can just
            # check against the file size to prevent early exit.
            if f.tell() > end and f.tell() < size:
                break
            line = f.readline()
    return counter
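# Usage sketch (not part of the original code): the worker above processes one chunk of
# the input file, and its per-chunk Counter is merged with the others afterwards. A
# minimal driver under that assumption might look like the following; the helper name
# `count_file_words` is hypothetical.
from collections import Counter
from multiprocessing import Pool

def count_file_words(filename, tokenize, eos_word, num_workers=4):
    if num_workers > 1:
        with Pool(processes=num_workers) as pool:
            async_results = [
                pool.apply_async(
                    _add_file_to_dictionary_single_worker,
                    (filename, tokenize, eos_word, worker_id, num_workers),
                )
                for worker_id in range(num_workers)
            ]
            counters = [r.get() for r in async_results]
    else:
        counters = [_add_file_to_dictionary_single_worker(filename, tokenize, eos_word)]
    merged = Counter()
    for c in counters:
        merged.update(c)  # Counter update sums per-word counts across chunks
    return merged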
def _add_file_to_dictionary_single_worker(
    filename, tokenize, eos_word, worker_id=0, num_workers=1, L=None
):
    # Modified variant: only a prefix of each line (its first L characters) is tokenized.
    counter = Counter()
    with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            safe_readline(f)  # drop first incomplete line
        line = f.readline()
        while line:
            for word in tokenize(line[:L]):
                counter.update([word])
            counter.update([eos_word])
            if f.tell() > end:
                break
            line = f.readline()
    return counter
def _add_file_to_dictionary_single_worker(filename, tokenize, eos_word, worker_id=0, num_workers=1):
    # Disabled variant: the call raises immediately; the body below is kept but unreachable.
    raise NotImplementedError
    counter = Counter()
    with open(filename, 'r', encoding='utf-8') as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            safe_readline(f)  # drop first incomplete line
        line = f.readline()
        while line:
            for word in tokenize(line):
                counter.update([word])
            counter.update([eos_word])
            if f.tell() > end:
                break
            line = f.readline()
    return counter
def reduce_words(filename, save_path, drop_filename):
    # Read the indices of the lines to drop (one index per line).
    with open(drop_filename, 'r', encoding='utf-8') as f:
        line = safe_readline(f)
        drop_list = []
        while line:
            drop_list.append(line)
            line = f.readline()
    # Filter every parallel file that shares the same path prefix.
    name_list = ['.en', '.de', '.bert.en', '.bert.de', '.bart.en', '.bart.de']
    for name in name_list:
        remove_lines(filename + name, save_path, drop_list)
def _add_file_to_dictionary_single_worker(filename, tokenize, eos_word, worker_id=0, num_workers=1):
    counter = Counter()
    char_n = 4  # maximum character n-gram length
    with open(filename, 'r', encoding='utf-8') as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            safe_readline(f)  # drop first incomplete line
        line = f.readline()
        while line:
            for word in tokenize(line):
                # counter.update([word])
                # Count every character n-gram of length 1..char_n inside the word.
                for i in range(len(word)):
                    for j in range(i + 1, min(i + char_n, len(word)) + 1):
                        char = word[i:j]
                        counter.update([char])
            counter.update([eos_word])
            if f.tell() > end:
                break
            line = f.readline()
    return counter
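# Illustration (not part of the original code): with char_n = 4 the nested loops above
# count every substring of length 1 to 4 of each token. For the token "cats" that is
# c, ca, cat, cats, a, at, ats, t, ts, s; the same enumeration reproduced standalone:
word, char_n = "cats", 4
ngrams = [
    word[i:j]
    for i in range(len(word))
    for j in range(i + 1, min(i + char_n, len(word)) + 1)
]
assert ngrams == ["c", "ca", "cat", "cats", "a", "at", "ats", "t", "ts", "s"]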
def remove_lines(filename, save_path, drop_list):
    with open(filename, 'r', encoding='utf-8') as f:
        line = safe_readline(f)
        reduce_line = []
        num = 0
        j = 0
        print(len(drop_list))
        while line:
            # Keep every line whose index is not listed in drop_list (assumed sorted).
            if j < len(drop_list) and num == int(drop_list[j]):
                j += 1
            else:
                reduce_line.append(line)
            num += 1
            if num % 100000 == 0:
                print(num)
            line = f.readline()
    filename_reduce = save_path + filename.split('/')[-1]
    with open(filename_reduce, 'w', encoding='utf-8') as f:
        for line in reduce_line:
            f.write(str(line))
def remove_lines(filename, drop_list):
    with open(filename, 'r', encoding='utf-8') as f:
        line = safe_readline(f)
        reduce_line = []
        num = 0
        j = 0
        print(len(drop_list))
        while line:
            # Keep every line whose index is not listed in drop_list
            # (each entry is "index,..." and the entries are assumed sorted).
            if j < len(drop_list) and num == int(drop_list[j].split(',')[0]):
                j += 1
            else:
                reduce_line.append(line)
            num += 1
            if num % 100000 == 0:
                print(num)
            line = f.readline()
    # filename_reduce = save_path + filename.split('/')[-1]
    print('filter j is {}'.format(j))
    save_path = filename + '.filter'
    with open(save_path, 'w', encoding='utf-8') as f:
        for line in reduce_line:
            f.write(str(line))
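# Usage sketch (hypothetical driver, not in the original code): drop_list entries are
# assumed to be ascending, 0-based line indices, optionally followed by extra
# comma-separated fields such as "17,too_long". The filtered copy is written to
# `<filename>.filter` by remove_lines above.
def filter_file(filename, drop_filename):
    with open(drop_filename, 'r', encoding='utf-8') as f:
        drop_list = f.readlines()
    remove_lines(filename, drop_list)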
def str_to_bin(filename, dict, consumer, char_vocab, append_eos=True, reverse_order=False, offset=0, end=-1):
    nseq, ntok = 0, 0
    replaced = Counter()

    def replaced_consumer(word, idx):
        # Track words that were mapped to <unk>.
        if idx == dict.unk_index and word != dict.unk_word:
            replaced.update([word])

    def collate_char_tokens(values, pad_idx, sentence_length, word_max_length,
                            eos_idx=None, left_pad=False, move_eos_to_beginning=False):
        """Pad a batch of per-word 1d char-id tensors into a
        (batch, sentence_length, word_max_length) tensor."""
        size = word_max_length
        res = values[0][0].new(len(values), sentence_length, size).fill_(pad_idx)

        def copy_tensor(src, dst):
            assert dst.numel() == src.numel(), "{} != {}".format(dst.numel(), src.numel())
            if move_eos_to_beginning:
                assert src[-1] == eos_idx
                dst[0] = eos_idx
                dst[1:] = src[:-1]
            else:
                dst.copy_(src)

        for i, line in enumerate(values):
            for j, v in enumerate(line):
                if len(v) > word_max_length:
                    v = v[-word_max_length:]  # keep only the last word_max_length char ids
                copy_tensor(
                    v,
                    res[i][sentence_length - len(line) + j][size - len(v):]
                    if left_pad
                    else res[i][sentence_length - len(line) + j][:len(v)],
                )
        return res

    with open(filename, 'r', encoding='utf-8') as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            # Word-level token ids for the sentence.
            ids = dict.encode_line(
                line=line,
                line_tokenizer=tokenize_line_word,
                add_if_not_exist=False,
                consumer=replaced_consumer,
                append_eos=append_eos,
                reverse_order=reverse_order,
            )
            if char_vocab is not None:
                # Character-level ids per word; the trailing '' aligns with the eos token.
                ids_char = [
                    char_vocab.encode_line(
                        line=word,
                        line_tokenizer=tokenize_line_char,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                    )
                    for word in line.split() + ['']
                ]
                ids_char_pad = collate_char_tokens(
                    values=[ids_char],
                    pad_idx=char_vocab.pad(),
                    word_max_length=15,
                    sentence_length=len(ids),
                    eos_idx=char_vocab.eos(),
                    left_pad=True,
                    move_eos_to_beginning=False,
                )
                ids = torch.cat([ids, ids_char_pad.view(-1)])
            nseq += 1
            ntok += len(ids)
            consumer(ids)
            line = f.readline()
    return {
        'nseq': nseq,
        'nunk': sum(replaced.values()),
        'ntok': ntok,
        'replaced': replaced,
    }
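# str_to_bin relies on two line tokenizers defined elsewhere in the module. The sketches
# below are assumptions inferred from how they are used above (whitespace-separated words
# vs. single characters); the real implementations may differ.
def tokenize_line_word(line):
    # split a sentence into whitespace-separated tokens
    return line.strip().split()

def tokenize_line_char(word):
    # split one token into its characters
    return list(word.strip())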