def count_file(filename, tokenizer, worker_id=0, num_workers=1):
    """Count token frequencies in the byte range of `filename` assigned to this worker."""
    counter = Counter()

    with open(filename, 'r', encoding='utf-8') as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size

        f.seek(offset)
        if offset > 0:
            safe_readline(f)  # drop the first (possibly incomplete) line

        line = f.readline()
        count = 0
        while line:
            tokenized_words = tokenizer.tokenize(line)
            counter.update(tokenized_words)
            if f.tell() > end:
                break
            line = f.readline()
            count += 1
            if count % 100000 == 0:
                print("[INFO] Thread %d processed %d lines." % (worker_id, count))

    return counter

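# Example (sketch, not part of the original module): count_file can be run in
# several processes and the per-worker Counters merged afterwards. `tokenizer`
# is assumed to be picklable and to expose tokenize(str) -> list of tokens.
def count_file_parallel_example(filename, tokenizer, num_workers=4):
    from collections import Counter
    from multiprocessing import Pool

    with Pool(num_workers) as pool:
        partial_counters = pool.starmap(
            count_file,
            [(filename, tokenizer, worker_id, num_workers)
             for worker_id in range(num_workers)])

    merged = Counter()
    for partial in partial_counters:
        merged.update(partial)  # Counter.update adds counts, it does not overwrite
    return merged
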
def find_offsets(filename, num_chunks):
    """
    :param filename: string
    :param num_chunks: int
    :return: a list of num_chunks + 1 byte offsets delimiting the chunks;
             chunk i spans [offsets[i], offsets[i + 1]), and the final entry
             stays 0, which downstream readers treat as "read to end of file"
    """
    with open(filename, 'r', encoding='utf-8') as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0 for _ in range(num_chunks + 1)]
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            safe_readline(f)  # advance to the next line boundary
            offsets[i] = f.tell()
        return offsets

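# Example (sketch): reading the lines of chunk `i` given the offsets returned
# by find_offsets. Chunk i is the byte range [offsets[i], offsets[i + 1]); the
# last offset stays 0, so the `0 < end` test below falls through and the final
# chunk is read until end of file.
def read_chunk_example(filename, offsets, i):
    start, end = offsets[i], offsets[i + 1]
    lines = []
    with open(filename, 'r', encoding='utf-8') as f:
        f.seek(start)
        line = safe_readline(f)
        while line:
            if 0 < end < f.tell():
                break
            lines.append(line)
            line = f.readline()
    return lines
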
def distribute_to_tempfiles(srcfile, n):
    """Split `srcfile` into n temporary files of roughly equal byte size, on line boundaries."""
    tmpfiles = [
        tempfile.NamedTemporaryFile('w', encoding='utf-8') for _ in range(n)
    ]
    offsets = find_offsets(srcfile, n)
    lines_per_tf = list()

    with open(srcfile, 'r', encoding='utf-8') as f:
        all_lines = sum(1 for _ in f)

    for i, tf in enumerate(tmpfiles):
        n_lines = 0
        start, end = offsets[i], offsets[i + 1]
        with open(srcfile, 'r', encoding='utf-8') as f:
            f.seek(start)
            line = safe_readline(f)
            while line:
                if 0 < end < f.tell():
                    break
                tf.write(line)
                n_lines += 1
                line = f.readline()
        tf.flush()
        lines_per_tf.append(n_lines)

    print("Lines per temp file to be translated: ", lines_per_tf)
    assert sum(lines_per_tf) == all_lines

    return tmpfiles, lines_per_tf

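# Example (sketch): splitting a source file into n temporary shards. The
# NamedTemporaryFile handles are returned open and are deleted when closed, so
# the caller should close them only after the shards have been consumed.
def distribute_example(srcfile, n=4):
    tmpfiles, lines_per_tf = distribute_to_tempfiles(srcfile, n)
    try:
        for tf, n_lines in zip(tmpfiles, lines_per_tf):
            print("shard %s holds %d lines" % (tf.name, n_lines))
    finally:
        for tf in tmpfiles:
            tf.close()  # closing also removes the temporary file
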
def binarize_file_single_thread(filename, ark_loader, offset=0, end=-1, worker_id=0,
                                input_format='scp', output_format='raw',
                                prev_context=0, concat=4, stride=1, fp16=False):
    """Read the Kaldi features listed in one chunk of an scp file and binarize them."""
    # if output_format is scp, we only read the length for sorting
    if output_format == 'scp':
        assert input_format in ['kaldi', 'scp']

    result = dict()
    data = list()
    lengths = list()
    index = 0

    with open(filename, 'r', encoding='utf-8') as f:
        f.seek(offset)
        line = safe_readline(f)

        while line:
            if 0 < end < f.tell():
                break

            parts = line.split()
            key, path = parts[0], parts[1]

            # read the numpy array for this utterance from the ark file
            feature_vector = ark_loader.load_mat(path)

            if stride == 1:
                feature_vector = torch.from_numpy(feature_vector)
            else:
                # subsample frames with the given stride
                feature_vector = torch.from_numpy(feature_vector[0::stride])

            if concat > 1:
                # zero-pad so the number of frames is divisible by `concat`,
                # then stack every `concat` consecutive frames into one vector
                add = (concat - feature_vector.size()[0] % concat) % concat
                z = torch.FloatTensor(add, feature_vector.size()[1]).zero_()
                feature_vector = torch.cat((feature_vector, z), 0)
                feature_vector = feature_vector.reshape(
                    (int(feature_vector.size()[0] / concat),
                     feature_vector.size()[1] * concat))

            if prev_context > 0:
                print("Multiple ASR context isn't supported at the moment.")
                raise NotImplementedError

            if fp16:
                feature_vector = feature_vector.half()

            if output_format not in ['scp', 'scpmem']:
                # convert to numpy for serialization across processes
                data.append(feature_vector.numpy())
            else:
                # only keep the scp path; the features stay in the ark file
                data.append(path)

            lengths.append(feature_vector.size(0))

            line = f.readline()

            if (index + 1) % 100000 == 0:
                print("[INFO] Thread %d processed %d audio utterances." % (worker_id, index + 1))
            index = index + 1

    result['data'] = data
    result['sizes'] = lengths
    result['id'] = worker_id
    result['total'] = len(lengths)

    return result

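# Example (sketch): merging the per-worker result dicts produced by
# binarize_file_single_thread back into a single data/sizes pair. Sorting by
# the 'id' field keeps the utterances in their original file order, because
# worker i was assigned the i-th byte range of the input.
def merge_binarized_results_example(worker_results):
    merged_data, merged_sizes = [], []
    for result in sorted(worker_results, key=lambda r: r['id']):
        merged_data.extend(result['data'])
        merged_sizes.extend(result['sizes'])
    return merged_data, merged_sizes
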
def binarize_file_single_thread(filename, tokenizer, vocab, worker_id=0,
                                bos_word=None, eos_word=None, offset=0, end=-1,
                                data_type='int64', verbose=False):
    """
    Read the lines in [offset, end) of `filename`, tokenize them and convert
    each sentence to a tensor of vocabulary indices.
    """
    result = dict()
    unk_word = onmt.constants.UNK_WORD

    data = list()
    sizes = list()
    count = 0

    with open(filename, 'r', encoding='utf-8') as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = safe_readline(f)

        while line:
            if 0 < end < f.tell():
                break

            tokenized_sent = tokenizer.tokenize(line)
            binarized_line = vocab.convertToIdx(tokenized_sent, unk_word,
                                                bos_word=bos_word, eos_word=eos_word,
                                                type=data_type)

            # conversion to numpy is necessary because torch.Tensor is not
            # serializable by multiprocessing; numpy arrays can be moved to
            # shared memory and transferred between processes
            data += [binarized_line.numpy()]
            sizes += [len(tokenized_sent)]

            line = f.readline()
            count += 1
            if count % 100000 == 0 and verbose:
                print("[INFO] Thread %d processed %d lines." % (worker_id, count))

    if verbose:
        print("[INFO] Thread %d Done." % worker_id)

    result['data'] = data
    result['sizes'] = sizes
    result['id'] = worker_id
    result['total'] = len(sizes)

    return result

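# Example (sketch): binarizing a text file with several processes by pairing
# find_offsets with the text binarizer above, then merging the results in
# worker order. `tokenizer` and `vocab` are assumed to be picklable; `vocab`
# needs the convertToIdx method used above.
def binarize_parallel_example(filename, tokenizer, vocab, num_workers=4):
    from multiprocessing import Pool

    offsets = find_offsets(filename, num_workers)
    jobs = [(filename, tokenizer, vocab, worker_id, None, None,
             offsets[worker_id], offsets[worker_id + 1])
            for worker_id in range(num_workers)]

    with Pool(num_workers) as pool:
        results = pool.starmap(binarize_file_single_thread, jobs)

    data, sizes = [], []
    for result in sorted(results, key=lambda r: r['id']):
        data.extend(result['data'])
        sizes.extend(result['sizes'])
    return data, sizes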