Example #1
    def count_file(filename, tokenizer, worker_id=0, num_workers=1):
        """Count token frequencies in this worker's byte range of the file."""
        counter = Counter()
        with open(filename, 'r', encoding='utf-8') as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = size // num_workers
            offset = worker_id * chunk_size
            end = offset + chunk_size

            f.seek(offset)

            if offset > 0:
                safe_readline(f)  # drop the partial first line; the previous worker reads it
            line = f.readline()

            count = 0

            while line:
                tokenized_words = tokenizer.tokenize(line)
                counter.update(tokenized_words)
                if f.tell() > end:
                    break
                line = f.readline()

                count += 1
                if count % 100000 == 0:
                    print("[INFO] Thread %d processed %d lines." %
                          (worker_id, count))

        return counter
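Every example on this page calls safe_readline() after an arbitrary f.seek(), but none of them defines it. Below is a minimal sketch of the usual implementation in fairseq-style preprocessing code; treat it as an assumption about what these projects use rather than their exact source:

def safe_readline(f):
    # remember where we are; a seek() may have landed in the middle of a
    # multi-byte UTF-8 character, in which case readline() raises
    pos = f.tell()
    while True:
        try:
            return f.readline()
        except UnicodeDecodeError:
            pos -= 1
            f.seek(pos)  # step back until we find where the character begins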
Example #2
 def find_offsets(filename, num_chunks):
     with open(filename, 'r', encoding='utf-8') as f:
         size = os.fstat(f.fileno()).st_size
         chunk_size = size // num_chunks
         offsets = [0 for _ in range(num_chunks + 1)]
         for i in range(1, num_chunks):
             f.seek(chunk_size * i)
             safe_readline(f)
             offsets[i] = f.tell()
         return offsets
Example #3
 def find_offsets(filename, num_chunks):
     """
     :param filename: string
     :param num_chunks: int
     :return: a list of offsets (positions to start and stop reading)
     """
     with open(filename, 'r', encoding='utf-8') as f:
         size = os.fstat(f.fileno()).st_size
         chunk_size = size // num_chunks
         offsets = [0 for _ in range(num_chunks + 1)]
         for i in range(1, num_chunks):
             f.seek(chunk_size * i)
             safe_readline(f)
             offsets[i] = f.tell()
         return offsets
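The offsets returned here pair up into per-worker (start, end) byte ranges: worker i reads from offsets[i] to offsets[i + 1]. Note that the last entry stays 0, which the readers on this page interpret as "no end, read to EOF" via their 0 < end < f.tell() check. A hypothetical driver, where process_chunk is a placeholder for a per-worker function in the style of the binarizers further down:

from multiprocessing import Pool

def process_in_parallel(filename, num_workers, process_chunk):
    # process_chunk(filename, offset, end) seeks to offset, uses
    # safe_readline() to land on a line boundary, and stops once
    # f.tell() passes end (or reads to EOF when end == 0)
    offsets = find_offsets(filename, num_workers)
    jobs = [(filename, offsets[i], offsets[i + 1]) for i in range(num_workers)]
    with Pool(num_workers) as pool:
        return pool.starmap(process_chunk, jobs)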
Example #4
def distribute_to_tempfiles(srcfile, n):
    """Split srcfile into n temporary files, one per worker, cutting only on line boundaries."""
    tmpfiles = [
        tempfile.NamedTemporaryFile('w', encoding='utf8') for _ in range(n)
    ]

    offsets = find_offsets(srcfile, n)
    lines_per_tf = list()

    # total line count, used only for the sanity check below
    with open(srcfile, 'r', encoding='utf8') as f:
        all_lines = sum(1 for _ in f)

    for i, tf in enumerate(tmpfiles):

        n_lines = 0
        start, end = offsets[i], offsets[i + 1]

        with open(srcfile, 'r', encoding='utf8') as f:
            f.seek(start)
            line = safe_readline(f)

            while line:
                if 0 < end < f.tell():
                    break

                tf.write(line)
                n_lines += 1

                line = f.readline()

        tf.flush()

        lines_per_tf.append(n_lines)

    print("Lines per tmp files to be translated: ", lines_per_tf)
    assert (sum(lines_per_tf) == all_lines)

    #     nlines = len(list(f))
    #     f.seek(0)
    #     # round up
    #     linesperpart = int((nlines + n - 1) / n)
    #     for tf in tmpfiles:
    #         for line in islice(f, linesperpart):
    #             tf.write(line)
    #         tf.flush()

    return tmpfiles, lines_per_tf
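One caveat when reusing this helper: tempfile.NamedTemporaryFile deletes the file as soon as the handle is closed (or garbage-collected), so the returned handles must stay alive while downstream workers read the shards back by name. A hypothetical caller (the file name, worker count, and launch_translation_worker are made up for illustration):

tmpfiles, lines_per_tf = distribute_to_tempfiles('corpus.src', n=4)
try:
    for tf, n_lines in zip(tmpfiles, lines_per_tf):
        # each worker re-opens its shard via the on-disk name
        launch_translation_worker(tf.name, expected_lines=n_lines)  # hypothetical
finally:
    for tf in tmpfiles:
        tf.close()  # closing removes the temporary file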
Example #5
    def binarize_file_single_thread(filename,
                                    ark_loader,
                                    offset=0,
                                    end=-1,
                                    worker_id=0,
                                    input_format='scp',
                                    output_format='raw',
                                    prev_context=0,
                                    concat=4,
                                    stride=1,
                                    fp16=False):
        # if output_format is scp, we only read the length for sorting

        if output_format == 'scp':
            assert input_format in ['kaldi', 'scp']

        # audio_data = iter(ReadHelper('scp:' + filename))
        # data_file = open(filename)
        # data_keys = list(data.keys())
        # data_paths = list(data._dict.values())

        result = dict()
        data = list()
        lengths = list()
        index = 0

        with open(filename, 'r', encoding='utf-8') as f:
            f.seek(offset)

            line = safe_readline(f)

            while line:
                if 0 < end < f.tell():
                    break

                parts = line.split()
                path = parts[1]
                key = parts[0]

                # read numpy array from the ark here
                feature_vector = ark_loader.load_mat(path)

                # feature_vector.setflags(write=True)
                if stride == 1:
                    feature_vector = torch.from_numpy(feature_vector)
                else:
                    feature_vector = torch.from_numpy(
                        feature_vector[0::stride])

                if concat > 1:
                    print('concatenating ...')
                    add = (concat - feature_vector.size()[0] % concat) % concat
                    z = torch.FloatTensor(add,
                                          feature_vector.size()[1]).zero_()
                    feature_vector = torch.cat((feature_vector, z), 0)
                    feature_vector = feature_vector.reshape(
                        (int(feature_vector.size()[0] / concat),
                         feature_vector.size()[1] * concat))

                if prev_context > 0:
                    raise NotImplementedError(
                        "Multiple ASR context isn't supported at the moment")

                    # s_prev_context.append(feature_vector)
                    # t_prev_context.append(tline)
                    # for i in range(1,prev_context+1):
                    #     if i < len(s_prev_context):
                    #         feature_vector = torch.cat((torch.cat((s_prev_context[-i-1],
                    #         torch.zeros(1,feature_vector.size()[1]))),feature_vector))
                    #         tline = t_prev_context[-i-1]+" # "+tline
                    # if len(s_prev_context) > prev_context:
                    #     s_prev_context = s_prev_context[-1*prev_context:]
                    #     t_prev_context = t_prev_context[-1*prev_context:]

                if fp16:
                    feature_vector = feature_vector.half()

                if output_format not in ['scp', 'scpmem']:
                    data.append(feature_vector.numpy())
                    # convert to numpy for serialization
                else:
                    data.append(path)

                lengths.append(feature_vector.size(0))

                line = f.readline()

                if (index + 1) % 100000 == 0:
                    print("[INFO] Thread %d Processed %d audio utterances." %
                          (worker_id, index + 1))

                index = index + 1

        result['data'] = data
        result['sizes'] = lengths
        result['id'] = worker_id
        result['total'] = len(lengths)

        return result
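The concat branch above stacks every concat consecutive frames into one wider frame, zero-padding the tail so the frame count divides evenly. A small self-contained check of that reshape on dummy data (shapes only):

import torch

concat = 4
feature_vector = torch.randn(10, 40)        # 10 frames of 40-dim features
add = (concat - feature_vector.size(0) % concat) % concat   # 2 padding frames
z = torch.zeros(add, feature_vector.size(1))
padded = torch.cat((feature_vector, z), 0)  # (12, 40)
stacked = padded.reshape(padded.size(0) // concat, padded.size(1) * concat)
print(stacked.shape)                        # torch.Size([3, 160])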
Example #6
    def binarize_file_single_thread(filename,
                                    tokenizer,
                                    vocab,
                                    worker_id=0,
                                    bos_word=None,
                                    eos_word=None,
                                    offset=0,
                                    end=-1,
                                    data_type='int64',
                                    verbose=False):
        """
        This function should read in the lines, convert sentences to tensors
        And then finalize into a dataset?
        """

        result = dict()
        unk_word = onmt.constants.UNK_WORD

        data = list()
        sizes = list()

        count = 0

        with open(filename, 'r', encoding='utf-8') as f:
            f.seek(offset)

            # next(f) breaks f.tell(), hence readline() must be used
            line = safe_readline(f)

            while line:
                if 0 < end < f.tell():
                    break

                tokenized_sent = tokenizer.tokenize(line)

                binarized_line = vocab.convertToIdx(tokenized_sent,
                                                    unk_word,
                                                    bos_word=bos_word,
                                                    eos_word=eos_word,
                                                    type=data_type)

                # convert to numpy before handing the result back to the parent:
                # torch.Tensor is not reliably serializable across the
                # multiprocessing boundary
                data += [binarized_line.numpy()]
                sizes += [len(tokenized_sent)]

                line = f.readline()

                count += 1
                if count % 100000 == 0:
                    if verbose:
                        print("[INFO] Thread %d processed %d lines." %
                              (worker_id, count))

        if verbose:
            print("[INFO] Thread %d Done." % worker_id)
        result['data'] = data
        result['sizes'] = sizes
        result['id'] = worker_id
        result['total'] = len(sizes)

        return result
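Each worker returns a dict with 'data', 'sizes', and its worker 'id'. A hypothetical parent process would merge the per-worker chunks back in worker order, along these lines:

def merge_worker_results(results):
    # restore the original line order by sorting the per-worker result
    # dicts on their 'id' before concatenating data and sizes
    data, sizes = [], []
    for result in sorted(results, key=lambda r: r['id']):
        data.extend(result['data'])
        sizes.extend(result['sizes'])
    return data, sizes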