Пример #1
0
 def get_counter_file_batch(self, txt_file_batch):
     """Count token frequencies over a batch of text files.

     Each file is read line by line using ``self.encoding``, tokenized with
     ``self.tokenizer``, and the token counts are accumulated into a single
     ``Counter`` via ``nlp.data.count_tokens``.

     Parameters
     ----------
     txt_file_batch : iterable of str
         Paths of the text files to count.

     Returns
     -------
     Counter
         Aggregate token counts across every line of every file in the batch.
     """
     token_counts = Counter()
     for path in txt_file_batch:
         with io.open(path, 'r', encoding=self.encoding) as handle:
             for line in handle:
                 # count_tokens folds this line's tokens into the running
                 # counter and returns the updated Counter.
                 token_counts = nlp.data.count_tokens(
                     self.tokenizer.tokenize(line), counter=token_counts)
     return token_counts
Пример #2
0
 def get_counter_dir_parallel(self, data_dir, pat):
     """Build a vocabulary Counter over all files in *data_dir* matching *pat*.

     The matched files are split into roughly ``cpu_count()`` batches via
     ``self.chunks`` and each batch is counted by ``self.task`` in a parallel
     worker managed by ``mantichora``; the per-batch Counters are summed.

     Parameters
     ----------
     data_dir : str
         Directory containing the text files.
     pat : str
         Glob pattern (relative to *data_dir*) selecting the files.

     Returns
     -------
     Counter
         Aggregate token counts over every matched file.
     """
     matched = glob.glob(data_dir + '/' + pat)
     # One batch per CPU core (chunk size of at least 1 file).
     chunk_size = max(1, len(matched) // cpu_count())
     batches = list(self.chunks(matched, chunk_size))
     logging.info(
         "Counting vocabulary over {} text files with {} batches".format(
             len(matched), len(batches)))
     with mantichora() as mcore:
         for idx, batch in enumerate(batches):
             mcore.run(self.task,
                       "Counting Vocab Items - Batch {}".format(idx),
                       batch)
         partial_counts = mcore.returns()
     return sum(partial_counts, Counter())
Пример #3
0
    def get_counter_dir_parallel(self, txt_dir, pat='*.txt'):
        """Build a vocabulary Counter over all files in *txt_dir* matching *pat*.

        Files are first grouped into batches of ~1/20 of the corpus, the
        batches are then distributed across ``cpu_count()`` parallel workers
        (each worker's ``self.task`` returns a list of Counters, one per
        batch), and all resulting Counters are summed.

        Parameters
        ----------
        txt_dir : str
            Directory containing the text files.
        pat : str, optional
            Glob pattern selecting files within *txt_dir* (default ``'*.txt'``).

        Returns
        -------
        Counter
            Aggregate token counts over every matched file.
        """
        def batches(l, n):
            # Yield successive n-sized slices of l.
            for i in range(0, len(l), n):
                yield l[i:i + n]

        files = glob.glob(txt_dir + '/' + pat)
        # Group files into ~20 batches (at least 1 file per batch).
        batch_size = max(1, int(len(files) / 20))
        file_batches = list(batches(files, batch_size))
        # FIX: the per-worker chunk size must be derived from the number of
        # BATCHES being chunked, not the number of files. The original used
        # len(files) // cpu_count(), which is almost always larger than
        # len(file_batches), so every batch landed in a single chunk and the
        # whole count ran on one worker. The returned Counter is unchanged;
        # only the distribution of work across workers is corrected.
        file_batch_batches = list(
            self.chunks(file_batches,
                        max(1, len(file_batches) // cpu_count())))
        with mantichora() as mcore:
            for i, batch_group in enumerate(file_batch_batches):
                mcore.run(self.task,
                          "Counting Vocab Items - Batch {}".format(i),
                          batch_group)
            counter_groups = mcore.returns()
        # Each worker returns a list of Counters (one per file batch); flatten
        # before summing.
        counters = [c for group in counter_groups for c in group]
        return sum(counters, Counter())