def get_counter_file_batch(self, txt_file_batch):
    """Count vocabulary items across a batch of text files.

    Parameters
    ----------
    txt_file_batch : list of str
        Paths of the text files to tokenize and count.

    Returns
    -------
    Counter
        Token frequencies accumulated over every line of every file.
    """
    token_counts = Counter()
    for path in txt_file_batch:
        with io.open(path, 'r', encoding=self.encoding) as handle:
            for line in handle:
                # count_tokens folds the new tokens into the running counter
                # and returns the updated Counter.
                token_counts = nlp.data.count_tokens(
                    self.tokenizer.tokenize(line), counter=token_counts)
    return token_counts
def get_counter_dir_parallel(self, data_dir, pat):
    """Count vocabulary items over every file in ``data_dir`` matching ``pat``.

    The matched files are split into roughly one batch per CPU core and the
    batches are counted in parallel worker processes via ``mantichora``.

    NOTE(review): another definition of this same method name appears later
    in this file; if both live in the same class the later one shadows this
    one — confirm which version is intended.

    Parameters
    ----------
    data_dir : str
        Directory holding the text files.
    pat : str
        Glob pattern selecting files within ``data_dir``.

    Returns
    -------
    Counter
        Aggregate token frequencies over all matched files.
    """
    files = glob.glob(data_dir + '/' + pat)
    # One batch per available core, with at least one file per batch.
    per_batch = max(1, len(files) // cpu_count())
    file_batches = list(self.chunks(files, per_batch))
    logging.info(
        "Counting vocabulary over {} text files with {} batches".format(
            len(files), len(file_batches)))
    with mantichora() as mcore:
        for idx, batch in enumerate(file_batches):
            mcore.run(self.task,
                      "Counting Vocab Items - Batch {}".format(idx),
                      batch)
        counters = mcore.returns()
    # Merge the per-worker counters into a single Counter.
    return sum(counters, Counter())
def get_counter_dir_parallel(self, txt_dir, pat='*.txt'):
    """Count vocabulary items over every file in ``txt_dir`` matching ``pat``.

    Files are first grouped into ~20 small batches; those batches are then
    distributed across worker processes (roughly one group per CPU core)
    using ``mantichora``. Each worker returns a list of Counters, which are
    flattened and merged into a single result.

    Parameters
    ----------
    txt_dir : str
        Directory holding the text files.
    pat : str, optional
        Glob pattern selecting files within ``txt_dir`` (default ``'*.txt'``).

    Returns
    -------
    Counter
        Aggregate token frequencies over all matched files.
    """
    def batches(seq, n):
        # Yield successive slices of length n (last one may be shorter).
        for i in range(0, len(seq), n):
            yield seq[i:i + n]

    files = glob.glob(txt_dir + '/' + pat)
    # Aim for ~20 file batches overall (at least one file per batch).
    batch_size = max(1, len(files) // 20)
    file_batches = list(batches(files, batch_size))
    # BUG FIX: the per-worker group size must be derived from the number of
    # file *batches*, not the number of files. Using len(files) here made
    # the group size exceed the ~20 batches for any realistic file count,
    # collapsing all work into a single group and defeating the parallelism.
    group_size = max(1, len(file_batches) // cpu_count())
    file_batch_batches = list(self.chunks(file_batches, group_size))
    with mantichora() as mcore:
        for i, batch_group in enumerate(file_batch_batches):
            mcore.run(self.task,
                      "Counting Vocab Items - Batch {}".format(i),
                      batch_group)
        counter_lists = mcore.returns()
    # Each worker returns a list of Counters (one per file batch):
    # flatten, then merge.
    counters = [c for sub in counter_lists for c in sub]
    return sum(counters, Counter())