def build_tokenized_corpus(input_root, tokenizer, output_dir, skip_dirs=False,
                           n_processes=1, wiki_only=False):
    if not exists(output_dir):
        makedirs(output_dir)

    all_files = _gather_files(input_root, output_dir, skip_dirs, wiki_only)

    if n_processes == 1:
        voc = build_tokenized_files(tqdm(all_files, ncols=80), input_root, output_dir, tokenizer)
    else:
        voc = set()
        from multiprocessing import Pool
        with Pool(n_processes) as pool:
            # Distribute the files across the workers in chunks of 500 files each
            chunks = split(all_files, n_processes)
            chunks = flatten_iterable(group(c, 500) for c in chunks)
            pbar = tqdm(total=len(chunks), ncols=80)
            for v in pool.imap_unordered(
                    _build_tokenized_files_t,
                    [[c, input_root, output_dir, tokenizer] for c in chunks]):
                voc.update(v)
                pbar.update(1)
            pbar.close()

    voc_file = join(output_dir, "vocab.txt")
    with open(voc_file, "w") as f:
        for word in sorted(voc):
            f.write(word)
            f.write("\n")
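
# The chunking helpers used above come from the surrounding codebase. A minimal
# sketch of the behaviour this function relies on (assumed from usage; the real
# implementations may differ):
def _split_sketch(lst, n_groups):
    # Partition `lst` into `n_groups` parts of near-equal size
    per_group, remainder = divmod(len(lst), n_groups)
    out, ix = [], 0
    for i in range(n_groups):
        size = per_group + (1 if i < remainder else 0)
        out.append(lst[ix:ix + size])
        ix += size
    return out

def _group_sketch(lst, max_group_size):
    # Break `lst` into consecutive chunks of at most `max_group_size` items
    return [lst[i:i + max_group_size] for i in range(0, len(lst), max_group_size)]

def _flatten_iterable_sketch(listoflists):
    # Concatenate an iterable of lists into a single flat list
    return [item for sublist in listoflists for item in sublist]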
def _build_dataset(cls, corpus_name, n_processes, train_file: str, dev_file: str):
    hotpotqa = cls(corpus_name=corpus_name)

    with open(join(hotpotqa.dir, train_file), "rt") as f_train:
        _raw_train = json.load(f_train)
    with open(join(hotpotqa.dir, dev_file), "rt") as f_dev:
        _raw_dev = json.load(f_dev)
    dataset = {'train': _raw_train, 'dev': _raw_dev}

    for d in dataset:
        with Pool(n_processes) as pool, tqdm(total=len(dataset[d]), desc=d, ncols=70) as pbar:
            tqdm.write(bcolors.OKBLUE + "[+] Preprocess for {} set".format(d) + bcolors.ENDC)
            chunks = split(dataset[d], n_processes)
            for questions in pool.starmap(
                    hotpotqa._build_question,
                    [[c, hotpotqa.tokenizer, hotpotqa.detector] for c in chunks]):
                pbar.update(len(questions))
                if d == 'train':
                    hotpotqa._train += questions
                elif d == 'dev':
                    hotpotqa._dev += questions

    hotpotqa._train = FilteredData(hotpotqa._train, len(hotpotqa._train))
    hotpotqa._dev = FilteredData(hotpotqa._dev, len(hotpotqa._dev))
    return hotpotqa
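
# bcolors above is assumed to be the usual ANSI escape-code helper; a minimal
# sketch covering only the attributes this function uses (the real class likely
# defines more colors):
class _BColorsSketch:
    OKBLUE = "\033[94m"   # start blue text
    ENDC = "\033[0m"      # reset terminal formatting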
def compute_answer_spans_par(questions: List[TriviaQaQuestion], corpus, tokenizer,
                             detector, n_processes: int):
    if n_processes == 1:
        word_tokenize = tokenizer.tokenize_paragraph_flat
        compute_answer_spans(questions, corpus, word_tokenize, detector)
        return questions
    from multiprocessing import Pool
    with Pool(n_processes) as p:
        chunks = split(questions, n_processes)
        questions = flatten_iterable(p.starmap(
            _compute_answer_spans_chunk,
            [[c, corpus, tokenizer, detector] for c in chunks]))
        return questions
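
# A plausible sketch of the per-chunk worker used above, assuming it mirrors the
# single-process branch (tokenize, detect spans, return the chunk); the real
# _compute_answer_spans_chunk may differ:
def _compute_answer_spans_chunk_sketch(questions, corpus, tokenizer, detector):
    word_tokenize = tokenizer.tokenize_paragraph_flat
    compute_answer_spans(questions, corpus, word_tokenize, detector)
    return questions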
def preprocess_par(questions: List, evidence, preprocessor,
                   n_processes=2, chunk_size=200, name=None):
    if chunk_size <= 0:
        raise ValueError("Chunk size must be > 0, but got %s" % chunk_size)
    if n_processes is not None and n_processes <= 0:
        raise ValueError("n_processes must be >= 1 or None, but got %s" % n_processes)

    n_processes = min(len(questions), n_processes)

    if n_processes == 1:
        out = preprocessor.preprocess(tqdm(questions, desc=name, ncols=80), evidence)
        preprocessor.finalize_chunk(out)
        return out
    else:
        from multiprocessing import Pool
        chunks = split(questions, n_processes)
        chunks = flatten_iterable([group(c, chunk_size) for c in chunks])
        print("Processing %d chunks with %d processes" % (len(chunks), n_processes))
        pbar = tqdm(total=len(questions), desc=name, ncols=80)
        lock = Lock()

        def call_back(results):
            preprocessor.finalize_chunk(results[0])
            with lock:
                # FIXME Even with the lock, the progress bar still is jumping around
                pbar.update(results[1])

        with Pool(n_processes) as pool:
            results = [pool.apply_async(_preprocess_and_count, [c, evidence, preprocessor],
                                        callback=call_back)
                       for c in chunks]
            results = [r.get()[0] for r in results]
        pbar.close()

        output = results[0]
        for r in results[1:]:
            output += r
        return output
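
# The callback above reads results[0] (the preprocessed chunk) and results[1]
# (a count used to advance the progress bar), and the main loop keeps only
# r.get()[0]. A sketch of a worker consistent with that contract (the real
# _preprocess_and_count may differ):
def _preprocess_and_count_sketch(questions, evidence, preprocessor):
    output = preprocessor.preprocess(questions, evidence)
    return output, len(questions)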
def get_evidence_voc(corpus, n_processes=1):
    doc_ids = corpus.list_documents()
    voc = Counter()

    if n_processes == 1:
        for doc in tqdm(doc_ids):
            # Accumulate counts rather than overwriting `voc` on each iteration
            voc.update(corpus.get_document(doc, flat=True))
    else:
        from multiprocessing import Pool
        chunks = split(doc_ids, n_processes)
        chunks = flatten_iterable(group(x, 10000) for x in chunks)
        pbar = tqdm(total=len(chunks), ncols=80)
        with Pool(n_processes) as pool:
            for v in pool.imap_unordered(_extract_voc_tuple, [[corpus, c] for c in chunks]):
                voc += v
                pbar.update(1)
        pbar.close()
    return voc
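
# imap_unordered hands each worker a single [corpus, doc_ids] pair, and the loop
# above merges results with `voc += v`, so the worker presumably returns a
# Counter. A sketch consistent with that usage (the real _extract_voc_tuple may
# differ):
def _extract_voc_tuple_sketch(arg):
    corpus, doc_ids = arg
    voc = Counter()
    for doc in doc_ids:
        voc.update(corpus.get_document(doc, flat=True))
    return voc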