import concurrent.futures
import os
import time
from concurrent.futures import ProcessPoolExecutor

from gensim.corpora import Dictionary
from gensim.models.phrases import Phraser, Phrases
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn


def preprocess_text(docs):
    # Split the corpus into roughly equal slices, one per CPU core.
    num_task = os.cpu_count()
    len_slices = len(docs) // num_task
    remainder_slices = len(docs) % num_task

    texts = []
    stoplist = set(stopwords.words('english'))
    wn.ensure_loaded()  # load WordNet once in the parent before spawning workers

    t_start = time.perf_counter()
    with ProcessPoolExecutor(max_workers=num_task) as executor:
        futures_tokenize = []
        for n in range(num_task):
            upper_bound = (n + 1) * len_slices
            if n == num_task - 1:
                # The last slice also absorbs the remainder of the division.
                upper_bound = (n + 1) * len_slices + remainder_slices
            print(n, upper_bound)
            futures_tokenize.append(
                executor.submit(preprocess_tokenize,
                                docs[n * len_slices:upper_bound],
                                stoplist))
        for future in concurrent.futures.as_completed(futures_tokenize):
            texts += future.result()
    t_stop = time.perf_counter()
    print("removed stopwords and lemmatized in {} s".format(t_stop - t_start))

    # Add bigrams to the documents (only ones that appear 20 times or more).
    bigram = Phraser(Phrases(texts, min_count=20))
    for idx in range(len(texts)):
        for token in bigram[texts[idx]]:
            if '_' in token:
                # Token is a bigram: append it to the document.
                texts[idx].append(token)
    print("Done bigrams")

    # Build the dictionary and prune very rare / very frequent tokens.
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=30, no_above=0.5)
    # Drop the uninformative token "like" if it survived the pruning above.
    if "like" in dictionary.token2id:
        dictionary.filter_tokens(bad_ids=[dictionary.token2id["like"]])
    # Reserve id 0 for a padding token.
    special_tokens = {'_pad_': 0}
    dictionary.patch_with_special_tokens(special_tokens)

    return texts, dictionary
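
# preprocess_tokenize (the worker submitted to the pool above) is not shown in
# this snippet. A minimal sketch, assuming each document is a raw string and the
# worker does what the timing message claims (tokenize, drop stopwords,
# lemmatize); the tokenizer choice and helper body are illustrative assumptions,
# not the original implementation.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer


def preprocess_tokenize(docs_slice, stoplist):
    tokenizer = RegexpTokenizer(r'\w+')   # keep word characters only
    lemmatizer = WordNetLemmatizer()
    texts = []
    for doc in docs_slice:
        tokens = tokenizer.tokenize(doc.lower())
        texts.append([lemmatizer.lemmatize(tok) for tok in tokens
                      if tok not in stoplist and not tok.isdigit()])
    return texts


# Example call; ProcessPoolExecutor needs the usual main guard when the start
# method is "spawn" (Windows, macOS). `corpus` here is a hypothetical list of
# raw document strings standing in for the real data.
if __name__ == '__main__':
    corpus = ["The quick brown fox jumps over the lazy dog."] * 100
    texts, dictionary = preprocess_text(corpus)
    print(len(texts), len(dictionary))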