def build_vocab(self, cutoff=0.5, batchsize=100):
    '''Build the vocabulary from word counts accumulated over all articles'''
    counter = Counter()
    articles = []

    for article in self.articles:
        articles.append(article)
        if len(articles) == batchsize:
            for counts in apply_async(count_words, articles):
                counter.update(counts)
            articles = []

    # Count the final partial batch so the last few articles are not skipped
    if articles:
        for counts in apply_async(count_words, articles):
            counter.update(counts)

    # Keep the most frequent fraction of words, then sort them alphabetically
    self.vocab = counter.most_common(int(len(counter) * cutoff))
    self.vocab = sorted([pair[0] for pair in self.vocab])
    self._reset_streams()
def write_to_cache(self, path, processor, n_per_file=200, pool_size=10):
    '''Write batches of preprocessed articles to cache for later use'''
    paramlist = []
    batch = []
    count = 0

    for article in self.articles:
        batch.append(article)
        if len(batch) == n_per_file:
            fname = str(count) + '.txt'
            paramlist.append((batch, path + fname, processor))
            batch = []
            count += 1
            if len(paramlist) == pool_size:
                apply_async(wiki_cache, paramlist)
                paramlist = []

    if len(paramlist) != 0:
        apply_async(wiki_cache, paramlist)

    self._reset_streams()
def _update_order_vectors(self, batch):
    sents = apply_async(self._preprocess, batch)
    self._run_pool(self._encode_order, sents, self.order_vectors)
def _update_context_vectors(self, batch):
    sents = apply_async(self._preprocess, batch)
    self._run_pool(self._encode_context, sents, self.context_vectors)
def _encode_all(self, batch):
    sents = apply_async(self._preprocess, batch)
    # Filter out preprocessed results that are too short to encode
    sents = [lst for lst in sents if len(lst) > 1]

    # Syntax encoding runs on the raw articles; context and order encoding
    # use the preprocessed sentences
    self._run_pool(self._encode_syntax, batch, self.syntax_vectors)
    self._run_pool(self._encode_context, sents, self.context_vectors)
    self._run_pool(self._encode_order, sents, self.order_vectors)
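
# The helpers used above (apply_async, count_words, wiki_cache) are not defined
# in this section. A minimal sketch of the two generic ones is given below,
# assuming apply_async is a thin wrapper around multiprocessing.Pool and
# count_words returns per-article word counts; the real definitions may differ.
import multiprocessing as mp
from collections import Counter


def apply_async(func, arglist):
    '''Map func over arglist with a process pool and return the results'''
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(func, arglist)
    return results


def count_words(article):
    '''Count word occurrences in one article (simple whitespace tokenization)'''
    return Counter(article.split())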