예제 #1
0
    def build_vocab(self, cutoff=0.5, batchsize=100):
        """Build a sorted vocabulary from the most frequent words.

        Articles from ``self.articles`` are grouped into batches of
        ``batchsize`` and each batch is handed to
        ``apply_async(count_words, batch)``; every item that call yields is
        merged into a single ``Counter``. The ``cutoff`` fraction of distinct
        words with the highest counts is kept and stored, sorted, in
        ``self.vocab``.

        Args:
            cutoff: Fraction of the distinct words to keep, most frequent
                first.
            batchsize: Number of articles handed to ``apply_async`` at once.
        """
        counter = Counter()
        articles = []

        def count_batch(pending):
            # Merge the per-item counts produced for one batch.
            for counts in apply_async(count_words, pending):
                counter.update(counts)

        for article in self.articles:
            articles.append(article)
            if len(articles) == batchsize:
                count_batch(articles)
                articles = []

        # The stream rarely divides evenly by batchsize: count the trailing
        # partial batch too, otherwise those articles never reach the vocab
        # (and a corpus smaller than batchsize would yield an empty one).
        if articles:
            count_batch(articles)

        top_pairs = counter.most_common(int(len(counter) * cutoff))
        self.vocab = sorted(word for word, _ in top_pairs)
        # NOTE(review): presumably rewinds the article stream consumed
        # above -- confirm against _reset_streams.
        self._reset_streams()
예제 #2
0
    def write_to_cache(self, path, processor, n_per_file=200, pool_size=10):
        '''Write batches of preprocessed articles to cache for later use.

        Articles from ``self.articles`` are grouped into batches of
        ``n_per_file``. Each batch becomes one ``(batch, filename, processor)``
        job, where the filename is ``path`` concatenated with a running
        index and ``.txt``. Jobs are handed to
        ``apply_async(wiki_cache, jobs)`` once ``pool_size`` of them have
        accumulated, and once more at the end for any remainder.

        Args:
            path: Prefix prepended directly to each file name (no separator
                is inserted, so it should end with one if it is a directory).
            processor: Passed through unchanged to ``wiki_cache`` with every
                job.
            n_per_file: Number of articles per cache file.
            pool_size: Number of jobs to accumulate before dispatching.
        '''
        paramlist = []
        batch = []
        count = 0

        for article in self.articles:
            batch.append(article)

            if len(batch) == n_per_file:
                fname = str(count) + '.txt'
                paramlist.append((batch, path + fname, processor))
                batch = []
                count += 1

            if len(paramlist) == pool_size:
                apply_async(wiki_cache, paramlist)
                paramlist = []

        # Queue the trailing partial batch; without this, the last
        # (len(articles) % n_per_file) articles would never be cached.
        if batch:
            fname = str(count) + '.txt'
            paramlist.append((batch, path + fname, processor))

        if paramlist:
            apply_async(wiki_cache, paramlist)

        # NOTE(review): presumably rewinds the article stream consumed
        # above -- confirm against _reset_streams.
        self._reset_streams()
예제 #3
0
 def _update_order_vectors(self, batch):
     """Preprocess ``batch`` and run the order encoder over the result.

     ``batch`` is passed to ``apply_async`` together with
     ``self._preprocess``; whatever that returns is handed to
     ``self._run_pool`` along with ``self._encode_order`` and
     ``self.order_vectors``.
     """
     preprocessed = apply_async(self._preprocess, batch)
     self._run_pool(
         self._encode_order,
         preprocessed,
         self.order_vectors,
     )
예제 #4
0
 def _update_context_vectors(self, batch):
     """Preprocess ``batch`` and run the context encoder over the result.

     ``batch`` is passed to ``apply_async`` together with
     ``self._preprocess``; whatever that returns is handed to
     ``self._run_pool`` along with ``self._encode_context`` and
     ``self.context_vectors``.
     """
     preprocessed = apply_async(self._preprocess, batch)
     self._run_pool(
         self._encode_context,
         preprocessed,
         self.context_vectors,
     )
예제 #5
0
 def _encode_all(self, batch):
     """Run the syntax, context and order encoders for one batch.

     ``batch`` is preprocessed via ``apply_async(self._preprocess, batch)``
     and results of length 1 or less are discarded. The syntax encoder is
     given the raw ``batch``; the context and order encoders are given the
     filtered, preprocessed sentences.
     """
     preprocessed = apply_async(self._preprocess, batch)

     # Keep only results with more than one element.
     usable = []
     for tokens in preprocessed:
         if len(tokens) > 1:
             usable.append(tokens)

     # Syntax encoding works from the unprocessed batch, not `usable`.
     self._run_pool(self._encode_syntax, batch, self.syntax_vectors)
     self._run_pool(self._encode_context, usable, self.context_vectors)
     self._run_pool(self._encode_order, usable, self.order_vectors)
예제 #6
0
 def _update_order_vectors(self, batch):
     """Update the order vectors from one batch.

     The batch is first sent through ``apply_async`` with
     ``self._preprocess``; the returned value is then given to
     ``self._run_pool`` with ``self._encode_order`` as the worker and
     ``self.order_vectors`` as the third argument.
     """
     processed_batch = apply_async(self._preprocess, batch)
     self._run_pool(
         self._encode_order, processed_batch, self.order_vectors
     )
예제 #7
0
 def _update_context_vectors(self, batch):
     """Update the context vectors from one batch.

     The batch is first sent through ``apply_async`` with
     ``self._preprocess``; the returned value is then given to
     ``self._run_pool`` with ``self._encode_context`` as the worker and
     ``self.context_vectors`` as the third argument.
     """
     processed_batch = apply_async(self._preprocess, batch)
     self._run_pool(
         self._encode_context, processed_batch, self.context_vectors
     )
예제 #8
0
 def _encode_all(self, batch):
     """Encode one batch with the syntax, context and order encoders.

     The batch is preprocessed through ``apply_async`` with
     ``self._preprocess``, and only results longer than one element are
     kept. Note that the syntax encoder receives the original ``batch``,
     while the context and order encoders receive the filtered results.
     """
     processed_batch = apply_async(self._preprocess, batch)
     multi_item = list(filter(lambda item: len(item) > 1, processed_batch))

     # Raw batch for syntax; filtered preprocessed output for the rest.
     self._run_pool(self._encode_syntax, batch, self.syntax_vectors)
     self._run_pool(self._encode_context, multi_item, self.context_vectors)
     self._run_pool(self._encode_order, multi_item, self.order_vectors)