def work(unit):
    """Worker callback for the processing pool.

    If *unit* is a Thread (project type — presumably one board thread;
    TODO confirm), download and decode it, collect every word found in
    a fixed set of post fields, and return them as a set of byte
    strings.  For any other unit kind, expand it via ``unit.process()``
    and re-queue each child on the pool (no return value).

    NOTE(review): a second ``def work`` later in this file shadows this
    definition — confirm which one is intended to win.
    """
    logger.info('working %r', unit)
    if not isinstance(unit, Thread):
        # Not a leaf: fan the children back out to the pool, recursing
        # through this same worker.
        for child in unit.process():
            pool.push(work, child)
        return
    thread = unit.download_and_decode()
    fields = ('name', 'email', 'sub', 'com', 'filename')
    vocab = set()
    for entry in thread['posts']:
        for key in fields:
            # Sanitize then encode: word_pattern is presumably compiled
            # against bytes — TODO confirm at its definition site.
            cleaned = sanitize(entry.get(key, '')).encode('utf8')
            vocab.update(word_pattern.findall(cleaned))
    return vocab
def work(unit):
    """Worker callback for the processing pool (n-gram variant).

    If *unit* is a Thread (project type — presumably one board thread;
    TODO confirm), download and decode it, tokenize each post's "com"
    field, lowercase the tokens, and return a ``collections.Counter``
    of the n-grams produced by ``generate_ngrams``.  For any other
    unit kind, expand it via ``unit.process()`` and re-queue each
    child on the pool (no return value).

    NOTE(review): this redefines the earlier ``work`` in this file and
    shadows it — confirm which definition is intended to win.
    """
    logger.info("working %r", unit)
    if not isinstance(unit, Thread):
        # Not a leaf: fan the children back out to the pool, recursing
        # through this same worker.
        for child in unit.process():
            pool.push(work, child)
        return
    thread = unit.download_and_decode()
    counts = collections.Counter()
    for entry in thread["posts"]:
        # Sanitize then encode: token_pattern is presumably compiled
        # against bytes — TODO confirm at its definition site.
        cleaned = sanitize(entry.get("com", "")).encode("utf8")
        lowered = [tok.lower() for tok in token_pattern.findall(cleaned)]
        counts.update(generate_ngrams(lowered))
    return counts