def extract_snippets(patterns, tags, compute_index, lines_of_context, show_progress=False):
    """
    Scan post bodies for code snippets matching each pattern and store them.

    For every post selected, the HTML body is parsed once and each pattern's
    scanner is run over it. Every snippet a scanner returns is saved as a
    `PostSnippet` linked to the post and to the pattern's `SnippetPattern`.

    Args:
        patterns: iterable of patterns. Each is used to get-or-create a
            `SnippetPattern` record and to build a `PythonSnippetExtractor`.
        tags: collection of tag names restricting which posts are scanned,
            or None to scan all posts.
        compute_index: value stored on every `PostSnippet` created here.
        lines_of_context: forwarded unchanged to `PythonSnippetExtractor`
            (presumably the number of surrounding lines kept per match;
            confirm against the extractor).
        show_progress: when true, render a progress bar while scanning.
    """
    # Fetch all posts, filtering by those for which tags have been specified
    posts = Post.select(Post.id, Post.body)
    if tags is not None:
        posts = (
            posts
            .join(PostTag, on=(Post.id == PostTag.post_id))
            .join(Tag, on=(Tag.id == PostTag.tag_id))
            .where(Tag.tag_name << tags)
            # The join produces one row per matching (post, tag) pair, so a
            # post carrying several of the requested tags would otherwise be
            # scanned repeatedly and yield duplicate PostSnippet records.
            .distinct()
        )

    # Initialize the progress bar
    if show_progress:
        post_count = posts.count()
        progress_bar = ProgressBar(maxval=post_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Processing web page ', Counter(), ' / ' + str(post_count) + '.'
        ])
        progress_bar.start()

    # Zip all patterns with a scanner that scans for it
    pattern_scanner_pairs = []
    for pattern in patterns:
        snippet_pattern, _ = SnippetPattern.get_or_create(pattern=pattern)
        extractor = PythonSnippetExtractor(pattern, lines_of_context)
        # Only <pre> and <code> nodes are handed to the extractor
        scanner = NodeScanner(extractor, tags=['pre', 'code'])
        pattern_scanner_pairs.append((snippet_pattern, scanner))

    # For each post, extract snippets for all patterns
    # Note that currently there is some repeated work: each extractor will
    # try to parse all relevant nodes as Python
    for post_index, post in enumerate(posts, start=1):
        document = BeautifulSoup(post.body, 'html.parser')

        for snippet_pattern, scanner in pattern_scanner_pairs:
            # Store a record of each snippet that was found
            for snippet in scanner.scan(document):
                PostSnippet.create(
                    post=post,
                    snippet=snippet,
                    compute_index=compute_index,
                    pattern=snippet_pattern,
                )

        # Progress advances once per post, after all patterns have been tried
        if show_progress:
            progress_bar.update(post_index)

    if show_progress:
        progress_bar.finish()
def test_find_snippet(self):
    # End-to-end check: extracting the 're.findall' pattern from a single
    # post stores exactly one snippet (with surrounding context lines) and
    # one pattern record, linked to each other and to the post.

    # The backslash is escaped so that the literal contains the two
    # characters `\w` without relying on an invalid escape sequence.
    findall_line = 'characters = re.findall(r"\\w", string)'

    # First, we create the models in memory
    post = create_post(body=self._make_post_body('\n'.join([
        'import re',
        '',
        'string = "foo"',
        findall_line,
        '',
        'for c in characters:',
        '    print c',
    ])))

    # Here is the line of code that actually performs the extraction for a pattern
    # By default, it should run extraction for all posts
    self._extract(['re.findall'])

    # There are a few effects that we check
    # First, that the number of snippets has increased
    self.assertEqual(PostSnippet.select().count(), 1)
    post_snippet = PostSnippet.select().first()

    # The content of this snippet should show context around the pattern
    self.assertEqual(post_snippet.snippet, '\n'.join([
        '',
        'string = "foo"',
        findall_line,
        '',
        'for c in characters:',
    ]))

    # The snippet should link back to the post that it was created from
    self.assertEqual(post_snippet.post, post)

    # A model for the pattern should have been created
    self.assertEqual(SnippetPattern.select().count(), 1)
    snippet_pattern = SnippetPattern.select().first()
    self.assertEqual(snippet_pattern.pattern, 're.findall')

    # The snippet should be linked back to the pattern
    self.assertEqual(snippet_pattern, post_snippet.pattern)