예제 #1
0
 def test_find_snippets_for_multiple_patterns(self):
     # Two patterns that both occur in the same post should each produce their own
     # snippet record, linked to the pattern that matched.
     # The line containing `\w` is a raw literal so that the backslash is kept
     # without relying on an unrecognized escape sequence.
     code_lines = [
         'import re',
         '',
         'string = "foo"',
         r'characters = re.findall(r"\w", string)',
         '',
         'for c in characters:',
         '    print c',
     ]
     create_post(body=self._make_post_body('\n'.join(code_lines)))
     self._extract(['re.findall', '"foo"'])
     self.assertEqual(PostSnippet.select().count(), 2)
     snippets = [s.snippet for s in PostSnippet.select()]
     patterns = [s.pattern.pattern for s in PostSnippet.select()]

     # Snippet around the 're.findall' line: from the first blank line through
     # 'for c in characters:' (two lines on either side of the match)
     self.assertIn('\n'.join(code_lines[1:6]), snippets)

     # Snippet around the '"foo"' line: from 'import re' through the second blank line
     self.assertIn('\n'.join(code_lines[0:5]), snippets)

     self.assertIn('re.findall', patterns)
     self.assertIn('"foo"', patterns)
예제 #2
0
 def test_find_multiple_snippets_in_one_post(self):
     # A single pattern that occurs twice in one post should produce two snippets,
     # one centered on each occurrence.
     # Lines containing `\w` are raw literals so that the backslash is kept without
     # relying on an unrecognized escape sequence.
     code_lines = [
         'import re',
         '',
         'string = "foo"',
         r'characters = re.findall(r"\w", string)',
         'for c in characters:',
         '    print c',
         '',
         r'digits = re.findall(r"\w", string)',
         'for d in digits:',
         '    print d',
     ]
     create_post(body=self._make_post_body('\n'.join(code_lines)))
     self._extract(['re.findall'])
     self.assertEqual(PostSnippet.select().count(), 2)
     snippets = [code.snippet for code in PostSnippet.select()]

     # First occurrence: blank line through the first '    print c'
     self.assertIn('\n'.join(code_lines[1:6]), snippets)

     # Second occurrence: '    print c' through '    print d'
     self.assertIn('\n'.join(code_lines[5:10]), snippets)
def extract_snippets(patterns, tags, compute_index, lines_of_context, show_progress=False):
    """
    Scan posts for code snippets matching each pattern and store what is found.

    For every selected post, the body is parsed as HTML and each pattern's scanner
    is run over it.  Every snippet a scanner returns is saved as a `PostSnippet`
    linked to the post and to the `SnippetPattern` record for that pattern.

    :param patterns: iterable of pattern strings; a `SnippetPattern` record is
        fetched or created for each one.
    :param tags: collection of tag names used to restrict which posts are
        scanned, or `None` to scan all posts.
    :param compute_index: value stored on every `PostSnippet` created by this run.
    :param lines_of_context: forwarded to `PythonSnippetExtractor` for each pattern.
    :param show_progress: when true, display a progress bar while posts are processed.
    """

    # Fetch all posts, filtering by those for which tags have been specified
    posts = Post.select(Post.id, Post.body)
    if tags is not None:
        posts = (
            posts
            .join(PostTag, on=(Post.id == PostTag.post_id))
            .join(Tag, on=(Tag.id == PostTag.tag_id))
            .where(Tag.tag_name << tags)
            # A post carrying several of the requested tags matches the join once
            # per tag.  Collapse those rows so that the post is scanned only once
            # and its snippets are not stored multiple times.
            .distinct()
        )

    # Initialize the progress bar
    if show_progress:
        post_count = posts.count()
        progress_bar = ProgressBar(maxval=post_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Processing web page ', Counter(), ' / ' + str(post_count) + '.'
        ])
        progress_bar.start()

    # Zip all patterns with a scanner that scans for it
    pattern_scanner_pairs = []
    for pattern in patterns:
        snippet_pattern, _ = SnippetPattern.get_or_create(pattern=pattern)
        extractor = PythonSnippetExtractor(pattern, lines_of_context)
        scanner = NodeScanner(extractor, tags=['pre', 'code'])
        pattern_scanner_pairs.append((snippet_pattern, scanner))

    # For each post, extract snippets for all patterns
    # Note that currently there is some repeated work: each extractor will
    # try to parse all relevant nodes as Python
    for post_index, post in enumerate(posts, start=1):
        document = BeautifulSoup(post.body, 'html.parser')

        for snippet_pattern, scanner in pattern_scanner_pairs:
            # Store a record of each snippet that was found
            for snippet in scanner.scan(document):
                PostSnippet.create(
                    post=post,
                    snippet=snippet,
                    compute_index=compute_index,
                    pattern=snippet_pattern,
                )

        if show_progress:
            progress_bar.update(post_index)

    if show_progress:
        progress_bar.finish()
예제 #4
0
def extract_snippets(patterns, tags, compute_index, lines_of_context, show_progress=False):
    """
    Scan posts for code snippets matching each pattern and store what is found.

    For every selected post, the body is parsed as HTML and each pattern's scanner
    is run over it.  Every snippet a scanner returns is saved as a `PostSnippet`
    linked to the post and to the `SnippetPattern` record for that pattern.

    :param patterns: iterable of pattern strings; a `SnippetPattern` record is
        fetched or created for each one.
    :param tags: collection of tag names used to restrict which posts are
        scanned, or `None` to scan all posts.
    :param compute_index: value stored on every `PostSnippet` created by this run.
    :param lines_of_context: forwarded to `PythonSnippetExtractor` for each pattern.
    :param show_progress: when true, display a progress bar while posts are processed.
    """

    # Fetch all posts, filtering by those for which tags have been specified
    posts = Post.select(Post.id, Post.body)
    if tags is not None:
        posts = (
            posts
            .join(PostTag, on=(Post.id == PostTag.post_id))
            .join(Tag, on=(Tag.id == PostTag.tag_id))
            .where(Tag.tag_name << tags)
            # A post carrying several of the requested tags matches the join once
            # per tag.  Collapse those rows so that the post is scanned only once
            # and its snippets are not stored multiple times.
            .distinct()
        )

    # Initialize the progress bar
    if show_progress:
        post_count = posts.count()
        progress_bar = ProgressBar(maxval=post_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Processing web page ', Counter(), ' / ' + str(post_count) + '.'
        ])
        progress_bar.start()

    # Zip all patterns with a scanner that scans for it
    pattern_scanner_pairs = []
    for pattern in patterns:
        snippet_pattern, _ = SnippetPattern.get_or_create(pattern=pattern)
        extractor = PythonSnippetExtractor(pattern, lines_of_context)
        scanner = NodeScanner(extractor, tags=['pre', 'code'])
        pattern_scanner_pairs.append((snippet_pattern, scanner))

    # For each post, extract snippets for all patterns
    # Note that currently there is some repeated work: each extractor will
    # try to parse all relevant nodes as Python
    for post_index, post in enumerate(posts, start=1):
        document = BeautifulSoup(post.body, 'html.parser')

        for snippet_pattern, scanner in pattern_scanner_pairs:
            # Store a record of each snippet that was found
            for snippet in scanner.scan(document):
                PostSnippet.create(
                    post=post,
                    snippet=snippet,
                    compute_index=compute_index,
                    pattern=snippet_pattern,
                )

        if show_progress:
            progress_bar.update(post_index)

    if show_progress:
        progress_bar.finish()
예제 #5
0
 def test_skip_nonpython_code(self):
     # The post body is JavaScript that only mentions the pattern inside a string
     # literal; extraction should not store any snippet for it.
     javascript_lines = [
         'var $ = require("jquery")',
         '$("div").text("div text")',
         'var ranomString = "re.match";',
     ]
     post_body = self._make_post_body('\n'.join(javascript_lines))
     create_post(body=post_body, view_count=375)

     self._extract(['re.match'])

     snippet_count = PostSnippet.select().count()
     self.assertEqual(snippet_count, 0)
def main(patterns, tags, lines_of_context, show_progress, *args, **kwargs):
    """
    Read patterns from a file and run snippet extraction under a new compute index.

    :param patterns: path of a text file containing one pattern per line.
    :param tags: forwarded to `extract_snippets` to restrict which posts are scanned.
    :param lines_of_context: forwarded to `extract_snippets`.
    :param show_progress: forwarded to `extract_snippets`.

    Extra positional and keyword arguments are accepted and ignored.
    """

    # Create a new index for this computation
    # (`or 0` covers the case where the aggregate yields nothing, so numbering starts at 1)
    last_compute_index = PostSnippet.select(fn.Max(PostSnippet.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Read patterns from a file, one per line.  Blank or whitespace-only lines are
    # skipped so that they do not turn into empty-string patterns.
    with open(patterns) as patterns_file:
        stripped_lines = (line.strip() for line in patterns_file)
        pattern_list = [pattern for pattern in stripped_lines if pattern]

    # Run snippet extraction
    extract_snippets(pattern_list, tags, compute_index, lines_of_context, show_progress)
예제 #7
0
def main(patterns, tags, lines_of_context, show_progress, *args, **kwargs):
    """
    Read patterns from a file and run snippet extraction under a new compute index.

    :param patterns: path of a text file containing one pattern per line.
    :param tags: forwarded to `extract_snippets` to restrict which posts are scanned.
    :param lines_of_context: forwarded to `extract_snippets`.
    :param show_progress: forwarded to `extract_snippets`.

    Extra positional and keyword arguments are accepted and ignored.
    """

    # Create a new index for this computation
    # (`or 0` covers the case where the aggregate yields nothing, so numbering starts at 1)
    last_compute_index = PostSnippet.select(fn.Max(PostSnippet.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Read patterns from a file, one per line.  Blank or whitespace-only lines are
    # skipped so that they do not turn into empty-string patterns.
    with open(patterns) as patterns_file:
        stripped_lines = (line.strip() for line in patterns_file)
        pattern_list = [pattern for pattern in stripped_lines if pattern]

    # Run snippet extraction
    extract_snippets(pattern_list, tags, compute_index, lines_of_context, show_progress)
예제 #8
0
    def test_find_snippet(self):

        # First, we create a post whose body contains the code to search.
        # The line containing `\w` is a raw literal so that the backslash is kept
        # without relying on an unrecognized escape sequence.
        post = create_post(body=self._make_post_body('\n'.join([
            'import re',
            '',
            'string = "foo"',
            r'characters = re.findall(r"\w", string)',
            '',
            'for c in characters:',
            '    print c',
        ])))

        # Here is the line of code that actually performs the extraction for a pattern
        # By default, it should run extraction for all posts
        self._extract(['re.findall'])

        # There are a few effects that we check
        # First, that exactly one snippet has been stored
        self.assertEqual(PostSnippet.select().count(), 1)
        snippet_record = PostSnippet.select().first()

        # The content of this snippet should show context around the pattern
        self.assertEqual(snippet_record.snippet, '\n'.join([
            '',
            'string = "foo"',
            r'characters = re.findall(r"\w", string)',
            '',
            'for c in characters:',
        ]))

        # The snippet should link back to the post that it was created from
        self.assertEqual(snippet_record.post, post)

        # A model for the pattern should have been created
        self.assertEqual(SnippetPattern.select().count(), 1)
        pattern_record = SnippetPattern.select().first()
        self.assertEqual(pattern_record.pattern, 're.findall')

        # The snippet should be linked back to the pattern
        self.assertEqual(pattern_record, snippet_record.pattern)
예제 #9
0
 def test_handle_missing_post_context(self):
     # If there is no context available in the lines below the one where a pattern is found,
     # make sure that the extraction is still successful.
     # The line containing `\w` is a raw literal so that the backslash is kept without
     # relying on an unrecognized escape sequence.
     code = '\n'.join([
         'import re',
         '',
         r'characters = re.findall(r"\w", string)',
     ])
     create_post(body=self._make_post_body(code))
     self._extract(['re.findall'])

     # The match is on the last line, so the expected snippet is the whole body:
     # the matching line plus the two lines above it.
     self.assertEqual(PostSnippet.select().first().snippet, code)
예제 #10
0
    def test_find_snippet_with_tags(self):

        # These two posts have identical bodies; they differ only in their tag, and
        # only one of them carries the tag that we will use for filtering in the test.
        # The line containing `\w` is a raw literal so that the backslash is kept
        # without relying on an unrecognized escape sequence.
        code = '\n'.join([
            'import re',
            r'characters = re.findall(r"\w", "foo")',
            'for c in characters:',
            '    print c',
        ])
        post1 = create_post(body=self._make_post_body(code))
        post2 = create_post(body=self._make_post_body(code))
        tag1 = create_tag(tag_name='javascript')
        tag2 = create_tag(tag_name='python')
        PostTag.create(post_id=post1.id, tag_id=tag1.id)
        PostTag.create(post_id=post2.id, tag_id=tag2.id)

        # Only the post tagged 'python' should contribute a snippet
        self._extract(['re.findall'], tags=['python'])
        self.assertEqual(PostSnippet.select().count(), 1)
        self.assertEqual(PostSnippet.select().first().post, post2)
예제 #11
0
 def test_specify_lines_of_context(self):
     # With lines_of_context=1 the snippet should contain only one line on either
     # side of the line where the pattern was found.
     # The line containing `\w` is a raw literal so that the backslash is kept
     # without relying on an unrecognized escape sequence.
     code_lines = [
         'import re',
         '',
         'string = "foo"',
         r'characters = re.findall(r"\w", string)',
         '',
         'for c in characters:',
         '    print c',
     ]
     create_post(body=self._make_post_body('\n'.join(code_lines)))
     self._extract(['re.findall'], lines_of_context=1)

     # Expected: 'string = "foo"', the matching line, and the blank line after it
     self.assertEqual(PostSnippet.select().first().snippet, '\n'.join(code_lines[2:5]))
예제 #12
0
 def test_skip_non_code_nodes_plaintext(self):
     # The pattern appears only inside a <p> element; no snippet should be stored for it.
     paragraph_only_body = '<p>re.findall</p>'
     create_post(body=paragraph_only_body)

     self._extract(['re.findall'])

     stored_snippet_count = PostSnippet.select().count()
     self.assertEqual(stored_snippet_count, 0)