def record_page_links(page):
    """Create or update ``Link`` records for the internal links in ``page``.

    For each (name, count) pair returned by ``extract_internal_links`` on the
    page content:

    * if a link from ``page`` in the page's region already matches the
      slugified name (either via its own ``destination_slug`` or via the
      slug of its destination page), its ``count`` is refreshed when it
      differs;
    * otherwise a new ``Link`` is saved, pointing at the page in the same
      region with that slug when one exists, or with no destination.

    Existing links whose names no longer appear in the content are not
    touched here.
    """
    region = page.region
    found = extract_internal_links(page.content)
    for pagename, count in found.iteritems():
        slug = slugify(pagename)
        from_this_page = Link.objects.filter(source=page, region=region)
        matches = (from_this_page.filter(destination_slug=slug) |
                   from_this_page.filter(destination__slug=slug))
        if matches:
            link = matches[0]
            if link.count == count:
                # Stored count is already current; nothing to write.
                continue
            link.count = count
        else:
            candidates = Page.objects.filter(slug=slug, region=region)
            destination = candidates[0] if candidates else None

            # Exists for some reason already (probably running a script
            # that's moving between regions?)
            already_linked = destination and Link.objects.filter(
                source=page, destination=destination).exists()
            if already_linked:
                continue

            link = Link(
                source=page,
                region=region,
                destination=destination,
                destination_name=pagename,
                destination_slug=slug,
                count=count,
            )
        link.save()
示例#2
0
    def test_simple_extraction(self):
        html = """
<p>I love <a href="Parks">awesome parks</a>.</p>
        """
        links = extract_internal_links(html)
        self.assertTrue('Parks' in links)
        self.assertEqual(links['Parks'], 1)
    def forwards(self, orm):
        """Populate ``Link`` rows from the internal links in every page.

        For each page, the links found by ``extract_internal_links`` are
        walked and, per link name:

        * if a link from the page to the resolved destination page already
          exists, nothing is done;
        * if a link from the page with the same name (case-insensitive)
          exists, its destination is filled in when one was resolved;
        * otherwise a new ``Link`` is created.

        :param orm: the frozen ORM supplied by the migration runner; models
            are looked up through it rather than imported directly.
        """
        from pages.models import slugify
        from links import extract_internal_links

        for page in orm['pages.Page'].objects.all().iterator():
            region = page.region
            links = extract_internal_links(page.content)
            # Single parenthesised argument: prints the same text whether
            # ``print`` is a statement or a function.
            print("..recording page links on %s" % smart_str(page.name))
            for pagename, count in links.iteritems():
                page_exists = orm['pages.Page'].objects.filter(slug=slugify(pagename), region=region)
                if page_exists:
                    destination = page_exists[0]
                else:
                    destination = None

                # Only short-circuit on a real destination. Filtering on
                # ``destination=None`` matches *any* link from this page
                # whose destination is NULL, which would make every dead
                # link after the first one on a page be silently skipped.
                if destination is not None and orm.Link.objects.filter(
                        source=page, destination=destination).exists():
                    continue

                same_name = orm.Link.objects.filter(source=page, destination_name__iexact=pagename)
                if same_name.exists():
                    # A link with this name is already recorded; attach the
                    # destination page if we managed to resolve one.
                    if destination:
                        link = same_name[0]
                        link.destination = destination
                        link.save()
                else:
                    link = orm.Link(
                        source=page,
                        region=region,
                        destination=destination,
                        destination_name=pagename,
                        count=count,
                    )
                    link.save()
示例#4
0
    def test_ignore_external_links(self):
        html = """
<p>I love <a href="Parks">outside</a>.</p>
<p>I love <a href="http://example.org/Night">test</a>.</p>
        """
        links = extract_internal_links(html)
        self.assertTrue('Parks' in links)
        self.assertEqual(len(links.keys()), 1)
示例#5
0
    def test_link_unquoting(self):
        html = """
<p>I love <a href="Cats%20and%20dogs">animals</a>.</p>
<p>I love <a href="Cats and dogs">animals</a>.</p>
        """
        links = extract_internal_links(html)
        self.assertTrue('Cats and dogs' in links)
        self.assertFalse('Cats%20and%20dogs' in links)
示例#6
0
    def test_ignore_anchors(self):
        html = """
<p>I love <a href="Parks">outside</a>.</p>
<p>I love <a href="#gohere">test</a>.</p>
<p>I love <a>test now</a>.</p>
        """
        links = extract_internal_links(html)
        self.assertTrue('Parks' in links)
        self.assertEqual(len(links.keys()), 1)
示例#7
0
    def test_count_links(self):
        html = """
<p>I love <a href="Parks">awesome parks</a>.</p>
<p>I hate <a href="Cats%20and%20dogs">animals</a>.</p>
<p>I love <a href="Parks">awesome parks</a>.</p>
<p>I love <a href="Parks">awesome parks</a>.</p>
<p>I love <a href="Cats%20and%20dogs">awesome parks</a>.</p>
        """
        links = extract_internal_links(html)
        self.assertTrue('Parks' in links)
        self.assertTrue('Cats and dogs' in links)
        self.assertEqual(links['Parks'], 3)
        self.assertEqual(links['Cats and dogs'], 2)
示例#8
0
def record_page_links(page):
    """Bring the stored ``Link`` records for ``page`` in line with its content.

    Each (name, count) pair from ``extract_internal_links(page.content)`` is
    handled in turn. A link from this page, in this page's region, that
    matches the slugified name (through ``destination_slug`` or through the
    destination page's slug) has its ``count`` updated if it changed. When no
    such link exists, a new one is saved; its destination is the page with
    that slug in the same region, or ``None`` when there is no such page.

    Nothing is deleted by this function.
    """
    region = page.region
    extracted = extract_internal_links(page.content)
    for pagename, count in extracted.iteritems():
        slug = slugify(pagename)
        own_links = Link.objects.filter(source=page, region=region)
        by_stored_slug = own_links.filter(destination_slug=slug)
        by_target_slug = own_links.filter(destination__slug=slug)
        existing = by_stored_slug | by_target_slug

        if existing:
            link = existing[0]
            if link.count != count:
                link.count = count
                link.save()
            # Otherwise the count is unchanged, so there is nothing to save.
            continue

        target_pages = Page.objects.filter(slug=slug, region=region)
        if target_pages:
            destination = target_pages[0]
        else:
            destination = None

        # Exists for some reason already (probably running a script that's
        # moving between regions?)
        if destination and Link.objects.filter(
                source=page, destination=destination).exists():
            continue

        Link(
            source=page,
            region=region,
            destination=destination,
            destination_name=pagename,
            destination_slug=slug,
            count=count,
        ).save()
    def forwards(self, orm):
        """Populate ``Link`` rows from the internal links in every page.

        For each page, the links found by ``extract_internal_links`` are
        walked and, per link name:

        * if a link from the page to the resolved destination page already
          exists, nothing is done;
        * if a link from the page with the same name (case-insensitive)
          exists, its destination is filled in when one was resolved;
        * otherwise a new ``Link`` is created.

        :param orm: the frozen ORM supplied by the migration runner; models
            are looked up through it rather than imported directly.
        """
        from pages.models import slugify
        from links import extract_internal_links

        for page in orm['pages.Page'].objects.all().iterator():
            region = page.region
            links = extract_internal_links(page.content)
            # Single parenthesised argument: prints the same text whether
            # ``print`` is a statement or a function.
            print("..recording page links on %s" % smart_str(page.name))
            for pagename, count in links.iteritems():
                page_exists = orm['pages.Page'].objects.filter(
                    slug=slugify(pagename), region=region)
                if page_exists:
                    destination = page_exists[0]
                else:
                    destination = None

                # Only short-circuit on a real destination. Filtering on
                # ``destination=None`` matches *any* link from this page
                # whose destination is NULL, which would make every dead
                # link after the first one on a page be silently skipped.
                if destination is not None and orm.Link.objects.filter(
                        source=page, destination=destination).exists():
                    continue

                same_name = orm.Link.objects.filter(
                    source=page, destination_name__iexact=pagename)
                if same_name.exists():
                    # A link with this name is already recorded; attach the
                    # destination page if we managed to resolve one.
                    if destination:
                        link = same_name[0]
                        link.destination = destination
                        link.save()
                else:
                    link = orm.Link(
                        source=page,
                        region=region,
                        destination=destination,
                        destination_name=pagename,
                        count=count,
                    )
                    link.save()
示例#10
0
 def test_ignore_plugins(self):
     """An anchor carrying the ``plugin`` class yields no extracted links."""
     markup = """<a class="plugin includepage" href="seed">Include page seed</a></p>"""
     found = extract_internal_links(markup)
     names = found.keys()
     self.assertEqual(len(names), 0)