Example #1
0
def skip_self_linked_domain_url(db: DatabaseHandler, topics_id: int,
                                source_url: str, ref_url: str) -> bool:
    """Return True if ref_url should be skipped as a self linked domain url within the topic.

    The url is skipped when ref_url's distinctive domain matches source_url's and either:
    * ref_url matches SKIP_SELF_LINK_RE, or
    * the topic_domains.self_links value for the domain has reached MAX_SELF_LINKS.
    """
    ref_domain = get_url_distinctive_domain(ref_url)

    # Cross-domain links are never self links.
    if get_url_distinctive_domain(source_url) != ref_domain:
        return False

    # Same-domain search/author/tag style pages are always skipped.
    if re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I):
        return True

    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {'a': topics_id, 'b': ref_domain}
    ).hash()

    # Skip once the per-domain self link budget for this topic is exhausted.
    return bool(topic_domain and topic_domain['self_links'] >= MAX_SELF_LINKS)
Example #2
0
File: stories.py — Project: rleir/mediacloud
def _url_domain_matches_medium(medium: dict, urls: list) -> bool:
    """Return True if the distinctive domain of any of the story urls matches the medium url's domain.

    :param medium: media source dict; only medium['url'] is read
    :param urls: list of story url strings
    """
    medium_domain = get_url_distinctive_domain(medium['url'])

    # any() short-circuits on the first match instead of materializing the full
    # list of matching domains just to test its length.
    return any(get_url_distinctive_domain(u) == medium_domain for u in urls)
def print_long_running_job_states(db: DatabaseHandler, limit: int):
    """Group up to `limit` non-duplicate media by distinctive domain and rebuild the media_dups table.

    For each parent medium (dup_media_id is null), collect it and its duplicate media into a group
    keyed by the parent's distinctive domain; each medium is annotated with 'medium_domain' and
    'dup_domain_matches'.  Every domain with more than one medium is then written to a freshly
    recreated media_dups table.

    :param db: database handle
    :param limit: maximum number of parent media rows to examine
    """
    media = db.query("""
        select m.*, mh.*
        from media m
            join media_health mh using ( media_id ) 
        where dup_media_id is null
        order by m.media_id asc limit %(a)s
    """, {'a': limit}).hashes()

    media_groups = {}

    num_media = len(media)
    for i, medium in enumerate(media):
        domain = get_url_distinctive_domain(medium['url'])
        log.warning("%s [%d/%d]" % (domain, i, num_media))

        media_groups.setdefault(domain, []).append(medium)

        # the parent medium trivially matches its own domain
        medium['medium_domain'] = domain
        medium['dup_domain_matches'] = True

        dup_media = db.query(
            "select m.*, mh.* from media m join media_health mh using ( media_id ) where dup_media_id = %(a)s",
            {'a': medium['media_id']}
        ).hashes()

        media_groups[domain].extend(dup_media)

        for dup_medium in dup_media:
            dup_domain = get_url_distinctive_domain(dup_medium['url'])
            # BUG FIX: annotate the duplicate medium itself; the original assigned to `medium`,
            # clobbering the parent's medium_domain/dup_domain_matches with the last duplicate's
            # values and leaving the duplicates unannotated.
            dup_medium['medium_domain'] = dup_domain
            dup_medium['dup_domain_matches'] = domain == dup_domain

    db.query("DROP TABLE IF EXISTS media_dups")
    db.query(
        """
        CREATE TABLE media_dups (
            domain TEXT,
            media_id BIGINT
            )
        """)

    db.begin()
    num_groups = len(media_groups)
    for i, (domain, group) in enumerate(media_groups.items()):
        log.warning("domain %s [%d/%d]" % (domain, i, num_groups))
        # only domains with more than one medium are interesting as duplicates
        if len(group) > 1:
            for m in group:
                db.query("""
                    insert into media_dups (domain, media_id) values (%(a)s, %(b)s)
                """, {'a': domain, 'b': m['media_id']})
    db.commit()
Example #4
0
    def test_skip_self_links(self):
        """Verify that extract_links_for_topic_story() caps same-domain links at MAX_SELF_LINKS."""
        story_domain = get_url_distinctive_domain(self.test_story['url'])

        topic = create_test_topic(self.db, 'links')
        self.db.create(
            'topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': self.test_story['stories_id']
            })

        # build content holding twice the allowed number of same-domain links
        paragraphs = []
        for i in range(MAX_SELF_LINKS * 2):
            plain_text = "Sample sentence to make sure the links get extracted" * 10
            url = "http://%s/%d" % (story_domain, i)
            paragraphs.append("<p>%s <a href='%s'>link</a></p>\n\n" % (plain_text, url))
        content = ''.join(paragraphs)

        store_content(self.db, self.test_download, content)

        extract_links_for_topic_story(db=self.db,
                                      stories_id=self.test_story['stories_id'],
                                      topics_id=topic['topics_id'])

        topic_links = self.db.query(
            "select * from topic_links where topics_id = %(a)s", {
                'a': topic['topics_id']
            }).hashes()

        # only MAX_SELF_LINKS of the links should have survived extraction
        assert len(topic_links) == MAX_SELF_LINKS
    def test_skip_self_linked_domain(self) -> None:
        """Test skip_self_linked_domain."""
        # a link without topic_links_id is never skipped
        assert skip_self_linked_domain(self.db, {}) is False

        # search/author/tag style pages on the story's own domain are always skipped
        story_domain = get_url_distinctive_domain(self.story['url'])
        regex_skipped_urls = [
            'http://%s/%s' % (story_domain, suffix)
            for suffix in ['search', 'author', 'tag']
        ]
        for url in regex_skipped_urls:
            tl = self.create_topic_link(self.topic, self.story, url, url)
            assert skip_self_linked_domain(self.db, tl) is True

        # stay just under the self link limit: none of these should be skipped
        self_domain_url = 'http://%s/foo/bar' % story_domain
        for i in range(MAX_SELF_LINKS - len(regex_skipped_urls) - 1):
            numbered_url = self_domain_url + str(i)
            tl = self.create_topic_link(self.topic, self.story, numbered_url, numbered_url)
            assert skip_self_linked_domain(self.db, tl) is False

        # once over the limit, every further same-domain link is skipped
        for _ in range(10):
            tl = self.create_topic_link(self.topic, self.story,
                                        self_domain_url, self_domain_url)
            assert skip_self_linked_domain(self.db, tl) is True

        # links to other domains are never skipped, regardless of the limit
        other_domain_url = 'http://other.domain/foo/bar'
        for _ in range(10):
            tl = self.create_topic_link(self.topic, self.story,
                                        other_domain_url, other_domain_url)
            assert skip_self_linked_domain(self.db, tl) is False
Example #6
0
def test_get_url_distinctive_domain():
    """Test get_url_distinctive_domain() on representative urls."""
    # FIXME - some resulting domains look funny, not sure if I can change them easily though
    assert mc_url.get_url_distinctive_domain('http://www.nytimes.com/') == 'nytimes.com'
    assert mc_url.get_url_distinctive_domain('http://cyber.law.harvard.edu/') == 'law.harvard'
    assert mc_url.get_url_distinctive_domain('http://www.gazeta.ru/') == 'gazeta.ru'
    # BUG FIX: the original used a comma instead of '==', turning the expected value into the
    # assert message and making the assertion vacuously pass for any truthy return value.
    assert mc_url.get_url_distinctive_domain('http://www.whitehouse.gov/') == 'www.whitehouse'
    assert mc_url.get_url_distinctive_domain('http://info.info/') == 'info.info'
    assert mc_url.get_url_distinctive_domain('http://blog.yesmeck.com/jquery-jsonview/') == 'yesmeck.com'
    assert mc_url.get_url_distinctive_domain('http://status.livejournal.org/') == 'livejournal.org'

    # ".(gov|org|com).XX" exception
    assert mc_url.get_url_distinctive_domain('http://www.stat.gov.lt/') == 'stat.gov.lt'

    # "wordpress.com|blogspot|..." exception
    assert mc_url.get_url_distinctive_domain('https://en.blog.wordpress.com/') == 'en.blog.wordpress.com'
Example #7
0
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count in the corresponding topic_domains row if necessary.

    self_links is incremented when the domain of the story at topic_links.stories_id matches the
    domain of topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = get_url_distinctive_domain(story['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])
    redirect_url_domain = get_url_distinctive_domain(redirect_url)

    link_domains = (get_url_distinctive_domain(topic_link['url']), redirect_url_domain)

    # only self links (story domain matching a link domain) are counted
    if story_domain not in link_domains:
        return

    # try to create the row with an initial count of one; yields nothing when the row already exists
    topic_domain = db.query(
        """
        insert into topic_domains (topics_id, domain, self_links)
            values(%(topics_id)s, %(domain)s, 1)
            on conflict (topics_id, md5(domain))
                do nothing
            returning *
        """, {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        db.query(
            """
            update topic_domains set
                    self_links = topic_domains.self_links + 1
                where
                    topics_id = %(topics_id)s and
                    domain = %(domain)s
            """, {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            })
Example #8
0
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count in the corresponding topic_domains row if necessary.

    self_links is incremented when the domain of the story at topic_links.stories_id matches the
    domain of topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = get_url_distinctive_domain(story['url'])

    url_domain = get_url_distinctive_domain(topic_link['url'])
    redirect_url_domain = get_url_distinctive_domain(
        topic_link.get('redirect_url', topic_link['url']))

    # nothing to count unless the story links back to its own domain
    if story_domain != url_domain and story_domain != redirect_url_domain:
        return

    # same parameters for both the insert and the fallback update below
    query_params = {
        'topics_id': topic_link['topics_id'],
        'domain': redirect_url_domain
    }

    inserted = db.query(
        """
            INSERT INTO topic_domains (topics_id, domain, self_links)
            VALUES (%(topics_id)s, %(domain)s, 1)
            ON CONFLICT (topics_id, md5(domain)) DO NOTHING
            RETURNING *
        """, query_params).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not inserted:
        db.query(
            """
            UPDATE topic_domains SET
                self_links = topic_domains.self_links + 1
            WHERE
                topics_id = %(topics_id)s AND
                domain = %(domain)s
            """, query_params)
Example #9
0
    def __url_with_http_auth(url: str) -> str:
        """Return the url with HTTP auth credentials embedded, if any are configured for its domain."""
        url = decode_object_from_bytes_if_needed(url)

        auth_lookup = UserAgent.__get_domain_http_auth_lookup()

        domain = get_url_distinctive_domain(url=url).lower()

        try:
            auth = auth_lookup[domain]
        except KeyError:
            # no credentials configured for this domain; return the url unchanged
            return url

        uri = furl(url)

        # https://stackoverflow.com/a/21629125/200603
        uri.username = auth['user']
        uri.password = auth['password']

        return uri.url
Example #10
0
    def __url_with_http_auth(url: str) -> str:
        """If there are HTTP auth credentials for the requested site, add them to the URL."""
        url = decode_object_from_bytes_if_needed(url)

        auth_lookup = UserAgent.__get_domain_http_auth_lookup()
        domain = get_url_distinctive_domain(url=url).lower()

        if domain not in auth_lookup:
            # no credentials configured for this domain
            return url

        credentials = auth_lookup[domain]
        uri = furl(url)

        # https://stackoverflow.com/a/21629125/200603
        uri.username = credentials['user']
        uri.password = credentials['password']

        return uri.url
    def test_increment_domain_links(self) -> None:
        """Test increment_domain_links()."""
        nomatch_domain = 'no.match'
        story_domain = get_url_distinctive_domain(self.story['url'])

        # links whose url matches the story domain increment the redirect domain's count
        for expected_count in range(1, 4):
            self.create_topic_link(self.topic, self.story, story_domain,
                                   nomatch_domain)
            td = self.get_topic_domain(self.topic, nomatch_domain)

            assert td is not None
            assert td['self_links'] == expected_count

        # links whose redirect url matches the story domain increment the count as well
        for expected_count in range(1, 4):
            self.create_topic_link(self.topic, self.story, nomatch_domain,
                                   story_domain)
            td = self.get_topic_domain(self.topic, story_domain)

            assert td is not None
            assert td['self_links'] == expected_count
Example #12
0
def test_get_url_distinctive_domain():
    """Test get_url_distinctive_domain() on representative urls."""
    # FIXME - some resulting domains look funny, not sure if I can change them easily though
    assert mc_url.get_url_distinctive_domain(
        'http://www.nytimes.com/') == 'nytimes.com'
    assert mc_url.get_url_distinctive_domain(
        'http://cyber.law.harvard.edu/') == 'harvard.edu'
    assert mc_url.get_url_distinctive_domain(
        'http://www.gazeta.ru/') == 'gazeta.ru'
    # BUG FIX: the original used a comma instead of '==', turning the expected value into the
    # assert message and making the assertion vacuously pass for any truthy return value.
    assert mc_url.get_url_distinctive_domain(
        'http://www.whitehouse.gov/') == 'whitehouse.gov'
    assert mc_url.get_url_distinctive_domain(
        'http://info.info/') == 'info.info'
    assert mc_url.get_url_distinctive_domain(
        'http://blog.yesmeck.com/jquery-jsonview/') == 'yesmeck.com'
    assert mc_url.get_url_distinctive_domain(
        'http://status.livejournal.org/') == 'livejournal.org'
    assert mc_url.get_url_distinctive_domain(
        'http://www.republicoftogo.com/') == 'republicoftogo.com'
    assert mc_url.get_url_distinctive_domain('http://www.fbi.gov') == 'fbi.gov'
    assert mc_url.get_url_distinctive_domain(
        'http://shrb.dzwww.com/') == 'dzwww.com'
    assert mc_url.get_url_distinctive_domain(
        'http://www.thecwsandiego.com/') == 'thecwsandiego.com'
    # assert mc_url.get_url_distinctive_domain('https://www.gov.uk/') == 'gov.uk'
    assert mc_url.get_url_distinctive_domain(
        'https://www.dailymail.co.uk/home/index.html') == 'dailymail.co.uk'

    # ".(gov|org|com).XX" exception
    assert mc_url.get_url_distinctive_domain(
        'http://www.stat.gov.lt/') == 'stat.gov.lt'

    # "wordpress.com|blogspot|..." exception
    assert mc_url.get_url_distinctive_domain(
        'https://en.blog.wordpress.com/') == 'en.blog.wordpress.com'
Example #13
0
def test_get_url_distinctive_domain():
    """Test get_url_distinctive_domain() on representative urls."""
    # FIXME - some resulting domains look funny, not sure if I can change them easily though
    assert mc_url.get_url_distinctive_domain('http://www.nytimes.com/') == 'nytimes.com'
    assert mc_url.get_url_distinctive_domain('http://cyber.law.harvard.edu/') == 'harvard.edu'
    assert mc_url.get_url_distinctive_domain('http://www.gazeta.ru/') == 'gazeta.ru'
    # BUG FIX: the original used a comma instead of '==', turning the expected value into the
    # assert message and making the assertion vacuously pass for any truthy return value.
    assert mc_url.get_url_distinctive_domain('http://www.whitehouse.gov/') == 'whitehouse.gov'
    assert mc_url.get_url_distinctive_domain('http://info.info/') == 'info.info'
    assert mc_url.get_url_distinctive_domain('http://blog.yesmeck.com/jquery-jsonview/') == 'yesmeck.com'
    assert mc_url.get_url_distinctive_domain('http://status.livejournal.org/') == 'livejournal.org'
    assert mc_url.get_url_distinctive_domain('http://www.republicoftogo.com/') == 'republicoftogo.com'
    assert mc_url.get_url_distinctive_domain('http://www.fbi.gov') == 'fbi.gov'
    assert mc_url.get_url_distinctive_domain('http://shrb.dzwww.com/') == 'dzwww.com'
    assert mc_url.get_url_distinctive_domain('http://www.thecwsandiego.com/') == 'thecwsandiego.com'
    # assert mc_url.get_url_distinctive_domain('https://www.gov.uk/') == 'gov.uk'
    assert mc_url.get_url_distinctive_domain('https://www.dailymail.co.uk/home/index.html') == 'dailymail.co.uk'

    # ".(gov|org|com).XX" exception
    assert mc_url.get_url_distinctive_domain('http://www.stat.gov.lt/') == 'stat.gov.lt'

    # "wordpress.com|blogspot|..." exception
    assert mc_url.get_url_distinctive_domain('https://en.blog.wordpress.com/') == 'en.blog.wordpress.com'