示例#1
0
    def test_create_links(self):
        """create_links(link1, [link2]) returns one link and leaves two pages."""
        # no pages should exist
        self.assertEqual(Page.select().count(), 0)

        # create links
        links = create_links(link1, [link2])
        # assertEqual rather than assertTrue: assertTrue(x, 1) treats the 1 as
        # the failure message and only checks that x is truthy, so any
        # non-zero number of links would have passed.
        self.assertEqual(len(links), 1)
        link = links[0]
        self.assertEqual(link.from_page.url, link1)
        self.assertEqual(link.to_page.url, link2)

        # 2 pages should exist
        self.assertEqual(Page.select().count(), 2)
示例#2
0
    def test_create_links(self):
        """create_links(link1, [link2]) returns one link and leaves two pages."""
        # no pages should exist
        self.assertEqual(Page.select().count(), 0)

        # create links
        links = create_links(link1, [link2])
        # assertEqual rather than assertTrue: assertTrue(x, 1) treats the 1 as
        # the failure message and only checks that x is truthy, so any
        # non-zero number of links would have passed.
        self.assertEqual(len(links), 1)
        link = links[0]
        self.assertEqual(link.from_page.url, link1)
        self.assertEqual(link.to_page.url, link2)

        # 2 pages should exist
        self.assertEqual(Page.select().count(), 2)
示例#3
0
    def test_page_twice(self):
        """Creating a Page twice with the same field values raises IntegrityError."""
        page_fields = dict(url='http://www.example.com/foo',
                           content='hi world',
                           status_code=200)

        # The count check also makes sure the database is cleared out between
        # tests: exactly one row may exist after the first insert.
        Page.create(**page_fields)
        self.assertEqual(Page.select().count(), 1)

        # A second insert with identical values must fail.
        with self.assertRaises(IntegrityError):
            Page.create(**page_fields)
示例#4
0
    def test_page_twice(self):
        """Creating a Page twice with the same field values raises IntegrityError."""
        page_fields = dict(url='http://www.example.com/foo',
                           content='hi world',
                           status_code=200)

        # The count check also makes sure the database is cleared out between
        # tests: exactly one row may exist after the first insert.
        Page.create(**page_fields)
        self.assertEqual(Page.select().count(), 1)

        # A second insert with identical values must fail.
        with self.assertRaises(IntegrityError):
            Page.create(**page_fields)
示例#5
0
    def test_permanent_redirect(self, requests_get, requests_head):
        """A 301 HEAD response records the redirect target as a page and a link.

        requests_get and requests_head are stand-ins supplied to the test;
        the HEAD stand-in is configured to answer with status 301 and a
        'location' header pointing at the redirect target.
        """
        source_url = "http://www.example.com/foo"
        target_url = "http://www.example.com/bar"
        head_response = MagicMock(status_code=301,
                                  headers={'location': target_url})
        requests_head.return_value = head_response

        page = Page.create(url=source_url, content='', status_code=0)
        add_page_info_to_page(page)

        # a page for the redirect target must now exist
        target_query = Page.select().where(Page.url == target_url)
        to_page = target_query.first()
        self.assertTrue(to_page)

        # ... and a link from the original page to that target
        link_query = Link.select().where(Link.from_page == page,
                                         Link.to_page == to_page)
        self.assertTrue(link_query.exists())

        # exactly one HEAD request, and no GET at all
        self.assertEqual(requests_head.call_count, 1)
        self.assertFalse(requests_get.called)

        # the page's content holds the redirect target
        self.assertEqual(page.content, target_url)
示例#6
0
    def test_permanent_redirect(self, requests_get, requests_head):
        """A 301 HEAD response records the redirect target as a page and a link.

        requests_get and requests_head are stand-ins supplied to the test;
        the HEAD stand-in is configured to answer with status 301 and a
        'location' header pointing at the redirect target.
        """
        source_url = "http://www.example.com/foo"
        target_url = "http://www.example.com/bar"
        head_response = MagicMock(status_code=301,
                                  headers={'location': target_url})
        requests_head.return_value = head_response

        page = Page.create(url=source_url, content='', status_code=0)
        add_page_info_to_page(page)

        # a page for the redirect target must now exist
        target_query = Page.select().where(Page.url == target_url)
        to_page = target_query.first()
        self.assertTrue(to_page)

        # ... and a link from the original page to that target
        link_query = Link.select().where(Link.from_page == page,
                                         Link.to_page == to_page)
        self.assertTrue(link_query.exists())

        # exactly one HEAD request, and no GET at all
        self.assertEqual(requests_head.call_count, 1)
        self.assertFalse(requests_get.called)

        # the page's content holds the redirect target
        self.assertEqual(page.content, target_url)
示例#7
0

def sizeof_fmt(num, suffix='B'):
    """Format a size with binary (1024-based) unit prefixes.

    Adapted from http://stackoverflow.com/a/1094933

    num is divided by 1024 until its magnitude drops below 1024, and the
    result is rendered with one decimal place followed by the matching
    prefix ('', 'Ki', 'Mi', ... 'Zi') and suffix.  Values still too large
    after the 'Zi' step are reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    step = 0
    while step < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefixes[step], suffix)
        num /= 1024.0
        step += 1
    # every listed prefix was exhausted; num has been divided eight times
    return "%.1f%s%s" % (num, 'Yi', suffix)


if __name__ == "__main__":
    # Script entry point: gather page/link counts and the database file size
    # from corpus.db, then print the number of crawled pages.
    initialize('corpus.db')

    page_count = Page.select().count()

    # "crawled" = status 200 with an HTML or plain-text content type
    status_ok = (Page.status_code == 200)
    is_text = ((Page.content_type == 'text/html') |
               (Page.content_type == 'text/plain'))
    crawled_count = Page.select().where(status_ok & is_text).count()

    redirect_count = Page.select().where(Page.status_code == 301).count()
    to_crawl_count = Page.select().where(Page.status_code == 0).count()

    # everything that is neither crawled, a 301 redirect, nor status 0
    other_count = page_count - (crawled_count + redirect_count + to_crawl_count)

    link_count = Link.select().count()

    # size of the database file on disk, formatted for display
    corpus_size = sizeof_fmt(os.stat('corpus.db').st_size)

    print('crawled pages: {}'.format(crawled_count))
示例#8
0
文件: report.py 项目: eupharis/zidane

def sizeof_fmt(num, suffix='B'):
    """Format a size with binary (1024-based) unit prefixes.

    Adapted from http://stackoverflow.com/a/1094933

    num is divided by 1024 until its magnitude drops below 1024, and the
    result is rendered with one decimal place followed by the matching
    prefix ('', 'Ki', 'Mi', ... 'Zi') and suffix.  Values still too large
    after the 'Zi' step are reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    step = 0
    while step < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefixes[step], suffix)
        num /= 1024.0
        step += 1
    # every listed prefix was exhausted; num has been divided eight times
    return "%.1f%s%s" % (num, 'Yi', suffix)


if __name__ == "__main__":
    # Script entry point: gather page/link counts and the database file size
    # from corpus.db, then print the number of crawled pages.
    initialize('corpus.db')

    page_count = Page.select().count()

    # "crawled" = status 200 with an HTML or plain-text content type
    status_ok = (Page.status_code == 200)
    is_text = ((Page.content_type == 'text/html') |
               (Page.content_type == 'text/plain'))
    crawled_count = Page.select().where(status_ok & is_text).count()

    redirect_count = Page.select().where(Page.status_code == 301).count()
    to_crawl_count = Page.select().where(Page.status_code == 0).count()

    # everything that is neither crawled, a 301 redirect, nor status 0
    other_count = page_count - (crawled_count + redirect_count + to_crawl_count)

    link_count = Link.select().count()

    # size of the database file on disk, formatted for display
    corpus_size = sizeof_fmt(os.stat('corpus.db').st_size)

    print('crawled pages: {}'.format(crawled_count))