-
Notifications
You must be signed in to change notification settings - Fork 0
/
mini_spider_test.py
125 lines (102 loc) · 3.32 KB
/
mini_spider_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
#-*- coding: UTF-8 -*-
"""
Unit tests for the mini spider: configuration loading, logging, HTML parsing and seed-file loading.
@date 2017.10.17
"""
import urllib2
import unittest
import mini_spider
from lib import logger
from lib import config
from lib import htmlparse
class TestConfig(unittest.TestCase):
    """Checks that the spider configuration file is read correctly."""

    def setUp(self):
        """No fixture is required before each test."""

    def tearDown(self):
        """No cleanup is required after each test."""

    def test_get(self):
        """Read each option under "spider" and compare it with its expected value."""
        section = "spider"
        conf = config.Config("./conf/spider.conf")
        # (reader method name, option name, expected value), checked in order.
        checks = (
            ("get", "url_list_file", "./urls"),
            ("get", "output_directory", "./output"),
            ("getint", "crawl_interval", 1),
            ("getint", "crawl_timeout", 1),
            ("get", "target_url", ".*.(htm|html)$"),
        )
        for reader_name, option, wanted in checks:
            actual = getattr(conf, reader_name)(section, option)
            self.assertEqual(actual, wanted)
class TestLog(unittest.TestCase):
    """Checks the logger factory exposed by ``lib.logger``."""

    def setUp(self):
        """No fixture is required before each test."""

    def tearDown(self):
        """No cleanup is required after each test."""

    def test_get_logger(self):
        """``logger.get_logger`` must return a truthy object for a named log."""
        log = logger.get_logger("testlog")
        self.assertTrue(log)
class TestHtmlparse(unittest.TestCase):
    """Tests for ``htmlparse.UrlLister``: page retrieval and link extraction."""

    def setUp(self):
        """Build the parser, the fixture page and the links expected from it."""
        self.parser = htmlparse.UrlLister(".*.(htm|html)$")
        self.url = "http://pycm.baidu.com:8081"
        self.crawl_timeout = 1
        # NOTE(review): test_get_content compares the *length* of this literal
        # with the retrieved page, so whitespace inside it is significant --
        # confirm it matches what the server actually returns.
        self.url_content = """
<!DOCTYPE html>
<html>
<head>
<meta charset=utf8>
<title>Crawl Me</title>
</head>
<body>
<ul>
<li><a href=page1.html>page 1</a></li>
<li><a href="page2.html">page 2</a></li>
<li><a href='page3.html'>page 3</a></li>
<li><a href='mirror/index.html'>mirror</a></li>
<li><a href='javascript:location.href="page4.html"'>page 4</a></li>
</ul>
</body>
</html>
"""
        # Expected absolute URLs; the ``javascript:`` link of the fixture is
        # deliberately not listed here.
        self.parse_url = ['http://pycm.baidu.com:8081/mirror/index.html',
                          'http://pycm.baidu.com:8081/page2.html',
                          'http://pycm.baidu.com:8081/page1.html',
                          'http://pycm.baidu.com:8081/page3.html']
        self.result_list = []

    def tearDown(self):
        """No cleanup is required after each test."""
        pass

    def test_get_content(self):
        """test function get_content

        Compares the stripped length of what ``get_content`` returns for
        ``self.url`` with the stripped length of the fixture page.
        """
        self.assertEqual(len(self.parser.get_content(self.url, self.crawl_timeout).strip()),
                         len(self.url_content.strip()))

    def test_get_urls(self):
        """test function get_urls

        The links extracted from the fixture page must equal the expected
        URLs, ignoring order.
        """
        # Store the result under the attribute initialised in setUp; the
        # previous code assigned to a misspelled name and left it empty.
        # Assumes get_urls returns an iterable of URL strings -- TODO confirm.
        self.result_list = self.parser.get_urls(self.url_content, self.url)
        # list.sort() sorts in place and returns None, so comparing its return
        # values always passed; compare sorted copies instead.
        self.assertEqual(sorted(self.parse_url), sorted(self.result_list))
class TestSpider(unittest.TestCase):
    """Checks ``mini_spider.Spider`` seed-file handling."""

    def setUp(self):
        """Create a spider from the test configuration and the expected seed set."""
        self.url = set(["http://pycm.baidu.com:8081/"])
        self.spider = mini_spider.Spider('./conf/spider.conf')

    def tearDown(self):
        """No cleanup is required after each test."""

    def test_seedfile_load(self):
        """After ``_seedfile_load`` the spider's ``handled_set`` equals the expected set."""
        crawler = self.spider
        crawler._seedfile_load()
        self.assertEqual(crawler.handled_set, self.url)
# Run every test case defined in this module when executed as a script.
if __name__ == "__main__":
    unittest.main()