示例#1
0
 def _crawl_link(self, link):
     """Crawl ``link`` and record the outcome.

     A ``Spider`` is built for the link with ``self.user_agent`` and the
     session returned by ``get_tor_session(9150)``, then run. A document is
     created from the spider's title and HTML, and the link is marked as
     crawled with the spider's success flag, whether or not the crawl
     succeeded.

     :param link: the link to crawl.
     :return: ``spider.links`` when the crawl succeeded, otherwise an
         empty list.
     """
     # NOTE(review): 9150 is presumably the local Tor SOCKS port -- confirm
     # against get_tor_session.
     spider = Spider(link, self.user_agent, get_tor_session(9150))
     spider.crawl()
     # Log the first 50 characters of the body as a preview. A ``[0::50]``
     # slice would pick every 50th character of the whole body instead.
     self.log.debug(
         'Creating document for: {0}, title {1}, body: {2}'.format(
             link, spider.title, spider.body[:50]))
     self._create_document(link, spider.title, spider.html)
     self._manager.mark_link_crawled(link, spider.success)
     return spider.links if spider.success else []
示例#2
0
class Test(TestCase):
    """End-to-end check of ``Spider.crawl`` against a scratch MongoDB database.

    ``Spider.iter_term_and_major`` is patched so the crawl only visits one
    term code, once without a major and once with a single major code.
    """

    def setUp(self):
        """Reset the ``test`` database, patch the spider and build it."""
        client = pymongo.MongoClient()
        # Start every run from an empty database so the counts in check()
        # are not affected by earlier runs.
        client.drop_database('test')
        db = client['test']
        self.term_code = '021'
        self.major_code = '0120123111'
        self.p = mock.patch(
            'spider.spider.Spider.iter_term_and_major',
            lambda v: ((self.term_code, None), (self.term_code, self.major_code))
        )
        self.p.start()
        # Register the undo immediately: unittest skips tearDown when setUp
        # raises, but cleanups still run, so the patch cannot leak into
        # other tests if one of the constructors below fails.
        self.addCleanup(self.p.stop)
        self.shortcut = hfut.Student(2013217413, '123456789012', 'XC')

        self.job_manager = JobManager(pool_size=20)
        self.db_manager = DatabaseManager(db, batch_size=80)

        self.j = Spider(self.shortcut, self.job_manager, self.db_manager)

    @profile
    def test_dfs_stability(self):
        """Run one full crawl and verify the stored collection sizes."""
        self.j.crawl()
        self.check()

    def check(self):
        """Assert the number of documents stored in each collection."""
        # NOTE(review): Collection.count() is deprecated since PyMongo 3.7
        # and removed in 4.0; switch to count_documents({}) when upgrading.
        expected_counts = (
            # Majors and terms are patched out, so nothing is stored for them.
            ('major', 0),
            ('term', 0),
            ('course', 9),
            ('plan', 9),
            ('class', 201),
            ('student', 2621),
            ('class_student', 20236),
        )
        for collection, expected in expected_counts:
            self.assertEqual(
                self.db_manager.db[collection].count(), expected, collection)