def test_parse(self):
    """
    Tests three scenarios:
        a standard URL
        an invalid URL
        a URL pointing at another document format, e.g. a .jpg
    :return:
    """
    url1 = 'localhost:8081/page1.html'
    expect_sub_url = 'localhost:8081/1/page1_1.html'
    parser = PageParser(url1)
    links = parser.parse()
    self.assertIn(expect_sub_url, links)

    url2 = 'localhost:8081/page7.html'
    parser = PageParser(url2)
    links = parser.parse()
    self.assertEqual(links, set())

    url3 = 'localhost:8081/3/image.jpg'
    parser = PageParser(url3)
    self.assertEqual(parser.parse(), set())
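
# The test above only pins down PageParser's observable behaviour.  The sketch
# below is a hypothetical stand-in, not the project's real PageParser: it
# assumes parse() returns a set of resolved sub-URLs, and an empty set for
# unreachable pages or non-HTML resources; the use of `requests` here is
# purely illustrative.
import re
import requests


class PageParserSketch(object):

    _LINK_RE = re.compile(r'href=["\']([^"\']+)["\']')

    def __init__(self, url):
        self.url = url

    def parse(self):
        if not self.url.endswith(('.html', '.htm')):
            return set()  # non-HTML resource such as a .jpg: nothing to extract
        try:
            resp = requests.get('http://' + self.url, timeout=5)
            resp.raise_for_status()
        except requests.RequestException:
            return set()  # unreachable or invalid URL
        base = self.url.rsplit('/', 1)[0]  # e.g. 'localhost:8081'
        # Resolve relative hrefs so '1/page1_1.html' becomes 'localhost:8081/1/page1_1.html'
        return set(base + '/' + href.lstrip('/')
                   for href in self._LINK_RE.findall(resp.text))
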
def dump():
    client = HttpClient()
    torrent_id = get_torrent_id()
    res = get_dump()
    new_records = []
    last_torrent_id = torrent_id
    direction = Direction.UP
    if direction == Direction.UP:
        increment = 1
    else:
        increment = -1
    i = 0
    failed = 0
    while run:
        last_torrent_id = last_torrent_id + increment
        print str(last_torrent_id)
        link = 'http://rutor.is/torrent/' + str(last_torrent_id)
        response = client.get_response(link)
        if not response.has_error:
            parser = PageParser(last_torrent_id, response.response_text)
            valid = parser.is_valid()
            if valid:
                failed = 0
                torrent_info = parser.parse()
                # keep only the foreign-film and domestic-film categories
                if torrent_info.category == u'Зарубежные фильмы' \
                        or torrent_info.category == u'Наши фильмы':
                    res.append(torrent_info)
                    new_records.append(torrent_info)
            else:
                print str(last_torrent_id) + ' is invalid'
                failed = failed + 1
                if failed == 10:
                    print 'end of torrent list reached'
                    # roll back past the run of invalid ids
                    last_torrent_id = last_torrent_id - 10 - 1
                    break
        i = i + 1
        time.sleep(4)
    dump = json.dumps(res, cls=MyEncoder, ensure_ascii=False)
    save_dump(dump)
    save_history(last_torrent_id + increment)
    save_to_db(new_records)
    print 'finished'
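
# dump() above hands its results to json.dumps(res, cls=MyEncoder, ...).  The
# encoder below is only a sketch of what MyEncoder might look like, assuming
# the parsed torrent objects are plain Python objects whose attributes should
# be serialised; the real MyEncoder lives elsewhere in the project.
import json


class MyEncoderSketch(json.JSONEncoder):
    def default(self, obj):
        if hasattr(obj, '__dict__'):
            return obj.__dict__  # serialise TorrentInfo-like objects by attribute
        return json.JSONEncoder.default(self, obj)
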
def gen_docs():
    page_list = []
    with open(config.DATA_DIR + 'page_list.txt') as fin:
        for line in fin:
            page_list.append(line.rstrip())
    template_name = config.TEMPLATE_DIR + 'doutula.template'
    template_parser = TemplateParser(template_name)
    page_parser = PageParser(template_parser.xpath_list)
    # resume from page index 1104 of the page list
    for page_url in page_list[1104:]:
        info_list = page_parser.parse(page_url)
        if len(info_list) > 0:
            for docinfo in info_list:
                print docinfo
        else:
            print 'page parse failed.'
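
# gen_docs() assumes TemplateParser exposes an xpath_list built from
# 'doutula.template'.  The sketch below is an assumption about that class:
# it treats the template file as one XPath expression per line, skipping
# blank lines and '#' comments; the real template format may differ.
class TemplateParserSketch(object):
    def __init__(self, template_name):
        self.xpath_list = []
        with open(template_name) as fin:
            for line in fin:
                line = line.strip()
                if line and not line.startswith('#'):
                    self.xpath_list.append(line)
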
def crawl(self, url_q):
    """
    The spider's crawl logic: calls page_retriever to download and parse the
    URL, collects the extracted sub-URLs, de-duplicates them and puts them
    back on the queue.
    :param url_q: (url, depth) tuple to crawl; the url is an absolute path
    :return:
    """
    if not isinstance(url_q, tuple):
        print("Type error")
        return

    if CrawlerThreadPool.interval_links_cnt > \
            ConfReader.instance().get_max_links_count():
        interval = ConfReader.instance().get_crawl_interval()
        if interval == 0:
            interval = 60 * 5  # default: pause for 5 minutes
        logger.info("Thread %s begin to sleep, %d s later continue" %
                    (threading.currentThread().getName(), interval))
        print("Waiting for %d seconds ..." % interval)
        sleep(interval)
        # reset the per-interval counter
        self._lock.acquire()
        CrawlerThreadPool.interval_links_cnt = 0
        self._lock.release()

    (url, depth) = url_q
    if depth > ConfReader.instance().get_max_depth():
        print("Depth exceeded. The max depth is {}".format(depth - 1))
        return

    page_parser = PageParser(url)
    links = page_parser.parse()
    new_links = links.difference(CrawlerThreadPool.seen_urls)
    for new_link in new_links:
        self._q.put((new_link, depth + 1))

    # update link statistics
    self._lock.acquire()
    CrawlerThreadPool.total_links += len(new_links)
    CrawlerThreadPool.interval_links_cnt += len(new_links)
    print("Spider has crawled {} links.".format(
        CrawlerThreadPool.total_links))
    CrawlerThreadPool.seen_urls.update(new_links)
    self._lock.release()
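
# A hedged sketch of a worker loop that could drive crawl() from inside
# CrawlerThreadPool.  The pool's real wiring is not shown in this section,
# so the queue handling and shutdown policy below are assumptions.
try:
    from queue import Empty  # Python 3
except ImportError:
    from Queue import Empty  # Python 2


def _worker(self):
    # Hypothetical method that would sit on CrawlerThreadPool alongside crawl().
    while True:
        try:
            url_q = self._q.get(timeout=30)  # (url, depth) tuple, as crawl() expects
        except Empty:
            break  # no work for a while: let the thread exit
        try:
            self.crawl(url_q)
        finally:
            self._q.task_done()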