Example #1
    def test_parse(self):
        """
        Tests three scenarios:
        a standard URL
        an invalid URL
        a URL pointing to a non-HTML document, e.g. a .jpg
        :return:
        """
        #parser = Page_parser()
        url1 = 'localhost:8081/page1.html'
        expect_sub_url = 'localhost:8081/1/page1_1.html'
        parser = PageParser(url1)
        links = parser.parse()
        self.assertIn(expect_sub_url, links)

        url2 = 'localhost:8081/page7.html'
        parser = PageParser(url2)
        links = parser.parse()
        self.assertEqual(links, set())

        url3 = 'localhost:8081/3/image.jpg'
        parser = PageParser(url3)
        self.assertEqual(parser.parse(), set())
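The test only exercises the public contract of PageParser: parse() returns the set of sub-links found in a page, and an empty set for an invalid URL or a non-HTML resource. Below is a minimal sketch of a class with that contract; the use of urllib and html.parser, the scheme handling, and the link normalization are assumptions and need not match the project's real implementation or the fixture URLs above.

# Hypothetical sketch only; the project's real PageParser may differ.
from html.parser import HTMLParser
from urllib import request
from urllib.error import URLError
from urllib.parse import urljoin


class _LinkExtractor(HTMLParser):
    """Collects href/src attribute values from the tags of one page."""

    def __init__(self):
        super().__init__()
        self.links = set()

    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            if name in ('href', 'src') and value:
                self.links.add(value)


class PageParser:
    """Downloads one page and returns the set of links it contains."""

    def __init__(self, url):
        # The test URLs carry no scheme, so assume plain HTTP.
        self.url = url if url.startswith('http') else 'http://' + url

    def parse(self):
        try:
            with request.urlopen(self.url, timeout=10) as resp:
                if 'text/html' not in resp.headers.get('Content-Type', ''):
                    return set()      # e.g. a .jpg: nothing to parse
                html = resp.read().decode('utf-8', errors='replace')
        except (ValueError, URLError, OSError):
            return set()              # invalid or unreachable URL
        extractor = _LinkExtractor()
        extractor.feed(html)
        # Resolve relative links against the page URL.
        return {urljoin(self.url, link) for link in extractor.links}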
Example #2
def dump():
    client = HttpClient()
    torrent_id = get_torrent_id()
    res = get_dump()
    new_records = []

    last_torrent_id = torrent_id
    direction = Direction.UP

    if direction == Direction.UP:
        increment = 1
    else:
        increment = -1

    i = 0
    failed = 0

    while run:  # `run` is a module-level flag defined outside this snippet
        last_torrent_id += increment
        print(last_torrent_id)
        link = 'http://rutor.is/torrent/' + str(last_torrent_id)

        response = client.get_response(link)
        if not response.has_error:
            parser = PageParser(last_torrent_id, response.response_text)
            valid = parser.is_valid()
            if valid:
                failed = 0
                torrent_info = parser.parse()
                if torrent_info.category == u'Зарубежные фильмы' or torrent_info.category == u'Наши фильмы':
                    res.append(torrent_info)
                    new_records.append(torrent_info)
            else:
                print(str(last_torrent_id) + ' is invalid')
                failed += 1
                if failed == 10:
                    print('end of torrent list reached')
                    last_torrent_id = last_torrent_id - 10 - 1
                    break

        i += 1

        time.sleep(4)

    dump = json.dumps(res, cls=MyEncoder, ensure_ascii=False)
    save_dump(dump)
    save_history(last_torrent_id + increment)
    save_to_db(new_records)
    print('finished')
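The call json.dumps(res, cls=MyEncoder, ensure_ascii=False) implies a custom encoder that knows how to serialize the torrent_info records collected above. One common pattern, assumed here rather than taken from the repository, is to fall back to the object's __dict__:

import json


class MyEncoder(json.JSONEncoder):
    """Falls back to an object's attribute dict for non-primitive values.

    A guess at how the dump above could work; the project's real MyEncoder
    may handle the torrent records differently.
    """

    def default(self, obj):
        try:
            return obj.__dict__           # works for simple value objects
        except AttributeError:
            return super().default(obj)   # let json raise for unknown types


class TorrentInfo:
    """Hypothetical stand-in for the record type produced by PageParser."""

    def __init__(self, category, title):
        self.category = category
        self.title = title


print(json.dumps([TorrentInfo(u'Наши фильмы', 'demo')],
                 cls=MyEncoder, ensure_ascii=False))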
Example #3
def gen_docs():
    page_list = []
    with open(config.DATA_DIR + 'page_list.txt') as fin:
        for line in fin:
            page_list.append(line.rstrip())
    template_name = config.TEMPLATE_DIR + 'doutula.template'
    template_parser = TemplateParser(template_name)
    page_parser = PageParser(template_parser.xpath_list)
    for page_url in page_list[1104:]:
        info_list = page_parser.parse(page_url)
        if info_list:
            for docinfo in info_list:
                print(docinfo)
        else:
            print('page parse failed.')
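The parser is configured from template_parser.xpath_list, i.e. the doutula.template file supplies the XPath expressions that PageParser applies to each page. A minimal sketch of such a TemplateParser follows, assuming a one-expression-per-line template format (the real format is not shown in the example):

class TemplateParser:
    """Loads one XPath expression per non-empty line of a template file.

    The layout of doutula.template is not shown in the example, so this
    line-per-expression format is an assumption.
    """

    def __init__(self, template_name):
        self.xpath_list = []
        with open(template_name, encoding='utf-8') as fin:
            for line in fin:
                line = line.strip()
                if line and not line.startswith('#'):   # skip blanks/comments
                    self.xpath_list.append(line)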
Example #4
    def crawl(self, url_q):
        """
        The spider's crawl logic: use the page retriever to download and parse
        the URL, collect the extracted sub-URLs, de-duplicate them, and push
        the new ones onto the queue.
        :param url_q: the (url, depth) tuple to crawl; the URL is an absolute path
        :return:
        """
        if not isinstance(url_q, tuple):
            print("Type error")
            return

        if CrawlerThreadPool.interval_links_cnt > \
                ConfReader.instance().get_max_links_count():
            interval = ConfReader.instance().get_crawl_interval()
            if interval == 0:
                interval = 60 * 5  # default every 5 minutes

            logger.info("Thread %s is going to sleep; will resume in %d s" %
                        (threading.current_thread().name, interval))
            print("Waiting for %d seconds ..." % interval)
            sleep(interval)

            # reset the interval counter
            self._lock.acquire()
            CrawlerThreadPool.interval_links_cnt = 0
            self._lock.release()

        (url, depth) = url_q
        if depth > ConfReader.instance().get_max_depth():
            print("Depth exceed. The max depth is {}".format(depth - 1))
            return
        page_parser = PageParser(url)
        links = page_parser.parse()
        new_links = links.difference(CrawlerThreadPool.seen_urls)
        for new_link in new_links:
            self._q.put((new_link, depth + 1))

        # update link statistics
        self._lock.acquire()
        CrawlerThreadPool.total_links += len(new_links)
        CrawlerThreadPool.interval_links_cnt += len(new_links)
        print("Spider have crawl {} links.".format(
            CrawlerThreadPool.total_links))
        CrawlerThreadPool.seen_urls.update(new_links)
        self._lock.release()
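crawl() consumes (url, depth) tuples and pushes newly discovered links back onto self._q, so a driver typically seeds that queue with (start_url, 0) and lets worker threads drain it. A hedged sketch of such a driver follows; run_spider, the worker loop, and the direct use of the pool's _q attribute are assumptions, since the real CrawlerThreadPool wiring is not shown in the example.

import queue
import threading

# Hypothetical wiring; it only illustrates the (url, depth) queue contract
# that crawl() expects, not the project's actual thread-pool setup.

def run_spider(pool, start_url, thread_count=4):
    """Seed the pool's queue and let worker threads drain it via crawl()."""

    def worker():
        while True:
            try:
                item = pool._q.get(timeout=5)   # (url, depth) tuple
            except queue.Empty:
                return                          # no work left: stop the thread
            try:
                pool.crawl(item)
            finally:
                pool._q.task_done()

    pool._q.put((start_url, 0))                 # depth 0 for the seed URL
    threads = [threading.Thread(target=worker) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()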