Example #1
 def test_fill_disallow_urls_from_robot(self):
     with patch.object(requests, 'get') as mock_get:
         with open('fake_robots.txt', 'r') as fake_robots_txt:
             mock_get.return_value = FakeResponse()
             mock_get.return_value.text = fake_robots_txt.read()
             test_crawler = Crawler(
                 'https://a/',
                 [''], {})
             test_crawler.fill_disallow_urls(URL('https://a/'))
             test_crawler.close()
             self.assertEqual({re.compile('https://a/b.+', re.IGNORECASE)},
                              test_crawler.disallow_urls)
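Example #1 relies on a FakeResponse test double and a fake_robots.txt fixture that are not shown in the snippet. A minimal sketch of what such a stub might look like, assuming the crawler only needs the response's text attribute (the status_code field is an extra assumption):

# Hypothetical stand-in for the FakeResponse object patched over requests.get;
# the test only sets and reads .text, so status_code is an assumed extra.
class FakeResponse:
    def __init__(self, text='', status_code=200):
        self.text = text
        self.status_code = status_code

# fake_robots.txt presumably contains a rule such as "Disallow: /b", which the
# crawler compiles into the expected re.compile('https://a/b.+', re.IGNORECASE).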
Example #2
 def test_searcher_with_seen_urls(self):
     with patch.object(Crawler, 'get_html') as mock_get_html:
         mock_get_html.return_value = '<a href=http://scala-lang.org></a>' \
                                      '<a href=https://scala11.html></a>'
         with patch.object(Crawler, 'write_html') as mock_write_html:
             mock_write_html.return_value = None
             test_crawler = Crawler(
                 'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                 ['scala'], {}, 2)
             test_crawler.seen_urls.add(Page(URL('http://scala-lang.org')))
             test_result = test_crawler.crawl()
             test_crawler.close()
             assert 'http://scala-lang.org' not in test_result
Example #3
 def test_crawler_zero_result(self):
     with patch.object(Crawler, 'get_html') as mock_get_html:
         mock_get_html.return_value = '<a href=https://scala1.html></a>'
         with patch.object(Crawler, 'write_html') as mock_write_html:
             mock_write_html.return_value = None
             test_crawler = Crawler(
                 'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                 ['dog'],
                 {},
                 2)
             test_result = test_crawler.crawl()
             test_crawler.close()
             self.assertEqual(test_result, set())
Example #4
 def test_update_parents(self):
     with patch.object(Crawler, 'get_html') as mock_get_html:
         mock_get_html.return_value = '<a href=http://a/c/></a>' \
                                      '<a href=http://a/b/></a>'
         with patch.object(Crawler, 'write_html') as mock_write_html:
             mock_write_html.return_value = None
             test_crawler = Crawler(
                 'http://a',
                 [''], {}, max_urls_count=3)
             test_result = test_crawler.crawl()
             test_crawler.close()
             for page in test_result:
                 if page.parent:
                     self.assertEqual(page.parent,
                                      Page(URL('http://a')))
Example #5
def crawler_sqlmap(entry_url,
                   depth=-1,
                   level=1,
                   threads=2,
                   timeout=30,
                   checkhost=True):
    """启动sqlmap扫描的入口函数。

    :param entry_url: 扫描网站的入口地址
    :param depth: 网页爬虫爬取页面深度,-1则表示不设置深度,默认-1
    :param level: sqlmap扫描测试等级:1-5(默认为1),等级越高使用的测试样例越多,结果越精确,时间也越长
    :param threads: sqlmap多线程扫描设置(默认为2)
    :param timeout: sqlmap扫描超时时间(默认30s)
    :param checkhost: 检查爬取链接是否属于同一域
    :return: 返回值为四元组(ret, url, simple, content)
            ret: 执行结果, False为失败, True为成功
            url: 扫描目标地址
            simple: 解析content抽取重要数据生成的报告,字典类型
            content: sqlmap返回的完整报告,字典类型
            若执行结果为False,那么把扫描错误信息存在扫描关键结果(simple)这个位置
    """
    settings = Setting(handle=False)
    settings.depth = depth
    settings.nocheckhost = not checkhost
    settings.level = level
    settings.threads = threads
    settings.timeout = timeout

    sqlmap, crawler = None, None
    try:
        sqlmap, ip, port = start_sqlmap()
        # The Crawler must be created after sqlmap has started so that it can
        # pick up sqlmap's port number correctly.
        crawler = Crawler(BASE_DIR, ip, port, entry_url, setting=settings)
        crawler.run()
        cont, simple = crawler.raw_report()
        return True, entry_url, simple, cont
    except Exception:
        err = traceback.format_exc()
        logger.error(err)
        return False, entry_url, err, {}
    finally:
        if crawler: crawler.close()
        if sqlmap: sqlmap.terminate()
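A minimal usage sketch for the 4-tuple documented above; the target URL and the way the summary is consumed are assumptions, not taken from the original snippet:

# Hypothetical caller: unpack (ret, url, simple, content) and branch on the flag.
ret, url, simple, content = crawler_sqlmap(
    'http://testphp.vulnweb.com/',  # assumed example target
    depth=2,
    level=1,
    threads=2,
    timeout=30,
    checkhost=True)
if ret:
    # simple holds the parsed summary, content the full sqlmap report (both dicts)
    print('scan finished for', url, '->', simple)
else:
    # on failure the traceback text ends up in the simple slot
    print('scan failed for', url, ':', simple)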
Example #6
 def test_searcher_with_result(self):
     with patch.object(Crawler, 'get_html') as mock_get_html:
         mock_get_html.return_value = '<a href=https://scala1.html></a>' \
                                      '<a href=https://scala2.html></a>' \
                                      '<a href=https://scala3.html></a>' \
                                      '<a href=https://scala4.html></a>' \
                                      '<a href=https://scala5.html></a>' \
                                      '<a href=https://scala6.html></a>' \
                                      '<a href=https://scala7.html></a>' \
                                      '<a href=https://scala8.html></a>' \
                                      '<a href=https://scala9.html></a>' \
                                      '<a href=https://scala10.html></a>' \
                                      '<a href=https://scala11.html></a>'
         with patch.object(Crawler, 'write_html') as mock_write_html:
             mock_write_html.return_value = None
             test_crawler = Crawler(
                 'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                 ['scala'], {})
             test_result = test_crawler.crawl()
             test_crawler.close()
             self.assertEqual(len(test_result), 10)
Example #7
                                  default='pages',
                                  help='Directory for downloaded pages')
    arguments_parser.add_argument('-g', action='store_true', help='Show graph')
    arguments_parser.add_argument('-w',
                                  action='store_true',
                                  help='Save found pages')
    args = arguments_parser.parse_args()
    white_domains = []
    for domain in args.wildcard:
        if domain.startswith('*'):
            white_domains.append(re.compile(fr'[^.]+.{domain[1::]}'))
        else:
            white_domains.append(domain)
    if args.start_url[-1] == '/':
        url = args.start_url[:-1]
    else:
        url = args.start_url
    crawler = Crawler(url, args.request, white_domains, args.d, args.f, args.w)
    try:
        result = crawler.crawl()
        if args.g:
            show_graph(result)
        for link in result:
            print(link)
        print('Program is completed')
        plt.show()
    except KeyboardInterrupt:
        print('Program is completed')
    finally:
        crawler.close()
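Example #7 begins mid-way through the parser setup, so the definitions of start_url, request, wildcard, -d and -f are missing; the visible default='pages' tail presumably closes one of them. A hypothetical reconstruction of that missing preamble, with only the argument names taken from the fragment and every type, default and help text assumed:

import argparse

# Assumed reconstruction of the truncated parser setup; the names come from the
# args.* accesses in the fragment, everything else (types, defaults, help) is guessed.
arguments_parser = argparse.ArgumentParser(description='Simple web crawler')
arguments_parser.add_argument('start_url', help='URL to start crawling from')
arguments_parser.add_argument('request', nargs='+', help='Search terms to look for')
arguments_parser.add_argument('--wildcard', nargs='*', default=[],
                              help='Allowed domains, e.g. *.scala-lang.org')
arguments_parser.add_argument('-d', type=int, default=10,
                              help='Maximum number of pages to crawl')
arguments_parser.add_argument('-f',
                              default='pages',
                              help='Directory for downloaded pages')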