Example #1
import sys


def main():
    parser = Parser(HTML.REDIS, PARSER.ELASTICSEARCH)
    # Crawler start
    crawler = Crawler(DB.MYSQL, HTML.REDIS, parser)

    try:
        crawler.run()
    except KeyboardInterrupt:
        # Shut down cleanly on Ctrl+C
        crawler.stop()
        sys.exit(0)
Example #2
def launch(dir_for_docs, dir_checkpoints, checkpoints_name, description_file,
           lock, inv_index, frontier, documents, step_count):
    crawler = Crawler(frontier, dir_for_docs, dir_checkpoints, checkpoints_name,
                      lock, inv_index, description_file)
    if documents is None:
        open(crawler.file_description, 'w').close()  # Wipe file
    else:
        crawler.documents = documents
    if step_count is not None:
        crawler.steps_count = step_count

    crawler.run()
Example #3
    def start(self):
        for game_type, crawl_range in self.crawl_range().items():
            crawler = Crawler(game_type)
            for date in pd.date_range(start=crawl_range["begin"],
                                      end=crawl_range["end"]):
                logging.debug(f"command: crawling {game_type} at {date}")
                crawler.run(self.format_date(date))

                # Sleep for a random, normally distributed interval around the
                # configured query period, so requests are not sent at a fixed rate
                time.sleep(
                    abs(
                        np.random.normal(
                            self.config["commander"]["queryPeriod"])))
Example #4
def quick_test(request):
    """ quick test page """
    form, results, url_to_test = None, None, u''    
    if "POST" == request.method:
        form = QuickTestCheckForm(request.POST)
        if form.is_valid():
            url_to_test = form.cleaned_data["url"] 

    if "url-to-test" in request.session:
        url_to_test = request.session.pop("url-to-test")

    if url_to_test:
        # Run the crawler on the submitted URL and group its results for the template
        c = Crawler(url_to_test)
        raw_results = c.run()
        results = {
            "error": raw_results["error"],
            "results_by_category": (
                (u'External links', 'ext', raw_results["external"],
                 len(raw_results["external"]["web"]) + len(raw_results["external"]["img"])),
                (u'Internal links', 'int', raw_results["internal"],
                 len(raw_results["internal"]["web"]) + len(raw_results["internal"]["img"])),
                (u'System', 'system', raw_results["system"],
                 len(raw_results["system"]["css"]) + len(raw_results["system"]["js"])),
            ),
        }
    if form is None:
        initial = {}
        if url_to_test:
            initial.update({"url": url_to_test})
        form = QuickTestCheckForm(initial=initial)
    return render_to_response('index/quick-test.html',
                              {"form": form, "results": results},
                              context_instance=RequestContext(request))
Example #5
def crawler_sqlmap(entry_url,
                   depth=-1,
                   level=1,
                   threads=2,
                   timeout=30,
                   checkhost=True):
    """启动sqlmap扫描的入口函数。

    :param entry_url: 扫描网站的入口地址
    :param depth: 网页爬虫爬取页面深度,-1则表示不设置深度,默认-1
    :param level: sqlmap扫描测试等级:1-5(默认为1),等级越高使用的测试样例越多,结果越精确,时间也越长
    :param threads: sqlmap多线程扫描设置(默认为2)
    :param timeout: sqlmap扫描超时时间(默认30s)
    :param checkhost: 检查爬取链接是否属于同一域
    :return: 返回值为四元组(ret, url, simple, content)
            ret: 执行结果, False为失败, True为成功
            url: 扫描目标地址
            simple: 解析content抽取重要数据生成的报告,字典类型
            content: sqlmap返回的完整报告,字典类型
            若执行结果为False,那么把扫描错误信息存在扫描关键结果(simple)这个位置
    """
    settings = Setting(handle=False)
    settings.depth = depth
    settings.nocheckhost = not checkhost
    settings.level = level
    settings.threads = threads
    settings.timeout = timeout

    sqlmap, crawler = None, None
    try:
        sqlmap, ip, port = start_sqlmap()
        # The crawler must be created after sqlmap has started, otherwise it
        # cannot pick up sqlmap's port correctly
        crawler = Crawler(BASE_DIR, ip, port, entry_url, setting=settings)
        crawler.run()
        cont, simple = crawler.raw_report()
        return True, entry_url, simple, cont
    except Exception:
        logger.error(traceback.format_exc())
        return False, entry_url, traceback.format_exc(), {}
    finally:
        if crawler: crawler.close()
        if sqlmap: sqlmap.terminate()
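
For reference, here is a minimal usage sketch (not taken from the original project) showing how the 4-tuple returned by crawler_sqlmap might be consumed; the target URL and parameter values are placeholders:

# Hypothetical caller: unpack (ret, url, simple, content) from crawler_sqlmap
ok, url, simple, content = crawler_sqlmap("http://scan-target.example", depth=2, level=1)
if ok:
    print("scan of %s finished, summary: %s" % (url, simple))
else:
    # On failure, simple holds the error traceback and content is an empty dict
    print("scan of %s failed: %s" % (url, simple))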
Example #6
from crawler.crawler import Crawler

if __name__ == '__main__':
    crawler = Crawler()
    crawler.run()
Example #7
def main():
    crawler = Crawler(num_workers=NUM_WORKERS, domain=DOMAIN)
    crawler.run()