Example #1
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        deps_errors = check_dependences(self.base_dir)
        if len(deps_errors) > 0:
            print "Dependences errors: "
            for err in deps_errors:
                print "  %s" % err
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False

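        # parse command-line flags; self.usage() documents each option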
        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OvelE:')
        except getopt.GetoptError as err:
            print str(err)
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print "error reading cookie file"
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                self.display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                try:
                    Shared.options['proxy'] = parse_proxy_string(v)
                except Exception as e:
                    print e
                    sys.exit(1)
            elif o == '-d':
                for ad in v.split(","):
                    # convert *.domain.com into a regex matching the domain and any subdomain
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print "* ERROR: wrong scope set '%s'" % v
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print "* ERROR: wrong mode set '%s'" % v
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                self.verbose = True
            elif o == "-e":
                Shared.options['deduplicate_pages'] = False
            elif o == "-l":
                Shared.options['headless_chrome'] = False
            elif o == "-E":
                if not Shared.options['extra_headers']:
                    Shared.options['extra_headers'] = {}
                (hn, hv) = v.split("=", 1)
                Shared.options['extra_headers'][hn] = hv

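        # assemble the command line for the node-based probe (probe/analyze.js)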
        probe_cmd = get_node_cmd()
        if not probe_cmd:  # this check may be redundant
            print "Error: unable to find node executable"
            sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

        if cookie_string:
            try:
                start_cookies = parse_cookie_string(cookie_string)
            except Exception as e:
                print "error decoding cookie string"
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # don't fill form values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # don't trigger events

        if Shared.options['proxy']:
            probe_options.extend([
                "-y",
                "%s:%s:%s" % (Shared.options['proxy']['proto'],
                              Shared.options['proxy']['host'],
                              Shared.options['proxy']['port'])
            ])
        if not Shared.options['headless_chrome']:
            probe_options.append("-l")
        probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        if Shared.options['extra_headers']:
            probe_options.extend(
                ["-E", json.dumps(Shared.options['extra_headers'])])

        Shared.probe_cmd = probe_cmd + probe_options

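        # positional arguments: the start URL and the output database file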
        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer)

        if not hasattr(ssl, "SSLContext"):
            print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

        stdoutw("Initializing . ")

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        self.db_file = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(self.db_file, out_file)
        except Exception as e:
            print str(e)
            sys.exit(1)

        database.save_crawl_info(htcap_version=get_program_infos()['version'],
                                 target=Shared.starturl,
                                 start_date=self.crawl_start_time,
                                 commandline=cmd_to_str(argv),
                                 user_agent=Shared.options['useragent'],
                                 proxy=json.dumps(Shared.options['proxy']),
                                 extra_headers=json.dumps(
                                     Shared.options['extra_headers']),
                                 cookies=json.dumps(start_cookies))

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print "done"
        print "Database %s initialized, crawl started with %d threads (^C to pause or change verbosity)" % (
            self.db_file, num_threads)

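        # spawn the crawler worker threads, then hand control to the main loop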
        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print "Crawl finished, %d pages analyzed in %d minutes" % (
            Shared.requests_index,
            (self.crawl_end_time - self.crawl_start_time) / 60)

        database.save_crawl_info(end_date=self.crawl_end_time)
Example #2
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        probe_cmd = get_phantomjs_cmd()
        if not probe_cmd:
            print "Error: unable to find phantomjs executable"
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        display_progress = True
        verbose = False
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False
        user_script = None

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
        except getopt.GetoptError as err:
            print str(err)
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print "error reading cookie file"
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                if v == "tor": v = "socks5:127.0.0.1:9150"
                proxy = v.split(":")
                if proxy[0] not in ("http", "socks5"):
                    print "only http and socks5 proxies are supported"
                    sys.exit(1)
                Shared.options['proxy'] = {
                    "proto": proxy[0],
                    "host": proxy[1],
                    "port": proxy[2]
                }
            elif o == '-d':
                for ad in v.split(","):
                    # convert *.domain.com into a regex matching the domain and any subdomain
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print "* ERROR: wrong scope set '%s'" % v
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print "* ERROR: wrong mode set '%s'" % v
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                verbose = True
            elif o == "-u":
                if os.path.isfile(v):
                    user_script = os.path.abspath(v)
                else:
                    print "error: unable to open USER_SCRIPT"
                    sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

        if cookie_string:
            try:
                start_cookies = self.parse_cookie_string(cookie_string)
            except Exception as e:
                print "error decoding cookie string"
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # don't fill form values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # don't trigger events

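        # proxy settings are passed directly on the phantomjs command line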
        if Shared.options['proxy']:
            probe_cmd.append("--proxy-type=%s" %
                             Shared.options['proxy']['proto'])
            probe_cmd.append("--proxy=%s:%s" %
                             (Shared.options['proxy']['host'],
                              Shared.options['proxy']['port']))

        probe_cmd.append(self.base_dir + 'probe/analyze.js')

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        if user_script:
            probe_options.extend(("-u", user_script))

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer)

        if not hasattr(ssl, "SSLContext"):
            print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

        stdoutw("Initializing . ")

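        # optionally check the user script for syntax errors before crawling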
        if user_script and initial_checks:
            self.check_user_script_syntax(probe_cmd, user_script)

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        fname = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(fname, out_file)
        except Exception as e:
            print str(e)
            sys.exit(1)

        database.save_crawl_info(htcap_version=get_program_infos()['version'],
                                 target=Shared.starturl,
                                 start_date=self.crawl_start_time,
                                 commandline=cmd_to_str(argv),
                                 user_agent=Shared.options['useragent'])

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print "done"
        print "Database %s initialized, crawl started with %d threads" % (
            fname, num_threads)

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database, display_progress,
                       verbose)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print "Crawl finished, %d pages analyzed in %d minutes" % (
            Shared.requests_index,
            (self.crawl_end_time - self.crawl_start_time) / 60)

        database.save_crawl_info(end_date=self.crawl_end_time)
Example #3
	def main(self, argv):
		Shared.options = self.defaults		
		Shared.th_condition = threading.Condition()
		Shared.main_condition = threading.Condition()


		probe_cmd = self.get_phantomjs_cmd()
		if not probe_cmd:
			print "Error: unable to find phantomjs executable"
			sys.exit(1)

		start_cookies = []
		start_referer = None
		
		probe_options = ["-R", self.randstr(20)]
		threads = []
		num_threads = self.defaults['num_threads']			
	
		out_file = ""
		out_file_overwrite = self.defaults['out_file_overwrite']	
		cookie_string = None
		display_progress = True
		check_starturl = True
		http_auth = None
		get_robots_txt = True
		save_html = False

		try:
			opts, args = getopt.getopt(argv, 'hc:t:jn:x:O:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:')
		except getopt.GetoptError as err:
			print str(err)	
			sys.exit(1)
		
		
		if len(args) < 2:
			self.usage()
			sys.exit(1)
		

		for o, v in opts:
			if o == '-h':
				self.usage()
				sys.exit(0)
			elif o == '-c':
				cookie_string = v
			elif o == '-C':
				try:
					with open(v) as cf:
						cookie_string = cf.read()
				except Exception as e:				
					print "error reading cookie file"		
					sys.exit(1)			
			elif o == '-r':
				start_referer = v
			elif o == '-n':
				num_threads = int(v)
			elif o == '-t':
				Shared.options['process_timeout'] = int(v)
			elif o == '-q':
				display_progress = False				
			elif o == '-A':
				http_auth = v
			elif o == '-p':			
				if v == "tor": v = "socks5:127.0.0.1:9150"
				proxy =  v.split(":")
				if proxy[0] not in ("http", "https"): 
					print "only http and socks5 proxies are supported"
					sys.exit(1)
				Shared.options['proxy'] = {"proto":proxy[0], "host":proxy[1], "port":proxy[2]}
			elif o == '-d':			
				for ad in v.split(","):
					# convert *.domain.com into a regex matching the domain and any subdomain
					pattern = re.escape(ad).replace("\\*\\.","((.*\\.)|)")
					Shared.allowed_domains.add(pattern)
			elif o == '-x':	
				for eu in v.split(","):					
					Shared.excluded_urls.add(eu)								
			elif o == "-G":
				Shared.options['group_qs'] = True			
			elif o == "-w":
				out_file_overwrite = True			
			elif o == "-R":
				Shared.options['max_redirects'] = int(v)
			elif o == "-U":
				Shared.options['useragent'] = v			
			elif o == "-s":
				if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):					
					self.usage()
					print "* ERROR: wrong scope set '%s'" % v
					sys.exit(1)
				Shared.options['scope'] = v
			elif o == "-m":
				if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
					self.usage()
					print "* ERROR: wrong mode set '%s'" % v					
					sys.exit(1)
				Shared.options['mode'] = v
			elif o == "-S":
				check_starturl = False
			elif o == "-I":
				get_robots_txt = False	
			elif o == "-H":
				save_html = True
			elif o == "-D":
				Shared.options['max_depth'] = int(v)
			elif o == "-P":
				Shared.options['max_post_depth'] = int(v)				
			elif o == "-F":
				Shared.options['crawl_forms'] = False


		if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
			print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

		if cookie_string:
			try:
				start_cookies = self.parse_cookie_string(cookie_string)
			except Exception as e:				
				print "error decoding cookie string"		
				sys.exit(1)			

		

		if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
			probe_options.append("-f") # don't fill form values
		if Shared.options['mode'] == CRAWLMODE_PASSIVE:
			probe_options.append("-t") # don't trigger events

		if Shared.options['proxy']:
			probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
			probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))
		
		if len(Shared.excluded_urls) > 0:			
			probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

		if save_html:
			probe_options.append("-H")

		probe_options.extend(("-x", str(Shared.options['process_timeout']) ))
		probe_options.extend(("-A", Shared.options['useragent']))

		probe_cmd.append(self.base_dir + 'probe/analyze.js')
		
		Shared.probe_cmd = probe_cmd + probe_options 


		Shared.starturl = normalize_url(args[0])
		out_file = args[1]
		
		purl = urlsplit(Shared.starturl)
		Shared.allowed_domains.add(purl.hostname)


		for sc in start_cookies:		
			Shared.start_cookies.append(Cookie(sc, Shared.starturl))
		

		start_req = Request(REQTYPE_LINK, "GET", Shared.starturl, set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer)
		
		stdoutw("Initializing . ")

		start_requests = self.init_crawl(start_req, check_starturl, get_robots_txt)
				
		database = None
		fname = self.generate_filename(out_file, out_file_overwrite)
		try:
			database = self.init_db(fname, out_file)
		except Exception as e:
			print str(e)
			sys.exit(1)

		database.save_crawl_info(
			htcap_version = get_program_infos()['version'],
			target = Shared.starturl,
			start_date = self.crawl_start_time,
			commandline = cmd_to_str(argv),
			user_agent = Shared.options['useragent']
		)

		database.connect()
		database.begin()		
		for req in start_requests:
			database.save_request(req)
		database.commit()
		database.close()

		print "done"
		print "Database %s initialized, crawl started with %d threads" % (fname, num_threads)

		for n in range(0, num_threads):	
			thread = CrawlerThread()			
			threads.append(thread)		
			thread.start()
		

		self.main_loop(threads, start_requests, database, display_progress)		
		
		self.kill_threads(threads)

		self.crawl_end_time = int(time.time())

		print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)

		database.save_crawl_info(end_date=self.crawl_end_time)
Example #4
parser = argparse.ArgumentParser(description="Crawls the web looking for 3D object models.")
parser.add_argument("--config", action="store", type=str)
parser.add_argument("--instances", action="store", type=int, default=10)

if __name__ == "__main__":
    # TODO(brunonery): verify arguments and fail gracefully if necessary.
    args = parser.parse_args()
    config = CrawlerConfig(open(args.config))
    # Prepare database and locks.
    database_handler = DatabaseHandler(config.database_address())
    database_handler.Init()
    url_lock = threading.Lock()
    # Prepare download queue.
    download_queue = Queue.Queue()
    # Start all threads.
    crawler_thread_list = []
    for i in range(args.instances):
        current_thread = CrawlerThread(database_handler, download_queue, url_lock)
        crawler_thread_list.append(current_thread)
        current_thread.start()
    downloader_thread_list = []
    # TODO(brunonery): have different number of crawler and downloader threads.
    for i in range(args.instances):
        current_thread = DownloaderThread(download_queue, config.download_folder(), config.zip_size_limit())
        current_thread.daemon = True
        downloader_thread_list.append(current_thread)
        current_thread.start()
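    # Downloader threads are daemons, so they will not block process exit once the crawlers are done.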
    # Wait for all crawler threads to finish.
    for thread in crawler_thread_list:
        thread.join()