Example #1
    def start(self, heart_beat_config):
        self._server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self._server_socket.bind(('', heart_beat_config["server_port"]))
            self._server_socket.listen(heart_beat_config["backlog"])
            self._interval = heart_beat_config["server_interval"]
            self._heart_beat_config = heart_beat_config
            self._last_notification_time = None

            signal.signal(signal.SIGTERM, HeartBeatServer._stop)
            signal.signal(signal.SIGINT, HeartBeatServer._stop) # for ctrl-c

            thread.start_new_thread(self._run, ())

            logging.info("heartbeat server started")
            while not HeartBeatServer.global_stop_event.is_set():
                client_socket, _ = self._server_socket.accept()
                raw_data = client_socket.recv(heart_beat_config["max_data_size"])
                message = simplejson.loads(raw_data)
                logging.debug("heartbeat server received message", message)
                heartbeatdb.save_heartbeat(message)
                client_socket.close()
        except socket.error as e:
            logging.warn("socket error for heartbeat server!!!", exception = e)
        finally:
            self._server_socket.close()
            logging.info("heartbeat server terminated")
Example #2
    def _wait(self):
        if not self._validation_enabled:
            return

        while self._client.get(self._valid_key) != "1": # Note: all clients block until the validation key (__valid_redis) is set to "1"
            if self._stop_condition is not None and self._stop_condition():
                logging.warn("whole process is terminating")
                raise Exception("whole process is terminating")
            else:
                logging.warn("redis server is loading data")
                time.sleep(5)
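For context, a sketch of the producer side that _wait() polls, assuming a plain redis-py client and that _valid_key names the __valid_redis flag mentioned in the comment above; the loader function here is a made-up placeholder.

    # hypothetical loader sketch: after bulk loading finishes, flip the flag
    # that _wait() polls for; the key name and client setup are assumptions
    import redis

    client = redis.StrictRedis(host = "localhost", port = 6379)
    load_data_into_redis(client)      # placeholder for the actual bulk load
    client.set("__valid_redis", "1")  # unblocks every client stuck in _wait()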
    def evaluate(self, url, source, url_info, extras = None):
        crawl_priority = -1
        crawl_depth = -1
        # if explicitly set in url_info, use those values
        if url_info["crawl_priority"] is not None:
            crawl_priority = url_info["crawl_priority"]
        if url_info["crawl_depth"] is not None:
            crawl_depth = url_info["crawl_depth"]

        # url validation
        if not url_validator.validate(url, url_info["parent_url"]):
            logging.warn("invalid crawl url", url = url, parent_url = url_info["parent_url"])
            return False, crawl_priority, crawl_depth

        # for non-parsed urls, priority and depth come from domain settings or defaults
        if source == "offline" or source == "online" or source == "post_ondemand":
            if url_info["crawl_priority"] is None or url_info["crawl_depth"] is None:
                # determine priority and depth by source
                crawl_priority, crawl_depth = self._determine(url, source)
                # explicit values in url_info override the determined ones
                if url_info["crawl_priority"] is not None:
                    crawl_priority = url_info["crawl_priority"]
                if url_info["crawl_depth"] is not None:
                    crawl_depth = url_info["crawl_depth"]

        #for parsed urls, priority += 1, depth -= 1
        # TODO why priority + 1?
        elif source == "parsed" or source == "redirected":
            crawl_priority = url_info["crawl_priority"]
            if crawl_priority < self._settings["total_priority_count"] - 1:
                crawl_priority += 1

            crawl_depth = url_info["crawl_depth"] - 1

            #handle external url
            if url_analyser.is_external_url(url, url_info["parent_url"]):
                mode = self._settings["general_crawl_policies"]["external_crawl_mode"]
                # mode is either "continue" or "new"; if "new", re-determine
                # priority and depth for the "external" source
                if mode == "new":
                    crawl_priority, crawl_depth = self._determine(url, "external")
        else:
            raise Exception("unsupported source %s", source = source)

        # raise if the priority falls outside the allowed range
        if crawl_priority < 0 or crawl_priority >= self._settings["total_priority_count"]:
            raise Exception("priority exceeded: %s" % crawl_priority)

        if crawl_depth < 0:
            raise Exception("crawl_depth can't be less than 0 %s" % crawl_depth)

        return True, crawl_priority, crawl_depth
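A hedged calling sketch for evaluate(), assuming a url_info dict carrying the keys read above and an evaluator instance of the surrounding class; the concrete values are illustrative only.

    # hypothetical call sketch: evaluate a freshly parsed link; the values
    # and the "evaluator" instance name are assumptions for illustration
    url_info = {
        "crawl_priority" : 2,  # inherited from the parent page
        "crawl_depth" : 3,     # remaining depth budget
        "parent_url" : "http://www.example.com/",
    }
    accepted, priority, depth = evaluator.evaluate(
        "http://www.example.com/news/1.html", "parsed", url_info)
    if accepted:
        pass  # enqueue the url with the returned priority and depth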
Example #4
    def _once(self):
        now = datetime2timestamp(datetime.datetime.utcnow())
        message = {"datetime" : now, "ip" : self._ip, "handler_name" : self._handler_name,
            "pid" : self._process_id, "handler_key" : self._handler_key}

        try:
            self._client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._client_socket.connect((self._server_address, self._server_port))
            self._client_socket.send(simplejson.dumps(message))
            logging.debug("heartbeat client sent message", message)
        except socket.error as e:
            logging.warn("socket error for heartbeat client", exception = e)
        finally:
            self._client_socket.close()
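A sketch of how _once() might be driven periodically, assuming _interval and _stop_event attribute names on the client; neither name appears in the snippet above.

    # hypothetical driver sketch: one heartbeat per interval until stopped;
    # _interval and _stop_event are assumed names, time is assumed imported
    def _run(self):
        while not self._stop_event.is_set():
            self._once()
            time.sleep(self._interval)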
Example #5
    def _decode_fields(self, url, message):
        #decode some message fields
        if message["page_last_modified"] is not None:
            message["page_last_modified"] = decoder.decode_string(message["page_last_modified"])
            if message["page_last_modified"] is None:
                logging.warn("decode page_last_modified failed", url = url)

        if message["headers"] is not None:
            decoded_headers = {}
            for key in message["headers"].keys():
                value = message["headers"].get(key, "")
                decoded_key = decoder.decode_string(key)
                if decoded_key is None:
                    logging.warn("decoded http response header key failed", url = url, field = unicode({"key" : key, "value" : value}))
                    continue
                if not re.match("^[a-zA-Z0-9-]+$", decoded_key):
                    logging.warn("filtered invalid http response header key", url = url, field = unicode({"key" : key, "value" : value}))
                    continue

                decoded_value = decoder.decode_string(value)
                if decoded_value is None:
                    logging.warn("decoded http response header value failed", url = url, field = unicode({"key" : key, "value" : value}))
                    continue
                decoded_headers[decoded_key] = decoded_value
            message["headers"] = decoded_headers
Example #6
    def _reconnect(args, msg, e):
        if len(args) == 0 or not isinstance(args[0], RedisClient):
            raise Exception("unsupported decorator, it should be applied to RedisClient methods")
        self = args[0]

        while self._stop_condition is None or not self._stop_condition():
            logging.error("redis connection error: %s, %s reconnecting..." % (msg, e))
            time.sleep(5)
            try:
                self._wait()
                return True
            except redis.exceptions.ConnectionError as e:
                pass

        logging.warn("whole process is terminating")
        raise Exception("whole process is terminating")
    def normalize_url(self, url, base_url = None):
        if url is None or len(url) == 0:
            return None

        original_url = url
        #Note: here assume all non-unicode urls are utf-8 encoded
        if isinstance(url, str):
            url = url.decode("utf-8")

        if not isinstance(url, unicode):
            logging.error("invalid normalized url, url is not unicode", url = original_url, base_url = base_url)
            return None

        url = url.replace('%20', ' ').strip()

        #fix http scheme:
        url = self._fix_http_scheme(url)

        #handle relative url
        if base_url is not None:
            url = urlparse.urljoin(base_url, url)

        #common normalization
        try:
            url = urlnorm.norm(url)
        except Exception as e:
            logging.warn("invalid normalized url, urlnorm raised exception", url = original_url, base_url = base_url, exception = e)
            return None

        try:
            parse_result = urlparse.urlparse(url)
        except Exception as e:
            logging.warn("invalid normalized url, when parsing url", url = original_url, base_url = base_url)
            return None

        if not parse_result.scheme.lower() in self._settings["general_crawl_policies"]["supported_schemes"]:
            logging.warn("invalid normalized url, not supported schemes", url = original_url, base_url = base_url)
            return None


        netloc = parse_result.netloc
        host = parse_result.netloc.split(':')[0]
        if ip_regex.match(host) is None: # not an IP host, so apply domain checks

            #check if domain and tld exists
            subdomain, domain, tld = tldextract.extract(host)
            if len(domain) == 0 or len(tld) == 0:
                logging.warn("invalid normalized url, no domain or tld", url = original_url, base_url = base_url)
                return None

            # replace Chinese punctuation in the netloc with its mapped equivalents
            for i in range(len(chinese_punctuation_map[0])):
                src = chinese_punctuation_map[0][i]
                dst = chinese_punctuation_map[1][i]
                netloc = netloc.replace(src, dst)

            # add the www subdomain when it is missing
            if len(subdomain) == 0:
                netloc = "www." + netloc

        fragment = parse_result.fragment
        if not fragment.startswith("!"): # keep only "#!" fragments (Google's AJAX-crawling convention)
            fragment = ""
        if len(parse_result.scheme) == 0 or len(netloc) == 0:
            logging.warn("invalid normalized url, scheme or netloc is none", url = original_url, base_url = base_url)
            return None

        url = urlparse.urlunparse((parse_result.scheme, netloc, parse_result.path, parse_result.params, parse_result.query, fragment))

        #canonicalize url
        #Note: it is too aggressive and sometimes changes the url semantics.
        #url = ccrawler.utils.url.canonicalize_url(url)

        url = url.strip()
        if len(url) > self._settings["general_crawl_policies"]["max_url_length"]:
            logging.warn("invalid normalized url, length exceeded", url = original_url, base_url = base_url)
            return None
        elif len(url) == 0:
            logging.warn("invalid normalized url, length too short", url = original_url, base_url = base_url)
            return None
        else:
            return url
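A usage sketch for normalize_url(), assuming an analyser instance of the surrounding class and settings that allow http; the expected results in the comments follow the rules implemented above and are not verified outputs.

    # hypothetical usage sketch
    analyser.normalize_url("news/1.html", base_url = "http://example.com/a/")
    #   -> "http://www.example.com/a/news/1.html" (relative resolved, www added)
    analyser.normalize_url("http://example.com/page#section")
    #   -> "http://www.example.com/page" (non-"#!" fragment stripped)
    analyser.normalize_url("ftp://example.com/file")
    #   -> None when "ftp" is not in supported_schemes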