示例#1
0
    def __init__(self, mongo_host='localhost'):
        """Open the MongoDB connection and start the message server.

        Args:
            mongo_host: Host name handed to ``MongoClient``; the port is
                fixed at 27017.
        """
        client = MongoClient(mongo_host, 27017)
        self.mongo_client = client
        # ``spider`` is the database attribute looked up on the client.
        self.db = client.spider
        # on_message is given to ServerSocket as its callback
        # (presumably invoked once per incoming message -- confirm).
        listener = ServerSocket(self.on_message)
        self.server = listener
        listener.start()
 def __init__(self, mongo_client=None, mongo_host='localhost'):
     """Create the message server and start it.

     ``mongo_client`` and ``mongo_host`` are accepted but not used by
     this constructor.
     """
     listener = ServerSocket(self.on_message)
     self.server = listener
     listener.start()
class CrawlMaster:
    """Master node that tracks crawler clients and answers their messages.

    ``on_message`` is handed to ``ServerSocket`` as its callback and returns
    the reply for each message.  ``periodical_check`` pauses the clients at
    intervals so that ``reorder_queue`` can recompute PageRank values.
    """

    # Client registry: {client_id: {'status': ..., 'time': ...}}.
    # NOTE: defined on the class, so it is shared by every instance.
    clients = {}

    server_status = pc.STATUS_RUNNING

    # Evaluated once, when the class body runs.
    last_rereoder_time = time.time()

    # Read by periodical_check(); without this default the first check
    # raised AttributeError.
    is_reordering = False

    mongo_mgr = MongoManager()

    def __init__(self, mongo_client=None, mongo_host='localhost'):
        """Bind the ``spider`` database and start the message server.

        Args:
            mongo_client: Existing client to reuse.  When ``None`` a new
                ``MongoClient(mongo_host, 27017)`` is created.
            mongo_host: Host used only when ``mongo_client`` is ``None``.
        """
        # reorder_queue() reads self.db, so it has to be set here.
        if mongo_client is None:
            mongo_client = MongoClient(mongo_host, 27017)
        self.db = mongo_client.spider
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        """Handle one client message and build the reply.

        Args:
            msg: JSON text containing at least ``pc.MSG_TYPE``.

        Returns:
            The bare client id (a string) for ``pc.REGISTER``; a JSON
            string for every other message type.
        """
        print('Heart Beat request', msg)
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]
        response = {pc.SERVER_STATUS: self.server_status}

        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            self.clients[client_id] = {
                'status': pc.STATUS_RUNNING,
                'time': time.time(),
            }
            # Registration replies with the raw id, not a JSON document.
            return client_id
        if msg_type == pc.UNREGISTER:
            # pop() so an unknown or missing id does not raise KeyError.
            self.clients.pop(request.get(pc.CLIENT_ID), None)
            return json.dumps(response)
        if msg_type == pc.LOCATIONS:
            items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.CRAWL_DELAY] = 2
            response[pc.DATA] = json.dumps(items)
            return json.dumps(response)
        if msg_type == pc.TRIPLES:
            items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
            # NOTE(review): the reply is tagged pc.LOCATIONS even for a
            # TRIPLES request -- confirm this is what clients expect.
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.DATA] = json.dumps(items)
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)

        if msg_type == pc.HEARTBEAT:
            state = self.clients.get(client_id)
            if state is None:
                # Heartbeat from an id that is not registered.
                response[pc.ERROR] = pc.ERR_NOT_FOUND
                return json.dumps(response)
            # periodical_check() uses 'time' to detect silent clients, so
            # every heartbeat has to refresh it.
            state['time'] = time.time()
            # Compare by value; identity ("is not") only works by accident
            # for interned constants.
            if self.server_status != state['status']:
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
        else:
            # Any other message type is recorded as the client's status.
            self.clients[client_id] = {
                'status': msg_type,
                'time': time.time(),
            }

        return json.dumps(response)

    def get_free_id(self):
        """Return the smallest non-negative integer id not in use, as str."""
        # A membership test is independent of dict ordering; scanning the
        # keys in insertion order could hand out an id that is still taken.
        i = 0
        while str(i) in self.clients:
            i += 1
        return str(i)

    def reorder_queue(self):
        """Recompute PageRank from ``urlpr`` and store it in ``mfw``.

        Builds a directed graph with an edge url -> link for every link of
        every ``urlpr`` document, runs ``nx.pagerank`` with alpha 0.9 and
        writes each score to the ``pr`` field of the ``mfw`` document whose
        ``_id`` is the MD5 hex digest of the URL (no upsert).
        """
        g = nx.DiGraph()
        print("test11111")
        for site in self.db.urlpr.find():
            url = site['url']
            for link in site['links']:
                g.add_edge(url, link)
        pageranks = nx.pagerank(g, 0.9)
        for url, pr in pageranks.items():
            print('updating %s pr: %f' % (url, pr))
            # hashlib needs bytes; text URLs are encoded as UTF-8.
            raw = url if isinstance(url, bytes) else url.encode('utf-8')
            self.db.mfw.update_one({'_id': hashlib.md5(raw).hexdigest()},
                                   {'$set': {'pr': pr}},
                                   upsert=False)

    def periodical_check(self):
        """Run the client/status check loop forever; never returns."""
        while True:
            clients_status_ok = True
            # Check whether a reorder is due.
            if (not self.is_reordering and
                    time.time() - self.last_rereoder_time >
                    constants['reorder_period']):
                self.server_status = pc.STATUS_PAUSED
                self.is_reordering = True
            # Check whether any client has dropped offline.  Iterate over a
            # snapshot so a registration arriving meanwhile cannot raise
            # "dictionary changed size during iteration".
            for state in list(self.clients.values()):
                if (time.time() - state['time'] >
                        constants['connection_lost_period']):
                    # Keep the entry but flag it; lost clients are not
                    # counted in the status comparison below.
                    state['status'] = pc.STATUS_CONNECTION_LOST
                    continue
                # Check whether the client matches what the server asked.
                if state['status'] != self.server_status:
                    clients_status_ok = False
                    break
            # Reorder only once every live client reports PAUSED.
            if (clients_status_ok and
                    self.server_status == pc.STATUS_PAUSED and
                    self.is_reordering):
                self.reorder_queue()
                self.last_rereoder_time = time.time()
                self.is_reordering = False
                self.server_status = pc.STATUS_RUNNING

            time.sleep(constants['status_check_intervel'])
示例#4
0
class CrawlMaster:
    """Master node that tracks crawler clients and answers their messages.

    ``on_message`` is handed to ``ServerSocket`` as its callback and returns
    the reply for each message.
    """

    # Client registry: {client_id: {'status': ..., 'time': ...}}.
    # NOTE: defined on the class, so it is shared by every instance.
    clients = {}

    server_status = pc.STATUS_RUNNING

    # Evaluated once, when the class body runs.
    last_rereoder_time = time.time()

    mongo_mgr = MongoManager()

    def __init__(self, mongo_client=None, mongo_host='localhost'):
        """Create the message server and start it.

        ``mongo_client`` and ``mongo_host`` are accepted but not used by
        this constructor.
        """
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        """Handle one client message and build the reply.

        Args:
            msg: JSON text containing at least ``pc.MSG_TYPE``.

        Returns:
            The bare client id (a string) for ``pc.REGISTER``; a JSON
            string for every other message type.
        """
        print('Heart Beat request', msg)
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]
        response = {pc.SERVER_STATUS: self.server_status}

        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            self.clients[client_id] = {
                'status': pc.STATUS_RUNNING,
                'time': time.time(),
            }
            # Registration replies with the raw id, not a JSON document.
            return client_id
        if msg_type == pc.UNREGISTER:
            # pop() so an unknown or missing id does not raise KeyError.
            self.clients.pop(request.get(pc.CLIENT_ID), None)
            return json.dumps(response)
        if msg_type == pc.LOCATIONS:
            items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.CRAWL_DELAY] = 2
            response[pc.DATA] = json.dumps(items)
            return json.dumps(response)
        if msg_type == pc.TRIPLES:
            items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
            # NOTE(review): the reply is tagged pc.LOCATIONS even for a
            # TRIPLES request -- confirm this is what clients expect.
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.DATA] = json.dumps(items)
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)

        if msg_type == pc.HEARTBEAT:
            state = self.clients.get(client_id)
            if state is None:
                # Heartbeat from an id that is not registered.
                response[pc.ERROR] = pc.ERR_NOT_FOUND
                return json.dumps(response)
            # periodical_check() uses 'time' to detect silent clients, so
            # every heartbeat has to refresh it.
            state['time'] = time.time()
            # Compare by value; identity ("is not") only works by accident
            # for interned constants.
            if self.server_status != state['status']:
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
        else:
            # Any other message type is recorded as the client's status.
            self.clients[client_id] = {
                'status': msg_type,
                'time': time.time(),
            }

        return json.dumps(response)

    def get_free_id(self):
        """Return the smallest non-negative integer id not in use, as str."""
        # A membership test is independent of dict ordering; scanning the
        # keys in insertion order could hand out an id that is still taken.
        i = 0
        while str(i) in self.clients:
            i += 1
        return str(i)

    def periodical_check(self):
        """Flag silent clients and compare the others with the server status.

        Clients whose last recorded ``'time'`` is older than
        ``constants['connection_lost_period']`` are marked
        ``pc.STATUS_CONNECTION_LOST`` and skipped.  The scan stops at the
        first remaining client whose status differs from the server's.

        Returns:
            ``True`` when no mismatching client was found, else ``False``.
            (Previously the flag was computed and then discarded.)
        """
        clients_status_ok = True

        # Iterate over a snapshot so a registration arriving meanwhile
        # cannot raise "dictionary changed size during iteration".
        for state in list(self.clients.values()):
            if (time.time() - state['time'] >
                    constants['connection_lost_period']):
                # Keep the entry but flag it as lost.
                state['status'] = pc.STATUS_CONNECTION_LOST
                continue

            if state['status'] != self.server_status:
                clients_status_ok = False
                break

        return clients_status_ok
示例#5
0
 def __init__(self, mongo_client=None, mongo_host='localhost'):
     """Bind the ``spider`` database and start the message server.

     Args:
         mongo_client: Existing client to reuse.  When ``None`` a new
             ``MongoClient(mongo_host, 27017)`` is created.
         mongo_host: Host used only when ``mongo_client`` is ``None``.
     """
     if mongo_client is None:
         mongo_client = MongoClient(mongo_host, 27017)
     self.db = mongo_client.spider
     listener = ServerSocket(self.on_message)
     self.server = listener
     listener.start()
示例#6
0
class CrawlMaster(object):
    """Master node that hands out URLs to crawler clients and tracks them.

    ``on_message`` is handed to ``ServerSocket`` as its callback and returns
    the reply for each message; ``periodical_check`` drops clients that have
    gone silent.
    """

    # Client registry: {'client_id': {'time': 'xx', 'status': 'xx'}}.
    # NOTE: defined on the class, so it is shared by every instance.
    clients = {}

    server_status = pc.STATUS_RUNNING

    # Evaluated once, when the class body runs.
    last_rereoder_time = time.time()

    dbmanager = MongoRedisUrlManager()

    def __init__(self, mongo_client=None, mongo_host='127.0.0.1'):
        """Create the message server and start it.

        ``mongo_client`` and ``mongo_host`` are accepted but not used by
        this constructor.
        """
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        """Handle one client message and build the reply.

        Args:
            msg: JSON text sent by a client, containing at least
                ``pc.MSG_TYPE``.

        Returns:
            The bare client id (a string) for ``pc.REGISTER``; a JSON
            string for every other message type.
        """
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]
        response = {pc.SERVER_STATUS: self.server_status}

        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            self.clients[client_id] = {
                'status': pc.STATUS_RUNNING,
                'time': time.time(),
            }
            # Registration replies with the raw id, not a JSON document.
            return client_id
        if msg_type == pc.UNREGISTER:
            # pop() so an unknown or missing id does not raise KeyError.
            self.clients.pop(request.get(pc.CLIENT_ID), None)
            return json.dumps(response)
        if msg_type == pc.LOCATIONS:
            # pc.REQUEST_SIZE is the request key (see the TRIPLES branch),
            # so honour the size the client asked for.  Requests without
            # the field fall back to the argument that was passed before.
            size = request.get(pc.REQUEST_SIZE, pc.REQUEST_SIZE)
            crawl_urls = self.dbmanager.dequeueUrls(size=size)
            print(crawl_urls)
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.CRAWL_DELAY] = pc.CRAWL_DELAY_TIME
            response[pc.DATA] = crawl_urls
            self.flash_hbtime(request)
            return json.dumps(response)
        if msg_type == pc.TRIPLES:
            crawl_urls = self.dbmanager.dequeueUrls(request[pc.REQUEST_SIZE])
            # NOTE(review): the reply is tagged pc.LOCATIONS even for a
            # TRIPLES request -- confirm this is what clients expect.
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.DATA] = crawl_urls
            self.flash_hbtime(request)
            return json.dumps(response)
        if msg_type == pc.FINISHED_ITEMS:
            # New URLs reported by the client are stored by the master.
            self.dbmanager.enqueueUrls(request.get(pc.FINISHED_ITEMS))
            self.flash_hbtime(request)
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)

        if msg_type == pc.HEARTBEAT:
            state = self.clients.get(client_id)
            if state is None:
                # Heartbeat from an id that is not (or no longer) registered.
                response[pc.ERROR] = pc.ERR_NOT_FOUND
                return json.dumps(response)
            # Compare by value; identity ("is not") only works by accident
            # for interned constants.  .get() tolerates a record that has
            # no status yet.
            if self.server_status != state.get('status'):
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
                return json.dumps(response)
            # A normal heartbeat.
            self.flash_hbtime(request)
            return json.dumps(response)

        # Status report from the client.  Start from the existing record so
        # that a message type other than PAUSED/RESUMED keeps the previous
        # status instead of leaving a record without a 'status' key.
        client_state = dict(self.clients.get(client_id, {}))
        if msg_type == pc.PAUSED:
            client_state['status'] = pc.STATUS_PAUSED
        elif msg_type == pc.RESUMED:
            client_state['status'] = pc.STATUS_RUNNING
        client_state['time'] = time.time()  # refresh the heartbeat time
        self.clients[client_id] = client_state

        return json.dumps(response)

    def periodical_check(self):
        """Drop clients that have gone silent; loops forever.

        Every ``PERIODICAL_CHECK_TIME`` seconds, clients whose ``'time'`` is
        older than ``constants['connection_lost_period']`` are marked
        ``pc.STATUS_CONNECTION_LOST`` and then removed when that status
        differs from the server status.
        """
        while True:
            lost_cid = []
            # Iterate over a snapshot: the registry must not change size
            # while it is being iterated.
            for cid, state in list(self.clients.items()):
                if (time.time() - state['time'] >
                        constants['connection_lost_period']):
                    state['status'] = pc.STATUS_CONNECTION_LOST
                    lost_cid.append(cid)

            for cid in lost_cid:
                # The client may have unregistered in the meantime.
                state = self.clients.get(cid)
                if state is not None and state['status'] != self.server_status:
                    # Remove it from the client list.
                    self.clients.pop(cid, None)

            time.sleep(PERIODICAL_CHECK_TIME)

    def get_free_id(self):
        """Return the smallest non-negative integer id not in use, as str."""
        # A membership test is independent of dict ordering; scanning the
        # keys in insertion order could hand out an id that is still taken.
        i = 0
        while str(i) in self.clients:
            i += 1
        return str(i)

    def flash_hbtime(self, request):
        """Refresh the heartbeat time of the client named in ``request``.

        Requests without a client id, or naming an unregistered client, are
        ignored instead of raising KeyError after the reply was prepared.
        """
        state = self.clients.get(request.get(pc.CLIENT_ID))
        if state is not None:
            state['time'] = time.time()
示例#7
0
class CrawlMaster:
    """Master node that tracks crawler clients and schedules reordering.

    ``on_message`` is handed to ``ServerSocket`` as its callback and returns
    the reply for each message.  ``periodical_check`` pauses the clients
    when a reorder is due and runs ``reorder_queue`` once they all report
    the paused status.

    The code is written so that it runs unchanged on Python 2 and 3.
    """

    # Client registry: {client_id: {'status': ..., 'time': ...}}.
    # NOTE: defined on the class, so it is shared by every instance.
    clients = {}

    server_status = pc.STATUS_RUNNING

    # Evaluated once, when the class body runs.
    last_rereoder_time = time.time()

    is_reordering = False

    def __init__(self, mongo_host='localhost'):
        """Connect to MongoDB and start the message server.

        Args:
            mongo_host: Host name handed to ``MongoClient``; the port is
                fixed at 27017.
        """
        self.mongo_client = MongoClient(mongo_host, 27017)
        self.db = self.mongo_client.spider
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        """Handle one client message and build the reply.

        Args:
            msg: JSON text containing at least ``pc.MSG_TYPE``.

        Returns:
            The bare client id (a string) for ``pc.REGISTER``; a JSON
            string for every other message type.
        """
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]
        response = {pc.SERVER_STATUS: self.server_status}

        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            self.clients[client_id] = {
                'status': pc.STATUS_RUNNING,
                'time': time.time(),
            }
            print(client_id + ' registerd')
            # Registration replies with the raw id, not a JSON document.
            return client_id
        if msg_type == pc.UNREGISTER:
            client_id = request.get(pc.CLIENT_ID)
            # pop() so an unknown or missing id does not raise KeyError.
            if self.clients.pop(client_id, None) is not None:
                print(client_id + ' unregisterd')
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)

        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)
        print(client_id + ' heartbeat msg: ' + msg_type)

        if msg_type == pc.HEARTBEAT:
            state = self.clients.get(client_id)
            if state is None:
                # Heartbeat from an id that is not registered.
                response[pc.ERROR] = pc.ERR_NOT_FOUND
                return json.dumps(response)
            # periodical_check() uses 'time' to detect silent clients, so
            # every heartbeat has to refresh it.
            state['time'] = time.time()
            # Compare by value; identity ("is not") only works by accident
            # for interned constants.
            if self.server_status != state['status']:
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
        else:
            # Any other message type is recorded as the client's status.
            self.clients[client_id] = {
                'status': msg_type,
                'time': time.time(),
            }

        return json.dumps(response)

    def get_free_id(self):
        """Return the smallest non-negative integer id not in use, as str."""
        # A membership test is independent of dict ordering; scanning the
        # keys in iteration order could hand out an id that is still taken.
        i = 0
        while str(i) in self.clients:
            i += 1
        return str(i)

    def reorder_queue(self):
        """Recompute PageRank from ``urlpr`` and store it in ``mfw``.

        Builds a directed graph with an edge url -> link for every link of
        every ``urlpr`` document, runs ``nx.pagerank`` with alpha 0.9 and
        writes each score to the ``pr`` field of the ``mfw`` document whose
        ``_id`` is the MD5 hex digest of the URL (no upsert).
        """
        g = nx.DiGraph()
        for site in self.db.urlpr.find():
            url = site['url']
            for link in site['links']:
                g.add_edge(url, link)
        pageranks = nx.pagerank(g, 0.9)
        # items() exists on both Python 2 and 3; iteritems() does not.
        for url, pr in pageranks.items():
            print('updating %s pr: %f' % (url, pr))
            # hashlib needs bytes; text URLs are encoded as UTF-8.
            raw = url if isinstance(url, bytes) else url.encode('utf-8')
            self.db.mfw.update_one({'_id': hashlib.md5(raw).hexdigest()},
                                   {'$set': {'pr': pr}},
                                   upsert=False)

    def periodical_check(self):
        """Run one pass of the reorder/client-status check.

        Switches the server to ``pc.STATUS_PAUSED`` when a reorder is due,
        flags clients whose ``'time'`` is older than
        ``constants['connection_lost_period']`` as
        ``pc.STATUS_CONNECTION_LOST``, and runs ``reorder_queue`` once no
        remaining client disagrees with the paused server status.
        """
        clients_status_ok = True

        if (not self.is_reordering and
                time.time() - self.last_rereoder_time >
                constants['reorder_period']):
            self.server_status = pc.STATUS_PAUSED
            self.is_reordering = True

        # Iterate over a snapshot so a registration arriving meanwhile
        # cannot raise "dictionary changed size during iteration".
        for state in list(self.clients.values()):
            if (time.time() - state['time'] >
                    constants['connection_lost_period']):
                # Keep the entry but flag it; lost clients are not counted
                # in the status comparison below.
                state['status'] = pc.STATUS_CONNECTION_LOST
                continue

            if state['status'] != self.server_status:
                clients_status_ok = False
                break

        if (clients_status_ok and
                self.server_status == pc.STATUS_PAUSED and
                self.is_reordering):
            self.reorder_queue()
            self.last_rereoder_time = time.time()
            # Must be stored on self: assigning a bare local left the flag
            # set forever, so no later reorder was ever scheduled.
            self.is_reordering = False
            self.server_status = pc.STATUS_RUNNING
示例#8
0
def main():
    """Create a ``ServerSocket`` and call ``accept()`` on it forever."""
    listener = ServerSocket(socket.AF_INET, socket.SOCK_STREAM)
    # No exit condition: the loop runs until the process is stopped or
    # accept() raises.
    while True:
        listener.accept()