def __init__(self, mongo_host='localhost'):
    """Connect to MongoDB and start the message server.

    Opens a ``MongoClient`` to ``mongo_host`` on port 27017, keeps a handle
    to its ``spider`` database, then builds a ``ServerSocket`` that is
    handed ``self.on_message`` and starts it.

    Args:
        mongo_host: Host name or address of the MongoDB server.
    """
    client = MongoClient(mongo_host, 27017)
    self.mongo_client = client
    self.db = client.spider

    listener = ServerSocket(self.on_message)
    self.server = listener
    listener.start()
def __init__(self, mongo_client=None, mongo_host='localhost'):
    """Build the server socket around ``self.on_message`` and start it.

    Args:
        mongo_client: Accepted but not used by this constructor.
        mongo_host: Accepted but not used by this constructor.
    """
    listener = ServerSocket(self.on_message)
    self.server = listener
    listener.start()
class CrawlMaster:
    """Master node: tracks crawler clients and answers their messages.

    Incoming messages are JSON objects whose ``pc.MSG_TYPE`` field selects
    the action (register, unregister, work request, heartbeat, or a status
    report).  ``periodical_check`` pauses the clients at intervals so that
    ``reorder_queue`` can recompute PageRank scores.
    """

    # Registry of known clients: {client_id: {'status': ..., 'time': ...}}.
    clients = {}
    server_status = pc.STATUS_RUNNING
    last_rereoder_time = time.time()
    # True while clients are being paused for a reorder.  periodical_check()
    # reads this attribute, so it must have a default.
    is_reordering = False
    mongo_mgr = MongoManager()

    def __init__(self, mongo_client=None, mongo_host='localhost'):
        """Bind the ``spider`` database and start the message server.

        Args:
            mongo_client: Existing client to reuse; when ``None`` a new
                ``MongoClient`` is opened to ``mongo_host`` on port 27017.
            mongo_host: MongoDB host used only when no client is supplied.
        """
        if mongo_client is None:
            # Function-scope import keeps this block self-contained.
            from pymongo import MongoClient
            mongo_client = MongoClient(mongo_host, 27017)
        # reorder_queue() reads self.db, so it has to be set here.
        self.db = mongo_client.spider
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        """Handle one client message and return the reply string.

        Args:
            msg: JSON text containing at least ``pc.MSG_TYPE``.

        Returns:
            The new client id (plain string) for a REGISTER message,
            otherwise a JSON-encoded response dict.
        """
        print('Heart Beat request', msg)
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]
        response = {pc.SERVER_STATUS: self.server_status}

        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            self.clients[client_id] = {
                'status': pc.STATUS_RUNNING,
                'time': time.time(),
            }
            return client_id

        if msg_type == pc.UNREGISTER:
            # pop() tolerates an unknown or missing id instead of raising.
            self.clients.pop(request.get(pc.CLIENT_ID), None)
            return json.dumps(response)

        if msg_type == pc.LOCATIONS:
            items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.CRAWL_DELAY] = 2
            response[pc.DATA] = json.dumps(items)
            return json.dumps(response)

        if msg_type == pc.TRIPLES:
            items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
            # NOTE(review): the reply is tagged LOCATIONS, not TRIPLES --
            # kept as-is; confirm against the client.
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.DATA] = json.dumps(items)
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)

        if msg_type == pc.HEARTBEAT:
            state = self.clients.get(client_id)
            if state is None:
                # Unknown client: report it rather than raising KeyError.
                response[pc.ERROR] = pc.ERR_NOT_FOUND
                return json.dumps(response)
            # A heartbeat proves the client is alive, so refresh the
            # timestamp that periodical_check() compares against.
            state['time'] = time.time()
            # Compare by value; identity ('is not') is unreliable for
            # status values decoded from JSON.
            if self.server_status != state['status']:
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
        else:
            # Any other message type is recorded as the client's status.
            self.clients[client_id] = {
                'status': msg_type,
                'time': time.time(),
            }

        return json.dumps(response)

    def get_free_id(self):
        """Return the smallest non-negative integer id (as ``str``) not in use.

        Membership is tested against the key set, so the result does not
        depend on the iteration order of ``self.clients``.
        """
        taken = set(self.clients)
        candidate = 0
        while str(candidate) in taken:
            candidate += 1
        return str(candidate)

    def reorder_queue(self):
        """Recompute PageRank from ``urlpr`` and store it on ``mfw`` records.

        Builds a directed graph with an edge from each document's ``url`` to
        every entry in its ``links``, runs ``nx.pagerank`` on it, and sets
        ``pr`` on the ``mfw`` document whose ``_id`` is the MD5 hex digest
        of the URL (no upsert).
        """
        g = nx.DiGraph()
        print("test11111")
        for site in self.db.urlpr.find():
            url = site['url']
            for link in site['links']:
                g.add_edge(url, link)

        pageranks = nx.pagerank(g, 0.9)
        for url, pr in pageranks.items():
            print('updating %s pr: %f' % (url, pr))
            # hashlib needs bytes; encode text URLs, pass bytes through.
            raw_url = url if isinstance(url, bytes) else url.encode('utf-8')
            self.db.mfw.update_one(
                {'_id': hashlib.md5(raw_url).hexdigest()},
                {'$set': {'pr': pr}},
                upsert=False,
            )

    def periodical_check(self):
        """Run the monitoring loop forever.

        Each pass: starts a pause-for-reorder cycle when the reorder period
        has elapsed, marks clients whose timestamp is too old as
        ``pc.STATUS_CONNECTION_LOST``, and, once every remaining client
        reports the server's status while paused, runs ``reorder_queue`` and
        resumes.  Sleeps ``constants['status_check_intervel']`` between
        passes.
        """
        while True:
            clients_status_ok = True

            # Check whether a reorder is due.
            if (not self.is_reordering
                    and time.time() - self.last_rereoder_time
                    > constants['reorder_period']):
                self.server_status = pc.STATUS_PAUSED
                self.is_reordering = True

            # Check whether any client has dropped offline.  Iterate a
            # snapshot so registrations during the pass cannot break it.
            for cid, state in list(self.clients.items()):
                if (time.time() - state['time']
                        > constants['connection_lost_period']):
                    # Keep the entry but flag it as lost.
                    state['status'] = pc.STATUS_CONNECTION_LOST
                    continue
                # Check the client has reached the status the server asked for.
                if state['status'] != self.server_status:
                    clients_status_ok = False
                    break

            # Check whether the conditions for reordering are met.
            if (clients_status_ok
                    and self.server_status == pc.STATUS_PAUSED
                    and self.is_reordering):
                self.reorder_queue()
                self.last_rereoder_time = time.time()
                self.is_reordering = False
                self.server_status = pc.STATUS_RUNNING

            time.sleep(constants['status_check_intervel'])
class CrawlMaster:
    """Master node: tracks crawler clients and answers their messages.

    Incoming messages are JSON objects whose ``pc.MSG_TYPE`` field selects
    the action (register, unregister, work request, heartbeat, or a status
    report).
    """

    # Registry of known clients: {client_id: {'status': ..., 'time': ...}}.
    clients = {}
    server_status = pc.STATUS_RUNNING
    last_rereoder_time = time.time()
    mongo_mgr = MongoManager()

    def __init__(self, mongo_client=None, mongo_host='localhost'):
        """Build the server socket around ``self.on_message`` and start it.

        Args:
            mongo_client: Accepted but not used by this constructor.
            mongo_host: Accepted but not used by this constructor.
        """
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        """Handle one client message and return the reply string.

        Args:
            msg: JSON text containing at least ``pc.MSG_TYPE``.

        Returns:
            The new client id (plain string) for a REGISTER message,
            otherwise a JSON-encoded response dict.
        """
        print('Heart Beat request', msg)
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]
        response = {pc.SERVER_STATUS: self.server_status}

        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            self.clients[client_id] = {
                'status': pc.STATUS_RUNNING,
                'time': time.time(),
            }
            return client_id

        if msg_type == pc.UNREGISTER:
            # pop() tolerates an unknown or missing id instead of raising.
            self.clients.pop(request.get(pc.CLIENT_ID), None)
            return json.dumps(response)

        if msg_type == pc.LOCATIONS:
            items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.CRAWL_DELAY] = 2
            response[pc.DATA] = json.dumps(items)
            return json.dumps(response)

        if msg_type == pc.TRIPLES:
            items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
            # NOTE(review): the reply is tagged LOCATIONS, not TRIPLES --
            # kept as-is; confirm against the client.
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.DATA] = json.dumps(items)
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)

        if msg_type == pc.HEARTBEAT:
            state = self.clients.get(client_id)
            if state is None:
                # Unknown client: report it rather than raising KeyError.
                response[pc.ERROR] = pc.ERR_NOT_FOUND
                return json.dumps(response)
            # A heartbeat proves the client is alive, so refresh the
            # timestamp that periodical_check() compares against.
            state['time'] = time.time()
            # Compare by value; identity ('is not') is unreliable for
            # status values decoded from JSON.
            if self.server_status != state['status']:
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
        else:
            # Any other message type is recorded as the client's status.
            self.clients[client_id] = {
                'status': msg_type,
                'time': time.time(),
            }

        return json.dumps(response)

    def get_free_id(self):
        """Return the smallest non-negative integer id (as ``str``) not in use.

        Membership is tested against the key set, so the result does not
        depend on the iteration order of ``self.clients``.
        """
        taken = set(self.clients)
        candidate = 0
        while str(candidate) in taken:
            candidate += 1
        return str(candidate)

    def periodical_check(self):
        """Make one pass over the registry, flagging lost clients.

        Clients whose timestamp is older than
        ``constants['connection_lost_period']`` are marked
        ``pc.STATUS_CONNECTION_LOST`` and skipped.  The pass stops at the
        first remaining client whose status differs from the server's.

        Returns:
            ``True`` if no live client disagreed with ``server_status``,
            ``False`` otherwise.  (Previously the flag was computed and
            discarded; callers that ignore the result are unaffected.)
        """
        clients_status_ok = True
        # Iterate a snapshot so registrations during the pass cannot break it.
        for cid, state in list(self.clients.items()):
            if time.time() - state['time'] > constants['connection_lost_period']:
                # Keep the entry but flag it as lost.
                state['status'] = pc.STATUS_CONNECTION_LOST
                continue
            if state['status'] != self.server_status:
                clients_status_ok = False
                break
        return clients_status_ok
def __init__(self, mongo_client=None, mongo_host='localhost'):
    """Bind the ``spider`` database and start the message server.

    Args:
        mongo_client: Client to reuse; when ``None`` a ``MongoClient`` is
            opened to ``mongo_host`` on port 27017 instead.
        mongo_host: MongoDB host, consulted only when no client is given.
    """
    if mongo_client is None:
        mongo_client = MongoClient(mongo_host, 27017)
    self.db = mongo_client.spider

    listener = ServerSocket(self.on_message)
    self.server = listener
    listener.start()
class CrawlMaster(object):
    """Master node: hands out URLs to crawler clients and tracks liveness.

    Incoming messages are JSON objects whose ``pc.MSG_TYPE`` field selects
    the action.  ``periodical_check`` drops clients whose last recorded
    activity is older than ``constants['connection_lost_period']``.
    """

    # Client registry: {'client_id': {'time': 'xx', 'status': 'xx'}}
    clients = {}
    server_status = pc.STATUS_RUNNING
    last_rereoder_time = time.time()
    dbmanager = MongoRedisUrlManager()

    def __init__(self, mongo_client=None, mongo_host='127.0.0.1'):
        """Build the server socket around ``self.on_message`` and start it.

        Args:
            mongo_client: Accepted but not used by this constructor.
            mongo_host: Accepted but not used by this constructor.
        """
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        """Handle one client message and return the reply string.

        Args:
            msg: JSON text sent by a client; must contain ``pc.MSG_TYPE``.

        Returns:
            The new client id (plain string) for a REGISTER message,
            otherwise a JSON-encoded response dict.
        """
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]
        response = {pc.SERVER_STATUS: self.server_status}

        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            self.clients[client_id] = {
                'status': pc.STATUS_RUNNING,
                'time': time.time(),
            }
            return client_id

        if msg_type == pc.UNREGISTER:
            # pop() tolerates an unknown or missing id instead of raising.
            self.clients.pop(request.get(pc.CLIENT_ID), None)
            return json.dumps(response)

        if msg_type == pc.LOCATIONS:
            # Use the batch size carried in the request (the TRIPLES branch
            # reads the same field).  Falls back to the constant that was
            # passed before, so requests without the field behave unchanged.
            # NOTE(review): assumes pc.REQUEST_SIZE is the message key --
            # confirm against the protocol constants.
            size = request.get(pc.REQUEST_SIZE, pc.REQUEST_SIZE)
            crawl_urls = self.dbmanager.dequeueUrls(size=size)
            print(crawl_urls)
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.CRAWL_DELAY] = pc.CRAWL_DELAY_TIME
            response[pc.DATA] = crawl_urls
            self.flash_hbtime(request)
            return json.dumps(response)

        if msg_type == pc.TRIPLES:
            crawl_urls = self.dbmanager.dequeueUrls(request[pc.REQUEST_SIZE])
            # NOTE(review): the reply is tagged LOCATIONS, not TRIPLES --
            # kept as-is; confirm against the client.
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.DATA] = crawl_urls
            self.flash_hbtime(request)
            return json.dumps(response)

        if msg_type == pc.FINISHED_ITEMS:
            # New URLs found by the client are stored by the master.
            save_urls = request.get(pc.FINISHED_ITEMS)
            self.dbmanager.enqueueUrls(save_urls)
            self.flash_hbtime(request)
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)

        if msg_type == pc.HEARTBEAT:
            state = self.clients.get(client_id)
            if state is None:
                # Unknown or already-expired client: report it rather than
                # raising KeyError.
                response[pc.ERROR] = pc.ERR_NOT_FOUND
                return json.dumps(response)
            # Every heartbeat proves the client is alive.
            self.flash_hbtime(request)
            # Compare by value; identity ('is not') is unreliable for
            # status values.
            if self.server_status != state.get('status'):
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
            return json.dumps(response)

        # Status report.  Start from the stored state so a message that is
        # neither PAUSED nor RESUMED does not erase the known status.
        client_state = dict(self.clients.get(client_id, {}))
        if msg_type == pc.PAUSED:
            client_state['status'] = pc.STATUS_PAUSED
        elif msg_type == pc.RESUMED:
            client_state['status'] = pc.STATUS_RUNNING
        client_state['time'] = time.time()  # refresh heartbeat time
        self.clients[client_id] = client_state
        return json.dumps(response)

    def periodical_check(self):
        """Run the liveness loop forever.

        Each pass marks clients whose timestamp is older than
        ``constants['connection_lost_period']`` as
        ``pc.STATUS_CONNECTION_LOST`` and then removes those whose status
        differs from ``server_status``.  Sleeps ``PERIODICAL_CHECK_TIME``
        between passes.
        """
        while True:
            lost_cid = []
            # Iterate a snapshot: entries are removed below, and the dict
            # must not change size while it is being iterated.
            for cid, state in list(self.clients.items()):
                if (time.time() - state['time']
                        > constants['connection_lost_period']):
                    state['status'] = pc.STATUS_CONNECTION_LOST
                    lost_cid.append(cid)

            for cid in lost_cid:
                state = self.clients.get(cid)
                if state is not None and state['status'] != self.server_status:
                    # Remove it from the client list.
                    self.clients.pop(cid, None)

            time.sleep(PERIODICAL_CHECK_TIME)

    def get_free_id(self):
        """Return the smallest non-negative integer id (as ``str``) not in use.

        Membership is tested against the key set, so the result does not
        depend on the iteration order of ``self.clients``.
        """
        taken = set(self.clients)
        candidate = 0
        while str(candidate) in taken:
            candidate += 1
        return str(candidate)

    def flash_hbtime(self, request):
        """Refresh the last-seen time of the client named in ``request``.

        Does nothing when the request carries no ``pc.CLIENT_ID`` or names
        a client that is not registered.
        """
        state = self.clients.get(request.get(pc.CLIENT_ID))
        if state is not None:
            state['time'] = time.time()
class CrawlMaster:
    """Master node: tracks crawler clients and schedules queue reorders.

    Incoming messages are JSON objects whose ``pc.MSG_TYPE`` field selects
    the action.  ``periodical_check`` pauses the clients when a reorder is
    due and calls ``reorder_queue`` once they have all paused.
    """

    # Registry of known clients: {client_id: {'status': ..., 'time': ...}}.
    clients = {}
    server_status = pc.STATUS_RUNNING
    last_rereoder_time = time.time()
    # True while clients are being paused for a reorder.
    is_reordering = False

    def __init__(self, mongo_host='localhost'):
        """Connect to MongoDB and start the message server.

        Args:
            mongo_host: Host of the MongoDB server (port 27017 is used).
        """
        self.mongo_client = MongoClient(mongo_host, 27017)
        self.db = self.mongo_client.spider
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        """Handle one client message and return the reply string.

        Args:
            msg: JSON text containing at least ``pc.MSG_TYPE``.

        Returns:
            The new client id (plain string) for a REGISTER message,
            otherwise a JSON-encoded response dict.
        """
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]
        response = {pc.SERVER_STATUS: self.server_status}

        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            self.clients[client_id] = {
                'status': pc.STATUS_RUNNING,
                'time': time.time(),
            }
            # Single-argument print() behaves the same on Python 2 and 3.
            print(client_id + ' registerd')
            return client_id

        if msg_type == pc.UNREGISTER:
            client_id = request.get(pc.CLIENT_ID)
            # pop() tolerates an unknown id instead of raising KeyError.
            self.clients.pop(client_id, None)
            print(client_id + ' unregisterd')
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)

        print(client_id + ' heartbeat msg: ' + msg_type)

        if msg_type == pc.HEARTBEAT:
            state = self.clients.get(client_id)
            if state is None:
                # Unknown client: report it rather than raising KeyError.
                response[pc.ERROR] = pc.ERR_NOT_FOUND
                return json.dumps(response)
            # A heartbeat proves the client is alive, so refresh the
            # timestamp that periodical_check() compares against.
            state['time'] = time.time()
            # Compare by value; identity ('is not') is unreliable for
            # status values decoded from JSON.
            if self.server_status != state['status']:
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
        else:
            # Any other message type is recorded as the client's status.
            self.clients[client_id] = {
                'status': msg_type,
                'time': time.time(),
            }

        return json.dumps(response)

    def get_free_id(self):
        """Return the smallest non-negative integer id (as ``str``) not in use.

        Membership is tested against the key set, so the result does not
        depend on the iteration order of ``self.clients``.
        """
        taken = set(self.clients)
        candidate = 0
        while str(candidate) in taken:
            candidate += 1
        return str(candidate)

    def reorder_queue(self):
        """Recompute PageRank from ``urlpr`` and store it on ``mfw`` records.

        Builds a directed graph with an edge from each document's ``url`` to
        every entry in its ``links``, runs ``nx.pagerank`` on it, and sets
        ``pr`` on the ``mfw`` document whose ``_id`` is the MD5 hex digest
        of the URL (no upsert).
        """
        g = nx.DiGraph()
        for site in self.db.urlpr.find():
            url = site['url']
            for link in site['links']:
                g.add_edge(url, link)

        pageranks = nx.pagerank(g, 0.9)
        # items() works on both Python 2 and 3 (iteritems() is 2-only).
        for url, pr in pageranks.items():
            print('updating %s pr: %f' % (url, pr))
            # hashlib needs bytes; encode text URLs, pass bytes through.
            raw_url = url if isinstance(url, bytes) else url.encode('utf-8')
            self.db.mfw.update_one(
                {'_id': hashlib.md5(raw_url).hexdigest()},
                {'$set': {'pr': pr}},
                upsert=False,
            )

    def periodical_check(self):
        """Make one monitoring pass.

        Starts a pause-for-reorder cycle when the reorder period has
        elapsed, marks clients whose timestamp is too old as
        ``pc.STATUS_CONNECTION_LOST``, and, once every remaining client
        reports the server's status while paused, runs ``reorder_queue``
        and switches back to running.
        """
        clients_status_ok = True

        if (not self.is_reordering
                and time.time() - self.last_rereoder_time
                > constants['reorder_period']):
            self.server_status = pc.STATUS_PAUSED
            self.is_reordering = True

        # Iterate a snapshot so registrations during the pass cannot break it.
        for cid, state in list(self.clients.items()):
            if time.time() - state['time'] > constants['connection_lost_period']:
                # Keep the entry but flag it as lost.
                state['status'] = pc.STATUS_CONNECTION_LOST
                continue
            if state['status'] != self.server_status:
                clients_status_ok = False
                break

        if (clients_status_ok
                and self.server_status == pc.STATUS_PAUSED
                and self.is_reordering):
            self.reorder_queue()
            self.last_rereoder_time = time.time()
            # Must clear the attribute, not a local, or the flag stays True
            # and no later reorder can ever be scheduled.
            self.is_reordering = False
            self.server_status = pc.STATUS_RUNNING
def main():
    """Create a stream ``ServerSocket`` on ``AF_INET`` and accept forever.

    The loop has no exit condition, so this function does not return
    normally.
    """
    family, kind = socket.AF_INET, socket.SOCK_STREAM
    listener = ServerSocket(family, kind)
    while True:
        listener.accept()