def dispatch_req(self):
    """Pop one pending request, dispatch it to the next spider node over TCP,
    and update the request's status from the node's JSON reply.

    Blocks on both the request queue and the node queue.  The node is always
    returned to the queue afterwards so nodes are used round-robin.
    """
    # initial data: serialize the request into a JSON payload
    req = self.__req_queue.get()
    dict_req = {
        "task_id": req.task_id,
        "req_id": req.req_id,
        "req_type": req.req_type,
        "urls_count": req.urls_count,
        "urls_args": req.urls_args,
    }
    json_req = json.dumps(dict_req)
    # node select: blocks until a spider node is available
    node_queue = self.__node_queue
    node = node_queue.get()
    # tcp connect
    s = TCPManager().get_dispatcher_connect(node.node_ip)
    try:
        s.send(json_req.encode("utf8"))
        json_ret = s.recv(1024)
        logging.info("received return data %s\n" % json_ret)
        if not json_ret:
            # the response from web spider is empty
            logging.info("received empty return data\n")
        else:
            # handle the response from web spider
            obj_req = self.__req_set[req.req_id]
            dict_ret = json.loads(json_ret.decode('utf8'))
            if dict_ret['req_status'] == Global.get_status_crawling():
                obj_req.req_status = Global.get_status_collecting()
                logging.info("dispatched req %s to %s success\n" % (req.req_id, node.node_ip))
            else:
                obj_req.req_status = Global.get_status_uncompleted()
                logging.error("dispatched req %s to %s fail\n" % (req.req_id, node.node_ip))
    finally:
        # fix: close the connection (it was previously leaked) and re-queue
        # the node even on error, so a failed dispatch cannot permanently
        # remove a node from rotation
        s.close()
        # nodes in turns
        node_queue.put(node)
def __new__(cls):
    """Bind the shared request/node queues and the request data set to the
    class before instantiation, then delegate creation to ``object.__new__``."""
    queue_mgr = QueueManager()
    data_mgr = DataManager()
    cls.__req_local_queue = queue_mgr.get(Global.get_queue_req())
    cls.__node_local_queue = queue_mgr.get(Global.get_queue_node())
    cls.__data_req_set = data_mgr.get(Global.get_data_req())
    return object.__new__(cls)
def manage_task(self, sock, addr):
    """Read one JSON operation message from *sock*, dispatch it to the matching
    task-operation handler, then close the socket.

    :param sock: connected TCP socket carrying a JSON message with keys
        'op_type' and 'op_data'
    :param addr: peer address, used for logging only
    """
    logging.info('go to manage task\n')
    data = ''
    while True:
        json_data = sock.recv(1024)
        logging.info('go to manage task %s\n' % json_data)
        if not json_data:
            # fix: peer closed the connection -- stop reading instead of
            # looping forever on empty recv() results
            break
        data += json_data.decode('utf8')
        if len(json_data) < 1024:
            # a short read marks the end of the message
            break
    logging.info('%s\n' % data)
    sock.close()
    if not data:
        # fix: nothing was received; json.loads('') would raise ValueError
        logging.error('received empty task message\n')
        return
    dict_data = json.loads(data)
    if 'op_type' not in dict_data.keys():
        logging.error('invalid operation message.\n ')
    else:
        logging.info('submit task %s\n' % dict_data)
        op_type = dict_data['op_type']
        op_data = dict_data['op_data']
        if op_type == Global.get_op_submit():
            self.submit_task(op_data)
        elif op_type == Global.get_op_cancel():
            self.cancel_task(op_data)
        elif op_type == Global.get_op_pause():
            self.pause_task(op_data)
        elif op_type == Global.get_op_resume():
            self.resume_task(op_data)
        else:
            logging.error('unknown operation %s\n' % op_type)
    logging.info('received task %s from %s' % (data, addr))
def get(self, data_type):
    """Return the shared data set registered for *data_type*.

    :param data_type: one of the Global data-type keys (task/req/res/node)
    :return: the matching data set, or None for an unknown type
    """
    lookup = {
        Global.get_data_task(): self.__data_task_set,
        Global.get_data_req(): self.__data_req_set,
        Global.get_data_res(): self.__data_res_set,
        Global.get_data_node(): self.__data_node_set,
    }
    return lookup.get(data_type)
def send_node_msg(self, obj_node):
    """Publish a node-status message describing *obj_node*.

    :param obj_node: node object exposing node_ip and node_status
    :return: 0 always
    """
    dict_msg = {
        'message_type': Global.get_msg_node(),
        'node_ip': obj_node.node_ip,
        'node_status': obj_node.node_status,
    }
    self.send_comm_msg(dict_msg)
    return 0
def process_task(op_type, op_data):
    """
    TaskProcessor.process_task(op_type, op_data)
    :param op_type: submit, cancel, pause or resume
    :param op_data: dict data of operation
    :return: 0 if process successfully. -1 represents error, otherwise.
    """
    if op_type == Global.get_op_submit():
        dict_op_msg = TaskProcessor.submit_task(op_data)
        logging.info('submit task %s\n' % dict_op_msg)
    else:
        # except submitting task: every other operation targets an existing task
        if 'task_id' not in op_data.keys():
            return -1
        task_id = op_data['task_id']
        if op_type == Global.get_op_cancel():
            dict_op_msg = TaskProcessor.cancel_task(task_id)
        elif op_type == Global.get_op_pause():
            dict_op_msg = TaskProcessor.pause_task(task_id)
        elif op_type == Global.get_op_resume():
            dict_op_msg = TaskProcessor.resume_task(task_id)
        else:
            logging.error('unknown operation: %s\n' % op_type)
            return -1
    if dict_op_msg is None:
        return -1
    dict_op_msg['op_type'] = op_type
    # forward the operation to the scheduler over TCP and wait for its status
    s = TCPManager().get_cmd_connect()
    try:
        json_op_msg = json.dumps(dict_op_msg)
        s.send(json_op_msg.encode('utf-8'))
        json_ret = s.recv(1024)
    finally:
        # fix: the command socket was never closed (resource leak)
        s.close()
    if not json_ret:
        # empty response message
        return -1
    dict_ret = json.loads(json_ret.decode('utf8'))
    if dict_ret['op_status'] == Global.get_status_completed():
        op_type = dict_ret['op_type']
        op_data = dict_ret['op_data']
        logging.info("executed operation %s to task %s success\n" % (op_type, op_data))
        return 0
    logging.error("executed operation %s to task %s fail\n" % (op_type, op_data))
    return -1
def process_cmd(self):
    """Parse command-line options and run the matching task operation.

    Options:
        -h            print usage and exit
        -s <file>     submit the task described by <file>
        -c <taskid>   cancel a task
        -p <taskid>   pause a task
        -r <taskid>   resume a task
        -n <s|c>      start a scheduler ('s') or web spider ('c') in-process
    """
    try:
        opts, args = getopt.getopt(self.__argv, "hs:c:p:r:n:")
    except getopt.GetoptError:
        print('unknown args!')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print("Usage:")
            print("opt_processor.py -s <taskfile>")
            print("opt_processor.py -c <taskid>")
            print("opt_processor.py -p <taskid>")
            print("opt_processor.py -r <taskid>")
            sys.exit(0)
        elif opt == "-s":  # submit
            dict_data = dict()
            dict_data['file_path'] = arg
            TaskProcessor.process_task(Global.get_op_submit(), dict_data)
        elif opt == '-p':  # pause
            dict_data = dict()
            dict_data['task_id'] = arg
            # fix: '-p' previously dispatched Global.get_op_cancel(), so
            # pausing a task actually cancelled it
            TaskProcessor.process_task(Global.get_op_pause(), dict_data)
        elif opt == '-c':  # cancel
            dict_data = dict()
            dict_data['task_id'] = arg
            TaskProcessor.process_task(Global.get_op_cancel(), dict_data)
        elif opt == '-r':  # resume
            dict_data = dict()
            dict_data['task_id'] = arg
            TaskProcessor.process_task(Global.get_op_resume(), dict_data)
        elif opt == '-n':
            # bring up a long-running node in this process
            if arg == 's':
                t = Scheduler()
                t.start()
            elif arg == 'c':
                w = WebSpider()
                w.start()
            else:
                print('unknown args!')
        else:
            print('unknown args!')
def receive_message(self, json_data, addr):
    """Decode one incoming JSON message and route it by its 'message_type'
    to the node or result queue, then trigger the matching processor.

    :param json_data: raw utf8-encoded JSON bytes
    :param addr: sender address, stored alongside the message
    """
    text = json_data.decode('utf8')
    logging.info('received message[%s]' % text)
    dict_data = json.loads(text)
    dict_msg = {'ip': addr, 'msg': dict_data}  # sock
    msg_type = dict_data['message_type']
    if msg_type == Global.get_msg_node():
        self.__node_msg_queue.put(dict_msg)
        self.process_node_message()
        logging.info('processed node message[%s]' % dict_msg)
    elif msg_type == Global.get_msg_res():
        self.__res_msg_queue.put(dict_msg)
        self.process_res_message()
        logging.info('processed res message[%s]' % dict_msg)
    else:
        logging.error('skipped message[%s]' % dict_msg)
def send_res_msg(self, obj_res):
    """Publish a crawl-result message describing *obj_res*.

    :param obj_res: result object exposing task_id, req_id, req_status,
        pages_count and pages_args
    :return: 0 always
    """
    dict_msg = {
        'message_type': Global.get_msg_res(),
        'task_id': obj_res.task_id,
        'req_id': obj_res.req_id,
        'req_status': obj_res.req_status,
        'pages_count': obj_res.pages_count,
        'pages_args': obj_res.pages_args,
    }
    self.send_comm_msg(dict_msg)
    return 0
def process_node_message(self):
    """Consume one node message and register (or refresh) the node it
    describes, making it available for request dispatch."""
    msg = self.__node_msg_queue.get()['msg']
    node = WebSpiderNode(msg['node_ip'], msg['node_status'])
    # record the node in the shared node set, keyed by its IP
    dict_node_set = DataManager().get(Global.get_data_node())
    dict_node_set[node.node_ip] = node
    # make the node schedulable
    self.__node_queue.put(node)
def separate_task(self): task = self.__task_queue.get() # if empty, block here if int(task.task_type) == 0: conf = configparser.ConfigParser() conf.read(task.task_path) from_year = conf.getint('extend', 'from_year') to_year = conf.getint('extend', 'to_year') from_page = conf.getint('extend', 'from_page') to_page = conf.getint('extend', 'to_page') # template_url = conf.get('base', 'template_url') template_url = 'http://www.cbooo.cn/Mdata/getMdata_movie?area=50&year=%s&pIndex=%s' logging.info("separating task's year from %s to %s and its page from %d to %s\n" % ( from_year, to_year, from_page, to_page)) req_count = 0 for item in range(from_year, to_year): for ele in range(from_page, to_page, 5): req = Request() req.task_id = task.task_id req.req_id = task.task_id + "_" + str(req_count) req.req_type = task.task_type remain_page = to_page - ele if remain_page < 5: req.urls_count = remain_page else: req.urls_count = 5 dict_args = dict() dict_args['from_year'] = item dict_args['to_year'] = item + 1 dict_args['from_page'] = ele dict_args['to_page'] = ele + req.urls_count dict_args['template_url'] = template_url req_count += 1 req.urls_args = dict_args req.req_status = Global.get_status_dispatching() self.__req_queue.put(req) self.__req_set[req.req_id] = req logging.info("separated task %s into req %s\n" % (task.task_id, req.req_id)) self.__task_set[task.task_id].task_status = Global.get_status_dispatching()
def __new__(cls):
    """Bind the shared message queues, work queues and the request data set
    to the class before instantiation, then delegate to ``object.__new__``."""
    q = QueueManager()
    cls.__nd_msg_queue = q.get(Global.get_msg_node())
    cls.__rs_msg_queue = q.get(Global.get_msg_res())
    cls.__rq_msg_queue = q.get(Global.get_msg_req())
    cls.__n_queue = q.get(Global.get_queue_node())
    cls.__res_local_queue = q.get(Global.get_queue_res())
    d = DataManager()
    # NOTE(review): the attribute is named __data_res_set but is loaded via
    # get_data_req().  This may be intentional (result messages update the
    # request records), but verify it is not a copy/paste slip against the
    # consumers of __data_res_set.
    cls.__data_res_set = d.get(Global.get_data_req())
    return object.__new__(cls)
def receive_request(self, sock, addr):
    """Read one JSON-encoded crawl request from *sock*, enqueue it locally,
    and acknowledge the sender with a 'crawling' status before closing.

    :param sock: connected TCP socket carrying the request JSON
    :param addr: peer address, used for logging only
    """
    json_req = sock.recv(1024)
    if not json_req:
        logging.info("received empty request %s" % json_req)
        # fix: previously fell through and crashed in json.loads('') --
        # close the socket and bail out instead
        sock.close()
        return
    dict_req = json.loads(json_req.decode('utf8'))
    obj_req = Request()
    obj_req.task_id = dict_req['task_id']
    obj_req.req_id = dict_req['req_id']
    obj_req.req_type = dict_req['req_type']
    obj_req.urls_count = dict_req['urls_count']
    obj_req.urls_args = dict_req['urls_args']
    self.__req_queue.put(obj_req)
    # acknowledge: tell the dispatcher this request is now being crawled
    dict_ret = dict()
    dict_ret['req_status'] = Global.get_status_crawling()
    json_ret = json.dumps(dict_ret)
    sock.send(json_ret.encode('utf8'))
    sock.close()
    logging.info("received request %s from %s" % (dict_req, addr))
def submit_task(self, op_data):
    """Parse the XML task file named in *op_data* and enqueue the task.

    :param op_data: dict holding 'file_path', the absolute path of the
        user's task description file
    """
    task_path = op_data['file_path']
    task = Task()
    task.task_id = '0'
    task.task_status = Global.get_status_separating()
    # the Task object doubles as the SAX content handler for its own file
    sax_parser = xml.sax.make_parser()
    sax_parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    sax_parser.setContentHandler(task)
    sax_parser.parse(task_path)
    self.__task_set[task.task_id] = task
    self.__task_queue.put(task)
    logging.info("submitted task %s\n" % task.task_name)
def start(self):
    """Start the web-spider node: logger, node announcement to the scheduler,
    request receiver and request downloader.  No-op if already running."""
    if self.__instance:
        print("Spider is already running...")
        return
    print("Spider is already starting...")
    self.__instance = 1
    cn_logger = Logger('c')
    cn_logger.execute()
    # announce this node as active so the scheduler can dispatch to it
    mp = MsgProcessor()
    mp.send_node_msg(WebSpiderNode('127.0.0.1', Global.get_status_active()))
    # accept incoming crawl requests
    receiver = ReqReceiver('127.0.0.1')
    receiver.start()
    # crawl the queued requests
    downloader = ReqDownloader()
    downloader.start()
def get(self, queue_type):
    """Return the singleton queue for *queue_type*, creating it on first use.

    :param queue_type: one of the Global queue/message type keys
    :return: the cached or newly created queue, or None for an unknown type
    """
    if queue_type in self.__queue_set:
        return self.__queue_set[queue_type]
    # all three message types share the same queue class
    msg_types = (Global.get_msg_node(), Global.get_msg_req(), Global.get_msg_res())
    if queue_type == Global.get_queue_task():
        queue = TaskQueue()
    elif queue_type == Global.get_queue_req():
        queue = ReqQueue()
    elif queue_type == Global.get_queue_res():
        queue = ResQueue()
    elif queue_type == Global.get_queue_node():
        queue = NodeQueue()
    elif queue_type in msg_types:
        queue = MsgQueue()
    else:
        return None
    self.__queue_set[queue_type] = queue
    return queue
def __new__(cls, *args, **kwargs):
    """Attach the shared request queue to the class, then create the instance."""
    cls.__request_queue = QueueManager().get(Global.get_queue_req())
    return object.__new__(cls)
def __init__(self):
    """Initialize node bookkeeping: the underlying node queue, the node map
    and the node counter."""
    manager = QueueManager()
    # .queue reaches into the wrapper for its raw queue object
    # NOTE(review): get() with the node-all type is assumed to return a
    # wrapper exposing .queue -- confirm against QueueManager.get
    self.__node_queue = manager.get(Global.get_node_all_type()).queue
    self.__node_set = dict()
    self.__node_cnt = 0
def __new__(cls):
    """Bind the shared task queue and task data set to the class before
    instantiation, then delegate creation to ``object.__new__``."""
    queue_mgr = QueueManager()
    data_mgr = DataManager()
    cls.__task_local_queue = queue_mgr.get(Global.get_queue_task())
    cls.__data_task_set = data_mgr.get(Global.get_data_task())
    return object.__new__(cls)