예제 #1
0
 def dispatch_req(self):
     """Send the next queued request to the next node and record the outcome.

     Takes one request from the request queue and one node from the node
     queue, sends the request as JSON over the dispatcher connection and
     updates the stored request's status from the node's reply. The node is
     put back on the node queue in every case, including when sending or
     parsing the reply raises, so a failure does not remove it from the
     rotation.

     :return: None
     """
     # initial data
     req = self.__req_queue.get()
     json_req = json.dumps({
         "task_id": req.task_id,
         "req_id": req.req_id,
         "req_type": req.req_type,
         "urls_count": req.urls_count,
         "urls_args": req.urls_args,
     })
     # node select
     node_queue = self.__node_queue
     node = node_queue.get()
     try:
         # tcp connect
         # NOTE(review): the connection is not closed here; presumably
         # TCPManager owns its lifetime -- confirm.
         s = TCPManager().get_dispatcher_connect(node.node_ip)
         s.send(json_req.encode("utf8"))
         json_ret = s.recv(1024)
         logging.info("received return data %s\n" % json_ret)
         if not json_ret:
             # the response from web spider is empty
             logging.info("received empty return data\n")
         else:
             # handle the response from web spider
             obj_req = self.__req_set[req.req_id]
             dict_ret = json.loads(json_ret.decode('utf8'))
             if dict_ret['req_status'] == Global.get_status_crawling():
                 obj_req.req_status = Global.get_status_collecting()
                 logging.info("dispatched req %s to %s success\n" % (req.req_id, node.node_ip))
             else:
                 obj_req.req_status = Global.get_status_uncompleted()
                 logging.error("dispatched req %s to %s fail\n" % (req.req_id, node.node_ip))
     finally:
         # nodes in turns: always hand the node back, even after an error
         node_queue.put(node)
예제 #2
0
 def __new__(cls):
     """Attach the request queue, node queue and request data set to the class.

     The queues come from ``QueueManager`` and the data set from
     ``DataManager``; a new instance of ``cls`` is then allocated.

     :return: the new, not yet initialised instance
     """
     queues = QueueManager()
     cls.__req_local_queue = queues.get(Global.get_queue_req())
     cls.__node_local_queue = queues.get(Global.get_queue_node())
     cls.__data_req_set = DataManager().get(Global.get_data_req())
     return object.__new__(cls)
예제 #3
0
    def manage_task(self, sock, addr):
        """Read one JSON operation message from ``sock`` and run the operation.

        The message is read in 1024-byte chunks until a short (or empty) read,
        the socket is closed, and the payload is decoded and parsed. A valid
        message is a JSON object with ``op_type`` and ``op_data``; it is
        routed to submit/cancel/pause/resume. Anything else is logged as an
        invalid or unknown operation.

        :param sock: connected socket to read the message from; always closed here
        :param addr: peer address, used only for logging
        :return: None
        """
        logging.info('go to manage task\n')
        chunks = []
        try:
            while True:
                json_data = sock.recv(1024)
                logging.info('go to manage task %s\n' % json_data)
                if json_data:
                    chunks.append(json_data)
                # a short (or empty) read is treated as the end of the message
                if len(json_data) < 1024:
                    break
        finally:
            # release the socket even if recv() fails
            sock.close()
        # Decode the whole payload at once: decoding chunk by chunk fails when
        # a multi-byte UTF-8 character is split across two reads.
        data = b''.join(chunks).decode('utf8')
        logging.info('%s\n' % data)

        try:
            dict_data = json.loads(data)
        except ValueError:
            # empty or malformed JSON is reported as an invalid message below
            dict_data = None
        if (not isinstance(dict_data, dict)
                or 'op_type' not in dict_data
                or 'op_data' not in dict_data):
            logging.error('invalid operation message.\n ')
        else:
            logging.info('submit task %s\n' % dict_data)
            op_type = dict_data['op_type']
            op_data = dict_data['op_data']
            if op_type == Global.get_op_submit():
                self.submit_task(op_data)
            elif op_type == Global.get_op_cancel():
                self.cancel_task(op_data)
            elif op_type == Global.get_op_pause():
                self.pause_task(op_data)
            elif op_type == Global.get_op_resume():
                self.resume_task(op_data)
            else:
                logging.error('unknown operation %s\n' % op_type)
        logging.info('received task %s from %s' % (data, addr))
예제 #4
0
파일: data_manager.py 프로젝트: jiegerl/DTS
 def get(self, data_type):
     """Return the data set registered for ``data_type``.

     :param data_type: one of the task/req/res/node data type constants from ``Global``
     :return: the matching data set, or None for any other value
     """
     if data_type == Global.get_data_task():
         return self.__data_task_set
     if data_type == Global.get_data_req():
         return self.__data_req_set
     if data_type == Global.get_data_res():
         return self.__data_res_set
     if data_type == Global.get_data_node():
         return self.__data_node_set
     # unrecognised type
     return None
예제 #5
0
 def send_node_msg(self, obj_node):
     """Build a node message from ``obj_node`` and pass it to ``send_comm_msg``.

     :param obj_node: object exposing ``node_ip`` and ``node_status``
     :return: always 0
     """
     self.send_comm_msg({
         'message_type': Global.get_msg_node(),
         'node_ip': obj_node.node_ip,
         'node_status': obj_node.node_status,
     })
     return 0
예제 #6
0
    def process_task(op_type, op_data):
        """
        Build the message for a task operation, send it over the command
        connection and report whether the peer completed it.

        :param op_type: submit, cancel, pause or resume
        :param op_data: dict data of operation; every operation except submit
                        must carry 'task_id'
        :return: 0 when the reply reports the completed status, -1 otherwise
                 (missing 'task_id', unknown operation, no message built,
                 empty reply or a non-completed status)
        """
        if op_type == Global.get_op_submit():
            request_msg = TaskProcessor.submit_task(op_data)
            logging.info('submit task %s\n' % request_msg)
        elif 'task_id' not in op_data.keys():
            # every operation other than submit needs a task id
            return -1
        else:
            task_id = op_data['task_id']
            if op_type == Global.get_op_cancel():
                request_msg = TaskProcessor.cancel_task(task_id)
            elif op_type == Global.get_op_pause():
                request_msg = TaskProcessor.pause_task(task_id)
            elif op_type == Global.get_op_resume():
                request_msg = TaskProcessor.resume_task(task_id)
            else:
                logging.error('unknown operation: %s\n' % op_type)
                return -1

        if request_msg is None:
            return -1

        request_msg['op_type'] = op_type
        conn = TCPManager().get_cmd_connect()
        conn.send(json.dumps(request_msg).encode('utf-8'))
        raw_reply = conn.recv(1024)
        if not raw_reply:
            # empty response message
            return -1

        reply = json.loads(raw_reply.decode('utf8'))
        if reply['op_status'] == Global.get_status_completed():
            # on success, log what the peer echoed back
            logging.info("executed operation %s to task %s success\n" % (reply['op_type'], reply['op_data']))
            return 0

        logging.error("executed operation %s to task %s fail\n" % (op_type, op_data))
        return -1
예제 #7
0
    def process_cmd(self):
        """Parse the stored command-line arguments and run each requested action.

        Supported options: ``-h`` prints usage and exits with status 0,
        ``-s <taskfile>`` submits a task, ``-c``/``-p``/``-r <taskid>``
        cancel/pause/resume a task, and ``-n s|c`` starts a Scheduler or a
        WebSpider. Arguments that getopt rejects print an error and exit with
        status 2.

        :return: None
        """
        try:
            opts, _ = getopt.getopt(self.__argv, "hs:c:p:r:n:")
        except getopt.GetoptError:
            print('unknown args!')
            sys.exit(2)

        for opt, arg in opts:
            if opt == '-h':
                print("Usage:")
                print("opt_processor.py -s <taskfile>")
                print("opt_processor.py -c <taskid>")
                print("opt_processor.py -p <taskid>")
                print("opt_processor.py -r <taskid>")
                sys.exit(0)
            elif opt == "-s":  # submit
                TaskProcessor.process_task(Global.get_op_submit(), {'file_path': arg})
            elif opt == '-p':  # pause
                # previously sent the cancel operation, so pausing cancelled the task
                TaskProcessor.process_task(Global.get_op_pause(), {'task_id': arg})
            elif opt == '-c':  # cancel
                TaskProcessor.process_task(Global.get_op_cancel(), {'task_id': arg})
            elif opt == '-r':  # resume
                TaskProcessor.process_task(Global.get_op_resume(), {'task_id': arg})
            elif opt == '-n':  # start a node: 's' scheduler, 'c' web spider
                if arg == 's':
                    Scheduler().start()
                elif arg == 'c':
                    WebSpider().start()
                else:
                    print('unknown args!')
            else:
                print('unknown args!')
예제 #8
0
 def receive_message(self, json_data, addr):
     """Decode an incoming message, queue it by type and process it.

     Node messages go to the node message queue and result messages to the
     result message queue; the matching ``process_*_message`` method is
     called right after queuing. Other message types are logged and skipped.

     :param json_data: UTF-8 encoded JSON bytes containing 'message_type'
     :param addr: sender address, stored with the message under 'ip'
     :return: None
     """
     text = json_data.decode('utf8')
     logging.info('received message[%s]' % text)
     payload = json.loads(text)
     envelope = {'ip': addr, 'msg': payload}
     kind = payload['message_type']
     if kind == Global.get_msg_node():
         self.__node_msg_queue.put(envelope)
         self.process_node_message()
         logging.info('processed node message[%s]' % envelope)
         return
     if kind == Global.get_msg_res():
         self.__res_msg_queue.put(envelope)
         self.process_res_message()
         logging.info('processed res message[%s]' % envelope)
         return
     logging.error('skipped message[%s]' % envelope)
예제 #9
0
 def send_res_msg(self, obj_res):
     """Build a result message from ``obj_res`` and pass it to ``send_comm_msg``.

     :param obj_res: object exposing ``task_id``, ``req_id``, ``req_status``,
                     ``pages_count`` and ``pages_args``
     :return: always 0
     """
     self.send_comm_msg({
         'message_type': Global.get_msg_res(),
         'task_id': obj_res.task_id,
         'req_id': obj_res.req_id,
         'req_status': obj_res.req_status,
         'pages_count': obj_res.pages_count,
         'pages_args': obj_res.pages_args,
     })
     return 0
예제 #10
0
 def process_node_message(self):
     """Take one node message from the queue and register the node it describes.

     The node is stored in the node data set under its IP and also put on
     the node queue.

     :return: None
     """
     body = self.__node_msg_queue.get()['msg']
     spider_node = WebSpiderNode(body['node_ip'], body['node_status'])
     known_nodes = DataManager().get(Global.get_data_node())
     known_nodes[spider_node.node_ip] = spider_node
     self.__node_queue.put(spider_node)
예제 #11
0
    def separate_task(self):
        """Take one task from the task queue and split it into requests.

        Only tasks whose ``task_type`` converts to 0 are handled; any other
        task is taken from the queue and left untouched. For each year in
        ``[from_year, to_year)`` the page range ``[from_page, to_page)`` is
        cut into batches of at most five pages, and one request per batch is
        queued and recorded in the request set. The task is then marked as
        dispatching.

        :return: None
        """
        task = self.__task_queue.get()  # waits here while the queue is empty
        if int(task.task_type) != 0:
            return

        conf = configparser.ConfigParser()
        conf.read(task.task_path)
        from_year = conf.getint('extend', 'from_year')
        to_year = conf.getint('extend', 'to_year')
        from_page = conf.getint('extend', 'from_page')
        to_page = conf.getint('extend', 'to_page')
        # the URL template is hard-coded rather than read from the 'base' section
        template_url = 'http://www.cbooo.cn/Mdata/getMdata_movie?area=50&year=%s&pIndex=%s'
        logging.info("separating task's year from %s to %s and its page from %d to %s\n" % (
            from_year, to_year, from_page, to_page))

        pages_per_req = 5
        req_count = 0
        for year in range(from_year, to_year):
            for first_page in range(from_page, to_page, pages_per_req):
                req = Request()
                req.task_id = task.task_id
                req.req_id = task.task_id + "_" + str(req_count)
                req.req_type = task.task_type
                # the last batch of a year may hold fewer pages
                req.urls_count = min(to_page - first_page, pages_per_req)
                dict_args = {
                    'from_year': year,
                    'to_year': year + 1,
                    'from_page': first_page,
                    'to_page': first_page + req.urls_count,
                    'template_url': template_url,
                }
                req_count += 1
                req.urls_args = dict_args
                req.req_status = Global.get_status_dispatching()
                self.__req_queue.put(req)
                self.__req_set[req.req_id] = req
                logging.info("separated task %s into req %s\n" % (task.task_id, req.req_id))

        self.__task_set[task.task_id].task_status = Global.get_status_dispatching()
예제 #12
0
 def __new__(cls):
     """Attach the message queues, node/result queues and a data set to the class.

     The queues come from ``QueueManager`` and the data set from
     ``DataManager``; a new instance of ``cls`` is then allocated.

     :return: the new, not yet initialised instance
     """
     queues = QueueManager()
     cls.__nd_msg_queue = queues.get(Global.get_msg_node())
     cls.__rs_msg_queue = queues.get(Global.get_msg_res())
     cls.__rq_msg_queue = queues.get(Global.get_msg_req())
     cls.__n_queue = queues.get(Global.get_queue_node())
     cls.__res_local_queue = queues.get(Global.get_queue_res())
     # NOTE(review): the attribute is named "res" but is loaded with the
     # *req* data type -- confirm whether get_data_res() was intended.
     cls.__data_res_set = DataManager().get(Global.get_data_req())
     return object.__new__(cls)
예제 #13
0
파일: req_recevier.py 프로젝트: jiegerl/DTS
 def receive_request(self, sock, addr):
     """Read one JSON request from ``sock``, queue it and acknowledge it.

     An empty read is logged and ignored; previously it fell through to
     JSON parsing and raised. Otherwise the request is rebuilt as a
     ``Request``, put on the request queue and answered with the crawling
     status. The socket is closed on every path, including failures.

     :param sock: connected socket carrying the request; always closed here
     :param addr: peer address, used only for logging
     :return: None
     :raises ValueError: if the payload is not valid UTF-8 JSON
     :raises KeyError: if a required request field is missing
     """
     try:
         json_req = sock.recv(1024)
         if not json_req:
             logging.info("received empty request %s" % json_req)
             return
         dict_req = json.loads(json_req.decode('utf8'))
         obj_req = Request()
         obj_req.task_id = dict_req['task_id']
         obj_req.req_id = dict_req['req_id']
         obj_req.req_type = dict_req['req_type']
         obj_req.urls_count = dict_req['urls_count']
         obj_req.urls_args = dict_req['urls_args']
         self.__req_queue.put(obj_req)
         # acknowledge the request with the crawling status
         json_ret = json.dumps({'req_status': Global.get_status_crawling()})
         sock.send(json_ret.encode('utf8'))
     finally:
         # release the socket even when parsing or sending fails
         sock.close()
     logging.info("received request %s from %s" % (dict_req, addr))
예제 #14
0
    def submit_task(self, op_data):
        """
        Create a task from an XML task file, record it and queue it.

        The new ``Task`` is used as the SAX content handler while the file is
        parsed, then stored in the task set under its id and put on the task
        queue.

        :param op_data: dict holding the task file location under 'file_path'
        :return: None
        """
        task_file = op_data['file_path']

        new_task = Task()
        # NOTE(review): the id is a fixed '0', so every submitted task uses the same key
        new_task.task_id = '0'
        new_task.task_status = Global.get_status_separating()

        sax_parser = xml.sax.make_parser()
        sax_parser.setFeature(xml.sax.handler.feature_namespaces, 0)
        sax_parser.setContentHandler(new_task)
        sax_parser.parse(task_file)

        self.__task_set[new_task.task_id] = new_task
        self.__task_queue.put(new_task)
        logging.info("submitted task %s\n" % new_task.task_name)
예제 #15
0
    def start(self):
        """Start the spider's components unless the instance flag is already set.

        On first start the flag is set, the logger is executed, a node message
        for 127.0.0.1 with the active status is sent, and the request receiver
        and request downloader are started.

        :return: None
        """
        if self.__instance:
            print("Spider is already running...")
            return

        print("Spider is already starting...")
        self.__instance = 1
        Logger('c').execute()

        # send a node message for this node
        MsgProcessor().send_node_msg(WebSpiderNode('127.0.0.1', Global.get_status_active()))

        ReqReceiver('127.0.0.1').start()

        # NOTE: ReqPreprocessor is not started here.

        ReqDownloader().start()
예제 #16
0
 def get(self, queue_type):
     """Return the queue for ``queue_type``, creating and caching it on first use.

     :param queue_type: one of the queue or message type constants from ``Global``
     :return: the cached or newly created queue, or None for any other value
     """
     if queue_type in self.__queue_set.keys():
         return self.__queue_set.get(queue_type)

     if queue_type == Global.get_queue_task():
         created = TaskQueue()
     elif queue_type == Global.get_queue_req():
         created = ReqQueue()
     elif queue_type == Global.get_queue_res():
         created = ResQueue()
     elif queue_type == Global.get_queue_node():
         created = NodeQueue()
     elif (queue_type == Global.get_msg_node()
           or queue_type == Global.get_msg_req()
           or queue_type == Global.get_msg_res()):
         # all three message types use the same queue class
         created = MsgQueue()
     else:
         return None

     self.__queue_set[queue_type] = created
     return created
예제 #17
0
파일: req_recevier.py 프로젝트: jiegerl/DTS
 def __new__(cls, *args, **kwargs):
     """Attach the request queue from ``QueueManager`` to the class and allocate an instance.

     Constructor arguments are accepted but not used here.

     :return: the new, not yet initialised instance
     """
     cls.__request_queue = QueueManager().get(Global.get_queue_req())
     return object.__new__(cls)
예제 #18
0
 def __init__(self):
     """Bind the node queue and start with an empty node set and a zero node count."""
     manager = QueueManager()
     # NOTE(review): raises AttributeError if the manager returns None for this type
     self.__node_queue = manager.get(Global.get_node_all_type()).queue
     self.__node_set = {}
     self.__node_cnt = 0
예제 #19
0
 def __new__(cls):
     """Attach the task queue and the task data set to the class, then allocate an instance.

     :return: the new, not yet initialised instance
     """
     cls.__task_local_queue = QueueManager().get(Global.get_queue_task())
     cls.__data_task_set = DataManager().get(Global.get_data_task())
     return object.__new__(cls)