Пример #1
0
    def _crawl(self, url, **kwargs):
        """Build a task for ``url``, append it to ``self._follows`` and return it.

        Keyword arguments are sorted into the ``schedule``, ``fetch`` and
        ``process`` sections of the task. Options missing from ``kwargs`` are
        filled in from the callback's ``_config`` attribute first and then from
        ``self.crawl_config`` (when those exist).

        :param url: target URL; it is stripped, combined with ``params`` and
            passed through ``quote_chinese``.
        :param kwargs: crawl options. ``callback`` may be the name of a method
            of this handler or a method bound to it. ``taskid`` overrides the
            default task id (``md5string`` of the final URL).
        :returns: the task dict that was appended to ``self._follows``.
        :raises NotImplementedError: if ``callback`` is neither the name of an
            attribute of ``self`` nor a method bound to ``self``.
        """
        task = {}

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, basestring) and hasattr(self, callback):
                func = getattr(self, callback)
            elif hasattr(callback, 'im_self') and callback.im_self is self:
                func = callback
                # the task carries the callback by name, not as a bound method
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for k, v in func._config.iteritems():
                    kwargs.setdefault(k, v)

        if hasattr(self, 'crawl_config'):
            for k, v in self.crawl_config.iteritems():
                kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
        if kwargs.get('files'):
            assert isinstance(kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                            kwargs.get('files', {}))
            # Work on a copy: the headers object may be the very dict stored in
            # crawl_config / the callback's _config (setdefault above shares it),
            # and the multipart Content-Type must not leak into later requests.
            # A missing or None headers value starts from an empty dict.
            headers = kwargs.get('headers')
            headers = headers.copy() if headers else {}
            headers['Content-Type'] = content_type
            kwargs['headers'] = headers
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        # checked again because the encoded payload may be empty
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'):
            if key in kwargs and kwargs[key] is not None:
                schedule[key] = kwargs[key]
        if schedule:
            task['schedule'] = schedule

        fetch = {}
        # NOTE(review): 'last_modifed' is spelled this way in the original key
        # list; keep it unless the consumer of the task is confirmed to expect
        # a different spelling.
        for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies', 'proxy', 'etag', 'last_modifed', 'save', 'js_run_at', 'js_script', 'load_images', 'fetch_type'):
            if key in kwargs and kwargs[key] is not None:
                fetch[key] = kwargs[key]
        if fetch:
            task['fetch'] = fetch

        process = {}
        for key in ('callback', ):
            if key in kwargs and kwargs[key] is not None:
                process[key] = kwargs[key]
        if process:
            task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        # Honour a caller-supplied taskid; previously it was looked up in the
        # freshly created (empty) task dict and therefore always ignored.
        task['taskid'] = kwargs.get('taskid') or md5string(url)

        self._follows.append(task)
        return task
Пример #2
0
    def _update_project(self, project):
        '''Merge one project record into local state and sync its task queue.

        The record is merged into ``self.projects[name]`` together with the md5
        of its script, and an ``active_tasks`` deque is created when the entry
        has none (or an empty one).
        '''
        name = project['name']
        if name not in self.projects:
            self.projects[name] = {}
        info = self.projects[name]
        info.update(project)
        info['md5sum'] = utils.md5string(project['script'])
        if not info.get('active_tasks', None):
            info['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

        if project['status'] not in ('RUNNING', 'DEBUG'):
            # project is stopped: zero the limits, then drop its task queue
            if name in self.task_queue:
                stopped_queue = self.task_queue[name]
                stopped_queue.rate = 0
                stopped_queue.burst = 0
                del self.task_queue[name]
            return

        # project is running: load its task queue on first sight
        if name not in self.task_queue:
            self._load_tasks(name)
        running_queue = self.task_queue[name]
        running_queue.rate = project['rate']
        running_queue.burst = project['burst']

        # update project runtime info from processor by sending a _on_get_info
        # request, result is in status_page.track.save
        get_info_task = {
            'taskid': '_on_get_info',
            'project': name,
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': {
                'save': ['min_tick', ],
            },
            'process': {
                'callback': '_on_get_info',
            },
        }
        self.on_select_task(get_info_task)
Пример #3
0
    def _crawl(self, url, **kwargs):
        """Build a task for ``url``, append it to ``self._follows`` and return it.

        Keyword arguments are sorted into the ``schedule``, ``fetch`` and
        ``process`` sections of the task. Options missing from ``kwargs`` are
        filled in from the callback's ``_config`` attribute first and then from
        ``self.crawl_config`` (when those exist).

        :param url: target URL; it is stripped, combined with ``params`` and
            passed through ``quote_chinese``.
        :param kwargs: crawl options. ``callback`` may be the name of a method
            of this handler or a method bound to it. ``taskid`` overrides the
            default task id (``md5string`` of the final URL).
        :returns: the task dict that was appended to ``self._follows``.
        :raises NotImplementedError: if ``callback`` is neither the name of an
            attribute of ``self`` nor a method bound to ``self``.
        """
        task = {}

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, basestring) and hasattr(self, callback):
                func = getattr(self, callback)
            elif hasattr(callback, 'im_self') and callback.im_self is self:
                func = callback
                # the task carries the callback by name, not as a bound method
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for k, v in func._config.iteritems():
                    kwargs.setdefault(k, v)

        if hasattr(self, 'crawl_config'):
            for k, v in self.crawl_config.iteritems():
                kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
        if kwargs.get('files'):
            assert isinstance(kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                            kwargs.get('files', {}))
            # Work on a copy: the headers object may be the very dict stored in
            # crawl_config / the callback's _config (setdefault above shares it),
            # and the multipart Content-Type must not leak into later requests.
            # A missing or None headers value starts from an empty dict.
            headers = kwargs.get('headers')
            headers = headers.copy() if headers else {}
            headers['Content-Type'] = content_type
            kwargs['headers'] = headers
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        # checked again because the encoded payload may be empty
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'):
            if key in kwargs and kwargs[key] is not None:
                schedule[key] = kwargs[key]
        if schedule:
            task['schedule'] = schedule

        fetch = {}
        # NOTE(review): 'last_modifed' is spelled this way in the original key
        # list; keep it unless the consumer of the task is confirmed to expect
        # a different spelling.
        for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies', 'proxy', 'etag', 'last_modifed', 'save', 'js_run_at', 'js_script', 'load_images', 'fetch_type'):
            if key in kwargs and kwargs[key] is not None:
                fetch[key] = kwargs[key]
        if fetch:
            task['fetch'] = fetch

        process = {}
        for key in ('callback', ):
            if key in kwargs and kwargs[key] is not None:
                process[key] = kwargs[key]
        if process:
            task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        # Honour a caller-supplied taskid; previously it was looked up in the
        # freshly created (empty) task dict and therefore always ignored.
        task['taskid'] = kwargs.get('taskid') or md5string(url)

        self._follows.append(task)
        return task
Пример #4
0
    def _update_project(self, project):
        '''Merge one project record into local state and sync its task queue.

        The record is merged into ``self.projects[name]`` together with the md5
        of its script, and an ``active_tasks`` deque is created when the entry
        has none (or an empty one).
        '''
        name = project['name']
        if name not in self.projects:
            self.projects[name] = {}
        info = self.projects[name]
        info.update(project)
        info['md5sum'] = utils.md5string(project['script'])
        if not info.get('active_tasks', None):
            info['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

        if project['status'] not in ('RUNNING', 'DEBUG'):
            # project is stopped: zero the limits, then drop its task queue
            if name in self.task_queue:
                stopped_queue = self.task_queue[name]
                stopped_queue.rate = 0
                stopped_queue.burst = 0
                del self.task_queue[name]
            return

        # project is running: load its task queue on first sight
        if name not in self.task_queue:
            self._load_tasks(name)
        running_queue = self.task_queue[name]
        running_queue.rate = project['rate']
        running_queue.burst = project['burst']

        # update project runtime info from processor by sending a _on_get_info
        # request, result is in status_page.track.save
        get_info_task = {
            'taskid': '_on_get_info',
            'project': name,
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': {
                'save': ['min_tick', 'retry_delay'],
            },
            'process': {
                'callback': '_on_get_info',
            },
        }
        self.on_select_task(get_info_task)
Пример #5
0
    def crawl_list_page(self, response):
        
        db_cookie = self.get_cookie() or {}
        r_cookie = response.cookies
        
        print db_cookie
        print r_cookie
        
        db_ctime = int(db_cookie.get('ctime', 0))
        r_ctime = int(r_cookie.get('ctime', 0))
        
        if self.check_captcha(response):
            if db_ctime <= r_ctime:
                db_cookie = self.verify_vcode(response)
                if not db_cookie:
                    raise Exception('sougou_weixin refresh cookies fail!')
            
            #self.crawl(response.url, callback=self.crawl_list_page, cookies=db_cookie, save=response.save, force_update=True)
        else:
            # response.cookies.update(cookies)
            # 更新cookies 会导致无法转跳到 detail页面

            for each in response.doc(self.LIST_ANCHOR_SEL).items():
                taskid = md5string(each.text())
                self.crawl(each.attr.href, taskid=taskid, callback=self.detail_page, save=response.save, cookies=response.cookies)
Пример #6
0
    def update(self, project_info):
        """Refresh this project's cached fields and queue limits from ``project_info``.

        A changed script md5 marks the project as waiting for info; while it is
        waiting and active, ``_send_on_get_info`` is raised. The task queue gets
        the configured rate/burst when the project is active and 0/0 otherwise.
        """
        self.project_info = project_info

        self.name = project_info['name']
        self.group = project_info['group']
        self.db_status = project_info['status']
        self.updatetime = project_info['updatetime']

        script_digest = utils.md5string(project_info['script'])
        if self.md5sum != script_digest:
            # script changed since the last update
            self.waiting_get_info = True
            self.md5sum = script_digest
        if self.waiting_get_info and self.active:
            self._send_on_get_info = True

        queue = self.task_queue
        if self.active:
            queue.rate = project_info['rate']
            queue.burst = project_info['burst']
        else:
            # inactive projects must not emit tasks
            queue.rate = queue.burst = 0

        logger.info('project %s updated, status:%s, paused:%s, %d tasks',
                    self.name, self.db_status, self.paused,
                    len(self.task_queue))
Пример #7
0
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send `message` to `project` from the command line.

    `scheduler_rpc` may be an RPC object, an address string to connect to, or
    None. For None the address is taken from the SCHEDULER_PORT_23333_TCP
    environment variable when SCHEDULER_NAME is set, falling back to
    http://127.0.0.1:23333/. Returns the result of `send_task`.
    """
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # the variable looks like "tcp://host:port"; drop the scheme prefix
        linked_addr = os.environ['SCHEDULER_PORT_23333_TCP']
        host_port = linked_addr[len('tcp://'):]
        scheduler_rpc = connect_rpc(ctx, None, 'http://%s/' % host_port)
    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')

    message_task = {
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    }
    return scheduler_rpc.send_task(message_task)
Пример #8
0
    def _update_project(self, project):
        """Merge one project record into local state and sync its task queue.

        The record is merged into ``self.projects[name]`` together with the md5
        of its script, and an ``active_tasks`` deque is created when the entry
        has none (or an empty one).
        """
        name = project["name"]
        if name not in self.projects:
            self.projects[name] = {}
        info = self.projects[name]
        info.update(project)
        info["md5sum"] = utils.md5string(project["script"])
        if not info.get("active_tasks", None):
            info["active_tasks"] = deque(maxlen=self.ACTIVE_TASKS)

        if project["status"] not in ("RUNNING", "DEBUG"):
            # project is stopped: zero the limits, then drop its task queue
            if name in self.task_queue:
                stopped_queue = self.task_queue[name]
                stopped_queue.rate = 0
                stopped_queue.burst = 0
                del self.task_queue[name]
            return

        # project is running: load its task queue on first sight
        if name not in self.task_queue:
            self._load_tasks(name)
        running_queue = self.task_queue[name]
        running_queue.rate = project["rate"]
        running_queue.burst = project["burst"]

        # update project runtime info from processor by sending a _on_get_info
        # request, result is in status_page.track.save
        get_info_task = {
            "taskid": "_on_get_info",
            "project": name,
            "url": "data:,_on_get_info",
            "status": self.taskdb.SUCCESS,
            "fetch": {"save": ["min_tick"]},
            "process": {"callback": "_on_get_info"},
        }
        self.on_select_task(get_info_task)
Пример #9
0
    def save(self, project, taskid, url, result, capture_phase):
        tablename = self._tablename(project)
        #result[0] is primary key result[1] is the dictionary
        key = result[0]
        items = result[1]
        if project not in self.projects:
            self._create_project(project, key, items)
            self._list_project()
        obj = None
        obj_result = result[1]
        if key and key not in ['taskid', 'url']:
            obj = {
                key: str(obj_result[key][0] if isinstance(obj_result[key], list) else obj_result[key]),
                'taskid': taskid,
                'url': url,
                'updatetime': time.time(),
            }
            obj_result.update(obj)
            key_value = obj_result[key]
        else:
            obj = {
                'taskid': taskid,
                'url': url,
                'updatetime': time.time(),
            }
            suid = ''
            for k in obj_result.keys():
                suid += str(obj_result[k][0] if isinstance(obj_result[k], list) else obj_result[k])
            suid = utils.md5string(suid)
            obj_result.update(obj)
            obj_result['suid'] = suid
            key_value = suid
            key = 'suid'
            
        for o in obj_result.keys():
            if obj_result[o] == None:
                obj_result[o] = ''
            else:
                obj_result[o] = str(obj_result[o][0] if isinstance(obj_result[o], list) else obj_result[o] )
        try:
            obj_result_copy = copy.deepcopy(obj_result)
            del obj_result_copy['updatetime']
            
            fields = tuple(obj_result_copy.keys())
            tasks = self.get(project, key, key_value, fields)
            if tasks:
                print 'cunzai   ', obj_result_copy

                where = "%s = %s" % (self.escape(key), self.placeholder)
                where_values=[obj_result_copy[key]]
                return self._update(tablename, where=where, where_values=[obj_result_copy[key]], **self._stringify(obj_result_copy))
            else:
                print '不存在', obj_result
                return self._replace(tablename, **self._stringify(obj_result))
        except Exception, e:
            print str(e)
Пример #10
0
 def test_put(n):
     """Benchmark: put ``n`` tasks on the message queue and log the timing."""
     logger.info("message queue put %d", n)
     began = time.time()
     for index in range(n):
         url = 'http://bench.pyspider.org/?l=%d' % index
         task['url'] = url
         task['taskid'] = md5string(url)
         queue.put(task, block=True, timeout=1)
     elapsed = time.time() - began
     # throughput in tasks per second and average latency in milliseconds
     per_second = n * 1.0 / elapsed
     per_task_ms = elapsed / n * 1000
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 elapsed, per_second, per_task_ms)
Пример #11
0
 def test_put(n):
     """Benchmark: put ``n`` tasks on the message queue and log the timing."""
     logger.info("message queue put %d", n)
     began = time.time()
     for index in range(n):
         url = 'http://bench.pyspider.org/?l=%d' % index
         task['url'] = url
         task['taskid'] = md5string(url)
         queue.put(task, block=True, timeout=1)
     elapsed = time.time() - began
     # throughput in tasks per second and average latency in milliseconds
     per_second = n * 1.0 / elapsed
     per_task_ms = elapsed / n * 1000
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 elapsed, per_second, per_task_ms)
Пример #12
0
 def test_insert(n, start=0):
     """Benchmark: insert ``n`` tasks (URL index offset by ``start``) into taskdb."""
     logger.info("taskdb insert %d", n)
     began = time.time()
     for index in range(n):
         url = 'http://bench.pyspider.org/?l={0:d}'.format((index + start))
         task['url'] = url
         task['taskid'] = md5string(url)
         # every inserted task starts with a fresh, empty track
         task['track'] = {}
         taskdb.insert(task['project'], task['taskid'], task)
     elapsed = time.time() - began
     # throughput in tasks per second and average latency in milliseconds
     per_second = n * 1.0 / elapsed
     per_task_ms = elapsed / n * 1000
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 elapsed, per_second, per_task_ms)
Пример #13
0
 def test_update(n, start=0):
     """Benchmark: update ``n`` tasks (URL index offset by ``start``) in taskdb."""
     logger.info("taskdb update %d" % n)
     began = time.time()
     for index in range(n):
         url = 'http://bench.pyspider.org/?l=%d' % (index + start)
         task['url'] = url
         task['taskid'] = md5string(url)
         task['track'] = track
         taskdb.update(task['project'], task['taskid'], task)
     elapsed = time.time() - began
     # throughput in tasks per second and average latency in milliseconds
     per_second = n * 1.0 / elapsed
     per_task_ms = elapsed / n * 1000
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 elapsed, per_second, per_task_ms)
Пример #14
0
 def test_update(n, start=0):
     """Benchmark: update ``n`` tasks (URL index offset by ``start``) in taskdb."""
     logger.info("taskdb update %d" % n)
     began = time.time()
     for index in range(n):
         url = 'http://bench.pyspider.org/?l=%d' % (index + start)
         task['url'] = url
         task['taskid'] = md5string(url)
         task['track'] = track
         taskdb.update(task['project'], task['taskid'], task)
     elapsed = time.time() - began
     # throughput in tasks per second and average latency in milliseconds
     per_second = n * 1.0 / elapsed
     per_task_ms = elapsed / n * 1000
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 elapsed, per_second, per_task_ms)
Пример #15
0
    def _update_project(self, project):
        """Update one project: merge its record into local state and sync its queue.

        The record is first passed through ``rebuild_project``; if that raises,
        the error is logged and nothing else is updated. Otherwise the record
        (plus the md5 of its script) is merged into ``self.projects[name]`` and
        an ``active_tasks`` deque is created when the entry has none; the deque
        is unbounded when ``ACTIVE_TASKS`` is 0.

        For RUNNING/DEBUG projects the task queue is loaded if necessary, its
        rate/burst are set and an ``_on_get_info`` task is dispatched. For any
        other status the queue is removed and the project's counters are
        initialised if they are missing.
        """
        # NOTE(review): the empty entry is created before rebuild_project runs,
        # so a failed rebuild leaves an empty stub behind; confirm intended.
        if project['name'] not in self.projects:
            self.projects[project['name']] = {}
        try:
            project = rebuild_project(project)
        except Exception as e:
            logger.error("update project fail: %s", e)
            return
        self.projects[project['name']].update(project)
        self.projects[project['name']]['md5sum'] = utils.md5string(
            project['script'])
        if not self.projects[project['name']].get('active_tasks', None):
            if self.ACTIVE_TASKS == 0:
                self.projects[project['name']]['active_tasks'] = deque()
            else:
                self.projects[project['name']]['active_tasks'] = deque(
                    maxlen=self.ACTIVE_TASKS)
        # load task queue when project is running and delete task_queue when project is stopped
        if project['status'] in ('RUNNING', 'DEBUG'):
            if project['name'] not in self.task_queue:
                self._load_tasks(project['name'])
            self.task_queue[project['name']].rate = project['rate']
            self.task_queue[project['name']].burst = project['burst']

            # update project runtime info from processor by sending a _on_get_info
            # request, result is in status_page.track.save
            self.on_select_task({
                'taskid': '_on_get_info',
                'project': project['name'],
                'url': 'data:,_on_get_info',
                'status': self.taskdb.SUCCESS,
                'fetch': {
                    'save': ['min_tick', 'retry_delay'],
                },
                'process': {
                    'callback': '_on_get_info',
                },
            })
            logger.info('get info of project: %s', project['name'])
        else:
            if project['name'] in self.task_queue:
                self.task_queue[project['name']].rate = 0
                self.task_queue[project['name']].burst = 0
                del self.task_queue[project['name']]
                logger.info('delete queue of project: %s', project['name'])
            # Look the counter up by project name: `project` itself is a dict,
            # which is unhashable and never a key of the counters.
            if project['name'] not in self._cnt['all']:
                self._update_project_cnt(project['name'])
Пример #16
0
 def test_get(n, start=0, random=True, fields=request_task_fields):
     """Benchmark: fetch ``n`` tasks from taskdb, in shuffled order when ``random``."""
     mode = "randomly" if random else ""
     logger.info("taskdb get %d %s" % (n, mode))
     offsets = list(range(n))
     if random:
         from random import shuffle
         shuffle(offsets)
     began = time.time()
     for index in offsets:
         url = 'http://bench.pyspider.org/?l=%d' % (index + start)
         task['url'] = url
         task['taskid'] = md5string(url)
         task['track'] = track
         taskdb.get_task(task['project'], task['taskid'], fields=fields)
     elapsed = time.time() - began
     # throughput in tasks per second and average latency in milliseconds
     per_second = n * 1.0 / elapsed
     per_task_ms = elapsed / n * 1000
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 elapsed, per_second, per_task_ms)
Пример #17
0
 def test_get(n, start=0, random=True, fields=request_task_fields):
     """Benchmark: fetch ``n`` tasks from taskdb, in shuffled order when ``random``."""
     mode = "randomly" if random else ""
     logger.info("taskdb get %d %s" % (n, mode))
     offsets = list(range(n))
     if random:
         from random import shuffle
         shuffle(offsets)
     began = time.time()
     for index in offsets:
         url = 'http://bench.pyspider.org/?l=%d' % (index + start)
         task['url'] = url
         task['taskid'] = md5string(url)
         task['track'] = track
         taskdb.get_task(task['project'], task['taskid'], fields=fields)
     elapsed = time.time() - began
     # throughput in tasks per second and average latency in milliseconds
     per_second = n * 1.0 / elapsed
     per_task_ms = elapsed / n * 1000
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 elapsed, per_second, per_task_ms)
Пример #18
0
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send `message` to `project` from the command line.

    `scheduler_rpc` may be an RPC object, an address string to connect to, or
    None. For None the address is taken from the SCHEDULER_PORT_23333_TCP
    environment variable when SCHEDULER_NAME is set, falling back to
    http://127.0.0.1:23333/. Returns the result of `send_task`.
    """
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get("SCHEDULER_NAME"):
        # the variable looks like "tcp://host:port"; drop the scheme prefix
        linked_addr = os.environ["SCHEDULER_PORT_23333_TCP"]
        host_port = linked_addr[len("tcp://") :]
        scheduler_rpc = connect_rpc(ctx, None, "http://%s/" % host_port)
    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, "http://127.0.0.1:23333/")

    message_task = {
        "taskid": utils.md5string("data:,on_message"),
        "project": project,
        "url": "data:,on_message",
        "fetch": {"save": ("__command__", message)},
        "process": {"callback": "_on_message"},
    }
    return scheduler_rpc.send_task(message_task)
Пример #19
0
    def update(self, project_info):
        """Refresh this project's cached fields and queue limits from ``project_info``.

        While the project is active, a changed script md5 or a pending info
        request raises both ``_send_on_get_info`` and ``waiting_get_info``. The
        stored md5 is always replaced. The task queue gets the configured
        rate/burst when the project is active and 0/0 otherwise.
        """
        self.project_info = project_info

        self.name = project_info['name']
        self.group = project_info['group']
        self.db_status = project_info['status']
        self.updatetime = project_info['updatetime']

        script_digest = utils.md5string(project_info['script'])
        script_changed = self.md5sum != script_digest
        if (script_changed or self.waiting_get_info) and self.active:
            self._send_on_get_info = True
            self.waiting_get_info = True
        self.md5sum = script_digest

        queue = self.task_queue
        if self.active:
            queue.rate = project_info['rate']
            queue.burst = project_info['burst']
        else:
            # inactive projects must not emit tasks
            queue.rate = queue.burst = 0
Пример #20
0
    def update(self, project_info):
        """Refresh this project's cached fields and queue limits from ``project_info``.

        While the project is active, a changed script md5 or a pending info
        request raises both ``_send_on_get_info`` and ``waiting_get_info``. The
        stored md5 is always replaced. The task queue gets the configured
        rate/burst when the project is active and 0/0 otherwise.
        """
        self.project_info = project_info

        self.name = project_info['name']
        self.group = project_info['group']
        self.db_status = project_info['status']
        self.updatetime = project_info['updatetime']

        script_digest = utils.md5string(project_info['script'])
        script_changed = self.md5sum != script_digest
        if (script_changed or self.waiting_get_info) and self.active:
            self._send_on_get_info = True
            self.waiting_get_info = True
        self.md5sum = script_digest

        queue = self.task_queue
        if self.active:
            queue.rate = project_info['rate']
            queue.burst = project_info['burst']
        else:
            # inactive projects must not emit tasks
            queue.rate = queue.burst = 0
Пример #21
0
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send `message` to `project` from the command line.

    `scheduler_rpc` may be an RPC object, an address string to connect to, or
    None. For None the address is taken from the SCHEDULER_PORT_23333_TCP
    environment variable when SCHEDULER_NAME is set, falling back to
    http://localhost:23333/. Returns the result of `send_task`.
    """
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # the variable looks like "tcp://host:port"; drop the scheme prefix
        linked_addr = os.environ['SCHEDULER_PORT_23333_TCP']
        host_port = linked_addr[len('tcp://'):]
        scheduler_rpc = connect_rpc(ctx, None, 'http://%s/' % host_port)
    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, 'http://localhost:23333/')

    message_task = {
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    }
    return scheduler_rpc.send_task(message_task)
Пример #22
0
 def _load_project(self, project):
     """Build ``project`` and register the result in ``self.projects``.

     On failure an error record (exception, traceback text, project info and
     load time) is stored under the project's name instead. Returns True when
     the module was built, False otherwise.
     """
     try:
         project['md5sum'] = utils.md5string(project['script'])
         built = self.build_module(project, self.env)
         self.projects[project['name']] = built
     except Exception as e:
         logger.exception("load project %s error", project.get('name', None))
         # keep a placeholder so the failure stays visible under the project name
         self.projects[project['name']] = {
             'loader': None,
             'module': None,
             'class': None,
             'instance': None,
             'exception': e,
             'exception_log': traceback.format_exc(),
             'info': project,
             'load_time': time.time(),
         }
         return False
     else:
         logger.debug('project: %s updated.', project.get('name', None))
         return True
Пример #23
0
 def _load_project(self, project):
     '''Load project into self.projects from a project info dict.

     On failure an error record (exception, traceback text, project info and
     load time) is stored under the project's name instead. Returns True when
     the module was built, False otherwise.
     '''
     try:
         project['md5sum'] = utils.md5string(project['script'])
         built = self.build_module(project, self.env)
         self.projects[project['name']] = built
     except Exception as e:
         logger.exception("load project %s error", project.get('name', None))
         # keep a placeholder so the failure stays visible under the project name
         self.projects[project['name']] = {
             'loader': None,
             'module': None,
             'class': None,
             'instance': None,
             'exception': e,
             'exception_log': traceback.format_exc(),
             'info': project,
             'load_time': time.time(),
         }
         return False
     else:
         logger.debug('project: %s updated.', project.get('name', None))
         return True
Пример #24
0
 def add_seed(self, seed_path, project, callback):
     """Queue one new task per non-blank line of the seed file.

     Each line of ``seed_path`` is stripped and used as the task URL. Tasks
     are created with ``force_update`` scheduling, ``callback`` as their
     process callback and depth 0, then put on ``self.newtask_queue`` in
     batches of at most 1000.

     :param seed_path: path of a text file with one URL per line.
     :param project: project name stored on every task.
     :param callback: callback name stored in each task's ``process`` section.
     """
     from pyspider.libs.utils import md5string

     new_tasks = []
     with open(seed_path) as fi:
         for line in fi:
             url = line.strip()
             if not url:
                 # blank line: there is no URL to crawl
                 continue
             if isinstance(url, bytes):
                 # byte strings are decoded; text lines are used as they are
                 url = url.decode('utf-8')
             new_tasks.append({
                 'url': url,
                 'project': project,
                 'status': 1,
                 'schedule': {'force_update': True},
                 'taskid': md5string(url),
                 'fetch': {},
                 'process': {'callback': callback},
                 'depth': 0,
             })
     for offset in range(0, len(new_tasks), 1000):
         batch = new_tasks[offset:offset + 1000]
         self.newtask_queue.put(
             [utils.unicode_obj(newtask) for newtask in batch])
     logger.info("add seed success for project %s!", project)
Пример #25
0
    def update(self, project_info):
        """Refresh this project's cached fields and queue limits from ``project_info``.

        While the project is active, a changed script md5 or a pending info
        request raises both ``_send_on_get_info`` and ``waiting_get_info``. The
        stored md5 is always replaced. The task queue gets the configured
        rate/burst when the project is active and 0/0 otherwise.
        """
        self.project_info = project_info

        self.name = project_info['name']
        self.group = project_info['group']
        self.db_status = project_info['status']
        self.updatetime = project_info['updatetime']

        script_digest = utils.md5string(project_info['script'])
        script_changed = self.md5sum != script_digest
        if (script_changed or self.waiting_get_info) and self.active:
            self._send_on_get_info = True
            self.waiting_get_info = True
        self.md5sum = script_digest

        queue = self.task_queue
        if self.active:
            queue.rate = project_info['rate']
            queue.burst = project_info['burst']
        else:
            # inactive projects must not emit tasks
            queue.rate = queue.burst = 0

        logger.info('project %s updated, status:%s, paused:%s, %d tasks',
                    self.name, self.db_status, self.paused, len(self.task_queue))
Пример #26
0
 def get_taskid(self, task):
     '''Return the id for ``task``: md5 of its URL by default.

     Override this method to derive task ids differently.
     '''
     task_url = task['url']
     return md5string(task_url)
Пример #27
0
    def on_task(self, task, response):
        """Run one fetched task through its project and forward the outcome.

        The result of the project's ``run`` drives three outputs: a status pack
        on ``self.status_queue`` (unless the result's ``extinfo`` sets
        ``not_send_status``), every follow-up task on ``self.newtask_queue``
        and every message on ``self.inqueue`` as an ``_on_message`` task.

        Returns False when rebuilding the response, looking up the project or
        running it raises (the exception is logged); True otherwise.
        """
        began = time.time()
        try:
            response = rebuild_response(response)
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            if project not in self.projects:
                raise LookupError("no such project: %s" % project)
            project_data = self.projects[project]
            runner = project_data['instance']
            ret = runner.run(project_data['module'], task, response)
        except Exception as e:
            logger.exception(e)
            return False
        process_time = time.time() - began

        if not ret.extinfo.get('not_send_status', False):
            fetch_track = {
                'ok': response.isok(),
                'time': response.time,
                'status_code': response.status_code,
                'headers': dict(response.headers),
                'encoding': response.encoding,
            }
            # keep a content sample only when fetching or processing failed
            if response.isok() and not ret.exception:
                fetch_track['content'] = None
            else:
                fetch_track['content'] = response.content[:500]

            process_track = {
                'ok': not ret.exception,
                'time': process_time,
                'follows': len(ret.follows),
                'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
                'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                'exception': ret.exception,
            }
            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': fetch_track,
                    'process': process_track,
                },
            }

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        for newtask in ret.follows:
            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.newtask_queue.put(utils.unicode_obj(newtask))

        for target_project, msg, url in ret.messages:
            message_task = {
                'taskid': utils.md5string(url),
                'project': target_project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }
            # the message is handed over as a ready-made response whose
            # 'save' names the sending project
            message_response = {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            }
            self.inqueue.put((message_task, message_response))

        logger_func = logger.error if (response.error or ret.exception) else logger.info
        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'],
            task.get('url'), response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True
Пример #28
0
                            'time': process_time,
                            'follows': len(ret.follows),
                            'result': unicode(ret.result)[:100],
                            'logs': ret.logstr()[-200:],
                            'exception': ret.exception,
                            },
                        },
                    })
            self.status_queue.put(status_pack)

        for newtask in ret.follows:
            self.newtask_queue.put(newtask)

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
Пример #29
0
    def on_task(self, task, response):
        '''Run one fetched task through its project and publish the outcome.

        The fetcher response is rebuilt with rebuild_response() and passed to
        the project's run_task().  Any exception raised while resolving or
        running the project is turned into a ProcessorResult carrying the
        traceback, so a status pack can still be reported.

        Unless the result's extinfo sets 'not_send_status', a status pack is
        put on status_queue.  Follow-up tasks are put on newtask_queue in
        batches of at most 1000, and every message is delivered by calling
        on_task() again with a '_on_message' callback.

        Returns True.
        '''
        started_at = time.time()
        response = rebuild_response(response)

        try:
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            wanted_updatetime = task.get('project_updatetime', None)
            wanted_md5sum = task.get('project_md5sum', None)
            project_data = self.project_manager.get(
                project, wanted_updatetime, wanted_md5sum)
            assert project_data, "no such project!"
            if not project_data.get('exception'):
                ret = project_data['instance'].run_task(
                    project_data['module'], task, response)
            else:
                # project_data carries a stored exception: report it instead
                # of running the callback.
                ret = ProcessorResult(
                    logs=(project_data.get('exception_log'), ),
                    exception=project_data['exception'])
        except Exception as e:
            ret = ProcessorResult(
                logs=(traceback.format_exc(), ), exception=e)
        process_time = time.time() - started_at

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                # On failure every response header is kept in the track.
                track_headers = dict(response.headers)
            else:
                # On success only the cache-validation headers are kept.
                track_headers = dict(
                    (name, response.headers[name])
                    for name in ('etag', 'last-modified')
                    if name in response.headers)

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
            }

            fetch_ok = response.isok()
            if response.url != response.orig_url:
                redirect_url = response.url
            else:
                redirect_url = None
            fetch_track = {
                'ok': fetch_ok,
                'redirect_url': redirect_url,
                'time': response.time,
                'error': response.error,
                'status_code': response.status_code,
                'encoding': getattr(response, '_encoding', None),
                'headers': track_headers,
                'content': response.text[:500] if ret.exception else None,
            }
            process_track = {
                'ok': not ret.exception,
                'time': process_time,
                'follows': len(ret.follows),
                'result': (
                    None if ret.result is None
                    else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]),
                'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                'exception': ret.exception,
            }
            status_pack['track'] = {
                'fetch': fetch_track,
                'process': process_track,
                'save': ret.save,
            }
            if 'schedule' in task:
                status_pack['schedule'] = task['schedule']

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            batch_size = 1000
            for offset in range(0, len(ret.follows), batch_size):
                batch = ret.follows[offset:offset + batch_size]
                self.newtask_queue.put(
                    [utils.unicode_obj(item) for item in batch])

        for target_project, message, message_url in ret.messages:
            try:
                message_task = {
                    'taskid': utils.md5string(message_url),
                    'project': target_project,
                    'url': message_url,
                    'process': {
                        'callback': '_on_message',
                    }
                }
                message_response = {
                    'status_code': 200,
                    'url': message_url,
                    'save': (task['project'], message),
                }
                self.on_task(message_task, message_response)
            except Exception:
                # A failing message must not stop the remaining ones.
                logger.exception('Sending message error.')

        logger_func = logger.error if ret.exception else logger.info
        summary = (
            'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
            % (task['project'], task['taskid'], task.get('url'),
               response.status_code, len(response.content), ret.result,
               len(ret.follows), len(ret.messages), ret.exception))
        logger_func(summary)

        return True
Пример #30
0
    def on_task(self, task, response):
        '''Run one fetched task through its project and publish the outcome.

        The fetcher response is rebuilt with rebuild_response() and passed to
        the project's run().  If the project cannot be found, or anything in
        that step raises, the problem is logged and False is returned without
        reporting a status.

        Otherwise, unless the result's extinfo sets 'not_send_status', a
        status pack is put on status_queue.  Follow-up tasks (if any) are put
        on newtask_queue as one list, and each message is put on inqueue as a
        (task, response) pair with a '_on_message' callback.

        Returns True when the task was processed, False otherwise.
        '''
        start_time = time.time()
        try:
            response = rebuild_response(response)
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('updatetime', None)
            project_data = self.project_manager.get(project, updatetime)
            if not project_data:
                logger.error("no such project: %s", project)
                return False
            ret = project_data['instance'].run(project_data['module'], task,
                                               response)
        except Exception as e:
            logger.exception(e)
            return False
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok': response.isok(),
                        'time': response.time,
                        'status_code': response.status_code,
                        'headers': dict(response.headers),
                        'encoding': response.encoding,
                        'content':
                        (response.content[:500] if ret.exception else None),
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        # Keep a missing result as None instead of recording
                        # the literal string 'None'.
                        'result': (
                            None if ret.result is None
                            else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                        ),
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                },
            }

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        # Only enqueue when there is something to schedule; an empty batch
        # would just be extra queue traffic for every task without follows.
        if ret.follows:
            self.newtask_queue.put(
                [utils.unicode_obj(newtask) for newtask in ret.follows])

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                # The sender's project name travels with the message.
                'save': (task['project'], msg),
            }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func(
            'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
            % (task['project'], task['taskid'], task.get('url'),
               response.status_code, len(response.content), ret.result,
               len(ret.follows), len(ret.messages), ret.exception))
        return True
Пример #31
0
    def save(self, project, taskid, url, result, capture_phase):
        # Persist a result for `project`.  In the part visible here the method
        # makes sure the project and its table object exist, normalises the
        # result fields to strings, and updates the already stored row when
        # any field differs.
        #
        # `result` is indexed as (key, items): result[0] names the key field
        # and result[1] is the mapping of field values, which is modified in
        # place below.  `capture_phase` is not used in the visible part.
        #
        # NOTE(review): this snippet is cut off after the final `else:`, so
        # the branch taken when no stored row exists and the handler of the
        # outer `try` are not shown here.
        key = result[0]
        key_value = None
        items = result[1]
        if project not in self.projects:
            self._create_project(project, key, items)
            self._list_project()
            self.tables[project] = copy.deepcopy(self.table)

        #if self.table is None:
        if not self.tables.has_key(project):
            self._init_table(project, key, items)
            self.tables[project] = copy.deepcopy(self.table)

        table = self.tables[project]
        table.name = self._tablename(project)

        obj_result = result[1]
        if key and key not in ['taskid', 'url']:
            # A dedicated key field was given: use its value (the first
            # element when it is a list) to identify the row.
            obj = {
                key:
                result[1][key][0]
                if isinstance(result[1][key], list) else result[1][key],
                'taskid':
                taskid,
                'url':
                url,
                'updatetime':
                time.time(),
            }
            obj_result.update(obj)
            key_value = result[1][key][0] if isinstance(
                result[1][key], list) else result[1][key]
        else:
            # No usable key field: identify the row by a synthetic 'suid'.
            key = 'suid'
            obj = {
                'taskid': taskid,
                'url': url,
                'updatetime': time.time(),
            }
            # Compute suid: md5 of the concatenated field values (first
            # element of list values), taken in the mapping's key iteration
            # order and before taskid/url/updatetime are merged in.
            suid = ''
            for k in obj_result.keys():
                suid += str(obj_result[k][0] if isinstance(
                    obj_result[k], list) else obj_result[k])
            suid = utils.md5string(suid)

            obj_result.update(obj)
            obj_result['suid'] = suid
            key_value = suid

        # Original note: convert unicode to utf-8.  Every value is passed
        # through str(); None becomes '' and a list is reduced to its first
        # element.
        for o in obj_result.keys():
            if obj_result[o] == None:
                obj_result[o] = ''
            else:
                obj_result[o] = str(obj_result[o][0] if isinstance(
                    obj_result[o], list) else obj_result[o])

        obj_result['recently'] = 0
        try:
            # Compare against the stored row on every field except updatetime,
            # which is set to the current time on each call.
            obj_result_copy = copy.deepcopy(obj_result)
            del obj_result_copy['updatetime']

            fields = tuple(obj_result_copy.keys())
            tasks = self.get(project, key, key_value, fields)
            if tasks:
                need_update = False
                # NOTE(review): this loop rebinds `key`, so the column lookup
                # below matches on the last field name rather than on the key
                # field chosen above -- confirm this is intended.
                for key in fields:
                    # Stored values are looked up by upper-cased field name.
                    if str(obj_result_copy[key]) != str(tasks[key.upper()]):
                        #need update
                        need_update = True
                column = None
                for x in table.c:
                    if key in str(x):
                        column = x
                if column is not None and need_update:
                    try:
                        sql = table.update().where(column == key_value).values(
                            **self._stringify(obj_result))
                        return self.engine.execute(sql)
                    except Exception, e:
                        # A failed update is logged and not re-raised here.
                        logging.error('update data failed. error: %s  %s' %
                                      (e, str(obj_result)))
            else:
def get_taskid(self, task):
    """Return a task id built from the URL and the request's POST data.

    Overrides get_taskid so the id is the md5 of the URL followed by the
    JSON-encoded value of task['fetch']['data'] (an empty string when the
    task carries no data).
    """
    url = task['url']
    post_data = task['fetch'].get('data', '')
    fingerprint = url + json.dumps(post_data)
    return md5string(fingerprint)
Пример #33
0
 def get_taskid(self, task):
     """Return the id of a task: md5 of its URL.

     This is the default scheme; override it to derive ids from other
     parts of the task.
     """
     url = task["url"]
     return md5string(url)
Пример #34
0
    def on_task(self, task, response):
        '''Handle one task: run its callback, then publish status, follows and messages.

        Errors raised while looking up or running the project are captured in
        a ProcessorResult (with the traceback as its log) rather than
        propagated.  A status pack goes to status_queue unless the result's
        extinfo sets 'not_send_status'; follow-up tasks go to newtask_queue
        in lists of at most 1000; each message is processed by a nested
        on_task() call using the '_on_message' callback.

        Returns True.
        '''
        began = time.time()
        response = rebuild_response(response)

        try:
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            expected_updatetime = task.get('project_updatetime', None)
            expected_md5sum = task.get('project_md5sum', None)
            project_data = self.project_manager.get(
                project, expected_updatetime, expected_md5sum)
            assert project_data, "no such project!"
            if project_data.get('exception'):
                # The project data already holds an exception: surface it as
                # the result of this task.
                stored_logs = (project_data.get('exception_log'), )
                ret = ProcessorResult(logs=stored_logs,
                                      exception=project_data['exception'])
            else:
                run_task = project_data['instance'].run_task
                ret = run_task(project_data['module'], task, response)
        except Exception as e:
            trace = traceback.format_exc()
            ret = ProcessorResult(logs=(trace, ), exception=e)
        process_time = time.time() - began

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                track_headers = dict(response.headers)
            else:
                # Successful tasks only keep the cache-validation headers.
                track_headers = {}
                for header_name in ('etag', 'last-modified'):
                    if header_name in response.headers:
                        track_headers[header_name] = response.headers[header_name]

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {},
            }
            track = status_pack['track']
            track['fetch'] = {
                'ok': response.isok(),
                'redirect_url': (
                    response.url if response.url != response.orig_url
                    else None
                ),
                'time': response.time,
                'error': response.error,
                'status_code': response.status_code,
                'encoding': response.encoding,
                'headers': track_headers,
                # The start of the body is kept only for failed tasks.
                'content': response.text[:500] if ret.exception else None,
            }
            track['process'] = {
                'ok': not ret.exception,
                'time': process_time,
                'follows': len(ret.follows),
                'result': (
                    utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                    if ret.result is not None else None
                ),
                'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                'exception': ret.exception,
            }
            track['save'] = ret.save
            if 'schedule' in task:
                status_pack['schedule'] = task['schedule']

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            starts = range(0, len(ret.follows), 1000)
            for start in starts:
                chunk = ret.follows[start:start + 1000]
                converted = [utils.unicode_obj(newtask) for newtask in chunk]
                self.newtask_queue.put(converted)

        for to_project, msg, msg_url in ret.messages:
            try:
                self.on_task(
                    {
                        'taskid': utils.md5string(msg_url),
                        'project': to_project,
                        'url': msg_url,
                        'process': {'callback': '_on_message'},
                    },
                    {
                        'status_code': 200,
                        'url': msg_url,
                        'save': (task['project'], msg),
                    })
            except Exception:
                # Log and move on to the next message.
                logger.exception('Sending message error.')

        log = logger.error if ret.exception else logger.info
        log('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'], task.get('url'),
            response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True
Пример #35
0
                            'time': process_time,
                            'follows': len(ret.follows),
                            'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
                            'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                            'exception': ret.exception,
                            },
                        },
                    })
            self.status_queue.put(status_pack)

        for newtask in ret.follows:
            self.newtask_queue.put(newtask)

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                        }
                }, {
                    'status_code': 200,
                    'url': url,
                    'save': (task['project'], msg),
                }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
Пример #36
0
    def on_task(self, task, response):
        '''Process one fetched task and publish its status, follows and messages.

        Returns False (after logging) when the project cannot be found or
        when rebuilding the response / running the project raises.  Otherwise
        a status pack is put on status_queue unless the result's extinfo sets
        'not_send_status', follow-up tasks are put on newtask_queue as a
        single list, messages are put on inqueue as (task, response) pairs,
        and True is returned.
        '''
        clock_start = time.time()
        try:
            response = rebuild_response(response)
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            project_data = self.project_manager.get(
                project, task.get('updatetime', None))
            if not project_data:
                logger.error("no such project: %s", project)
                return False
            run = project_data['instance'].run
            ret = run(project_data['module'], task, response)
        except Exception as e:
            logger.exception(e)
            return False
        process_time = time.time() - clock_start

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                track_headers = dict(response.headers)
            else:
                # Without an exception only these two headers are tracked.
                track_headers = dict(
                    (name, response.headers[name])
                    for name in ('etag', 'last-modified')
                    if name in response.headers)

            taskid = task['taskid']
            project_name = task['project']
            task_url = task.get('url')

            fetch_track = {
                'ok': response.isok(),
                'redirect_url': (
                    response.url if response.url != response.orig_url
                    else None),
                'time': response.time,
                'error': response.error,
                'status_code': response.status_code,
                'encoding': response.encoding,
                'headers': track_headers,
                'content': response.content[:500] if ret.exception else None,
            }
            process_track = {
                'ok': not ret.exception,
                'time': process_time,
                'follows': len(ret.follows),
                'result': (
                    utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                    if ret.result is not None else None),
                'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                'exception': ret.exception,
            }
            status_pack = {
                'taskid': taskid,
                'project': project_name,
                'url': task_url,
                'track': {
                    'fetch': fetch_track,
                    'process': process_track,
                },
            }

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            follow_batch = [utils.unicode_obj(item) for item in ret.follows]
            self.newtask_queue.put(follow_batch)

        for dest_project, msg, msg_url in ret.messages:
            message_task = {
                'taskid': utils.md5string(msg_url),
                'project': dest_project,
                'url': msg_url,
                'process': {
                    'callback': '_on_message',
                }
            }
            message_response = {
                'status_code': 200,
                'url': msg_url,
                'save': (task['project'], msg),
            }
            self.inqueue.put((message_task, message_response))

        failed = response.error or ret.exception
        logger_func = logger.error if failed else logger.info
        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'], task.get('url'),
            response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True
Пример #37
0
 def get_taskid(self, task):
     """Return a task id that also depends on the POST data and today's date.

     The id is the md5 of: the URL, the JSON-encoded task['fetch']['data']
     (empty string when absent), str(datetime.date.today()) and the fixed
     suffix 'v7.0'.
     """
     url = task['url']
     encoded_data = json.dumps(task['fetch'].get('data', ''))
     today = str(datetime.date.today())
     fingerprint = url + encoded_data + today + 'v7.0'
     return md5string(fingerprint)
Пример #38
0
 def get_taskid(self, task):
     """Return a task id derived from the URL, the POST data and the params.

     The id is the md5 of the URL followed by the JSON encodings of
     task['fetch']['data'] and task['fetch']['params'] (each an empty
     string when absent).
     """
     url = task['url']
     encoded_data = json.dumps(task['fetch'].get('data', ''))
     encoded_params = json.dumps(task['fetch'].get('params', ''))
     return md5string(url + encoded_data + encoded_params)