Пример #1
0
 def __init__(self, result, follows, messages, logs, exception, extinfo):
     self.result = unicode_obj(result)
     self.follows = unicode_obj(follows)
     self.messages = unicode_obj(messages)
     self.logs = logs
     self.exception = unicode_obj(exception)
     self.extinfo = unicode_obj(extinfo)
Пример #2
0
def run(project):
    task = utils.decode_unicode_obj(json.loads(request.form['task']))
    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    fetch_result = {}
    start_time = time.time()
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = build_module(project_info, {
            'debugger': True
        })
        ret = module['instance'].run(module['module'], task, response)
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
Пример #3
0
        def get_active_tasks(project=None, limit=100):
            allowed_keys = set((
                'taskid',
                'project',
                'status',
                'url',
                'lastcrawltime',
                'updatetime',
                'track',
            ))

            iters = [iter(x['active_tasks']) for k, x in iteritems(self.projects)
                     if x and (k == project if project else True)]
            tasks = [next(x, None) for x in iters]
            result = []

            while len(result) < limit and tasks and not all(x is None for x in tasks):
                updatetime, task = t = max(tasks)
                i = tasks.index(t)
                tasks[i] = next(iters[i], None)
                for key in list(task):
                    if key == 'track':
                        track = {}
                        if 'fetch' in task['track'] and 'ok' in task['track']['fetch']:
                            track['fetch'] = {'ok': task['track']['fetch']['ok']}
                        if 'process' in task['track'] and 'ok' in task['track']['process']:
                            track['process'] = {'ok': task['track']['process']['ok']}
                        task['track'] = track
                    elif key in allowed_keys:
                        continue
                    del task[key]
                result.append(t)
            # fix for "<type 'exceptions.TypeError'>:dictionary key must be string"
            # have no idea why
            return utils.unicode_obj(json.loads(json.dumps(result)))
Пример #4
0
def get_script(project):
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    info = projectdb.get(project, fields=['name', 'script'])
    return json.dumps(utils.unicode_obj(info)), \
           200, {'Content-Type': 'application/json'}
Пример #5
0
def get_script(project):
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    info = projectdb.get(project, fields=['name', 'script'])
    return json.dumps(utils.unicode_obj(info)), \
        200, {'Content-Type': 'application/json'}
Пример #6
0
def spiderweb_get_data(project):
    projectdb = app.config['projectdb']
    print(request.values)
    url1 = request.form.get('url')
    print(url1)
    conn = pymysql.connect(host='127.0.0.1',port=3306,user='******',password='******',db='repository',charset='utf8')
    cur= conn.cursor()
    cur.execute("select url,title,content,publish_time,crawl_time from shuiliting where project_id = %s",url1)
    result = cur.fetchall()
    return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
Пример #7
0
def save(project):
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    script = request.form['script']
    project_info = projectdb.get(project, fields=['name', 'status', 'group'])
    if project_info and 'lock' in projectdb.split_group(project_info.get('group')) \
            and not login.current_user.is_active():
        return app.login_response

    if project_info:
        info = {
            'script': script,
        }
        if project_info.get('status') in (
                'DEBUG',
                'RUNNING',
        ):
            info['status'] = 'CHECKING'
        projectdb.update(project, utils.unicode_obj(info))
    else:
        info = {
            'name': project,
            'script': script,
            'status': 'TODO',
            'rate': app.config.get('max_rate', 1),
            'burst': app.config.get('max_burst', 3),
        }
        projectdb.insert(project, utils.unicode_obj(info))

    rpc = app.config['scheduler_rpc']
    if rpc is not None:
        try:
            rpc.update_project()
        except socket.error as e:
            app.logger.warning('connect to scheduler rpc error: %r', e)
            return 'rpc error', 200

    return 'ok', 200
Пример #8
0
 def restart(self, project, status):
     restart_tasks = list()
     for task in self.taskdb.load_tasks(status, project):
         url = task['url']
         if task['taskid'] in self.task_queue[task['project']]:
             continue
         url = task['url']
         if url.startswith('data') or url.startswith('curl'):
             continue
         task.setdefault('schedule', self.default_schedule)
         task['schedule']['force_update'] = True
         restart_tasks.append(task)
     for each in (restart_tasks[x:x + 1000]
                  for x in range(0, len(restart_tasks), 1000)):
         self.newtask_queue.put(
             [utils.unicode_obj(newtask) for newtask in each])
     logger.info("restart tasks of project %s success!" % project)
Пример #9
0
        def get_active_tasks(project=None, limit=100):
            allowed_keys = set((
                'taskid',
                'project',
                'status',
                'url',
                'lastcrawltime',
                'updatetime',
                'track',
            ))

            iters = [
                iter(x['active_tasks']) for k, x in iteritems(self.projects)
                if x and (k == project if project else True)
            ]
            tasks = [next(x, None) for x in iters]
            result = []

            while len(result) < limit and tasks and not all(x is None
                                                            for x in tasks):
                updatetime, task = t = max(tasks)
                i = tasks.index(t)
                tasks[i] = next(iters[i], None)
                for key in list(task):
                    if key == 'track':
                        track = {}
                        if 'fetch' in task['track'] and 'ok' in task['track'][
                                'fetch']:
                            track['fetch'] = {
                                'ok': task['track']['fetch']['ok']
                            }
                        if 'process' in task['track'] and 'ok' in task[
                                'track']['process']:
                            track['process'] = {
                                'ok': task['track']['process']['ok']
                            }
                        task['track'] = track
                    elif key in allowed_keys:
                        continue
                    del task[key]
                result.append(t)
            # fix for "<type 'exceptions.TypeError'>:dictionary key must be string"
            # have no idea why
            return utils.unicode_obj(json.loads(json.dumps(result)))
Пример #10
0
 def add_seed(self, seed_path, project, callback):
     new_tasks = list()
     from pyspider.libs.utils import md5string
     with open(seed_path) as fi:
         for line in fi:
             url = line.strip().decode('utf-8')
             task = {}
             task['url'] = url
             task['project'] = project
             task['status'] = 1
             task.setdefault('schedule', {'force_update': True})
             task['taskid'] = md5string(task['url'])
             task['fetch'] = {}
             task['process'] = {}
             task['process']['callback'] = callback
             task['depth'] = 0
             new_tasks.append(task)
     for each in (new_tasks[x:x + 1000]
                  for x in range(0, len(new_tasks), 1000)):
         self.newtask_queue.put(
             [utils.unicode_obj(newtask) for newtask in each])
     logger.info("add seed success for project %s!" % project)
Пример #11
0
def migrate(pool, from_connection, to_connection):
    """
    Migrate tool for pyspider
    """
    f = connect_database(from_connection)
    t = connect_database(to_connection)

    if isinstance(f, ProjectDB):
        for each in f.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            t.drop(each['name'])
            t.insert(each['name'], each)
    elif isinstance(f, TaskDB):
        pool = Pool(pool)
        pool.map(lambda x, f=from_connection, t=to_connection:
                 taskdb_migrating(x, f, t),
                 f.projects)
    elif isinstance(f, ResultDB):
        pool = Pool(pool)
        pool.map(lambda x, f=from_connection, t=to_connection:
                 resultdb_migrating(x, f, t),
                 f.projects)
Пример #12
0
def migrate(pool, from_connection, to_connection):
    """
    Migrate tool for pyspider
    """
    f = connect_database(from_connection)
    t = connect_database(to_connection)

    if isinstance(f, ProjectDB):
        for each in f.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            t.drop(each['name'])
            t.insert(each['name'], each)
    elif isinstance(f, TaskDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t),
            f.projects)
    elif isinstance(f, ResultDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t),
            f.projects)
Пример #13
0
    def on_task(self, task, response):
        '''Deal one task'''
        start_time = time.time()
        response = rebuild_response(response)

        try:
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('project_updatetime', None)
            md5sum = task.get('project_md5sum', None)
            project_data = self.project_manager.get(project, updatetime,
                                                    md5sum)
            assert project_data, "no such project!"
            if project_data.get('exception'):
                ret = ProcessorResult(
                    logs=(project_data.get('exception_log'), ),
                    exception=project_data['exception'])
            else:
                ret = project_data['instance'].run_task(
                    project_data['module'], task, response)
        except Exception as e:
            logstr = traceback.format_exc()
            ret = ProcessorResult(logs=(logstr, ), exception=e)
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                track_headers = dict(response.headers)
            else:
                track_headers = {}
                for name in ('etag', 'last-modified'):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok':
                        response.isok(),
                        'redirect_url':
                        response.url
                        if response.url != response.orig_url else None,
                        'time':
                        response.time,
                        'error':
                        response.error,
                        'status_code':
                        response.status_code,
                        'encoding':
                        getattr(response, '_encoding', None),
                        'headers':
                        track_headers,
                        'content':
                        response.text[:500] if ret.exception else None,
                    },
                    'process': {
                        'ok':
                        not ret.exception,
                        'time':
                        process_time,
                        'follows':
                        len(ret.follows),
                        'result': (None if ret.result is None else utils.text(
                            ret.result)[:self.RESULT_RESULT_LIMIT]),
                        'logs':
                        ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception':
                        ret.exception,
                    },
                    'save': ret.save,
                },
            }
            if 'schedule' in task:
                status_pack['schedule'] = task['schedule']

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            for each in (ret.follows[x:x + 1000]
                         for x in range(0, len(ret.follows), 1000)):
                self.newtask_queue.put(
                    [utils.unicode_obj(newtask) for newtask in each])

        for project, msg, url in ret.messages:
            try:
                self.on_task(
                    {
                        'taskid': utils.md5string(url),
                        'project': project,
                        'url': url,
                        'process': {
                            'callback': '_on_message',
                        }
                    }, {
                        'status_code': 200,
                        'url': url,
                        'save': (task['project'], msg),
                    })
            except Exception as e:
                logger.exception('Sending message error.')
                continue

        if ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func(
            'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
            % (task['project'], task['taskid'], task.get('url'),
               response.status_code, len(response.content), ret.result,
               len(ret.follows), len(ret.messages), ret.exception))

        return True
Пример #14
0
            return False
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            status_pack = utils.unicode_obj({
                    'taskid': task['taskid'],
                    'project': task['project'],
                    'url': task.get('url'),
                    'track': {
                        'fetch': {
                            'ok': response.isok(),
                            'time': response.time,
                            'status_code': response.status_code,
                            'headers': dict(response.headers),
                            'encoding': response.encoding,
                            'content': response.content[:500] \
                                    if not response.isok() or ret.exception else None,
                            },
                        'process': {
                            'ok': not ret.exception,
                            'time': process_time,
                            'follows': len(ret.follows),
                            'result': unicode(ret.result)[:100],
                            'logs': ret.logstr()[-200:],
                            'exception': ret.exception,
                            },
                        },
                    })
            self.status_queue.put(status_pack)

        for newtask in ret.follows:
            self.newtask_queue.put(newtask)
Пример #15
0
def run(project):
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), \
            200, {'Content-Type': 'application/json'}

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            result = {
                'fetch_result': "",
                'logs': u' in wevdav mode, cannot load script',
                'follows': [],
                'messages': [],
                'result': None,
                'time': time.time() - start_time,
            }
            return json.dumps(utils.unicode_obj(result)), \
                200, {'Content-Type': 'application/json'}
        project_info['script'] = info['script']

    fetch_result = {}
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = ProjectManager.build_module(project_info, {
            'debugger': True
        })
        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text
        if (response.headers.get('content-type', '').startswith('image')):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
Пример #16
0
            return False
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            status_pack = utils.unicode_obj({
                    'taskid': task['taskid'],
                    'project': task['project'],
                    'url': task.get('url'),
                    'track': {
                        'fetch': {
                            'ok': response.isok(),
                            'time': response.time,
                            'status_code': response.status_code,
                            'headers': dict(response.headers),
                            'encoding': response.encoding,
                            'content': response.content[:500] \
                                    if not response.isok() or ret.exception else None,
                            },
                        'process': {
                            'ok': not ret.exception,
                            'time': process_time,
                            'follows': len(ret.follows),
                            'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
                            'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                            'exception': ret.exception,
                            },
                        },
                    })
            self.status_queue.put(status_pack)

        for newtask in ret.follows:
            self.newtask_queue.put(newtask)
Пример #17
0
    def on_task(self, task, response):
        start_time = time.time()
        try:
            response = rebuild_response(response)
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('updatetime', None)
            project_data = self.project_manager.get(project, updatetime)
            if not project_data:
                logger.error("no such project: %s", project)
                return False
            ret = project_data['instance'].run(
                project_data['module'], task, response)
        except Exception as e:
            logger.exception(e)
            return False
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                track_headers = dict(response.headers)
            else:
                track_headers = {}
                for name in ('etag', 'last-modified'):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok': response.isok(),
                        'redirect_url': response.url if response.url != response.orig_url else None,
                        'time': response.time,
                        'error': response.error,
                        'status_code': response.status_code,
                        'encoding': response.encoding,
                        'headers': track_headers,
                        'content': response.content[:500] if ret.exception else None,
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        'result': (
                            None if ret.result is None
                            else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                        ),
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                },
            }

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in ret.follows])

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'],
            task.get('url'), response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True
Пример #18
0
                                    if not response.isok() or ret.exception else None,
                            },
                        'process': {
                            'ok': not ret.exception,
                            'time': process_time,
                            'follows': len(ret.follows),
                            'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
                            'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                            'exception': ret.exception,
                            },
                        },
                    }

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        for newtask in ret.follows:
            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.newtask_queue.put(utils.unicode_obj(newtask))

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                        }
                }, {
Пример #19
0
def run(project):
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), \
               200, {'Content-Type': 'application/json'}

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            result = {
                'fetch_result': "",
                'logs': u' in wevdav mode, cannot load script',
                'follows': [],
                'messages': [],
                'result': None,
                'time': time.time() - start_time,
            }
            return json.dumps(utils.unicode_obj(result)), \
                   200, {'Content-Type': 'application/json'}
        project_info['script'] = info['script']

    fetch_result = {}
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = ProjectManager.build_module(project_info, {'debugger': True})
        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text
        if (response.headers.get('content-type', '').startswith('image')):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }
Пример #20
0
    def on_task(self, task, response):
        '''Deal one task'''
        start_time = time.time()
        response = rebuild_response(response)

        try:
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('project_updatetime', None)
            md5sum = task.get('project_md5sum', None)
            project_data = self.project_manager.get(project, updatetime, md5sum)
            assert project_data, "no such project!"
            if project_data.get('exception'):
                ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                      exception=project_data['exception'])
            else:
                ret = project_data['instance'].run_task(
                    project_data['module'], task, response)
        except Exception as e:
            logstr = traceback.format_exc()
            ret = ProcessorResult(logs=(logstr, ), exception=e)
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                track_headers = dict(response.headers)
            else:
                track_headers = {}
                for name in ('etag', 'last-modified'):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok': response.isok(),
                        'redirect_url': response.url if response.url != response.orig_url else None,
                        'time': response.time,
                        'error': response.error,
                        'status_code': response.status_code,
                        'encoding': response.encoding,
                        'headers': track_headers,
                        'content': response.text[:500] if ret.exception else None,
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        'result': (
                            None if ret.result is None
                            else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                        ),
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                    'save': ret.save,
                },
            }
            if 'schedule' in task:
                status_pack['schedule'] = task['schedule']

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        # logger.info('process follows :%s' % ret.follows)
        # logger.info('process messages :%s' % ret.messages)
        
        if ret.follows:
            for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):
                self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each])

        for project, msg, url in ret.messages:
            try:
                self.on_task({
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                    }
                }, {
                    'status_code': 200,
                    'url': url,
                    'save': (task['project'], msg),
                })
            except Exception as e:
                logger.exception('Sending message error.')
                continue

        if ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'],
            task.get('url'), response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True
Пример #21
0
def api_debug_run(project):
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps({
            'Status': 1,
            'Result': utils.unicode_obj(result)
        }), 200, cors_resp_header

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            result = {
                'fetch_result': "",
                'logs': u' in wevdav mode, cannot load script',
                'follows': [],
                'messages': [],
                'result': None,
                'time': time.time() - start_time,
            }
            return json.dumps({
                'Status': 1,
                'Result': utils.unicode_obj(result)
            }), 200, cors_resp_header

        project_info['script'] = info['script']

    fetch_result = {}
    try:
        module = ProjectManager.build_module(
            project_info, {
                'debugger': True,
                'process_time_limit': app.config['process_time_limit'],
            })

        # The code below is to mock the behavior that crawl_config been joined when selected by scheduler.
        # but to have a better view of joined tasks, it has been done in BaseHandler.crawl when `is_debugger is True`
        # crawl_config = module['instance'].crawl_config
        # task = module['instance'].task_join_crawl_config(task, crawl_config)

        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)

        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text
        if (response.headers.get('content-type', '').startswith('image')):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps({
            'Status': 1,
            'Result': utils.unicode_obj(result)
        }), 200, cors_resp_header

    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps({
            'Status': 1,
            'Result': utils.unicode_obj(result)
        }), 200, cors_resp_header
Пример #22
0
def run(project):
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    fetch_result = {}
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = ProjectManager.build_module(project_info, {'debugger': True})
        ret = module['instance'].run(module['module'], task, response)
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }
Пример #23
0
    def on_task(self, task, response):
        start_time = time.time()
        try:
            response = rebuild_response(response)
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('updatetime', None)
            project_data = self.project_manager.get(project, updatetime)
            if not project_data:
                logger.error("no such project: %s", project)
                return False
            ret = project_data['instance'].run(project_data['module'], task,
                                               response)
        except Exception as e:
            logger.exception(e)
            return False
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok':
                        response.isok(),
                        'time':
                        response.time,
                        'status_code':
                        response.status_code,
                        'headers':
                        dict(response.headers),
                        'encoding':
                        response.encoding,
                        'content':
                        (response.content[:500] if ret.exception else None),
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        'result':
                        utils.text(ret.result)[:self.RESULT_RESULT_LIMIT],
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                },
            }

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.newtask_queue.put(
            [utils.unicode_obj(newtask) for newtask in ret.follows])

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func(
            'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
            % (task['project'], task['taskid'], task.get('url'),
               response.status_code, len(response.content), ret.result,
               len(ret.follows), len(ret.messages), ret.exception))
        return True
Пример #24
0
    def on_task(self, task, response):
        start_time = time.time()
        try:
            response = rebuild_response(response)
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            if project not in self.projects:
                raise LookupError("no such project: %s" % project)
            project_data = self.projects[project]
            ret = project_data['instance'].run(
                project_data['module'], task, response)
        except Exception as e:
            logger.exception(e)
            return False
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok': response.isok(),
                        'time': response.time,
                        'status_code': response.status_code,
                        'headers': dict(response.headers),
                        'encoding': response.encoding,
                        'content': (
                            response.content[:500]
                            if not response.isok() or ret.exception else
                            None
                        ),
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                },
            }

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        for newtask in ret.follows:
            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.newtask_queue.put(utils.unicode_obj(newtask))

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'],
            task.get('url'), response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True