예제 #1
0
파일: stats.py 프로젝트: zzy0302/crawlab
    def get_home_stats(self):
        """
        Get stats for home page.

        :return: dict with
            - ``overview_stats``: document counts of the ``tasks``,
              ``spiders``, ``nodes`` and ``deploys`` collections;
            - ``daily_tasks``: one ``{'date', 'count'}`` row per day for the
              30 days ending yesterday, oldest first. Days without any task
              are reported with a count of 0.
        """
        # overview stats
        task_count = db_manager.count('tasks', {})
        spider_count = db_manager.count('spiders', {})
        node_count = db_manager.count('nodes', {})
        deploy_count = db_manager.count('deploys', {})

        # daily stats: number of tasks grouped by the day of `create_ts`,
        # keyed by a '%Y-%m-%d' string
        cur = db_manager.aggregate('tasks', [{
            '$project': {
                'date': {
                    '$dateToString': {
                        'format': '%Y-%m-%d',
                        'date': '$create_ts'
                    }
                }
            }
        }, {
            '$group': {
                '_id': '$date',
                'count': {
                    '$sum': 1
                }
            }
        }, {
            '$sort': {
                '_id': 1
            }
        }])
        date_cache = {item['_id']: item['count'] for item in cur}

        # Read the clock once. Deriving the window bounds from two separate
        # datetime.now() calls made them differ by a few microseconds, which
        # let the former `while date < end_date` loop run one extra iteration
        # (adding today) depending on timing.
        now = datetime.now()
        daily_tasks = []
        for days_ago in range(30, 0, -1):
            date_str = (now - timedelta(days=days_ago)).strftime('%Y-%m-%d')
            daily_tasks.append({
                'date': date_str,
                'count': date_cache.get(date_str) or 0,
            })

        return {
            'status': 'ok',
            'overview_stats': {
                'task_count': task_count,
                'spider_count': spider_count,
                'node_count': node_count,
                'deploy_count': deploy_count,
            },
            'daily_tasks': daily_tasks
        }
예제 #2
0
    def get_results(self, id: str) -> (dict, tuple):
        """
        Return one page of the results crawled by a given task.

        The page is selected by the ``page_size`` (default 10) and
        ``page_num`` (default 1) request arguments. When the task's spider
        has no ``col`` configured an empty list is returned instead.
        :param id: task_id
        """
        params = self.parser.parse_args()
        page_size = params.get('page_size') or 10
        page_num = params.get('page_num') or 1

        task = db_manager.get('tasks', id=id)
        spider = db_manager.get('spiders', id=task['spider_id'])

        # name of the collection the spider writes its results to
        col_name = spider.get('col')
        if not col_name:
            return []

        fields = get_spider_col_fields(col_name)
        offset = page_size * (page_num - 1)
        items = db_manager.list(col_name, {'task_id': id},
                                skip=offset,
                                limit=page_size)

        response = {'status': 'ok'}
        response['fields'] = jsonify(fields)
        response['total_count'] = db_manager.count(col_name, {'task_id': id})
        response['page_num'] = page_num
        response['page_size'] = page_size
        response['items'] = jsonify(items)
        return response
예제 #3
0
def get_last_n_day_tasks_count(spider_id: ObjectId, n: int) -> int:
    """
    Count the tasks of a spider created within the last ``n`` days.

    :param spider_id: value matched against the ``spider_id`` field of tasks
    :param n: size of the look-back window in days, counted back from now
    :return: result of ``db_manager.count`` for that query, i.e. a count
        (the previous ``-> list`` annotation was wrong)
    """
    since = datetime.now() - timedelta(days=n)
    return db_manager.count(col_name='tasks',
                            cond={
                                'spider_id': spider_id,
                                'create_ts': {
                                    '$gte': since
                                }
                            })
예제 #4
0
파일: base.py 프로젝트: masdude/crawlab
    def get(self, id=None, action=None):
        """
        GET method: run an action, list items or fetch a single item.

        If action is specified, call the method of that name with id and
        return its result; unknown, private or non-callable names yield a 400.
        If neither id nor action is specified, return one page of the items
        of ``self.col_name`` according to the ``filter``, ``page`` and
        ``page_size`` request arguments.
        Otherwise return the item with the given id.
        :param id: item id
        :param action: name of the method to execute
        """
        args = self.parser.parse_args()

        # action by id
        if action is not None:
            # `action` presumably comes from the request URL, so never
            # dispatch to private/dunder attributes or to attributes that are
            # not callable (the old hasattr() check accepted both).
            handler = None
            if not action.startswith('_'):
                handler = getattr(self, action, None)
            if not callable(handler):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return handler(id)

        # get item by id
        if id is not None:
            return jsonify(db_manager.get(col_name=self.col_name, id=id))

        # list items: every argument falls back to a default when absent
        cond = args.get('filter')
        if cond is None:
            cond = {}

        page = args.get('page')
        if page is None:
            page = 1

        page_size = args.get('page_size')
        if page_size is None:
            page_size = 10

        # TODO: sort functionality

        # total count
        total_count = db_manager.count(col_name=self.col_name, cond=cond)

        # items
        items = db_manager.list(col_name=self.col_name,
                                cond=cond,
                                skip=(page - 1) * page_size,
                                limit=page_size)

        # TODO: getting status for node

        return jsonify({
            'status': 'ok',
            'total_count': total_count,
            'page': page,
            'page_size': page_size,
            'items': items
        })
예제 #5
0
    def get(self, id=None, action=None):
        """
        GET method for tasks.

        If action is specified, call the method of that name with id;
        unknown, private or non-callable names yield a 400.
        If only id is specified, return that task enriched with the status
        and result of its ``tasks_celery`` record, its spider name and the
        content of its log file.
        Otherwise return one page of tasks (``page_size`` / ``page_num``
        request arguments) sorted by ``finish_ts``.
        :param id: task id
        :param action: name of the method to execute
        """
        # action by id
        if action is not None:
            # `action` presumably comes from the request URL, so never
            # dispatch to private/dunder attributes or to attributes that are
            # not callable (the old hasattr() check accepted both).
            handler = None
            if not action.startswith('_'):
                handler = getattr(self, action, None)
            if not callable(handler):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return handler(id)

        if id is not None:
            task = db_manager.get('tasks', id=id)
            if not task:
                # previously this fell through to a TypeError on task['_id']
                return {
                    'status': 'ok',
                    'code': 404,
                    'error': 'task "%s" not found' % id
                }, 404
            _task = db_manager.get('tasks_celery', id=task['_id'])
            _spider = db_manager.get('spiders', id=str(task['spider_id']))
            if _task:
                # the celery record only fills in a status the task lacks
                if not task.get('status'):
                    task['status'] = _task['status']
                # this used to sit outside the guard and raised a TypeError
                # whenever the celery record was missing
                task['result'] = _task.get('result')
            if _spider:
                task['spider_name'] = _spider['name']
            # best effort: any problem reading the log yields an empty log
            try:
                with open(task['log_file_path']) as f:
                    task['log'] = f.read()
            except Exception:
                task['log'] = ''
            return jsonify(task)

        # list tasks
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10
        page_num = args.get('page_num') or 1
        tasks = db_manager.list('tasks', {},
                                limit=page_size,
                                skip=page_size * (page_num - 1),
                                sort_key='finish_ts')
        items = []
        for task in tasks:
            _task = db_manager.get('tasks_celery', id=task['_id'])
            _spider = db_manager.get('spiders', id=str(task['spider_id']))
            if _task:
                task['status'] = _task['status']
            else:
                task['status'] = TaskStatus.UNAVAILABLE
            # a task whose spider no longer exists must not break the listing
            if _spider:
                task['spider_name'] = _spider['name']
            items.append(task)
        return {
            'status': 'ok',
            'total_count': db_manager.count('tasks', {}),
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(items)
        }
예제 #6
0
    def get_results(self, id: str) -> (dict, tuple):
        """
        Get the results crawled in a given task, trimmed for display.

        Only string-valued fields that are not listed in IGNORE_FIELD are
        kept, and values longer than 500 characters are cut and suffixed with
        '...'. All matching items are returned at once; the ``page_num`` entry
        of the response holds the number of pages they span at ``page_size``
        (default 10) items per page, with a minimum of 1.
        When the task's spider has no ``col`` configured an empty list is
        returned instead.
        :param id: task_id
        """
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10

        task = db_manager.get('tasks', id=id)
        spider = db_manager.get('spiders', id=task['spider_id'])
        col_name = spider.get('col')
        if not col_name:
            return []
        fields = get_spider_col_fields(col_name)
        fields = list(set(fields) - set(IGNORE_FIELD))
        items = db_manager.list(col_name, {'task_id': id})

        # Limit overly long content and drop the fields that should not be
        # displayed.
        adjust_items = [
            {
                key: (value[:500] + '...') if len(value) > 500 else value
                for key, value in item.items()
                if isinstance(value, str) and key not in IGNORE_FIELD
            }
            for item in items
        ]

        # Ceiling division. The former `len(...) / page_size` is always a
        # float in Python 3, so the "+ 1" was applied unconditionally and an
        # exact multiple of page_size reported one page too many.
        item_count = len(adjust_items)
        page_num = max(1, (item_count + page_size - 1) // page_size)

        return {
            'status': 'ok',
            'fields': jsonify(fields),
            'total_count': item_count,
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(adjust_items)
        }
예제 #7
0
    def get_results(self, id):
        """
        Get one page of the results crawled in a given task.

        The page is selected by the ``page_size`` (default 10) and
        ``page_num`` (default 1) request arguments. When the task's spider
        has no ``col`` configured an empty list is returned instead.
        :param id: task_id
        """
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10
        page_num = args.get('page_num') or 1

        task = db_manager.get('tasks', id=id)
        spider = db_manager.get('spiders', id=task['spider_id'])
        col_name = spider.get('col')
        if not col_name:
            return []
        fields = get_spider_col_fields(col_name)
        # Apply the requested page to the query: page_size / page_num used to
        # be echoed in the response without ever reaching db_manager.list.
        items = db_manager.list(col_name, {'task_id': id},
                                skip=page_size * (page_num - 1),
                                limit=page_size)
        return {
            'status': 'ok',
            'fields': jsonify(fields),
            'total_count': db_manager.count(col_name, {'task_id': id}),
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(items)
        }
예제 #8
0
파일: base.py 프로젝트: tom2jack/crawlab
    def get(self, id: str = None, action: str = None) -> (dict, tuple):
        """
        GET method for retrieving item information.
        If id is specified and action is not, return the object of the given id;
        If action is specified, execute the given action with the given id and
        return its result (unknown, private or non-callable names yield a 400);
        If neither id nor action is specified, return the list of items given
        the page_size, page_num and filter
        :param id: item id
        :param action: name of the method to execute
        :return: response dict, or a (dict, status code) tuple on error
        """
        args = self.parser.parse_args()

        # action by id
        if action is not None:
            # `action` presumably comes from the request URL, so never
            # dispatch to private/dunder attributes or to attributes that are
            # not callable (the old hasattr() check accepted both).
            handler = None
            if not action.startswith('_'):
                handler = getattr(self, action, None)
            if not callable(handler):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return handler(id)

        # get item by id
        if id is not None:
            return jsonify(db_manager.get(col_name=self.col_name, id=id))

        # list items: every argument falls back to a default when absent
        cond = args.get('filter')
        if cond is None:
            cond = {}

        # The page number is read from the argument that is tested: the old
        # code checked `page_num` but then read `args.page`.
        page = args.get('page_num')
        if page is None:
            page = 1

        page_size = args.get('page_size')
        if page_size is None:
            page_size = 10

        # TODO: sort functionality

        # total count
        total_count = db_manager.count(col_name=self.col_name, cond=cond)

        # items
        items = db_manager.list(col_name=self.col_name,
                                cond=cond,
                                skip=(page - 1) * page_size,
                                limit=page_size)

        # TODO: getting status for node

        return {
            'status': 'ok',
            'total_count': total_count,
            'page_num': page,
            'page_size': page_size,
            'items': jsonify(items)
        }
예제 #9
0
파일: tasks.py 프로젝트: zzy0302/crawlab
    def get(self, id: str = None, action: str = None):
        """
        GET method of TaskAPI.

        If action is specified, call the method of that name with id;
        unknown, private or non-callable names yield a 400.
        If only id is specified, return that task with its spider name,
        number of results, duration, average results per second and log.
        Otherwise return one page of tasks matching the JSON ``filter``
        request argument, sorted by ``create_ts``.
        :param id: item id
        :param action: action
        """
        # action by id
        if action is not None:
            # `action` presumably comes from the request URL, so never
            # dispatch to private/dunder attributes or to attributes that are
            # not callable (the old hasattr() check accepted both).
            handler = None
            if not action.startswith('_'):
                handler = getattr(self, action, None)
            if not callable(handler):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return handler(id)

        if id is not None:
            task = db_manager.get(col_name=self.col_name, id=id)
            if not task:
                # previously this fell through to a TypeError on
                # task['spider_id']
                return {
                    'status': 'ok',
                    'code': 404,
                    'error': 'task "%s" not found' % id
                }, 404
            spider = db_manager.get(col_name='spiders',
                                    id=str(task['spider_id']))

            # spider
            task['num_results'] = 0
            if spider:
                task['spider_name'] = spider['name']
                col = spider.get('col')
                if col:
                    task['num_results'] = db_manager.count(
                        col, {'task_id': task['_id']})

            # duration
            if task.get('finish_ts') is not None:
                duration = (task['finish_ts'] -
                            task['create_ts']).total_seconds()
                task['duration'] = duration
                # a task that finished in the very instant it was created has
                # no meaningful rate; report 0 instead of dividing by zero
                task['avg_num_results'] = round(
                    task['num_results'] / duration, 1) if duration else 0

            # best effort: any problem reading the log yields an empty log
            try:
                with open(task['log_file_path']) as f:
                    task['log'] = f.read()
            except Exception:
                task['log'] = ''
            return jsonify(task)

        # list tasks
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10
        page_num = args.get('page_num') or 1
        filter_str = args.get('filter')
        filter_ = {}
        if filter_str is not None:
            filter_ = json.loads(filter_str)
            # the spider id arrives as a string inside the JSON filter
            if filter_.get('spider_id'):
                filter_['spider_id'] = ObjectId(filter_['spider_id'])
        tasks = db_manager.list(col_name=self.col_name,
                                cond=filter_,
                                limit=page_size,
                                skip=page_size * (page_num - 1),
                                sort_key='create_ts')
        items = []
        for task in tasks:
            # get spider
            _spider = db_manager.get(col_name='spiders',
                                     id=str(task['spider_id']))

            # status
            if task.get('status') is None:
                task['status'] = TaskStatus.UNAVAILABLE

            # spider
            task['num_results'] = 0
            if _spider:
                # spider name
                task['spider_name'] = _spider['name']

                # number of results
                col = _spider.get('col')
                if col:
                    task['num_results'] = db_manager.count(
                        col, {'task_id': task['_id']})

            # duration
            if task.get('finish_ts') is not None:
                duration = (task['finish_ts'] -
                            task['create_ts']).total_seconds()
                task['duration'] = duration
                # see the single-task branch: avoid dividing by zero
                task['avg_num_results'] = round(
                    task['num_results'] / duration, 1) if duration else 0

            items.append(task)

        return {
            'status': 'ok',
            'total_count': db_manager.count('tasks', filter_),
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(items)
        }
예제 #10
0
파일: stats.py 프로젝트: zzy0302/crawlab
    def get_spider_stats(self):
        """
        Get the stats of the spider given by the ``spider_id`` request argument.

        :return: dict with
            - ``overview``: task count, result count, success rate and
              average duration over the tasks created in the last 30 days
              (success rate and average duration are 0 when there is no such
              task);
            - ``task_count_by_status`` / ``task_count_by_node``: lists of
              ``{'name', 'value'}`` rows over the same tasks;
            - ``daily_stats``: one ``{'date', 'count', 'duration'}`` row per
              day for the 30 days ending yesterday, oldest first.
            A ``(dict, 404)`` tuple is returned when the spider is not found.
        """
        args = self.parser.parse_args()
        spider_id = args.get('spider_id')
        spider = db_manager.get('spiders', id=spider_id)
        if not spider:
            # previously this fell through to a TypeError on spider['_id']
            return {
                'status': 'ok',
                'code': 404,
                'error': 'spider "%s" not found' % spider_id
            }, 404

        # Read the clock once so that every window below is derived from the
        # same instant (see the daily loop further down).
        now = datetime.now()

        tasks = db_manager.list(col_name='tasks',
                                cond={
                                    'spider_id': spider['_id'],
                                    'create_ts': {
                                        '$gte': now - timedelta(days=30)
                                    }
                                },
                                limit=9999999)

        # task count
        task_count = len(tasks)

        # calculate task count stats
        task_count_by_status = defaultdict(int)
        task_count_by_node = defaultdict(int)
        total_seconds = 0
        for task in tasks:
            task_count_by_status[task['status']] += 1
            task_count_by_node[task.get('node_id')] += 1
            # only successful, finished tasks contribute to the duration
            if task['status'] == TaskStatus.SUCCESS and task.get('finish_ts'):
                duration = (task['finish_ts'] -
                            task['create_ts']).total_seconds()
                total_seconds += duration

        # task count by node
        task_count_by_node_ = [{'name': node_id, 'value': value}
                               for node_id, value in task_count_by_node.items()]

        # task count by status
        task_count_by_status_ = [{'name': status, 'value': value}
                                 for status, value in task_count_by_status.items()]

        # success rate and average duration; a spider without any task in the
        # window used to raise ZeroDivisionError here
        if task_count:
            success_rate = task_count_by_status.get(TaskStatus.SUCCESS,
                                                    0) / task_count
            # NOTE(review): successful seconds are divided by the count of
            # ALL tasks, as before - confirm this is the intended average
            avg_duration = total_seconds / task_count
        else:
            success_rate = 0
            avg_duration = 0

        # calculate task count and average duration by date
        cur = db_manager.aggregate('tasks', [{
            '$match': {
                'spider_id': spider['_id']
            }
        }, {
            '$project': {
                'date': {
                    '$dateToString': {
                        'format': '%Y-%m-%d',
                        'date': '$create_ts'
                    }
                },
                'duration': {
                    '$subtract': ['$finish_ts', '$create_ts']
                }
            }
        }, {
            '$group': {
                '_id': '$date',
                'count': {
                    '$sum': 1
                },
                'duration': {
                    '$avg': '$duration'
                }
            }
        }, {
            '$sort': {
                '_id': 1
            }
        }])
        date_cache = {}
        for item in cur:
            date_cache[item['_id']] = {
                # the aggregated duration is divided by 1000 (presumably
                # milliseconds to seconds); a missing average counts as 0
                'duration': (item['duration'] or 0) / 1000,
                'count': item['count']
            }

        # Two separate datetime.now() calls for the window bounds differ by a
        # few microseconds, which let the former `while date < end_date` loop
        # run one extra iteration (adding today) depending on timing; the
        # fixed range below always yields exactly 30 days.
        daily_tasks = []
        for days_ago in range(30, 0, -1):
            date_str = (now - timedelta(days=days_ago)).strftime('%Y-%m-%d')
            d = date_cache.get(date_str)
            if d is None:
                row = {'date': date_str, 'count': 0, 'duration': 0}
            else:
                row = {
                    'date': date_str,
                    'count': d['count'],
                    'duration': d['duration']
                }
            daily_tasks.append(row)

        # calculate total results
        result_count = 0
        col_name = spider.get('col')
        if col_name is not None:
            for task in tasks:
                result_count += db_manager.count(col_name,
                                                 {'task_id': task['_id']})

        return {
            'status': 'ok',
            'overview': {
                'task_count': task_count,
                'result_count': result_count,
                'success_rate': success_rate,
                'avg_duration': avg_duration
            },
            'task_count_by_status': task_count_by_status_,
            'task_count_by_node': task_count_by_node_,
            'daily_stats': daily_tasks,
        }
예제 #11
0
    def get(self, id: str = None, action: str = None):
        """
        GET method for sites.

        If action is specified, call the method of that name with id;
        unknown, private or non-callable names yield a 400.
        If only id is specified, return the item with that id.
        Otherwise return one page of items matching the JSON ``filter`` and
        the ``keyword`` request arguments, sorted by ascending ``rank``, each
        with the number of spiders referencing it.
        :param id: item id
        :param action: name of the method to execute
        """
        # action by id
        if action is not None:
            # `action` presumably comes from the request URL, so never
            # dispatch to private/dunder attributes or to attributes that are
            # not callable (the old hasattr() check accepted both).
            handler = None
            if not action.startswith('_'):
                handler = getattr(self, action, None)
            if not callable(handler):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return handler(id)

        if id is not None:
            site = db_manager.get(col_name=self.col_name, id=id)
            return jsonify(site)

        # list sites
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10
        page_num = args.get('page_num') or 1
        filter_str = args.get('filter')
        keyword = args.get('keyword')
        filter_ = {}
        if filter_str is not None:
            filter_ = json.loads(filter_str)
        if keyword is not None:
            # NOTE(review): `keyword` is used verbatim as a regular
            # expression; if it is untrusted user input consider escaping it
            # (re.escape) so that it is matched literally.
            filter_['$or'] = [
                {field: {'$regex': keyword}}
                for field in ('description', 'name', 'domain')
            ]

        items = db_manager.list(col_name=self.col_name,
                                cond=filter_,
                                limit=page_size,
                                skip=page_size * (page_num - 1),
                                sort_key='rank',
                                sort_direction=ASCENDING)

        sites = []
        for site in items:
            # get spider count
            site['spider_count'] = db_manager.count('spiders',
                                                    {'site': site['_id']})
            sites.append(site)

        return {
            'status': 'ok',
            'total_count': db_manager.count(self.col_name, filter_),
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(sites)
        }