def deploy_all(self):
    """
    Zip every deployable spider and upload the archive to every online node.

    Nodes are those stored with ``status == NodeStatus.ONLINE``; spiders are
    those whose document has a ``cmd`` field. One archive is built per
    (node, spider) pair and posted to that node's ``deploy_file`` endpoint.

    :return: a fixed success payload; the HTTP responses are not inspected
    """
    # active nodes
    nodes = db_manager.list('nodes', {'status': NodeStatus.ONLINE})

    # all spiders
    spiders = db_manager.list('spiders', {'cmd': {'$exists': True}})

    # iterate all nodes
    for node in nodes:
        node_id = node['_id']
        for spider in spiders:
            spider_id = spider['_id']
            spider_src = spider['src']

            # timestamp plus random digits keeps archive names distinct
            output_file_name = '%s_%s.zip' % (
                datetime.now().strftime('%Y%m%d%H%M%S'),
                str(random())[2:12])
            output_file_path = os.path.join(PROJECT_TMP_FOLDER,
                                            output_file_name)

            # zip source folder to zip file
            zip_file(source_dir=spider_src, output_filename=output_file_path)

            # upload to api; the context manager closes the archive handle
            # even if the request raises
            with open(output_file_path, 'rb') as archive:
                requests.post(
                    'http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                        node.get('ip'),
                        node.get('port'),
                        spider_id,
                        node_id,
                    ),
                    files={'file': archive})

    return {'status': 'ok', 'message': 'success'}
def get(self, id=None, action=None):
    """
    Handle GET requests for deploys.

    :param id: deploy id; when given without ``action`` that deploy is returned
    :param action: name of a method on this object to call with ``id``;
        an unknown name yields an error payload with HTTP 400
    :return: the action result, one deploy, or the list of all deploys with
        ``spider_name`` attached to each
    """
    # action by id
    if action is not None:
        if hasattr(self, action):
            return getattr(self, action)(id)
        return {
            'status': 'ok',
            'code': 400,
            'error': 'action "%s" invalid' % action
        }, 400

    # get one deploy
    if id is not None:
        return jsonify(db_manager.get('deploys', id=id))

    # get a list of items, each labelled with its spider's name
    deploys = []
    for record in db_manager.list('deploys', {}):
        owner = db_manager.get('spiders', id=str(record['spider_id']))
        record['spider_name'] = owner['name']
        deploys.append(record)
    return {'status': 'ok', 'items': jsonify(deploys)}
def deploy(self, id):
    """
    Zip the given spider and upload the archive to every stored node.

    :param id: spider id
    :return: a fixed success payload; the HTTP responses are not inspected
    """
    spider = db_manager.get('spiders', id=id)
    nodes = db_manager.list('nodes', {})

    for node in nodes:
        node_id = node['_id']

        # timestamp plus random digits keeps archive names distinct
        output_file_name = '%s_%s.zip' % (
            datetime.now().strftime('%Y%m%d%H%M%S'),
            str(random())[2:12])
        output_file_path = os.path.join(PROJECT_TMP_FOLDER, output_file_name)

        # zip source folder to zip file
        zip_file(source_dir=spider['src'], output_filename=output_file_path)

        # upload to api; the context manager closes the archive handle
        # even if the request raises
        with open(output_file_path, 'rb') as archive:
            requests.post(
                'http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                    node.get('ip'),
                    node.get('port'),
                    id,
                    node_id,
                ),
                files={'file': archive})

    return {'code': 200, 'status': 'ok', 'message': 'deploy success'}
def get_spider_col_fields(col_name):
    """
    Collect the distinct keys found in a sample of a collection.

    At most 100 documents are requested from ``db_manager.list`` with
    ``sort_key='_id'``.

    :param col_name: collection name
    :return: list of key names (order not defined)
    """
    sample = db_manager.list(col_name, {}, limit=100, sort_key='_id')
    seen = set()
    for doc in sample:
        seen.update(doc.keys())
    return list(seen)
def update(self):
    """
    Rebuild the scheduler's cron jobs from the spiders collection.

    All existing jobs are removed, then one cron job running
    ``self.execute_spider`` is added for every spider stored with
    ``cron_enabled == CronEnabled.ON``. The spider's ``cron`` string is split
    on single spaces and its first six parts are used, in order, as second,
    minute, hour, day, month and day_of_week.
    """
    # remove all existing periodic jobs
    self.scheduler.remove_all_jobs()

    # positional meaning of the space-separated cron parts
    cron_fields = ('second', 'minute', 'hour', 'day', 'month', 'day_of_week')

    # add new periodic jobs from database
    for spider in db_manager.list('spiders', {'cron_enabled': CronEnabled.ON}):
        parts = spider.get('cron').split(' ')
        timing = {name: parts[pos] for pos, name in enumerate(cron_fields)}
        self.scheduler.add_job(func=self.execute_spider,
                               trigger='cron',
                               args=(str(spider['_id']),),
                               jobstore='mongo',
                               **timing)
def get_results(self, id: str) -> (dict, tuple):
    """
    Return one page of the results crawled by a task.

    ``page_size`` (default 10) and ``page_num`` (default 1) are read from the
    request parser. Results come from the collection named by the ``col``
    field of the task's spider; when that field is empty an empty list is
    returned instead of a payload.

    :param id: task_id
    """
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1

    # resolve the results collection through the task's spider
    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []

    fields = get_spider_col_fields(col_name)
    offset = page_size * (page_num - 1)
    items = db_manager.list(col_name, {'task_id': id},
                            skip=offset, limit=page_size)

    response = {'status': 'ok'}
    response['fields'] = jsonify(fields)
    response['total_count'] = db_manager.count(col_name, {'task_id': id})
    response['page_num'] = page_num
    response['page_size'] = page_size
    response['items'] = jsonify(items)
    return response
def deploy(self, id: str) -> (dict, tuple):
    """
    Submit HTTP requests to deploy the given spider to all online nodes.

    The spider's source folder is zipped once per node and the archive is
    posted to that node's ``deploy_file`` endpoint.

    :param id: spider id
    :return: a fixed success payload; the HTTP responses are not inspected
    """
    spider = db_manager.get('spiders', id=id)
    nodes = db_manager.list('nodes', {'status': NodeStatus.ONLINE})

    for node in nodes:
        node_id = node['_id']

        # timestamp plus random digits keeps archive names distinct
        output_file_name = '%s_%s.zip' % (
            datetime.now().strftime('%Y%m%d%H%M%S'),
            str(random())[2:12])
        output_file_path = os.path.join(PROJECT_TMP_FOLDER, output_file_name)

        # zip source folder to zip file
        zip_file(source_dir=spider['src'], output_filename=output_file_path)

        # upload to api; the context manager closes the archive handle
        # even if the request raises
        with open(output_file_path, 'rb') as archive:
            requests.post(
                'http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                    node.get('ip'),
                    node.get('port'),
                    id,
                    node_id,
                ),
                files={'file': archive})

        # TODO: checkpoint for errors

    return {'code': 200, 'status': 'ok', 'message': 'deploy success'}
def update(self):
    """
    Rebuild the scheduler's cron jobs from the ``schedules`` collection.

    Existing jobs are removed from the scheduler and the Mongo collection
    ``self.mongo[MONGO_DB][self.task_col]`` is cleared. Each schedule then
    becomes a cron job that calls ``self.execute_spider`` with the spider id
    and the schedule's ``params``. The ``cron`` string is split on single
    spaces and its first six parts are used, in order, as second, minute,
    hour, day, month and day_of_week. The resulting jobs and scheduler state
    are printed.
    """
    print('updating...')

    # remove all existing periodic jobs
    self.scheduler.remove_all_jobs()
    self.mongo[MONGO_DB][self.task_col].remove()

    # positional meaning of the space-separated cron parts
    cron_fields = ('second', 'minute', 'hour', 'day', 'month', 'day_of_week')

    for schedule in db_manager.list('schedules', {}):
        parts = schedule.get('cron').split(' ')
        timing = {name: parts[pos] for pos, name in enumerate(cron_fields)}
        job_args = (str(schedule['spider_id']), schedule.get('params'))
        self.scheduler.add_job(func=self.execute_spider,
                               args=job_args,
                               trigger='cron',
                               jobstore='mongo',
                               **timing)

    self.scheduler.print_jobs(jobstore='mongo')
    print(f'state: {self.scheduler.state}')
    print(f'running: {self.scheduler.running}')
def get(self, id=None, action=None):
    """
    Handle GET requests for nodes.

    :param id: node id; when given without ``action`` that node is returned
    :param action: name of a method on this object to call with ``id``;
        an unknown name yields an error payload with HTTP 400
    :return: the action result, one node, or the list of all nodes
    """
    # action by id
    if action is not None:
        if hasattr(self, action):
            return getattr(self, action)(id)
        return {
            'status': 'ok',
            'code': 400,
            'error': 'action "%s" invalid' % action
        }, 400

    # get one node
    if id is not None:
        return db_manager.get('nodes', id=id)

    # refresh node status before listing
    update_nodes_status()

    node_list = db_manager.list('nodes', {})
    return {'status': 'ok', 'items': jsonify(node_list)}
def get(self, id: str = None, action: str = None) -> (dict, tuple):
    """
    GET method of DeployAPI.

    When listing, deploys whose spider can no longer be found are removed
    from the ``deploys`` collection and left out of the response.

    :param id: deploy_id
    :param action: name of a method on this object to call with ``id``;
        an unknown name yields an error payload with HTTP 400
    :return: the action result, one deploy, or the list of deploys with
        ``spider_name`` attached to each
    """
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one deploy
    elif id is not None:
        return jsonify(db_manager.get('deploys', id=id))

    # get a list of items
    else:
        items = db_manager.list('deploys', {})
        deploys = []
        for item in items:
            spider_id = item['spider_id']
            spider = db_manager.get('spiders', id=str(spider_id))
            if spider is None:
                # orphaned deploy: purge it and skip, since there is no
                # spider to read a name from
                db_manager.remove('deploys', {'spider_id': spider_id})
                continue
            item['spider_name'] = spider['name']
            deploys.append(item)
        return {
            'status': 'ok',
            'items': jsonify(deploys)
        }
def get_results(self, id):
    """
    Return every result stored for a task together with the field names.

    Results come from the collection named by the ``col`` field of the
    task's spider; when that field is empty an empty list is returned.

    :param id: task_id
    """
    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')

    if col_name:
        fields = get_spider_col_fields(col_name)
        items = db_manager.list(col_name, {'task_id': id})
        payload = {'status': 'ok', 'fields': fields, 'items': items}
        return jsonify(payload)

    # spider has no results collection configured
    return []
def get(self, id=None, action=None):
    """
    Generic GET handler for the collection ``self.col_name``.

    :param id: item id; when given without ``action`` that item is returned
    :param action: name of a method on this object to call with ``id``;
        an unknown name yields an error payload with HTTP 400
    :return: the action result, one item, or a page of items selected by the
        parsed ``filter``, ``page`` (default 1) and ``page_size`` (default 10)
    """
    args = self.parser.parse_args()

    # action by id
    if action is not None:
        if hasattr(self, action):
            return getattr(self, action)(id)
        return {
            'status': 'ok',
            'code': 400,
            'error': 'action "%s" invalid' % action
        }, 400

    # get item by id
    if id is not None:
        return jsonify(db_manager.get(col_name=self.col_name, id=id))

    # list items: fall back to defaults for anything the request omitted
    cond = args.filter if args.get('filter') is not None else {}
    page = args.page if args.get('page') is not None else 1
    page_size = args.page_size if args.get('page_size') is not None else 10

    # TODO: sort functionality

    total_count = db_manager.count(col_name=self.col_name, cond=cond)
    items = db_manager.list(col_name=self.col_name,
                            cond=cond,
                            skip=(page - 1) * page_size,
                            limit=page_size)

    # TODO: getting status for node

    return jsonify({
        'status': 'ok',
        'total_count': total_count,
        'page': page,
        'page_size': page_size,
        'items': items
    })
def get_tasks(self, id):
    """
    List up to 10 tasks of a node, each labelled with its spider's name.

    Tasks are requested with ``sort_key='create_ts'``.

    :param id: node id
    """
    node_tasks = db_manager.list('tasks', {'node_id': id},
                                 limit=10, sort_key='create_ts')
    for entry in node_tasks:
        owner = db_manager.get('spiders', id=str(entry['spider_id']))
        entry['spider_name'] = owner['name']
    return {'status': 'ok', 'items': jsonify(node_tasks)}
def get_last_n_run_errors_count(spider_id: ObjectId, n: int) -> int:
    """
    Count the failed tasks among up to ``n`` tasks of a spider.

    Tasks are requested from ``db_manager.list`` with ``sort_key='create_ts'``
    and ``limit=n``; a task counts as failed when its ``status`` equals
    ``TaskStatus.FAILURE``.

    :param spider_id: spider id the tasks belong to
    :param n: maximum number of tasks to inspect
    :return: number of inspected tasks that failed
    """
    tasks = db_manager.list(col_name='tasks',
                            cond={'spider_id': spider_id},
                            sort_key='create_ts',
                            limit=n)
    return sum(1 for task in tasks if task['status'] == TaskStatus.FAILURE)
def get_spider_col_fields(col_name: str) -> list:
    """
    Return the distinct keys seen in a sample of a spider collection.

    At most 100 documents are requested, with ``sort_key='_id'``.

    :param col_name: collection name
    :return: list of key names (order not defined)
    """
    docs = db_manager.list(col_name, {}, limit=100, sort_key='_id')
    return list({key for doc in docs for key in doc.keys()})
def get_deploys(self, id):
    """
    List up to 10 deploys of a node, each labelled with its spider's name.

    Deploys are requested with ``sort_key='finish_ts'``.

    :param id: node id
    """
    deploys = []
    for record in db_manager.list('deploys', {'node_id': id},
                                  limit=10, sort_key='finish_ts'):
        owner = db_manager.get('spiders', id=str(record['spider_id']))
        record['spider_name'] = owner['name']
        deploys.append(record)
    return {'status': 'ok', 'items': jsonify(deploys)}
def download_results(self, id: str):
    """
    Return the results of a task through ``send_csv``.

    Results come from the collection named by the ``col`` field of the
    task's spider. When that field is empty, ``send_csv`` is called with no
    rows. The file name embeds the collection name and the current time
    rounded to whole seconds.

    :param id: task_id
    """
    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')

    if col_name:
        # the very large limit is how the original asks for "everything"
        items = db_manager.list(col_name, {'task_id': id}, limit=999999999)
        fields = get_spider_col_fields(col_name, task_id=id, limit=999999999)
        return send_csv(items,
                        filename=f'results_{col_name}_{round(time())}.csv',
                        fields=fields,
                        encoding='utf-8')

    # spider has no results collection configured
    return send_csv([], f'results_{col_name}_{round(time())}.csv')
def get_deploys(self, id):
    """
    List up to 10 deploys of a spider, each labelled with the spider's name.

    Deploys are requested with ``sort_key='finish_ts'``.

    :param id: spider id (converted to ``ObjectId`` for the query)
    """
    query = {'spider_id': ObjectId(id)}
    deploys = []
    for record in db_manager.list('deploys', cond=query,
                                  limit=10, sort_key='finish_ts'):
        owner = db_manager.get('spiders', id=str(record['spider_id']))
        record['spider_name'] = owner['name']
        deploys.append(record)
    return jsonify({'status': 'ok', 'items': deploys})
def get_tasks(self, id):
    """
    List up to 10 tasks of a spider.

    Each task gets its spider's name, and a task whose ``status`` is missing
    or None is marked ``TaskStatus.UNAVAILABLE``. Tasks are requested with
    ``sort_key='create_ts'``.

    :param id: spider id (converted to ``ObjectId`` for the query)
    """
    spider_tasks = db_manager.list('tasks',
                                   cond={'spider_id': ObjectId(id)},
                                   limit=10,
                                   sort_key='create_ts')
    for entry in spider_tasks:
        owner = db_manager.get('spiders', id=str(entry['spider_id']))
        entry['spider_name'] = owner['name']
        if entry.get('status') is None:
            entry['status'] = TaskStatus.UNAVAILABLE
    return {'status': 'ok', 'items': jsonify(spider_tasks)}
def get(self, id=None, action=None):
    """
    Handle GET requests for tasks.

    :param id: task id; when given without ``action`` that task is returned
        with its spider name and the content of its log file (empty string
        when the log cannot be read). If a matching ``tasks_celery`` record
        exists, its ``result`` is copied in, and its ``status`` too when the
        task has none of its own
    :param action: name of a method on this object to call with ``id``;
        an unknown name yields an error payload with HTTP 400
    :return: the action result, one task, or a page of tasks selected by the
        parsed ``page_size`` (default 10) and ``page_num`` (default 1)
    """
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    elif id is not None:
        task = db_manager.get('tasks', id=id)
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            if not task.get('status'):
                task['status'] = _task['status']
            # only read the result when the celery record exists; indexing
            # a missing record would raise TypeError
            task['result'] = _task['result']
        task['spider_name'] = _spider['name']
        try:
            with open(task['log_file_path']) as f:
                task['log'] = f.read()
        except Exception:
            # best effort: a missing or unreadable log yields an empty log
            task['log'] = ''
        return jsonify(task)

    # list tasks
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1
    tasks = db_manager.list('tasks', {},
                            limit=page_size,
                            skip=page_size * (page_num - 1),
                            sort_key='finish_ts')
    items = []
    for task in tasks:
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            task['status'] = _task['status']
        else:
            task['status'] = TaskStatus.UNAVAILABLE
        task['spider_name'] = _spider['name']
        items.append(task)
    return {
        'status': 'ok',
        'total_count': db_manager.count('tasks', {}),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }
def get_tasks(self, id):
    """
    List up to 10 tasks of a node with spider name and status.

    The status is taken from the matching ``tasks_celery`` record, or set to
    ``TaskStatus.UNAVAILABLE`` when no such record is found. Tasks are
    requested with ``sort_key='create_ts'``.

    :param id: node id
    """
    node_tasks = db_manager.list('tasks', {'node_id': id},
                                 limit=10, sort_key='create_ts')
    for entry in node_tasks:
        owner = db_manager.get('spiders', id=str(entry['spider_id']))
        entry['spider_name'] = owner['name']

        celery_record = db_manager.get('tasks_celery', id=entry['_id'])
        entry['status'] = (celery_record['status'] if celery_record
                           else TaskStatus.UNAVAILABLE)
    return {'status': 'ok', 'items': jsonify(node_tasks)}
def get_tasks(self, id):
    """
    List up to 10 tasks of a spider with spider name and status.

    The status is taken from the matching ``tasks_celery`` record, or set to
    ``TaskStatus.UNAVAILABLE`` when that lookup returns None. Tasks are
    requested with ``sort_key='finish_ts'``.

    :param id: spider id (converted to ``ObjectId`` for the query)
    """
    spider_tasks = db_manager.list('tasks',
                                   cond={'spider_id': ObjectId(id)},
                                   limit=10,
                                   sort_key='finish_ts')
    for entry in spider_tasks:
        owner = db_manager.get('spiders', id=str(entry['spider_id']))
        entry['spider_name'] = owner['name']

        celery_record = db_manager.get('tasks_celery', id=entry['_id'])
        entry['status'] = (TaskStatus.UNAVAILABLE if celery_record is None
                           else celery_record['status'])
    return jsonify({'status': 'ok', 'items': spider_tasks})
def get_tasks(self, id: str) -> (dict, tuple):
    """
    Get a list of up to 10 tasks of the given spider.

    Each task gets its spider's name, and a task whose ``status`` is missing
    or None is marked ``TaskStatus.UNAVAILABLE``. Tasks are requested with
    ``sort_key='create_ts'``.

    :param id: spider_id
    """
    query = {'spider_id': ObjectId(id)}
    spider_tasks = db_manager.list('tasks', cond=query,
                                   limit=10, sort_key='create_ts')
    for entry in spider_tasks:
        owner = db_manager.get('spiders', id=str(entry['spider_id']))
        entry['spider_name'] = owner['name']
        if entry.get('status') is None:
            entry['status'] = TaskStatus.UNAVAILABLE
    return {'status': 'ok', 'items': jsonify(spider_tasks)}
def get_deploys(self, id: str) -> (dict, tuple):
    """
    Get a list of up to 10 deploys of the given spider.

    Each deploy is labelled with its spider's name. Deploys are requested
    with ``sort_key='finish_ts'``.

    :param id: spider_id
    """
    query = {'spider_id': ObjectId(id)}
    deploys = []
    for record in db_manager.list('deploys', cond=query,
                                  limit=10, sort_key='finish_ts'):
        owner = db_manager.get('spiders', id=str(record['spider_id']))
        record['spider_name'] = owner['name']
        deploys.append(record)
    return {'status': 'ok', 'items': jsonify(deploys)}
def get_spider_col_fields(col_name: str, task_id: str = None, limit: int = 100) -> list:
    """
    Return the distinct keys seen in a sample of a spider collection.

    :param col_name: collection name
    :param task_id: when not None, only documents with this ``task_id`` are
        sampled
    :param limit: maximum number of documents to sample
    :return: list of key names (order not defined)
    """
    filter_ = {} if task_id is None else {'task_id': task_id}
    docs = db_manager.list(col_name, filter_, limit=limit, sort_key='_id')

    fields = set()
    for doc in docs:
        fields.update(doc.keys())
    return list(fields)
def get_results(self, id: str) -> (dict, tuple):
    """
    Get a display-friendly list of the results crawled in a given task.

    Only string values are kept. Keys listed in ``IGNORE_FIELD`` are dropped,
    and values longer than 500 characters are cut to 500 and suffixed with
    ``'...'``. All matching items are returned; ``page_num`` in the response
    is the number of pages they span at the requested ``page_size``
    (default 10), never less than 1.

    When the task's spider has no ``col`` configured, an empty list is
    returned instead of a payload.

    :param id: task_id
    """
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10

    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []

    fields = get_spider_col_fields(col_name)
    fields = list(set(fields) - set(IGNORE_FIELD))
    items = db_manager.list(col_name, {'task_id': id})

    # Cap overly long content, and drop fields that are not worth displaying
    adjust_items = []
    for item in items:
        adjust_item = {}
        for key, value in item.items():
            if not isinstance(value, str) or key in IGNORE_FIELD:
                continue
            if len(value) > 500:
                value = value[:500] + '...'
            adjust_item[key] = value
        adjust_items.append(adjust_item)

    # Integer ceiling division: true division always yields a float, which
    # made the old "round up" branch add a page even for exact multiples.
    # An empty result still reports one page, as before.
    page_num = max(1, (len(adjust_items) + page_size - 1) // page_size)

    return {
        'status': 'ok',
        'fields': jsonify(fields),
        'total_count': len(adjust_items),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(adjust_items)
    }
def get(self, id=None, action=None):
    """
    Handle GET requests for tasks.

    :param id: task id; when given without ``action`` that task is returned
        with status and result from its ``tasks_celery`` record, its spider
        name and the content of its log file (empty string when the log
        cannot be read). Without a celery record the status is
        ``TaskStatus.UNAVAILABLE`` and the result is None
    :param action: name of a method on this object to call with ``id``;
        an unknown name yields an error payload with HTTP 400
    :return: the action result, one task, or a list of up to 1000 tasks
    """
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    elif id is not None:
        task = db_manager.get('tasks', id=id)
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            task['status'] = _task['status']
            task['result'] = _task['result']
        else:
            # no celery record: indexing it for the result would raise
            # TypeError, so report an explicit empty result instead
            task['status'] = TaskStatus.UNAVAILABLE
            task['result'] = None
        task['spider_name'] = _spider['name']
        try:
            with open(task['log_file_path']) as f:
                task['log'] = f.read()
        except Exception:
            # best effort: a missing or unreadable log yields an empty log
            task['log'] = ''
        return jsonify(task)

    tasks = db_manager.list('tasks', {}, limit=1000, sort_key='finish_ts')
    items = []
    for task in tasks:
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            task['status'] = _task['status']
        else:
            task['status'] = TaskStatus.UNAVAILABLE
        task['spider_name'] = _spider['name']
        items.append(task)
    return jsonify({
        'status': 'ok',
        'items': items
    })
def get_results(self, id):
    """
    Return one page of the results crawled by a task.

    ``page_size`` (default 10) and ``page_num`` (default 1) are read from the
    request parser and applied to the query, so the returned items match the
    paging values echoed in the response. When the task's spider has no
    ``col`` configured, an empty list is returned instead of a payload.

    :param id: task_id
    """
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1

    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []

    fields = get_spider_col_fields(col_name)
    # restrict the query to the requested page; previously the paging
    # arguments were parsed but every item was returned
    items = db_manager.list(col_name, {'task_id': id},
                            skip=page_size * (page_num - 1),
                            limit=page_size)
    return {
        'status': 'ok',
        'fields': jsonify(fields),
        'total_count': db_manager.count(col_name, {'task_id': id}),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }
def get(self, id: str = None, action: str = None) -> (dict, tuple):
    """
    GET method for retrieving item information.

    If id is specified and action is not, return the object of the given id;
    If action is specified, call the method of that name with the given id;
    If neither id nor action is specified, return the list of items given
    the page_size, page_num and filter.

    :param id: item id
    :param action: name of a method on this object; an unknown name yields
        an error payload with HTTP 400
    :return: the action result, one item, or one page of items
    """
    args = self.parser.parse_args()

    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # list items
    elif id is None:
        # filter
        cond = {}
        if args.get('filter') is not None:
            cond = args.filter

        # page number: read the same argument that is checked
        # (previously 'page_num' was tested but 'page' was read)
        page = 1
        if args.get('page_num') is not None:
            page = args.get('page_num')

        # page size
        page_size = 10
        if args.get('page_size') is not None:
            page_size = args.page_size

        # TODO: sort functionality

        # total count
        total_count = db_manager.count(col_name=self.col_name, cond=cond)

        # items
        items = db_manager.list(col_name=self.col_name,
                                cond=cond,
                                skip=(page - 1) * page_size,
                                limit=page_size)

        # TODO: getting status for node

        return {
            'status': 'ok',
            'total_count': total_count,
            'page_num': page,
            'page_size': page_size,
            'items': jsonify(items)
        }

    # get item by id
    else:
        return jsonify(db_manager.get(col_name=self.col_name, id=id))
def get(self, id: str = None, action: str = None):
    """
    GET method of TaskAPI.

    Every returned task carries ``num_results`` (0 when its spider or the
    spider's results collection is unknown). It also carries ``spider_name``
    when the spider exists, and ``duration`` plus ``avg_num_results`` when
    the task has a ``finish_ts``.

    :param id: item id; when given without ``action`` that task is returned
        together with the content of its log file (empty string when the log
        cannot be read)
    :param action: name of a method on this object to call with ``id``;
        an unknown name yields an error payload with HTTP 400
    :return: the action result, one task, or a page of tasks selected by the
        parsed ``page_size`` (default 10), ``page_num`` (default 1) and JSON
        ``filter``
    """

    def attach_stats(task, spider):
        """Add spider name, result count, duration and result rate to task."""
        task['num_results'] = 0
        if spider:
            # spider name
            task['spider_name'] = spider['name']

            # number of results
            col = spider.get('col')
            if col:
                task['num_results'] = db_manager.count(
                    col, {'task_id': task['_id']})

        # duration
        if task.get('finish_ts') is not None:
            duration = (task['finish_ts'] - task['create_ts']).total_seconds()
            task['duration'] = duration
            # a task that finished in the instant it was created has no
            # meaningful rate; report 0.0 rather than dividing by zero
            task['avg_num_results'] = (
                round(task['num_results'] / duration, 1) if duration else 0.0)

    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    elif id is not None:
        task = db_manager.get(col_name=self.col_name, id=id)
        spider = db_manager.get(col_name='spiders', id=str(task['spider_id']))
        attach_stats(task, spider)

        try:
            with open(task['log_file_path']) as f:
                task['log'] = f.read()
        except Exception:
            # best effort: a missing or unreadable log yields an empty log
            task['log'] = ''
        return jsonify(task)

    # list tasks
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1
    filter_str = args.get('filter')
    filter_ = {}
    if filter_str is not None:
        filter_ = json.loads(filter_str)
        if filter_.get('spider_id'):
            # the query needs an ObjectId, the request carries a string
            filter_['spider_id'] = ObjectId(filter_['spider_id'])
    tasks = db_manager.list(col_name=self.col_name,
                            cond=filter_,
                            limit=page_size,
                            skip=page_size * (page_num - 1),
                            sort_key='create_ts')
    items = []
    for task in tasks:
        # get spider
        _spider = db_manager.get(col_name='spiders',
                                 id=str(task['spider_id']))

        # status
        if task.get('status') is None:
            task['status'] = TaskStatus.UNAVAILABLE

        attach_stats(task, _spider)
        items.append(task)

    return {
        'status': 'ok',
        'total_count': db_manager.count('tasks', filter_),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }