def save(executeid, eventType='', extData=None):
    '''Record a crawl-finished notification and queue it for delivery.

    executeid: id of the task_execute row the notification is about.
    eventType: event type. The original notes list spider_ok,
        piping_filterword, piping_fingerprint, piping_keyword,
        piping_error_http_code and piping_ok.
    extData: optional dict of extra data merged into the request payload.
        The execution's own status/start_at/end_at win on key clashes.

    Returns the id of the inserted task_notify row, or False when no
    task_execute row is found for executeid.
    '''
    # A None default avoids sharing one dict object between calls.
    if extData is None:
        extData = {}

    execute = db.getbyid('task_execute', executeid)
    if not execute:
        # Nothing to notify about; report "not found" instead of crashing
        # on the subscript accesses below.
        return False

    startAt = formatTimestamp(execute['start_at']) if execute['start_at'] else ''
    endAt = formatTimestamp(execute['end_at']) if execute['end_at'] else ''
    requestData = dict(
        extData,
        status=execute['status'],
        start_at=startAt,
        end_at=endAt,
    )

    data = {
        'site_id': execute['site_id'],
        'task_id': execute['task_id'],
        'app_id': execute['app_id'],
        'execute_id': execute['id'],
        'event_type': eventType,
        'task_type': execute['task_type'],
        'notify_url': execute['notify_url'],
        'request_data': json.dumps(requestData, ensure_ascii=False),
    }
    notifyId = db.insert('task_notify', data)

    # The queued message carries the new row id plus the queue bookkeeping
    # keys (message id and batch id).
    data['id'] = notifyId
    data[mqidKey] = notifyId
    data[batchKey] = execute['id']
    Mq.produce([data], 'notify')
    return notifyId
def task_get_id(id):
    '''Fetch one task row by id, prepared for display.

    Returns False when no row matches. Otherwise returns the row with
    start_urls passed through _startUrls2Raw and create_at/update_at
    passed through formatTimestamp.
    '''
    found = db.fetchone('select * from task where id=:id', {'id': id})
    if found:
        found['start_urls'] = _startUrls2Raw(found['start_urls'])
        for column in ('create_at', 'update_at'):
            found[column] = formatTimestamp(found[column])
        return found
    return False
def getall():
    '''Return every task_notify row, newest id first, with formatted timestamps.'''

    def _with_timestamps(record):
        # Rewrite both audit columns in place, then hand the row back.
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        return record

    fetched = db.fetchall("select * from task_notify order by id desc")
    return [_with_timestamps(record) for record in fetched]
def get_id(settingid=None):
    '''Fetch one setting row by id.

    Returns the row with create_at/update_at formatted, or False when
    nothing matches.
    '''
    setting = db.fetchone("select * from setting where id=%s", [settingid])
    if not setting:
        return False
    for column in ('create_at', 'update_at'):
        setting[column] = formatTimestamp(setting[column])
    return setting
def get_id(appid=None):
    '''Fetch one app row by id, without its public key.

    Returns False when nothing matches. Otherwise returns the row with
    token_expired/create_at/update_at formatted and the 'public_key'
    entry removed.
    '''
    app = db.fetchone("select * from app where id=:id", {'id':appid})
    if not app:
        return False
    for column in ('token_expired', 'create_at', 'update_at'):
        app[column] = formatTimestamp(app[column])
    # The key material is stripped from the returned row.
    del app['public_key']
    return app
def getall():
    '''Return every setting row with create_at/update_at formatted.'''

    def _with_timestamps(record):
        record['create_at'] = formatTimestamp(record['create_at'])
        record['update_at'] = formatTimestamp(record['update_at'])
        return record

    return [_with_timestamps(record) for record in db.fetchall("select * from setting")]
def getall():
    '''Return every proxy row with create_at/update_at formatted.'''
    result = []
    for entry in db.fetchall("select * from proxy"):
        for column in ('create_at', 'update_at'):
            entry[column] = formatTimestamp(entry[column])
        result.append(entry)
    return result
def getall():
    '''Return every app row, newest id first, with formatted timestamps.'''
    stamp_columns = ('token_expired', 'create_at', 'update_at')

    def _with_timestamps(record):
        for column in stamp_columns:
            record[column] = formatTimestamp(record[column])
        return record

    fetched = db.fetchall("select * from app order by id desc")
    return [_with_timestamps(record) for record in fetched]
def execute_getnew_taskid(taskid):
    '''Fetch the most recent execution record of a task.

    taskid: id of the task whose executions are searched.

    Returns False when the task has no execution. Otherwise returns the
    task_execute row with the highest id. create_at/update_at are
    formatted; start_at/end_at are formatted when set and replaced with
    '' when empty.
    '''
    execute = db.fetchone(
        'select * from task_execute where task_id=:id order by id desc',
        {'id': taskid})
    if not execute:
        return False

    # The other task_execute readers in this file (e.g. execute_get_id)
    # treat start_at/end_at as possibly empty, so format them only when
    # present rather than passing an empty value to formatTimestamp.
    for column in ('start_at', 'end_at'):
        execute[column] = formatTimestamp(execute[column]) if execute[column] else ''
    for column in ('create_at', 'update_at'):
        execute[column] = formatTimestamp(execute[column])
    return execute
def task_getnew_id(taskid):
    '''Fetch the most recent execution record of a task.

    taskid: id of the task whose executions are searched.

    Returns False when the task has no execution. Otherwise returns the
    task_execute row with the highest id. create_at/update_at are
    formatted; start_at/end_at are formatted when set and replaced with
    '' when empty.
    '''
    execute = db.fetchone(
        'select * from task_execute where task_id=:task_id order by id desc limit 1',
        {'task_id': taskid})
    if not execute:
        # No execution yet: report "not found" instead of failing on the
        # subscript accesses below.
        return False

    for column in ('start_at', 'end_at'):
        execute[column] = formatTimestamp(execute[column]) if execute[column] else ''
    for column in ('create_at', 'update_at'):
        execute[column] = formatTimestamp(execute[column])
    return execute
def execute_get_id(executeid):
    '''Fetch one execution record by its execution id.

    Returns False when no task_execute row matches. Otherwise returns the
    row with create_at/update_at formatted and start_at/end_at formatted
    when set, '' when empty.
    '''
    record = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    if record:
        for column in ('start_at', 'end_at'):
            raw = record[column]
            record[column] = formatTimestamp(raw) if raw else ''
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        return record
    return False
def piping_getall_taskid(taskid = None):
    '''Collect the data-processing pipings configured for a task.

    Returns a dict mapping piping type to its task_piping row, with
    create_at/update_at formatted. Rows of type 'filterword' or 'keyword'
    also get a 'words' entry taken from the 'data' field of the linked
    piping_extend row.
    '''
    # NOTE(review): nesting reconstructed from a whitespace-collapsed
    # source; the final dict assignment is assumed to run for every row,
    # not only for the word-based types. Confirm against the original.
    rows = db.fetchall('select * from task_piping where task_id=:id', {'id': taskid})
    by_type = {}
    for entry in rows:
        for column in ('create_at', 'update_at'):
            entry[column] = formatTimestamp(entry[column])
        kind = entry['type']
        if kind in ('filterword', 'keyword'):
            extend = db.fetchone('select * from piping_extend where id=:id', {'id': entry['extend_id']})
            entry['words'] = extend['data']
        by_type[kind] = entry
    return by_type
def execute_getall_taskid(taskid):
    '''List all execution records of a task, newest id first.

    Returns False when the query yields nothing. Otherwise returns a list
    of task_execute rows with create_at/update_at formatted and
    start_at/end_at formatted when set, '' when empty.
    '''

    def _with_timestamps(record):
        for column in ('start_at', 'end_at'):
            raw = record[column]
            record[column] = formatTimestamp(raw) if raw else ''
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        return record

    fetched = db.fetchall(
        'select * from task_execute where task_id=:task_id order by id desc',
        {'task_id': taskid})
    if not fetched:
        return False
    return [_with_timestamps(record) for record in fetched]
def result_getall_executeid(executeid = None):
    '''Fetch the data-processing results recorded for one execution.

    Returns the fetched task_piping_result rows, each with create_at
    formatted and the 'result' column decoded from JSON.
    '''
    query = 'select task_id,execute_id,type,result,create_at from task_piping_result where execute_id=:eid'
    outcomes = db.fetchall(query, {'eid':executeid})
    for outcome in outcomes:
        created = formatTimestamp(outcome['create_at'])
        outcome['create_at'] = created
        decoded = json.loads(outcome['result'])
        outcome['result'] = decoded
    return outcomes
def task_getall(page=1, pagesize=20):
    '''Return one page of tasks, newest id first.

    page: page number. Falsy values and values below 1 become 1.
    pagesize: rows per page. Falsy values and values below 1 become 20.

    Returns False when the page is empty, otherwise a list of task rows
    with create_at/update_at formatted.
    '''
    page_no = _str2int(page) if page else 1
    per_page = _str2int(pagesize) if pagesize else 20
    page_no = 1 if page_no < 1 else page_no
    per_page = 20 if per_page < 1 else per_page

    params = {
        'limit': per_page,
        'offset': (page_no - 1) * per_page
    }
    fetched = db.fetchall(
        'select * from task order by id desc limit :limit offset :offset;',
        params)
    if not fetched:
        return False

    listing = []
    for record in fetched:
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        listing.append(record)
    return listing
def postgres2mongo(executeid):
    '''Copy the spider_url rows of one execution into mongodb.

    Rows are read through db, their four timestamp columns are formatted
    (empty values become ''), and the whole batch is inserted into the
    mongo 'spiderurl' collection.

    Returns False when the execution has no rows, True otherwise.

    example:
        #for executeid in list(range(10000)):
        #    postgres2mongo(executeid)
    '''
    records = db.fetchall('select * from spider_url where execute_id=:id', {'id': executeid})
    if not records:
        return False

    stamp_columns = ('start_at', 'end_at', 'create_at', 'update_at')
    for record in records:
        for column in stamp_columns:
            raw = record[column]
            record[column] = formatTimestamp(raw) if raw else ''

    mongoSpider['spiderurl'].insert(records)
    return True
def execute_init(eid):
    '''Initialise an execution: mirror it into mongo and enqueue its URLs.

    This function may be executed repeatedly (per the original note).

    eid: id of the task_execute row to initialise.

    Returns False when no task_execute row exists for eid, True otherwise.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.
    '''
    execute = db.getbyid('task_execute', eid)
    if not execute:
        return False

    # Format the timestamps, mapping empty values to ''.
    execute['create_at'] = formatTimestamp(
        execute['create_at']) if execute['create_at'] else ''
    execute['update_at'] = formatTimestamp(
        execute['update_at']) if execute['update_at'] else ''
    execute['start_at'] = formatTimestamp(
        execute['start_at']) if execute['start_at'] else ''
    execute['end_at'] = formatTimestamp(
        execute['end_at']) if execute['end_at'] else ''
    execute['status'] = 101

    # Insert a copy of the execution into mongo only if mgdb does not
    # already return one for this id.
    mgExecute = mgdb.execute_getbyid(eid)
    if not mgExecute:
        execute_spider = deepcopy(execute)
        mgdb.c_insert('execute', execute_spider, autoid=False)

    # Seed the 'spiderurl' collection when it holds fewer documents for
    # this execution than there are start URLs.
    startUrls = json.loads(execute['start_urls'])
    startUrlsLen = len(startUrls)
    urlCount = mongoSpider['spiderurl'].find({
        'execute_id': eid
    }, {
        '_id': 0
    }).count()
    if startUrlsLen > urlCount:
        urlRows = []
        for url in startUrls:
            urldata = {
                'site_id': execute['site_id'],
                'task_id': execute['task_id'],
                'app_id': execute['app_id'],
                'task_type': execute['task_type'],
                'execute_id': eid,
                'exec_level': execute['exec_level'],
                'url': url,
                'url_type': 'self',
                'method': 'get',
                'status': 0,
                'create_at': now_format(),
                'update_at': now_format(),
            }
            urlRows.append(urldata)
        mgdb.c_insert_batch('spiderurl', urlRows)

    # Load every spiderurl document of this execution and tag each one
    # with the queue bookkeeping keys (message id and batch id).
    undos = [
        i for i in mongoSpider['spiderurl'].find({'execute_id': eid}, {'_id': 0})
    ]
    undos_spider = []
    undos_mirror = []
    for undo in undos:
        undo[mqidKey] = undo['id']
        undo[batchKey] = undo['execute_id']
        # Both lists receive the same dict object, not separate copies.
        undos_spider.append(undo)
        undos_mirror.append(undo)

    # Count the messages already present for this batch in each of the
    # mq_spider_<stage> collections.
    pre = 'mq_spider_'
    stages = ['undo', 'ready', 'doing', 'done']
    stats = {
        stage: mongoMq[pre + stage].find({
            batchKey: eid
        }).count()
        for stage in stages
    }
    total = stats['undo'] + stats['ready'] + stats['doing'] + stats['done']

    # Produce messages only when the queues hold fewer entries than there
    # are start URLs.
    if startUrlsLen > total:
        # Add to the spider queue.
        Mq.produce(undos_spider, 'spider')
        # Add to the mirror queue.
        # NOTE(review): assumed to be nested under the check above; confirm.
        if execute['task_type'] == 'mirror':
            Mq.produce(undos_mirror, 'mirror')

    # Write status 101 back to task_execute only when the mongo copy was
    # missing at the start of this call.
    if not mgExecute:
        db.updatebyid('task_execute', {'status': 101}, eid)
    return True