示例#1
0
def save(executeid, eventType = '', extData = None):
    '''Record a notification for an execution and queue it for delivery.

    executeid  ID of the row in `task_execute` the notification is about.
    eventType  Event type string stored in `event_type`. The original notes
               list: spider_ok, piping_filterword, piping_fingerprint,
               piping_keywor (presumably `piping_keyword` -- confirm),
               piping_error_http_code, piping_ok.
    extData    Optional dict of extra fields merged into the request payload.
               The execution's own `status`, `start_at` and `end_at` values
               take precedence over same-named keys in extData.

    Returns the ID of the inserted `task_notify` row, or False when no
    execution with the given ID exists.
    '''
    execute = db.getbyid('task_execute', executeid)
    # Without this guard a missing row fails with a TypeError on subscripting.
    if not execute:
        return False

    startAt = formatTimestamp(execute['start_at']) if execute['start_at'] else ''
    endAt = formatTimestamp(execute['end_at']) if execute['end_at'] else ''

    # Copy the caller's data first so the execution fields below win on conflict
    # and the caller's dict is never modified.
    requestData = dict(extData or {})
    requestData.update({'status': execute['status'], 'start_at': startAt, 'end_at': endAt})

    data = {
        'site_id': execute['site_id'],
        'task_id': execute['task_id'],
        'app_id': execute['app_id'],
        'execute_id': execute['id'],
        'event_type': eventType,
        'task_type': execute['task_type'],
        'notify_url': execute['notify_url'],
        'request_data': json.dumps(requestData, ensure_ascii=False),
    }
    notifyId = db.insert('task_notify', data)

    # The queued message carries the new row ID plus the queue bookkeeping keys
    # (mqidKey / batchKey are names defined outside this function).
    data['id'] = notifyId
    data[mqidKey] = notifyId
    data[batchKey] = execute['id']
    Mq.produce([data], 'notify')
    return notifyId
示例#2
0
def task_get_id(id):
    '''Fetch one row from `task` by ID.

    Returns the row with `start_urls` passed through _startUrls2Raw and the
    `create_at` / `update_at` columns passed through formatTimestamp, or
    False when no row matches.
    '''
    record = db.fetchone('select * from task where id=:id', {'id': id})
    if record:
        record['start_urls'] = _startUrls2Raw(record['start_urls'])
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        return record
    return False
示例#3
0
def getall():
    '''Return every `task_notify` row, newest ID first.

    The `create_at` and `update_at` columns of each row are replaced by
    their formatTimestamp output.
    '''
    notifies = []
    for record in db.fetchall("select * from task_notify order by id desc"):
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        notifies.append(record)
    return notifies
示例#4
0
def get_id(settingid=None):
    '''Fetch one row from `setting` by ID.

    Returns the row with `create_at` / `update_at` passed through
    formatTimestamp, or False when no row matches.
    '''
    record = db.fetchone("select * from setting where id=%s", [settingid])
    if not record:
        return False
    for column in ('create_at', 'update_at'):
        record[column] = formatTimestamp(record[column])
    return record
示例#5
0
def get_id(appid=None):
    '''Fetch one row from `app` by ID.

    Returns the row with `token_expired`, `create_at` and `update_at`
    passed through formatTimestamp and with the `public_key` entry removed,
    or False when no row matches.
    '''
    record = db.fetchone("select * from app where id=:id", {'id': appid})
    if not record:
        return False
    for column in ('token_expired', 'create_at', 'update_at'):
        record[column] = formatTimestamp(record[column])
    # The key is dropped from the returned row.
    del record['public_key']
    return record
示例#6
0
def getall():
    '''Return every `setting` row.

    The `create_at` and `update_at` columns of each row are replaced by
    their formatTimestamp output.
    '''
    settings = []
    for record in db.fetchall("select * from setting"):
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        settings.append(record)
    return settings
示例#7
0
def getall():
    '''Return every `proxy` row.

    The `create_at` and `update_at` columns of each row are replaced by
    their formatTimestamp output.
    '''
    proxies = []
    for record in db.fetchall("select * from proxy"):
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        proxies.append(record)
    return proxies
示例#8
0
def getall():
    '''Return every `app` row, newest ID first.

    The `token_expired`, `create_at` and `update_at` columns of each row
    are replaced by their formatTimestamp output.
    '''
    apps = []
    for record in db.fetchall("select * from app order by id desc"):
        for column in ('token_expired', 'create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        apps.append(record)
    return apps
示例#9
0
def execute_getnew_taskid(taskid):
    '''Get the most recent execution of a task.

    taskid  ID of the task whose executions are searched.

    Returns the `task_execute` row with the highest ID for that task, with
    its timestamp columns formatted, or False when the task has no
    execution. Empty `start_at` / `end_at` values become '' instead of being
    passed to formatTimestamp, matching the other execute getters.
    '''
    # Only one row is consumed, so the query asks for one row.
    execute = db.fetchone(
        'select * from task_execute where task_id=:id order by id desc limit 1',
        {'id': taskid})
    if not execute: return False
    # These two columns are guarded because sibling getters treat them as
    # possibly empty (presumably unset until the run starts/finishes).
    execute['start_at'] = formatTimestamp(
        execute['start_at']) if execute['start_at'] else ''
    execute['end_at'] = formatTimestamp(
        execute['end_at']) if execute['end_at'] else ''
    execute['create_at'] = formatTimestamp(execute['create_at'])
    execute['update_at'] = formatTimestamp(execute['update_at'])
    return execute
示例#10
0
def task_getnew_id(taskid):
    '''Get the most recent execution of a task.

    taskid  ID of the task whose executions are searched.

    Returns the `task_execute` row with the highest ID for that task, with
    its timestamp columns formatted (empty `start_at` / `end_at` become ''),
    or False when the task has no execution.
    '''
    execute = db.fetchone(
        'select * from task_execute where task_id=:task_id order by id desc limit 1',
        {'task_id': taskid})
    # A task with no execution yields no row; return False like the sibling
    # getters instead of failing on the subscripts below.
    if not execute: return False
    execute['start_at'] = formatTimestamp(
        execute['start_at']) if execute['start_at'] else ''
    execute['end_at'] = formatTimestamp(
        execute['end_at']) if execute['end_at'] else ''
    execute['create_at'] = formatTimestamp(execute['create_at'])
    execute['update_at'] = formatTimestamp(execute['update_at'])
    return execute
示例#11
0
def execute_get_id(executeid):
    '''Get one execution by its ID.

    Returns the `task_execute` row with its timestamp columns formatted
    (empty `start_at` / `end_at` become ''), or False when no row matches.
    '''
    record = db.fetchone('select * from task_execute where id=:id',
                         {'id': executeid})
    if not record:
        return False
    # Empty start/end values are replaced by '' rather than formatted.
    for column in ('start_at', 'end_at'):
        record[column] = formatTimestamp(record[column]) if record[column] else ''
    for column in ('create_at', 'update_at'):
        record[column] = formatTimestamp(record[column])
    return record
示例#12
0
def piping_getall_taskid(taskid = None):
    '''Get the data-processing pipings configured for a task.

    Returns a dict mapping each piping's `type` to its `task_piping` row.
    Rows of type 'filterword' or 'keyword' additionally get a `words` entry
    taken from the `data` column of the `piping_extend` row referenced by
    `extend_id`. If several rows share a type, the last one read wins.
    '''
    byType = {}
    query = 'select * from task_piping where task_id=:id'
    for entry in db.fetchall(query, {'id': taskid}):
        for column in ('create_at', 'update_at'):
            entry[column] = formatTimestamp(entry[column])
        kind = entry['type']
        if kind in ['filterword', 'keyword']:
            extend = db.fetchone('select * from piping_extend where id=:id', {'id': entry['extend_id']})
            entry['words'] = extend['data']
        byType[kind] = entry
    return byType
示例#13
0
def execute_getall_taskid(taskid):
    '''Get every execution of a task, newest ID first.

    Returns a list of `task_execute` rows with timestamp columns formatted
    (empty `start_at` / `end_at` become ''), or False when there are none.
    '''
    found = db.fetchall(
        'select * from task_execute where task_id=:task_id order by id desc',
        {'task_id': taskid})
    if not found:
        return False
    executes = []
    for record in found:
        # Empty start/end values are replaced by '' rather than formatted.
        for column in ('start_at', 'end_at'):
            record[column] = formatTimestamp(record[column]) if record[column] else ''
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        executes.append(record)
    return executes
示例#14
0
def result_getall_executeid(executeid = None):
    '''Get the data-processing results of an execution.

    Returns whatever db.fetchall yields for the execution's
    `task_piping_result` rows, with each row's `create_at` formatted and
    its `result` column decoded from JSON.
    '''
    query = ('select task_id,execute_id,type,result,create_at '
             'from task_piping_result where execute_id=:eid')
    results = db.fetchall(query, {'eid':executeid})
    for item in results:
        item['create_at'] = formatTimestamp(item['create_at'])
        item['result'] = json.loads(item['result'])
    return results
示例#15
0
def task_getall(page=1, pagesize=20):
    '''Get one page of tasks, newest ID first.

    page      Page number; falsy values or values below 1 fall back to 1.
    pagesize  Rows per page; falsy values or values below 1 fall back to 20.
    Both arguments are converted with _str2int when truthy.

    Returns a list of `task` rows with `create_at` / `update_at` formatted,
    or False when the page is empty.
    '''
    pageNo = 1 if not page else _str2int(page)
    perPage = 20 if not pagesize else _str2int(pagesize)
    if pageNo < 1:
        pageNo = 1
    if perPage < 1:
        perPage = 20

    params = {
        'limit': perPage,
        'offset': (pageNo - 1) * perPage
    }
    found = db.fetchall(
        'select * from task order by id desc limit :limit offset :offset;',
        params)
    if not found:
        return False

    tasks = []
    for record in found:
        for column in ('create_at', 'update_at'):
            record[column] = formatTimestamp(record[column])
        tasks.append(record)
    return tasks
示例#16
0
def postgres2mongo(executeid):
    '''Copy the `spider_url` rows of one execution into the mongo
    `spiderurl` collection.

    Timestamp columns are formatted first; empty values become ''.
    Returns True after the insert, or False when the execution has no rows.

    example:
        #for executeid in list(range(10000)):
        #    postgres2mongo(executeid)
    '''
    records = db.fetchall('select * from spider_url where execute_id=:id',
                          {'id': executeid})
    if not records:
        return False
    timeColumns = ('start_at', 'end_at', 'create_at', 'update_at')
    for record in records:
        for column in timeColumns:
            record[column] = formatTimestamp(record[column]) if record[column] else ''
    # NOTE(review): `insert` looks like the legacy pymongo bulk call -- confirm
    # the installed driver version still provides it.
    mongoSpider['spiderurl'].insert(records)
    return True
示例#17
0
def execute_init(eid):
    '''Initialise an execution: mirror it into mongo, seed its URL rows and
    queue them for crawling.

    This function may be executed repeatedly (per the original note); each
    step below checks existing state before inserting or enqueueing.

    eid  ID of the row in `task_execute`.

    Returns False when no such execution exists, otherwise True.
    '''
    execute = db.getbyid('task_execute', eid)
    if not execute: return False
    # Format the timestamp columns; empty values become ''.
    execute['create_at'] = formatTimestamp(
        execute['create_at']) if execute['create_at'] else ''
    execute['update_at'] = formatTimestamp(
        execute['update_at']) if execute['update_at'] else ''
    execute['start_at'] = formatTimestamp(
        execute['start_at']) if execute['start_at'] else ''
    execute['end_at'] = formatTimestamp(
        execute['end_at']) if execute['end_at'] else ''
    # Only the in-memory copy changes here; the database row is updated at the
    # end of the function.
    execute['status'] = 101

    # Insert the execution into mongo only if it is not already there.
    mgExecute = mgdb.execute_getbyid(eid)
    if not mgExecute:
        execute_spider = deepcopy(execute)
        mgdb.c_insert('execute', execute_spider, autoid=False)

    # Seed the spiderurl collection when it holds fewer documents for this
    # execution than there are start URLs.
    startUrls = json.loads(execute['start_urls'])
    startUrlsLen = len(startUrls)
    urlCount = mongoSpider['spiderurl'].find({
        'execute_id': eid
    }, {
        '_id': 0
    }).count()
    if startUrlsLen > urlCount:
        # NOTE(review): every start URL is inserted, not only the missing
        # ones -- confirm duplicates cannot occur on a partial earlier run.
        urlRows = []
        for url in startUrls:
            urldata = {
                'site_id': execute['site_id'],
                'task_id': execute['task_id'],
                'app_id': execute['app_id'],
                'task_type': execute['task_type'],
                'execute_id': eid,
                'exec_level': execute['exec_level'],
                'url': url,
                'url_type': 'self',
                'method': 'get',
                'status': 0,
                'create_at': now_format(),
                'update_at': now_format(),
            }
            urlRows.append(urldata)
        mgdb.c_insert_batch('spiderurl', urlRows)
    # Load every URL document of this execution (without mongo's _id).
    undos = [
        i
        for i in mongoSpider['spiderurl'].find({'execute_id': eid}, {'_id': 0})
    ]
    undos_spider = []
    undos_mirror = []
    for undo in undos:
        # Tag each document with the queue bookkeeping keys. The 'id' field is
        # not set in urldata above -- presumably assigned by c_insert_batch.
        undo[mqidKey] = undo['id']
        undo[batchKey] = undo['execute_id']
        # Both lists receive the same dict object, not separate copies.
        undos_spider.append(undo)
        undos_mirror.append(undo)

    # Count the messages already present for this execution in each of the
    # mq_spider_* collections.
    pre = 'mq_spider_'
    stages = ['undo', 'ready', 'doing', 'done']
    stats = {
        stage: mongoMq[pre + stage].find({
            batchKey: eid
        }).count()
        for stage in stages
    }
    total = stats['undo'] + stats['ready'] + stats['doing'] + stats['done']
    # Enqueue only when fewer messages exist than there are start URLs.
    if startUrlsLen > total:
        # Enqueue on the spider queue
        Mq.produce(undos_spider, 'spider')

        # Enqueue on the mirror queue
        if execute['task_type'] == 'mirror': Mq.produce(undos_mirror, 'mirror')

    # Persist the status only when the mongo copy was created by this call.
    if not mgExecute: db.updatebyid('task_execute', {'status': 101}, eid)
    return True