示例#1
0
def dooneurl(taskid=0):
    '''
    执行一次爬行
    '''
    rec = dbm.getoneurl(taskid)
    if rec:
        rec['status'] = 1 # 设置为在爬
        rec['fetchtime'] = dbm.timestring()
        dbm.setoneurl(**rec)
        dbm.commit()
        res = gethtml(rec['url'])
        html = res['html']
        stat = res['status']
        baseurl = rec['baseurl']
        if rec['url'] != res['url']:
            baseurl = dbm.getbaseurl(res['url'])
        if stat:
            ctxs = getcontex(html, rec['keyword'])
            titl = gettitle(html)
#            print 'title:%s'%titl
#            print 'html:%s'%html
            rec['title'] = titl
            if rec['type'] == TYPERECORDALLHTML:
                rec['html'] = html
#            print "ctxs:%s"%ctxs
            count = 0
            if ctxs and len(ctxs):
                count = len(ctxs)
                rec['count'] = count
                rec['context'] = ';;;'.join(ctxs)
                if rec['type'] == TYPERECORDMATCHHTML:
                    rec['html'] = html
                
            # 继续搜索
            if rec['deep'] < rec['maxdeep']:
                urls = geturls(html, baseurl)
#                print "urls:%s"%urls
                if urls and len(urls):
                    rec['childcount'] = len(urls)
                    for i in range(len(urls)):
                        url = urls[i]
                        if rec['urlflag'] and url.find(rec['urlflag'])==-1:
                            continue
                        one = dbm.getoneurl(taskid, url)
                        if not one:
                            print 'add url: %s'%url
                            dbm.addoneurl(taskid=taskid, 
                                            pid=rec['id'], 
                                            url=url, 
                                            keyword=rec['keyword'],
                                            type=rec['type'], 
                                            deep=rec['deep']+1,
                                            urlflag=rec['urlflag'],
                                            power=count,
                                            maxdeep=rec['maxdeep'])
            
            rec['status'] = 2 # 设置为已爬
            rec['completetime'] = dbm.timestring()
            dbm.setoneurl(**rec)
            dbm.commit()
        else:
            print 'get url fail:%s'%rec['url']
            print 'fail info:%s'%html
            rec['html'] = html
            rec['status'] = 3 # 设置为出错
            rec['completetime'] = dbm.timestring()
            dbm.setoneurl(**rec)
            dbm.commit()
        return True
    else:
        print 'task%d is success'%taskid
        return False
示例#2
0
            print 'fail info:%s'%html
            rec['html'] = html
            rec['status'] = 3 # 设置为出错
            rec['completetime'] = dbm.timestring()
            dbm.setoneurl(**rec)
            dbm.commit()
        return True
    else:
        print 'task%d is success'%taskid
        return False

def dotask(taskid):
    '''
    Run one crawl step for *taskid*, shielding the caller from errors.

    Returns dooneurl(taskid)'s result: True while pending URLs remain,
    False when the task is finished.  On an unexpected exception the
    traceback is printed and True is returned so the driver loop keeps
    going — same best-effort behaviour as before, but the failure is no
    longer silently swallowed.
    '''
    res = True
    try:
        res = dooneurl(taskid)
    except Exception:
        # The original bare `except: pass` also caught KeyboardInterrupt
        # and SystemExit and hid every error; narrow and log instead.
        import traceback
        traceback.print_exc()
    return res

if __name__ == '__main__':
    taskid = 1
    dbm.clear()
    dbm.addoneurl(taskid=taskid, url='http://news.baidu.com/', urlflag='news.baidu.com', keyword='H7N9', maxdeep=2, type=TYPERECORDMATCHHTML)
    dbm.commit()
    while dotask(taskid):
        pass
    print 'fetch ok'