Example #1
# Python 2 code. `domain`, `coll` (a MongoDB collection), `compiled_time`,
# and the helpers doReply() and exPage() are assumed to be defined at module
# level in the original source.
import re
import urllib2
from bs4 import BeautifulSoup

def onespider(url):
    print url
    # Set the HTTP request headers (user agent, session cookie)
    opener = urllib2.build_opener()
    Cookie = "lDlk_ecc9_saltkey=ZJQz5lj0; lDlk_ecc9_lastvisit=1470395868; lDlk_ecc9_sid=A381e8; lDlk_ecc9_lastact=1470399514%09forum.php%09; lDlk_ecc9_sendmail=1; Hm_lvt_fa32dadde3745af309b587b38d20ea1d=1470399470; Hm_lpvt_fa32dadde3745af309b587b38d20ea1d=1470399516; lDlk_ecc9_ulastactivity=3b5dAWO7kvzHuDcG%2F%2Fd0zcitjHfpCrfanOQFxsz%2FHQxXAHKPO5oc; lDlk_ecc9_auth=ba27pUzFIwwrrTgeRTtK5LrL3Xen7AVwXwswIMThv5frRRAn8mSYJil8%2BnCGogg1OUJxQEKdAVkT6FYJBGrz0Qm9KxA; lDlk_ecc9_lastcheckfeed=363083%7C1470399485; lDlk_ecc9_lip=202.102.144.8%2C1470397927; lDlk_ecc9_connect_is_bind=0; lDlk_ecc9_nofavfid=1; lDlk_ecc9_onlineusernum=5782; lDlk_ecc9_myrepeat_rr=R0; lDlk_ecc9_nofocus_forum=1; tjpctrl=1470401288179; lDlk_ecc9_wx_from=8601ZdCNX8jSB%2BfVpEeMlOKbm0tFllWv27jSI4qT85hZMqN8D3r1"
    opener.addheaders = [
        ('User-agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'),
        ('Cookie',Cookie),
        ('Connection','keep-alive')
    ]
    urllib2.install_opener(opener)
    response2 = urllib2.urlopen(url)
    soup2 = BeautifulSoup(response2, 'lxml', from_encoding="utf-8")
    # Reply-to-view threads: if the body is locked behind a reply, post one first
    hidden = soup2.find(attrs={'class':'locked'})
    if hidden is not None:
        if hidden.find(name='a') is not None:
            doReply(url, domain, Cookie)
    # Locate the pagination bar; a missing one means a malformed thread page
    try:
        p_page_content = soup2.find(attrs={'class':'pgt'}).find(attrs = {'class':'pg'})
    except AttributeError:
        print "bad url!"
        f = open('badurl3.txt','a')
        f.write(url + '    ----None posts')
        f.write('\n')
        f.close()
        return
    if p_page_content is not None:
        try:
            # The first pager link minus its trailing page number gives the URL stem
            p_page_url_init = p_page_content.find(name = 'a').get('href')[:-1]
        except AttributeError:
            print "bad url!"
            f = open('badurl3.txt','a')
            f.write(url + '    ----p_page_init bad')
            f.write('\n')
            f.close()
            return
        # The '... / N' span in the pager holds the total page count
        p_page_end = p_page_content.find(name='span').string
        compiled_page = re.compile(r'[0-9]+')
        p_page_end = int(compiled_page.search(str(p_page_end)).group())
        if p_page_end > 400:
            # Hand very long threads (over 400 pages) off to exPage()
            exPage(soup2, opener, coll, domain, url, p_page_end, p_page_url_init)
            return
    else:
        p_page_end = 1
    print 'this post has ', p_page_end, 'pages.'
    # Board name and thread title; failure here marks the URL as bad
    try:
        board = soup2.find(attrs = {'id':'pt'}).findAll(name = 'a')[-2].text
        title = soup2.find(attrs = {'id':'thread_subject'}).text
    except AttributeError:
        print "bad url!"
        f = open('badurl3.txt','a')
        f.write(url + '    ----board or title bad')
        f.write('\n')
        f.close()
        return
    newItem = {}  # dict holding every floor of this thread
    flag = True  # True until the first floor (the original poster) is recorded
    floor = 1
    for l in range(1, p_page_end+1):
        if p_page_content is not None:
            p_page_url = p_page_url_init + str(l)
        else:
            p_page_url = url
        try:
            response3 = urllib2.urlopen(p_page_url)
            soup3 = BeautifulSoup(response3, 'lxml', from_encoding="utf-8")
            print 'located at ' + p_page_url
        except Exception:
            f = open('badurl3.txt','a')
            f.write(p_page_url + '    ----sub href bad')
            f.write('\n')
            f.close()
            return
        posts = soup3.findAll(attrs={'class': 'pi'})
        posts_contents = soup3.findAll(attrs={'class': 't_f'})
        # The loop below flattens each post body into a single cleaned string
        i = 1
        for post_contents in posts_contents:
            try:
                # Each post has two 'pi' blocks: one for the author, one with the timestamp
                author = posts[2*(i-1)].contents[1].contents[0].string
                _time = compiled_time.search(str(posts[2*(i-1)+1])).group()
            except (IndexError, AttributeError):
                print "ERROR: failed to extract author/time info!"
                f = open('badurl3.txt','a')
                f.write(p_page_url + '    ----author or time or floor bad')
                f.write('\n')
                f.close()
                continue
            lines = post_contents.encode('utf-8')
            lines = re.sub('[?]', '', lines)
            # Strip the forum's anti-scraping noise: hidden spans and 'jammer' fonts
            lines = re.sub('<span style=["]display:none["]>[^>]+>', '', lines)
            lines = re.sub('<font class=["]jammer["]>[^>]+>', '', lines)
            # Remove any remaining HTML tags and trim whitespace
            lines = re.sub('<(.*?)>', '', lines)
            lines = lines.strip()
            print lines
            if i == 1 and flag:
                first_floor = {
                    'author':author,
                    'title':title,
                    'floor':str(floor),
                    'content':lines,
                    'href':url,
                    'board':board,
                    'time':_time
                }
                print "爬到",_time,title,"了"
                newItem['1'] = first_floor
                flag = False
            else:
                other_floor = {
                    'content':lines,
                    'time':_time,
                    'floor':str(floor),
                    'author':author
                }
                newItem[str(floor)] = other_floor
            i += 1
            floor += 1
    try:
        if not newItem:
            print "bad url!"
            f = open('badurl3.txt','a')
            f.write(url + '    ----None dict')
            f.write('\n')
            f.close()
        else:
            coll.insert(newItem)
    except Exception:
        print "bad url!"
        f = open('badurl3.txt','a')
        f.write(url + '    ----insert db wrong')
        f.write('\n')
        f.close()
        print "ERROR: database insert failed!"
Example #2
# Fragment from inside the per-thread crawl loop of the original source;
# `href`, `soup2`, `opener`, `coll`, `domain`, and exPage() come from the
# enclosing scope, and `continue` skips to the next thread link.
if p_page_content is not None:
    try:
        p_page_url_init = p_page_content.find(name='a').get('href')[:-1]
    except AttributeError:
        print "bad url!"
        f = open('badurl.txt', 'a')
        f.write(href + '    ----p_page_init bad')
        f.write('\n')
        f.close()
        continue
    p_page_end = p_page_content.find(name='span').string
    compiled_page = re.compile(r'[0-9]+')
    p_page_end = int(compiled_page.search(str(p_page_end)).group())
    print 'this post has ', p_page_end, 'pages.'
    if p_page_end > 400:
        # Hand very long threads (over 400 pages) off to exPage()
        exPage(soup2, opener, coll, domain, href, p_page_end,
               p_page_url_init)
        continue
else:
    p_page_end = 1
try:
    board = soup2.find(attrs={'class': 'bm cl'}).findAll(name='a')[-2].text
    title = soup2.find(attrs={'id': 'thread_subject'}).text
except AttributeError:
    print "bad url!"
    f = open('badurl.txt', 'a')
    f.write(href + '    ----board or title bad')
    f.write('\n')
    f.close()
    continue
newItem = {}  # dict holding every floor of this thread
flag = True  # True until the first floor (the original poster) is recorded
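For context, a hedged sketch of the kind of enclosing loop this fragment sits in; the board-index URL, the 'thread-' link filter, and the `soup_index` variable are assumptions, not taken from the original source:

import urllib2
from bs4 import BeautifulSoup

# Hypothetical enclosing loop (assumed structure): fetch a board index page,
# then visit each thread link found on it.
domain = 'http://example-forum.com/'                # placeholder base URL
index = urllib2.urlopen(domain + 'forum-2-1.html')  # placeholder board index page
soup_index = BeautifulSoup(index, 'lxml', from_encoding="utf-8")
for link in soup_index.findAll(name='a'):
    href = link.get('href')
    if href is None or 'thread-' not in href:
        continue  # not a thread link
    response2 = urllib2.urlopen(domain + href)
    soup2 = BeautifulSoup(response2, 'lxml', from_encoding="utf-8")
    pgt = soup2.find(attrs={'class': 'pgt'})
    p_page_content = pgt.find(attrs={'class': 'pg'}) if pgt is not None else None
    # ... the fragment above executes here, once per thread ...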