Пример #1
0
    def getContent(self, text, folder):
        """Download every article linked from ``text`` and save it under ``folder``.

        ``text`` is searched for ``bbstcon?board=...`` links (presumably the
        HTML of a board index page -- confirm against the caller).  Each link
        is fetched with ``self.header`` as request headers and decoded as GBK.
        The page title, with the u' - 逸仙时空BBS' suffix removed and passed
        through ``Toolkit.filename_filter``, becomes the file name inside
        ``folder``.  The title, the article URL and the text of every
        ``<td class="border content2">`` cell are written to that file through
        ``Toolkit.save2filecn``.

        An article that has no ``<title>`` or whose file cannot be written is
        reported and skipped; the remaining links are still processed.  The
        method pauses five seconds after every article.

        Returns:
            0 if a page title cannot be printed (processing stops there),
            otherwise None once all links have been handled.
        """
        # Raw string: ``\?`` must reach the regex engine as an escaped '?'.
        links_p = re.compile(r'<td class="td6"><a href=bbstcon\?board=(.*?)>')
        url_board = 'http://bbs.sysu.edu.cn/bbstcon?board='
        separator = '\n\n*******************\n\n'
        delay = 5  # seconds to wait between two article requests

        for link_suffix in links_p.findall(text):
            each_page_link = url_board + link_suffix
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(each_page_link)
            content = requests.get(each_page_link, headers=self.header)
            content.encoding = 'gbk'
            html = etree.HTML(content.text)

            titles = html.xpath('//title/text()')
            if not titles:
                # No title means no file name to save under; skip this page
                # instead of crashing with IndexError.
                print(each_page_link)
                print("No title found, go to next article")
                time.sleep(delay)
                continue
            title = titles[0]

            try:
                print(title)
            except Exception:
                print("Can't decode title, return")
                return 0

            filename = re.sub(u' - 逸仙时空BBS', '', title)
            filename = Toolkit.filename_filter(filename)
            f_fullpath = os.path.join(folder, filename)
            try:
                Toolkit.save2filecn(f_fullpath, title)
                Toolkit.save2filecn(f_fullpath, separator)
                Toolkit.save2filecn(f_fullpath, each_page_link)
                Toolkit.save2filecn(f_fullpath, separator)
            except Exception:
                print(each_page_link)
                print("Create file error, go to next article")
                # Do what the message says: move on to the next link rather
                # than abandoning all remaining articles.
                time.sleep(delay)
                continue

            # Append the plain text of every content cell of the article.
            for cell in html.xpath('//td[@class="border content2"]'):
                Toolkit.save2filecn(f_fullpath, cell.xpath('string(.)'))

            time.sleep(delay)
Пример #2
0
def bbs_filename_check():
    """Fetch one fixed BBS article and print its title raw and filtered.

    Manual check of ``Toolkit.filename_filter``: the page is requested with
    the module-level ``agent`` as User-Agent, decoded as GBK, and its title
    is printed first as-is and then after passing through the filter.
    """
    response = requests.get(
        'http://bbs.sysu.edu.cn/bbstcon?board=Love&file=M.1104508652.A',
        headers={'User-Agent': agent})
    response.encoding = 'gbk'
    page_title = etree.HTML(response.text).xpath('//title/text()')[0]
    # Single-argument print(...) behaves the same as the print statement.
    print(page_title)
    print(Toolkit.filename_filter(page_title))