def getContent(self, text, folder):
    """Download every article linked from a board-listing page and save
    each one as a text file under *folder*.

    :param text: HTML of the board-listing page to scan for article links.
    :param folder: destination directory for the saved article files.
    :returns: 0 if a title cannot be printed to the console, otherwise None.
    """
    links_p = re.compile(r'<td class="td6"><a href=bbstcon\?board=(.*?)>')
    result = links_p.findall(text)
    url_board = 'http://bbs.sysu.edu.cn/bbstcon?board='
    for board_param in result:
        each_page_link = url_board + board_param
        print(each_page_link)
        content = requests.get(each_page_link, headers=self.header)
        content.encoding = 'gbk'  # the site serves GBK-encoded pages
        html = etree.HTML(content.text)
        title = html.xpath('//title/text()')[0]
        try:
            print(title)
        except UnicodeError:
            # Console encoding cannot represent the title; was a bare
            # `except:` which also hid unrelated errors.
            print("Can't decode title, return")
            return 0
        # Strip the site suffix from the title, then remove characters
        # that are illegal in filenames.
        filename = re.sub(u' - 逸仙时空BBS', '', title)
        filename = Toolkit.filename_filter(filename)
        f_fullpath = os.path.join(folder, filename)
        try:
            Toolkit.save2filecn(f_fullpath, title)
            Toolkit.save2filecn(f_fullpath, '\n\n*******************\n\n')
            Toolkit.save2filecn(f_fullpath, each_page_link)
            Toolkit.save2filecn(f_fullpath, '\n\n*******************\n\n')
        except (IOError, OSError):
            print(each_page_link)
            print("Create file error, go to next article")
            # Original `return 0` contradicted the message above and
            # aborted every remaining article; skip just this one.
            continue
        detail = html.xpath('//td[@class="border content2"]')
        for cell in detail:
            # string(.) flattens the cell's text content with tags stripped.
            Toolkit.save2filecn(f_fullpath, cell.xpath('string(.)'))
        time.sleep(5)  # throttle requests to be polite to the server
# Fetch the favourites page and pull out the JS object literal that
# holds the list of saved articles:  var favs = {...};
collection = session.get(fav, headers=headers)
fav_content = collection.text
favs_pattern = re.compile(r'var favs = {(.*?)};', re.S | re.M)
favs_body = favs_pattern.findall(fav_content)[0].strip()
data = json.loads('{' + favs_body + '}')
use_data = data['list']
host = 'https://xueqiu.com'
for entry in use_data:
    url = host + entry['target']
    print(url)
    txt_content = session.get(url, headers=headers).text
    tree = etree.HTML(txt_content)
    title = tree.xpath('//title/text()')[0]
    # Replace characters illegal in Windows filenames (the original
    # non-raw pattern also omitted the backslash).
    filename = re.sub(r'[\\/:*?"<>|]', '-', title)
    print(filename)
    content = tree.xpath('//div[@class="detail"]')
    for section in content:
        # string(.) flattens the article body to plain text.
        Toolkit.save2filecn(filename, section.xpath('string(.)'))
    time.sleep(10)  # throttle requests
# Parse the favourites page already fetched into `collection`: the list
# of saved articles is embedded as a JS object literal  var favs = {...};
fav_content = collection.text
favs_pattern = re.compile(r'var favs = {(.*?)};', re.S | re.M)
favs_body = favs_pattern.findall(fav_content)[0].strip()
data = json.loads('{' + favs_body + '}')
use_data = data['list']
host = 'https://xueqiu.com'
for entry in use_data:
    url = host + entry['target']
    print(url)
    txt_content = session.get(url, headers=headers).text
    tree = etree.HTML(txt_content)
    title = tree.xpath('//title/text()')[0]
    # Replace characters illegal in Windows filenames (the original
    # non-raw pattern also omitted the backslash).
    filename = re.sub(r'[\\/:*?"<>|]', '-', title)
    print(filename)
    content = tree.xpath('//div[@class="detail"]')
    # Record the source link at the top of the saved file.
    Toolkit.save2filecn(filename, "Link: %s\n\n" % url)
    for section in content:
        # string(.) flattens the article body to plain text.
        Toolkit.save2filecn(filename, section.xpath('string(.)'))
    time.sleep(10)  # throttle requests
# Fetch one article page and append a markdown table row
# "index | company name | company code | article" to `save_data`.
# Relies on surrounding context for: session, headers, at_addr, cp_host,
# index, save_data, Toolkit — presumably this is a loop body; TODO confirm.
time.sleep(1)
try:
    at_content = session.get(at_addr, headers=headers).text.encode("UTF-8")
except:
    # NOTE(review): bare except — if the request fails, only the address
    # is printed and `at_content` is left undefined, so the re.findall
    # below raises NameError. Should continue/return/re-raise instead.
    print at_addr
# Article title sits between the h1 and the detail div; '|' would break
# the markdown table, so it is replaced.
at_name = re.findall(
    '<h1 class="article__bd__title">(.*)</h1><div class="article__bd__detail">',
    at_content)
at_name[0] = at_name[0].replace('|', '--')
# Stock mentions are rendered as >$Name(CODE)$< in the page source.
cp_content = re.findall(r'\>\$(.*?)\$\<', at_content)
# Placeholders used when the article mentions no stock.
cp_code = ['cd_none']
cp_name = ['nm_none']
if len(cp_content) > 0:
    # Split the first mention "Name(CODE)" into its code and name parts.
    cp_code = re.findall(r'\((.*)\)', cp_content[0])
    cp_name = re.findall(r'(.*)\(', cp_content[0])
cp_addr = cp_host + cp_code[0]
# NOTE(review): the bare string below is a no-op statement serving as a
# column-layout reminder; it should be a comment.
'index | name | code | at'
save_data = save_data + '\r\n' + str(index).zfill(5) + ' | ' + cp_name[
    0] + ' | [' + cp_code[0] + '](' + cp_addr.encode(
        "UTF-8") + ')' + ' | [' + at_name[0] + '](' + at_addr.encode(
            "UTF-8") + ')'
#print save_data
#save_data = save_data + '\r\n' + str(index).zfill(5) + ' | [' + at_name[0] + '](' + at_addr.encode("UTF-8") + ')'
time.sleep(1)
#print save_data
# NOTE(review): reload(sys) + setdefaultencoding is a Python-2-only hack
# to avoid UnicodeDecodeError when writing; prefer explicit encoding.
reload(sys)
sys.setdefaultencoding('utf-8')
Toolkit.save2filecn('data', save_data)