class XQ_Spider:
    """Scrapes a xueqiu.com user's timeline and saves original posts.

    Relies on project-local collaborators: HTMLClient (page download),
    Simple_Parser (delimiter-based extraction) and HTML2Doc (doc writer).
    """

    def __init__(self):
        self.myclient = HTMLClient()

    def Get_Json(self, page):
        """Extract the embedded status JSON between the known SNB markers."""
        myparser = Simple_Parser()
        return myparser.feed(str(page), 'SNB.data.req_isBrick = 0;', "SNB.data.statusType")

    def Get_Div(self, page):
        """Extract the <div class="status-content"> body of an article page."""
        myparser = Simple_Parser()
        return myparser.feed(str(page), '<div class="status-content">', "</div>")

    def Get_Url(self, userid):
        """Collect relative URLs of the user's original (non-retweet) posts.

        Pages through the timeline until the "maxPage" reported by the site
        is reached.  Returns a list of URL path strings.
        """
        pg = 1
        maxpg = 1000  # upper bound until the real maxPage is read from the JSON
        urlList = []
        failures = 0
        while True:
            mypage = self.myclient.GetPage(
                "http://xueqiu.com/" + userid + '?page=' + str(pg))
            if mypage is None:
                # Bounded retry.  The original looped on `continue` forever,
                # so one permanently failing page hung the whole crawl.
                failures += 1
                if failures <= 3:
                    continue
                failures = 0
                pg += 1
                if pg > maxpg:
                    break
                continue
            failures = 0
            # Use self directly; the original built a throwaway XQ_Spider()
            # (and a second HTMLClient) just to call Get_Json.
            for item in self.Get_Json(mypage):
                # Trim any leading/trailing junk around the JSON object.
                s = item.find('{')
                e = item.rfind('}')
                xml_content = json.loads(item[s:e + 1])
                maxpg = xml_content["maxPage"]
                for status in xml_content["statuses"]:
                    # retweet_status_id == 0 marks an original post;
                    # retweets are skipped entirely.
                    if status['retweet_status_id'] == 0:
                        urlList.append(str(status['target']))
            pg += 1
            if pg > maxpg:
                break
        return urlList

    def Get_HTML(self, userid):
        """Download every original post and append its content div to one HTML file."""
        urlList = self.Get_Url(userid)
        with open("xq_article_" + userid + ".html", 'wb') as xqfile:
            xqfile.write(b'<head>')
            xqfile.write(b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
            xqfile.write(b'</head>')
            for url in urlList:
                if url is None:
                    continue
                mypage = self.myclient.GetPage("http://xueqiu.com" + url)
                if mypage is None:
                    continue  # best-effort: skip pages that failed to download
                for ctx in self.Get_Div(mypage):
                    xqfile.write(bytes(ctx, 'utf-8'))
        # NOTE: the 'with' block closes the file; the original's explicit
        # close() inside the block was redundant.

    def Get_Doc(self, userid):
        """Download every original post and write its content into a .doc via HTML2Doc."""
        urlList = self.Get_Url(userid)
        h2d = HTML2Doc()
        h2d.open('xq_' + userid + '.doc')
        for url in urlList:
            if url is None:
                continue
            print("download from" + url)
            mypage = self.myclient.GetPage("http://xueqiu.com" + url)
            if mypage is None:
                # The original passed None straight into Get_Div and crashed
                # on any failed download; skip it like Get_HTML does.
                continue
            for ctx in self.Get_Div(mypage):
                h2d.write(ctx)
        print("Done!")
class JX3_Spider:
    """Scrapes the news list from the JX3 news portal."""

    def Get_News(self, page):
        """Extract the news-list div from the page markup."""
        myparser = Simple_Parser()
        return myparser.feed(page, u'<div class="news_list news_list02">', u'</div>')

    def Get_CSS(self, page):
        """Extract <link ...> tags so the saved page keeps its stylesheets."""
        myparser = Simple_Parser()
        return myparser.feed(page, u'<link ', u'/>')


if __name__ == '__main__':
    myclient = HTMLClient()
    mypage = myclient.GetPage("http://xw.jx3.xoyo.com/news/")
    if mypage is None:
        # The original fed None straight into the parser and crashed on a
        # failed download; fail fast with a clear message instead.
        raise SystemExit("failed to download http://xw.jx3.xoyo.com/news/")
    jx3_spider = JX3_Spider()
    jx3_news = jx3_spider.Get_News(mypage)
    jx3_css = jx3_spider.Get_CSS(mypage)
    infile = input('>')  # pause so the user can confirm before writing the file
    with open("jx3_news.html", 'wb') as jx3file:
        jx3file.write(b'<head>')
        jx3file.write(b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
        for item in jx3_css:
            jx3file.write(bytes(item, 'utf-8'))
        # Close <head> (the original never did), matching XQ_Spider.Get_HTML.
        jx3file.write(b'</head>')
        for item in jx3_news:
            jx3file.write(bytes(item, 'utf-8'))
    # NOTE: the 'with' block closes the file; the original's explicit
    # close() inside the block was redundant.