def get_index_2(url):
    # Unlike get_index_1 above, this collects the links of the individual threads inside one board.
    next_page_has_visited = 0
    while True:
        response1 = get_response_and_text(url=url, headers=self.headers)
        response_in_function_text = response1['response_in_function_text']
        try:
            datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            for content in datasoup.select('.content > table > tr')[1:-1]:
                title = content.select('td.title > a[title]')[0].text.strip()  # title
                # use a separate name so the pagination url is not overwritten
                thread_url = 'http://bbs.csdn.net' + content.select('td.title > a[title]')[0].get('href').strip()
                publish_user = content.select('td.tc a[title]')[0].get('title').strip()  # publish_user
                # content.select('td.tc a[title]')[0].get('href')  # publish_user_href
                reply_count = content.select('td:nth-of-type(4)')[0].text  # reply/read count column
                this_nodes = {
                    'url': thread_url,
                    'publish_user': publish_user,
                    'title': title,
                    'reply_count': reply_count,
                    'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'id': thread_url.split('/')[-1],
                    'like_count': 0,  # the page simply does not provide this
                    'content': None,
                    'reproduce_count': 0,
                    'publish_user_photo': '',
                    'reply_nodes': []
                }
                self.content_data_list.append(this_nodes)
            next_page_url = datasoup.select('a.next')
            if next_page_url and next_page_has_visited < 10:
                next_page_has_visited += 1
                url_next = 'http://bbs.csdn.net' + next_page_url[0].get('href')
                url = url_next
            else:
                break
        except Exception as e:
            # print e
            pass  # NOTE: a permanently failing page would loop forever here
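# --- Assumed helpers (not defined in this file): a minimal sketch of what
# get_response_and_text, Re_find_img_url and handleContent might look like.
# Every function in this module relies on them; the real implementations may differ
# (retries, proxies, DB updates via needupdate/update_info, charset detection, etc.).
import re
import requests

Re_find_img_url = re.compile(r'src="(.*?)"')  # assumption: extracts src="..." values from raw HTML


def handleContent(text):
    # assumption: normalizes the HTML before parsing; kept as a no-op placeholder here
    return text


def get_response_and_text(url, headers=None, needupdate=False, update_info=None, charset=None):
    # assumption: wraps a plain HTTP GET and returns both the response object and its decoded text;
    # needupdate / update_info are accepted but ignored in this sketch
    response = requests.get(url, headers=headers, timeout=30)
    if charset:
        response.encoding = charset
    return {
        'response_in_function': response,
        'response_in_function_text': response.text,
    }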
def get_content_inside(data):
    # The content comes from the mobile page, the publish time from the desktop page.
    url_for_debug = data['url']
    response1 = get_response_and_text(url=url_for_debug)
    response2 = get_response_and_text(url=url_for_debug, headers=self.headers)
    try:
        datasoup2 = BeautifulSoup(response2['response_in_function_text'], 'lxml')
        publish_time = datasoup2.select('.replayInfo .float_l.mT10')[0].text.split(u'\xa0')[-1]
        data['publish_time'] = publish_time
    except Exception as e:
        # print e
        # print 'mark1'
        pass
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    try:
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    except Exception as e:
        return
    # content = datasoup.select('div.artCont .content-text')
    content = datasoup.select('div.artCont')
    if content:  # when nothing matches, content is an empty list
        content_text = content[0].text.strip()
        Re_find_img_url = re.compile(r'src="(.*?)"')
        img_list = Re_find_img_url.findall(str(content[0]))
        if img_list:
            for i in range(len(img_list)):
                if 'http' not in img_list[i]:
                    img_list[i] = 'http://bbs1.people.com.cn' + img_list[i]
                    # print img_list[i]
        else:
            img_list = []
        data['content'] = content_text
        data['img_urls'] = img_list
        self.comments_data_list.append(data)
def get_index_1():
    # Collect the links of all sub-forums / boards of the site.
    while True:
        try:
            # response_in_function_text is sometimes garbled and sometimes not
            url_rukou = 'http://bbs.csdn.net/home'
            response1 = get_response_and_text(url=url_rukou, headers=self.headers)
            response_in_function_text = response1['response_in_function_text']
            datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            for a in datasoup.select('.dropdown-menu a[href]'):
                url_bankuai = 'http://bbs.csdn.net' + a.get('href')
                # print url_bankuai
                self.index_data_list.append(url_bankuai)
            if self.index_data_list:
                break
        except Exception as e:
            # print e
            pass
def get_comment_inside(data):
    comment_list = []
    error_time = 5
    page_num = 1
    while True:
        try:
            comment_url = ('http://bbs1.people.com.cn/mobile.do?action=moreComment&threadId='
                           + str(data['id']) + '&pageNo=' + str(page_num))
            response1 = get_response_and_text(url=comment_url)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            datajson = json.loads(response_in_function_text)
            if not datajson['elements']:
                break
            for i in datajson['elements']:
                id = i['id']
                title = i['title']
                publish_user = i['userNick']
                one_comment = {
                    'id': id,
                    'content': title,
                    'publish_user': publish_user,
                    'parent_id': data['id'],
                    'ancestor_id': data['id']
                }
                comment_list.append(one_comment)
            page_num += 1
        except Exception as e:
            error_time -= 1
            if error_time < 0:
                break
            time.sleep(5)
    data['reply_nodes'] = comment_list
    self.result_data_list.append(data)
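# Sketch only: how the mobile comment endpoint above is paged, outside the spider class.
# threadId / pageNo are the parameters used in get_comment_inside; the function name and
# the direct use of requests are assumptions for illustration.
import json
import requests


def fetch_people_comments(thread_id):
    comments, page_no = [], 1
    while True:
        url = ('http://bbs1.people.com.cn/mobile.do?action=moreComment'
               '&threadId=' + str(thread_id) + '&pageNo=' + str(page_no))
        payload = json.loads(requests.get(url, timeout=30).text)
        if not payload['elements']:  # an empty page marks the end, as in get_comment_inside
            break
        comments.extend(payload['elements'])
        page_no += 1
    return comments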
def get_content_inside(data):
    url_debug = data['url'] + '?page=1'
    while True:
        response1 = get_response_and_text(url=url_debug, headers=self.headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        response_in_function_text = handleContent(response_in_function_text)
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        page_begain = 0
        if not data['content']:  # flags whether the first-floor content / publish_user info still needs fetching
            page_begain = 1
            try:
                content_div = datasoup.select('div.detailed table.post .post_body')[0]
                content = content_div.text.strip()
            except Exception as e:
                # print e
                # print data['url']
                return
            img_urls_content = Re_find_img_url.findall(str(content_div))
            publish_user_photo = datasoup.select('div.detailed table.post .user_info .user_head a img')[0].get('src')
            publish_time = datasoup.select('div.detailed table.post .time')[0].text.strip().split('\n')[1].strip()
            data['content'] = content
            data['publish_user_photo'] = publish_user_photo
            data['publish_time'] = publish_time
            data['img_urls'] = img_urls_content
        for one_reply in datasoup.select('div.detailed table.post')[page_begain:]:
            try:
                j = one_reply.select('div.post_body')
                img_urls = Re_find_img_url.findall(str(j))
                img_urls2 = []
                for img_url_maybe_have_js in img_urls:
                    if '.js' not in img_url_maybe_have_js:
                        img_urls2.append(img_url_maybe_have_js)
                content = one_reply.select('.post_body')[0].text.strip()
                publish_user_photo = one_reply.select('.user_info .user_head a img')[0].get('src')  # publish_user_photo
                publish_time = one_reply.select('.time')[0].text.strip().split('\n')[1].strip()
                louceng_url = one_reply.select('.fr a[href]')[0].get('href')  # anchor link of this floor
                like_count = one_reply.select('div.control .fr a.red')[0].text.split('[')[1].split(']')[0]
                dislike_count = one_reply.select('div.control .fr a.bury')[0].text.split('[')[1].split(']')[0]
                publish_user = one_reply.select('.user_info .nickname span')[0].text
                ancestor_id = data['id']
                parent_id = data['id']
                publish_user_id = louceng_url.split('post-')[1]
                url = data['url'] + louceng_url
                thisnode = {
                    'publish_user_photo': publish_user_photo,
                    'publish_time': publish_time,
                    'like_count': like_count,
                    'dislike_count': dislike_count,
                    'publish_user': publish_user,
                    'ancestor_id': ancestor_id,
                    'parent_id': parent_id,
                    'publish_user_id': publish_user_id,
                    'url': url,
                    'img_urls': img_urls2,
                    'content': content,
                    'id': louceng_url.split('-')[1]
                }
                data['reply_nodes'].append(thisnode)
            except Exception as e:
                # print e
                pass
        next_page_div = datasoup.select('.page_nav .next')
        if next_page_div:
            next_url = 'http://bbs.csdn.net' + next_page_div[0].get('href')
            url_debug = next_url
        else:
            self.result_data_list.append(data)
            break
def get_index(url):
    charge_to_stop = 1
    while True:
        error_num = 5  # requesting too frequently triggers EOF errors again
        while True:
            response1 = get_response_and_text(url=url)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            # print response_in_function_text
            try:
                datajson = json.loads(response_in_function_text)
                # reset error_num after every successful request
                error_num = 5
                if not datajson['elements']:
                    charge_to_stop = 0
                    break  # no more data, all pages fetched
                for one_data in datajson['elements']:
                    title = one_data['title']
                    reply_count = one_data['replyCount']
                    publish_user = one_data['usernick']
                    read_count = one_data['readCount']
                    like_count = one_data['like']
                    url_index = one_data['url']
                    id = one_data['id']
                    # createTime only carries month/day, so the year is hard-coded
                    publish_time = u'2017-' + one_data['createTime'].replace(u'月', u'-').replace(u'日', u'') + u':00'
                    this_index_info = {
                        'title': title,
                        'reply_count': reply_count,
                        'publish_user': publish_user,
                        'read_count': read_count,
                        'like_count': like_count,
                        'url': u'http://bbs1.people.com.cn' + url_index,
                        'id': id,
                        'publish_time': publish_time,
                        'reply_nodes': [],
                        'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }
                    self.content_data_list.append(this_index_info)
                urlsplit = url.split('pageNo=')
                url = urlsplit[0] + 'pageNo=' + str(int(urlsplit[1]) + 1)
            except Exception as e:
                error_num -= 1
                if error_num < 0:
                    break
                time.sleep(3)
        if charge_to_stop == 0:
            break
        else:
            url_split = response_in_function.url.split('pageNo=')
            urlnext = url_split[0] + 'pageNo=' + str(int(url_split[1]) + 1)
            # get_index(url=urlnext)
            url = urlnext
def get_content_inside(data):
    is_first = 0
    url_debug = data['url']
    # url_debug = 'http://www.chengshiluntan.com/5942261-1.html'
    # url_debug = 'http://www.chengshiluntan.com/7561-1.html'
    # url_debug = 'http://www.chengshiluntan.com/731-1.html'
    # url_debug = 'http://www.chengshiluntan.com/5070-1.html'
    # url_debug = 'http://www.chengshiluntan.com/5942282-1.html'
    while True:
        response1 = get_response_and_text(url=url_debug, headers=self.headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        if is_first == 0:
            is_first = 1
            try:
                data['title'] = datasoup.select('#thread_subject')[0].text
            except Exception as e:
                # print e
                # print data['url']
                return  # some threads simply do not have a subject element
            data['reply_count'] = datasoup.select('#postlist > div.bm_h.comiis_snvbt > span.y.comiis_hfs > strong')[0].text
            data['read_count'] = datasoup.select('#postlist > div.bm_h.comiis_snvbt > span.y.comiis_cks > strong')[0].text
            try:
                # data['reply_count'] = datasoup.select('#postlist > table:nth-of-type(1) > tbody > tr > td.pls.ptm.pbm > div > span:nth-of-type(2)')
                # data['read_count'] = datasoup.select('#postlist > table:nth-of-type(1) > tbody > tr > td.pls.ptm.pbm > div > span:nth-of-type(5)')
                data['publish_user_photo'] = datasoup.select('#postlist div[id] .pls .avatar.comiis_zxtx a img')[0].get('src')
                data['publish_user_id'] = datasoup.select('#postlist div[id] .pls .avatar.comiis_zxtx a')[0].get('href').split('/')[-1]
                data['id'] = datasoup.select('#postlist div[id]')[0].get('id')
            except Exception as e:
                # print e
                data['publish_user_photo'] = ''
                data['id'] = ''
                data['publish_user_id'] = ''
                # print 'the user has been deleted, so there is no avatar'
            content_div = datasoup.select('#postlist > div[id] div.t_fsz > div.t_f')[0]
            content_div_this = datasoup.select('#postlist > div[id]')[0]
            content_div_str = str(content_div_this)
            img_urls = Re_find_img_url.findall(content_div_str)
            # image filtering added 9-20, currently only used for de-duplication
            img_urls_set = set()
            for img_url_raw in img_urls:
                if '.js' not in img_url_raw:
                    img_urls_set.add(img_url_raw)
            img_urls2 = list(img_urls_set)
            data['img_urls'] = img_urls2
            data['publish_user'] = content_div_this.select('td.plc > div.pi > div.pti > div.authi > a.xi2.kmyzz')[0].text  # publish_user
            publish_time_content = content_div_this.select('.pti .authi em')[0].text.replace(u'发表于', '').strip() + ':00'  # publish_time
            # re-parse and re-format because the page may show e.g. 11-1 where 11-01 is expected
            data['publish_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(publish_time_content, '%Y-%m-%d %H:%M:%S'))
            data['content'] = content_div_this.select('div.t_fsz > div.t_f')[0].text.strip()  # content
            # data['publish_user_photo'] = content_div_this.select('')
        follow_div = datasoup.select('#postlist > div[id]')[is_first:-1]
        for one_reply in follow_div:
            try:
                comment_reply_nodes = []
                publish_user = one_reply.select('div.pti div.authi a.xi2.kmyzz')[0].text  # publish_user
                publish_time = one_reply.select('tr:nth-of-type(1) > td.plc > div.pi > div.pti > div.authi > em')[0].text.replace(u'发表于', '').strip() + ':00'  # publish_time
                content = one_reply.select('div.t_fsz > div.t_f')[0].text.strip()  # content
                id = one_reply.get('id')
                cm_div = one_reply.select('div.cm')  # guard against replies without a nested comment box
                if cm_div and cm_div[0].text.strip():
                    id = cm_div[0].get('id')  # comment_id
                    try:
                        publish_user_photo = one_reply.select('div.cm div.pstl div.psta a > img')[0].get('src')  # publish_user_photo
                    except:
                        publish_user_photo = ''
                    content = one_reply.select('div.pstl div.psti')[0].text.split(u'详情')[0].strip()  # content
                    publish_user = one_reply.select('div.pstl div.psta a.xi2')[0].text  # publish_user
                    publish_time = one_reply.select('div.pstl div.psti span.xg1')[0].text.replace(u'发表于', '').strip()  # publish_time
                    comment_reply_node = {
                        'id': id,
                        'publish_user_photo': publish_user_photo,
                        'content': content,
                        'publish_user': publish_user,
                        'publish_time': publish_time,
                    }
                    comment_reply_nodes.append(comment_reply_node)
                img_urls_reply = Re_find_img_url.findall(str(one_reply.select('.t_fsz')))
                img_urls_reply2 = []
                for i in img_urls_reply:
                    if '.js' in i:
                        continue
                    elif 'http' not in i:
                        i = 'http://www.chengshiluntan.com/' + i
                        img_urls_reply2.append(i)
                    else:
                        img_urls_reply2.append(i)
                try:
                    publish_user_id = datasoup.select('.pls .avatar.comiis_zxtx a')[0].get('href').split('/')[-1]
                    publish_user_photo = one_reply.select('td.pls > div.pls div div.avatar a img')[0].get('src')  # publish_user_photo
                    # publish_user_id = one_reply.select('td.pls > div.pls div.m.z div[id]')[0].get('id')  # publish_user_id
                except Exception as e:
                    # print e
                    id = ''
                    publish_user_photo = ''
                    publish_user_id = ''
                this_comment_node = {
                    'publish_user': publish_user,
                    'publish_time': time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(publish_time, '%Y-%m-%d %H:%M:%S')),
                    'content': content,
                    'id': id,
                    'publish_user_photo': publish_user_photo,
                    'publish_user_id': publish_user_id,
                    'reply_nodes': comment_reply_nodes,
                    'url': url_debug + '#' + id,
                    'img_urls': img_urls_reply2  # use the filtered list (js urls removed, relative paths expanded)
                }
                data['reply_nodes'].append(this_comment_node)
            except Exception as e:
                # print e
                pass
        url_next_div = datasoup.select('a.nxt')
        if url_next_div:
            url_next = url_next_div[0].get('href')
            # if len(url_next) < 7:
            url_debug = 'http://www.chengshiluntan.com/' + url_next
            # else:
            #     print len(url_next)
            #     break
        else:
            self.result_data_list.append(data)
            break
def get_comment_inside(data):
    # Also a two-phase design: the first request resolves the topic id, later requests page through comments.
    topicid = None
    cmspage_taotalnum = 1
    comments_data = []
    cmspagenum = 1  # extra counter added later
    request_num = 1
    error_time = 5
    # comments_data = []
    while True:
        # reply_count = 0
        if not topicid:
            comment_url_without_id = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrHnxhFx&page_size=30&hot_size=5&topic_source_id='
            try:
                comment_url = comment_url_without_id + data['sid']
            except Exception as e:
                print e
                break  # picture-style news has no comments
        else:
            comment_url = ('http://changyan.sohu.com/api/2/topic/comments?client_id=cyrHnxhFx&page_size=30&topic_id='
                           + str(topicid) + '&page_no=' + str(request_num))
        response1 = get_response_and_text(url=comment_url)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        try:
            data_json = json.loads(response_in_function_text)
        except Exception as e:
            print e
            # return
            break
        try:
            data_json['comments']
        except Exception as e:
            # count repeated responses without a 'comments' field and give up eventually
            print e
            error_time -= 1
            if error_time < 1:
                break
            continue
        if data_json['comments']:
            data_json_comments = data_json['comments']
            cmspage_taotalnum = data_json['cmt_sum']
            topicid = data_json['topic_id']
            for someone_comment in data_json_comments:
                content = someone_comment['content']  # content
                id = someone_comment['comment_id']  # id
                publish_user_photo = someone_comment['passport']['img_url']  # publish_user_photo
                try:
                    publish_user = someone_comment['passport']['nickname']  # publish_user
                except Exception as e:
                    print e
                    publish_user = ''
                publish_user_id = someone_comment['passport']['user_id']  # publish_user_id
                create_time = someone_comment['create_time']  # publish_time, epoch milliseconds
                create_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(create_time / 1000)))
                spider_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                like_count = someone_comment['support_count']
                parent_id = data['id']  # TODO: what exactly should these two fields hold?
                ancestor_id = data['id']
                this_comments = someone_comment['comments']
                if this_comments:
                    parent_id = this_comments[0]['comment_id']
                    # 8-16: nested comments of this kind should be handled with a stack
                cmspagenum += 1
                thiscomments = {
                    'content': content,
                    'id': id,
                    'publish_user_photo': publish_user_photo,
                    'publish_user': publish_user,
                    'publish_user_id': publish_user_id,
                    'publish_time': create_time,
                    'spider_time': spider_time,
                    'like_count': like_count,
                    'parent_id': parent_id,
                    'ancestor_id': ancestor_id,
                }
                comments_data.append(thiscomments)
            if cmspagenum >= cmspage_taotalnum - 1:
                break
            request_num += 1
        if cmspagenum > cmspage_taotalnum / 30:
            break
    data['reply_nodes'] = comments_data
    if not comments_data:
        data['reply_count'] = 0
    else:
        data['reply_count'] = cmspage_taotalnum
    while len(self.result_list) > 600:
        time.sleep(1)
        print 'waiting for the length of result_list to drop below 600'
    # Final clean-up: drop fields that are no longer needed.
    try:
        del data['sid']  # picture-style news has no sid
    except:
        pass
    self.result_list.append(data)
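# Sketch only: the two-phase Changyan flow used by get_comment_inside above.
# Phase 1 resolves topic_source_id (sid) to a topic_id, phase 2 pages through the comments.
# The function name, the direct use of requests and the termination condition are assumptions;
# client_id, page_size and the response keys (topic_id, cmt_sum, comments) are taken from the code above.
import json
import requests


def fetch_changyan_comments(sid):
    first = json.loads(requests.get(
        'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrHnxhFx'
        '&page_size=30&hot_size=5&topic_source_id=' + sid, timeout=30).text)
    topic_id, total = first['topic_id'], first['cmt_sum']
    comments = list(first.get('comments', []))
    page_no = 2
    while len(comments) < total:
        page = json.loads(requests.get(
            'http://changyan.sohu.com/api/2/topic/comments?client_id=cyrHnxhFx'
            '&page_size=30&topic_id=' + str(topic_id) + '&page_no=' + str(page_no), timeout=30).text)
        if not page.get('comments'):
            break
        comments.extend(page['comments'])
        page_no += 1
    return comments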
def get_content_inside(data):
    # Without a de-duplication mechanism this really cannot stop; only the first-pass functionality is implemented here.
    url = data['url']
    page_num = url.split('/')[-1]
    response1 = get_response_and_text(url=url, needupdate=True, update_info={'page_num': page_num}, charset='utf-8')
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    Re_find_sid = re.compile(r'sid=".*"')
    try:
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    except Exception as e:
        print e
        return
    if ('class="swiper-container"' not in response_in_function_text) and ('class="content"' in response_in_function_text):
        # text-style news article
        sid = Re_find_sid.findall(response_in_function_text)[0].split('"')[1]
        data['sid'] = sid
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        for i in datasoup.select('body > div.content > div.neirong > h2'):
            title = i.text
        for j in datasoup.select('body > div.content > div.neirong > p > span:nth-of-type(4)'):
            publish_time = j.text
        for k in datasoup.select('body > div.content > div.neirong > p > span:nth-of-type(3)'):
            publish_user = k.text.replace(' ', '').replace('\t', '').replace('\n', '').replace('\r', '').replace(u'来源:', '')
            break
        for publish_user_for in datasoup.select('body > div.content > p.jieshao > span:nth-of-type(3)'):
            publish_user = publish_user_for.text
            break
        # for publish_user_for in datasoup.select('body > div.content > p.jieshao > span:nth-child(3) > a')
        content = ''
        for l in datasoup.select('body > div.content > div.neirong > article > p'):
            content += l.text
        img_urls = []
        neirong_content = datasoup.select('body > div.content > div.neirong')
        neirong_content = str(neirong_content)
        Re_find_img_url = re.compile(r'src=".*?"')
        img_find_by_re = Re_find_img_url.findall(neirong_content)
        for i in img_find_by_re:
            img_urls.append(i.split('"')[1])
        try:
            publish_time += ':00'
        except Exception as e:
            print e
        data['title'] = title
        data['content'] = content
        data['publish_time'] = publish_time
        data['publish_user'] = publish_user
        data['reply_nodes'] = []
        data['spider_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        data['img_urls'] = img_urls
    elif 'class="swiper-container"' in response_in_function_text:
        # probably a picture-style news article
        content = ''
        img_urls = []
        for title_for in datasoup.select('body > div.content > h2'):
            title = title_for.text
        for publish_time_for in datasoup.select('body > div.content > p.jieshao > span:nth-of-type(4)'):
            publish_time = publish_time_for.text + ':00'
        for publish_user_for in datasoup.select('body > div.content > p.jieshao > span:nth-of-type(3) > a'):
            publish_user = publish_user_for.text.replace(' ', '').replace('\t', '').replace('\n', '').replace('\r', '').replace(u'来源:', '')
            break
        for publish_user_for in datasoup.select('body > div.content > p.jieshao > span:nth-of-type(3)'):
            publish_user = publish_user_for.text
            break
        for content_for in datasoup.select('body > div.content > p.zongjie'):
            content += content_for.text
        for img_url in datasoup.select('div.swiper-container > div.swiper-wrapper > div.swiper-slide > div.imgdiv > img'):
            img_urls.append(img_url.get('src'))
        try:
            data['title'] = title
            data['content'] = content
            data['publish_time'] = publish_time
            data['publish_user'] = publish_user
            data['reply_nodes'] = []
            data['spider_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            data['img_urls'] = img_urls
        except Exception as e:
            print e
            return
    else:
        print url, '-----not in neirong and picture deal module'
        return
    while len(self.comments_url_list) > LEN_COMMENT_LIST:
        time.sleep(1)
        # print data
    self.comments_url_list.append(data)