# -*- coding: utf-8 -*-
# Imports shared by the snippets in this listing (the original files are
# Python 2; project-local helpers such as MongoDB, TempMongoDB, NewsUrl,
# huanCun, pathOneNode, pathAllNode and pathGetImg live in modules not
# shown here).
import json
import math
import re
import time
import threading
import Queue
import urllib2

import MySQLdb
import requests
from bs4 import BeautifulSoup
from lxml import etree


# Constructor variant that also keeps a MySQL connection for url bookkeeping.
def __init__(self):
    self.comment = NewsComment()
    self.mongo = MongoDB()
    self.conn = MySQLdb.connect('localhost', 'root', '1995', 'newsurl',
                                charset='utf8', use_unicode=True)
    self.cursor = self.conn.cursor()
    self.message = NewsMessage()
class NewsComment(object):
    def __init__(self):
        self.mongo = MongoDB()

    def run(self, news_url, page):
        # Xinhuanet article urls look like .../c_<id>.htm; the id keys the
        # comment API.
        bu = re.split(r'c_|\.htm', news_url)[1]
        comment_url = ('http://comment.home.news.cn/a/newsCommAll.do'
                       '?&newsId=1-%s&pid=%d' % (bu, page))
        json_object = dict()
        comment_dict = dict()
        flag = 1
        while 1:
            try:
                # The endpoint returns JSONP: strip the "var ...=" prefix and
                # the trailing semicolon, then parse the JSON payload.
                json_object = json.loads(
                    requests.get(comment_url, timeout=30).content.replace(
                        'var commentJsonVarStr___=', '')[:-1])
                break
            except Exception as e:
                flag += 1
                print "Error fetching comments:", e
                if flag > 5:
                    return
        for item in json_object['contentAll']:
            # comment text
            comment_dict['ping_lun_nei_rong'] = item["content"]
            # comment time
            comment_dict['ping_lun_shi_jian'] = item["commentTime"]
            # reply count (not provided by this endpoint)
            comment_dict['hui_fu_shu'] = None
            # upvote count
            comment_dict['dian_zan_shu'] = item["upAmount"]
            # commenter id
            ping_lun_id = item["userId"]
            comment_dict['ping_lun_id'] = ping_lun_id
            # user nickname
            comment_dict['yong_hu_ming'] = item["nickName"]
            # gender (not provided)
            comment_dict['xing_bie'] = None
            # user level (not provided)
            comment_dict['yong_hu_deng_ji'] = None
            # user province, from the IP info
            comment_dict['yong_hu_sheng_fen'] = item["ipInfo"]
            # crawl time
            comment_dict['do_time'] = time.time()
            # source site (Xinhuanet)
            comment_dict['zhan_dian'] = u'新华网'
            # primary key
            comment_dict['_id'] = str(ping_lun_id) + news_url
            # print json.dumps(comment_dict, ensure_ascii=False, indent=4)
            self.mongo.put_comment(comment_dict)
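# --- Every class in this listing depends on a MongoDB() wrapper with
# put_content / put_comment / update_content / get_comment_data methods that
# is not shown here. Below is a minimal sketch of what such a wrapper could
# look like, assuming a local mongod and pymongo; the database and collection
# names are illustrative guesses, not the project's actual ones.
import pymongo


class MongoDB(object):
    """Hypothetical persistence wrapper; upserts by the '_id' primary key."""

    def __init__(self, host='localhost', port=27017):
        client = pymongo.MongoClient(host, port)
        db = client['news']              # assumed database name
        self.content = db['content']     # article documents
        self.comment = db['comment']     # comment documents

    def put_content(self, message_dict):
        # Upsert so a re-crawled article overwrites its previous snapshot.
        self.content.replace_one({'_id': message_dict['_id']},
                                 message_dict, upsert=True)

    def put_comment(self, comment_dict):
        self.comment.replace_one({'_id': comment_dict['_id']},
                                 comment_dict, upsert=True)

    def update_content(self, condition, fields):
        self.content.update_one(condition, {'$set': fields})

    def get_comment_data(self, condition):
        # Requires pymongo >= 3.7 for count_documents.
        return self.comment.count_documents(condition)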
class NewsMessage(object):
    def __init__(self):
        self.comment = NewsComment()
        self.mongo = MongoDB()
        # self.update = update_uel()
        # self.conn = MySQLdb.connect('localhost', 'root', '1995', 'newsurl',
        #                             charset='utf8', use_unicode=True)
        # self.cursor = self.conn.cursor()

    def getNewsMessage(self):
        count = 0
        for news_url in NewsUrl.Run():
            # Skip urls that do not respond at all.
            req = urllib2.Request(news_url)
            try:
                urllib2.urlopen(req)
            except urllib2.URLError, e:
                if hasattr(e, 'code'):
                    print 'Error code: ', e.code
                elif hasattr(e, 'reason'):
                    print 'Reason: ', e.reason
                continue
            re_ = r'http://sports\.sohu\.com/\d*?/n\d*?\.shtml'
            if re.match(re_, news_url):
                # Old-style sports.sohu.com pages (GBK encoded).
                print news_url
                html = ''
                flag = 1
                while 1:
                    try:
                        html = requests.get(news_url, timeout=30)
                        html.encoding = 'gb2312'
                        break
                    except Exception as e:
                        flag += 1
                        print e
                        if flag > 10:
                            return
                soup = BeautifulSoup(html.text, 'html.parser')
                re_ = r'.*n(\d*?)\.shtml'
                _id = re.match(re_, news_url).group(1)
                title = soup.find_all('title')[0].text
                if title == u"404,您访问的页面已经不存在!":  # 404 page title
                    continue
                # Fetch the read count and the comment count.
                comment_number = self.getCommentNumber(news_url, _id)
                if comment_number:
                    yue_du_shu = comment_number[0]
                    ping_lun_shu_liang = comment_number[1]
                else:
                    yue_du_shu = 0
                    ping_lun_shu_liang = 0
                # select_sql = """
                #     select ping_lun_shu_liang from news where url = %s"""
                # if self.cursor.execute(select_sql, news_url):
                #     data = self.cursor.fetchone()
                #     if data[0] == ping_lun_shu_liang:
                #         continue
                message_dict = dict()
                ping_dic = dict()
                # publish time: only keep articles published today
                shijian = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                fa_bu_shi_jian = soup.find_all(id='pubtime_baidu')[0].text
                if not re.search(shijian, fa_bu_shi_jian):
                    continue
                message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
                # article url
                message_dict['wen_zhang_wang_zhi'] = news_url
                # article title: keep the part of <title> before the "-" suffix
                wen_zhang_biao = soup.title.string.encode('utf-8')
                wen_ = r'(.*?)\-.*?'
                wen_zhang_biao_ti = re.search(wen_, wen_zhang_biao).group(1)
                message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
                # comment count
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
                # article source
                wen_zhang_lai_yuan = soup.find_all(
                    id="media_span")[0].text.encode('utf-8')
                message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
                # article body: join every <p> under div#contentText
                li = []
                for i in soup.select("div#contentText"):
                    for wen_zhang_zheng_wen in i.select('p'):
                        li.append(wen_zhang_zheng_wen.text.encode('utf-8'))
                message_dict['wen_zhang_zheng_wen'] = ",".join(li)
                # crawl time
                message_dict['do_time'] = time.time()
                # source site (Sohu)
                message_dict['zhan_dian'] = u'搜狐网'
                # image links (none on this page type)
                message_dict['tu_pian_lian_jie'] = None
                # article column, e.g. "搜狐体育->足球" (kept as unicode to
                # avoid a str/unicode concat error)
                wen_zhang_lan_mu = u'搜狐体育' + soup.select("div#mypos")[0].text
                try:
                    message_dict['wen_zhang_lan_mu'] = \
                        wen_zhang_lan_mu.replace(u'>', u'->')
                except Exception as e:
                    print e
                    message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
                # article author
                wen_zhang_zuo_zhe = soup.find_all(
                    id="author")[0].text.encode('utf-8')
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe
                # keywords (not available)
                message_dict['guan_jian_ci'] = None
                # related tags (not available)
                message_dict['xiang_guan_biao_qian'] = None
                # read count
                message_dict['yue_du_shu'] = yue_du_shu
                # primary key
                message_dict['_id'] = _id + '|_|' + news_url
                count += 1
                ping_dic['url'] = news_url
                ping_dic['_id'] = _id
                ping_dic['ping_lun_shu_liang'] = ping_lun_shu_liang
                # self.update.process_item(ping_dic)
                # print json.dumps(message_dict, ensure_ascii=False, indent=4)
                self.mongo.put_content(message_dict)
                if ping_lun_shu_liang > 0:
                    # 10 comments per page
                    all_page = int(math.ceil(ping_lun_shu_liang / 10.0))
                    for page in xrange(1, all_page + 1):
                        try:
                            self.comment.run(news_url, _id, page)
                        except Exception as e:
                            print e
                            self.comment.run(news_url, _id, page)
                continue
            else:
                # New-style www.sohu.com/a/<id>_<media> pages (UTF-8).
                print news_url
                html1 = ''
                flag = 1
                while 1:
                    try:
                        html1 = requests.get(news_url, timeout=30)
                        html1.encoding = 'utf-8'
                        break
                    except Exception as e:
                        flag += 1
                        print e
                        if flag > 10:
                            return
                tree = etree.HTML(html1.text)
                soup = BeautifulSoup(html1.text, 'html.parser')
                re_ = r'http://www\.sohu\.com/a/(\d*?)\_'
                title = soup.find_all('title')[0].text
                if title == u"404,您访问的页面已经不存在!":
                    continue
                if soup.select("#mp-comment") != []:
                    _id = soup.select("#mp-comment")[0]['sid'].encode("utf-8")
                    if int(_id) == 0:
                        _id = 'mp_' + re.search(re_, news_url).group(1)
                else:
                    continue
                # Fetch the read count and the comment count.
                comment_number = self.getCommentNumber(news_url, _id)
                if comment_number:
                    yue_du_shu = comment_number[0]
                    ping_lun_shu_liang = comment_number[1]
                else:
                    yue_du_shu = 0
                    ping_lun_shu_liang = 0
                # select_sql = """
                #     select ping_lun_shu_liang from news where url = %s"""
                # if self.cursor.execute(select_sql, news_url):
                #     data = self.cursor.fetchone()
                #     if data[0] == ping_lun_shu_liang:
                #         continue
                message_dict = dict()
                ping_dic = dict()
                # publish time: only keep articles published today
                shijian = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                try:
                    fa_bu_shi_jian = soup.select('span#news-time')[0].text
                except:
                    fa_bu_shi_jian = soup.select('span.time')[0].text
                if not re.search(shijian, fa_bu_shi_jian):
                    continue
                message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
                # article url
                message_dict['wen_zhang_wang_zhi'] = news_url
                # article title: keep the part of <title> before the "_" suffix
                wen_zhang_biao = soup.title.string.encode('utf-8')
                wen_ = r'(.*?)\_.*?'
                try:
                    wen_zhang_biao_ti = re.search(wen_, wen_zhang_biao).group(1)
                    message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
                except:
                    message_dict['wen_zhang_biao_ti'] = "无"  # none
                # comment count
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
                # article source
                try:
                    wen_zhang_lai_yuan = soup.select(
                        "#user-info h4 a")[0].text.encode('utf-8')
                    message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
                except:
                    message_dict['wen_zhang_lai_yuan'] = "空"  # empty
                # article body: join every <p> under article.article
                li = []
                for i in soup.select("article.article"):
                    for wen_zhang_zheng_wen in i.select('p'):
                        li.append(wen_zhang_zheng_wen.text.encode('utf-8'))
                message_dict['wen_zhang_zheng_wen'] = ','.join(li)
                # crawl time
                message_dict['do_time'] = time.time()
                # source site (Sohu)
                message_dict['zhan_dian'] = u'搜狐网'
                # image links: normalise protocol-relative src attributes
                if not soup.select('.article img'):
                    message_dict['tu_pian_lian_jie'] = None
                else:
                    tu = []
                    for tu_pian_lian_jie in soup.select('.article img'):
                        if not re.search('http', tu_pian_lian_jie['src']):
                            tu.append("http:" + tu_pian_lian_jie['src'])
                        else:
                            tu.append(tu_pian_lian_jie['src'])
                    message_dict['tu_pian_lian_jie'] = " ".join(tu)
                # article column (breadcrumb)
                try:
                    wen_zhang_lan_mu = soup.select(
                        ".location.area")[0].text.encode('utf-8')
                except:
                    wen_zhang_lan_mu = ""
                try:
                    message_dict['wen_zhang_lan_mu'] = \
                        wen_zhang_lan_mu.replace('>', '->')
                except Exception as e:
                    print e
                    message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
                # article author (not available)
                message_dict['wen_zhang_zuo_zhe'] = None
                # keywords (not available)
                message_dict['guan_jian_ci'] = None
                # related tags (not available)
                message_dict['xiang_guan_biao_qian'] = None
                # read count
                message_dict['yue_du_shu'] = yue_du_shu
                # primary key
                message_dict['_id'] = _id + '|_|' + news_url
                count += 1
                ping_dic['url'] = news_url
                ping_dic['_id'] = _id
                ping_dic['ping_lun_shu_liang'] = ping_lun_shu_liang
                # self.update.process_item(ping_dic)
                # print json.dumps(message_dict, ensure_ascii=False, indent=4)
                self.mongo.put_content(message_dict)
                if ping_lun_shu_liang > 0:
                    # 10 comments per page
                    all_page = int(math.ceil(ping_lun_shu_liang / 10.0))
                    for page in xrange(1, all_page + 1):
                        try:
                            self.comment.run(news_url, _id, page)
                        except Exception as e:
                            print e
                            self.comment.run(news_url, _id, page)
                continue
class NewsMessage(object):
    def __init__(self):
        self.comment = NewsComment()
        self.mongo = MongoDB()
        self.huan = huanCun()
        # self.genzong = genZong()

    def getNewsMessage(self):
        # self.genzong.run()
        for news_url in NewsUrl.Run():
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) "
                              "Gecko/20100101 Firefox/51.0",
                "Host": re.split(r"/", news_url)[2],
            }
            html = ''
            flag = 1
            while 1:
                try:
                    html = requests.get(news_url, headers=headers,
                                        timeout=30).content
                    break
                except Exception as e:
                    flag += 1
                    print "REQUEST ERROR:", e
                    print "URL:" + news_url
                    if flag > 10:
                        return
            tree = etree.HTML(html)
            # Fetch the read count and the comment count.
            comment_number = self.getCommentNumber(news_url)
            if comment_number:
                yue_du_shu = comment_number[0]
                ping_lun_shu_liang = comment_number[1]
            else:
                yue_du_shu = 0
                ping_lun_shu_liang = 0
            message_dict = dict()
            # article url
            message_dict['wen_zhang_wang_zhi'] = news_url
            # article title; page templates differ, so try several XPaths
            wen_zhang_biao_ti = pathOneNode(
                tree,
                ".//*[@class='h-title']/text()|.//*[@class='btt']/h1/text()|"
                ".//*[@class='tit']/h1/text()|.//*[@class='sm01']/text()|"
                ".//*[@id='title']/text()")
            message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
            # publish time
            fa_bu_shi_jian = pathOneNode(
                tree,
                ".//*[@class='h-time']/text()|.//*[@class='time']/text()|"
                ".//*[@class='gnxx']/div[2]/text()|.//*[@class='tm']/text()|"
                ".//*[@class='sm02']/text()|.//*[@id='pubtime']/text()")
            message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
            # comment count
            message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
            # article source
            try:
                wen_zhang_lai_yuan = tree.xpath(
                    ".//*[@class='ly']/a/text()|.//*[@class='gnxx']/div[1]/text()|"
                    ".//*[@class='sus']/a/text()|.//*[@class='sm02']/a/text()|"
                    ".//*[@id='source']/text()"
                )[-1].replace(u'来源:', '').replace('\r\n', '').replace(' ', '')
            except:
                wen_zhang_lai_yuan = pathAllNode(tree, ".//*[@id='source']//text()")
            message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
            # article body
            try:
                wen_zhang_zheng_wen = tree.xpath(
                    ".//*[@id='p-detail']//p/text()|"
                    ".//*[@class='content']//p/text()|"
                    ".//*[@id='content']//p/text()")
            except:
                wen_zhang_zheng_wen = pathAllNode(tree, ".//*[@id='xhw']")
            zheng_wen = ''
            for i in wen_zhang_zheng_wen:
                # strip full-width spaces and line noise
                zheng_wen = zheng_wen + i.replace(u'\u3000', '').replace(
                    '\r\n', '').replace(' ', '')
            message_dict['wen_zhang_zheng_wen'] = zheng_wen
            # crawl time
            message_dict['do_time'] = time.time()
            # source site (Xinhuanet)
            message_dict['zhan_dian'] = u'新华网'
            # image links: page images are relative, so rebuild the url prefix
            photo_URL_qian = re.findall(
                r'http://[a-zA-Z]+\.xinhuanet\.com/[a-zA-Z]+/\d+-\d+/\d+/|'
                r'http://ent\.news\.cn/\d+-\d+/\d+/|'
                r'http://www\.sc\.xinhuanet\.com/[a-zA-Z]+/\d+-\d+/\d+/',
                news_url)[0]
            tu_pian_lian = ''
            try:
                tu_pian_lian_jie = tree.xpath(
                    ".//*[@align='center']/img/@src|"
                    ".//*[@align='center']/span/img/@src")
                if tu_pian_lian_jie:
                    for i in tu_pian_lian_jie:
                        photo_URL = photo_URL_qian + i
                        tu_pian_lian = tu_pian_lian + ' ' + photo_URL
            except:
                print "photo Error:" + news_url
            message_dict['tu_pian_lian_jie'] = tu_pian_lian
            # article column, derived from the url path segment
            lan_mu = re.split('/', news_url)[3]
            if lan_mu == 'politics':
                wen_zhang_lan_mu = lan_mu
            elif lan_mu == 'c':
                wen_zhang_lan_mu = 'sport'
            elif lan_mu == 'content':
                wen_zhang_lan_mu = 'bendi'  # local news
            else:
                wen_zhang_lan_mu = 'yule'   # entertainment
            message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
            # article author / editor
            try:
                try:
                    con = tree.xpath(".//*[@class='tiyi1']/../text()")[-1]
                    wen_zhang_zuo_zhe = ''
                    for i in con:
                        wen_zhang_zuo_zhe += i
                except:
                    wen_zhang_zuo_zhe = pathAllNode(
                        tree,
                        ".//*[@class='p-jc']|.//*[@class='bjn']|"
                        ".//*[@class='bj']|.//*[@class='editor']")
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe.replace(
                    u'【纠错】', '').replace(u'责任编辑', '').replace(
                    u'体育—', '').replace('\r\n', '').replace(
                    u'：', '').replace(':', '').replace('[', '').replace(']', '')
            except:
                message_dict['wen_zhang_zuo_zhe'] = None
            # keywords from the <meta name="keywords"> tag
            try:
                guan_jian_ci = tree.xpath(
                    './/*[@name="keywords"]/@content')[0].replace('\r\n', '')
            except:
                guan_jian_ci = None
            message_dict['guan_jian_ci'] = guan_jian_ci
            # related tags (not available)
            message_dict['xiang_guan_biao_qian'] = None
            # read count
            message_dict['yue_du_shu'] = yue_du_shu
            # primary key
            message_dict['_id'] = news_url
            if message_dict['fa_bu_shi_jian'] is None:
                # No publish time found: log the url for later inspection.
                with open("ERROR.text", "a") as f:
                    f.write(news_url + "\n")
            else:
                # print json.dumps(message_dict, ensure_ascii=False, indent=4)
                self.mongo.put_content(message_dict)
                if ping_lun_shu_liang > 0:
                    # 20 comments per page
                    all_page = ping_lun_shu_liang / 20
                    for page in xrange(1, all_page + 2):
                        self.comment.run(news_url, page)
            # tracking (disabled):
            # dict_zhui = {'url': news_url, 'num': ping_lun_shu_liang,
            #              '_id': news_url}
            # self.huan.put_zhuizong(dict_zhui)

    def getCommentNumber(self, news_url):
        bu = re.split(r'c_|\.htm', news_url)[1]
        comment_url = 'http://comment.home.news.cn/a/newsCommAll.do?newsId=1-' + bu
        json_object = dict()
        flag = 1
        while 1:
            try:
                json_object = json.loads(
                    requests.get(comment_url, timeout=30).content.replace(
                        'var commentJsonVarStr___=', '')[:-1])
                break
            except Exception as e:
                flag += 1
                print e
                if flag > 5:
                    return
        # read count (not provided by this endpoint)
        yue_du_shu = None
        # comment count
        ping_lun_shu_liang = json_object['totalRows']
        return yue_du_shu, ping_lun_shu_liang
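# --- The same "strip the JSONP wrapper, then json.loads" pattern appears in
# every comment fetcher in this listing. A small helper as a sketch: the
# prefix string is the one the Xinhuanet endpoint returns, but 'load_jsonp'
# itself is illustrative and not part of the original code.
def load_jsonp(raw, prefix='var commentJsonVarStr___='):
    """Strip a 'var x=...;' JSONP wrapper and parse the JSON payload."""
    return json.loads(raw.replace(prefix, '')[:-1])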
class MyFetchThreadSecond(threading.Thread):
    def __init__(self, workQueue, saveQueue, timeout=30):
        threading.Thread.__init__(self)
        self.timeout = timeout
        self.setDaemon(True)
        self.workQueue = workQueue
        self.saveQueue = saveQueue
        self.mongodb = MongoDB()
        self.start()

    def working_one(self, item, label):
        # Fetch one article page and return its lxml tree (fetchFirst -> json).
        if label == 'shehui' or label == 'guonei':
            label = 'news.163.com'
        else:
            label = '%s.163.com' % label
        header = {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': label,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        }
        flag = 1
        html = ''
        while 1:
            try:
                html = requests.get(url=item, headers=header, timeout=30). \
                    content.decode('gbk').encode('utf-8')
                break
            except Exception as e:
                print e
                flag += 1
                if flag > 10:
                    break
        wenzhang_tree = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        return wenzhang_tree

    def Xpath_for_content(self, tree, path):
        try:
            return tree.xpath(path)
        except Exception as e:
            print "ERROR: Locate in the FetchSecond's Xpath_for_content " \
                  "method, exception: %s" % e
            return None

    def run(self):
        while not self.workQueue.empty():
            try:
                # print "%s start working" % self.name
                # Condition: the json item's label must be u'其它' ("other")
                # and its timestamp must be today.
                item = self.workQueue.get()
                if item['label'] != u'其它':
                    continue
                current_time = time.strftime("%m/%d/%Y",
                                             time.localtime(time.time()))
                result = re.search(current_time, item['time'])
                if result is None:
                    continue
                # get the news article's html tree
                wenzhang_tree = self.working_one(item['docurl'],
                                                 item['channelname'])
                message_dict = dict()
                # article source
                wen_zhang_lai_yuan = self.Xpath_for_content(
                    wenzhang_tree, '//*[@id="ne_article_source"]/text()')
                message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan[0]
                # article body
                wen_zhang_zheng_wen = self.Xpath_for_content(
                    wenzhang_tree, '//*[@id="endText"]//p/text()')
                b = '\n'
                for temp in wen_zhang_zheng_wen:
                    b += temp
                message_dict['wen_zhang_zheng_wen'] = b
                # article column (breadcrumb)
                wen_zhang_lan_mu = self.Xpath_for_content(
                    wenzhang_tree,
                    '//*[@id="ne_wrap"]/body//div/div[@class="clearfix"]'
                    '/div[@class="post_crumb"]//a/text()')
                c = '\n'
                for temp2 in wen_zhang_lan_mu:
                    c += temp2
                    c += ' '
                message_dict['wen_zhang_lan_mu'] = c
                # comment page url
                message_dict['ping_lun_wang_zhi'] = item['commenturl']
                # article url
                wen_zhang_wang_zhi = item['docurl']
                message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi
                # article title
                message_dict['wen_zhang_biao_ti'] = item['title']
                # publish time
                message_dict['fa_bu_shi_jian'] = item['time']
                # participant / comment count
                ping_lun_shu_liang = item['tienum']
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
                # crawl time
                message_dict['do_time'] = time.time()
                # source site (NetEase News)
                message_dict['zhan_dian'] = u'网易新闻'
                # image links (not collected here)
                message_dict['tu_pian_lian_jie'] = None
                # article author (not collected here)
                message_dict['wen_zhang_zuo_zhe'] = None
                # keywords
                try:
                    message_dict['guan_jian_ci'] = item['keywords'][0]['keyname']
                except Exception as e:
                    message_dict['guan_jian_ci'] = None
                    print "ERROR: Locate in the FetchSecond's run method " \
                          "for guan_jian_ci, exception: %s" % e
                # related tags (not available)
                message_dict['xiang_guan_biao_qian'] = None
                # read count (the feed only exposes the participant count)
                message_dict['yue_du_shu'] = ping_lun_shu_liang
                # primary key
                message_dict['_id'] = wen_zhang_wang_zhi
                # save message_dict
                self.mongodb.put_content(message_dict)
                # hand the url info to the comment threads
                url_info = re.match(
                    r'https?://(.*?)\.163\.com/\d*?/\d*?/\d*?/(.*?)\.html',
                    wen_zhang_wang_zhi)
                all_thing = (url_info, wen_zhang_wang_zhi)
                self.saveQueue.put(all_thing)
            except Exception as e:
                print "ERROR: Locate in the FetchSecond's run method " \
                      "'while not Queue empty', exception: %s" % e
                continue
class CheckUpdate(object):
    def __init__(self, savequeue):
        self.saveQueue = savequeue
        self.check_mongodb = TempMongoDB()
        self.update_mongodb = MongoDB()

    def run(self):
        old_data = self.check_mongodb.get()
        count_for_news = 0
        count_for_comment = 0
        try:
            for every_data in old_data:
                ping_lun_shu = every_data['ping_lun_shu']
                content_url = every_data['_id']
                # Get the current comment count and compare; if it changed,
                # re-queue the article so the new comments get fetched.
                info = re.match(
                    r'http://(.*?)\.163\.com/\d*?/\d*?/\d*?/(.*?)\.html',
                    content_url)
                default_url = ('http://comment.%s.163.com/api/v1/products/'
                               'a2869674571f77b5a0867c3d71db5856/threads/%s/'
                               'comments/newList?offset=0&limit=30&'
                               'showLevelThreshold=72&headLimit=1&tailLimit=2'
                               % (info.group(1), info.group(2)))
                current_shuliang = self.working(
                    info=info, comment_url=default_url)['newListSize']
                num = current_shuliang - ping_lun_shu
                if num > 0:
                    put_data = (info, content_url)
                    condition_one = {'_id': content_url}
                    condition_two = {'do_time': time.time()}
                    self.saveQueue.put(put_data)
                    self.update_mongodb.update_content(condition_one,
                                                       condition_two)
                    count_for_news += 1
                    count_for_comment += num
                else:
                    remove_condition = {"_id": content_url}
                    self.check_mongodb.delete(remove_condition)
        except Exception as e:
            print "ERROR: Locate in the CheckUpdate, exception: %s" % e
        finally:
            print "UPDATE: %d news items updated, %d comments updated" % \
                  (count_for_news, count_for_comment)

    def working(self, info, comment_url):
        host = 'comment.%s.163.com' % info.group(1)
        header = {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': host,
            'Referer': comment_url,
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        }
        flag = 1
        while 1:
            try:
                json_data = json.loads(
                    requests.get(url=comment_url, headers=header,
                                 timeout=30).content)
                break
            except Exception as e:
                print "ERROR: Failed to get the comment's json, " \
                      "exception: %s" % e
                flag += 1
                if flag > 5:
                    return
        return json_data
class MyCommentThread(threading.Thread):
    def __init__(self, workqueue):
        threading.Thread.__init__(self)
        self.workQueue = workqueue
        self.setDaemon(True)
        self.start()
        self.mongodb = MongoDB()
        self.checkMongoDB = TempMongoDB()

    def run(self):
        while not self.workQueue.empty():
            try:
                # print "%s start working" % self.name
                info, wenzhang_Url = self.workQueue.get()
                default_url = ('http://comment.%s.163.com/api/v1/products/'
                               'a2869674571f77b5a0867c3d71db5856/threads/%s/'
                               'comments/newList?offset=0&limit=30&'
                               'showLevelThreshold=72&headLimit=1&tailLimit=2'
                               % (info.group(1), info.group(2)))
                # working() saves the first page and returns how many more
                # pages of comments exist.
                pages = self.working(wenzhang_Url, default_url, info)
                if pages > 0:
                    comment_urls = list()
                    for i in range(1, pages + 1):
                        offset = i * 30
                        temp = ('http://comment.%s.163.com/api/v1/products/'
                                'a2869674571f77b5a0867c3d71db5856/threads/%s'
                                '/comments/newList?offset=%d&limit=30&'
                                'showLevelThreshold=72&headLimit=1&tailLimit=2'
                                % (info.group(1), info.group(2), offset))
                        comment_urls.append(temp)
                    for item in comment_urls:
                        drop = self.working(wenzhang_Url, item, info)
            except Exception as e:
                print "ERROR: Locate in the CommentThread's run method " \
                      "'while not Queue empty', exception: %s" % e
                continue

    def working(self, content_url, the_comment_url, info):
        host = 'comment.%s.163.com' % info.group(1)
        header = {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': host,
            'Referer': the_comment_url,
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        }
        # Fetch one page of comment json (the first page by default).
        flag = 1
        while 1:
            try:
                json_data = json.loads(
                    requests.get(url=the_comment_url, headers=header,
                                 timeout=30).content)
                break
            except Exception as e:
                print "ERROR: Failed to get the comment's json, " \
                      "exception: %s" % e
                flag += 1
                if flag > 5:
                    return
        pages = 0
        try:
            for comment_id in json_data['comments']:
                comment_dict = dict()
                # comment text
                comment_dict['ping_lun_nei_rong'] = \
                    json_data['comments'][str(comment_id)]['content']
                # comment time
                comment_dict['ping_lun_shi_jian'] = \
                    json_data['comments'][str(comment_id)]['createTime']
                # reply count (not provided)
                comment_dict['hui_fu_shu'] = None
                # upvote count
                comment_dict['dian_zan_shu'] = \
                    json_data['comments'][str(comment_id)]['vote']
                # comment id
                ping_lun_id = comment_id
                comment_dict['ping_lun_id'] = ping_lun_id
                # user nickname
                try:
                    comment_dict['yong_hu_ming'] = \
                        json_data['comments'][str(comment_id)]['user']['nickname']
                except Exception as e:
                    comment_dict['yong_hu_ming'] = None
                # gender (not provided)
                comment_dict['xing_bie'] = None
                # user level (not provided)
                comment_dict['yong_hu_deng_ji'] = None
                # user province
                comment_dict['yong_hu_sheng_fen'] = \
                    json_data['comments'][str(comment_id)]['user']['location']
                # crawl time
                do_time = time.time()
                comment_dict['do_time'] = do_time
                # source site (NetEase News)
                comment_dict['zhan_dian'] = u'网易新闻'
                # primary key
                comment_dict['_id'] = ping_lun_id + content_url
                # total comment count -> number of extra pages (30 per page)
                ping_lun_shu = json_data['newListSize']
                pages = ping_lun_shu / 30
                self.mongodb.put_comment(comment_dict)
            # record the article in the check database for later update runs
            check_dict = dict()
            check_dict['_id'] = content_url
            check_dict['do_time'] = do_time
            check_dict['ping_lun_shu'] = ping_lun_shu
            self.checkMongoDB.put(check_dict)
            return pages
        except Exception as e:
            print "ERROR: Locate in the CommentThread's working method for " \
                  "parsing json data, exception: %s, and json data is %s" % \
                  (e, json_data)
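# --- The driver below relies on ThreadPoolOfCommentThread and
# ThreadPoolOfFetchSecond, which are not part of this listing. A minimal
# sketch of such a pool, assuming it only has to spawn N workers over a
# shared queue and join them; the originals may differ.
class ThreadPoolOfCommentThread(object):
    def __init__(self, num_of_threads, workqueue):
        # MyCommentThread starts itself in __init__, so creating it is enough.
        self.threads = [MyCommentThread(workqueue)
                        for _ in range(num_of_threads)]

    def wait_for_complete(self):
        # The workers exit once the queue drains, so joining each suffices.
        for t in self.threads:
            t.join()

# ThreadPoolOfFetchSecond would be the analogous wrapper around
# MyFetchThreadSecond(workQueue, saveQueue).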
updateUrl_Queue = Queue.Queue()
# First: check the old news urls for updated comment counts
Updater = CheckUpdate(savequeue=updateUrl_Queue)
Updater.run()
CommentPool = ThreadPoolOfCommentThread(num_of_threads=10,
                                        workqueue=updateUrl_Queue)
CommentPool.wait_for_complete()
print "UPDATE: Completed checking the old news!"
# Second: collect all news urls from the json feeds
for url in Urls:
    StartUrlQueue.put(url)
fetch_1 = MyFetchThreadFirst(workQueue=StartUrlQueue, saveQueue=Json_Url_Queue)
fetch_1.join()
# Third: parse the json data, fetch each news page, build the message_dict
# and save it
print "FETCH: %d new news items fetched today" % Json_Url_Queue.qsize()
FetchSecondPool = ThreadPoolOfFetchSecond(num_of_threads=6,
                                          workqueue=Json_Url_Queue,
                                          savequeue=urlInfo_Queue)
FetchSecondPool.wait_for_complete()
# Fourth: request each comment url and save the comments' message_dict
CommentPool = ThreadPoolOfCommentThread(num_of_threads=10,
                                        workqueue=urlInfo_Queue)
CommentPool.wait_for_complete()
start_uptime, end_uptime = getCondition()
time_condition = {"do_time": {"$gte": start_uptime, "$lt": end_uptime},
                  "zhan_dian": u"网易新闻"}
temp_mongo = MongoDB()
num_of_comments = temp_mongo.get_comment_data(time_condition)
print "FETCH: %d comments fetched today" % num_of_comments
print "TIME: Total spent %d seconds" % (time.time() - start_moment)
print "SLEEP: Mission complete, going to sleep...."
sleep_seconds = DeltaSeconds()
time.sleep(sleep_seconds)
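# --- getCondition() and DeltaSeconds() are referenced above but not shown.
# Sketches of what they plausibly compute, assuming the crawler runs once a
# day: the [start, end) timestamp window of "today", and the seconds to
# sleep until the next run. Both are guesses at the originals.
def getCondition():
    """Return today's [00:00, 24:00) window as Unix timestamps."""
    now = time.localtime()
    start = time.mktime((now.tm_year, now.tm_mon, now.tm_mday,
                         0, 0, 0, 0, 0, -1))
    return start, start + 24 * 60 * 60


def DeltaSeconds():
    """Seconds remaining until the next midnight."""
    _, end = getCondition()
    return int(end - time.time())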
class NewsComment(object):
    def __init__(self):
        self.mongo = MongoDB()

    def run(self, news_url, page):
        comment_url = ('http://comment.ifeng.com/get.php?docUrl=%s&format=js'
                       '&job=1&p=%d&pageSize=20' % (news_url, page))
        json_object = dict()
        comment_dict = dict()
        flag = 1
        while 1:
            try:
                # Strip the JSONP wrapper, then parse.
                json_object = json.loads(
                    requests.get(comment_url, timeout=30).content.replace(
                        'var commentJsonVarStr___=', '')[:-1])
                break
            except Exception as e:
                flag += 1
                print "Error fetching comments:", e
                if flag > 3:
                    return
        for item in json_object['comments']:
            # comment text
            comment_dict['ping_lun_nei_rong'] = item["comment_contents"]
            # comment time
            comment_dict['ping_lun_shi_jian'] = item["create_time"]
            # reply count (not provided)
            comment_dict['hui_fu_shu'] = None
            # upvote count (not provided)
            comment_dict['dian_zan_shu'] = None
            # comment id
            ping_lun_id = item["comment_id"]
            comment_dict['ping_lun_id'] = ping_lun_id
            # user nickname
            comment_dict['yong_hu_ming'] = item["uname"]
            # gender (not provided)
            comment_dict['xing_bie'] = None
            # user level (not provided)
            comment_dict['yong_hu_deng_ji'] = None
            # user province
            comment_dict['yong_hu_sheng_fen'] = item["ip_from"]
            # crawl time
            comment_dict['do_time'] = time.time()
            # source site (ifeng)
            comment_dict['zhan_dian'] = u'凤凰网'
            # primary key
            comment_dict['_id'] = ping_lun_id + news_url
            # print json.dumps(comment_dict, ensure_ascii=False, indent=4)
            self.mongo.put_comment(comment_dict)
class NewsComment(object):
    def __init__(self):
        self.mongo = MongoDB()

    def run(self, news_url, _id, page):
        if news_url.endswith('shtml'):
            # Old-style pages: the topic is keyed by the numeric article id.
            comment_url = ('http://apiv2.sohu.com/api/topic/load?page_size=10'
                           '&topic_source_id=%s&page_no=1&hot_size=5'
                           '&topic_url=%s&source_id=%s' % (_id, news_url, _id))
            media_id = None
            source_id = None
        else:
            # New-style mp articles: .../<source_id>_<media_id> in the url.
            tow_ids = news_url.split('/')[-1].split('_')
            media_id = tow_ids[1]
            source_id = tow_ids[0]
            comment_url = ('http://apiv2.sohu.com/api/topic/load'
                           '?callback=jQuery1124008187733188312629_1539945526218'
                           '&page_size=10&topic_source_id=%s&page_no=1'
                           '&media_id=%s&source_id=mp_%s'
                           % (_id, media_id, source_id))
        # print comment_url
        json_object = dict()
        flag = 1
        while 1:
            try:
                comments = requests.get(comment_url, timeout=30).content
                # Callback-wrapped responses need the jQuery JSONP wrapper
                # stripped; plain responses parse directly.
                match = re.match(r'.*218\((.*?)\);', comments)
                json_object = json.loads(match.group(1) if match else comments)
                break
            except Exception as e:
                flag += 1
                print "Error fetching comments:", e
                if flag > 5:
                    return
        if u'topic_id' not in json_object[u'jsonObject']:
            print "topic_id is not available yet"
        else:
            item = json_object[u'jsonObject'][u'topic_id']
            if media_id is None:
                comment_URL = ('http://apiv2.sohu.com/api/comment/list'
                               '?page_size=10&topic_id=%s&page_no=%d'
                               '&source_id=%s' % (item, page, _id))
            else:
                comment_URL = ('http://apiv2.sohu.com/api/topic/load'
                               '?callback=jQuery1124008187733188312629_1539945526218'
                               '&page_size=10&topic_id=%s&page_no=%s'
                               '&media_id=%s&source_id=mp_%s'
                               % (item, page, media_id, source_id))
            Json_object = dict()
            comment_dict = dict()
            flag = 1
            while 1:
                try:
                    comments = requests.get(comment_URL, timeout=30).content
                    match = re.match(r'.*218\((.*?)\);', comments)
                    Json_object = json.loads(match.group(1) if match else comments)
                    break
                except Exception as e:
                    flag += 1
                    print "Error fetching comments:", e
                    if flag > 5:
                        return
            for item in Json_object[u'jsonObject'][u'comments']:
                # comment text
                comment_dict['ping_lun_nei_rong'] = item["content"]
                # comment time
                comment_dict['ping_lun_shi_jian'] = item["create_time"]
                # reply count
                comment_dict['hui_fu_shu'] = item["reply_count"]
                # upvote count
                comment_dict['dian_zan_shu'] = item["support_count"]
                # comment id
                ping_lun_id = item["comment_id"]
                comment_dict['ping_lun_id'] = ping_lun_id
                # user nickname
                if u'nickname' not in item[u'passport']:
                    comment_dict['yong_hu_ming'] = None
                else:
                    comment_dict['yong_hu_ming'] = item[u'passport']["nickname"]
                # gender (not provided)
                comment_dict['xing_bie'] = None
                # user level (not provided)
                comment_dict['yong_hu_deng_ji'] = None
                # user province
                comment_dict['yong_hu_sheng_fen'] = item["ip_location"]
                # crawl time
                comment_dict['do_time'] = time.time()
                # source site (Sohu)
                comment_dict['zhan_dian'] = u'搜狐网'
                # primary key
                comment_dict['_id'] = str(ping_lun_id) + '|_|' + news_url
                # print json.dumps(comment_dict, ensure_ascii=False, indent=4)
                self.mongo.put_comment(comment_dict)
class NewsMessage(object):
    def __init__(self):
        self.comment = NewsComment()
        self.mongo = MongoDB()
        self.f = open('test.txt', "r+")
        self.i = 0
        self.url_list = []

    def getNewsTotleUrl(self):
        for news_url in NewsUrl.Run():
            self.getNewsMessage(news_url)
        url_list_before = []
        if self.f.read():
            with open('test.txt', 'r+') as f:
                a = json.load(f)
            url_list_before = a['url']
            # Re-crawl any previously seen article whose comment count
            # changed; iterate over a copy because entries may be removed.
            for url_before in list(url_list_before):
                url_json = json.loads(url_before)['wen_zhang_wang_zhi']
                pin_lun_shu = self.getPinglun(url_json)
                if pin_lun_shu != json.loads(url_before)['ping_lun_shu_liang']:
                    self.getNewsMessage(url_json)
                else:
                    url_list_before.remove(url_before)
        self.file_close(url_list_before)

    def getNewsMessage(self, news_url):
        self.i += 1
        print self.i
        print news_url
        html = ''
        flag = 1
        while 1:
            try:
                html = requests.get(news_url, timeout=30).content
                break
            except Exception as e:
                flag += 1
                print e
                if flag > 10:
                    return
        tree = etree.HTML(html)
        # Fetch the read count and the comment count.
        comment_number = self.getCommentNumber(news_url)
        if comment_number:
            yue_du_shu = comment_number[0]
            ping_lun_shu_liang = comment_number[1]
        else:
            yue_du_shu = 0
            ping_lun_shu_liang = 0
        message_dict = dict()
        message_url = dict()
        # article url
        message_dict['wen_zhang_wang_zhi'] = news_url
        message_url['wen_zhang_wang_zhi'] = news_url
        # article title: strip the site-name suffixes from <title>
        wen_zhang_biao_ti = pathOneNode(tree, '//title/text()')
        if wen_zhang_biao_ti is not None:
            wen_zhang_biao_ti = wen_zhang_biao_ti.replace('_', '').replace(
                u"娱乐频道", "").replace(u"凤凰网", "").replace(
                u'凤凰体育', '').replace(u'凤凰财经', '')
        message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
        # publish time
        fa_bu_shi_jian = pathOneNode(
            tree, '//span[@itemprop="datePublished"]/text()')
        if fa_bu_shi_jian is None:
            fa_bu_shi_jian = pathOneNode(tree, '//*[@id="titL"]/p/span/text()')
        message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
        # comment count
        message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
        message_url['ping_lun_shu_liang'] = ping_lun_shu_liang
        # article source
        wen_zhang_lai_yuan = pathOneNode(
            tree, '//span[@itemprop="publisher"]/span/a/text()')
        if wen_zhang_lai_yuan is None:
            wen_zhang_lai_yuan = pathOneNode(
                tree, '//*[@id="artical_sth"]/p/span[3]/span/text()')
        if wen_zhang_lai_yuan is None:
            wen_zhang_lai_yuan = u'凤凰网'
        message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
        # article body; slideshow pages embed the text in a G_listdata script
        wen_zhang_zheng_wen = pathAllNode(tree, '//div[@id="main_content"]')
        if wen_zhang_zheng_wen is None:
            try:
                re_ = r"G_listdata=..\n{1,}.*({title:\'[\S\s]+?])"
                re__ = r"{title:\'([\S\s]+?)\',"
                text_first = re.findall(re_, html)
                text_conten = re.findall(re__, text_first[0])
                wen_zhang_zheng_wen = "".join(text_conten)
            except Exception as e:
                try:
                    wen_zhang_zheng_wen = pathAllNode(
                        tree, '//*[@id="slidedesc2"]')
                except Exception as e:
                    wen_zhang_zheng_wen = None
        message_dict['wen_zhang_zheng_wen'] = wen_zhang_zheng_wen
        # crawl time
        message_dict['do_time'] = time.time()
        # source site (ifeng)
        message_dict['zhan_dian'] = u'凤凰网'
        # image links
        tu_pian_lian_jie = pathGetImg(
            tree, '//*[@id="main_content"]//img[@alt]/@src')
        if tu_pian_lian_jie:
            message_dict['tu_pian_lian_jie'] = " ".join(tu_pian_lian_jie)
        else:
            message_dict['tu_pian_lian_jie'] = None
        # article column (breadcrumb); templates differ
        wen_zhang_lan_mu = pathAllNode(
            tree, '//div[@class="theCurrent cDGray js_crumb"]')
        if wen_zhang_lan_mu is None:
            wen_zhang_lan_mu = pathAllNode(
                tree, '//div[@class="speNav js_crumb"]')
        if wen_zhang_lan_mu is None:
            wen_zhang_lan_mu = pathAllNode(
                tree, '//div[@class="cmtNav js_crumb"]')
        try:
            message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu.replace('>', '->')
        except Exception as e:
            message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
        # article author (not available)
        message_dict['wen_zhang_zuo_zhe'] = None
        # keywords (not available)
        message_dict['guan_jian_ci'] = None
        # related tags (not available)
        message_dict['xiang_guan_biao_qian'] = None
        # read count
        message_dict['yue_du_shu'] = yue_du_shu
        # primary key
        message_dict['_id'] = news_url
        # d1 = datetime.datetime.now().date()
        # message_url['time'] = d1
        # print json.dumps(message_dict, ensure_ascii=False, indent=4)
        if wen_zhang_zheng_wen is not None and wen_zhang_biao_ti is not None:
            self.mongo.put_content(message_dict)
            self.url_list.append(
                json.dumps(message_url, sort_keys=True, indent=4))
            print message_dict
            if ping_lun_shu_liang > 0:
                # 20 comments per page
                all_page = ping_lun_shu_liang / 20
                for page in xrange(1, all_page + 1):
                    self.comment.run(news_url, page)

    def getPinglun(self, news_url):
        # Return only the comment count for an already-seen article.
        comment_number = self.getCommentNumber(news_url)
        if comment_number:
            ping_lun_shu_liang = comment_number[1]
        else:
            ping_lun_shu_liang = 0
        return ping_lun_shu_liang

    def file_close(self, url_list_before):
        # Persist the merged url list back to test.txt.
        url_dirc = dict()
        end_url = self.url_list + url_list_before
        url_dirc['url'] = end_url
        self.f.truncate(0)
        self.f.seek(0, 0)
        self.f.write(json.dumps(url_dirc))
        self.f.close()

    def getCommentNumber(self, news_url):
        json_object = dict()
        comment_url = ('http://comment.ifeng.com/get.php?doc_url=%s'
                       '&format=js&job=1' % news_url)
        flag = 1
        while 1:
            try:
                json_object = json.loads(
                    requests.get(comment_url, timeout=30).content.replace(
                        'var commentJsonVarStr___=', '')[:-1])
                break
            except Exception as e:
                flag += 1
                print e
                if flag > 5:
                    return
        # read count
        yue_du_shu = json_object['join_count']
        # comment count
        ping_lun_shu_liang = json_object['count']
        return yue_du_shu, ping_lun_shu_liang
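# --- A minimal way to run the ifeng crawler above, as a sketch. It assumes
# NewsUrl.Run() yields the day's article urls; the __main__ guard is added
# here for illustration and is not part of the original code.
if __name__ == '__main__':
    spider = NewsMessage()
    spider.getNewsTotleUrl()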