def main():
    while True:
        ret = redis_cursor.rpop(pre_system + "userlist")
        if ret:
            url, siteid, t, adminid = ret.split("|")
            print "爬取: %s,%s,%s,%s" % (url, siteid, t, adminid)
            processlog("auto_scrapyuser", 1, "main", "爬取: %s, %s, %s ,adminid:%s" % (url, siteid, t, adminid))
            try:
                scrapy_comment_user(url, siteid, adminid)
            except Exception, e:
                try:
                    crc32address = crc32(url) & 0xFFFFFFFF
                    sql = "update system_site_user set status=0 where crc32address=%s"
                    mysql_cursor.execute(sql, crc32address)
                except Exception, e:
                    processlog("auto_scrapyuser", 0, "main", str(e))
                if "Data too long for column" in str(e):
                    continue
                if "Incorrect string value:" in str(e):
                    print "存在表情,无法保存content, nickname:%s" % url
                    processlog("auto_scrapyuser", 0, "main", "存在表情,无法保存, url:%s" % url)
                    continue
                if time.time() - int(t) < 3600:
                    pass
                    # print '重新写回队列: %s' % url
                    # processlog('auto_scrapyuser', 1, 'main', '重新写回队列: %s' % url)
                    # redis_cursor.lpush(pre_system + 'userlist', '%s|%s|%s|%s' % (url, siteid, t, adminid))
                else:
                    print "超时: %s" % url
                    processlog("auto_scrapyuser", 1, "main", "超时: %s" % url)
                print "error: %s" % str(e)
                processlog("auto_scrapyuser", 0, "main", str(e))
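
# Illustrative sketch (not part of the original source): main() above expects
# "url|siteid|timestamp|adminid" entries on the "<pre_system>userlist" redis list.
# A producer elsewhere in the system presumably enqueues them in that shape;
# the helper name below is an assumption.
def enqueue_user_address(url, siteid, adminid):
    # Relies on the module-level redis_cursor / pre_system set up in the config block.
    item = "%s|%s|%s|%s" % (url, siteid, int(time.time()), adminid)
    redis_cursor.lpush(pre_system + "userlist", item)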
def auto_screenshot():
    print "图片轮训开始"
    processlog("autoscreenshot", 1, "auto_screenshot", "图片轮训开始")
    while True:
        screenshot = redis_cursor.rpop(pre_system + "screenshotqueue")
        if screenshot:
            try:
                postid, docid, shorturl, commentid, crttime, lasttime = screenshot.split("|")
                # Skip entries that were attempted too recently; re-queue them instead
                if int(time.time()) - int(lasttime) < 60:
                    print "url采集间隔过快,重新写入队列 postid:%s ; lasttime:%s" % (postid, lasttime)
                    # reWriteScreenShotQueue(postid, docid, shorturl, commentid, crttime)
                    redis_cursor.lpush(
                        pre_system + "screenshotqueue",
                        "%s|%s|%s|%s|%s|%s" % (postid, docid, shorturl, commentid, crttime, lasttime),
                    )
                    print "%s sleep 5 sec!" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    time.sleep(5)
                    continue
                # NetEase (163) news
                if shorturl in "comment.news.163.com":
                    scrapy_hot_comments_news_163_com(docid, commentid, postid, shorturl, crttime)
                # ifeng news
                if shorturl in "gentie.ifeng.com":
                    scrapy_hot_comments_ifeng_com(docid, commentid, postid, shorturl, crttime)
            except Exception, e:
                print e
                processlog("autoscreenshot", 0, "auto_screenshot", str(e))
        print "%s sleep 5 sec!" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        time.sleep(5)
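
# Illustrative sketch (not part of the original source): auto_screenshot() pops
# "postid|docid|shorturl|commentid|crttime|lasttime" entries. A producer that first
# schedules a post for screenshotting would presumably push an entry of the same
# shape; the helper name and the initial lasttime of 0 are assumptions.
def enqueue_screenshot(postid, docid, shorturl, commentid):
    entry = "%s|%s|%s|%s|%s|%s" % (postid, docid, shorturl, commentid, int(time.time()), 0)
    redis_cursor.lpush(pre_system + "screenshotqueue", entry)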
def setAddressStatus(userid, status):
    try:
        sql = "update system_site_user set status=%s where userid=%s"
        mysql_cursor.execute(sql, status, userid)
        print "更新状态userid:%s, status:%s" % (userid, status)
    except Exception, e:
        print e
        processlog("auto_scrapyuser", 0, "setAddressStatus", str(e))
def scrapy_comment_news_163_com(docId, urlid, lastcreateTime, siteid, url, adminid):
    headers = {
        'Host': 'comment.news.163.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    for page in xrange(6):
        userlist = getuserlist(siteid)
        try:
            comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/newList?offset=%s&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc' % (docId, page * 40)
            req = requests.get(comment_url, headers=headers, timeout=timeout)
            html = req.text
            ret = re.findall('getData\(\n(.*)\);', html, re.S | re.M)
            if ret:
                post = json.loads(ret[0])
                comments = post['comments']
                for k, v in comments.items():
                    commentId = v['commentId']
                    createTime = v['createTime']
                    userId = v['user']['userId']
                    content = v['content']
                    if 'nickname' in v['user']:
                        nickname = v['user']['nickname']
                    else:
                        nickname = ''
                    if nickname in userlist:
                        # Check whether this record has already been saved
                        sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s and `adminid`=%s'
                        r = mysql_cursor.query(sql, userId, createTime, adminid)
                        if not r:
                            # No existing record: insert the comment
                            sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                                  '`nickname`, `createTime`, `adminid`) values (%s, %s, %s, %s, %s, %s, %s)'
                            postid = mysql_cursor.execute_lastrowid(
                                sql, urlid, userId, commentId, content, nickname, createTime, adminid)
                        del r
                    time.sleep(0.2)
            del ret
            # Mark the url record as updated
            seturlstatus(urlid, adminid)
        except Exception, ex:
            seturlstatus(urlid, adminid)
            print ex
            processlog('autovote_main', 0, 'scrapy_comment_news_163_com', str(ex))
            return 0
def scrapy_news_qq_com(rootid, urlid, siteid, adminid):
    headers = {
        'Host': 'comment5.news.sina.com.cn',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    userlist = getuserlist(siteid)
    try:
        comment_url = 'http://coral.qq.com/article/%s/hotcomment?reqnum=10&_=%s' % (rootid, int(time.time()))
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret['data']['commentid']
            for comment in comments:
                commentId = comment['mid']
                createTime = comment['time']
                userId = comment['uid']
                content = comment['content']
                r = re.findall(r'wb_screen_name=(.*?)&', comment['config'])
                if r:
                    nickname = r[0]
                else:
                    nickname = comment['nick']
                if nickname in userlist:
                    # Check whether this record has already been saved
                    sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s'
                    r = mysql_cursor.query(sql, userId, createTime)
                    if not r:
                        # No existing record: insert the comment
                        sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                              '`nickname`, `createTime`) values (%s, %s, %s, %s, %s, %s)'
                        postid = mysql_cursor.execute_lastrowid(
                            sql, urlid, userId, commentId, content, nickname, createTime)
                    del r
        del ret
        time.sleep(1)
        # Mark the url record as updated
        seturlstatus(urlid, adminid)
    except Exception, ex:
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'scrapy_news_qq_com', str(ex))
        return 0
def scrapy_comment_ifeng_com(docUrl, urlid, siteid, url, adminid):
    headers = {
        'Host': 'comment.ifeng.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    userlist = getuserlist(siteid)
    try:
        comment_url = 'http://comment.ifeng.com/get.php?callback=newCommentListCallBack&orderby=&docUrl=%s&format=json&job=1&p=1&pageSize=100&callback=newCommentListCallBack' % docUrl
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret['comments']
            for comment in comments:
                commentId = comment['comment_id']
                createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(comment['create_time'])))
                userId = comment['user_id']
                content = comment['comment_contents']
                nickname = comment['uname']
                if nickname in userlist:
                    # Check whether this record has already been saved
                    sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s and `adminid`=%s'
                    r = mysql_cursor.query(sql, userId, createTime, adminid)
                    if not r:
                        # No existing record: insert the comment
                        sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                              '`nickname`, `createTime`, `adminid`) values (%s, %s, %s, %s, %s, %s, %s)'
                        postid = mysql_cursor.execute_lastrowid(
                            sql, urlid, userId, commentId, content, nickname, createTime, adminid)
                    del r
        del ret
        time.sleep(2)
        # Mark the url record as updated
        seturlstatus(urlid, adminid)
    except Exception, ex:
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'scrapy_comment_ifeng_com', str(ex))
        return 0
def get_comment_news_163_com_pagenum(docId, urlid):
    comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/newList?offset=0&limit=1&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc' % docId
    headers = {
        'Host': 'comment.news.163.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    try:
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        if req.status_code != 200:
            return '0'
        html = req.text
        ret = re.findall('getData\(\n(.*)\);', html, re.S | re.M)
        if ret:
            post = json.loads(ret[0])
            newListSize = int(ceil(int(post['newListSize']) / 30.0))
            comments = post['commentIds']
            if comments:
                if len(comments[0].split(',')) > 1:
                    createTime = post['comments'][comments[0].split(',')[-1]]['createTime']
                else:
                    createTime = post['comments'][comments[0]]['createTime']
                if newListSize > 2:
                    num = 2
                else:
                    num = newListSize
                return json.dumps({'createTime': createTime, 'num': num})
        return json.dumps({'createTime': 0, 'num': 0})
    except Exception, ex:
        # Reset the url to the "needs update" state so the user can re-crawl it later
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'get_comment_news_163_com_pagenum', str(ex))
        return 'error'
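
# Illustrative sketch (not part of the original source): the function above returns a
# JSON string such as '{"createTime": ..., "num": ...}', '0' on a non-200 response, or
# 'error' on exceptions. A caller would presumably decode it along these lines; the
# helper and variable names are assumptions.
def parse_pagenum_result(result):
    if result in ('0', 'error'):
        return None
    info = json.loads(result)
    return info['createTime'], info['num']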
def scrapy_hot_comments_ifeng_com(docUrl, commentid, postid, shorturl, crttime):
    print '爬取%s,%s,%s,%s' % (docUrl, commentid, postid, shorturl)
    processlog('autoscreenshot', 1, 'scrapy_hot_comments_ifeng_com',
               'docUrl:%s,commentid:%s, postid:%s, %s' % (docUrl, commentid, postid, shorturl))
    headers = {
        'Host': 'comment.ifeng.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    try:
        comment_url = 'http://comment.ifeng.com/get.php?callback=hotCommentListCallBack&orderby=uptimes&docUrl=%s&format=json&job=1&p=1&pageSize=10&callback=hotCommentListCallBack&skey=16a2fe' % docUrl
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret['comments']
            for comment in comments:
                commentId = comment['comment_id']
                if commentId == commentid:
                    index = comments.index(comment)
                    # Save the floor (rank in the hot list) to mysql
                    sql = 'update system_url_posts set floor=%s where postid=%s'
                    mysql_cursor.execute(sql, index + 1, postid)
                    # Take the screenshot
                    res = OutPutImg(postid, 'ifeng', postid)
                    if res:
                        # Add to the completed-screenshot queue
                        # redis_cursor.hset(pre_system + 'complete', '%s|%s|%s|%s' % (postid, commentid, docid, shorturl), 1)
                        # Update the remote post status and deliver the image
                        pass
                    else:
                        processlog('autoscreenshot', 1, 'scrapy_hot_comments_ifeng_com', '截图False')
                        reWriteScreenShotQueue(postid, docUrl, shorturl, commentid, crttime)
                    return
            # print 'commentid找不到匹配'
            # processlog('autoscreenshot', 1, 'scrapy_hot_comments_ifeng_com', 'commentid找不到匹配: %s ' % commentid)
            reWriteScreenShotQueue(postid, docUrl, shorturl, commentid, crttime)
        del ret
    except Exception, ex:
        print ex
        processlog('autoscreenshot', 0, 'scrapy_hot_comments_ifeng_com', str(ex))
        reWriteScreenShotQueue(postid, docUrl, shorturl, commentid, crttime)
def reWriteScreenShotQueue(postid, docid, shorturl, commentid, crttime):
    """Check whether the entry has expired; if it has not, push it back onto the queue."""
    try:
        if time.time() - int(crttime) < 86400:
            print "未超时, 重新写入队列: postid:%s" % postid
            # processlog('autoscreenshot', 1, 'reWriteScreenShotQueue', '未超时,重新写入队列: postid:%s' % postid)
            # Push the entry back onto the redis screenshot queue
            redis_cursor.lpush(
                pre_system + "screenshotqueue",
                "%s|%s|%s|%s|%s|%s" % (postid, docid, shorturl, commentid, crttime, int(time.time())),
            )
        else:
            print "超时,%s:%s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(crttime))), postid)
            processlog(
                "autoscreenshot", 1, "reWriteScreenShotQueue",
                "超时,%s:postid: %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(crttime))), postid),
            )
    except Exception, e:
        processlog("autoscreenshot", 0, "reWriteScreenShotQueue", str(e))
def scrapy_comment_user(url, siteid, adminid):
    if siteid == "1":
        # 163
        username = getUsername(url)
        print "username: %s" % username
        processlog("auto_scrapyuser", 1, "scrapy_comment_user", "username: %s, url:%s" % (username, url))
        if username:
            scrapy_comment_user_163(username, adminid, url)
        else:
            print "未匹配到参数"
            processlog("auto_scrapyuser", 1, "scrapy_comment_user", "未匹配到参数, url:%s" % url)
    elif siteid == "2":
        # ifeng
        guid, uname = getGuidAndUsername(url)
        if guid and uname:
            print "uname: %s" % uname
            processlog("auto_scrapyuser", 1, "scrapy_comment_user", "uname: %s, guid:%s, url:%s" % (uname, guid, url))
            scrapy_comment_user_ifeng(guid, uname, adminid, url)
        else:
            print "未匹配到参数"
            processlog("auto_scrapyuser", 1, "scrapy_comment_user", "未匹配到参数, url:%s" % url)
    config_data = fp.read()
    fp.close()
    config_json = json.loads(config_data)['config']
    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']
    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    processlog('autoscreenshot', 0, 'config', str(ex))
    sys.exit(-1)

# Connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# Connect to mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user, password=mysql_pass)
def scrapy_comment_user_ifeng(guid, username, adminid, address):
    """
    Crawl a user's personal page on ifeng:
    http://comment.ifeng.com/get? job=7 & format=json & pagesize=20 & _1460705534 & guid=65969467 & p=1
    """
    username_decode = unquote(username)
    siteid = 2
    # Check whether the site user already exists
    sql = "select userid,adminid from system_site_user where siteid=%s and username=%s"
    r = mysql_cursor.query(sql, siteid, username_decode)
    if r:
        if int(adminid) != int(r[0]["adminid"]):
            print "网站帐号存在,且adminid不符"
            processlog(
                "auto_scrapyuser", 1, "scrapy_comment_user_ifeng",
                "网站帐号存在,添加人不匹配,现:%s, 原:%s" % (adminid, r[0]["adminid"]),
            )
            return
        print "网站帐号存在"
        userid = r[0]["userid"]
        setAddressStatus(userid, 1)
    else:
        processlog("auto_scrapyuser", 1, "scrapy_comment_user_ifeng", "网站帐号不存在,添加:%s" % username)
        crc32_address = crc32(address) & 0xFFFFFFFF
        sql = "insert into system_site_user(`siteid`, `username`,`createtime`, `adminid`, `address`, `crc32address`, `status`) values(%s, %s, now(), %s, %s, %s, 1)"
        userid = mysql_cursor.execute_lastrowid(sql, siteid, username_decode, adminid, address, crc32_address)
    headers = {
        "Host": "comment.news.163.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    # Crawl scrapy_page pages by default
    for page in xrange(scrapy_page):
        url = "http://comment.ifeng.com/get?job=7&format=json&pagesize=20&guid=%s&p=%s" % (guid, page)
        req = requests.get(url, headers=headers, timeout=timeout)
        if req.status_code == 200:
            data = json.loads(req.text)
            comments = data["comments"]
            for comment in comments:
                _url = comment["doc_url"]
                # Check whether the url is supported
                res = r"://(.*?)/"
                ret = re.findall(res, _url)
                if ret:
                    shorturl = ret[0]
                    if shorturl in ["news.ifeng.com"]:
                        title = comment["doc_name"]  # post title
                        content = comment["comment_contents"]
                        createTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(comment["create_time"])))
                        commentId = comment["comment_id"]
                        nickname = comment["uname"]
                        url_post = (
                            "http://gentie.ifeng.com/view.html?docUrl=" + quote(_url.encode("utf8"))
                            + "&docName=" + quote(title.encode("utf8"))
                        )
                        # Check whether this post has already been saved
                        sql = "select postid from system_url_posts where `commentIds`=%s and createTime=%s and `adminid`=%s"
                        r = mysql_cursor.query(sql, commentId, createTime, adminid)
                        if not r:
                            # Check whether this url has already been added
                            crc32_url = crc32(url_post) & 0xFFFFFFFF
                            sql = "select urlid from system_url_list where `crc32url`=%s and adminid=%s"
                            ret = mysql_cursor.query(sql, crc32_url, adminid)
                            if ret:
                                # Already added
                                urlid = ret[0]["urlid"]
                            else:
                                sql = "insert into system_url_list(`siteid`, `title`, `url`, `crc32url`, `addtime`,`status`, `adminid`) values(%s, %s, %s, %s, now(), 1, %s)"
                                urlid = mysql_cursor.execute_lastrowid(sql, siteid, title, url_post, crc32_url, adminid)
                                processlog("auto_scrapyuser", 1, "scrapy_comment_user_ifeng", "url未添加过,添加url,urlid:%s" % urlid)
                            try:
                                # Save the post
                                sql = (
                                    "insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`"
                                    ", `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)"
                                )
                                postid = mysql_cursor.execute_lastrowid(
                                    sql, urlid, userid, commentId, content, nickname, createTime, adminid
                                )
                                print "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid)
                                processlog(
                                    "auto_scrapyuser", 1, "scrapy_comment_user_ifeng",
                                    "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid),
                                )
                            except Exception, e:
                                if "Data too long for column" in str(e):
                                    processlog("auto_scrapyuser", 1, "scrapy_comment_user_ifeng", "帖子内容过长,重新截取写入,urlid:%s" % urlid)
                                    content = content[:255]
                                    # Save the post again with the truncated content
                                    sql = (
                                        "insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`"
                                        ", `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)"
                                    )
                                    postid = mysql_cursor.execute_lastrowid(
                                        sql, urlid, userid, commentId, content, nickname, createTime, adminid
                                    )
                                    print "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid)
                                    processlog(
                                        "auto_scrapyuser", 1, "scrapy_comment_user_ifeng",
                                        "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid),
                                    )
                            # Update the site_user status
                            setAddressStatus(userid, 0)
                        else:
                            print "帖子已经添加过: commentId:%s" % commentId
                            # processlog('auto_scrapyuser', 1, 'scrapy_comment_user_ifeng', '帖子已经添加过: commentId:%s' % commentId)
            # Break out of the loop when the last page is reached
            total = data["count"]
            if (page + 1) * 20 >= total:
                break
        else:
            print req.text
    fp.close()
    config_json = json.loads(config_data)['config']
    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']
    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('autovote_agent', 0, 'config', str(ex))
    sys.exit(-1)

# Connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# Connect to mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user, password=mysql_pass)
    fp.close()
    config_json = json.loads(config_data)['config']
    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']
    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('auto_scrapyuser', 0, 'config', str(ex))
    sys.exit(-1)

# Connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# Connect to mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user, password=mysql_pass)

# Number of pages to crawl
scrapy_page = 4


# NetEase (163): fetch all of a user's posts from their personal page
def scrapy_comment_user_163(username, adminid, address):
    """
    Crawl all of a NetEase (163) user's comments.
    """
    username_decode = base64.b64decode(username)
    siteid = mysql_cursor.query('select siteid from system_site_list where shorturl="comment.news.163.com"')[0]["siteid"]
    # Check whether the site user already exists
    sql = "select userid,adminid from system_site_user where siteid=%s and username=%s "
    r = mysql_cursor.query(sql, int(siteid), username_decode)
    if r:
        if int(adminid) != int(r[0]["adminid"]):
            print "网站帐号存在,添加人不匹配"
            processlog(
                "auto_scrapyuser", 1, "scrapy_comment_user_163",
                "网站帐号存在,添加人不匹配,现:%s, 原:%s" % (adminid, r[0]["adminid"])
            )
            return
        userid = r[0]["userid"]
        setAddressStatus(userid, 1)
    else:
        processlog("auto_scrapyuser", 1, "scrapy_comment_user_163", "网站帐号不存在,添加:%s,userid:%s" % (username, adminid))
        crc32_address = crc32(address) & 0xFFFFFFFF
        sql = "insert into system_site_user(`siteid`, `username`,`createtime`, `adminid`, `address`, `crc32address`, `status`) values(%s, %s, now(), %s, %s, %s, 1)"
        userid = mysql_cursor.execute_lastrowid(sql, siteid, username_decode, adminid, address, crc32_address)
    headers = {
        "Host": "comment.news.163.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    # Crawl scrapy_page pages by default
    for page in xrange(scrapy_page):
        url = (
            "http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/users/0/comments?username=%s&offset=%s&limit=30&ibc=newspc"
            % (username, page)
        )
        req = requests.get(url, headers=headers, timeout=timeout)
        if req.status_code == 200:
            data = json.loads(req.text)
            threads = data["threads"]
            urllist = []
            for k, v in threads.items():
                param = {}
                _url = v["url"]
                # Check whether the url is supported
                res = r"://(.*?)/"
                ret = re.findall(res, _url)
                if ret:
                    shorturl = ret[0]
                    if shorturl in ["news.163.com"]:
                        boardId = v["boardId"]
                        param["docId"] = v["docId"]
                        param["title"] = v["title"]
                        param["url"] = "http://comment.news.163.com/" + boardId + "/" + v["docId"] + ".html"
                        urllist.append(param)
                    else:
                        processlog("auto_scrapyuser", 1, "scrapy_comment_user_163", "url不支持:%s" % _url)
            comments = data["comments"]
            for k, v in comments.items():
                url_post = ""
                title = ""
                for u in urllist:
                    if u["docId"] == k.split("_")[0]:
                        url_post = u["url"]
                        title = u["title"]
                buildLevel = v["buildLevel"]
                # Only keep top-level comments that carry a nickname and map to a supported url
                if url_post and title and v["user"].has_key("nickname") and buildLevel == 1:
                    nickname = v["user"]["nickname"]
                    commentId = v["commentId"]
                    createTime = v["createTime"]
                    content = v["content"].encode("utf8")
                    # Check whether this post has already been saved
                    sql = "select postid from system_url_posts where `commentIds`=%s and createTime=%s and `adminid`=%s"
                    r = mysql_cursor.query(sql, commentId, createTime, adminid)
                    if not r:
                        # Check whether this url has already been added
                        crc32_url = crc32(url_post) & 0xFFFFFFFF
                        sql = "select urlid from system_url_list where `crc32url`=%s and `adminid`=%s"
                        ret = mysql_cursor.query(sql, crc32_url, adminid)
                        if ret:
                            # Already added
                            urlid = ret[0]["urlid"]
                        else:
                            sql = "insert into system_url_list(`siteid`, `title`, `url`, `crc32url`, `addtime`, `status`, `adminid`) values(%s,%s,%s,%s,now(),1, %s)"
                            urlid = mysql_cursor.execute_lastrowid(sql, siteid, title, url_post, crc32_url, adminid)
                            processlog("auto_scrapyuser", 1, "scrapy_comment_user_163", "url未添加过,添加url,urlid:%s" % urlid)
                        # Save the post
                        try:
                            sql = (
                                "insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`"
                                ", `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)"
                            )
                            postid = mysql_cursor.execute_lastrowid(
                                sql, urlid, userid, commentId, content, nickname, createTime, adminid
                            )
                            print "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid)
                            processlog(
                                "auto_scrapyuser", 1, "scrapy_comment_user_163",
                                "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid),
                            )
                        except Exception, e:
                            # Some character sets (e.g. emoji) cannot be stored
                            if "Incorrect string value:" in str(e):
                                print "存在表情,无法保存content, nickname:%s" % nickname
                                processlog(
                                    "auto_scrapyuser", 0, "scrapy_comment_user_163",
                                    "存在表情,无法保存content, nickname:%s" % nickname,
                                )
                            elif "Data too long for column" in str(e):
                                processlog("auto_scrapyuser", 1, "scrapy_comment_user_163", "帖子内容过长,重新截取写入,urlid:%s" % urlid)
                                content = content[:255]
                                sql = (
                                    "insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`"
                                    ", `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)"
                                )
                                postid = mysql_cursor.execute_lastrowid(
                                    sql, urlid, userid, commentId, content, nickname, createTime, adminid
                                )
                                print "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid)
                                processlog(
                                    "auto_scrapyuser", 1, "scrapy_comment_user_163",
                                    "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid),
                                )
                            else:
                                print e
                                processlog("auto_scrapyuser", 0, "scrapy_comment_user_163", str(e))
                        # Update the site_user status
                        setAddressStatus(userid, 0)
                    else:
                        print "帖子保存过:postid:%s" % r[0]["postid"]
                        # processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163', '帖子保存过:postid:%s' % r[0]['postid'])
            # Break out of the loop when the last page is reached
            total = data["total"]
            if (page + 1) * 30 >= total:
                break
        else:
            print req.text
def autovote():
    # Fetch a commentid entry from the queue
    postid_commentid = redis_cursor.lpop(pre_system + 'commentidqueque')
    if not postid_commentid:
        print '%s null' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        return 0
    postid = postid_commentid.split('|')[0]
    commentId = postid_commentid.split('|')[1]
    docId = postid_commentid.split('|')[2]
    shorturl = postid_commentid.split('|')[3]
    adminid = postid_commentid.split('|')[4]
    processlog('autovote_agent', 1, 'autovote',
               '爬取:postid:%s, commetnid:%s, drcid:%s, shorturl: %s' % (postid, commentId, docId, shorturl))
    # TODO switch the completion check to "screenshot finished"
    # Check whether the completed-screenshot queue already contains this post
    # if redis_cursor.hexists(pre_system + 'complete', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)):
    #     return 0
    # Get a proxy ip; fall back to 127.0.0.1 when none is available
    proxy_ip = redis_cursor.lpop(pre_system + 'proxylist')
    if not proxy_ip:
        proxy_ip = '127.0.0.1'
    # Check whether the post is currently enabled
    ret = redis_cursor.hget(pre_system + 'commentidstatus', postid_commentid)
    if ret == '0':
        # The post is paused
        print '%s 帖子:%s处于停止状态!' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), postid)
        processlog('autovote_agent', 1, 'autovote',
                   '%s 帖子:%s处于停止状态!' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), postid))
        # # Push the commentid back onto the queue
        # redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
        # Push the proxy ip back onto the queue
        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
        return 0
    # Check whether this proxy ip is allowed to vote yet
    ret = redis_cursor.hget(pre_system + 'ipinterval_' + postid_commentid, proxy_ip)
    if ret:
        t = int(time.mktime(datetime.datetime.now().timetuple()))
        if t <= int(ret):
            # Push the proxy ip back onto the queue
            redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
            # Push the commentid back onto the queue
            redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
            return 0
    # Check whether the upvote target has already been reached
    sql = 'select `count`, `maxcount` from system_url_posts where `postid`=%s'
    ret = mysql_cursor.query(sql, postid)
    if ret:
        count_now = int(ret[0]['count'])
        count_max = int(ret[0]['maxcount'])
        if count_now >= count_max:
            # Mark the postid as finished
            sql = 'update system_url_posts set `status`=3 where `postid`=%s'
            mysql_cursor.execute(sql, postid)
            # Delete the interval hash for this postid
            redis_cursor.delete(pre_system + 'ipinterval_' + postid_commentid)
            # Finished: delete the status entry
            redis_cursor.hdel(pre_system + 'commentidstatus', postid_commentid)
            # Push the proxy ip back onto the queue
            redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
            # Delete the commentidadded record so the post can be re-added if the target is raised later
            redis_cursor.hdel(pre_system + 'commentidadded', postid_commentid)
            print 'Postid: %s 已经达到顶贴目标, 无需在加入队列: [%s]' % (postid, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
            processlog('autovote_agent', 1, 'autovote',
                       'Postid: %s 已经达到顶贴目标, 无需在加入队列: [%s]' % (postid, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
            # # TODO 截图
            # # 方案二:如果顶帖完成,保存到redis一个完成的列表,截图程序轮询这个列表
            # redis_cursor.lpush(pre_system + 'postidfinished', postid + "|" + shorturl)
            # if shorturl in 'comment.news.163.com':
            #     urltype = '163'
            # if shorturl in 'gentie.ifeng.com':
            #     urltype = 'ifeng'
            # try:
            #     sql = 'select url from system_url_post as posts, system_url_list as list where postid=%s and post.urlid=list.urlid'
            #     url_img = mysql_cursor.query(sql, postid)[0]['url']
            #     OutPutImg(url, urltype, postid)
            # except Exception,e:
            #     print '截图错误:%s' % str(e)
            return 0
    else:
        print '没有记录 [%s]' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        processlog('autovote_agent', 1, 'autovote', '没有记录 [%s]' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        return 0
    try:
        if shorturl in 'comment.news.163.com':
            url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/%s/action/upvote?ibc=newspc' % (docId, commentId)
        if shorturl in 'gentie.ifeng.com':
            url = 'http://comment.ifeng.com/vote.php?callback=recmCallback&cmtId=%s&job=up&docUrl=%s&callback=recmCallback&format=js' % (commentId, docId)
        headers = {
            'Host': '%s' % shorturl,
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest'
        }
        # Request body for NetEase news
        data = {'ibc': 'newspc'}
        if proxy_ip == '127.0.0.1':
            if shorturl in 'comment.news.163.com':
                ret = requests.post(url, data=data, headers=headers, timeout=2)
            if shorturl in 'gentie.ifeng.com':
                ret = requests.get(url, headers=headers, timeout=2)
        else:
            try:
                proxies = {'http': 'http://' + proxy_ip}
                # NetEase news
                if shorturl in 'comment.news.163.com':
                    ret = requests.post(url, data=data, proxies=proxies, headers=headers, timeout=2)
                # ifeng news
                if shorturl in 'gentie.ifeng.com':
                    ret = requests.get(url, headers=headers, timeout=2)
            except requests.RequestException:
                # Count the exceptions for this ip; remove it after more than three
                timeout_count = redis_cursor.hget(pre_system + 'iptimeoutcount', proxy_ip)
                print 'timeout_count: %s' % timeout_count
                if timeout_count:
                    if int(timeout_count) > 2:
                        print 'IP: %s 发生异常, 异常次数: %s 移除IP!' % (proxy_ip, int(timeout_count))
                        processlog('autovote_agent', 1, 'autovote',
                                   'IP: %s 发生异常, 异常次数: %s 移除IP!' % (proxy_ip, int(timeout_count)))
                        # Delete it from the ipinterval hash
                        redis_cursor.hdel(pre_system + 'ipinterval_' + postid_commentid, proxy_ip)
                        redis_cursor.hdel(pre_system + 'iptimeoutcount', proxy_ip)
                    else:
                        print 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, int(timeout_count))
                        processlog('autovote_agent', 1, 'autovote',
                                   'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, int(timeout_count)))
                        redis_cursor.hset(pre_system + 'iptimeoutcount', proxy_ip, int(timeout_count) + 1)
                        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                else:
                    print 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, 1)
                    processlog('autovote_agent', 1, 'autovote', 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, 1))
                    redis_cursor.hset(pre_system + 'iptimeoutcount', proxy_ip, 1)
                    redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                # Push the commentid back onto the queue
                redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
                return 1
        if ret.status_code == 200:
            if len(ret.text) <= 60:
                # Clear the previous timeout count for this ip
                redis_cursor.hdel(pre_system + 'iptimeoutcount', proxy_ip)
                # 163: this ip may vote again after one minute
                if shorturl in 'comment.news.163.com':
                    nxt_time = int(time.mktime(time.localtime(time.time() + int(1) * 60)))
                # ifeng: 5 second interval
                if shorturl in 'gentie.ifeng.com':
                    nxt_time = int(time.mktime(time.localtime(time.time() + int(1) * 5)))
                redis_cursor.hset(pre_system + 'ipinterval_' + postid_commentid, proxy_ip, nxt_time)
                if shorturl in 'gentie.ifeng.com':
                    if 'alert' not in ret.text:
                        sql = 'update system_url_posts set `count`=`count`+1 where `postid`=%s'
                        mysql_cursor.execute(sql, postid)
                        print '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip)
                        processlog('autovote_agent', 1, 'autovote',
                                   '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip))
                    else:
                        print '%s 凤凰新闻顶贴时间过快!' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                        processlog('autovote_agent', 1, 'autovote',
                                   '%s 凤凰新闻顶贴时间过快!' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
                    # Push the commentid back onto the queue
                    redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
                    # Push the proxy ip back onto the queue
                    redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                    return 1
                if shorturl in 'comment.news.163.com':
                    sql = 'update system_url_posts set `count`=`count`+1 where `postid`=%s'
                    mysql_cursor.execute(sql, postid)
                    # Write to the statistics table
                    sql = 'insert into system_post_detail (`postTime`, `count`, `adminid`) values (%s, 1, %s)'
                    t = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                    mysql_cursor.execute(sql, t, adminid)
                    del t
                    print '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip)
                    processlog('autovote_agent', 1, 'autovote',
                               '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip))
                # Push the proxy ip back onto the queue
                redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
            else:
                print 'IP: %s , 返回值长度为: %s, 不是有效代理!, 从代理列表剔除!' % (proxy_ip, len(ret.text))
                processlog('autovote_agent', 1, 'autovote',
                           'IP: %s , 返回值长度为: %s, 不是有效代理!, 从代理列表剔除!' % (proxy_ip, len(ret.text)))
                # Delete it from the ipinterval hash
                redis_cursor.hdel(pre_system + 'ipinterval_' + postid_commentid, proxy_ip)
        elif ret.status_code == 429:
            # May only vote again after 10 seconds
            print '%s 顶贴太频繁!' % proxy_ip
            nxt_time = int(time.mktime(time.localtime(time.time() + int(1) * 10)))
            redis_cursor.hset(pre_system + 'ipinterval_' + postid_commentid, proxy_ip, nxt_time)
        else:
            print '顶贴失败! 状态码: %s IP: %s' % (ret.status_code, proxy_ip)
            processlog('autovote_agent', 1, 'autovote', '顶贴失败! 状态码: %s IP: %s' % (ret.status_code, proxy_ip))
            # Delete it from the ipinterval hash
            redis_cursor.hdel(pre_system + 'ipinterval_' + postid_commentid, proxy_ip)
        # Push the commentid back onto the queue
        redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
        return 1
    except Exception, ex:
        print ex
        processlog('autovote_agent', 0, 'autovote', str(ex))
        # Push the proxy ip back onto the queue
        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
        # Push the commentid back onto the queue
        redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
        return 0
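
# Illustrative sketch (not part of the original source): autovote() pops
# "postid|commentId|docId|shorturl|adminid" entries from the "<pre_system>commentidqueque"
# redis list. A scheduler would presumably enqueue a post for upvoting like this;
# the helper name is an assumption.
def enqueue_vote_task(postid, commentId, docId, shorturl, adminid):
    entry = '%s|%s|%s|%s|%s' % (postid, commentId, docId, shorturl, adminid)
    redis_cursor.rpush(pre_system + 'commentidqueque', entry)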
def scrapy_hot_comments_news_163_com(docId, commentid, postid, shorturl, crttime):
    """ """
    print "爬取%s,%s,%s,%s" % (docId, commentid, postid, shorturl)
    processlog(
        "autoscreenshot", 1, "scrapy_hot_comments_news_163_com",
        "爬取docId:%s,commentid:%s, postid:%s, %s" % (docId, commentid, postid, shorturl),
    )
    page = 1
    inhot = False
    headers = {
        "Host": "comment.news.163.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    try:
        comment_url = "http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/hotTopList?offset=0&limit=40&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc" % docId
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = re.findall("getData\(\n(.*)\);", html, re.S | re.M)
        if ret:
            post = json.loads(ret[0])
            indexs = post["commentIds"]
            for commid in indexs:
                if str(commentid) in commid:
                    # Index of the comment in the hot list
                    index = indexs.index(commid)
                    page = math.ceil((index + 1) / 10.0)
                    # Save the floor (rank) to mysql
                    sql = "update system_url_posts set floor=%s where postid=%s"
                    mysql_cursor.execute(sql, index + 1, postid)
                    inhot = True
            if not inhot:
                print "热门帖中无法找到: %s" % postid
                processlog("autoscreenshot", 1, "scrapy_hot_comments_news_163_com", "热门帖中无法找到: %s" % postid)
                reWriteScreenShotQueue(postid, docId, shorturl, commentid, crttime)
                return
            comments = post["comments"]
            for k, v in comments.items():
                commentId = v["commentId"]
                if int(commentId) == int(commentid):
                    # Take the screenshot
                    res = OutPutImg(postid, "163", postid, page)
                    if not res:
                        print "截图失败"
                        processlog("autoscreenshot", 1, "scrapy_hot_comments_news_163_com", "截图False")
                        reWriteScreenShotQueue(postid, docId, shorturl, commentid, crttime)
                        return
                    print "截图完成: %s !" % postid
                    processlog("autoscreenshot", 1, "scrapy_hot_comments_news_163_com", "截图完成: %s " % postid)
            # print 'commentid找不到匹配'
            # processlog('autoscreenshot', 1, 'scrapy_hot_comments_news_163_com', 'commentid找不到匹配: %s ' % commentid)
        del ret
    except Exception, ex:
        print ex
        processlog("autoscreenshot", 0, "scrapy_hot_comments_news_163_com", str(ex))
        reWriteScreenShotQueue(postid, docId, shorturl, commentid, crttime)
def seturlstatus(urlid, adminid):
    # Set the url status
    sql = 'update system_url_list set `status`=1 where `urlid`=%s and `adminid`=%s'
    mysql_cursor.execute(sql, urlid, adminid)
    processlog('autovote_main', 1, 'seturlstatus', '设置url的状态,urlid:%s' % urlid)
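
# Illustrative sketch (not part of the original source): the configuration fragments
# in this file read the keys below from a JSON file under a top-level "config" object.
# A minimal config that satisfies them might look like this; every value shown is a
# placeholder assumption.
EXAMPLE_CONFIG = {
    "config": {
        "redis_host": "127.0.0.1",
        "redis_port": 6379,
        "mysql_host": "127.0.0.1",
        "mysql_db": "autovote",
        "mysql_user": "root",
        "mysql_pass": "password",
        "pre_system": "autovote_",
        "http_port": 8080,
    }
}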
    fp.close()
    config_json = json.loads(config_data)['config']
    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']
    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('autovote_main', 0, 'config', str(ex))
    sys.exit(-1)

# Connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# Connect to mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user, password=mysql_pass)


def getuserlist(siteid):
    # Fetch the list of internal (in-house) usernames
    userlist = []
    sql = 'select username from system_site_user where `siteid`=%s'
def autovote(): # 获取commentid postid_commentid = redis_cursor.lpop(pre_system + 'commentidqueque') if not postid_commentid: print '%s null' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) return 0 postid = postid_commentid.split('|')[0] commentId = postid_commentid.split('|')[1] docId = postid_commentid.split('|')[2] shorturl = postid_commentid.split('|')[3] adminid = postid_commentid.split('|')[4] processlog( 'autovote_agent', 1, 'autovote', '爬取:postid:%s, commetnid:%s, drcid:%s, shorturl: %s' % (postid, commentId, docId, shorturl)) # TODO 修改完成判断为截图完成 # 搜索截图完成队列中是否含有本条帖子 # if redis_cursor.hexists(pre_system + 'complete', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)): # return 0 # 获取代理ip 如果没有代理ip则为127.0.0.1 proxy_ip = redis_cursor.lpop(pre_system + 'proxylist') if not proxy_ip: proxy_ip = '127.0.0.1' # 判断帖子现在是否是开启状态 ret = redis_cursor.hget(pre_system + 'commentidstatus', postid_commentid) if ret == '0': # 帖子处于停止状态 print '%s 帖子:%s处于停止状态!' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), postid) processlog( 'autovote_agent', 1, 'autovote', '%s 帖子:%s处于停止状态!' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), postid)) # # 将commentid写回队列 # redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)) # 将代理ip写入队列 redis_cursor.rpush(pre_system + 'proxylist', proxy_ip) return 0 # 判断此代理ip是否能够访问 ret = redis_cursor.hget(pre_system + 'ipinterval_' + postid_commentid, proxy_ip) if ret: t = int(time.mktime(datetime.datetime.now().timetuple())) if t <= int(ret): # 将代理ip写入队列 redis_cursor.rpush(pre_system + 'proxylist', proxy_ip) # 将commentid写回队列 redis_cursor.rpush( pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)) return 0 # 判断次数有没有顶贴完成 sql = 'select `count`, `maxcount` from system_url_posts where `postid`=%s' ret = mysql_cursor.query(sql, postid) if ret: count_now = int(ret[0]['count']) count_max = int(ret[0]['maxcount']) if count_now >= count_max: # 设置状态postid状态为已完成 sql = 'update system_url_posts set `status`=3 where `postid`=%s' mysql_cursor.execute(sql, postid) # 删除本条postid对应的hash表 redis_cursor.delete(pre_system + 'ipinterval_' + postid_commentid) # 已经采集完成 删除status信息 redis_cursor.hdel(pre_system + 'commentidstatus', postid_commentid) # 将代理ip写入队列 redis_cursor.rpush(pre_system + 'proxylist', proxy_ip) # 删除autovote_commentidadded 表中的记录 以免在顶贴目标完成之后 再次增加次数无法在进行添加 redis_cursor.hdel(pre_system + 'commentidadded', postid_commentid) print 'Postid: %s 已经达到顶贴目标, 无需在加入队列: [%s]' % ( postid, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) processlog( 'autovote_agent', 1, 'autovote', 'Postid: %s 已经达到顶贴目标, 无需在加入队列: [%s]' % (postid, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))) # # TODO 截图 # # 方案二:如果顶帖完成,保存到redis一个完成的列表,截图程序轮询这个列表 # redis_cursor.lpush(pre_system + 'postidfinished', postid + "|" + shorturl) # if shorturl in 'comment.news.163.com': # urltype = '163' # if shorturl in 'gentie.ifeng.com': # urltype = 'ifeng' # try: # sql = 'select url from system_url_post as posts, system_url_list as list where postid=%s and post.urlid=list.urlid' # url_img = mysql_cursor.query(sql, postid)[0]['url'] # OutPutImg(url, urltype, postid) # except Exception,e: # print '截图错误:%s' % str(e) return 0 else: print '没有记录 [%s]' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) processlog( 'autovote_agent', 1, 'reWriteScreenShotQueue', '没有记录 [%s]' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) return 0 try: if shorturl in 'comment.news.163.com': url = 
'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/%s/action/upvote?ibc=newspc' % ( docId, commentId) if shorturl in 'gentie.ifeng.com': url = 'http://comment.ifeng.com/vote.php?callback=recmCallback&cmtId=%s&job=up&docUrl=%s&callback=recmCallback&format=js' % ( commentId, docId) headers = { 'Host': '%s' % shorturl, 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', 'X-Requested-With': 'XMLHttpRequest' } # 网易新闻参数 data = {'ibc': 'newspc'} if proxy_ip == '127.0.0.1': if shorturl in 'comment.news.163.com': ret = requests.post(url, data=data, headers=headers, timeout=2) if shorturl in 'gentie.ifeng.com': ret = requests.get(url, headers=headers, timeout=2) else: try: proxies = {'http': 'http://' + proxy_ip} # 网易新闻 if shorturl in 'comment.news.163.com': ret = requests.post(url, data=data, proxies=proxies, headers=headers, timeout=2) # 凤凰新闻 if shorturl in 'gentie.ifeng.com': ret = requests.get(url, headers=headers, timeout=2) except requests.RequestException: # 判断ip发生异常的次数 超过三次则移除IP timeout_count = redis_cursor.hget( pre_system + 'iptimeoutcount', proxy_ip) print 'timeout_count: %s' % timeout_count if timeout_count: if int(timeout_count) > 2: print 'IP: %s 发生异常, 异常次数: %s 移除IP!' % ( proxy_ip, int(timeout_count)) processlog( 'autovote_agent', 1, 'autovote', 'IP: %s 发生异常, 异常次数: %s 移除IP!' % (proxy_ip, int(timeout_count))) # 删除autovote_ipinterval hash表 redis_cursor.hdel( pre_system + 'ipinterval_' + postid_commentid, proxy_ip) redis_cursor.hdel(pre_system + 'iptimeoutcount', proxy_ip) else: print 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, int(timeout_count)) processlog( 'autovote_agent', 1, 'autovote', 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, int(timeout_count))) redis_cursor.hset(pre_system + 'iptimeoutcount', proxy_ip, int(timeout_count) + 1) redis_cursor.rpush(pre_system + 'proxylist', proxy_ip) else: print 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, 1) processlog('autovote_agent', 1, 'autovote', 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, 1)) redis_cursor.hset(pre_system + 'iptimeoutcount', proxy_ip, 1) redis_cursor.rpush(pre_system + 'proxylist', proxy_ip) # 将commentid写回队列 redis_cursor.rpush( pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)) return 1 if ret.status_code == 200: if len(ret.text) <= 60: # 删除之前timeout的次数 redis_cursor.hdel(pre_system + 'iptimeoutcount', proxy_ip) # 一分钟之后才能继续采集 if shorturl in 'comment.news.163.com': nxt_time = int( time.mktime(time.localtime(time.time() + int(1) * 60))) # 时间间隔5秒钟 if shorturl in 'gentie.ifeng.com': nxt_time = int( time.mktime(time.localtime(time.time() + int(1) * 5))) redis_cursor.hset( pre_system + 'ipinterval_' + postid_commentid, proxy_ip, nxt_time) if shorturl in 'gentie.ifeng.com': if 'alert' not in ret.text: sql = 'update system_url_posts set `count`=`count`+1 where `postid`=%s' mysql_cursor.execute(sql, postid) print '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip) processlog( 'autovote_agent', 1, 'autovote', '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip)) else: print '%s 凤凰新闻顶贴时间过快!' % time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime()) processlog( 'autovote_agent', 1, 'autovote', '%s 凤凰新闻顶贴时间过快!' 
% time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime())) # 将commentid写回队列 redis_cursor.rpush( pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)) # 将代理ip写入队列 redis_cursor.rpush(pre_system + 'proxylist', proxy_ip) return 1 if shorturl in 'comment.news.163.com': sql = 'update system_url_posts set `count`=`count`+1 where `postid`=%s' mysql_cursor.execute(sql, postid) # 写入统计表 sql = 'insert into system_post_detail (`postTime`, `count`, `adminid`) values (%s, 1, %s)' t = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) mysql_cursor.execute(sql, t, adminid) del t print '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip) processlog( 'autovote_agent', 1, 'autovote', '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip)) # 将代理ip写入队列 redis_cursor.rpush(pre_system + 'proxylist', proxy_ip) else: print 'IP: %s , 返回值长度为: %s, 不是有效代理!, 从代理列表剔除!' % ( proxy_ip, len(ret.text)) processlog( 'autovote_agent', 1, 'autovote', 'IP: %s , 返回值长度为: %s, 不是有效代理!, 从代理列表剔除!' % (proxy_ip, len(ret.text))) # 删除autovote_ipinterval hash表 redis_cursor.hdel( pre_system + 'ipinterval_' + postid_commentid, proxy_ip) elif ret.status_code == 429: # 10秒之后才能继续采集 print '%s 顶贴太频繁!' % proxy_ip nxt_time = int( time.mktime(time.localtime(time.time() + int(1) * 10))) redis_cursor.hset(pre_system + 'ipinterval_' + postid_commentid, proxy_ip, nxt_time) else: print '顶贴失败! 状态码: %s IP: %s' % (ret.status_code, proxy_ip) processlog('autovote_agent', 1, 'autovote', '顶贴失败! 状态码: %s IP: %s' % (ret.status_code, proxy_ip)) # 删除autovote_ipinterval hash表 redis_cursor.hdel(pre_system + 'ipinterval_' + postid_commentid, proxy_ip) # 将commentid写回队列 redis_cursor.rpush( pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)) return 1 except Exception, ex: print ex processlog('autovote_agent', 0, 'autovote', str(ex)) # 将代理ip写入队列 redis_cursor.rpush(pre_system + 'proxylist', proxy_ip) # 将commentid写回队列 redis_cursor.rpush( pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)) return 0
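# autovote() above draws proxies from the Redis list pre_system + 'proxylist' and
# pushes them back after use. How that list gets populated is not shown in this
# section; a minimal seeding sketch, assuming entries are plain "host:port"
# strings (the addresses below are placeholders):
def seed_proxylist(proxies):
    for proxy_ip in proxies:
        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)

# seed_proxylist(['10.0.0.1:8080', '10.0.0.2:8080'])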
def scrapy_comment_user_163(username, adminid, address): ''' 网易用户所有跟贴的爬取 ''' username_decode = base64.b64decode(username) siteid = mysql_cursor.query( 'select siteid from system_site_list where shorturl="comment.news.163.com"' )[0]['siteid'] # 判断用户是否存在 sql = 'select userid,adminid from system_site_user where siteid=%s and username=%s ' r = mysql_cursor.query(sql, int(siteid), username_decode) if r: if int(adminid) != int(r[0]['adminid']): print '网站帐号存在,添加人不匹配' processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163', '网站帐号存在,添加人不匹配,现:%s, 原:%s' % (adminid, r[0]['adminid'])) return userid = r[0]['userid'] setAddressStatus(userid, 1) else: processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163', '网站帐号不存在,添加:%s,userid:%s' % (username, adminid)) crc32_address = crc32(address) & 0xffffffff sql = 'insert into system_site_user(`siteid`, `username`,`createtime`, `adminid`, `address`, `crc32address`, `status`) values(%s, %s, now(), %s, %s, %s, 1)' userid = mysql_cursor.execute_lastrowid(sql, siteid, username_decode, adminid, address, crc32_address) headers = { 'Host': 'comment.news.163.com', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive' } #默认爬取6页 for page in xrange(scrapy_page): url = 'http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/users/0/comments?username=%s&offset=%s&limit=30&ibc=newspc' % ( username, page) req = requests.get(url, headers=headers, timeout=timeout) if req.status_code == 200: data = json.loads(req.text) threads = data['threads'] urllist = [] for k, v in threads.items(): param = {} _url = v['url'] # 判断url是否支持 res = r'://(.*?)/' ret = re.findall(res, _url) if ret: shorturl = ret[0] if shorturl in [ 'news.163.com', ]: boardId = v['boardId'] param['docId'] = v['docId'] param['title'] = v['title'] param[ 'url'] = 'http://comment.news.163.com/' + boardId + '/' + v[ 'docId'] + '.html' urllist.append(param) else: processlog('auto_scrapyuser', 1, 'crapy_comment_user_163', 'url不支持:%s' % _url) comments = data['comments'] for k, v in comments.items(): url_post = '' title = '' for u in urllist: if u['docId'] == k.split('_')[0]: url_post = u['url'] title = u['title'] buildLevel = v['buildLevel'] # 判断是否含有nickname, 是否是最外层的评论 if url_post and title and v['user'].has_key( 'nickname') and buildLevel == 1: nickname = v['user']['nickname'] commentId = v['commentId'] createTime = v['createTime'] content = v['content'].encode('utf8') #判断帖子是否保存过 sql = 'select postid from system_url_posts where `commentIds`=%s and createTime=%s and `adminid`=%s' r = mysql_cursor.query(sql, commentId, createTime, adminid) if not r: #判断url是否添加过 crc32_url = crc32(url_post) & 0xffffffff sql = 'select urlid from system_url_list where `crc32url`=%s and `adminid`=%s' ret = mysql_cursor.query(sql, crc32_url, adminid) if ret: #添加过 urlid = ret[0]['urlid'] else: sql = 'insert into system_url_list(`siteid`, `title`, `url`, `crc32url`, `addtime`, `status`, `adminid`) values(%s,%s,%s,%s,now(),1, %s)' urlid = mysql_cursor.execute_lastrowid( sql, siteid, title, url_post, crc32_url, adminid) processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163', 'url未添加过,添加url,urlid:%s' % urlid) #保存帖子 try: sql = 'insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`'\ ', `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)' postid = mysql_cursor.execute_lastrowid( 
sql, urlid, userid, commentId, content, nickname, createTime, adminid) print '保存帖子: %s; postid :%s ; adminid : %s' % ( nickname, postid, adminid) processlog( 'auto_scrapyuser', 1, 'scrapy_comment_user_163', '保存帖子: %s; postid :%s ; adminid : %s' % (nickname, postid, adminid)) except Exception, e: # 有的字符集无法保存 if 'Incorrect string value:' in str(e): print '存在表情,无法保存content, nickname:%s' % nickname processlog( 'auto_scrapyuser', 0, 'scrapy_comment_user_163', '存在表情,无法保存content, nickname:%s' % nickname) elif 'Data too long for column' in str(e): processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163', '帖子内容过长,重新截取写入,urlid:%s' % urlid) content = content[:255] sql = 'insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`'\ ', `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)' postid = mysql_cursor.execute_lastrowid( sql, urlid, userid, commentId, content, nickname, createTime, adminid) print '保存帖子: %s; postid :%s ; adminid : %s' % ( nickname, postid, adminid) processlog( 'auto_scrapyuser', 1, 'scrapy_comment_user_163', '保存帖子: %s; postid :%s ; adminid : %s' % (nickname, postid, adminid)) else: print e processlog('auto_scrapyuser', 0, 'scrapy_comment_user_163', str(e)) # 更新site_user状态 setAddressStatus(userid, 0) else: print '帖子保存过:postid:%s' % r[0]['postid'] # processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163', '帖子保存过:postid:%s' % r[0]['postid']) #如果到最后一页,退出循环 total = data['total'] if (page + 1) * 30 >= total: break else: print req.text
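# scrapy_comment_user_163 expects username to arrive base64-encoded: the encoded
# value is passed straight to the 163 comment API, while the decoded value is what
# gets stored in system_site_user. An illustrative call with made-up values (the
# adminid and profile address are placeholders):
# username_b64 = base64.b64encode('example_163_user')
# scrapy_comment_user_163(username_b64, 1, 'http://example.com/user_profile_page')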
def main(): while 1: urlinfo = redis_cursor.rpop(pre_system + 'urlqueque') # urlinfo = '3|1970-01-01 23:59:00|http://news.sina.com.cn/c/gat/2016-04-05/doc-ifxqxcnr5291732.shtml' if urlinfo: urlinfo = urlinfo.split('|') urlid = urlinfo[0] lastcreateTime = urlinfo[1] url = urlinfo[2] adminid = urlinfo[3] res = r'://(.*?)/' ret = re.findall(res, url) if ret: shorturl = ret[0] # 通过shorturl来获取siteid siteid = '' sql = 'select siteid from system_site_list where `shorturl`=%s' r = mysql_cursor.query(sql, shorturl) if r: siteid = r[0]['siteid'] del r else: print '没有siteid' del r time.sleep(2) continue if shorturl in ['comment.news.163.com']: docId = re.findall('(.*?).html', url.split('/')[-1])[0] scrapy_comment_news_163_com(docId, urlid, lastcreateTime, siteid, url, adminid) print '%s 扫描完成!' % url processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url) time.sleep(2) continue if shorturl in ['gentie.ifeng.com']: docUrl = urllib2.unquote(re.findall(r'docUrl=(.*?.shtml)', url)[0]) scrapy_comment_ifeng_com(docUrl, urlid, siteid, url, adminid) print '%s 扫描完成!' % url processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url) time.sleep(2) continue if shorturl in ['news.sina.com.cn']: channel_newsid = news_sina_com_cn_getNewsid(url) if channel_newsid == 'error': print '%s 获取帖子url错误, url: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), url) processlog('autovote_main', 1, 'main', '%s 获取帖子url错误, url: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), url)) time.sleep(2) continue channel = channel_newsid['channel'] newsid = channel_newsid['newsid'] scrapy_news_sina_com_cn(channel, newsid, urlid, siteid, adminid) print '%s 扫描完成!' % url processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url) time.sleep(2) continue else: print '网站还不支持' processlog('autovote_main', 1, 'main', '网站还不支持:%s' % url) else: print 'url wrong, %s' % url processlog('autovote_main', 1, 'main', 'url wrong, %s' % url) else: print '^sleep 10 seconds^' time.sleep(10)
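# main() above consumes pre_system + 'urlqueque' items of the form
# "urlid|lastcreateTime|url|adminid". The producer side is not shown in this
# section; a minimal enqueue sketch (the commented call reuses the sample values
# from the comment in main(), with adminid 1 as a placeholder):
def enqueue_url(urlid, lastcreatetime, url, adminid):
    item = '%s|%s|%s|%s' % (urlid, lastcreatetime, url, adminid)
    redis_cursor.lpush(pre_system + 'urlqueque', item)

# enqueue_url(3, '1970-01-01 23:59:00', 'http://news.sina.com.cn/c/gat/2016-04-05/doc-ifxqxcnr5291732.shtml', 1)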
fp.close() config_json = json.loads(config_data)["config"] redis_host = config_json["redis_host"] redis_port = config_json["redis_port"] mysql_host = config_json["mysql_host"] mysql_db = config_json["mysql_db"] mysql_user = config_json["mysql_user"] mysql_pass = config_json["mysql_pass"] pre_system = config_json["pre_system"] serverport = config_json["http_port"] except Exception, ex: print ex processlog("auto_scrapyuser", 0, "config", str(ex)) sys.exit(-1) # 链接redis pool = redis.ConnectionPool(host=redis_host, port=redis_port) redis_cursor = redis.Redis(connection_pool=pool) # 链接mysql mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user, password=mysql_pass) # 爬取页数 scrapy_page = 4 # 网易从个人页面获取所有帖子
    fp.close()
    config_json = json.loads(config_data)['config']
    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']
    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('autovote_agent', 0, 'config', str(ex))
    sys.exit(-1)

# 链接redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# 链接mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user, password=mysql_pass)
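# The config blocks above all read the same structure from a JSON config file
# (loaded into config_data before fp.close()). An illustrative example of the
# expected shape; every value below is a placeholder, not a real credential:
#
# {
#     "config": {
#         "redis_host": "127.0.0.1",
#         "redis_port": 6379,
#         "mysql_host": "127.0.0.1",
#         "mysql_db": "autovote",
#         "mysql_user": "dbuser",
#         "mysql_pass": "dbpass",
#         "pre_system": "autovote_",
#         "http_port": 8080
#     }
# }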