Example #1
def main():
    while True:
        ret = redis_cursor.rpop(pre_system + "userlist")
        if ret:
            url, siteid, t, adminid = ret.split("|")

            print "爬取: %s,%s,%s,%s" % (url, siteid, t, adminid)
            processlog("auto_scrapyuser", 1, "main", "爬取: %s, %s, %s ,adminid:%s" % (url, siteid, t, adminid))
            try:
                scrapy_comment_user(url, siteid, adminid)

            except Exception, e:
                try:
                    crc32address = crc32(url) & 0xFFFFFFFF
                    sql = "update system_site_user set status=0 where crc32address=%s"
                    mysql_cursor.execute(sql, crc32address)
                except Exception, e:
                    processlog("auto_scrapyuser", 0, "main", str(e))

                if "Data too long for column" in str(e):
                    continue
                if "Incorrect string value:" in str(e):
                    print "存在表情,无法保存content, nickname:%s" % url
                    processlog("auto_scrapyuser", 0, "main", "存在表情,无法保存, url:%s" % url)
                    continue
                if time.time() - int(t) < 3600:
                    pass
                    # print 'writing back to queue: %s' % url
                    # processlog('auto_scrapyuser', 1, 'main', 'writing back to queue: %s' % url)
                    # redis_cursor.lpush(pre_system + 'userlist', '%s|%s|%s|%s' % (url, siteid, t, adminid))
                else:
                    print "timed out: %s" % url
                    processlog("auto_scrapyuser", 1, "main", "timed out: %s" % url)
                print "error: %s" % str(e)
                processlog("auto_scrapyuser", 0, "main", str(e))
Example #2
def auto_screenshot():
    print "图片轮训开始"
    processlog("autoscreenshot", 1, "auto_screenshot", "图片轮训开始")
    while True:
        screenshot = redis_cursor.rpop(pre_system + "screenshotqueue")
        if screenshot:
            try:
                postid, docid, shorturl, commentid, crttime, lasttime = screenshot.split("|")

                # Throttle: require at least 60 seconds since the last attempt
                if int(time.time()) - int(lasttime) < 60:
                    print "url collected too frequently, re-queueing  postid:%s  ; lasttime:%s" % (postid, lasttime)
                    # reWriteScreenShotQueue(postid, docid, shorturl, commentid, crttime)
                    redis_cursor.lpush(
                        pre_system + "screenshotqueue",
                        "%s|%s|%s|%s|%s|%s" % (postid, docid, shorturl, commentid, crttime, lasttime),
                    )
                    print "%s sleep 5 sec!" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    time.sleep(5)
                    continue
                # NetEase news
                if shorturl == "comment.news.163.com":
                    scrapy_hot_comments_news_163_com(docid, commentid, postid, shorturl, crttime)
                # Phoenix (ifeng) news
                if shorturl == "gentie.ifeng.com":
                    scrapy_hot_comments_ifeng_com(docid, commentid, postid, shorturl, crttime)
            except Exception, e:
                print e
                processlog("autoscreenshot", 0, "auto_screenshot", str(e))
        print "%s sleep 5 sec!" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        time.sleep(5)
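
auto_screenshot() above pops pipe-delimited items of the form postid|docid|shorturl|commentid|crttime|lasttime from the screenshot queue. The code that first enqueues these items is not shown; a minimal producer sketch, assuming the same module-level redis_cursor and pre_system used above, could be:

# Hypothetical producer for the queue consumed by auto_screenshot().
# redis_cursor and pre_system are assumed to be the module-level globals seen above.
import time


def enqueue_screenshot(postid, docid, shorturl, commentid):
    now = int(time.time())
    # crttime (creation time) and lasttime (last attempt) both start at "now"
    redis_cursor.lpush(
        pre_system + "screenshotqueue",
        "%s|%s|%s|%s|%s|%s" % (postid, docid, shorturl, commentid, now, now))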
Example #3
def setAddressStatus(userid, status):
    try:
        sql = "update system_site_user set status=%s where userid=%s"
        mysql_cursor.execute(sql, status, userid)
        print "更新状态userid:%s, status:%s" % (userid, status)
    except Exception, e:
        print e
        processlog("auto_scrapyuser", 0, "setPageStatus", str(e))
Example #4
def setAddressStatus(userid, status):
    try:
        sql = 'update system_site_user set status=%s where userid=%s'
        mysql_cursor.execute(sql, status, userid)
        print 'updating status userid:%s, status:%s' % (userid, status)
    except Exception, e:
        print e
        processlog('auto_scrapyuser', 0, 'setAddressStatus', str(e))
Example #5
def scrapy_comment_news_163_com(docId, urlid, lastcreateTime, siteid, url,
                                adminid):

    headers = {
        'Host': 'comment.news.163.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    for page in xrange(6):

        userlist = getuserlist(siteid)
        try:
            comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/newList?offset=%s&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc' % (
                docId, page * 40)
            req = requests.get(comment_url, headers=headers, timeout=timeout)
            html = req.text
            ret = re.findall('getData\(\n(.*)\);', html, re.S | re.M)
            if ret:
                post = json.loads(ret[0])
                comments = post['comments']
                for k, v in comments.items():
                    commentId = v['commentId']
                    createTime = v['createTime']
                    userId = v['user']['userId']
                    content = v['content']
                    if 'nickname' in v['user']:
                        nickname = v['user']['nickname']
                    else:
                        nickname = ''
                    if nickname in userlist:
                        # check whether this record has already been written
                        sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s and `adminid`=%s'
                        r = mysql_cursor.query(sql, userId, createTime,
                                               adminid)
                        if not r:
                            # no existing record; insert the content
                            sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                                  '`nickname`, `createTime`, `adminid`) values (%s, %s, %s, %s, %s, %s, %s)'
                            postid = mysql_cursor.execute_lastrowid(
                                sql, urlid, userId, commentId, content,
                                nickname, createTime, adminid)

                        del r
                    time.sleep(0.2)
            del ret
            # mark the url record as updated
            seturlstatus(urlid, adminid)
        except Exception, ex:
            seturlstatus(urlid, adminid)
            print ex
            processlog('autovote_main', 0, 'scrapy_comment_news_163_com',
                       str(ex))
            return 0
Example #6
def scrapy_news_qq_com(rootid, urlid, siteid, adminid):
    headers = {
        'Host': 'coral.qq.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    userlist = getuserlist(siteid)
    try:
        comment_url = 'http://coral.qq.com/article/%s/hotcomment?reqnum=10&_=%s' % (
            rootid, int(time.time()))
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret['data']['commentid']
            for comment in comments:
                commentId = comment['mid']
                createTime = comment['time']
                userId = comment['uid']
                content = comment['content']
                r = re.findall(r'wb_screen_name=(.*?)&', comment['config'])
                if r:
                    nickname = r[0]
                else:
                    nickname = comment['nick']
                if nickname in userlist:
                    # check whether this record has already been written
                    sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s'
                    r = mysql_cursor.query(sql, userId, createTime)
                    if not r:
                        # no existing record; insert the content
                        sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                              '`nickname`, `createTime`) values (%s, %s, %s, %s, %s, %s)'
                        postid = mysql_cursor.execute_lastrowid(
                            sql, urlid, userId, commentId, content, nickname,
                            createTime)

                    del r
        del ret
        time.sleep(1)
        # mark the url record as updated
        seturlstatus(urlid, adminid)
    except Exception, ex:
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'scrapy_news_qq_com', str(ex))
        return 0
Example #7
def scrapy_comment_ifeng_com(docUrl, urlid, siteid, url, adminid):
    headers = {
        'Host': 'comment.ifeng.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    userlist = getuserlist(siteid)
    try:
        comment_url = 'http://comment.ifeng.com/get.php?callback=newCommentListCallBack&orderby=&docUrl=%s&format=json&job=1&p=1&pageSize=100&callback=newCommentListCallBack' % docUrl
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret['comments']
            for comment in comments:
                commentId = comment['comment_id']
                createTime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(int(comment['create_time'])))
                userId = comment['user_id']
                content = comment['comment_contents']
                nickname = comment['uname']
                if nickname in userlist:
                    # check whether this record has already been written
                    sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s and `adminid`=%s'
                    r = mysql_cursor.query(sql, userId, createTime, adminid)
                    if not r:
                        # no existing record; insert the content
                        sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                              '`nickname`, `createTime`, `adminid`) values (%s, %s, %s, %s, %s, %s, %s)'
                        postid = mysql_cursor.execute_lastrowid(
                            sql, urlid, userId, commentId, content, nickname,
                            createTime, adminid)

                    del r
        del ret
        time.sleep(2)
        # mark the url record as updated
        seturlstatus(urlid, adminid)
    except Exception, ex:
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'scrapy_comment_ifeng_com', str(ex))
        return 0
Example #8
def scrapy_comment_news_163_com(docId, urlid, lastcreateTime, siteid, url, adminid):

    headers = {
        'Host': 'comment.news.163.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    for page in xrange(6):

        userlist = getuserlist(siteid)
        try:
            comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/newList?offset=%s&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc' % (docId, page*40)
            req = requests.get(comment_url, headers=headers, timeout=timeout)
            html = req.text
            ret = re.findall('getData\(\n(.*)\);', html, re.S | re.M)
            if ret:
                post = json.loads(ret[0])
                comments = post['comments']
                for k, v in comments.items():
                    commentId = v['commentId']
                    createTime = v['createTime']
                    userId = v['user']['userId']
                    content = v['content']
                    if 'nickname' in v['user']:
                        nickname = v['user']['nickname']
                    else:
                        nickname = ''
                    if nickname in userlist:
                        # check whether this record has already been written
                        sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s and `adminid`=%s'
                        r = mysql_cursor.query(sql, userId, createTime, adminid)
                        if not r:
                            # no existing record; insert the content
                            sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                                  '`nickname`, `createTime`, `adminid`) values (%s, %s, %s, %s, %s, %s, %s)'
                            postid = mysql_cursor.execute_lastrowid(sql, urlid, userId, commentId, content, nickname, createTime, adminid)

                        del r
                    time.sleep(0.2)
            del ret
            # mark the url record as updated
            seturlstatus(urlid, adminid)
        except Exception, ex:
            seturlstatus(urlid, adminid)
            print ex
            processlog('autovote_main', 0, 'scrapy_comment_news_163_com', str(ex))
            return 0
Example #9
def scrapy_news_qq_com(rootid, urlid, siteid, adminid):
    headers = {
        'Host': 'coral.qq.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    userlist = getuserlist(siteid)
    try:
        comment_url = 'http://coral.qq.com/article/%s/hotcomment?reqnum=10&_=%s' % (rootid, int(time.time()))
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret['data']['commentid']
            for comment in comments:
                commentId = comment['mid']
                createTime = comment['time']
                userId = comment['uid']
                content = comment['content']
                r = re.findall(r'wb_screen_name=(.*?)&', comment['config'])
                if r:
                    nickname = r[0]
                else:
                    nickname = comment['nick']
                if nickname in userlist:
                    # check whether this record has already been written
                    sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s'
                    r = mysql_cursor.query(sql, userId, createTime)
                    if not r:
                        # no existing record; insert the content
                        sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                              '`nickname`, `createTime`) values (%s, %s, %s, %s, %s, %s)'
                        postid = mysql_cursor.execute_lastrowid(sql, urlid, userId, commentId, content, nickname, createTime)

                    del r
        del ret
        time.sleep(1)
        # mark the url record as updated
        seturlstatus(urlid, adminid)
    except Exception, ex:
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'scrapy_news_qq_com', str(ex))
        return 0
Example #10
def scrapy_comment_ifeng_com(docUrl, urlid, siteid, url, adminid):
    headers = {
        'Host': 'comment.ifeng.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    userlist = getuserlist(siteid)
    try:
        comment_url = 'http://comment.ifeng.com/get.php?callback=newCommentListCallBack&orderby=&docUrl=%s&format=json&job=1&p=1&pageSize=100&callback=newCommentListCallBack' % docUrl
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret['comments']
            for comment in comments:
                commentId = comment['comment_id']
                createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(comment['create_time'])))
                userId = comment['user_id']
                content = comment['comment_contents']
                nickname = comment['uname']
                if nickname in userlist:
                    # check whether this record has already been written
                    sql = 'select userId,createTime from system_url_posts where `userId`=%s and `createTime`=%s and `adminid`=%s'
                    r = mysql_cursor.query(sql, userId, createTime, adminid)
                    if not r:
                        # no existing record; insert the content
                        sql = 'insert into system_url_posts (`urlid`, `userId`, `commentIds`, `content`, ' \
                              '`nickname`, `createTime`, `adminid`) values (%s, %s, %s, %s, %s, %s, %s)'
                        postid = mysql_cursor.execute_lastrowid(sql, urlid, userId, commentId, content, nickname, createTime, adminid)

                    del r
        del ret
        time.sleep(2)
        # mark the url record as updated
        seturlstatus(urlid, adminid)
    except Exception, ex:
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'scrapy_comment_ifeng_com', str(ex))
        return 0
Example #11
def get_comment_news_163_com_pagenum(docId, urlid):
    comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/newList?offset=0&limit=1&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc' % docId
    headers = {
        'Host': 'comment.news.163.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    try:
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        if req.status_code != 200:
            return '0'
        html = req.text
        ret = re.findall('getData\(\n(.*)\);', html, re.S | re.M)
        if ret:
            post = json.loads(ret[0])
            newListSize = int(ceil(int(post['newListSize']) / 30.0))
            comments = post['commentIds']
            if comments:
                if len(comments[0].split(',')) > 1:
                    createTime = post['comments'][comments[0].split(',')
                                                  [-1]]['createTime']
                else:
                    createTime = post['comments'][comments[0]]['createTime']
                if newListSize > 2:
                    num = 2
                else:
                    num = newListSize

                return json.dumps({'createTime': createTime, 'num': num})
        return json.dumps({'createTime': 0, 'num': 0})
    except Exception, ex:
        # reset the url to "needs update" so the user can re-collect it later
        # NOTE: adminid is not defined in this function; it must come from the enclosing module for this call to work
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'get_comment_news_163_com_pagenum',
                   str(ex))
        return 'error'
Example #12
def scrapy_hot_comments_ifeng_com(docUrl, commentid, postid, shorturl,
                                  crttime):

    print 'crawling %s,%s,%s,%s' % (docUrl, commentid, postid, shorturl)
    processlog(
        'autoscreenshot', 1, 'scrapy_hot_comments_ifeng_com',
        'docUrl:%s,commentid:%s, postid:%s, %s' %
        (docUrl, commentid, postid, shorturl))

    headers = {
        'Host': 'comment.ifeng.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    try:
        comment_url = 'http://comment.ifeng.com/get.php?callback=hotCommentListCallBack&orderby=uptimes&docUrl=%s&format=json&job=1&p=1&pageSize=10&callback=hotCommentListCallBack&skey=16a2fe' % docUrl
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret['comments']
            for comment in comments:
                commentId = comment['comment_id']

                if commentId == commentid:
                    index = comments.index(comment)
                    # save the floor (comment position) info to mysql
                    sql = 'update system_url_posts set floor=%s where postid=%s'
                    mysql_cursor.execute(sql, index + 1, postid)

                    # take the screenshot
                    res = OutPutImg(postid, 'ifeng', postid)
                    if res:
                        # add to the screenshot-complete queue
                        # redis_cursor.hset(pre_system + 'complete', '%s|%s|%s|%s' % (postid, commentid, docid, shorturl), 1)
                        # update the remote post status and deliver the image

                        pass
                    else:
                        processlog('autoscreenshot', 1,
                                   'scrapy_hot_comments_ifeng_com', 'screenshot failed (returned False)')
                        reWriteScreenShotQueue(postid, docUrl, shorturl,
                                               commentid, crttime)
                    return
            # print 'no matching commentid found'
            # processlog('autoscreenshot', 1, 'scrapy_hot_comments_ifeng_com', 'no matching commentid: %s ' % commentid)
            reWriteScreenShotQueue(postid, docUrl, shorturl, commentid,
                                   crttime)
        del ret
    except Exception, ex:
        print ex
        processlog('autoscreenshot', 0, 'scrapy_hot_comments_ifeng_com',
                   str(ex))
        reWriteScreenShotQueue(postid, docUrl, shorturl, commentid, crttime)
Example #13
def get_comment_news_163_com_pagenum(docId, urlid):
    comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/newList?offset=0&limit=1&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc' % docId
    headers = {
        'Host': 'comment.news.163.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    try:
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        if req.status_code != 200:
            return '0'
        html = req.text
        ret = re.findall('getData\(\n(.*)\);', html, re.S | re.M)
        if ret:
            post = json.loads(ret[0])
            newListSize = int(ceil(int(post['newListSize'])/30.0))
            comments = post['commentIds']
            if comments:
                if len(comments[0].split(',')) > 1:
                    createTime = post['comments'][comments[0].split(',')[-1]]['createTime']
                else:
                    createTime = post['comments'][comments[0]]['createTime']
                if newListSize > 2:
                    num = 2
                else:
                    num = newListSize

                return json.dumps({'createTime': createTime, 'num': num})
        return json.dumps({'createTime': 0, 'num': 0})
    except Exception, ex:
        # reset the url to "needs update" so the user can re-collect it later
        # NOTE: adminid is not defined in this function; it must come from the enclosing module for this call to work
        seturlstatus(urlid, adminid)
        print ex
        processlog('autovote_main', 0, 'get_comment_news_163_com_pagenum', str(ex))
        return 'error'
Example #14
def scrapy_hot_comments_ifeng_com(docUrl, commentid, postid, shorturl, crttime):

    print "爬取%s,%s,%s,%s" % (docUrl, commentid, postid, shorturl)
    processlog(
        "autoscreenshot",
        1,
        "scrapy_hot_comments_ifeng_com",
        "docUrl:%s,commentid:%s, postid:%s, %s" % (docUrl, commentid, postid, shorturl),
    )

    headers = {
        "Host": "comment.ifeng.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    try:
        comment_url = (
            "http://comment.ifeng.com/get.php?callback=hotCommentListCallBack&orderby=uptimes&docUrl=%s&format=json&job=1&p=1&pageSize=10&callback=hotCommentListCallBack&skey=16a2fe"
            % docUrl
        )
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = json.loads(html)
        if ret:
            comments = ret["comments"]
            for comment in comments:
                commentId = comment["comment_id"]

                if commentId == commentid:
                    index = comments.index(comment)
                    # save the floor (comment position) info to mysql
                    sql = "update system_url_posts set floor=%s where postid=%s"
                    mysql_cursor.execute(sql, index + 1, postid)

                    # take the screenshot
                    res = OutPutImg(postid, "ifeng", postid)
                    if res:
                        # add to the screenshot-complete queue
                        # redis_cursor.hset(pre_system + 'complete', '%s|%s|%s|%s' % (postid, commentid, docid, shorturl), 1)
                        # update the remote post status and deliver the image

                        pass
                    else:
                        processlog("autoscreenshot", 1, "scrapy_hot_comments_ifeng_com", "截图False")
                        reWriteScreenShotQueue(postid, docUrl, shorturl, commentid, crttime)
                    return
            # print 'no matching commentid found'
            # processlog('autoscreenshot', 1, 'scrapy_hot_comments_ifeng_com', 'no matching commentid: %s ' % commentid)
            reWriteScreenShotQueue(postid, docUrl, shorturl, commentid, crttime)
        del ret
    except Exception, ex:
        print ex
        processlog("autoscreenshot", 0, "scrapy_hot_comments_ifeng_com", str(ex))
        reWriteScreenShotQueue(postid, docUrl, shorturl, commentid, crttime)
Example #15
def main():
    while True:
        ret = redis_cursor.rpop(pre_system + 'userlist')
        if ret:
            url, siteid, t, adminid = ret.split('|')

            print 'crawling: %s,%s,%s,%s' % (url, siteid, t, adminid)
            processlog(
                'auto_scrapyuser', 1, 'main',
                'crawling: %s, %s, %s ,adminid:%s' % (url, siteid, t, adminid))
            try:
                scrapy_comment_user(url, siteid, adminid)

            except Exception, e:
                try:
                    crc32address = crc32(url) & 0xffffffff
                    sql = 'update system_site_user set status=0 where crc32address=%s'
                    mysql_cursor.execute(sql, crc32address)
                except Exception, e:
                    processlog('auto_scrapyuser', 0, 'main', str(e))

                if 'Data too long for column' in str(e):
                    continue
                if 'Incorrect string value:' in str(e):
                    print 'emoji present, cannot save content, nickname:%s' % url
                    processlog('auto_scrapyuser', 0, 'main',
                               'emoji present, cannot save, url:%s' % url)
                    continue
                if time.time() - int(t) < 3600:
                    pass
                    # print 'writing back to queue: %s' % url
                    # processlog('auto_scrapyuser', 1, 'main', 'writing back to queue: %s' % url)
                    # redis_cursor.lpush(pre_system + 'userlist', '%s|%s|%s|%s' % (url, siteid, t, adminid))
                else:
                    print 'timed out: %s' % url
                    processlog('auto_scrapyuser', 1, 'main', 'timed out: %s' % url)
                print 'error: %s' % str(e)
                processlog('auto_scrapyuser', 0, 'main', str(e))
Example #16
def reWriteScreenShotQueue(postid, docid, shorturl, commentid, crttime):
    """
    判断alue是否超时,不超时写回队列
    """
    try:
        if time.time() - int(crttime) < 86400:
            print "未超时, 重新写入队列: postid:%s" % postid
            # processlog('autoscreenshot', 1, 'reWriteScreenShotQueue', '未超时,重新写入队列: postid:%s' % postid)
            # redis重新写入截图队列
            redis_cursor.lpush(
                pre_system + "screenshotqueue",
                "%s|%s|%s|%s|%s|%s" % (postid, docid, shorturl, commentid, crttime, int(time.time())),
            )
        else:
            print "超时,%s:%s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(crttime))), postid)
            processlog(
                "autoscreenshot",
                1,
                "reWriteScreenShotQueue",
                "超时,%s:postid: %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(crttime))), postid),
            )
    except Exception, e:
        processlog("autoscreenshot", 0, "reWriteScreenShotQueue", str(e))
Example #17
def reWriteScreenShotQueue(postid, docid, shorturl, commentid, crttime):
    '''
    Check whether the item has timed out; if not, write it back to the queue
    '''
    try:
        if time.time() - int(crttime) < 86400:
            print 'not timed out, re-queueing: postid:%s' % postid
            # processlog('autoscreenshot', 1, 'reWriteScreenShotQueue', 'not timed out, re-queueing: postid:%s' % postid)
            # push back onto the redis screenshot queue
            redis_cursor.lpush(
                pre_system + 'screenshotqueue',
                '%s|%s|%s|%s|%s|%s' % (postid, docid, shorturl, commentid,
                                       crttime, int(time.time())))
        else:
            print 'timed out,%s:%s' % (time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(int(crttime))), postid)
            processlog(
                'autoscreenshot', 1, 'reWriteScreenShotQueue',
                'timed out,%s:postid: %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(
                    int(crttime))), postid))
    except Exception, e:
        processlog('autoscreenshot', 0, 'reWriteScreenShotQueue', str(e))
Example #18
def auto_screenshot():
    print 'screenshot polling started'
    processlog('autoscreenshot', 1, 'auto_screenshot', 'screenshot polling started')
    while True:
        screenshot = redis_cursor.rpop(pre_system + 'screenshotqueue')
        if screenshot:
            try:
                postid, docid, shorturl, commentid, crttime, lasttime = screenshot.split(
                    '|')

                # Throttle: require at least 60 seconds since the last attempt
                if int(time.time()) - int(lasttime) < 60:
                    print 'url collected too frequently, re-queueing  postid:%s  ; lasttime:%s' % (
                        postid, lasttime)
                    # reWriteScreenShotQueue(postid, docid, shorturl, commentid, crttime)
                    redis_cursor.lpush(
                        pre_system + 'screenshotqueue',
                        '%s|%s|%s|%s|%s|%s' % (postid, docid, shorturl,
                                               commentid, crttime, lasttime))
                    print '%s sleep 5 sec!' % time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime())
                    time.sleep(5)
                    continue
                # NetEase news
                if shorturl == 'comment.news.163.com':
                    scrapy_hot_comments_news_163_com(docid, commentid, postid,
                                                     shorturl, crttime)
                # Phoenix (ifeng) news
                if shorturl == 'gentie.ifeng.com':
                    scrapy_hot_comments_ifeng_com(docid, commentid, postid,
                                                  shorturl, crttime)
            except Exception, e:
                print e
                processlog('autoscreenshot', 0, 'auto_screenshot', str(e))
        print '%s sleep 5 sec!' % time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime())
        time.sleep(5)
Example #19
def scrapy_comment_user(url, siteid, adminid):
    if siteid == "1":  # 163
        username = getUsername(url)
        print "username: %s" % username
        processlog("auto_scrapyuser", 1, "scrapy_comment_user", "username: %s, url:%s" % (username, url))
        if username:
            scrapy_comment_user_163(username, adminid, url)
        else:
            print "未匹配到参数"
            processlog("auto_scrapyuser", 1, "scrapy_comment_user", "未匹配到参数, url:%s" % url)

    elif siteid == "2":  # ifeng
        guid, uname = getGuidAndUsername(url)
        if guid and uname:
            print "uname: %s" % uname
            processlog("auto_scrapyuser", 1, "scrapy_comment_user", "uname: %s, guid:%s, url:%s" % (uname, guid, url))
            scrapy_comment_user_ifeng(guid, uname, adminid, url)
        else:
            print "未匹配到参数"
            processlog("auto_scrapyuser", 1, "scrapy_comment_user", "未匹配到参数, url:%s" % url)
Example #20
def scrapy_comment_user(url, siteid, adminid):
    if siteid == '1':  # 163
        username = getUsername(url)
        print 'username: %s' % username
        processlog('auto_scrapyuser', 1, 'scrapy_comment_user',
                   'username: %s, url:%s' % (username, url))
        if username:
            scrapy_comment_user_163(username, adminid, url)
        else:
            print 'no parameters matched'
            processlog('auto_scrapyuser', 1, 'scrapy_comment_user',
                       'no parameters matched, url:%s' % url)

    elif siteid == '2':  # ifeng
        guid, uname = getGuidAndUsername(url)
        if guid and uname:
            print 'uname: %s' % uname
            processlog('auto_scrapyuser', 1, 'scrapy_comment_user',
                       'uname: %s, guid:%s, url:%s' % (uname, guid, url))
            scrapy_comment_user_ifeng(guid, uname, adminid, url)
        else:
            print 'no parameters matched'
            processlog('auto_scrapyuser', 1, 'scrapy_comment_user',
                       'no parameters matched, url:%s' % url)
Example #21
    fp.close()

    config_json = json.loads(config_data)['config']

    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']

    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('autovote_main', 0, 'config', str(ex))
    sys.exit(-1)

# connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# connect to mysql
mysql_cursor = torndb.Connection(mysql_host,
                                 mysql_db,
                                 user=mysql_user,
                                 password=mysql_pass)
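
The try/except block above reads the settings from a JSON file with a top-level "config" object; the file name and any additional keys are not shown in these fragments. A minimal example of the expected shape, with placeholder values, is:

# Example of the configuration structure these scripts load (all values are placeholders).
import json

sample_config = {
    "config": {
        "redis_host": "127.0.0.1",
        "redis_port": 6379,
        "mysql_host": "127.0.0.1",
        "mysql_db": "system",
        "mysql_user": "root",
        "mysql_pass": "secret",
        "pre_system": "system_",
        "http_port": 8080,
    }
}
print json.dumps(sample_config, indent=4)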


def getuserlist(siteid):
    # fetch the list of internal accounts
Example #22
    config_data = fp.read()
    fp.close()

    config_json = json.loads(config_data)['config']

    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']

    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    processlog('autoscreenshot', 0, 'config', str(ex))
    sys.exit(-1)

# connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# connect to mysql
mysql_cursor = torndb.Connection(mysql_host,
                                 mysql_db,
                                 user=mysql_user,
                                 password=mysql_pass)


def reWriteScreenShotQueue(postid, docid, shorturl, commentid, crttime):
    '''
Example #23
def scrapy_comment_user_ifeng(guid, username, adminid, address):
    """
    凤凰网个人页面爬取
    http://comment.ifeng.com/get? job=7 & format=json & pagesize=20 & _1460705534 & guid=65969467 & p=1
    """
    username_decode = unquote(username)
    siteid = 2
    # check whether the user exists
    sql = "select userid,adminid from system_site_user where siteid=%s and username=%s"
    r = mysql_cursor.query(sql, siteid, username_decode)
    if r:
        if int(adminid) != int(r[0]["adminid"]):
            print "网站帐号存在,且adminid不符"
            processlog(
                "auto_scrapyuser",
                1,
                "scrapy_comment_user_ifeng",
                "网站帐号存在,添加人不匹配,现:%s, 原:%s" % (adminid, r[0]["adminid"]),
            )
            return
        print "网站帐号存在"
        userid = r[0]["userid"]
        setAddressStatus(userid, 1)
    else:
        processlog("auto_scrapyuser", 1, "scrapy_comment_user_ifeng", "网站帐号不存在,添加:%s" % username)
        crc32_address = crc32(address) & 0xFFFFFFFF
        sql = "insert into system_site_user(`siteid`, `username`,`createtime`, `adminid`, `address`, `crc32address`, `status`) values(%s, %s, now(), %s, %s, %s, 1)"
        userid = mysql_cursor.execute_lastrowid(sql, siteid, username_decode, adminid, address, crc32_address)

    headers = {
        "Host": "comment.news.163.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    # crawl 6 pages by default
    for page in xrange(scrapy_page):
        url = "http://comment.ifeng.com/get?job=7&format=json&pagesize=20&guid=%s&p=%s" % (guid, page)

        req = requests.get(url, headers=headers, timeout=timeout)
        if req.status_code == 200:
            data = json.loads(req.text)
            comments = data["comments"]
            for comment in comments:
                _url = comment["doc_url"]
                # check whether this url is supported
                res = r"://(.*?)/"
                ret = re.findall(res, _url)
                if ret:
                    shorturl = ret[0]
                    if shorturl in ["news.ifeng.com"]:

                        title = comment["doc_name"]  # 帖子标题
                        content = comment["comment_contents"]

                        createTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(comment["create_time"])))
                        commentId = comment["comment_id"]
                        nickname = comment["uname"]

                        url_post = (
                            "http://gentie.ifeng.com/view.html?docUrl="
                            + quote(_url.encode("utf8"))
                            + "&docName="
                            + quote(title.encode("utf8"))
                        )

                        # check whether the post has already been saved
                        sql = "select postid from system_url_posts where `commentIds`=%s and createTime=%s and `adminid`=%s"
                        r = mysql_cursor.query(sql, commentId, createTime, adminid)
                        if not r:
                            # check whether the url has already been added
                            crc32_url = crc32(url_post) & 0xFFFFFFFF
                            sql = "select urlid from system_url_list where `crc32url`=%s and adminid=%s"
                            ret = mysql_cursor.query(sql, crc32_url, adminid)

                            if ret:  # already added
                                urlid = ret[0]["urlid"]
                            else:
                                sql = "insert into system_url_list(`siteid`, `title`, `url`, `crc32url`, `addtime`,`status`, `adminid`) values(%s, %s, %s, %s, now(), 1, %s)"
                                urlid = mysql_cursor.execute_lastrowid(sql, siteid, title, url_post, crc32_url, adminid)
                                processlog(
                                    "auto_scrapyuser", 1, "scrapy_comment_user_ifeng", "url未添加过,添加url,urlid:%s" % urlid
                                )

                            try:
                                # save the post
                                sql = (
                                    "insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`"
                                    ", `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)"
                                )
                                postid = mysql_cursor.execute_lastrowid(
                                    sql, urlid, userid, commentId, content, nickname, createTime, adminid
                                )

                                print "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid)
                                processlog(
                                    "auto_scrapyuser",
                                    1,
                                    "scrapy_comment_user_ifeng",
                                    "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid),
                                )
                            except Exception, e:
                                if "Data too long for column" in str(e):
                                    processlog(
                                        "auto_scrapyuser",
                                        1,
                                        "scrapy_comment_user_ifeng",
                                        "帖子内容过长,重新截取写入,urlid:%s" % urlid,
                                    )
                                    content = content[:255]
                                    # save the post
                                    sql = (
                                        "insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`"
                                        ", `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)"
                                    )
                                    postid = mysql_cursor.execute_lastrowid(
                                        sql, urlid, userid, commentId, content, nickname, createTime, adminid
                                    )

                                    print "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid)
                                    processlog(
                                        "auto_scrapyuser",
                                        1,
                                        "scrapy_comment_user_ifeng",
                                        "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid),
                                    )
                                # update the site_user status
                                setAddressStatus(userid, 0)
                        else:
                            print "帖子已经添加过: commentId:%s" % commentId
                            # processlog('auto_scrapyuser', 1, 'scrapy_comment_user_ifeng', '帖子已经添加过: commentId:%s' % commentId)

            # exit the loop once the last page is reached
            total = data["count"]
            if (page + 1) * 20 >= total:
                break

        else:
            print req.text
Example #24
    fp.close()

    config_json = json.loads(config_data)['config']

    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']

    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('autovote_agent', 0, 'config', str(ex))
    sys.exit(-1)

# connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# connect to mysql
mysql_cursor = torndb.Connection(mysql_host,
                                 mysql_db,
                                 user=mysql_user,
                                 password=mysql_pass)


def autovote():
    # fetch the commentid
Example #25
    config_data = fp.read()
    fp.close()

    config_json = json.loads(config_data)["config"]

    redis_host = config_json["redis_host"]
    redis_port = config_json["redis_port"]
    mysql_host = config_json["mysql_host"]
    mysql_db = config_json["mysql_db"]
    mysql_user = config_json["mysql_user"]
    mysql_pass = config_json["mysql_pass"]

    pre_system = config_json["pre_system"]
    serverport = config_json["http_port"]
except Exception, ex:
    processlog("autoscreenshot", 0, "config", str(ex))
    sys.exit(-1)


# connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# connect to mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user, password=mysql_pass)


def reWriteScreenShotQueue(postid, docid, shorturl, commentid, crttime):
    """
    判断alue是否超时,不超时写回队列
    """
Example #26
def scrapy_comment_user_ifeng(guid, username, adminid, address):
    '''
    Scrape a user's personal page on ifeng (Phoenix)
    http://comment.ifeng.com/get?job=7&format=json&pagesize=20&_1460705534&guid=65969467&p=1
    '''
    username_decode = unquote(username)
    siteid = 2
    # check whether the user exists
    sql = 'select userid,adminid from system_site_user where siteid=%s and username=%s'
    r = mysql_cursor.query(sql, siteid, username_decode)
    if r:
        if int(adminid) != int(r[0]['adminid']):
            print 'site account exists but adminid does not match'
            processlog('auto_scrapyuser', 1, 'scrapy_comment_user_ifeng',
                       'site account exists but the adding admin does not match; new:%s, original:%s' % (adminid, r[0]['adminid']))
            return
        print 'site account exists'
        userid = r[0]['userid']
        setAddressStatus(userid, 1)
    else:
        processlog('auto_scrapyuser', 1, 'scrapy_comment_user_ifeng',
                   'site account does not exist; adding:%s' % username)
        crc32_address = crc32(address) & 0xffffffff
        sql = 'insert into system_site_user(`siteid`, `username`,`createtime`, `adminid`, `address`, `crc32address`, `status`) values(%s, %s, now(), %s, %s, %s, 1)'
        userid = mysql_cursor.execute_lastrowid(sql, siteid, username_decode,
                                                adminid, address,
                                                crc32_address)

    headers = {
        'Host': 'comment.ifeng.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    # crawl 6 pages by default
    for page in xrange(scrapy_page):
        url = 'http://comment.ifeng.com/get?job=7&format=json&pagesize=20&guid=%s&p=%s' % (
            guid, page)

        req = requests.get(url, headers=headers, timeout=timeout)
        if req.status_code == 200:
            data = json.loads(req.text)
            comments = data['comments']
            for comment in comments:
                _url = comment['doc_url']
                # check whether this url is supported
                res = r'://(.*?)/'
                ret = re.findall(res, _url)
                if ret:
                    shorturl = ret[0]
                    if shorturl in [
                            'news.ifeng.com',
                    ]:

                        title = comment['doc_name']  # post title
                        content = comment['comment_contents']

                        createTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S',
                            time.localtime(int(comment['create_time'])))
                        commentId = comment['comment_id']
                        nickname = comment['uname']

                        url_post = 'http://gentie.ifeng.com/view.html?docUrl=' + quote(
                            _url.encode('utf8')) + '&docName=' + quote(
                                title.encode('utf8'))

                        # check whether the post has already been saved
                        sql = 'select postid from system_url_posts where `commentIds`=%s and createTime=%s and `adminid`=%s'
                        r = mysql_cursor.query(sql, commentId, createTime,
                                               adminid)
                        if not r:
                            # check whether the url has already been added
                            crc32_url = crc32(url_post) & 0xffffffff
                            sql = 'select urlid from system_url_list where `crc32url`=%s and adminid=%s'
                            ret = mysql_cursor.query(sql, crc32_url, adminid)

                            if ret:  # already added
                                urlid = ret[0]['urlid']
                            else:
                                sql = 'insert into system_url_list(`siteid`, `title`, `url`, `crc32url`, `addtime`,`status`, `adminid`) values(%s, %s, %s, %s, now(), 1, %s)'
                                urlid = mysql_cursor.execute_lastrowid(
                                    sql, siteid, title, url_post, crc32_url,
                                    adminid)
                                processlog('auto_scrapyuser', 1,
                                           'scrapy_comment_user_ifeng',
                                           'url not added yet; adding url, urlid:%s' % urlid)

                            try:
                                # save the post
                                sql = 'insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`'\
                                      ', `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)'
                                postid = mysql_cursor.execute_lastrowid(
                                    sql, urlid, userid, commentId, content,
                                    nickname, createTime, adminid)

                                print 'saved post: %s; postid :%s ; adminid : %s' % (
                                    nickname, postid, adminid)
                                processlog(
                                    'auto_scrapyuser', 1,
                                    'scrapy_comment_user_ifeng',
                                    'saved post: %s; postid :%s ; adminid : %s' %
                                    (nickname, postid, adminid))
                            except Exception, e:
                                if 'Data too long for column' in str(e):
                                    processlog(
                                        'auto_scrapyuser', 1,
                                        'scrapy_comment_user_ifeng',
                                        'post content too long; truncating and re-writing, urlid:%s' % urlid)
                                    content = content[:255]
                                    # save the post
                                    sql = 'insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`'\
                                          ', `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)'
                                    postid = mysql_cursor.execute_lastrowid(
                                        sql, urlid, userid, commentId, content,
                                        nickname, createTime, adminid)

                                    print 'saved post: %s; postid :%s ; adminid : %s' % (
                                        nickname, postid, adminid)
                                    processlog(
                                        'auto_scrapyuser', 1,
                                        'scrapy_comment_user_ifeng',
                                        'saved post: %s; postid :%s ; adminid : %s' %
                                        (nickname, postid, adminid))
                                # update the site_user status
                                setAddressStatus(userid, 0)
                        else:
                            print 'post already added: commentId:%s' % commentId
                            # processlog('auto_scrapyuser', 1, 'scrapy_comment_user_ifeng', 'post already added: commentId:%s' % commentId)

            # exit the loop once the last page is reached
            total = data['count']
            if (page + 1) * 20 >= total:
                break

        else:
            print req.text
Example #27
    fp.close()

    config_json = json.loads(config_data)['config']

    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']

    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('auto_scrapyuser', 0, 'config', str(ex))
    sys.exit(-1)

# connect to redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# connect to mysql
mysql_cursor = torndb.Connection(mysql_host,
                                 mysql_db,
                                 user=mysql_user,
                                 password=mysql_pass)

# number of pages to crawl
scrapy_page = 4
# NetEase: fetch all of a user's posts from the personal page
Example #28
def scrapy_comment_user_163(username, adminid, address):
    """
    网易用户所有跟贴的爬取
    """
    username_decode = base64.b64decode(username)
    siteid = mysql_cursor.query('select siteid from system_site_list where shorturl="comment.news.163.com"')[0][
        "siteid"
    ]
    # check whether the user exists
    sql = "select userid,adminid from system_site_user where siteid=%s and username=%s "
    r = mysql_cursor.query(sql, int(siteid), username_decode)
    if r:
        if int(adminid) != int(r[0]["adminid"]):
            print "网站帐号存在,添加人不匹配"
            processlog(
                "auto_scrapyuser", 1, "scrapy_comment_user_163", "网站帐号存在,添加人不匹配,现:%s, 原:%s" % (adminid, r[0]["adminid"])
            )
            return
        userid = r[0]["userid"]
        setAddressStatus(userid, 1)
    else:
        processlog("auto_scrapyuser", 1, "scrapy_comment_user_163", "网站帐号不存在,添加:%s,userid:%s" % (username, adminid))

        crc32_address = crc32(address) & 0xFFFFFFFF
        sql = "insert into system_site_user(`siteid`, `username`,`createtime`, `adminid`, `address`,  `crc32address`, `status`) values(%s, %s, now(), %s, %s, %s, 1)"
        userid = mysql_cursor.execute_lastrowid(sql, siteid, username_decode, adminid, address, crc32_address)

    headers = {
        "Host": "comment.news.163.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    # crawl 6 pages by default
    for page in xrange(scrapy_page):
        url = (
            "http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/users/0/comments?username=%s&offset=%s&limit=30&ibc=newspc"
            % (username, page)
        )

        req = requests.get(url, headers=headers, timeout=timeout)
        if req.status_code == 200:
            data = json.loads(req.text)

            threads = data["threads"]
            urllist = []

            for k, v in threads.items():
                param = {}

                _url = v["url"]
                # check whether this url is supported
                res = r"://(.*?)/"
                ret = re.findall(res, _url)
                if ret:
                    shorturl = ret[0]
                    if shorturl in ["news.163.com"]:
                        boardId = v["boardId"]
                        param["docId"] = v["docId"]
                        param["title"] = v["title"]
                        param["url"] = "http://comment.news.163.com/" + boardId + "/" + v["docId"] + ".html"
                        urllist.append(param)
                else:
                    processlog("auto_scrapyuser", 1, "crapy_comment_user_163", "url不支持:%s" % _url)

            comments = data["comments"]

            for k, v in comments.items():
                url_post = ""
                title = ""
                for u in urllist:
                    if u["docId"] == k.split("_")[0]:
                        url_post = u["url"]
                        title = u["title"]
                buildLevel = v["buildLevel"]
                # check that it has a nickname and is a top-level comment
                if url_post and title and v["user"].has_key("nickname") and buildLevel == 1:
                    nickname = v["user"]["nickname"]
                    commentId = v["commentId"]
                    createTime = v["createTime"]
                    content = v["content"].encode("utf8")

                    # 判断帖子是否保存过
                    sql = "select postid from system_url_posts where `commentIds`=%s and createTime=%s and `adminid`=%s"
                    r = mysql_cursor.query(sql, commentId, createTime, adminid)
                    if not r:
                        # 判断url是否添加过
                        crc32_url = crc32(url_post) & 0xFFFFFFFF
                        sql = "select urlid from system_url_list where `crc32url`=%s and `adminid`=%s"
                        ret = mysql_cursor.query(sql, crc32_url, adminid)
                        if ret:  # 添加过
                            urlid = ret[0]["urlid"]
                        else:
                            sql = "insert into system_url_list(`siteid`, `title`, `url`, `crc32url`, `addtime`, `status`, `adminid`) values(%s,%s,%s,%s,now(),1, %s)"
                            urlid = mysql_cursor.execute_lastrowid(sql, siteid, title, url_post, crc32_url, adminid)

                            processlog(
                                "auto_scrapyuser", 1, "scrapy_comment_user_163", "url未添加过,添加url,urlid:%s" % urlid
                            )
                        # 保存帖子
                        try:
                            sql = (
                                "insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`"
                                ", `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)"
                            )
                            postid = mysql_cursor.execute_lastrowid(
                                sql, urlid, userid, commentId, content, nickname, createTime, adminid
                            )
                            print "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid)
                            processlog(
                                "auto_scrapyuser",
                                1,
                                "scrapy_comment_user_163",
                                "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid),
                            )

                        except Exception, e:
                            # 有的字符集无法保存
                            if "Incorrect string value:" in str(e):
                                print "存在表情,无法保存content, nickname:%s" % nickname
                                processlog(
                                    "auto_scrapyuser",
                                    0,
                                    "scrapy_comment_user_163",
                                    "存在表情,无法保存content, nickname:%s" % nickname,
                                )

                            elif "Data too long for column" in str(e):
                                processlog(
                                    "auto_scrapyuser", 1, "scrapy_comment_user_163", "帖子内容过长,重新截取写入,urlid:%s" % urlid
                                )
                                content = content[:255]
                                sql = (
                                    "insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`"
                                    ", `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)"
                                )
                                postid = mysql_cursor.execute_lastrowid(
                                    sql, urlid, userid, commentId, content, nickname, createTime, adminid
                                )
                                print "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid)
                                processlog(
                                    "auto_scrapyuser",
                                    1,
                                    "scrapy_comment_user_163",
                                    "保存帖子: %s; postid :%s ; adminid : %s" % (nickname, postid, adminid),
                                )

                            else:
                                print e
                                processlog("auto_scrapyuser", 0, "scrapy_comment_user_163", str(e))
                            # 更新site_user状态
                            setAddressStatus(userid, 0)
                    else:
                        print "帖子保存过:postid:%s" % r[0]["postid"]
                        # processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163', '帖子保存过:postid:%s' % r[0]['postid'])

            # 如果到最后一页,退出循环
            total = data["total"]
            if (page + 1) * 30 >= total:
                break
        else:
            print req.text
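
For reference, a minimal sketch of how the helper above appears to expect its inputs: the username arrives base64-encoded (the function decodes it before the database lookup) and the profile address is keyed by an unsigned CRC32, matching the insert into system_site_user. Every concrete value below is hypothetical, and the crc32 import is an assumption since the fragment does not show its imports.

import base64
from zlib import crc32   # assumption: the fragment above does not show where crc32 comes from

address = 'http://news.163.com/user/example_profile'   # hypothetical profile address
username_b64 = base64.b64encode('example_163_user')    # decoded again inside scrapy_comment_user_163
crc32_address = crc32(address) & 0xFFFFFFFF            # unsigned 32-bit key, as stored in crc32address
scrapy_comment_user_163(username_b64, adminid=1, address=address)
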
示例#29
0
def autovote():
    # 获取commentid
    postid_commentid = redis_cursor.lpop(pre_system + 'commentidqueque')
    if not postid_commentid:
        print '%s null' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        return 0

    postid = postid_commentid.split('|')[0]
    commentId = postid_commentid.split('|')[1]
    docId = postid_commentid.split('|')[2]
    shorturl = postid_commentid.split('|')[3]
    adminid = postid_commentid.split('|')[4]
    
    processlog('autovote_agent', 1, 'autovote', '爬取:postid:%s, commentid:%s, docid:%s, shorturl: %s' % (postid, commentId, docId, shorturl))


    # TODO 修改完成判断为截图完成
    # 搜索截图完成队列中是否含有本条帖子
    # if redis_cursor.hexists(pre_system + 'complete', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)):

    #     return 0


    # 获取代理ip 如果没有代理ip则为127.0.0.1
    proxy_ip = redis_cursor.lpop(pre_system + 'proxylist')
    if not proxy_ip:
        proxy_ip = '127.0.0.1'

    # 判断帖子现在是否是开启状态
    ret = redis_cursor.hget(pre_system + 'commentidstatus', postid_commentid)
    if ret == '0':
        # 帖子处于停止状态
        print '%s 帖子:%s处于停止状态!' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), postid)
        processlog('autovote_agent', 1, 'autovote', '%s 帖子:%s处于停止状态!' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), postid))
        # # 将commentid写回队列
        # redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))

        # 将代理ip写入队列
        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
        return 0

    # 判断此代理ip是否能够访问
    ret = redis_cursor.hget(pre_system + 'ipinterval_' + postid_commentid, proxy_ip)
    if ret:
        t = int(time.mktime(datetime.datetime.now().timetuple()))
        if t <= int(ret):
            # 将代理ip写入队列
            redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)

            # 将commentid写回队列
            redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
            return 0


    # 判断次数有没有顶贴完成
    sql = 'select `count`, `maxcount` from system_url_posts where `postid`=%s'
    ret = mysql_cursor.query(sql, postid)
    if ret:
        count_now = int(ret[0]['count'])
        count_max = int(ret[0]['maxcount'])
        if count_now >= count_max:
            # 设置状态postid状态为已完成
            sql = 'update system_url_posts set `status`=3 where `postid`=%s'
            mysql_cursor.execute(sql, postid)

            # 删除本条postid对应的hash表
            redis_cursor.delete(pre_system + 'ipinterval_' + postid_commentid)

            # 已经采集完成 删除status信息
            redis_cursor.hdel(pre_system + 'commentidstatus', postid_commentid)

            # 将代理ip写入队列
            redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)

            # 删除autovote_commentidadded 表中的记录, 以免在顶贴目标完成之后再次增加次数时无法再进行添加
            redis_cursor.hdel(pre_system + 'commentidadded', postid_commentid)
            print 'Postid: %s 已经达到顶贴目标, 无需再加入队列: [%s]' % (postid, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
            processlog('autovote_agent', 1, 'autovote', 'Postid: %s 已经达到顶贴目标, 无需再加入队列: [%s]' % (postid, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
            # # TODO 截图
            # # 方案二:如果顶帖完成,保存到redis一个完成的列表,截图程序轮询这个列表
            # redis_cursor.lpush(pre_system + 'postidfinished', postid + "|" + shorturl)
            
            # if shorturl in 'comment.news.163.com':
            #     urltype = '163'
            # if shorturl in 'gentie.ifeng.com':
            #     urltype = 'ifeng'

            # try:
            #     sql = 'select url from system_url_post as posts, system_url_list as list where postid=%s and post.urlid=list.urlid'
            #     url_img = mysql_cursor.query(sql, postid)[0]['url']
            #     OutPutImg(url, urltype, postid)

            # except Exception,e:
            #     print '截图错误:%s' % str(e)
            return 0 
    else:
        print '没有记录 [%s]' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        processlog('autovote_agent', 1, 'autovote', '没有记录 [%s]' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        return 0

    try:
        if shorturl in 'comment.news.163.com':
            url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/%s/action/upvote?ibc=newspc' % (
            docId, commentId)

        if shorturl in 'gentie.ifeng.com':
            url = 'http://comment.ifeng.com/vote.php?callback=recmCallback&cmtId=%s&job=up&docUrl=%s&callback=recmCallback&format=js' % (commentId, docId)

        headers = {
            'Host': '%s' % shorturl,
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest'
        }

        # 网易新闻参数
        data = {
            'ibc': 'newspc'
        }

        if proxy_ip == '127.0.0.1':
            if shorturl in 'comment.news.163.com':
                ret = requests.post(url, data=data, headers=headers, timeout=2)
            if shorturl in 'gentie.ifeng.com':
                ret = requests.get(url, headers=headers, timeout=2)
        else:
            try:
                proxies = {'http': 'http://' + proxy_ip}
                # 网易新闻
                if shorturl in 'comment.news.163.com':
                    ret = requests.post(url, data=data, proxies=proxies, headers=headers, timeout=2)
                # 凤凰新闻
                if shorturl in 'gentie.ifeng.com':
                    ret = requests.get(url, headers=headers, timeout=2)
            except requests.RequestException:
                # 判断ip发生异常的次数  超过三次则移除IP
                timeout_count = redis_cursor.hget(pre_system + 'iptimeoutcount', proxy_ip)
                print 'timeout_count: %s' % timeout_count
                if timeout_count:
                    if int(timeout_count) > 2:
                        print 'IP: %s 发生异常, 异常次数: %s  移除IP!' % (proxy_ip, int(timeout_count))
                        processlog('autovote_agent', 1, 'autovote', 'IP: %s 发生异常, 异常次数: %s  移除IP!' % (proxy_ip, int(timeout_count)))
                        # 删除autovote_ipinterval hash表
                        redis_cursor.hdel(pre_system + 'ipinterval_' + postid_commentid, proxy_ip)
                        redis_cursor.hdel(pre_system + 'iptimeoutcount', proxy_ip)
                    else:
                        print 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, int(timeout_count))
                        processlog('autovote_agent', 1, 'autovote', 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, int(timeout_count)))
                        redis_cursor.hset(pre_system + 'iptimeoutcount', proxy_ip, int(timeout_count) + 1)
                        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                else:
                    print 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, 1)
                    processlog('autovote_agent', 1, 'autovote', 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, 1))
                    redis_cursor.hset(pre_system + 'iptimeoutcount', proxy_ip, 1)
                    redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                # 将commentid写回队列
                redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
                return 1
        if ret.status_code == 200:
            if len(ret.text) <= 60:
                # 删除之前timeout的次数
                redis_cursor.hdel(pre_system + 'iptimeoutcount', proxy_ip)
                # 一分钟之后才能继续采集
                if shorturl in 'comment.news.163.com':
                    nxt_time = int(time.mktime(time.localtime(time.time() + int(1) * 60)))
                # 时间间隔5秒钟
                if shorturl in 'gentie.ifeng.com':
                    nxt_time = int(time.mktime(time.localtime(time.time() + int(1) * 5)))
                redis_cursor.hset(pre_system + 'ipinterval_' + postid_commentid, proxy_ip, nxt_time)
                if shorturl in 'gentie.ifeng.com':
                    if 'alert' not in ret.text:
                        sql = 'update system_url_posts set `count`=`count`+1 where `postid`=%s'
                        mysql_cursor.execute(sql, postid)
                        print '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip)
                        processlog('autovote_agent', 1, 'autovote', '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip))
                    else:
                        print '%s 凤凰新闻顶贴时间过快!' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                        processlog('autovote_agent', 1, 'autovote', '%s 凤凰新闻顶贴时间过快!' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
                    # 将commentid写回队列
                    redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
                    # 将代理ip写入队列
                    redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                    return 1
                if shorturl in 'comment.news.163.com':
                    sql = 'update system_url_posts set `count`=`count`+1 where `postid`=%s'
                    mysql_cursor.execute(sql, postid)

                # 写入统计表
                sql = 'insert into system_post_detail (`postTime`, `count`, `adminid`) values (%s, 1, %s)'
                t = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                mysql_cursor.execute(sql, t, adminid)
                del t
                print '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip)
                processlog('autovote_agent', 1, 'autovote', '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip))

                # 将代理ip写入队列
                redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
            else:
                print 'IP: %s , 返回值长度为: %s, 不是有效代理!, 从代理列表剔除!' % (proxy_ip, len(ret.text))
                processlog('autovote_agent', 1, 'autovote', 'IP: %s , 返回值长度为: %s, 不是有效代理!, 从代理列表剔除!' % (proxy_ip, len(ret.text)))
                # 删除autovote_ipinterval hash表
                redis_cursor.hdel(pre_system + 'ipinterval_' + postid_commentid, proxy_ip)
        elif ret.status_code == 429:
            # 10秒之后才能继续采集
            print '%s 顶贴太频繁!' % proxy_ip
            nxt_time = int(time.mktime(time.localtime(time.time() + int(1) * 10)))
            redis_cursor.hset(pre_system + 'ipinterval_' + postid_commentid, proxy_ip, nxt_time)
        else:
            print '顶贴失败! 状态码: %s IP: %s' % (ret.status_code, proxy_ip)
            processlog('autovote_agent', 1, 'autovote', '顶贴失败! 状态码: %s IP: %s' % (ret.status_code, proxy_ip))
            # 删除autovote_ipinterval hash表
            redis_cursor.hdel(pre_system + 'ipinterval_' + postid_commentid, proxy_ip)

        # 将commentid写回队列
        redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
        return 1
    except Exception, ex:
        print ex
        processlog('autovote_agent', 0, 'autovote', str(ex))
        # 将代理ip写入队列
        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)

        # 将commentid写回队列
        redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
        return 0
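
A minimal sketch of how a work item for the loop above would be enqueued, inferred from the split('|') indexing at the top of autovote(); all concrete values are made up.

item = '|'.join([
    '123',                     # postid in system_url_posts
    '987654321',               # commentId to upvote
    'BPOQ8FD10001124J',        # docId (163) or docUrl (ifeng), hypothetical
    'comment.news.163.com',    # shorturl, selects the upvote endpoint
    '1',                       # adminid, used for the system_post_detail stats row
])
redis_cursor.rpush(pre_system + 'commentidqueque', item)
# mark the post as active so the '0' (stopped) check does not skip it
redis_cursor.hset(pre_system + 'commentidstatus', item, '1')
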
示例#30
0
def scrapy_hot_comments_news_163_com(docId, commentid, postid, shorturl, crttime):
    """


    """
    print "爬取%s,%s,%s,%s" % (docId, commentid, postid, shorturl)
    processlog(
        "autoscreenshot",
        1,
        "scrapy_hot_comments_news_163_com",
        "爬取docId:%s,commentid:%s, postid:%s, %s" % (docId, commentid, postid, shorturl),
    )
    page = 1
    inhot = False
    headers = {
        "Host": "comment.news.163.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    try:
        comment_url = (
            "http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/hotTopList?offset=0&limit=40&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc"
            % docId
        )
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = re.findall("getData\(\n(.*)\);", html, re.S | re.M)
        if ret:
            post = json.loads(ret[0])
            indexs = post["commentIds"]
            for commid in indexs:
                if str(commentid) in commid:
                    # 获取热门帖子索引 k
                    index = indexs.index(commid)
                    page = math.ceil((index + 1) / 10.0)
                    # 保存楼层信息到mysql
                    sql = "update system_url_posts set floor=%s where postid=%s"
                    mysql_cursor.execute(sql, index + 1, postid)
                    inhot = True

            if not inhot:
                print "热门帖中无法找到: %s" % postid
                processlog("autoscreenshot", 1, "scrapy_hot_comments_news_163_com", "热门帖中无法找到: %s" % postid)
                reWriteScreenShotQueue(postid, docId, shorturl, commentid, crttime)
                return

            comments = post["comments"]
            for k, v in comments.items():
                commentId = v["commentId"]
                if int(commentId) == int(commentid):
                    # 截图
                    res = OutPutImg(postid, "163", postid, page)
                    if not res:
                        print "截图失败"
                        processlog("autoscreenshot", 1, "scrapy_hot_comments_news_163_com", "截图False")
                        reWriteScreenShotQueue(postid, docId, shorturl, commentid, crttime)
                        return
                    print "截图完成: %s !" % postid
                    processlog("autoscreenshot", 1, "scrapy_hot_comments_news_163_com", "截图完成: %s " % postid)
                # print 'commentid找不到匹配'
                # processlog('autoscreenshot', 1, 'scrapy_hot_comments_news_163_com', 'commentid找不到匹配: %s ' % commentid)
        del ret
    except Exception, ex:
        print ex
        processlog("autoscreenshot", 0, "scrapy_hot_comments_news_163_com", str(ex))
        reWriteScreenShotQueue(postid, docId, shorturl, commentid, crttime)
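
The floor and page arithmetic above is easy to misread, so here is a small worked example, assuming (from the math.ceil(... / 10.0) call) that the hot-comment list shows 10 entries per page.

import math

index = 23                             # hypothetical position of the comment in post['commentIds']
floor = index + 1                      # 24, the value written to system_url_posts.floor
page = math.ceil((index + 1) / 10.0)   # 3.0, the hot-comment page passed to OutPutImg
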
示例#31
0
def scrapy_hot_comments_news_163_com(docId, commentid, postid, shorturl,
                                     crttime):
    '''
    Locate the given comment in the NetEase News hot-comment list for docId,
    save its floor number to system_url_posts, and screenshot the page it is on.
    '''
    print '爬取%s,%s,%s,%s' % (docId, commentid, postid, shorturl)
    processlog(
        'autoscreenshot', 1, 'scrapy_hot_comments_news_163_com',
        '爬取docId:%s,commentid:%s, postid:%s, %s' %
        (docId, commentid, postid, shorturl))
    page = 1
    inhot = False
    headers = {
        'Host': 'comment.news.163.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    try:
        comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/hotTopList?offset=0&limit=40&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc' % docId
        req = requests.get(comment_url, headers=headers, timeout=timeout)
        html = req.text
        ret = re.findall('getData\(\n(.*)\);', html, re.S | re.M)
        if ret:
            post = json.loads(ret[0])
            indexs = post['commentIds']
            for commid in indexs:
                if str(commentid) in commid:
                    # 获取热门帖子索引 k
                    index = indexs.index(commid)
                    page = math.ceil((index + 1) / 10.0)
                    # 保存楼层信息到mysql
                    sql = 'update system_url_posts set floor=%s where postid=%s'
                    mysql_cursor.execute(sql, index + 1, postid)
                    inhot = True

            if not inhot:
                print '热门帖中无法找到: %s' % postid
                processlog('autoscreenshot', 1,
                           'scrapy_hot_comments_news_163_com',
                           '热门帖中无法找到: %s' % postid)
                reWriteScreenShotQueue(postid, docId, shorturl, commentid,
                                       crttime)
                return

            comments = post['comments']
            for k, v in comments.items():
                commentId = v['commentId']
                if int(commentId) == int(commentid):
                    # 截图
                    res = OutPutImg(postid, '163', postid, page)
                    if not res:
                        print '截图失败'
                        processlog('autoscreenshot', 1,
                                   'scrapy_hot_comments_news_163_com',
                                   '截图失败')
                        reWriteScreenShotQueue(postid, docId, shorturl,
                                               commentid, crttime)
                        return
                    print '截图完成: %s !' % postid
                    processlog('autoscreenshot', 1,
                               'scrapy_hot_comments_news_163_com',
                               '截图完成: %s ' % postid)
                # print 'commentid找不到匹配'
                # processlog('autoscreenshot', 1, 'scrapy_hot_comments_news_163_com', 'commentid找不到匹配: %s ' % commentid)
        del ret
    except Exception, ex:
        print ex
        processlog('autoscreenshot', 0, 'scrapy_hot_comments_news_163_com',
                   str(ex))
        reWriteScreenShotQueue(postid, docId, shorturl, commentid, crttime)
示例#32
0
def seturlstatus(urlid, adminid):
    # 设置url的状态
    sql = 'update system_url_list set `status`=1 where `urlid`=%s and `adminid`=%s'
    mysql_cursor.execute(sql, urlid, adminid)
    processlog('autovote_main', 1, 'seturlstatus', '设置url的状态,urlid:%s' % urlid)
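
A trivial usage sketch of the helper above; both ids are hypothetical.

seturlstatus(42, 1)   # marks urlid 42, owned by adminid 1, as active (status=1)
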
示例#33
0
    fp.close()

    config_json = json.loads(config_data)['config']

    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']

    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('autovote_main', 0, 'config', str(ex))
    sys.exit(-1)

# 链接redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# 链接mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user,
                                 password=mysql_pass)


def getuserlist(siteid):
    # 获取内部帐号列表
    userlist = []
    sql = 'select username from system_site_user where `siteid`=%s'
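
The truncated fragment above begins by reading its settings from a JSON blob via json.loads(config_data)['config']. Below is a hedged sketch of the shape that blob needs; the open()/read() that produce config_data are not shown in the fragment, and every value here is a placeholder.

import json

config_data = '''{
    "config": {
        "redis_host": "127.0.0.1",
        "redis_port": 6379,
        "mysql_host": "127.0.0.1",
        "mysql_db": "autovote",
        "mysql_user": "dbuser",
        "mysql_pass": "dbpass",
        "pre_system": "autovote_",
        "http_port": 8888
    }
}'''
config_json = json.loads(config_data)['config']   # same access pattern as the fragment above
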
示例#34
0
def seturlstatus(urlid, adminid):
    # 设置url的状态
    sql = 'update system_url_list set `status`=1 where `urlid`=%s and `adminid`=%s'
    mysql_cursor.execute(sql, urlid, adminid)
    processlog('autovote_main', 1, 'seturlstatus', '设置url的状态,urlid:%s' % urlid)
示例#35
0
def autovote():
    # 获取commentid
    postid_commentid = redis_cursor.lpop(pre_system + 'commentidqueque')
    if not postid_commentid:
        print '%s null' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        return 0

    postid = postid_commentid.split('|')[0]
    commentId = postid_commentid.split('|')[1]
    docId = postid_commentid.split('|')[2]
    shorturl = postid_commentid.split('|')[3]
    adminid = postid_commentid.split('|')[4]

    processlog(
        'autovote_agent', 1, 'autovote',
        '爬取:postid:%s, commentid:%s, docid:%s, shorturl: %s' %
        (postid, commentId, docId, shorturl))

    # TODO 修改完成判断为截图完成
    # 搜索截图完成队列中是否含有本条帖子
    # if redis_cursor.hexists(pre_system + 'complete', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl)):

    #     return 0

    # 获取代理ip 如果没有代理ip则为127.0.0.1
    proxy_ip = redis_cursor.lpop(pre_system + 'proxylist')
    if not proxy_ip:
        proxy_ip = '127.0.0.1'

    # 判断帖子现在是否是开启状态
    ret = redis_cursor.hget(pre_system + 'commentidstatus', postid_commentid)
    if ret == '0':
        # 帖子处于停止状态
        print '%s 帖子:%s处于停止状态!' % (time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime()), postid)
        processlog(
            'autovote_agent', 1, 'autovote', '%s 帖子:%s处于停止状态!' %
            (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), postid))
        # # 将commentid写回队列
        # redis_cursor.rpush(pre_system + 'commentidqueque', '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))

        # 将代理ip写入队列
        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
        return 0

    # 判断此代理ip是否能够访问
    ret = redis_cursor.hget(pre_system + 'ipinterval_' + postid_commentid,
                            proxy_ip)
    if ret:
        t = int(time.mktime(datetime.datetime.now().timetuple()))
        if t <= int(ret):
            # 将代理ip写入队列
            redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)

            # 将commentid写回队列
            redis_cursor.rpush(
                pre_system + 'commentidqueque',
                '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
            return 0

    # 判断次数有没有顶贴完成
    sql = 'select `count`, `maxcount` from system_url_posts where `postid`=%s'
    ret = mysql_cursor.query(sql, postid)
    if ret:
        count_now = int(ret[0]['count'])
        count_max = int(ret[0]['maxcount'])
        if count_now >= count_max:
            # 设置状态postid状态为已完成
            sql = 'update system_url_posts set `status`=3 where `postid`=%s'
            mysql_cursor.execute(sql, postid)

            # 删除本条postid对应的hash表
            redis_cursor.delete(pre_system + 'ipinterval_' + postid_commentid)

            # 已经采集完成 删除status信息
            redis_cursor.hdel(pre_system + 'commentidstatus', postid_commentid)

            # 将代理ip写入队列
            redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)

            # 删除autovote_commentidadded 表中的记录, 以免在顶贴目标完成之后再次增加次数时无法再进行添加
            redis_cursor.hdel(pre_system + 'commentidadded', postid_commentid)
            print 'Postid: %s 已经达到顶贴目标, 无需再加入队列: [%s]' % (
                postid, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
            processlog(
                'autovote_agent', 1, 'autovote',
                'Postid: %s 已经达到顶贴目标, 无需再加入队列: [%s]' %
                (postid, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
            # # TODO 截图
            # # 方案二:如果顶帖完成,保存到redis一个完成的列表,截图程序轮询这个列表
            # redis_cursor.lpush(pre_system + 'postidfinished', postid + "|" + shorturl)

            # if shorturl in 'comment.news.163.com':
            #     urltype = '163'
            # if shorturl in 'gentie.ifeng.com':
            #     urltype = 'ifeng'

            # try:
            #     sql = 'select url from system_url_post as posts, system_url_list as list where postid=%s and post.urlid=list.urlid'
            #     url_img = mysql_cursor.query(sql, postid)[0]['url']
            #     OutPutImg(url, urltype, postid)

            # except Exception,e:
            #     print '截图错误:%s' % str(e)
            return 0
    else:
        print '没有记录 [%s]' % time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime())
        processlog(
            'autovote_agent', 1, 'autovote',
            '没有记录 [%s]' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        return 0

    try:
        if shorturl in 'comment.news.163.com':
            url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/comments/%s/action/upvote?ibc=newspc' % (
                docId, commentId)

        if shorturl in 'gentie.ifeng.com':
            url = 'http://comment.ifeng.com/vote.php?callback=recmCallback&cmtId=%s&job=up&docUrl=%s&callback=recmCallback&format=js' % (
                commentId, docId)

        headers = {
            'Host': '%s' % shorturl,
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest'
        }

        # 网易新闻参数
        data = {'ibc': 'newspc'}

        if proxy_ip == '127.0.0.1':
            if shorturl in 'comment.news.163.com':
                ret = requests.post(url, data=data, headers=headers, timeout=2)
            if shorturl in 'gentie.ifeng.com':
                ret = requests.get(url, headers=headers, timeout=2)
        else:
            try:
                proxies = {'http': 'http://' + proxy_ip}
                # 网易新闻
                if shorturl in 'comment.news.163.com':
                    ret = requests.post(url,
                                        data=data,
                                        proxies=proxies,
                                        headers=headers,
                                        timeout=2)
                # 凤凰新闻
                if shorturl in 'gentie.ifeng.com':
                    ret = requests.get(url, headers=headers, timeout=2)
            except requests.RequestException:
                # 判断ip发生异常的次数  超过三次则移除IP
                timeout_count = redis_cursor.hget(
                    pre_system + 'iptimeoutcount', proxy_ip)
                print 'timeout_count: %s' % timeout_count
                if timeout_count:
                    if int(timeout_count) > 2:
                        print 'IP: %s 发生异常, 异常次数: %s  移除IP!' % (
                            proxy_ip, int(timeout_count))
                        processlog(
                            'autovote_agent', 1, 'autovote',
                            'IP: %s 发生异常, 异常次数: %s  移除IP!' %
                            (proxy_ip, int(timeout_count)))
                        # 删除autovote_ipinterval hash表
                        redis_cursor.hdel(
                            pre_system + 'ipinterval_' + postid_commentid,
                            proxy_ip)
                        redis_cursor.hdel(pre_system + 'iptimeoutcount',
                                          proxy_ip)
                    else:
                        print 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip,
                                                           int(timeout_count))
                        processlog(
                            'autovote_agent', 1, 'autovote',
                            'IP: %s 发生异常, 异常次数: %s !' %
                            (proxy_ip, int(timeout_count)))
                        redis_cursor.hset(pre_system + 'iptimeoutcount',
                                          proxy_ip,
                                          int(timeout_count) + 1)
                        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                else:
                    print 'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, 1)
                    processlog('autovote_agent', 1, 'autovote',
                               'IP: %s 发生异常, 异常次数: %s !' % (proxy_ip, 1))
                    redis_cursor.hset(pre_system + 'iptimeoutcount', proxy_ip,
                                      1)
                    redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                # 将commentid写回队列
                redis_cursor.rpush(
                    pre_system + 'commentidqueque',
                    '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
                return 1
        if ret.status_code == 200:
            if len(ret.text) <= 60:
                # 删除之前timeout的次数
                redis_cursor.hdel(pre_system + 'iptimeoutcount', proxy_ip)
                # 一分钟之后才能继续采集
                if shorturl in 'comment.news.163.com':
                    nxt_time = int(
                        time.mktime(time.localtime(time.time() + int(1) * 60)))
                # 时间间隔5秒钟
                if shorturl in 'gentie.ifeng.com':
                    nxt_time = int(
                        time.mktime(time.localtime(time.time() + int(1) * 5)))
                redis_cursor.hset(
                    pre_system + 'ipinterval_' + postid_commentid, proxy_ip,
                    nxt_time)
                if shorturl in 'gentie.ifeng.com':
                    if 'alert' not in ret.text:
                        sql = 'update system_url_posts set `count`=`count`+1 where `postid`=%s'
                        mysql_cursor.execute(sql, postid)
                        print '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip)
                        processlog(
                            'autovote_agent', 1, 'autovote',
                            '%s 成功顶贴一次! 顶贴IP: %s' %
                            (time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime()), proxy_ip))
                    else:
                        print '%s 凤凰新闻顶贴时间过快!' % time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.localtime())
                        processlog(
                            'autovote_agent', 1, 'autovote',
                            '%s 凤凰新闻顶贴时间过快!' % time.strftime(
                                '%Y-%m-%d %H:%M:%S', time.localtime()))
                    # 将commentid写回队列
                    redis_cursor.rpush(
                        pre_system + 'commentidqueque',
                        '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
                    # 将代理ip写入队列
                    redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
                    return 1
                if shorturl in 'comment.news.163.com':
                    sql = 'update system_url_posts set `count`=`count`+1 where `postid`=%s'
                    mysql_cursor.execute(sql, postid)

                # 写入统计表
                sql = 'insert into system_post_detail (`postTime`, `count`, `adminid`) values (%s, 1, %s)'
                t = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                mysql_cursor.execute(sql, t, adminid)
                del t
                print '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip)
                processlog(
                    'autovote_agent', 1, 'autovote',
                    '%s 成功顶贴一次! 顶贴IP: %s' % (time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime()), proxy_ip))

                # 将代理ip写入队列
                redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)
            else:
                print 'IP: %s , 返回值长度为: %s, 不是有效代理!, 从代理列表剔除!' % (
                    proxy_ip, len(ret.text))
                processlog(
                    'autovote_agent', 1, 'autovote',
                    'IP: %s , 返回值长度为: %s, 不是有效代理!, 从代理列表剔除!' %
                    (proxy_ip, len(ret.text)))
                # 删除autovote_ipinterval hash表
                redis_cursor.hdel(
                    pre_system + 'ipinterval_' + postid_commentid, proxy_ip)
        elif ret.status_code == 429:
            # 10秒之后才能继续采集
            print '%s 顶贴太频繁!' % proxy_ip
            nxt_time = int(
                time.mktime(time.localtime(time.time() + int(1) * 10)))
            redis_cursor.hset(pre_system + 'ipinterval_' + postid_commentid,
                              proxy_ip, nxt_time)
        else:
            print '顶贴失败! 状态码: %s IP: %s' % (ret.status_code, proxy_ip)
            processlog('autovote_agent', 1, 'autovote',
                       '顶贴失败! 状态码: %s IP: %s' % (ret.status_code, proxy_ip))
            # 删除autovote_ipinterval hash表
            redis_cursor.hdel(pre_system + 'ipinterval_' + postid_commentid,
                              proxy_ip)

        # 将commentid写回队列
        redis_cursor.rpush(
            pre_system + 'commentidqueque',
            '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
        return 1
    except Exception, ex:
        print ex
        processlog('autovote_agent', 0, 'autovote', str(ex))
        # 将代理ip写入队列
        redis_cursor.rpush(pre_system + 'proxylist', proxy_ip)

        # 将commentid写回队列
        redis_cursor.rpush(
            pre_system + 'commentidqueque',
            '%s|%s|%s|%s' % (postid, commentId, docId, shorturl))
        return 0
示例#36
0
def scrapy_comment_user_163(username, adminid, address):
    '''
    网易用户所有跟贴的爬取
    '''
    username_decode = base64.b64decode(username)
    siteid = mysql_cursor.query(
        'select siteid from system_site_list where shorturl="comment.news.163.com"'
    )[0]['siteid']
    # 判断用户是否存在
    sql = 'select userid,adminid from system_site_user where siteid=%s and username=%s '
    r = mysql_cursor.query(sql, int(siteid), username_decode)
    if r:
        if int(adminid) != int(r[0]['adminid']):
            print '网站帐号存在,添加人不匹配'
            processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163',
                       '网站帐号存在,添加人不匹配,现:%s, 原:%s' % (adminid, r[0]['adminid']))
            return
        userid = r[0]['userid']
        setAddressStatus(userid, 1)
    else:
        processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163',
                   '网站帐号不存在,添加:%s,userid:%s' % (username, adminid))

        crc32_address = crc32(address) & 0xffffffff
        sql = 'insert into system_site_user(`siteid`, `username`,`createtime`, `adminid`, `address`,  `crc32address`, `status`) values(%s, %s, now(), %s, %s, %s, 1)'
        userid = mysql_cursor.execute_lastrowid(sql, siteid, username_decode,
                                                adminid, address,
                                                crc32_address)

    headers = {
        'Host': 'comment.news.163.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    # 默认爬取 scrapy_page 页(当前为4页)
    for page in xrange(scrapy_page):
        url = 'http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/users/0/comments?username=%s&offset=%s&limit=30&ibc=newspc' % (
            username, page * 30)  # offset is counted in comments (limit=30 per page), not in pages

        req = requests.get(url, headers=headers, timeout=timeout)
        if req.status_code == 200:
            data = json.loads(req.text)

            threads = data['threads']
            urllist = []

            for k, v in threads.items():
                param = {}

                _url = v['url']
                # 判断url是否支持
                res = r'://(.*?)/'
                ret = re.findall(res, _url)
                if ret:
                    shorturl = ret[0]
                    if shorturl in [
                            'news.163.com',
                    ]:
                        boardId = v['boardId']
                        param['docId'] = v['docId']
                        param['title'] = v['title']
                        param[
                            'url'] = 'http://comment.news.163.com/' + boardId + '/' + v[
                                'docId'] + '.html'
                        urllist.append(param)
                else:
                    processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163',
                               'url不支持:%s' % _url)

            comments = data['comments']

            for k, v in comments.items():
                url_post = ''
                title = ''
                for u in urllist:
                    if u['docId'] == k.split('_')[0]:
                        url_post = u['url']
                        title = u['title']
                buildLevel = v['buildLevel']
                # 判断是否含有nickname, 是否是最外层的评论
                if url_post and title and v['user'].has_key(
                        'nickname') and buildLevel == 1:
                    nickname = v['user']['nickname']
                    commentId = v['commentId']
                    createTime = v['createTime']
                    content = v['content'].encode('utf8')

                    #判断帖子是否保存过
                    sql = 'select postid from system_url_posts where `commentIds`=%s and createTime=%s and `adminid`=%s'
                    r = mysql_cursor.query(sql, commentId, createTime, adminid)
                    if not r:
                        #判断url是否添加过
                        crc32_url = crc32(url_post) & 0xffffffff
                        sql = 'select urlid from system_url_list where `crc32url`=%s and `adminid`=%s'
                        ret = mysql_cursor.query(sql, crc32_url, adminid)
                        if ret:  #添加过
                            urlid = ret[0]['urlid']
                        else:
                            sql = 'insert into system_url_list(`siteid`, `title`, `url`, `crc32url`, `addtime`, `status`, `adminid`) values(%s,%s,%s,%s,now(),1, %s)'
                            urlid = mysql_cursor.execute_lastrowid(
                                sql, siteid, title, url_post, crc32_url,
                                adminid)

                            processlog('auto_scrapyuser', 1,
                                       'scrapy_comment_user_163',
                                       'url未添加过,添加url,urlid:%s' % urlid)
                        #保存帖子
                        try:
                            sql = 'insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`'\
                                  ', `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)'
                            postid = mysql_cursor.execute_lastrowid(
                                sql, urlid, userid, commentId, content,
                                nickname, createTime, adminid)
                            print '保存帖子: %s; postid :%s ; adminid : %s' % (
                                nickname, postid, adminid)
                            processlog(
                                'auto_scrapyuser', 1,
                                'scrapy_comment_user_163',
                                '保存帖子: %s; postid :%s ; adminid : %s' %
                                (nickname, postid, adminid))

                        except Exception, e:
                            # 有的字符集无法保存
                            if 'Incorrect string value:' in str(e):
                                print '存在表情,无法保存content, nickname:%s' % nickname
                                processlog(
                                    'auto_scrapyuser', 0,
                                    'scrapy_comment_user_163',
                                    '存在表情,无法保存content, nickname:%s' % nickname)

                            elif 'Data too long for column' in str(e):
                                processlog('auto_scrapyuser', 1,
                                           'scrapy_comment_user_163',
                                           '帖子内容过长,重新截取写入,urlid:%s' % urlid)
                                content = content[:255]
                                sql = 'insert into system_url_posts(`urlid`, `userid`, `commentIds`, `content`, `nickname`'\
                                  ', `createTime`, `adminid`) values(%s,%s,%s,%s,%s,%s,%s)'
                                postid = mysql_cursor.execute_lastrowid(
                                    sql, urlid, userid, commentId, content,
                                    nickname, createTime, adminid)
                                print '保存帖子: %s; postid :%s ; adminid : %s' % (
                                    nickname, postid, adminid)
                                processlog(
                                    'auto_scrapyuser', 1,
                                    'scrapy_comment_user_163',
                                    '保存帖子: %s; postid :%s ; adminid : %s' %
                                    (nickname, postid, adminid))

                            else:
                                print e
                                processlog('auto_scrapyuser', 0,
                                           'scrapy_comment_user_163', str(e))
                            # 更新site_user状态
                            setAddressStatus(userid, 0)
                    else:
                        print '帖子保存过:postid:%s' % r[0]['postid']
                        # processlog('auto_scrapyuser', 1, 'scrapy_comment_user_163', '帖子保存过:postid:%s' % r[0]['postid'])

            #如果到最后一页,退出循环
            total = data['total']
            if (page + 1) * 30 >= total:
                break
        else:
            print req.text
示例#37
0
def main():
    while 1:
        urlinfo = redis_cursor.rpop(pre_system + 'urlqueque')
        # urlinfo = '3|1970-01-01 23:59:00|http://news.sina.com.cn/c/gat/2016-04-05/doc-ifxqxcnr5291732.shtml|1'  # urlid|lastcreateTime|url|adminid
        if urlinfo:
            urlinfo = urlinfo.split('|')
            urlid = urlinfo[0]
            lastcreateTime = urlinfo[1]
            url = urlinfo[2]
            adminid = urlinfo[3]
            res = r'://(.*?)/'
            ret = re.findall(res, url)
            if ret:
                shorturl = ret[0]
                # 通过shorturl来获取siteid
                siteid = ''
                sql = 'select siteid from system_site_list where `shorturl`=%s'
                r = mysql_cursor.query(sql, shorturl)
                if r:
                    siteid = r[0]['siteid']
                    del r
                else:
                    print '没有siteid'
                    del r
                    time.sleep(2)
                    continue
                if shorturl in ['comment.news.163.com']:
                    docId = re.findall(r'(.*?)\.html', url.split('/')[-1])[0]
                    scrapy_comment_news_163_com(docId, urlid, lastcreateTime, siteid, url, adminid)
                    print '%s 扫描完成!' % url
                    processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url)
                    time.sleep(2)
                    continue
                if shorturl in ['gentie.ifeng.com']:
                    docUrl = urllib2.unquote(re.findall(r'docUrl=(.*?\.shtml)', url)[0])
                    scrapy_comment_ifeng_com(docUrl, urlid, siteid, url, adminid)
                    print '%s 扫描完成!' % url
                    processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url)
                    time.sleep(2)
                    continue
                if shorturl in ['news.sina.com.cn']:
                    channel_newsid = news_sina_com_cn_getNewsid(url)
                    if channel_newsid == 'error':
                        print '%s 获取帖子url错误, url: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), url)
                        processlog('autovote_main', 1, 'main', '%s 获取帖子url错误, url: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), url))
                        time.sleep(2)
                        continue
                    channel = channel_newsid['channel']
                    newsid = channel_newsid['newsid']
                    scrapy_news_sina_com_cn(channel, newsid, urlid, siteid, adminid)
                    print '%s 扫描完成!' % url
                    processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url)
                    time.sleep(2)
                    continue
                else:
                    print '网站还不支持'
                    processlog('autovote_main', 1, 'main', '网站还不支持:%s' % url)
            else:
                print 'url wrong, %s' % url
                processlog('autovote_main', 1, 'main', 'url wrong, %s' % url)
        else:
            print '^sleep 10 seconds^'

        time.sleep(10)
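
For completeness, a hedged sketch of what the dispatcher above pops from urlqueque: four pipe-separated fields, matching the urlinfo indexing at the top of main(). The article URL and ids are hypothetical.

urlinfo = '|'.join([
    '3',                      # urlid in system_url_list
    '1970-01-01 23:59:00',    # lastcreateTime of the newest comment already saved
    'http://comment.news.163.com/news_guonei8_bbs/BPOQ8FD10001124J.html',   # hypothetical article comment page
    '1',                      # adminid
])
redis_cursor.lpush(pre_system + 'urlqueque', urlinfo)   # main() drains the queue with rpop
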
示例#38
0
    fp.close()

    config_json = json.loads(config_data)["config"]

    redis_host = config_json["redis_host"]
    redis_port = config_json["redis_port"]
    mysql_host = config_json["mysql_host"]
    mysql_db = config_json["mysql_db"]
    mysql_user = config_json["mysql_user"]
    mysql_pass = config_json["mysql_pass"]

    pre_system = config_json["pre_system"]
    serverport = config_json["http_port"]
except Exception, ex:
    print ex
    processlog("auto_scrapyuser", 0, "config", str(ex))
    sys.exit(-1)


# 链接redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# 链接mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user, password=mysql_pass)

# 爬取页数
scrapy_page = 4
# 网易从个人页面获取所有帖子

示例#39
0
    fp.close()

    config_json = json.loads(config_data)['config']

    redis_host = config_json['redis_host']
    redis_port = config_json['redis_port']
    mysql_host = config_json['mysql_host']
    mysql_db = config_json['mysql_db']
    mysql_user = config_json['mysql_user']
    mysql_pass = config_json['mysql_pass']

    pre_system = config_json['pre_system']
    serverport = config_json['http_port']
except Exception, ex:
    print ex
    processlog('autovote_agent', 0, 'config', str(ex))
    sys.exit(-1)

# 链接redis
pool = redis.ConnectionPool(host=redis_host, port=redis_port)
redis_cursor = redis.Redis(connection_pool=pool)

# 链接mysql
mysql_cursor = torndb.Connection(mysql_host, mysql_db, user=mysql_user,
                                 password=mysql_pass)


def autovote():
    # 获取commentid
    postid_commentid = redis_cursor.lpop(pre_system + 'commentidqueque')
    if not postid_commentid:
示例#40
0
def main():
    while 1:
        urlinfo = redis_cursor.rpop(pre_system + 'urlqueque')
        # urlinfo = '3|1970-01-01 23:59:00|http://news.sina.com.cn/c/gat/2016-04-05/doc-ifxqxcnr5291732.shtml|1'  # urlid|lastcreateTime|url|adminid
        if urlinfo:
            urlinfo = urlinfo.split('|')
            urlid = urlinfo[0]
            lastcreateTime = urlinfo[1]
            url = urlinfo[2]
            adminid = urlinfo[3]
            res = r'://(.*?)/'
            ret = re.findall(res, url)
            if ret:
                shorturl = ret[0]
                # 通过shorturl来获取siteid
                siteid = ''
                sql = 'select siteid from system_site_list where `shorturl`=%s'
                r = mysql_cursor.query(sql, shorturl)
                if r:
                    siteid = r[0]['siteid']
                    del r
                else:
                    print '没有siteid'
                    del r
                    time.sleep(2)
                    continue
                if shorturl in ['comment.news.163.com']:
                    docId = re.findall(r'(.*?)\.html', url.split('/')[-1])[0]
                    scrapy_comment_news_163_com(docId, urlid, lastcreateTime,
                                                siteid, url, adminid)
                    print '%s 扫描完成!' % url
                    processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url)
                    time.sleep(2)
                    continue
                if shorturl in ['gentie.ifeng.com']:
                    docUrl = urllib2.unquote(
                        re.findall(r'docUrl=(.*?\.shtml)', url)[0])
                    scrapy_comment_ifeng_com(docUrl, urlid, siteid, url,
                                             adminid)
                    print '%s 扫描完成!' % url
                    processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url)
                    time.sleep(2)
                    continue
                if shorturl in ['news.sina.com.cn']:
                    channel_newsid = news_sina_com_cn_getNewsid(url)
                    if channel_newsid == 'error':
                        print '%s 获取帖子url错误, url: %s' % (time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.localtime()), url)
                        processlog(
                            'autovote_main', 1, 'main',
                            '%s 获取帖子url错误, url: %s' % (time.strftime(
                                '%Y-%m-%d %H:%M:%S', time.localtime()), url))
                        time.sleep(2)
                        continue
                    channel = channel_newsid['channel']
                    newsid = channel_newsid['newsid']
                    scrapy_news_sina_com_cn(channel, newsid, urlid, siteid,
                                            adminid)
                    print '%s 扫描完成!' % url
                    processlog('autovote_main', 1, 'main', '%s 扫描完成!' % url)
                    time.sleep(2)
                    continue
                else:
                    print '网站还不支持'
                    processlog('autovote_main', 1, 'main', '网站还不支持:%s' % url)
            else:
                print 'url wrong, %s' % url
                processlog('autovote_main', 1, 'main', 'url wrong, %s' % url)
        else:
            print '^sleep 10 seconds^'

        time.sleep(10)