Example #1
        def get_index_2(url):  # unlike the function above, this collects the links of the individual posts inside one board
            next_page_has_visited = 0
            while True:
                response1 = get_response_and_text(url=url,
                                                  headers=self.headers)
                response_in_function_text = response1[
                    'response_in_function_text']
                try:
                    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
                    for content in datasoup.select(
                            '.content > table > tr')[1:-1]:
                        title = content.select(
                            'td.title > a[title]')[0].text.strip()  #title
                        url = 'http://bbs.csdn.net' + content.select(
                            'td.title > a[title]')[0].get('href').strip()
                        publish_user = content.select('td.tc a[title]')[0].get(
                            'title').strip()  #publish_user
                        # content.select('td.tc a[title]')[0].get('href')#publish_user_href
                        reply_count = content.select(
                            'td:nth-of-type(4)')[0].text  # reply_count
                        this_nodes = {
                            'url': url,
                            'publish_user': publish_user,
                            'title': title,
                            'reply_count': reply_count,
                            'spider_time': datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'),
                            'id': url.split('/')[-1],
                            'like_count': 0,  # the board does not expose this at all
                            'content': None,
                            'reproduce_count': 0,
                            'publish_user_photo': '',
                            'reply_nodes': []
                        }
                        self.content_data_list.append(this_nodes)

                    next_page_url = datasoup.select('a.next')
                    if next_page_url and next_page_has_visited < 10:
                        next_page_has_visited += 1
                        url_next = 'http://bbs.csdn.net' + next_page_url[
                            0].get('href')
                        url = url_next
                    else:
                        break

                except Exception as e:
                    # print e
                    # parsing failed for this page; stop instead of retrying the same URL forever
                    break
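Every example on this page calls a helper named get_response_and_text that is not shown. Judging only from how its return value is used, it wraps an HTTP GET and returns a dict holding the response object and its decoded text. A minimal sketch of such a wrapper, assuming the requests library and ignoring the extra needupdate / update_info / charset parameters that Example #9 passes, might look like this (everything except the two dict keys is an assumption, not part of the original project):

import requests


def get_response_and_text(url, headers=None, **kwargs):
    # Hypothetical reconstruction: fetch the page and expose both the raw
    # response and its decoded text under the keys the examples read.
    response = requests.get(url, headers=headers, timeout=10)
    return {
        'response_in_function': response,
        'response_in_function_text': response.text,
    }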
Example #2
File: people.py  Project: possager/YFZX_new
        def get_content_inside(data):
            # content comes from the mobile page, publish time from the desktop page
            url_for_debug = data['url']

            response1 = get_response_and_text(url=url_for_debug)
            response2 = get_response_and_text(url=url_for_debug,
                                              headers=self.headers)
            try:
                datasoup2 = BeautifulSoup(
                    response2['response_in_function_text'], 'lxml')
                publish_time = datasoup2.select(
                    '.replayInfo .float_l.mT10')[0].text.split(u'\xa0')[-1]
                data['publish_time'] = publish_time
            except Exception as e:
                # print e
                # print 'mark1'
                pass

            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            try:
                datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            except Exception as e:
                return

            # content=datasoup.select('div.artCont .content-text')
            content = datasoup.select('div.artCont')
            if content:  # when nothing matches, content is an empty list
                content_text = content[0].text.strip()
                Re_find_img_url = re.compile(r'src="(.*?)"')
                img_list = Re_find_img_url.findall(str(content[0]))
                if img_list:
                    for i in range(len(img_list)):
                        if 'http' not in img_list[i]:
                            img_list[
                                i] = 'http://bbs1.people.com.cn' + img_list[i]
                            # print img_list[i]
                else:
                    img_list = []
                data['content'] = content_text
                data['img_urls'] = img_list

                self.comments_data_list.append(data)
Example #3
 def get_index_1():  # collect the links of all sub-forums / boards of this forum
     while True:
         try:  # response_in_function_text is sometimes garbled and sometimes not, so retry until it parses
             url_rukou = 'http://bbs.csdn.net/home'
             response1 = get_response_and_text(url=url_rukou,
                                               headers=self.headers)
             response_in_function_text = response1[
                 'response_in_function_text']
             datasoup = BeautifulSoup(response_in_function_text, 'lxml')
             for a in datasoup.select('.dropdown-menu a[href]'):
                 url_bankuai = 'http://bbs.csdn.net' + a.get('href')
                 # print url_bankuai
                 self.index_data_list.append(url_bankuai)
             if self.index_data_list:
                 break
         except Exception as e:
             # print e
             pass
Example #4
File: people.py  Project: possager/YFZX_new
        def get_comment_inside(data):
            comment_list = []
            error_time = 5
            page_num = 1
            while True:
                try:
                    comment_url = 'http://bbs1.people.com.cn/mobile.do?action=moreComment&threadId=' + str(
                        data['id']) + '&pageNo=' + str(page_num)
                    response1 = get_response_and_text(url=comment_url)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']
                    datajson = json.loads(response_in_function_text)
                    if not datajson['elements']:
                        break
                    for i in datajson['elements']:
                        id = i['id']
                        title = i['title']
                        publish_user = i['userNick']

                        one_comment = {
                            'id': id,
                            'content': title,
                            'publish_user': publish_user,
                            'parent_id': data['id'],
                            'ancestor_id': data['id']
                        }
                        comment_list.append(one_comment)
                    page_num += 1
                except Exception as e:
                    error_time -= 1
                    if error_time < 0:
                        break
                    time.sleep(5)

            data['reply_nodes'] = comment_list
            self.result_data_list.append(data)
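The moreComment endpoint above is consumed as JSON; reconstructed from the three fields the loop reads, a page of the response presumably looks roughly like the sketch below. The values are placeholders, not real API output:

# Rough shape implied by the code above (placeholder values, not a real response):
expected_page = {
    'elements': [
        {'id': 12345, 'title': 'comment text', 'userNick': 'some user'},
        # ... one page of comments; an empty list ends the loop
    ],
}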
Example #5
        def get_content_inside(data):
            url_debug = data['url'] + '?page=1'

            while True:
                response1 = get_response_and_text(url=url_debug,
                                                  headers=self.headers)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1[
                    'response_in_function_text']
                response_in_function_text = handleContent(
                    response_in_function_text)
                datasoup = BeautifulSoup(response_in_function_text, 'lxml')

                page_begain = 0

                if not data['content']:  # marks whether the content / publish_user info still needs to be fetched
                    page_begain = 1
                    try:
                        content_div = datasoup.select(
                            'div.detailed table.post .post_body')[0]
                        content = content_div.text.strip()
                    except Exception as e:
                        # print e
                        # print data['url']
                        return
                    img_urls_content = Re_find_img_url.findall(
                        str(content_div))
                    publish_user_photo = datasoup.select(
                        'div.detailed table.post .user_info .user_head a img'
                    )[0].get('src')
                    publish_time = datasoup.select(
                        'div.detailed table.post .time')[0].text.strip().split(
                            '\n')[1].strip()

                    data['content'] = content
                    data['publish_user_photo'] = publish_user_photo
                    data['publish_time'] = publish_time
                    data['img_urls'] = img_urls_content

                for one_reply in datasoup.select(
                        'div.detailed table.post')[page_begain:]:
                    try:
                        j = one_reply.select('div.post_body')
                        img_urls = Re_find_img_url.findall(str(j))
                        img_urls2 = []
                        for img_url_maybe_have_js in img_urls:
                            if '.js' not in img_url_maybe_have_js:
                                img_urls2.append(img_url_maybe_have_js)

                        content = one_reply.select(
                            '.post_body')[0].text.strip()
                        publish_user_photo = one_reply.select(
                            '.user_info .user_head a img')[0].get(
                                'src')  #publish_user_photo
                        publish_time = one_reply.select(
                            '.time')[0].text.strip().split('\n')[1].strip()
                        louceng_url = one_reply.select('.fr a[href]')[0].get(
                            'href')
                        like_count = one_reply.select(
                            ' div.control .fr a.red')[0].text.split(
                                '[')[1].split(']')[0]
                        dislike_count = one_reply.select(
                            ' div.control .fr a.bury')[0].text.split(
                                '[')[1].split(']')[0]
                        publish_user = one_reply.select(
                            '.user_info .nickname span')[0].text
                        ancestor_id = data['id']
                        parent_id = data['id']
                        publish_user_id = louceng_url.split('post-')[1]
                        url = data['url'] + louceng_url

                        thisnode = {
                            'publish_user_photo': publish_user_photo,
                            'publish_time': publish_time,
                            'like_count': like_count,
                            'dislike_count': dislike_count,
                            'publish_user': publish_user,
                            'ancestor_id': ancestor_id,
                            'parent_id': parent_id,
                            'publish_user_id': publish_user_id,
                            'url': url,
                            'img_urls': img_urls2,
                            'content': content,
                            'id': louceng_url.split('-')[1]
                        }
                        data['reply_nodes'].append(thisnode)
                    except Exception as e:
                        # print e
                        pass

                next_page_div = datasoup.select('.page_nav .next')
                if next_page_div:
                    next_url = 'http://bbs.csdn.net' + next_page_div[0].get(
                        'href')
                    url_debug = next_url
                else:
                    self.result_data_list.append(data)
                    break
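Examples #5 and #7 rely on a module-level Re_find_img_url that is not shown here. Given the local pattern defined in Example #2 and the fact that the matches are used directly as image URLs, it is presumably the same non-greedy src pattern; a sketch of that assumption:

import re

# Assumed definition, mirroring the local one in Example #2; the capture group
# makes findall() return just the URL inside src="...".
Re_find_img_url = re.compile(r'src="(.*?)"')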
Example #6
File: people.py  Project: possager/YFZX_new
        def get_index(url):
            charge_to_stop = 1
            while True:
                error_num = 5  # again, requesting too frequently triggers EOF errors, so allow a few retries
                while True:
                    response1 = get_response_and_text(url=url)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']
                    # print response_in_function_text

                    try:
                        datajson = json.loads(response_in_function_text)
                        # reset error_num after every successful request
                        error_num = 5
                        if not datajson['elements']:
                            charge_to_stop = 0
                            break  # no more data; all pages have been fetched

                        for one_data in datajson['elements']:
                            title = one_data['title']
                            reply_count = one_data['replyCount']
                            publish_user = one_data['usernick']
                            read_count = one_data['readCount']
                            like_count = one_data['like']
                            url_index = one_data['url']
                            id = one_data['id']
                            publish_time = u'2017-' + one_data[
                                'createTime'].replace(u'月', u'-').replace(
                                    u'日', u'') + u':00'

                            this_index_info = {
                                'title': title,
                                'reply_count': reply_count,
                                'publish_user': publish_user,
                                'read_count': read_count,
                                'like_count': like_count,
                                'url': u'http://bbs1.people.com.cn' + url_index,
                                'id': id,
                                'publish_time': publish_time,
                                'reply_nodes': [],
                                'spider_time': datetime.datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S')
                            }
                            self.content_data_list.append(this_index_info)

                        urlsplit = url.split('pageNo=')
                        url = urlsplit[0] + 'pageNo=' + str(
                            int(urlsplit[1]) + 1)
                    except Exception as e:
                        error_num -= 1
                        if error_num < 0:
                            break
                        time.sleep(3)
                if charge_to_stop == 0:
                    break
                else:
                    url_split = response_in_function.url.split('pageNo=')
                    urlnext = url_split[0] + 'pageNo=' + str(
                        int(url_split[1]) + 1)
                    # get_index(url=urlnext)
                    url = urlnext
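Both loops above advance the page by splitting the URL on 'pageNo=' and rejoining it with the number incremented. A quick illustration with a hypothetical board URL (the path and the other query parameters are placeholders; only the pageNo handling matches the code):

# Hypothetical URL; note the trick assumes pageNo is the last query parameter.
url = 'http://bbs1.people.com.cn/mobile.do?action=board&fid=2&pageNo=1'
prefix, page = url.split('pageNo=')
url_next = prefix + 'pageNo=' + str(int(page) + 1)
# url_next == 'http://bbs1.people.com.cn/mobile.do?action=board&fid=2&pageNo=2'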
Example #7
        def get_content_inside(data):
            is_first = 0
            url_debug = data['url']
            # url_debug='http://www.chengshiluntan.com/5942261-1.html'
            # url_debug='http://www.chengshiluntan.com/7561-1.html'
            # url_debug='http://www.chengshiluntan.com/731-1.html'
            # url_debug='http://www.chengshiluntan.com/5070-1.html'
            # url_debug='http://www.chengshiluntan.com/5942282-1.html'

            while True:
                response1 = get_response_and_text(url=url_debug, headers=self.headers)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1['response_in_function_text']
                datasoup = BeautifulSoup(response_in_function_text, 'lxml')

                if is_first == 0:
                    is_first = 1
                    try:
                        data['title'] = datasoup.select('#thread_subject')[0].text
                    except Exception as e:
                        # print e
                        return  # some posts simply do not have a title element
                        # print data['url']
                    data['reply_count'] = datasoup.select('#postlist > div.bm_h.comiis_snvbt > span.y.comiis_hfs > strong')[0].text
                    data['read_count'] = datasoup.select('#postlist > div.bm_h.comiis_snvbt > span.y.comiis_cks > strong')[0].text
                    try:
                        # data['reply_count']=datasoup.select('#postlist > table:nth-of-type(1) > tbody > tr > td.pls.ptm.pbm > div > span:nth-of-type(2)')
                        # data['read_count']=datasoup.select('#postlist > table:nth-of-type(1) > tbody > tr > td.pls.ptm.pbm > div > span:nth-of-type(5)')
                        data['publish_user_photo'] = datasoup.select('#postlist div[id] .pls .avatar.comiis_zxtx a img')[0].get('src')
                        data['publish_user_id'] = datasoup.select('#postlist div[id] .pls .avatar.comiis_zxtx a')[0].get('href').split('/')[-1]
                        data['id'] = datasoup.select('#postlist div[id]')[0].get('id')
                    except Exception as e:
                        # print e
                        data['publish_user_photo'] = ''
                        data['id'] = ''
                        data['publish_user_id'] = ''
                        # print 'the user has been deleted, so there is no avatar'
                    content_div = datasoup.select('#postlist > div[id] div.t_fsz > div.t_f')[0]
                    content_div_this = datasoup.select('#postlist > div[id]')[0]
                    content_div_str = str(content_div_this)
                    img_urls = Re_find_img_url.findall(content_div_str)
                    # 9-20: added an image filtering step, currently only used for de-duplication
                    img_urls_set = set()
                    for img_url_raw in img_urls:
                        if '.js' not in img_url_raw:
                            img_urls_set.add(img_url_raw)

                    img_urls2 = list(img_urls_set)

                    data['img_urls'] = img_urls2
                    data['publish_user'] = content_div_this.select('td.plc > div.pi > div.pti > div.authi > a.xi2.kmyzz')[
                        0].text  # publish_user
                    publish_time_content = content_div_this.select(' .pti .authi em')[0].text.replace(u'发表于', '').strip() + ':00'  # publish_time
                    data['publish_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(publish_time_content, '%Y-%m-%d %H:%M:%S'))  # round-trip through strptime/strftime because dates like 11-1 need to become 11-01
                    data['content'] = content_div_this.select(' div.t_fsz > div.t_f')[0].text.strip()  # content
                    # data['publish_user_photo'] = content_div_this.select('')

                follow_div = datasoup.select('#postlist > div[id]')[is_first:-1]
                for one_reply in follow_div:
                    try:
                        comment_reply_nodes = []
                        publish_user = one_reply.select(' div.pti div.authi a.xi2.kmyzz')[
                            0].text  # publish_user
                        publish_time = one_reply.select('tr:nth-of-type(1) > td.plc > div.pi > div.pti > div.authi  > em')[
                            0].text.replace(u'发表于', '').strip() + ':00'  # publish_time
                        content = one_reply.select(' div.t_fsz > div.t_f')[0].text.strip()  # content
                        id = one_reply.get('id')
                        if one_reply.select('div.cm')[0].text.strip():
                            id = one_reply.select('div.cm')[0].get('id')  # comment_id
                            try:
                                publish_user_photo = one_reply.select('div.cm div.pstl div.psta a > img')[0].get('src')  # publish_user_photo
                            except:
                                publish_user_photo = ''

                            content = one_reply.select('div.pstl div.psti')[0].text.split(u'详情')[0].strip()  # content
                            publish_user = one_reply.select('div.pstl div.psta a.xi2')[0].text  # publish_user
                            publish_time = one_reply.select('div.pstl div.psti span.xg1')[0].text.replace(u'发表于', '').strip()  # publish_time

                            comment_reply_node = {
                                'id': id,
                                'publish_user_photo': publish_user_photo,
                                'content': content,
                                'publish_user': publish_user,
                                'publish_time': publish_time,
                            }
                            comment_reply_nodes.append(comment_reply_node)

                        img_urls_reply = Re_find_img_url.findall(str(one_reply.select('.t_fsz')))
                        img_urls_reply2 = []
                        for i in img_urls_reply:
                            if '.js' in i:
                                continue
                            elif 'http' not in i:
                                i = 'http://www.chengshiluntan.com/' + i
                                img_urls_reply2.append(i)
                            else:
                                img_urls_reply2.append(i)
                        try:
                            publish_user_id = datasoup.select('.pls .avatar.comiis_zxtx a')[0].get('href').split('/')[
                                -1]
                            publish_user_photo = one_reply.select('td.pls > div.pls div div.avatar a img')[0].get(
                                'src')  # publish_user_photo
                            # publish_user_id = one_reply.select('td.pls > div.pls div.m.z div[id]')[0].get('id')  # publish_user_id
                        except Exception as e:
                            # print e
                            id = ''
                            publish_user_photo = ''
                            publish_user_id = ''

                        this_comment_node = {
                            'publish_user': publish_user,
                            'publish_time': time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(publish_time, '%Y-%m-%d %H:%M:%S')),
                            'content': content,
                            'id': id,
                            'publish_user_photo': publish_user_photo,
                            'publish_user_id': publish_user_id,
                            'reply_nodes': comment_reply_nodes,
                            'url': url_debug + "#" + id,
                            'img_urls': img_urls_reply2  # use the filtered list built above
                        }
                        data['reply_nodes'].append(this_comment_node)
                    except Exception as e:
                        # print e
                        pass
                url_next_div = datasoup.select('a.nxt')
                if url_next_div:
                    url_next = url_next_div[0].get('href')
                    # if len(url_next)<7:
                    url_debug = 'http://www.chengshiluntan.com/' + url_next
                    # else:
                    #     print len(url_next)
                    #     break
                else:
                    self.result_data_list.append(data)
                    break
Example #8
        def get_comment_inside(data):  # also a two-stage design: the first request obtains the topic id
            topicid = None
            cmspage_taotalnum = 1
            comments_data = []
            cmspagenum = 1

            # extra counters added later
            request_num = 1
            error_time = 5

            # comments_data=[]
            while True:
                # reply_count=0
                if not topicid:
                    comment_url_without_id = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrHnxhFx&page_size=30&hot_size=5&topic_source_id='
                    try:
                        comment_url = comment_url_without_id + data['sid']
                    except Exception as e:
                        print e
                        break  # picture-type news items have no 'sid', hence no comments
                else:
                    comment_url = 'http://changyan.sohu.com/api/2/topic/comments?client_id=cyrHnxhFx&page_size=30&topic_id=' + str(
                        topicid) + '&page_no=' + str(request_num)

                response1 = get_response_and_text(url=comment_url)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1[
                    'response_in_function_text']
                try:
                    data_json = json.loads(response_in_function_text)
                except Exception as e:
                    print e
                    # return
                    break
                try:
                    data_json['comments']
                except Exception as e:
                    print e
                    error_time -= 1
                    if error_time < 1:
                        break
                    continue
                if data_json['comments']:
                    data_json_comments = data_json['comments']
                    cmspage_taotalnum = data_json['cmt_sum']
                    topicid = data_json['topic_id']

                    for someone_comment in data_json_comments:
                        content = someone_comment['content']  # content
                        id = someone_comment['comment_id']  # id
                        publish_user_photo = someone_comment['passport'][
                            'img_url']  # publish_user_photo
                        try:
                            publish_user = someone_comment['passport'][
                                'nickname']  # publish_user
                        except Exception as e:
                            print e
                            publish_user = ''
                        publish_user_id = someone_comment['passport'][
                            'user_id']  # publish_user_id
                        create_time = someone_comment[
                            'create_time']  # publish_time
                        create_time = time.strftime(
                            '%Y-%m-%d %H:%M:%S',
                            time.localtime(int(int(create_time / 1000))))
                        spider_time = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        like_count = someone_comment['support_count']
                        parent_id = data['id']  # TODO: what exactly should these two fields hold?
                        ancestor_id = data['id']
                        this_comments = someone_comment['comments']
                        if this_comments:
                            parent_id = this_comments[0]['comment_id']
                        # 8-16: use a stack-like structure to handle this kind of nested comment
                        # for this_comments
                        cmspagenum += 1

                        thiscomments = {
                            'content': content,
                            'id': id,
                            'publish_user_photo': publish_user_photo,
                            'publish_user': publish_user,
                            'publish_user_id': publish_user_id,
                            'publish_time': create_time,
                            'spider_time': spider_time,
                            'like_count': like_count,
                            'parent_id': parent_id,
                            'ancestor_id': ancestor_id,
                        }
                        comments_data.append(thiscomments)

                    if cmspagenum >= cmspage_taotalnum - 1:
                        break

                request_num += 1
                if cmspagenum > cmspage_taotalnum / 30:
                    break

            data['reply_nodes'] = comments_data
            if not comments_data:
                data['reply_count'] = 0
            else:
                data['reply_count'] = cmspage_taotalnum
            while len(self.result_list) > 600:
                time.sleep(1)
                print 'waiting for the length of the result_list to decrease to 300'

            # final cleanup: drop the fields that are no longer needed
            try:
                del data['sid']  # picture-type news items have no sid
            except:
                pass
            self.result_list.append(data)
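The two Changyan endpoints are read as JSON; the fields this function actually touches suggest a response shaped roughly like the sketch below. This is reconstructed from the code, not from official API documentation, and all values are placeholders:

# Shape implied by the fields read above (placeholder values only):
expected_topic = {
    'topic_id': 111,          # switches the loop to the paged comments endpoint
    'cmt_sum': 95,            # total comment count, stored as cmspage_taotalnum
    'comments': [{
        'comment_id': 1,
        'content': '...',
        'create_time': 1503561600000,            # epoch milliseconds
        'support_count': 0,
        'passport': {'img_url': '', 'nickname': '', 'user_id': 0},
        'comments': [],       # parent chain; the first entry's comment_id becomes parent_id
    }],
}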
Example #9
        def get_content_inside(data):
            # without de-duplication here this really would never stop
            # only the first-pass logic is implemented here
            url = data['url']
            page_num = url.split('/')[-1]
            response1 = get_response_and_text(
                url=url,
                needupdate=True,
                update_info={'page_num': page_num},
                charset='utf-8')
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            Re_find_sid = re.compile(r'sid=".*"')
            try:
                datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            except Exception as e:
                print e
                return

            if ('class="swiper-container"' not in response_in_function_text
                ) and ('class="content"'
                       in response_in_function_text):  # this is a text-type news item
                sid = Re_find_sid.findall(response_in_function_text)[0].split(
                    '"')[1]
                data['sid'] = sid

                datasoup = BeautifulSoup(response_in_function_text, 'lxml')
                for i in datasoup.select(
                        'body > div.content > div.neirong > h2'):
                    title = i.text
                for j in datasoup.select(
                        'body > div.content > div.neirong > p > span:nth-of-type(4)'
                ):
                    publish_time = j.text
                for k in datasoup.select(
                        'body > div.content > div.neirong > p > span:nth-of-type(3)'
                ):
                    publish_user = k.text.replace(' ', '').replace(
                        '\t', '').replace('\n',
                                          '').replace('\r',
                                                      '').replace(u'来源:', '')
                    break
                for publish_user_for in datasoup.select(
                        'body > div.content > p.jieshao > span:nth-of-type(3)'
                ):
                    publish_user = publish_user_for.text
                    break
                # for publish_user_for in datasoup.select('body > div.content > p.jieshao > span:nth-child(3) > a')
                content = ''
                for l in datasoup.select(
                        'body > div.content > div.neirong > article > p'):
                    content += l.text
                img_urls = []
                neirong_content = datasoup.select(
                    'body > div.content > div.neirong')
                neirong_content = str(neirong_content)
                Re_find_img_url = re.compile(r'src=".*?"')
                img_find_by_re = Re_find_img_url.findall(neirong_content)
                for i in img_find_by_re:
                    img_urls.append(i.split('"')[1])
                try:
                    publish_time += ':00'
                except Exception as e:
                    print e
                data['title'] = title
                data['content'] = content
                data['publish_time'] = publish_time
                data['publish_user'] = publish_user
                data['reply_nodes'] = []
                data['spider_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                data['img_urls'] = img_urls
            elif 'class="swiper-container"' in response_in_function_text:  # this is probably a picture-type news item
                content = ''
                img_urls = []
                for title_for in datasoup.select('body > div.content > h2'):
                    title = title_for.text
                for publish_time_for in datasoup.select(
                        'body > div.content > p.jieshao > span:nth-of-type(4)'
                ):
                    publish_time = publish_time_for.text + ':00'
                for publish_user_for in datasoup.select(
                        'body > div.content > p.jieshao > span:nth-of-type(3) > a'
                ):
                    publish_user = publish_user_for.text.replace(
                        ' ', '').replace('\t', '').replace('\n', '').replace(
                            '\r', '').replace(u'来源:', '')
                    break
                for publish_user_for in datasoup.select(
                        'body > div.content > p.jieshao > span:nth-of-type(3)'
                ):
                    publish_user = publish_user_for.text
                    break
                for content_for in datasoup.select(
                        'body > div.content > p.zongjie'):
                    content += content_for.text
                for img_url in datasoup.select(
                        'div.swiper-container > div.swiper-wrapper > div.swiper-slide > div.imgdiv > img'
                ):
                    img_urls.append(img_url.get('src'))
                try:
                    data['title'] = title
                    data['content'] = content
                    data['publish_time'] = publish_time
                    data['publish_user'] = publish_user
                    data['reply_nodes'] = []
                    data['spider_time'] = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    data['img_urls'] = img_urls
                except Exception as e:
                    print e
                    return

            else:
                print url, '-----not in neirong and picture deal module'
                return

            while len(self.comments_url_list) > LEN_COMMENT_LIST:
                time.sleep(1)
            # print data
            self.comments_url_list.append(data)
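The sid extraction above works even though the r'sid=".*"' pattern is greedy, because the later split('"')[1] isolates the first quoted value. A small self-contained illustration with a made-up HTML fragment:

import re

# Made-up fragment; the real pages embed sid="..." somewhere in the markup.
html = '<div class="content" sid="abc123" data-x="1">'
Re_find_sid = re.compile(r'sid=".*"')
sid = Re_find_sid.findall(html)[0].split('"')[1]
# sid == 'abc123' -- the greedy match runs to the last quote, but splitting on
# '"' and taking index 1 still yields just the sid value.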