Example #1
        def get_content_inside(data):
            url_debug = data['url']
            response1 = get_response_and_text(url_debug)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            try:
                datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            except Exception as e:
                return

            img_list = []  # would raise an error inside the try block, so it is defined here instead
            result_read_count = 0
            video_urls = []

            try:
                content_raw = datasoup.select('#artContent')
                content = content_raw[0].text.strip()
                img_list2 = Re_find_img_url.findall(str(content_raw[0]))
                for img_url_raw in img_list2:
                    if img_url_raw not in [
                            'http://image21.360doc.com/DownloadImg/2010/12/2413/7923021_1.gif'
                    ]:
                        if 'swf' not in img_url_raw:
                            img_list.append(img_url_raw)
                        else:
                            video_urls.append(img_url_raw)
            except Exception as e:
                # print e
                content = ''
            try:
                url_debug2 = 'http://webservice.360doc.com/GetArtInfo20130912NewV.ashx?UID=-100,' + data[
                    'publish_user_id'] + ',GetBookTwo,' + data[
                        'id'] + ',0,0@cg@0&jsoncallback=jsonp'
                response2 = get_response_and_text(url_debug2)
                response_in_function_text2 = response2[
                    'response_in_function_text']
                result_read_count = response_in_function_text2.split(
                    u'@c@g@tl@c@g@t')[1].split(u'l@c@g@t')[0]
            except Exception as e:
                # print e
                pass

            data['content'] = content
            data['read_count'] = int(result_read_count)
            data['img_urls'] = img_list
            data['video_urls'] = video_urls

            self.comments_data_list.put(data)
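
Note: every example in this listing calls a shared helper, get_response_and_text, whose definition is not included in any of the snippets. Judging from the call sites, it accepts url plus optional headers, charset and needproxy keyword arguments and returns a dict with two keys: response_in_function (the response object, falsy when the request failed) and response_in_function_text (the decoded body). A minimal requests-based sketch that matches that contract could look like the following; the body is an assumption, not the original implementation, and proxy handling is omitted.

import requests

def get_response_and_text(url, headers=None, charset=None, needproxy=0):
    # Hypothetical reconstruction of the shared fetch helper the examples rely on.
    # needproxy is accepted only so the call sites above would work; proxy selection is omitted.
    try:
        response = requests.get(url, headers=headers, timeout=30)
        if charset:
            response.encoding = charset
        return {'response_in_function': response,
                'response_in_function_text': response.text}
    except Exception:
        # The examples treat a falsy 'response_in_function' as a failed request.
        return {'response_in_function': None, 'response_in_function_text': ''}
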
Example #2
def get_index2(index_queue):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }

    index_page_1_url = 'http://www.ibeitun.net/xinxi/s0_a0_m0_p1.html'
    response1 = get_response_and_text(url=index_page_1_url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    if not response_in_function:
        return
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    url_list_div = datasoup.select('div.indexMessBox')
    Re_find_all_url = re.compile(r'\<a href\=\"(\/\d{4,5}.html)"')
    url_list = Re_find_all_url.findall(str(url_list_div))
    for i in url_list:
        url = 'http://www.ibeitun.net' + str(i)
        index_queue.put({
            'url':
            url,
            'spider_time':
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'id':
            url.split('/')[-1].split('.')[0]
        })
Example #3
def get_index(url,content_queue):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'connection': 'close',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    }
    response1=get_response_and_text(url=url,headers=headers)
    response_in_function=response1['response_in_function']
    response_in_function_text=response1['response_in_function_text']
    try:
        datasoup=BeautifulSoup(response_in_function_text,'lxml')
    except Exception as e:
        return
    for i in datasoup.select('body > div.content > div.shishiimportantnews > div.left > ul > li'):
        publish_time= i.select('span')[0].text
        url= i.select('a')[0].get('href')
        title= i.select('a')[0].get('title')
        datadict={
            'publish_time':publish_time,
            'url':url,
            'title':title,
            'id':url.split('id=')[-1],  # the article id comes after 'id=' in the query string
        }
        content_queue.put(datadict)
Example #4
        def get_index_inside_wenben(url):
            user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
            headers = {
                'User-Agent': user_agent
            }
            response1 = get_response_and_text(url=url,headers=headers)
            response_in_function=response1['response_in_function']
            response_in_function_text=response1['response_in_function_text']
            if not response_in_function:
                return
            datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            for div_content in datasoup.select('body > div'):
                try:
                    try:
                        reply_count=div_content.select('span.reply')[0].text
                    except Exception as e:
                        # print e
                        reply_count=0  # some articles expose no comment count on the index page
                    url= 'http://m.thepaper.cn/' + div_content.select('div > a')[0].get('href')  # url
                    publish_time = div_content.select('p > span')[0].text  # publish_time
                    # does publish_time need further processing here?

                    title= div_content.select('div > p > a')[1].text  # title
                    publish_user= div_content.select('div > p > a')[0].text  # publish_user
                    # print div_content
                    if u'分钟' in publish_time:
                        minulate = publish_time.replace(u'分钟前', '')
                        time_b = datetime.now() - timedelta(minutes=int(minulate))
                        print time_b
                        time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                        publish_time= time_c
                    elif u'小时前' in publish_time:
                        hourse = publish_time.replace(u'小时前', '')
                        time_b = datetime.now() - timedelta(hours=int(hourse))
                        time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                        publish_time= time_c
                    elif u'天前' in publish_time:
                        days = publish_time.replace(u'天前', '')
                        time_b = datetime.now() - timedelta(days=int(days))
                        time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                        publish_time= time_c

                    print '\n\n\n'
                except Exception as e:
                    # print e
                    pass
                id=url.split('_')[-1]
                this_dict={
                    'id':id,
                    'url':url,
                    'publish_time':publish_time,
                    'title':title,
                    'publish_user':publish_user,
                    'is_movie':False,
                    'reply_count':reply_count,
                    'spider_time':datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                self.content_data_list.append(this_dict)
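
Examples #4 and #20 both convert relative Chinese timestamps such as 'N分钟前' (N minutes ago), 'N小时前' (N hours ago) and 'N天前' (N days ago) into absolute datetimes inline. The same logic can be factored into a small helper; the sketch below is illustrative only (the name normalize_relative_time does not appear in the original code).

# -*- coding: utf-8 -*-
from datetime import datetime, timedelta

def normalize_relative_time(publish_time):
    # Mirrors the inline handling above: strip the relative suffix,
    # subtract the offset from now, and format as 'YYYY-MM-DD HH:MM:SS'.
    if u'分钟前' in publish_time:
        delta = timedelta(minutes=int(publish_time.replace(u'分钟前', '')))
    elif u'小时前' in publish_time:
        delta = timedelta(hours=int(publish_time.replace(u'小时前', '')))
    elif u'天前' in publish_time:
        delta = timedelta(days=int(publish_time.replace(u'天前', '')))
    else:
        return publish_time  # already an absolute timestamp
    return (datetime.now() - delta).strftime('%Y-%m-%d %H:%M:%S')
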
Example #5
def get_content(data, result_queue):
    try:
        url = data['url']
        headers = {
            'Accept':
            '*/*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.8',
            'connection':
            'close',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        }
        if 'nssbt' not in url:
            return
        response1 = get_response_and_text(url=url, headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        if not response_in_function:
            return
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')

        # title= datasoup.select('body > div.content > div.shishiimportantnews > div.left > div.bingtuan > div.title')[0].text

        try:
            publish_user = datasoup.select(
                'body > div.content > div.shishiimportantnews > div.left > div.bingtuan > div.bingtuanxinxi'
            )[0].text
        except Exception as e:
            print url
        try:
            publish_user.split(u'作者:')[1].split(' ')[0]
        except Exception as e:
            print e
            try:
                print publish_user
            except:
                pass
        source = datasoup.select(
            'body > div.content > div.shishiimportantnews > div.left > div.bingtuan > div.bingtuanxinxi > a'
        )[0].text
        content = ''
        for content_p in datasoup.select(
                'body > div.content > div.shishiimportantnews > div.left > div.bingtuan > div.networkinformation > p'
        ):
            content += content_p.text

        data['publish_user'] = publish_user
        data['source'] = source
        data['content'] = content

        result_queue.put(data)
        # print data
        # print 'send to result_queue one!!!!!!!'
    except Exception as e:
        # traceback.extract_stack()
        print e
Example #6
def get_index(contentqueue):

    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    urls=['http://bts.gov.cn/xw/gjgn/']# domestic news
    url2=['http://bts.gov.cn/xw/zdxw/']# division affairs
    url3=['http://www.bts.gov.cn/zcms/']# department updates
    url4=['http://bts.gov.cn/xw/zsjg/']# directly affiliated units
    url5=['http://bts.gov.cn/xw/gjgn/']
    url6=['http://bts.gov.cn/xw/qt/']# other
    url7=['http://www.bts.gov.cn/gk/tzgg/']# notices and announcements
    url9=['http://www.bts.gov.cn/gk/rsxx/']# personnel information
    url10=['http://www.bts.gov.cn/gk/ywgz/']# operational work
    url11=['http://www.bts.gov.cn/gk/wjzc/']# documents and policies
    url12=['http://www.bts.gov.cn/gk/zcjd1/']# policy interpretations
    url13=['http://www.bts.gov.cn/gk/tjxx/']# statistics


    urls_all=urls+url2+url3+url4+url5+url6+url7+url9+url10+url11+url12+url13

    for one_url in urls_all:
        response1=get_response_and_text(url=one_url,headers=headers,charset='utf-8')
        response_in_function=response1['response_in_function']
        if not response_in_function:
            continue
        response_in_function_text=response1['response_in_function_text']
        datasoup=BeautifulSoup(response_in_function_text,'lxml')
        try:
            for one_li in datasoup.select(
                    'body > div.mainBg > div.listMain.pageWidth.clearself > div.ListRightContent.clearself > div.docuContent.listul > ul > li'):
                # print one_li.text
                url_raw = one_li.select('a')[0].get('href')
                title = one_li.select('a')[0].text.strip()
                url_end = urljoin(basic_url, url_raw)
                id=url_end.split('/')[-1].split('.')[0]

                if 'bts.gov.cn' in url_end:
                    print url_end

                print title
                # print one_li.select('a')[1].text#publish_time//2017-04-04
                index_dict={
                    'title':title,
                    'url':url_end,
                    'id':id,
                    'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'parent_id':id,
                }
                contentqueue.put(index_dict)
        except Exception as e:
            print e
            print one_url
Example #7
def get_index(queue):
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }

    urls1=['http://www.altxw.com/news/system/count//0012002/000000000000/000/001/c0012002000000000000_0000012{}.shtml'.format(str(i)) for i in range(55,58)]#
    urls2=['http://www.altxw.com/news/node_2031.htm']
    urls3=urls1+urls2
    urls3.append('http://www.altxw.com/gblw/index.shtml')

    for url in urls3:
        print url
        response1=get_response_and_text(url=url,headers=headers,charset='utf-8')
        if not response1['response_in_function']:
            continue
        response_in_function_text=response1['response_in_function_text']
        datasoup=BeautifulSoup(response_in_function_text,'lxml')
        for one_url_div in datasoup.select('div.bd > ul  li'):
            url= one_url_div.select('a')[0].get('href')
            if 'com' not in url:
                url='http://www.altxw.com/news/'+url
            if 'altxw.com/news/' not in url:
                continue
            title= one_url_div.select('a')[0].text.strip()
            # publish_time= '20'+one_url_div.select('span')[0].text.strip()+':00'
            publish_time=one_url_div.select('span')[0].text.strip()
            if len(publish_time.split('-')[0])<4:
                publish_time='20'+publish_time
            if len(publish_time)<11:
                publish_time+=' 00:00:00'
            elif 4<len(publish_time.split(u' ')[1])<8:
                publish_time=publish_time+':00'
            if len(publish_time)<18:
                print publish_time
            spider_time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            id=url.split('/')[-1].split('.')[0]

            index_data_dict={
                'url':url,
                'title':title,
                'publish_time':publish_time,
                'spider_time':spider_time,
                'id':id,
                'parent_id':id,
                'publish_user':''
            }
            queue.put(index_data_dict)
Example #8
def get_content(data, result_queue):

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }

    url = data['url']
    response1 = get_response_and_text(url=url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    if not response_in_function:
        return
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    # print str(datasoup)
    try:
        title = datasoup.select('div.main div.infobox h2.bt1')[0].text  #title
        publish_time = datasoup.select('div.main div.infobox div.infoDate')[
            0].text.split('\n')[0].split(u':')[1].strip()  #publish_time
        read_count = datasoup.select('div.main div.infobox div.infoDate')[
            0].text.split('\n')[1].split(u':')[1].strip()  #read_count

        content = datasoup.select(
            '#infobox > div.infoLeft > div.infoContent div.textwrap')[0].text
        img_urls = []
        for picurl in datasoup.select(
                '#infobox > div.infoLeft > div.infoContent div.picwrap a'):
            img_urls.append('http://www.ibeitun.net' + picurl.get('href'))

        # news_content={
        #     'title':title,
        #     'publish_time':publish_time,
        #     'read_count':read_count,
        #     'content':content,
        #     'img_urls':img_urls
        # }
        data['content'] = content
        data['title'] = title
        data['read_count'] = read_count
        data['img_urls'] = img_urls
        data['publish_time'] = str(publish_time) + ' 00:00:00'
        result_queue.put(data)
    except Exception as e:
        # print 'the error pages url is ------>',url
        # traceback.print_exc()
        return
Example #9
def get_content(data, comment_queue):

    Re_find_img = re.compile(r'src\=\"(.*?)\"')

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    basic_url = 'http://www.altxw.com/news/content/'

    url = data['url']
    response1 = get_response_and_text(url=url,
                                      headers=headers,
                                      charset='utf-8')
    response_in_function = response1['response_in_function']
    if not response_in_function:
        return
    try:
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        # title=datasoup.select('body > div.body > div > div.main.l > div > div > h1')[0].text()
        source = datasoup.select(
            'body > div.body > div > div.main.l > div > div > div > li:nth-of-type(2)'
        )[0].text.split(u':')[1]
        content = ''
        for i in datasoup.select(
                'body > div.body > div > div.main.l > div > div > ul > p'):
            content += i.text
        content_div = datasoup.select('div > div.main.l > div > div > ul')[0]
        img_urls = Re_find_img.findall(str(content_div))
        img_urls2 = []
        for one_img_url in img_urls:
            one_img_url = urljoin(basic_url, one_img_url.strip('../'))
            img_urls2.append(one_img_url)
        print img_urls2
        data['source'] = source
        data['content'] = content
        data['img_urls'] = img_urls2

        comment_queue.put(data)
    except Exception as e:
        print e
Example #10
            def get_content_inside_movie(data):
                url_for_debug=data['url']
                response1=get_response_and_text(url=url_for_debug)
                response_in_function=response1['response_in_function']
                response_in_function_text=response1['response_in_function_text']
                if not response_in_function:
                    return
                datasoup=BeautifulSoup(response_in_function_text,'lxml')
                Re_find_content = re.compile(r'desc: \'(.*)\'')
                content_data=Re_find_content.findall(response_in_function_text)
                #8-30
                like_count=datasoup.select('#news_praise')
                if like_count:
                    like_count_value=int(like_count[0].text.strip())
                else:
                    like_count_value=0

                video=datasoup.select('video > source')
                if video:
                    video_urls=[]
                    for video1 in video:
                        video_urls.append(video1.get('src'))
                else:
                    video_urls=[]
                #8-30


                try:
                    content= content_data[0]
                except Exception as e:
                    print e  # sometimes this fails because no content was captured here
                    content=''

                try:
                    source=datasoup.select('#v3cont_id > div.news_content > div > br')[0].text.split(u'来源:')[1]
                except:
                    source=''

                # publish_time= datasoup.select('#v3cont_id > div.news_content > div:nth-of-type(3)')[0][0:16]
                data['content']=content
                data['like_count']=like_count_value
                data['video_urls']=video_urls
                data['source']=source
                self.comments_url_list.append(data)
Example #11
def get_content(data, result_queue):  #
    Re_find_img = re.compile(r'img .*? src="(.*?)"')
    Re_find_movie = re.compile(r'\<video.*?src="(.*?)"')

    url = data['url']
    headers = {
        'Accept':
        '*/*',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'connection':
        'close',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    }
    response1 = get_response_and_text(url=url, headers=headers)
    response_in_function = response1['response_in_function']
    if not response_in_function:
        return
    response_in_function_text = response1['response_in_function_text']
    try:
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    except:
        return
    content_str = str(datasoup.select('div.content_main')[0])
    img_urls = Re_find_img.findall(content_str)
    video_url = Re_find_movie.findall(content_str)
    img_urls2 = []
    video_url2 = []
    for one_img_url in img_urls:
        img_urls2.append('http://www.xjbtssbtszhdj.com/' + one_img_url)
    for one_video_url in video_url:
        video_url2.append('http://www.xjbtssbtszhdj.com' + one_video_url)
    # print img_urls2
    # print video_url2
    data['img_urls'] = img_urls2
    data['video_urls'] = video_url2
    result_queue.put(data)
Example #12
def get_index(content_queue):
    # url=data['url']
    outside_url = [
        'http://186t.ibeitun.net/news.aspx?s=0&p={}'.format(str(i))
        for i in range(1, 5)
    ]

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    for url in outside_url:
        response1 = get_response_and_text(url=url, headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        # response1=requests.get(url=url,headers=headers)
        # response_in_function_text=response1.text
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')

        for one_title in datasoup.select(
                'div.main.fixed > div.right.wow.fadeInUp > ul > li'):
            url = 'http://186t.ibeitun.net' + one_title.select('a')[0].get(
                'href')
            publish_time = one_title.select(
                'a span')[0].text.strip() + ' 00:00:00'
            title = one_title.select('a div')[0].text.strip()

            one_dict = {
                'url':
                url,
                'publish_time':
                publish_time,
                'title':
                title,
                'id':
                url.split('mid=')[1],
                'spider_time':
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            content_queue.put(one_dict)
Example #13
        def get_content_inside(data):
            url = data['url']
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
            }
            needproxy = int(random.randint(1, 10) / 7)

            response1 = get_response_and_text(url=url,
                                              headers=headers,
                                              needproxy=needproxy)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            if not response_in_function:
                return
            real_url = response_in_function.url
            if 'toutiao' not in real_url:
                # logger_toutiao.log(level=logging.WARNING, msg='toutiao was not in thisurl---------' + real_url)
                return
            elif 'http://www.toutiao.com/api/pc/subject/' in real_url:
                # logger_toutiao.log(level=logging.WARNING,msg='http://www.toutiao.com/api/pc/subject/ was in thisurl----------'+real_url)
                return
            else:
                url = real_url

            Re_find_chineseTag = re.compile(r"chineseTag: '.*?'")

            #######################################################

            chineseTag = Re_find_chineseTag.findall(response_in_function_text)
            if chineseTag:
                try:
                    # print 'the lenth of response-------',len(response_in_function_text)
                    chineseTag = chineseTag[0].split("'")[1]
                    if chineseTag == u'图片' or chineseTag == '图片':
                        content_time_img = get_content_picture({
                            'response_in_function':
                            response_in_function,
                            'response_in_function_text':
                            response_in_function_text
                        })
                    elif chineseTag == u'问答' or chineseTag == '问答':
                        content_time_img = get_content_wenda(htmldata={
                            'response_in_function':
                            response_in_function,
                            'response_in_function_text':
                            response_in_function_text,
                            'data':
                            data
                        },
                                                             data=data)
                        return
                    else:
                        # print chineseTag,'is gonging to get_content_news'
                        content_time_img = get_content_news({
                            'response_in_function':
                            response_in_function,
                            'response_in_function_text':
                            response_in_function_text
                        })
                except Exception as e:
                    # print e, 'error while deciding whether this is an image gallery, Q&A post, or regular article'
                    pass
            else:

                # print chineseTag
                return
            # if it is not a Q&A post, execution continues here
            Re_find_itmeId = re.compile(r'itemId: \'.*?\'')  # regular article
            Re_find_itme_Id = re.compile(r'item_id:\'.*?\'')  # image gallery
            if Re_find_itmeId.findall(response_in_function_text):
                try:
                    item_id = Re_find_itmeId.findall(
                        response_in_function_text)[0].split("'")[1]
                except Exception as e:
                    # logger_toutiao.log(level=logging.WARNING, msg={'where': 'splitting the itemId match failed', 'content':
                    #     Re_find_itmeId.findall(response_in_function_text)[0]})
                    # print e, 'itemId was matched by the regex, but splitting it failed'
                    pass
            else:
                try:
                    item_id = Re_find_itme_Id.findall(
                        response_in_function_text)[0].split("'")[1]
                except Exception as e:
                    pass
                    # print e, 'no value found for item_id (the image-gallery item_id)'
                    # msg = {'errormsg': e.message + 'no value found for item_id (the image-gallery item_id)',
                    #        'htmldata': response_in_function_text,
                    #        'url': response_in_function.url,
                    #        'code': response_in_function.code,
                    #        'msg': response_in_function.msg}
                    # logger_toutiao.log(level=logging.WARNING, msg=msg)
                    return

            try:
                data['img_urls'] = content_time_img['img_urls']
                data['content'] = content_time_img['content']
                if len(content_time_img['publish_time']) < 12:
                    data['publish_time'] = content_time_img[
                        'publish_time'] + ' 00:00:00'
                else:
                    data['publish_time'] = content_time_img['publish_time']
                data['item_id'] = item_id
                data['reply_nodes'] = []
            except Exception as e:
                # print e, 'problem while assembling the data dict'
                pass

            self.comments_url_list.append(data)
Example #14
        def get_content_inside(data):

            url_for_debug = data['url']

            is_first = 1

            reply_nodes = []
            error_times = 5

            while True:
                response1 = get_response_and_text(url=url_for_debug,
                                                  headers=self.headers,
                                                  charset='utf-8')
                response_in_function = response1['response_in_function']
                response_in_function_text = response1[
                    'response_in_function_text']
                if not response_in_function:  # 10-9: the response is often empty when the network is flaky.
                    return
                # response_in_function_text=response_in_function_text.decode('utf-8').encode('utf-8')

                img_may_no_user = [
                    'http://image.gfan.com/static/image/common/rright.gif',
                    'http://image.gfan.com/static/image/common/none.gif',
                    'http://image.gfan.com/static/image/common/rleft.gif'
                ]

                result_text = handle_content(response_in_function_text)
                Re_find_img_url = re.compile(r'file="(.*?)"')

                datasoup = BeautifulSoup(result_text, 'lxml')

                if is_first == 1:
                    try:
                        main_div = datasoup.select('table[id]')[0]
                        main_content = datasoup.select(
                            '.plc .pct .pcb')[0].text.strip()
                        main_content_div = datasoup.select('.plc .pct .pcb')[0]
                        main_read_count = datasoup.select(
                            ' tr  td.pls.ptm.pbm > div > span:nth-of-type(2)'
                        )[0].text
                        main_reply_count = datasoup.select(
                            'tr > td.pls.ptm.pbm > div > span:nth-of-type(5)'
                        )[0].text
                        main_img_urls = Re_find_img_url.findall(
                            str(main_content_div))

                        main_img_urls_list2 = []
                        for img_url_raw in main_img_urls:
                            img_url_dealed1 = img_url_raw.replace(
                                '.thumb.jpg', '')
                            main_img_urls_list2.append(img_url_dealed1)

                    except Exception as e:
                        # print e
                        return
                    try:
                        main_publish_user_photo = main_div.select(
                            'div.avatar a img')[0].get('src')  # the account may have been deleted
                    except Exception as e:
                        main_publish_user_photo = ''
                    main_publish_user = main_div.select(
                        '.pls .pi .authi  a')[0].text

                    data['read_count'] = main_read_count
                    data['img_urls'] = list(set(main_img_urls_list2))
                    data['reply_count'] = main_reply_count
                    data['content'] = main_content
                    data['publish_user_photo'] = main_publish_user_photo
                    data['publish_user'] = main_publish_user

                #9-19
                try:
                    datasoup.select('.plc .pct .pcb')[0].text.strip()
                except Exception as e:
                    # print e
                    # network problems can leave the next-page data incomplete
                    error_times -= 1
                    if error_times > 1:
                        continue
                    else:
                        return

                for one_div in datasoup.select(
                        '#postlist > div[id]')[is_first:-1]:
                    img_list = []
                    # print one_div.select('div.authi a')[0].text.strip()#publish_user
                    # print one_div.text.strip()
                    try:
                        maybe_url_list = Re_find_img_url.findall(
                            str(one_div.select('.plc .pct .pcb')[0]))
                        for url_img_one in maybe_url_list:
                            if url_img_one not in img_may_no_user and 'http://bit.ly/' not in url_img_one:
                                img_list.append(url_img_one)
                        img_urls = img_list  # img_list
                        content = one_div.select('.plc .pct .pcb')[
                            0].text.strip()  # content; the inner markup sometimes differs between pages
                        publish_time = one_div.select(
                            '.authi em')[0].text.replace(u'发表于 ', '').replace(
                                '\n', '').strip() + ':00'  # publish_time
                        try:
                            publish_user_photo = one_div.select(
                                'div.avatar a img'
                            )[0].get(
                                'src'
                            )  # publish_user_photo; the account may be deleted, so the avatar is not always available
                        except Exception as e:
                            # print e
                            publish_user_photo = ''
                        # if data['publish_user_photo']=='None':  # conflicts with other logic but still usable
                        #     data['publish_user_photo']=publish_user_photo
                        # else:
                        #     data['publish_user_photo']=''
                        url = one_div.select('.plc .pi strong a')[0].get(
                            'href').replace(';', '&')  # url
                        id = one_div.select('.plc .pi strong a')[0].get(
                            'id')  # id
                        publish_user_id = one_div.select(
                            '.plc .pi strong a')[0].get('id').replace(
                                'postnum', '')  # publish_user_id
                        publish_user = one_div.select(
                            '.pls .pi .authi  a')[0].text

                        this_comment_info = {
                            'img_urls':
                            img_urls,
                            'content':
                            content,
                            'publish_time':
                            time.strftime(
                                '%Y-%m-%d %H:%M:%S',
                                time.strptime(publish_time,
                                              '%Y-%m-%d %H:%M:%S')),
                            'publish_user_photo':
                            publish_user_photo,
                            'url':
                            url,
                            'id':
                            id,
                            'publish_user_id':
                            publish_user_id,
                            'parent_id':
                            data['id'],
                            'ancestor_id':
                            data['id'],
                            'publish_user':
                            publish_user
                        }
                        reply_nodes.append(this_comment_info)

                        # print one_div.select('.t_fsz')[0].text
                    except Exception as e:
                        # print e, 'this floor (post) has been deleted'
                        pass

                next_page_url_raw = datasoup.select('.nxt')
                if next_page_url_raw:
                    is_first = 0
                    next_url = next_page_url_raw[0]
                    url_for_debug = next_url.get('href')
                    # print 'is going to deal next page-------------',url_for_debug
                else:

                    data['reply_nodes'] = reply_nodes
                    self.result_data_list.append(data)
                    break
Пример #15
0
        def get_index_inside(url_get_index):
            next_page_num = 0
            next_page_num_error = 0  # network errors can break the next-page check; this caps the number of retries
            while True:
                response1 = get_response_and_text(url=url_get_index,
                                                  headers=self.headers)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1[
                    'response_in_function_text']
                if not response_in_function:
                    continue

                try:
                    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
                except Exception as e:
                    print e
                    continue  # do not fall through with an unparsed (or stale) soup
                for one_forum in datasoup.select('#moderate  tbody')[1:]:
                    try:
                        title = one_forum.select('th > a')[0].text  # title
                        publish_user = one_forum.select(
                            'td.by a')[0].text  # publish_user
                        # print one_forum.select('td.by a')[0].get('href')  # publish_user_url
                        publish_time = one_forum.select('td.by em span')[
                            0].text.strip() + ':00'  # publish_time
                        reply_count = one_forum.select(
                            'td.num a')[0].text  # reply_count
                        read_count = one_forum.select(
                            'td.num em')[0].text  # view_num
                        url = one_forum.select('th > a')[0].get('href')  # url

                        time_secends = time.strptime(publish_time,
                                                     '%Y-%m-%d %H:%M:%S')
                        this_reply_node = {
                            'title':
                            title,
                            'publish_user':
                            publish_user,
                            'publish_time':
                            time.strftime('%Y-%m-%d %H:%M:%S', time_secends),
                            'reply_count':
                            reply_count,
                            'read_count':
                            read_count,
                            'url':
                            url,
                            # 'publish_user':None,
                            'id':
                            url.split('-')[1],
                            'spider_time':
                            datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'),
                            'publish_user_photo':
                            'None',
                            'content':
                            'None'
                        }
                        self.content_data_list.append(this_reply_node)
                        next_page_num_error += 3  # the list parsed fine, so the page loaded OK; bump this so the last page is not re-requested too many times
                    except Exception as e:
                        pass
                        # print e

                next_page_url = datasoup.select('.nxt')

                if next_page_url:
                    next_page_num += 1
                    next_url = next_page_url[0].get('href')
                    url_get_index = next_url
                    next_page_num_error = 0  # this must be reset here

                else:  # why does this sometimes stop early? most likely a network error kept the next-page link from being parsed, hence this retry-count approach
                    if next_page_num_error > 5:
                        break
                    else:
                        # print response_in_function.status_code
                        next_page_num_error += 1
Example #16
    def get_Index(self):
        while True:
            for url_to_get_index in self.urls:
                for i in range(10):
                    try:
                        needproxy = int(random.randint(1, 10) / 7)
                        response1 = get_response_and_text(url=url_to_get_index,
                                                          needproxy=needproxy)
                        response_in_function = response1[
                            'response_in_function']
                        response_in_function_text = response1[
                            'response_in_function_text']
                        response_text = response_in_function_text.decode(
                            'utf-8')
                        datajson = json.loads(response_text)
                        datajson_index_data = datajson['data']
                        for one_index in datajson_index_data:
                            try:
                                title = one_index['title']
                            except:
                                title = ''
                            try:
                                reply_count = int(one_index['comments_count'])
                            except:
                                reply_count = 0
                            url = 'http://www.toutiao.com' + one_index[
                                'source_url']
                            try:
                                publish_user = one_index['source']  # publisher
                            except:
                                publish_user = ''
                            try:
                                publish_user_photo = one_index[
                                    'media_avatar_url']
                                if 'http' not in publish_user_photo:
                                    publish_user_photo = 'http:' + publish_user_photo
                            except:
                                publish_user_photo = ''
                            try:
                                video_id = one_index['video_id']
                            except:
                                video_id = None
                            try:
                                is_ad = one_index['label']
                            except:
                                is_ad = False

                            if video_id:
                                continue  # skip video items entirely
                            if is_ad == u'广告':  # the '广告' label marks an advertisement
                                continue

                            id = one_index['group_id']

                            dict1 = {
                                'id':
                                id,
                                'url':
                                url,
                                'reply_count':
                                reply_count,
                                'title':
                                title,
                                'publish_user':
                                publish_user,
                                'publish_user_photo':
                                publish_user_photo,
                                'spider_time':
                                datetime.datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S')
                            }

                            self.content_data_list.append(dict1)
                    except Exception as e:
                        pass
            print 'taking a break: waiting out those 600 seconds'
            time.sleep(600)
Example #17
        def get_comment_comment(data1):  # replies can themselves have replies; the argument is named data1 to avoid shadowing the outer data variable
            id = data1['id']
            error_time = 3
            while True:
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
                    # 'Upgrade-Insecure-Requests':'1',
                    # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    # 'Accept-Encoding':'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh;q=0.8',
                    # 'Cache-Control':'max-age=0',
                    'Connection': 'close'
                }
                while True:
                    try:
                        comment_url = 'http://www.toutiao.com/api/comment/get_reply/?comment_id=' + str(
                            id) + '&item_id=' + str(id) + '&offset=0&count=20'

                        response1 = get_response_and_text(url=comment_url,
                                                          headers=headers)
                        response_in_function = response1[
                            'response_in_function']
                        response_in_function_text = response1[
                            'response_in_function_text']
                        datajson = json.loads(response_in_function_text)
                        break

                    except Exception as e:
                        # print e,'mark2'
                        error_time -= 1
                        if error_time < 1:
                            return

                reply_nodes = []
                # datajson=json.loads(response_in_function.text)
                try:
                    datajson = json.loads(
                        response_in_function_text
                    )  # raises ValueError: No JSON object could be decoded; this happened a lot on 8-28
                except Exception as e:
                    # print e
                    pass
                try:
                    datajson['data'][
                        'data']  #sometimes this will be wrong! the response returned is not what you need!9-7
                except Exception as e:
                    # print e
                    error_time -= 1
                    if error_time < 1:
                        # print 'wrong time too much'
                        break
                    continue
                for one_comment in datajson['data']['data']:
                    content = one_comment['text']
                    like_count = one_comment['digg_count']
                    publish_time = one_comment['create_time']
                    publish_time = time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(int(publish_time)))
                    publish_user_id = one_comment['user']['user_id']
                    publish_user = one_comment['user']['screen_name']
                    publish_user_photo = one_comment['user']['avatar_url']
                    id = one_comment['id']
                    try:
                        ancestor_id = data1['ancestor_id']
                    except Exception as e:
                        # print e,'mark3'
                        ancestor_id = 'wrong'
                    parent_id = data1['id']
                    thisnode = {
                        'publish_user': publish_user,
                        'content': content,
                        'like_count': like_count,
                        'publish_time': publish_time,
                        'publish_user_id': publish_user_id,
                        'publish_user_photo': publish_user_photo,
                        'id': id,
                        'ancestor_id': ancestor_id,
                        'parent_id': parent_id,
                        # 'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }
                    reply_nodes.append(thisnode)
                break  # only the first 20 replies are requested (offset=0&count=20), so one pass is enough

            return reply_nodes
Example #18
        def get_comment_inside(data):
            # session1 = requests.session()
            headers = {
                'User-Agent':
                'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
            }

            while True:  # keep retrying until the request succeeds
                try:
                    # print data
                    comment_url = 'http://www.toutiao.com/api/comment/list/?group_id=' + str(
                        data['id']) + '&item_id=' + str(
                            data['item_id']) + '&offset=0&count=20'
                    needproxy = int(random.randint(1, 10) / 7)

                    response1 = get_response_and_text(url=comment_url,
                                                      needproxy=needproxy)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

                    break
                except Exception as e:
                    # print e,'mark1'
                    if 'item_id' in e:
                        messege = {'msg': e.message}
                        # logger_toutiao.log(msg=messege, level=logging.WARNING)
            comments_data = []
            try:
                data_json = json.loads(response_in_function_text)
                data_json['data']['comments']
            except Exception as e:
                # print e,'mark1'  # this should be plain JSON, but sometimes an odd location-redirect page comes back; bail out here: better to miss data than to grab the wrong thing.
                return
            for one_comment in data_json['data']['comments']:
                content = one_comment['text']
                like_count = one_comment['digg_count']
                publish_time = one_comment['create_time']
                publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(int(publish_time)))
                publish_user_photo = one_comment['user']['avatar_url']
                publish_user_id = one_comment['user']['user_id']
                publish_user = one_comment['user']['name']  # changed 8-17
                id = one_comment['id']
                reply_count = one_comment['reply_count']
                parent_id = data['id']
                ancestor_id = data['id']

                if reply_count > 0:
                    reply_nodes = get_comment_comment({
                        'id': id,
                        'ancestor_id': data['id']
                    })
                else:
                    reply_nodes = []

                thisnode = {
                    'content': content,
                    'like_count': like_count,
                    'publish_time': publish_time,
                    'publish_user_photo': publish_user_photo,
                    'publish_user_id': publish_user_id,
                    'publish_user': publish_user,
                    'id': id,
                    'reply_count': reply_count,
                    'reply_nodes': reply_nodes,
                    'parent_id': parent_id,
                    'ancestor_id': ancestor_id,
                    # 'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                # data['reply_nodes'].append(thisnode)
                comments_data.append(thisnode)

            # only 20 comments can be fetched here, so no pagination; nested replies are not paginated either

            data['reply_nodes'] = comments_data
            while len(self.result_list) > 600:
                time.sleep(1)
                print 'result_list has more than 600 items, waiting for the writer to drain it...'

            self.result_list.append(data)
Example #19
        def get_content_in_wenda_comments_more(id_replynodes, data=None):
            error_time = 5

            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
            }
            try:
                if not id_replynodes['next_comment_url']:
                    url_comments_more = 'https://www.wukong.com/wenda/web/question/loadmorev1/?count=10&qid=' + \
                                        id_replynodes['id'] + '&offset=10&req_type=1'
                    response1 = get_response_and_text(url=url_comments_more,
                                                      headers=headers)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

                else:
                    response1 = get_response_and_text(
                        url=id_replynodes['next_comment_url'], headers=headers)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

            except Exception as e:
                print e
                return id_replynodes['reply_nodes']  # the response variables are unset, so stop here
            datajson = json.loads(response_in_function_text)
            for one_comment in datajson['data']['ans_list']:
                datasoup_content = BeautifulSoup(one_comment['content'],
                                                 'lxml')
                content = datasoup_content.text
                img_urls = []
                Re_find_img = re.compile(r'src=".*?"')
                img_urls_find_by_re = Re_find_img.findall(
                    one_comment['content'])
                for img_url in img_urls_find_by_re:
                    img_url_split = img_url.split('"')[1]
                    img_urls.append(img_url_split)
                like_count = one_comment['digg_count']
                id = one_comment['ansid']
                publish_time = one_comment['create_time']  # unix timestamp
                publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(int(publish_time)))
                reply_count = one_comment['comment_count']
                publish_user_photo = one_comment['user']['avatar_url']
                publish_user = one_comment['user']['uname']
                publish_user_id = one_comment['user']['user_id']
                try:
                    reply_nodes = get_content_in_wenda_comments_comments({
                        'id':
                        id,
                        'reply_nodes': [],
                        'next_comment_url':
                        None
                    })
                except Exception as e:
                    # print e
                    reply_nodes = []
                parent_id = id_replynodes['id']
                ancestor_id = data['id']

                try:
                    this_node = {
                        'publish_time': publish_time,
                        'content': content,
                        'like_count': like_count,
                        'id': id,
                        'reply_count': reply_count,
                        'publish_user_photo': publish_user_photo,
                        'publish_user': publish_user,
                        'publish_user_id': publish_user_id,
                        'reply_nodes': reply_nodes,
                        'ancestor_id': ancestor_id,
                        'parent_id': parent_id,
                        # 'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }
                    id_replynodes['reply_nodes'].append(this_node)
                except Exception as e:
                    print e
            if datajson['data']['has_more']:
                url_offset = response_in_function.url.split('&offset=')
                offset = int(url_offset[1].split('&')[0]) + 10
                url = url_offset[0] + '&offset=' + str(offset)
                id_replynodes['next_comment_url'] = url
                reply_nodes2 = get_content_in_wenda_comments_more(
                    id_replynodes)
                return reply_nodes2
            else:
                return id_replynodes['reply_nodes']
Example #20
        def get_comment_inside(data):  # this approach may be problematic

            data['source']=data['source'].strip()
            isFirst_req = True
            start_id = 0
            comments_list = []
            while True:
                if isFirst_req==True:
                    comment_req='http://www.thepaper.cn/load_moreFloorComment.jsp?contid='+data['id']
                else:
                    comment_req='http://www.thepaper.cn/load_moreFloorComment.jsp?contid='+data['id']+'&startId='+start_id
                headers={
                    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
                }
                response1=get_response_and_text(url=comment_req,headers=headers)
                response_in_function=response1['response_in_function']
                response_in_function_text=response1['response_in_function_text']
                if not response_in_function:
                    return
                Re_find_startid = re.compile(r'startId="(.*?)"')
                data_re = Re_find_startid.findall(response_in_function_text)

                start_id=0  # added 8-24
                try:
                    start_id=data_re[0]
                except Exception as e:
                    # print e
                    pass
                datasoup=BeautifulSoup(response_in_function_text,'lxml')
                for one_div in datasoup.select('div.comment_que'):
                    # the mobile and desktop pages render differently and need separate handling.
                    try:
                        publish_user_photo=one_div.select('div.aqwleft > div > a > img')[0].get('src')
                    except Exception as e:
                        # print e
                        publish_user_photo='http://www.thepaper.cn/img/headerimg_bg50.png'  # may be missing; fall back to the default avatar and revisit if it causes problems (mark1)
                    publish_user=one_div.select('div.aqwright > h3 > a')[0].text
                    try:
                        id_1=str(one_div.select('div.aqwright > h3 > a')[0]).split('commentId=')
                        # id=one_div.select('div.aqwright > h3 > a')[0].split('commentId=')[1]
                        id=id_1[1].split('"')[0]
                    except Exception as e:
                        print e
                    publish_user_id=str(one_div.select('div.aqwright > h3 > a')[0]).split('userId=')[1].split('&')[0]
                    publish_time=one_div.select('div.aqwright > h3 > span')[0].text
                    content=''
                    for content_in_for in one_div.select('div > a[href^="javascript:replyFloor"]'):  # problematic
                        if u'回复' not in content_in_for.text:
                            content+=content_in_for.text
                    # content=content.replace(u'回复',u'')
                    try:
                        like_count=int(one_div.select('div.aqwright > div.ansright_time > a[href^="javascript:priseCommtFloor"]')[0].text)
                    except:
                        like_count=0


                    if u'小时前' in publish_time:
                        publish_time_num=int(publish_time.replace(u'小时前',''))
                        publish_time=(datetime.now()-timedelta(hours=publish_time_num)).strftime('%Y-%m-%d %H:%M:%S')
                    elif u'天前' in publish_time:
                        publish_time_num=int(publish_time.replace(u'天前',''))
                        publish_time=(datetime.now()-timedelta(days=publish_time_num)).strftime('%Y-%m-%d %H:%M:%S')
                    elif u'分钟前' in publish_time:
                        publish_time_num=int(publish_time.replace(u'分钟前',''))
                        publish_time=(datetime.now()-timedelta(minutes=publish_time_num)).strftime('%Y-%m-%d %H:%M:%S')
                    else:
                        publish_time=publish_time

                    # parent_id handling added 8-25
                    Re_find_publish_user = re.compile(ur'回复@(.*)\:')
                    has_at_re=Re_find_publish_user.match(content)
                    has_at=''
                    if has_at_re:
                        has_at=has_at_re.group(1)
                    thisdata={
                        'publish_user_photo':publish_user_photo,
                        'publish_user':publish_user,
                        'id':id,
                        'publish_user_id':publish_user_id,
                        'publish_time':publish_time,
                        'content':content,
                        'like_count':like_count,
                        'ancestor_id':data['id'],
                        'parent_id':data['id'],  # provisional; later check @-mentions in content to set the real parent
                        # 'reply_nodes':[],
                        # 'has_at':has_at
                    }
                    comments_list.append(thisdata)

                if int(start_id)==0:
                    # parent_id handling added 8-25
                    # comments_list2=comments_list[:]
                    # for comment_one_data in comments_list2:
                    #     if comment_one_data['has_at']:
                    #         def merge_comment(comment_one_data):
                    #             #有@,就根据@后的人名来统计parent_id
                    #             for num in range(len(comments_list2)):
                    #                 if comments_list2[num]['publish_user']==comment_one_data['publish_user']:
                    #                     _=copy.deepcopy(comment_one_data)
                    #                     del(comment_one_data['has_at'])
                    #                     comments_list2[num]['reply_nodes'].append(comment_one_data)
                    #                     comments_list2.remove(_)  # the structure is still incomplete this way
                    #                     if comments_list2[num]['has_at']:  # mind the order here!!!! Python lists are passed by reference, not copied
                    #                         new_child_comment=copy.deepcopy(comments_list2[num])


                    data['reply_nodes']=comments_list
                    data['reply_count']=len(comments_list)
                    try:
                        data['publish_time']=data['publish_time'].replace(u' ',u'').encode('utf-8')
                        data['publish_time']=data['publish_time'].split(' ')[0]+' '+data['publish_time'].split(' ')[1]
                    except Exception as e:
                        # print e
                        pass
                    self.result_list.append(data)
                    break
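
The relative-timestamp branches above ("N分钟前", "N小时前", "N天前") recur in several of these examples and could be factored into one helper. A minimal sketch; the function name is illustrative and not part of the original code.

from datetime import datetime, timedelta

def normalize_publish_time(publish_time):
    # convert "N分钟前" / "N小时前" / "N天前" into an absolute timestamp;
    # anything else is assumed to be absolute already and is returned unchanged
    units = [(u'分钟前', 'minutes'), (u'小时前', 'hours'), (u'天前', 'days')]
    for suffix, unit in units:
        if suffix in publish_time:
            amount = int(publish_time.replace(suffix, u''))
            moment = datetime.now() - timedelta(**{unit: amount})
            return moment.strftime('%Y-%m-%d %H:%M:%S')
    return publish_time

get_comment_inside could then call normalize_publish_time(publish_time) instead of repeating the three branches.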
Пример #21
0
    def get_Index(self):
        # while True:
        thisurls_list=[]
        for url in self.urls:
            response1=get_response_and_text(url=url,needproxy=False)
            response_in_function=response1['response_in_function']
            response_in_function_text=response1['response_in_function_text']
            if not response_in_function_text:
                return
            Re_pattern = re.compile(r'data	:	\"(.*?)\".*?Math\.random\b')
            datare = Re_pattern.findall(response_in_function_text)
            try:
                url_in_content=datare[0]
            except Exception as e:
                continue
            if 'http://m.thepaper.cn/channel_26916' in url:
                nexturl = 'http://www.thepaper.cn/load_index.jsp?' + url_in_content  # the mobile feed turned out to yield more data; the desktop endpoint is http://m.thepaper.cn/load_channel.jsp?
            else:
                nexturl='http://m.thepaper.cn/load_channel.jsp?'+url_in_content

            thisurls_list.append(nexturl)
        for url_to_visit in thisurls_list:
            for i in range(10):
                self.index_url_list.append(url_to_visit+str(i))



        def get_index_inside_movie(url):
            response2=get_response_and_text(url=url)
            response_in_function=response2['response_in_function']
            response_in_function_text=response2['response_in_function_text']
            if len(response_in_function_text)<10:
                return
            datasoup=BeautifulSoup(response_in_function_text,'lxml')
            for one_url in datasoup.select('body > div'):
                thisurl=one_url.select('h2 > a')[0].get('href')
                publish_user=one_url.select('a')[2].text
                title=one_url.select('a')[1].text
                try:
                    publish_time=one_url.select('a > span')[0].text
                except:
                    publish_time='00:00:00'  # these pages are irregularly formatted
                try:
                    publish_time_date=one_url.select('span')[1].text
                    if u'天前' in publish_time_date:
                        publish_time_date=publish_time_date.replace(u'天前','')
                        date_now=datetime.now()
                        date_now2=date_now-timedelta(days=int(publish_time_date))
                        publish_time_date=date_now2
                        publish_time_date=str(publish_time_date.strftime('%Y-%m-%d %H:%M'))
                    elif u'小时前' in publish_time_date:
                        publish_time_date = publish_time_date.replace(u'小时前', '')
                        date_now = datetime.now()
                        date_now2 = date_now - timedelta(hours=int(publish_time_date))
                        publish_time_date = date_now2
                        publish_time_date = str(publish_time_date.strftime('%Y-%m-%d %H:%M:%S'))
                    elif u'分钟前' in publish_time_date:
                        publish_time_date = publish_time_date.replace(u'分钟前', '')
                        date_now = datetime.now()
                        date_now2 = date_now - timedelta(minutes=int(publish_time_date))
                        publish_time_date = date_now2
                        publish_time_date = str(publish_time_date.strftime('%Y-%m-%d %H:%M:%S'))
                except Exception as e:
                    # print e
                    pass
                    try:
                        publish_time_date=one_url.select('span')[0].text
                    except Exception as e:
                        # print e, 'publish_time_date not found on either attempt, in the index video handler'
                        pass
                    try:
                        if len(one_url.select('span')[0].text)==10:
                            publish_time_date=one_url.select('span')[0].text
                        else:
                            continue
                    except:
                        continue
                publish_time=publish_time_date+' '+publish_time+':00'
                id=one_url.select('h2 > a')[0].get('id')
                try:
                    reply_count= one_url.select('span.trbszan')[0].text
                    if 'k' in reply_count:
                        reply_count=float(reply_count.replace('k',''))*1000
                except:
                    reply_count= 0
                video_urls=[]
                try:
                    video_urls1=datasoup.select('video source')
                    for i in video_urls1:
                        video_urls.append(i.get('src'))
                except Exception as e:
                    # print e
                    pass

                data_index={
                    'url':'http://m.thepaper.cn/'+thisurl,
                    'publish_user':publish_user,
                    'title':title,
                    'publish_time':publish_time,
                    'id':id,
                    'reply_count':reply_count,
                    'is_movie':True,
                    'spider_time':datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'video_urls':video_urls
                }
                self.content_data_list.append(data_index)

        def get_index_inside_wenben(url):
            user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
            headers = {
                'User-Agent': user_agent
            }
            response1 = get_response_and_text(url=url,headers=headers)
            response_in_function=response1['response_in_function']
            response_in_function_text=response1['response_in_function_text']
            if not response_in_function:
                return
            datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            for div_content in datasoup.select('body > div'):
                try:
                    try:
                        reply_count=div_content.select('span.reply')[0].text
                    except Exception as e:
                        # print e
                        reply_count=0  # some items show no comment count in the index response
                    url= 'http://m.thepaper.cn/' + div_content.select('div > a')[0].get('href')  # url
                    publish_time = div_content.select('p > span')[0].text  # publish_time
                    # does publish_time need further processing here?

                    title= div_content.select('div > p > a')[1].text  # title
                    publish_user= div_content.select('div > p > a')[0].text  # publish_user
                    # print div_content
                    if u'分钟' in publish_time:
                        minulate = publish_time.replace(u'分钟前', '')
                        time_b = datetime.now() - timedelta(minutes=int(minulate))
                        print time_b
                        time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                        publish_time= time_c
                    elif u'小时前' in publish_time:
                        hourse = publish_time.replace(u'小时前', '')
                        time_b = datetime.now() - timedelta(hours=int(hourse))
                        time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                        publish_time= time_c
                    elif u'天前' in publish_time:
                        days = publish_time.replace(u'天前', '')
                        time_b = datetime.now() - timedelta(days=int(days))
                        time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                        publish_time= time_c

                    print '\n\n\n'
                except Exception as e:
                    # print e
                    pass
                id=url.split('_')[-1]
                this_dict={
                    'id':id,
                    'url':url,
                    'publish_time':publish_time,
                    'title':title,
                    'publish_user':publish_user,
                    'is_movie':False,
                    'reply_count':reply_count,
                    'spider_time':datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                self.content_data_list.append(this_dict)


        threadlist=[]
        # self.index_url_list=self.index_url_list.reverse()
        the_index_url_list_here=self.index_url_list  # each pass starts over from self.index_url_list
        while the_index_url_list_here:  # stop once the index tasks are done and content_url_list is empty
            while the_index_url_list_here or threadlist:
                for threadi in threadlist:
                    if not threadi.is_alive():
                        threadlist.remove(threadi)
                while len(threadlist) < CONTENT_THREADING_NUM and the_index_url_list_here:
                    data_in_while = the_index_url_list_here.pop()
                    if 'http://www.thepaper.cn/load_index.jsp?' in data_in_while:
                        thread_in_while = threading.Thread(target=get_index_inside_movie, args=(data_in_while,))
                    else:
                        thread_in_while=threading.Thread(target=get_index_inside_wenben,args=(data_in_while,))
                    thread_in_while.start()
                    threadlist.append(thread_in_while)
                for childthread in threadlist:
                    childthread.join(600)


        # print 'waiting out those 600 seconds in the index stage'
        # time.sleep(1800)
        time.sleep(5*60)
        self.global_status_num_content=0

        while True:
            self.global_status_num_content=0
            time.sleep(5)
            if self.global_status_num_content==0:
                break
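
The thread management in get_Index above (prune finished threads, keep at most CONTENT_THREADING_NUM alive, join with a timeout) is a reusable pattern. A minimal sketch of the same loop as a standalone helper; the function name is illustrative, and CONTENT_THREADING_NUM is assumed to be defined elsewhere in the module.

import threading

def run_with_bounded_threads(tasks, worker, max_threads, join_timeout=600):
    # tasks: a list of arguments; worker: a callable taking one argument
    threads = []
    while tasks or threads:
        # drop workers that have already finished
        threads = [t for t in threads if t.is_alive()]
        # top up to the thread limit while work remains
        while tasks and len(threads) < max_threads:
            t = threading.Thread(target=worker, args=(tasks.pop(),))
            t.start()
            threads.append(t)
        # bounded wait for the current batch, as the original loop does
        for t in threads:
            t.join(join_timeout)

get_Index could call run_with_bounded_threads(the_index_url_list_here, get_index_inside_wenben, CONTENT_THREADING_NUM) for the text pages and the movie variant for the video feed.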
Пример #22
0
def get_content(data, result_queue):
    url = data['url']
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    basic_url = 'http://bts.gov.cn/'
    Re_sub_javascript2 = re.compile(r'<script[\S|\s]*?>[\s|\S]*?<\/script\>')
    Re_find_time = re.compile(r'(\d{4}\-\d{2}\-\d{2} \d{2}\:\d{2}\:\d{2})')
    Re_find_img = re.compile(r'src\=\"(.*?)\"')
    Re_find_cource = re.compile(ur'来源:(.*?) ')

    try:
        response1 = get_response_and_text(url=url,
                                          headers=headers,
                                          charset='utf-8')
        response_in_function = response1['response_in_function']
        if not response_in_function:
            return
        response_in_function_text = response1['response_in_function_text']
        response_in_function_text_dealed = Re_sub_javascript2.sub(
            '', response_in_function_text)
        datasoup = BeautifulSoup(response_in_function_text_dealed, 'lxml')
        title = datasoup.select(
            'body > div > div.detailMain.pageWidth > div.pargraph > h1'
        )[0].text
        content = ''
        for i in datasoup.select(
                'body > div > div.detailMain.pageWidth > div.pargraph  div.detailPar  p'
        ):
            content += i.text
        # print content
        source = Re_find_cource.findall(response_in_function_text_dealed)
        if source:
            source = source[0]
        else:
            source = ''
        content_str = datasoup.select(
            'body > div.mainBg > div.detailMain.pageWidth > div.pargraph > div.detailPar'
        )[0]
        content_str2 = str(content_str)
        img_urls = Re_find_img.findall(content_str2)
        img_urls2 = []
        for one_img_url in img_urls:
            img_url_dealed = urljoin(basic_url, one_img_url)
            img_urls2.append(img_url_dealed)

        publish_div = datasoup.select(
            'body > div > div.detailMain.pageWidth > div.pargraph > h6'
        )[0].text
        publish_time = Re_find_time.findall(publish_div)[0]

        data['content'] = content
        data['publish_time'] = publish_time
        data['img_urls'] = img_urls2
        data['source'] = source
        data['publish_user'] = ''

        result_queue.put(data)
    except Exception as e:
        print e
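
A hypothetical driver for get_content above, showing how a data dict and a result queue are wired together; the seed URL and extra fields are placeholders, not values from the original code.

from Queue import Queue  # Python 2; the module is named queue on Python 3

result_queue = Queue()
seed = {
    'url': 'http://bts.gov.cn/',  # placeholder; real detail-page URLs come from an index crawl
    'id': '',
    'spider_time': '',
}
get_content(seed, result_queue)
while not result_queue.empty():
    item = result_queue.get()
    print(item.get('publish_time'))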
Пример #24
0
        def get_content_in_wenda_comments_comments(id_replynodes, data=None):

            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
            }
            try:
                if not id_replynodes['next_comment_url']:
                    url_comments_more = 'https://www.wukong.com/wenda/web/comment/brow/?ansid=' + \
                                        id_replynodes['id'] + '&count=10&offset=0'

                    needproxy = int(random.randint(1, 10) / 7)

                    response1 = get_response_and_text(url=url_comments_more,
                                                      needproxy=needproxy)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

                else:
                    needproxy = int(random.randint(1, 10) / 7)

                    response1 = get_response_and_text(
                        url=id_replynodes['next_comment_url'],
                        headers=headers,
                        needproxy=needproxy)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

                # break
            except Exception as e:

                print e

            datajson_comment2 = json.loads(response_in_function_text)
            try:
                datajson_comment2['comments']
            except Exception as e:
                print e
            for comment2 in datajson_comment2['comments']:
                id = comment2['comment_id']
                like_count = comment2['digg_count']
                content = comment2['content']
                publish_user_id = comment2['user_info']['user_id']
                publish_user = comment2['user_info']['uname']
                publish_user_photo = comment2['user_info']['avatar_url']
                publish_time = comment2['create_time']
                publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(int(publish_time)))
                try:
                    ancestor_id = id_replynodes['ancestor_id']
                except Exception as e:
                    print e
                parent_id = id_replynodes['id']

                thisnode = {
                    'id': id,
                    'like_count': like_count,
                    'content': content,
                    'publish_user_id': publish_user_id,
                    'publish_user': publish_user,
                    'publish_user_photo': publish_user_photo,
                    'publish_time': publish_time,  # publish time
                    'parent_id': parent_id,
                    'ancestor_id': ancestor_id,
                    # 'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                id_replynodes['reply_nodes'].append(thisnode)
            if datajson_comment2['has_more']:
                url_offset = response_in_function.url.split('&offset=')
                offset = int(url_offset[1].split('&')[0]) + 10
                url = url_offset[0] + '&offset=' + str(offset)
                id_replynodes['next_comment_url'] = url
                reply_nodes2 = get_content_in_wenda_comments_comments(
                    id_replynodes)
                return reply_nodes2
            else:
                return id_replynodes['reply_nodes']
Пример #25
0
            def get_content_inside_no_movie(data):
                url_for_debug=data['url']
                vedio_list=[]
                respons1=get_response_and_text(url=url_for_debug)
                response_in_function=respons1['response_in_function']
                response_in_function_text=respons1['response_in_function_text']
                if not response_in_function:
                    return
                Re_find_img=re.compile(r'src=".*?"')
                datasoup=BeautifulSoup(response_in_function_text,'lxml')
                content=''
                img_urls=[]
                for content_in_soup in datasoup.select('#v3cont_id > div.news_content > div.news_part'):
                    content+=content_in_soup.text
                for content_in_soup in datasoup.select('#v3cont_id > div.news_content > div.news_part_father > div.news_part.news_part_limit > div'):
                    content+=content_in_soup.text
                try:
                    title=datasoup.select('#v3cont_id > div.news_content > h1')[0].text
                except:
                    # print response_in_function.url
                    pass
                    # title=''
                    return  # occasionally this lands on an unrelated page
                try:
                    publish_user= datasoup.select('#v3cont_id > div.news_content > p.about_news')[0].text
                except Exception as e:
                    # print e
                    pass
                try:
                    source=datasoup.select('#v3cont_id > div.news_content > p.about_news')[1].text.split(u'来源:')[1]
                except:
                    source=''
                try:
                    publish_time=datasoup.select('.news_content .about_news')[1].text.split(u'\xa0')[0]+':00'
                    data['publish_time']=publish_time
                except Exception as e:
                    # print e
                    pass
                for i in datasoup.select('source'):
                    url_vedio= i.get('src')
                    vedio_list.append(url_vedio)

                # publish_time= datasoup.select('#v3cont_id > div.news_content > p.about_news')[1].text.strip()  # there is still a puzzling stray space
                try:
                    datasoup_content=datasoup.select('#v3cont_id > div.news_content')[0]
                except Exception as e:
                    # print e
                    pass
                img_urls_original=Re_find_img.findall(str(datasoup_content))
                img_urls_selected_by_doup=datasoup_content.select('img')
                for url in img_urls_selected_by_doup:
                    print url.get('src')
                for url in img_urls_original:
                    url_split=url.split('"')[1]
                    img_urls.append(url_split)
                # if len(publish_time)> 17:
                #     publish_time=publish_time.split('\n')[0]

                #8-30
                like_count = datasoup.select('#news_praise')
                if like_count:
                    like_count_value = int(like_count[0].text.strip())
                else:
                    like_count_value = 0
                #8-30

                data['like_count']=like_count_value
                data['publish_user']=publish_user
                data['img_urls']=img_urls
                data['content']=content
                # data['publish_user']=publish_user
                # data['publish_time']=publish_time
                data['title']=title
                data['source']=source
                data['video_urls'] = vedio_list
                self.comments_url_list.append(data)