Пример #1
0
    def process_detail(self, response):
        """Parse one news detail page and yield the assembled record.

        Pulls list-page fields from ``response.request.meta``, extracts the
        source name and article body from the page, strips presentation
        attributes from the body, and pushes the record to the
        ``database`` and ``console`` pipelines.
        """
        soup = bs(response.m_response.content, 'lxml')
        result = dict()
        # Unique id: second-resolution timestamp plus a sequence suffix to
        # avoid collisions within the same second.
        result['newsProductId'] = time.strftime(
            '%Y%m%d%H%M%S', time.localtime(time.time())) + SeqUtil.get_seq()
        result['newsCateId'] = response.request.meta['newsCateId']
        result['name'] = response.request.meta['name']
        result['imageUrl'] = response.request.meta['img_name']
        result['shortDes'] = response.request.meta['shortDes']
        result['createTime'] = response.request.meta['createTime']
        result['newsFromWebUrl'] = response.request.url

        # Default source; overwritten when an explicit "来源:" span exists.
        # (Also covers the case where no spans are found at all.)
        result['newsFrom'] = '互联网'
        for span in soup.select('div.article-header p.text-gray-9 span'):
            if '来源:' in span.text:
                result['newsFrom'] = span.text.replace('来源:', '').strip()
                break

        longDes = soup.select('div.article-content')[0]

        # Strip all attributes except image sources.  Iterate over a
        # snapshot of the keys so deletion is safe; dict.iterkeys() was
        # Python 2 only and raises AttributeError on Python 3.
        for tag in longDes.find_all():
            for key in list(tag.attrs):
                if key != 'src':
                    del tag.attrs[key]

        result['longDes'] = str(longDes)

        yield pipeItem(['database', 'console'], result)
Пример #2
0
    def process_detail(self, response):
        """Parse a zhue.com.cn news detail page and yield the record.

        Reads list-page fields from ``response.request.meta``, scrapes the
        source/time line and the article body, removes an embedded ad image
        and presentation attributes, then pushes the record to the
        ``database`` and ``console`` pipelines.
        """
        soup = bs(response.m_response.content, 'lxml')
        result = dict()
        # Unique id: second-resolution timestamp plus a sequence suffix.
        result['newsProductId'] = time.strftime(
            '%Y%m%d%H%M%S', time.localtime(time.time())) + SeqUtil.get_seq()
        result['newsCateId'] = response.request.meta['newsCateId']
        result['name'] = response.request.meta['name']
        result['imageUrl'] = response.request.meta['img_name']
        result['shortDes'] = response.request.meta['shortDes']
        result['newsFromWebUrl'] = response.request.url
        # The p.writ line carries "来源:<source>" and "时间:<time>" spans.
        result['newsFrom'] = soup.select('p.writ span')[1].text.replace(
            '来源:', '')
        result['createTime'] = soup.select('p.writ span')[2].text.replace(
            '时间:', '')
        longDes = soup.select('div#art_content')[0]

        # Remove the site's embedded ad image.  The attribute value must be
        # quoted: an unquoted value containing ':' and '/' is invalid CSS
        # selector syntax and makes select() raise.
        adv_list = longDes.select(
            'img[src="http://www.zhue.com.cn/images/zhue888.jpg"]')
        for adv in adv_list:
            adv.decompose()

        # Strip all attributes except image sources.  Iterate a snapshot of
        # the keys so deletion is safe; dict.iterkeys() was Python 2 only
        # and raises AttributeError on Python 3.
        for tag in longDes.find_all():
            for key in list(tag.attrs):
                if key != 'src':
                    del tag.attrs[key]

        result['longDes'] = str(longDes)

        yield pipeItem(['database', 'console'], result)
Пример #3
0
    def process_detail(self, response):
        """Parse a gengzhongbang.com news detail page and yield the record.

        Reads list-page fields from ``response.request.meta``, extracts the
        article body (a <td> renamed to <div> for uniform storage), strips
        presentation attributes, and absolutizes image URLs before pushing
        to the ``database`` and ``console`` pipelines.
        """
        soup = bs(response.m_response.content, 'lxml')
        result = dict()
        # Unique id: second-resolution timestamp plus a sequence suffix.
        result['newsProductId'] = time.strftime(
            '%Y%m%d%H%M%S', time.localtime(time.time())) + SeqUtil.get_seq()
        result['newsCateId'] = response.request.meta['newsCateId']
        result['name'] = response.request.meta['name']
        result['imageUrl'] = response.request.meta['img_name']
        result['shortDes'] = response.request.meta['shortDes']
        result['createTime'] = response.request.meta['createTime']
        result['newsFromWebUrl'] = response.request.url
        # This site publishes no explicit source; use the generic fallback.
        result['newsFrom'] = '互联网'
        longDes = soup.select('td#article_content')[0]
        longDes.name = 'div'

        # Strip all attributes except image sources, and make those
        # absolute.  Iterate a snapshot of the keys so deletion is safe;
        # dict.iterkeys() was Python 2 only and raises AttributeError on
        # Python 3.
        for tag in longDes.find_all():
            for key in list(tag.attrs):
                if key != 'src':
                    del tag.attrs[key]
                else:
                    # NOTE(review): prefixes unconditionally — an already
                    # absolute src would be double-prefixed; presumably this
                    # site only emits relative paths. Verify against pages.
                    tag.attrs[
                        key] = 'http://www.gengzhongbang.com/' + tag.attrs[key]

        result['longDes'] = str(longDes)

        yield pipeItem(['database', 'console'], result)
Пример #4
0
 def process_paper(self, response):
     """Parse a DBLP-style volume listing and yield one record per paper.

     The page header text is "<category>, ..."; the volume number is the
     first integer in the final path segment of the ``paperFrom`` URL.
     Each ``li.entry.article`` item contributes title, authors, and URL.
     """
     soup = BeautifulSoup(response.m_response.content, 'html.parser')
     catory = soup.find('header').text.split(',')[0]
     # e.g. ".../tr99.html" -> segment "tr99" -> volume 99.
     last_segment = response.request.meta['paperFrom'].split('/')[-1]
     stem = last_segment.split('.')[0]
     volume = int(re.findall(r"\d+", stem)[0])
     for item in soup.find_all('li', class_="entry article"):
         atag = item.find('div', class_='head').find('a')
         if atag is None:
             # A generator's return takes no value; a malformed item stops
             # the whole listing (matches the original early-exit intent).
             return
         paperurl = atag.get('href')
         span_list = item.find('article', class_="data").find_all('span')
         title = item.find('span', class_='title').text
         # The last two spans are not author names; drop them.
         span_list.pop()
         span_list.pop()
         authors = ""
         for author in span_list:
             authors = authors + author.text + ";"
         result = dict()
         result['title'] = title
         result['authors'] = authors
         result['paperUrl'] = paperurl
         result['catory'] = catory
         result['volume'] = volume
         yield pipeItem(['database'], result)
Пример #5
0
    def process_detail(self, response):
        """Extract the source name and publish time from a detail page.

        The first ``p.zxxw2`` paragraph reads "来源: <source> <date> <time>|";
        the "来源:" label (with or without a trailing space) is dropped and
        the remainder split on single spaces.
        """
        soup = bs(response.m_response.content, 'lxml')

        raw = soup.select('div.zxxwleft p.zxxw2')[0].text
        parts = raw.replace('来源: ', '').replace('来源:', '').split(' ')

        source = parts[0].strip()
        stamp = parts[1].strip() + ' ' + parts[2].strip().replace('|', '')

        result = {
            'date_time': stamp,
            'newsFrom': source,
        }

        yield pipeItem(['console'], result)
Пример #6
0
    def process_detail(self, response):
        """Parse one news detail page and yield the record to the console pipe.

        Pulls list-page fields from ``response.request.meta``, extracts the
        source name and the raw article body, and pushes the record to the
        ``console`` pipeline.
        """
        soup = bs(response.m_response.content, 'lxml')
        result = dict()
        # Id is a second-resolution timestamp (no sequence suffix here).
        result['newsProductId'] = time.strftime('%Y%m%d%H%M%S',
                                                time.localtime(time.time()))
        result['newsCateId'] = response.request.meta['newsCateId']
        result['name'] = response.request.meta['name']
        result['imageUrl'] = response.request.meta['img_name']
        result['shortDes'] = response.request.meta['shortDes']
        result['createTime'] = response.request.meta['createTime']
        result['newsFromWebUrl'] = response.request.url

        # Default source; overwritten when an explicit "来源:" span exists.
        # (Also covers the case where no spans are found at all.)
        result['newsFrom'] = '互联网'
        for span in soup.select('div.article-header p.text-gray-9 span'):
            if '来源:' in span.text:
                result['newsFrom'] = span.text.replace('来源:', '').strip()
                break

        longDes = soup.select('div.article-content')[0]

        result['longDes'] = str(longDes)

        yield pipeItem(['console'], result)
Пример #7
0
 def process_pic(self, response):
     """Forward a downloaded image and its target file name to the pic pipe."""
     payload = {
         'content': response.m_response.content,
         'name': response.request.meta['img_name'],
     }
     yield pipeItem(['pic'], payload)
Пример #8
0
 def process_pic(self, response):
     """Hand the raw downloaded response body to the save pipeline."""
     raw_body = response.m_response.content
     yield pipeItem(['save'], raw_body)