예제 #1
0
    def crawl(self):
        """Fetch one WeChat article page and export it as an article model.

        The page URL is read from ``self.key``.  Title, publish time, author
        and publisher are extracted from the raw HTML through XPath
        expressions; the article body comes from the ``div`` elements whose
        class is ``rich_media_content`` or ``rich_media_thumb_wrp``.

        When ``self.data`` carries a truthy ``'key'`` entry, every entry of
        ``self.data`` is merged over the crawled fields and the record is
        wrapped in ``SearchArticleModel``; otherwise it is wrapped in
        ``WeixinArticleModel``.  The model is then passed to ``export``.
        """
        homepage = self.key
        data = self.data
        html_stream = _get_url(homepage)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all(
            'div', class_=['rich_media_content', 'rich_media_thumb_wrp'])

        # Adjacent string literals are joined at compile time.  Unlike a
        # backslash continuation inside the quotes, this keeps the source
        # indentation out of the XPath expressions.
        xp_title = ("//div[@class='rich_media_area_primary']/"
                    "h2[@class='rich_media_title']/text()")
        xp_putime = ("//div/em[@class='rich_media_meta rich_media_meta_text']"
                     "/text()")
        xp_author = ("//div/em[@class='rich_media_meta rich_media_meta_text']"
                     "[2]/text()")
        xp_publisher = "//div/a[@id='post-user']/text()"

        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        # The publisher is extracted with the same helper as the author,
        # only the XPath differs.
        publisher = HandleContent.get_author(html_stream, xpath=xp_publisher)

        # 'content' keeps the cleaned markup, while the comment carries the
        # whitespace-normalised plain text derived from it.
        content = clear_label(content, root=homepage)
        text = HandleContent.get_BScontext(content, text=True).text
        comment = {'content': clear_space(text)}

        date = new_time()
        crawl_data = {
            'province': data.get('province', ''),
            'city': data.get('city', ''),
            'district': data.get('district', ''),
            'url': homepage,
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'sogou',
            'author': author,
            # A publisher supplied by the task data wins over the scraped one.
            'publisher': data.get('publisher', publisher),
            'origin_source': u'微信公共账号',
            'type': u'微信',
            'comment': comment,
        }

        if data.get('key'):
            # Task data overrides any crawled field of the same name.
            crawl_data.update(data)
            model = SearchArticleModel(crawl_data)
        else:
            model = WeixinArticleModel(crawl_data)

        export(model)
예제 #2
0
파일: sogou.py 프로젝트: xxguo/crawler
    def crawl(self):
        """Crawl the WeChat article at ``self.key`` and export the result.

        Steps, in order:

        1. download the page with ``_get_url``;
        2. select the body ``div`` elements (classes ``rich_media_content``
           and ``rich_media_thumb_wrp``);
        3. extract title, publish time, author and publisher by XPath;
        4. clean the body with ``clear_label`` and derive its plain text;
        5. build the record and export it as ``SearchArticleModel`` if
           ``self.data`` has a truthy ``'key'`` (after merging ``self.data``
           over the record), else as ``WeixinArticleModel``.
        """
        homepage = self.key
        data = self.data
        html_stream = _get_url(homepage)
        soup = HandleContent.get_BScontext(html_stream)
        body_classes = ['rich_media_content', 'rich_media_thumb_wrp']
        content = soup.find_all('div', class_=body_classes)

        # Built from adjacent literals so that no indentation whitespace ends
        # up inside the XPath expressions (a backslash continuation inside a
        # string literal would include it).
        meta_em = "//div/em[@class='rich_media_meta rich_media_meta_text']"
        xp_title = ("//div[@class='rich_media_area_primary']/"
                    "h2[@class='rich_media_title']/text()")
        xp_putime = meta_em + "/text()"
        xp_author = meta_em + "[2]/text()"
        xp_publisher = "//div/a[@id='post-user']/text()"

        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        # get_author is reused for the publisher; only the XPath changes.
        publisher = HandleContent.get_author(html_stream, xpath=xp_publisher)

        # Cleaned markup goes into 'content'; its plain text, with spacing
        # normalised by clear_space, goes into the comment.
        content = clear_label(content, root=homepage)
        text = HandleContent.get_BScontext(content, text=True).text
        comment = {'content': clear_space(text)}

        date = new_time()
        crawl_data = {
            'province': data.get('province', ''),
            'city': data.get('city', ''),
            'district': data.get('district', ''),
            'url': homepage,
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'sogou',
            'author': author,
            # Prefer the publisher given in the task data, if any.
            'publisher': data.get('publisher', publisher),
            'origin_source': u'微信公共账号',
            'type': u'微信',
            'comment': comment,
        }

        if data.get('key'):
            # Entries from the task data replace crawled fields of equal name.
            crawl_data.update(data)
            model = SearchArticleModel(crawl_data)
        else:
            model = WeixinArticleModel(crawl_data)

        export(model)
예제 #3
0
    def crawl(self):
        """Scrape the article page at ``self.key`` and export it.

        The body is taken from the elements returned by
        ``find_all('div', 'contaner_nr')``; title, publish time and author
        are extracted from the raw HTML by XPath.  Publish time and author
        are both read from the same ``contaner_ly`` node.  The finished
        record is wrapped in ``ZjldArticleModel`` and passed to ``export``.
        """
        # Publish time and author use one and the same expression.
        title_xpath = "//div[@class='contaner']/div[@class='contaner_bt']/text()"
        meta_xpath = "//div[@class='contaner']/div[@class='contaner_ly']/text()"

        page_url = self.key
        raw_html = _get_url(page_url)

        # Article body: matched nodes -> cleaned markup -> plain text.
        body_nodes = HandleContent.get_BScontext(raw_html).find_all(
            'div', 'contaner_nr')
        cleaned_body = clear_label(body_nodes, root=page_url)
        plain_text = HandleContent.get_BScontext(cleaned_body, text=True).text
        comment = {'content': clear_space(plain_text)}

        title = HandleContent.get_title(raw_html, xpath=title_xpath)
        pubtime = HandleContent.get_pubtime(raw_html, xpath=meta_xpath)
        author = HandleContent.get_author(raw_html, xpath=meta_xpath)
        stamp = new_time()

        # NOTE(review): 'origin_source' is not part of this record (it was
        # commented out before) -- confirm whether the model needs it.
        record = {
            'url': page_url,
            'province': u'浙江',
            'title': title,
            'content': cleaned_body,
            'pubtime': pubtime,
            'crtime_int': stamp.get('crtime_int'),
            'crtime': stamp.get('crtime'),
            'source': u'zjbts',
            'publisher': u'浙江质监局',
            'source_type': u'质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        export(ZjldArticleModel(record))
예제 #4
0
파일: zjbts.py 프로젝트: xxguo/crawler
    def crawl(self):
        """Download, parse and export one article identified by ``self.key``.

        Produces a ``ZjldArticleModel`` whose record holds the cleaned body
        markup (``'content'``), its plain text (``'comment'``), the title,
        publish time and author extracted by XPath, the creation timestamps
        from ``new_time`` and a set of fixed source/publisher labels.
        """
        target = self.key
        markup = _get_url(target)

        # Body: clean the matched <div> nodes, then flatten them to text.
        parsed = HandleContent.get_BScontext(markup)
        article_html = clear_label(
            parsed.find_all('div', 'contaner_nr'), root=target)
        article_text = HandleContent.get_BScontext(
            article_html, text=True).text
        comment = {}
        comment['content'] = clear_space(article_text)

        # Metadata: publish time and author are read from the same node.
        container = "//div[@class='contaner']"
        title = HandleContent.get_title(
            markup, xpath=container + "/div[@class='contaner_bt']/text()")
        pubtime = HandleContent.get_pubtime(
            markup, xpath=container + "/div[@class='contaner_ly']/text()")
        author = HandleContent.get_author(
            markup, xpath=container + "/div[@class='contaner_ly']/text()")

        created = new_time()
        # NOTE(review): no 'origin_source' entry is emitted (the key was
        # commented out previously) -- confirm this is still wanted.
        payload = {
            'url': target,
            'province': u'浙江',
            'title': title,
            'content': article_html,
            'pubtime': pubtime,
            'crtime_int': created.get('crtime_int'),
            'crtime': created.get('crtime'),
            'source': u'zjbts',
            'publisher': u'浙江质监局',
            'source_type': u'质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        article = ZjldArticleModel(payload)
        export(article)
예제 #5
0
    def crawl(self):
        """Scrape the article page at ``self.key`` and export it.

        The body is taken from the elements returned by
        ``find_all('td', 'conzt')``.  The title is the ``h1`` preceding the
        ``sub_title`` paragraph; publish time and author are both read from
        the third row of the ``normal`` table.  The record is wrapped in
        ``ZjldArticleModel`` and passed to ``export``.
        """
        title_xpath = "//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()"
        # One table cell feeds both the publish time and the author.
        meta_xpath = "//table[@class='normal']/tbody/tr[3]/td/text()"

        page_url = self.key
        raw_html = _get_url(page_url)

        # Article body: matched nodes -> cleaned markup -> plain text.
        body_nodes = HandleContent.get_BScontext(raw_html).find_all(
            'td', 'conzt')
        cleaned_body = clear_label(body_nodes, root=page_url)
        plain_text = HandleContent.get_BScontext(cleaned_body, text=True).text
        comment = {'content': clear_space(plain_text)}

        title = HandleContent.get_title(raw_html, xpath=title_xpath)
        pubtime = HandleContent.get_pubtime(raw_html, xpath=meta_xpath)
        # xp_text is the label "source:" as it appears on the page;
        # presumably get_author uses it to find the author inside the cell
        # text -- TODO confirm against HandleContent.
        author = HandleContent.get_author(
            raw_html, xpath=meta_xpath, xp_text=u'来源:')
        stamp = new_time()

        # NOTE(review): 'city' and 'origin_source' are not part of this
        # record (both were commented out before) -- confirm that is intended.
        record = {
            'url': page_url,
            'province': u'山东',
            'title': title,
            'content': cleaned_body,
            'pubtime': pubtime,
            'crtime_int': stamp.get('crtime_int'),
            'crtime': stamp.get('crtime'),
            'source': u'sdqts',
            'publisher': u'山东质监局',
            'source_type': u'质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        export(ZjldArticleModel(record))
예제 #6
0
파일: sdqts.py 프로젝트: xxguo/crawler
    def crawl(self):
        """Download, parse and export one article identified by ``self.key``.

        Produces a ``ZjldArticleModel`` whose record holds the cleaned body
        markup (``'content'``), its plain text (``'comment'``), the title,
        publish time and author extracted by XPath, the creation timestamps
        from ``new_time`` and a set of fixed source/publisher labels.
        """
        target = self.key
        markup = _get_url(target)

        # Body: clean the matched <td> nodes, then flatten them to text.
        parsed = HandleContent.get_BScontext(markup)
        article_html = clear_label(
            parsed.find_all('td', 'conzt'), root=target)
        article_text = HandleContent.get_BScontext(
            article_html, text=True).text
        comment = {}
        comment['content'] = clear_space(article_text)

        # Metadata: publish time and author come from the same table cell.
        info_cell = "//table[@class='normal']/tbody/tr[3]/td/text()"
        title = HandleContent.get_title(
            markup,
            xpath="//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()")
        pubtime = HandleContent.get_pubtime(markup, xpath=info_cell)
        # The xp_text label reads "source:"; how get_author applies it is
        # not visible here -- verify against HandleContent.
        author = HandleContent.get_author(
            markup, xpath=info_cell, xp_text=u'来源:')

        created = new_time()
        # NOTE(review): neither 'city' nor 'origin_source' is emitted (both
        # keys were commented out previously) -- confirm this is still wanted.
        payload = {
            'url': target,
            'province': u'山东',
            'title': title,
            'content': article_html,
            'pubtime': pubtime,
            'crtime_int': created.get('crtime_int'),
            'crtime': created.get('crtime'),
            'source': u'sdqts',
            'publisher': u'山东质监局',
            'source_type': u'质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        article = ZjldArticleModel(payload)
        export(article)