def crawl(self):
    """Crawl one article page from cqn.com and export it as a ZjldArticleModel."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    # Article body lives in the 'Index_ShowDetail_Content' div; clear_label
    # normalizes markup/links against the page URL.
    body = clear_label(dom.find_all('div', 'Index_ShowDetail_Content'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@class='Index_ShowDetail_Title']/h1/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='Index_ShowDetail_Time']//text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'全国',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'publisher': u'中国质量新闻网',
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': 'cqn',
        'source_type': u'中国质量新闻网',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Zhejiang quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('div', 'contaner_nr'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@class='contaner']/div[@class='contaner_bt']/text()")
    # Publish time and author are both parsed from the same metadata line.
    meta_xpath = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
    pubtime = HandleContent.get_pubtime(raw_html, xpath=meta_xpath)
    author = HandleContent.get_author(raw_html, xpath=meta_xpath)
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'浙江',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'zjbts',
        'publisher': u'浙江质监局',
        'source_type': u'质监局',
        'author': author,
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl one article page from cqn.com and export it as a ZjldArticleModel."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    # Article body lives in the 'Index_ShowDetail_Content' div; clear_label
    # normalizes markup/links against the page URL.
    body = clear_label(dom.find_all('div', 'Index_ShowDetail_Content'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@class='Index_ShowDetail_Title']/h1/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='Index_ShowDetail_Time']//text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'全国',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'publisher': u'中国质量新闻网',
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': 'cqn',
        'source_type': u'中国质量新闻网',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Zhejiang quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('div', 'contaner_nr'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@class='contaner']/div[@class='contaner_bt']/text()")
    # Publish time and author are both parsed from the same metadata line.
    meta_xpath = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
    pubtime = HandleContent.get_pubtime(raw_html, xpath=meta_xpath)
    author = HandleContent.get_author(raw_html, xpath=meta_xpath)
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'浙江',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'zjbts',
        'publisher': u'浙江质监局',
        'source_type': u'质监局',
        'author': author,
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a WeChat article reached via sogou search and export it.

    Exports a SearchArticleModel when this task carries search metadata
    (``data['key']``), otherwise a WeixinArticleModel.
    """
    homepage = self.key
    data = self.data
    raw_html = _get_url(homepage)
    dom = HandleContent.get_BScontext(raw_html)
    # Body text plus the thumbnail wrapper both belong to the article content.
    body = dom.find_all('div', class_=['rich_media_content', 'rich_media_thumb_wrp'])
    # Whitespace inside an XPath is insignificant; the originals were wrapped
    # with backslash continuations and are normalized here.
    xp_title = ("//div[@class='rich_media_area_primary']/"
                "h2[@class='rich_media_title']/text()")
    xp_putime = "//div/em[@class='rich_media_meta rich_media_meta_text']/text()"
    xp_author = "//div/em[@class='rich_media_meta rich_media_meta_text'][2]/text()"
    xp_publisher = "//div/a[@id='post-user']/text()"
    title = HandleContent.get_title(raw_html, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(raw_html, xpath=xp_putime)
    author = HandleContent.get_author(raw_html, xpath=xp_author)
    publisher = HandleContent.get_author(raw_html, xpath=xp_publisher)
    body = clear_label(body, root=homepage)
    text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(text)}
    date = new_time()
    # Fixed: removed a dead `crawl_data = {}` assignment that was immediately
    # overwritten, and use the local `data` alias consistently instead of
    # mixing it with `self.data`.
    crawl_data = {
        'province': data.get('province', ''),
        'city': data.get('city', ''),
        'district': data.get('district', ''),
        'url': homepage,
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': 'sogou',
        'author': author,
        'publisher': data.get('publisher', publisher),
        'origin_source': u'微信公共账号',
        'type': u'微信',
        'comment': comment
    }
    if data.get('key'):
        # Search-driven crawl: merge search metadata into the record.
        crawl_data.update(data)
        model = SearchArticleModel(crawl_data)
    else:
        model = WeixinArticleModel(crawl_data)
    export(model)
def crawl(self):
    """Crawl a WeChat article reached via sogou search and export it.

    Exports a SearchArticleModel when this task carries search metadata
    (``data['key']``), otherwise a WeixinArticleModel.
    """
    homepage = self.key
    data = self.data
    raw_html = _get_url(homepage)
    dom = HandleContent.get_BScontext(raw_html)
    # Body text plus the thumbnail wrapper both belong to the article content.
    body = dom.find_all('div', class_=['rich_media_content', 'rich_media_thumb_wrp'])
    # Whitespace inside an XPath is insignificant; the originals were wrapped
    # with backslash continuations and are normalized here.
    xp_title = ("//div[@class='rich_media_area_primary']/"
                "h2[@class='rich_media_title']/text()")
    xp_putime = "//div/em[@class='rich_media_meta rich_media_meta_text']/text()"
    xp_author = "//div/em[@class='rich_media_meta rich_media_meta_text'][2]/text()"
    xp_publisher = "//div/a[@id='post-user']/text()"
    title = HandleContent.get_title(raw_html, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(raw_html, xpath=xp_putime)
    author = HandleContent.get_author(raw_html, xpath=xp_author)
    publisher = HandleContent.get_author(raw_html, xpath=xp_publisher)
    body = clear_label(body, root=homepage)
    text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(text)}
    date = new_time()
    # Fixed: removed a dead `crawl_data = {}` assignment that was immediately
    # overwritten, and use the local `data` alias consistently instead of
    # mixing it with `self.data`.
    crawl_data = {
        'province': data.get('province', ''),
        'city': data.get('city', ''),
        'district': data.get('district', ''),
        'url': homepage,
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': 'sogou',
        'author': author,
        'publisher': data.get('publisher', publisher),
        'origin_source': u'微信公共账号',
        'type': u'微信',
        'comment': comment
    }
    if data.get('key'):
        # Search-driven crawl: merge search metadata into the record.
        crawl_data.update(data)
        model = SearchArticleModel(crawl_data)
    else:
        model = WeixinArticleModel(crawl_data)
    export(model)
def crawl(self):
    """Crawl a Guangzhou quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('td', id='td_news_content'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//tr/td[@class='content-title']/div/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//tr/td[@class='bottom-line-gray']/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'广东',
        'city': u'广州',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'gzq',
        'publisher': u'广东广州质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Foshan quality-supervision bureau article page and export it."""
    page_url = self.key
    task_data = self.data
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('div', id='right-text_d'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@id='right-title_d']//text()")
    # Fallback to the page-extracted time when the task carries no pubtime.
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='article']/p[@class='info']/span/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'广东',
        'city': u'佛山',
        'title': title,
        'content': body,
        'pubtime': task_data.get('pubtime', pubtime),
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'fsjsjd',
        'publisher': u'广东佛山质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Foshan quality-supervision bureau article page and export it."""
    page_url = self.key
    task_data = self.data
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('div', id='right-text_d'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@id='right-title_d']//text()")
    # Fallback to the page-extracted time when the task carries no pubtime.
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='article']/p[@class='info']/span/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'广东',
        'city': u'佛山',
        'title': title,
        'content': body,
        'pubtime': task_data.get('pubtime', pubtime),
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'fsjsjd',
        'publisher': u'广东佛山质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Guangzhou quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('td', id='td_news_content'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//tr/td[@class='content-title']/div/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//tr/td[@class='bottom-line-gray']/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'广东',
        'city': u'广州',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'gzq',
        'publisher': u'广东广州质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Hubei quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    # The article text and its attachment list live in two sibling divs.
    body = clear_label(dom.find_all('div', ['article-box', 'files']), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@class='article']/h2/text()|//h3/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='article']/p[@class='info']/span/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'湖北',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'hbzljd',
        'publisher': u'湖北质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Shandong quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('td', 'conzt'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html,
        xpath="//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()")
    # Publish time and author come from the same table cell; the author is
    # picked out by its u'来源:' prefix.
    meta_xpath = "//table[@class='normal']/tbody/tr[3]/td/text()"
    pubtime = HandleContent.get_pubtime(raw_html, xpath=meta_xpath)
    author = HandleContent.get_author(raw_html, xpath=meta_xpath,
                                      xp_text=u'来源:')
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'山东',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'sdqts',
        'publisher': u'山东质监局',
        'source_type': u'质监局',
        'author': author,
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Hubei quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    # The article text and its attachment list live in two sibling divs.
    body = clear_label(dom.find_all('div', ['article-box', 'files']), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@class='article']/h2/text()|//h3/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='article']/p[@class='info']/span/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'湖北',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'hbzljd',
        'publisher': u'湖北质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl an AQSIQ (national quality inspection) article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('div', 'TRS_Editor'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//tr/td[@align='center']/h1/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='xj2']/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'全国',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'aqsiq',
        'publisher': u'国家质量监督检验检疫总局',
        'source_type': u'国家质量监督检验检疫总局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Hangzhou quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('span', 'ny'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//tr/td[@class='dhz']/span/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//td/table/tbody/tr/td[@align='center']/span/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'浙江',
        'city': u'杭州',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'hzqts',
        'publisher': u'浙江杭州质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Hangzhou quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('span', 'ny'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//tr/td[@class='dhz']/span/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//td/table/tbody/tr/td[@align='center']/span/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'浙江',
        'city': u'杭州',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'hzqts',
        'publisher': u'浙江杭州质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl an AQSIQ (national quality inspection) article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('div', 'TRS_Editor'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//tr/td[@align='center']/h1/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='xj2']/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'全国',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'aqsiq',
        'publisher': u'国家质量监督检验检疫总局',
        'source_type': u'国家质量监督检验检疫总局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Shandong quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('td', 'conzt'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html,
        xpath="//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()")
    # Publish time and author come from the same table cell; the author is
    # picked out by its u'来源:' prefix.
    meta_xpath = "//table[@class='normal']/tbody/tr[3]/td/text()"
    pubtime = HandleContent.get_pubtime(raw_html, xpath=meta_xpath)
    author = HandleContent.get_author(raw_html, xpath=meta_xpath,
                                      xp_text=u'来源:')
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'山东',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'sdqts',
        'publisher': u'山东质监局',
        'source_type': u'质监局',
        'author': author,
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Crawl a Fujian quality-supervision bureau article page and export it."""
    page_url = self.key
    raw_html = _get_url(page_url)
    dom = HandleContent.get_BScontext(raw_html)
    body = clear_label(dom.find_all('li', 'show_con'), root=page_url)
    plain_text = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain_text)}
    title = HandleContent.get_title(
        raw_html, xpath="//div[@class='xl_content']/h1/text()")
    pubtime = HandleContent.get_pubtime(
        raw_html, xpath="//div[@class='xl_content']/div[@class='time']/text()")
    now = new_time()
    crawl_data = {
        'url': page_url,
        'province': u'福建',
        'title': title,
        'content': body,
        'pubtime': pubtime,
        'crtime_int': now.get('crtime_int'),
        'crtime': now.get('crtime'),
        'source': u'fjqi',
        'publisher': u'福建质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))