def parse(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.weibo(settings.SNWB_STORE, response.url)
    loader.add_value('path', path)
    self.load_text(response, loader)
    item = loader.load_item()
    yield item
    next_page = self.load_next(response)
    if next_page:
        yield scrapy.Request(next_page, dont_filter=True)
def parse_view(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.WY163_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h3/text()')
    loader.add_xpath('text', '//div[@class="feed-text"]/p/text()')
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.SBKK8_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    # try the article containers in order, falling back to older layouts
    ps = response.xpath('//div[@id="f_article"]//p')
    if not ps:
        ps = response.xpath('//div[@id="f_article"]/div')
    if not ps:
        ps = response.xpath('//div[@id="f_article"]')
    if not ps:
        ps = response.xpath('//div[@id="articleText"]//p')
    for p in ps:
        ts = p.xpath('.//text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    item = loader.load_item()
    # if ('text' not in item) or (item['text'] == ''):
    #     with open('url.txt', 'a') as url_file:
    #         url = response.url + '\n'
    #         url_file.write(url.encode('utf-8'))
    return item
def parse_spec(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.WY163_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    loader.add_xpath('text', '//h2/text()')
    loader.add_xpath('text', '//h3/text()')
    ps = response.xpath('//p')
    for p in ps:
        ts = p.xpath('./text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.baike(settings.BAIKE_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    loader.add_xpath('text', '//h2//text()')
    loader.add_xpath('text', '//h3//text()')
    ds = response.xpath('//div[@class="para"]')
    for d in ds:
        ts = d.xpath('.//text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.tianya(settings.TY_STORE, response)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1//text()')
    loader.add_xpath('text', '//div[@class="bbs-content"]/text()')
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.HAO1111_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//div[@class="article-title"]/h1/text()')
    # loader.add_xpath('title', '//div[@class="article-summary"]/text()')
    loader.add_xpath('text', '//div[@class="article-content"]/text()')
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.tieba(settings.TB_STORE, response)
    loader.add_value('path', path)
    loader.add_value('title', '')
    # main content
    # loader.add_xpath('text', '//div[re:test(@class, "d_post_content j_d_post_content[\s\S]*")]/text()')
    loader.add_xpath('text', '//div[contains(@class, "d_post_content j_d_post_content")]/text()')
    # comment content: comments sit as JSON-escaped strings in the raw page source,
    # so they are pulled out with regexes rather than XPath
    comnt_list = self.pat_fnd.findall(response.body.decode('utf-8'))
    for comnt in comnt_list:
        text = self.pat_sub.sub('', comnt)
        text = text.decode('raw_unicode_escape')  # or: text = eval('u"%s"' % text)
        loader.add_value('text', text)
    return loader.load_item()
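# parse_item above relies on two compiled regexes prepared elsewhere in the
# spider (not shown here). A hypothetical sketch of what they might look like,
# assuming tieba embeds comment bodies as JSON-escaped "content" strings in
# the page source:
import re

pat_fnd = re.compile(r'"content":"(.*?)"')  # pull each comment body out of the embedded JSON
pat_sub = re.compile(r'<[^>]*>')            # strip inline HTML tags from the body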
def parse_shtml(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.SOHU_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    ps = response.xpath('//div[@id="contentText"]//p')
    for p in ps:
        ts = p.xpath('.//text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    # old pages
    if not ps:
        ps = response.xpath('//div[@id="sohu_content"]/p/text()')
        for p in ps:
            text = p.extract()
            loader.add_value('text', text)
    return loader.load_item()
def parse_wx(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.TY2016_STORE, response.url)
    loader.add_value('path', path)
    loader.add_value('title', '')
    loader.add_xpath('text', '//p//text()')
    item = loader.load_item()
    # log pages that yielded no text so they can be re-crawled later
    if ('text' not in item) or (item['text'] == ''):
        with open('url.txt', 'a') as url_file:
            url = response.url + '\n'
            url_file.write(url.encode('utf-8'))
    return item
def parse(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.weibo(settings.SNWB_STORE, response.url)
    loader.add_value('path', path)
    self.load_text(response, loader)
    item = loader.load_item()
    yield item
    deny_serv = self.deny_serv(response)
    if deny_serv:
        # the server refused us; log in again and retry this URL
        yield self.snwblogin.login([response.url])
    else:
        next_page = self.load_next(response)
        # if next_page and ('text' in item):
        if next_page:
            log.msg('next page')
            yield scrapy.Request(next_page, dont_filter=True)
        else:
            follow = self.load_follow(response)
            log.msg('follow')
            log.msg(follow)
            yield scrapy.Request(follow, callback=self.parse_follow)
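# The weibo parse callback assumes several helpers that are not shown:
# load_text, deny_serv, load_next, load_follow and snwblogin. A minimal
# sketch of load_next, assuming the paging link carries a "next" class
# (hypothetical; the real selector depends on weibo's markup):
import urlparse

def load_next(self, response):
    hrefs = response.xpath('//a[@class="next"]/@href').extract()
    if hrefs:
        # resolve a possibly relative href against the current page URL
        return urlparse.urljoin(response.url, hrefs[0])
    return None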
def parse_shtml(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.BJCG_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//div[@class="main_xl_bt"]/text()')
    ps = response.xpath('//div[@class="main_xl_center"]//p')
    for p in ps:
        ts = p.xpath('.//text()').extract()
        text = ''.join(ts)
        text = self.pat_text.sub('', text)
        loader.add_value('text', text)
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.SINA_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    ps = response.xpath('//div[@id="artibody"]//p')
    if not ps:
        ps = response.xpath('//div[@id="article"]//p')
    for p in ps:
        ts = p.xpath('.//text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.BK_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    # ps = response.xpath('//div[@class="basic-info cmn-clearfix"]//dt')
    # ps = response.xpath('//div[@class="basic-info cmn-clearfix"]//dd')
    # for p in ps:
    #     ts = p.xpath('.//text()').extract()
    #     text = ''.join(ts)
    #     loader.add_value('text', text)
    loader.add_xpath('text', '//h2/span[@class="title-text"]/text()')
    loader.add_xpath('text', '//h3/span[@class="title-text"]/text()')
    ps = response.xpath('//div[@class="para"]')
    for p in ps:
        ts = p.xpath('.//text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.baidumusic(response.url)
    loader.add_value('path', path)
    text = self.getText(response)
    loader.add_value('text', text)
    return loader.load_item()
def parse_shtml(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.WY163_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    ps = response.xpath('//div[@id="endText"]/p')
    # old pages, e.g. http://news.163.com/05/0130/10/1BBB83S30001121Q.html
    if not ps:
        ps = response.xpath('//div[@id="text"]/p')
    if not ps:
        ps = response.xpath('//div[@id="content"]/p')
    for p in ps:
        ts = p.xpath('.//text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    return loader.load_item()
def parse_shtml(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.QQ_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    ps = response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@style="TEXT-INDENT: 2em"]')
    if not ps:
        ps = response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p')
    # old pages: replace_xpath drops the title collected above and
    # re-reads it from the legacy layout
    if not ps:
        loader.replace_xpath('title', '//div[@id="ArtTit"]/text()')
        ps = response.xpath('//div[@id="ArtCnt"]//p')
    if not ps:
        loader.replace_xpath('title', '//div[@id="ArticleTit"]/text()')
        ps = response.xpath('//div[@id="ArticleCnt"]//p')
    for p in ps:
        # skip paragraphs that only wrap inline scripts
        if p.xpath('./script'):
            continue
        ts = p.xpath('.//text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    return loader.load_item()
def parse_original(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.host(settings.QQ_STORE, response.url)
    loader.add_value('path', path)
    loader.add_xpath('title', '//h1/text()')
    loader.add_xpath('text', '//div[@class="daoyu"]//div[@class="intr"]/text()')
    loader.add_xpath('text', '//div[@id="articleContent"]/h2/text()')
    loader.add_xpath('text', '//div[@id="articleContent"]/h3/text()')
    loader.add_xpath('text', '//div[@id="articleContent"]/p/text()')
    loader.add_xpath('text', '//div[@class="jieyu"]//text()')
    return loader.load_item()
def parse_item(self, response):
    loader = TextLoader(item=TextItem(), response=response)
    path = self.pathextractor.zhihu(settings.ZH_STORE, response.url)
    loader.add_value('path', path)
    loader.add_value('title', '')
    loader.add_xpath('text', '//div[@id="zh-question-title"]//text()')
    loader.add_xpath('text', '//div[@id="zh-question-detail"]//text()')
    ps = response.xpath('//div[@class="zm-editable-content clearfix"]')
    for p in ps:
        ts = p.xpath('.//text()').extract()
        text = ''.join(ts)
        loader.add_value('text', text)
    return loader.load_item()
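# Every callback above assumes a TextItem with 'path', 'title' and 'text'
# fields, and a TextLoader that collapses repeated add_value/add_xpath calls
# into a single string. A minimal sketch using Scrapy's stock ItemLoader
# processors (hypothetical; the real definitions live in the project's items
# module and may differ):
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst

class TextItem(scrapy.Item):
    path = scrapy.Field()   # where the pipeline should store the text
    title = scrapy.Field()  # page title, may be empty
    text = scrapy.Field()   # extracted body text

class TextLoader(ItemLoader):
    default_item_class = TextItem
    path_out = TakeFirst()   # keep the single collected path
    title_out = TakeFirst()  # keep the first title match
    text_out = Join('\n')    # join collected fragments with newlines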