def parse_article(self, response):
    """Build an ``ArticleSpiderItem`` from an article page and yield it.

    The item is populated through an ``ItemLoader`` bound to ``response``:

    * ``title``     - text nodes of every ``<h1>``
    * ``content``   - text of ``<p>`` children of ``div.content``
    * ``key_words`` - ``content`` attribute of the ``keywords`` meta tag
    * ``ref``       - the URL of the response
    * ``url_md5``   - the result of ``get_md5`` applied to that URL

    Args:
        response: the response object handed to this callback; must expose
            ``url`` and be accepted by ``ItemLoader``.

    Yields:
        The single item produced by ``ItemLoader.load_item()``.
    """
    page_url = response.url
    # Hash first, before the loader is constructed, as the original did.
    page_digest = get_md5(page_url)

    loader = ItemLoader(item=ArticleSpiderItem(), response=response)

    # (field name, XPath expression) pairs, applied in this order.
    xpath_fields = (
        ('title', '//h1/text()'),
        ('content', "//div[@class='content']/p/text()"),
        ('key_words', "//meta[@name='keywords']/@content"),
    )
    for field_name, expression in xpath_fields:
        loader.add_xpath(field_name, expression)

    loader.add_value('ref', page_url)
    loader.add_value('url_md5', page_digest)

    yield loader.load_item()
def parse_article(self, response):
    """Build an ``ArmsSpiderItem`` from a detail page and yield it.

    The item is populated through an ``ItemLoader`` bound to ``response``:

    * ``src``          - ``src`` attribute of the image inside ``div.maxPic``
    * ``content``      - all text under ``div.intron > div.module``
    * ``ycg``          - link text inside ``span.country`` of ``div.maxPic``
    * ``datainfo``     - text of the ``<li>`` entries (and their ``<span>``
      children) in the first ``<ul>`` of ``div.dataInfo``
    * ``othercontent`` - all text under ``div.info > div.module``
    * ``suffix``       - last ``/``-separated segment of the page URL
    * ``child_url``    - the page URL itself

    Args:
        response: the response object handed to this callback; must expose
            ``url`` and be accepted by ``ItemLoader``.

    Yields:
        The single item produced by ``ItemLoader.load_item()``.
    """
    child_url = response.url
    # Echo the URL being parsed to stdout (existing behaviour, kept as is).
    print(response.url)

    # Last path segment of the URL; this is '' when the URL ends with '/'.
    suffix = child_url.split('/')[-1]

    loader = ItemLoader(item=ArmsSpiderItem(), response=response)
    loader.add_xpath('src', "//div[@class='maxPic']/img/@src")
    loader.add_xpath(
        'content', "//div[@class='intron']/div[@class='module']//text()")
    loader.add_xpath(
        'ycg', "//div[@class='maxPic']/span[@class='country']/b/a/text()")
    # Union of the <span> labels and the bare text of each <li>.
    loader.add_xpath(
        'datainfo',
        "//div[@class='dataInfo']/ul[1]/li/span/text()"
        " | //div[@class='dataInfo']/ul[1]/li/text()",
    )
    # NOTE: extraction of the 'datalist' and 'xingneng' fields is disabled;
    # neither is populated here.
    loader.add_xpath(
        'othercontent', "//div[@class='info']/div[@class='module']//text()")
    loader.add_value('suffix', suffix)
    loader.add_value('child_url', child_url)

    yield loader.load_item()