def parse(self, response):
    """Follow every anchor on the page; emit a SohuItem for article links.

    Each extracted href is followed recursively via ``scrapy.Request``;
    hrefs that look like Sohu news/article URLs (and are not comment-area
    anchors) are additionally yielded as items.
    """

    def getdomain(url):
        # NOTE(review): only restores the scheme, so this is correct for
        # protocol-relative links ("//host/...") but leaves path-relative
        # links ("/a/b") without a host — confirm link shapes upstream.
        #proto,rest=urllib.splittype(url)
        #host,rest=urllib.splithost(rest)
        return "http:"

    # Compile once per call instead of once per link (original recompiled
    # all three patterns inside the loop).
    article_re = re.compile(r'.*/a/.*')
    comment_re = re.compile(r'.*#comment_area$')
    # Dots in the host are escaped: the original '.*news.sohu.com.*'
    # would also match e.g. "newsxsohuxcom".
    news_re = re.compile(r'.*news\.sohu\.com.*s?html?$')

    sel = scrapy.Selector(response)
    for link_sel in sel.xpath('//a[@href]'):
        matches = link_sel.re('href="(.*?)"')
        if not matches:
            # Original indexed [0] unconditionally and raised IndexError
            # for anchors whose href is not double-quoted in the raw HTML.
            continue
        link = str(matches[0])
        if not link:
            continue
        if not link.startswith('http'):
            link = getdomain(response.url) + link
        yield scrapy.Request(link, callback=self.parse)
        if (news_re.match(link) or article_re.match(link)) and not comment_re.match(link):
            item = SohuItem()
            item['link'] = link
            yield item
def parse_item(self, response):
    """Render the video page with the Selenium driver and scrape metadata.

    Loads the URL in the browser (so JS-rendered content exists), expands
    the collapsed info panel, then reads one text value per field from a
    fixed XPath table and yields the populated item.
    """
    items = SohuItem()
    hxs = HtmlXPathSelector(response)
    # Open the page in the browser for rendering.
    self.driver.get(response.url)
    # Click the button that expands the hidden info section.
    self.driver.find_element_by_class_name("info-arrT").click()

    # Field -> XPath table; iteration order matches the original
    # statement order, so lookups happen in the same sequence.
    field_xpaths = {
        'name': '//div[@class="crumbs"]/a[last()]',
        'fav': '//div[@class="vBox vBox-ding"]//i',
        'step': '//div[@class="vBox vBox-cai"]//i',
        'playcounts': '//div[@class="vBox vBox-play vBox-play-panel"]//i',
        'actor': '//div[@class="info info-con"]//a[@data-pb-other="actor"]',
        'director': '//div[@class="info info-con"]//a[@data-pb-other="director"]',
        'introduce': '//div[@class="info info-con"]/p[@class="intro open"]',
        'time': '//li[@style="display: list-item;"]',
        'area': '//a[@data-pb-other="area"]',
        'type': '//a[@data-pb-other="category"]',
        'year': '//a[@data-pb-other="year"]',
    }
    for field, xpath in field_xpaths.items():
        items[field] = self.driver.find_element_by_xpath(xpath).text
    yield items
def parse(self, response):
    """Scrape title/href pairs from the focus-news box and follow each link.

    The item is passed along in ``meta`` so ``parse_content`` can finish
    populating it.
    """
    for sel in response.xpath('//div[@class = "focus-news-box"]/div/div[@class = "list16"][1]/ul/li'):
        item = SohuItem()
        item['title'] = sel.xpath('.//a/@title').extract()
        item['href'] = sel.xpath('.//a/@href').extract()
        if not item['href']:
            # Original indexed item['href'][0] unconditionally and raised
            # IndexError for a list entry without a link.
            continue
        yield scrapy.Request(response.urljoin(item['href'][0]),
                             meta={'item': item},
                             callback=self.parse_content)
def parse_url(self, response):
    """Extract the article's paragraph text into a SohuItem and yield it.

    Also prints the extracted text and advances ``self.times``, a counter
    previously used to number per-article output files (file writing is
    currently disabled).
    """
    item = SohuItem()
    paragraphs = response.xpath('//article[@class="article"]/p/text()').extract()
    item["context"] = paragraphs
    print(item["context"])
    self.times += 1
    yield item
def get_tab(self, response, category):
    """Collect every ranking entry from one tab of the chart page.

    Returns a list of SohuItem, one per ``<li>`` row, each tagged with the
    caller-supplied *category*.
    """
    # Field -> relative XPath; insertion order mirrors the original
    # assignment order.
    row_fields = {
        'rank_name': './div[@class="vName"]/div/a/text()',
        'rank_type': './span[@class="vSort"]/a/text()',
        'rank_index': './span[@class="vTotal"]/text()',
        'rank_trend': './span[@class="vRank"]/@title',
    }
    items = []
    for group in response.xpath('./ul[@class="rList"]'):
        for row in group.xpath('./li'):
            entry = SohuItem()
            entry['rank_category'] = category
            for field, path in row_fields.items():
                entry[field] = row.xpath(path).extract()
            items.append(entry)
    return items
def get_tab1(self, response, category):
    """Extract the No.1 (featured) entry of a ranking tab as one SohuItem."""
    # All four fields live under the same "No1" row; hoist the shared
    # prefix so each XPath is stated once.
    base = './ul[@class="rList"]/li[@class="No1"]/div[@class="cfix"]'
    suffixes = {
        'rank_name': '/div[@class="vName"]/a/@title',
        'rank_type': '/span[@class="vSort"]/a/text()',
        'rank_index': '/span[@class="vTotal"]/text()',
        'rank_trend': '/span[@class="vRank"]/@title',
    }
    item = SohuItem()
    item['rank_category'] = category
    for field, suffix in suffixes.items():
        item[field] = response.xpath(base + suffix).extract()
    return item
def parse(self, response):
    """Scrape realtor cards and request each agent's virtual phone number.

    For every realtor card: extracts name, company and service region,
    pulls the numeric realtor id out of the profile href, then requests
    the virtual-phone API endpoint with the partial item in ``meta`` for
    ``parse_phone`` to complete.
    """
    for agent in response.xpath('//div[@class="realtor-info"]'):
        item = SohuItem()
        item['name'] = agent.xpath(
            './/a[@class="realtor-info-name"]/text()').extract_first()
        # NOTE(review): assumes the company block always has >= 2 spans —
        # confirm against the page markup.
        item['company'] = agent.xpath(
            './/div[@class="rcompany"]//span/text()').extract()[1]
        parts = agent.xpath('.//div[@class="rparks"]//text()').extract()
        # First three text nodes form the label; the remaining nodes
        # (minus the trailing one) are park names joined with '/'.
        # Slices tolerate short lists where the original indexing raised
        # IndexError.
        region = "".join(parts[:3]) + "/".join(parts[3:len(parts) - 1])
        region = region.replace('\n', "").replace(" ", "").replace("//", "/")
        item['region'] = region
        href_text = agent.xpath(
            './/a[@class="realtor-info-name"]/@href').extract_first().strip()
        # Raw string: the original '[1-9]\d*' literal relies on an invalid
        # escape sequence (SyntaxWarning on Python 3.12+).
        realtor_id = re.findall(r'[1-9]\d*', str(href_text))[0]
        href = ("http://esf.focus.cn/api/getVirtualPhone?realtorId="
                + realtor_id + "&ecoCityId=73&call_url=.html")
        yield scrapy.Request(href, meta={'item': item}, callback=self.parse_phone)