Пример #1
0
    def parse(self, response):
        """Follow every link on the page and emit items for article-like URLs.

        For each ``<a href>`` found in *response* this generator yields a
        ``scrapy.Request`` that is handled by this same method, and then,
        when the resolved URL looks like an article, a ``SohuItem`` whose
        ``link`` field holds that URL.

        A URL counts as an article when it matches the news-page pattern or
        the ``/a/`` pattern, and does not end in ``#comment_area``.
        """
        # Patterns are loop-invariant, so build them once per call.
        article_path = re.compile(r'.*/a/.*')
        comment_anchor = re.compile(r'.*#comment_area$')
        news_page = re.compile(r'.*news.sohu.com.*s?html?$')

        sel = scrapy.Selector(response)

        # Reading the attribute directly avoids depending on how the href is
        # quoted in the markup (a regex on the serialized tag misses
        # single-quoted attributes and raised IndexError on them).
        for href in sel.xpath('//a/@href').extract():
            href = href.strip()
            if not href:
                continue

            # Resolve relative and protocol-relative links against the page
            # URL instead of blindly prefixing "http:".
            link = response.urljoin(href)

            # javascript:, mailto:, tel: and similar cannot be requested.
            if not link.startswith(('http://', 'https://')):
                continue

            yield scrapy.Request(link, callback=self.parse)

            if comment_anchor.match(link):
                continue
            if news_page.match(link) or article_path.match(link):
                item = SohuItem()
                item['link'] = link
                yield item
Пример #2
0
 def parse_item(self, response):
     """Render the page in ``self.driver`` and yield one populated SohuItem.

     The page at ``response.url`` is loaded through ``self.driver``
     (presumably a Selenium WebDriver -- confirm against the spider's
     setup), the "info-arrT" element is clicked, and the text of one
     element per item field is copied into the item.

     Any lookup that finds no element propagates the driver's exception.
     """
     # NOTE(review): the find_element_by_* helpers used below are legacy
     # Selenium API; check the installed Selenium version before upgrading.

     # (item field, XPath) pairs, evaluated in this order against the
     # rendered page.
     field_xpaths = (
         ('name', '//div[@class="crumbs"]/a[last()]'),
         ('fav', '//div[@class="vBox vBox-ding"]//i'),
         ('step', '//div[@class="vBox vBox-cai"]//i'),
         ('playcounts', '//div[@class="vBox vBox-play vBox-play-panel"]//i'),
         ('actor', '//div[@class="info info-con"]//a[@data-pb-other="actor"]'),
         ('director',
          '//div[@class="info info-con"]//a[@data-pb-other="director"]'),
         ('introduce', '//div[@class="info info-con"]/p[@class="intro open"]'),
         ('time', '//li[@style="display: list-item;"]'),
         ('area', '//a[@data-pb-other="area"]'),
         ('type', '//a[@data-pb-other="category"]'),
         ('year', '//a[@data-pb-other="year"]'),
     )

     items = SohuItem()

     # Open the page in the driver so the rendered DOM can be queried.
     self.driver.get(response.url)
     # Click the button that expands the info section.
     self.driver.find_element_by_class_name("info-arrT").click()

     for field, xpath in field_xpaths:
         items[field] = self.driver.find_element_by_xpath(xpath).text

     yield items
Пример #3
0
 def parse(self, response):
     """Schedule a content request for each entry of the first news list.

     For every ``<li>`` selected from the "focus-news-box" block, a
     ``SohuItem`` is filled with the anchor ``title`` and ``href`` values
     (both as lists, as returned by ``extract()``), and a request for the
     first href is yielded with the item attached under ``meta['item']``.
     The request is handled by ``self.parse_content``.

     Entries that contain no link are skipped.
     """
     rows = response.xpath(
         '//div[@class = "focus-news-box"]/div/div[@class = "list16"][1]/ul/li')
     for sel in rows:
         hrefs = sel.xpath('.//a/@href').extract()
         if not hrefs:
             # Without a link there is nothing to request; indexing [0]
             # here would abort the generator and lose the remaining rows.
             continue

         item = SohuItem()
         item['title'] = sel.xpath('.//a/@title').extract()
         item['href'] = hrefs
         yield scrapy.Request(response.urljoin(hrefs[0]),
                              meta={'item': item},
                              callback=self.parse_content)
Пример #4
0
 def parse_url(self, response):
     """Yield a SohuItem holding the article's paragraph texts.

     The text nodes of every ``<p>`` directly under
     ``<article class="article">`` are stored, as a list, in the item's
     ``context`` field. The list is also printed, and ``self.times`` is
     incremented once per call.
     """
     paragraphs = response.xpath(
         '//article[@class="article"]/p/text()').extract()

     item = SohuItem()
     item["context"] = paragraphs
     print(item["context"])

     # Per-response counter kept on the spider instance.
     self.times += 1

     yield item
Пример #5
0
 def get_tab(self, response, category):
     """Build one SohuItem per ``<li>`` under each ``ul.rList`` of *response*.

     Every item gets ``rank_category`` set to *category*; the remaining
     ``rank_*`` fields hold the lists returned by ``extract()`` for the
     corresponding XPath, evaluated relative to the ``<li>``.

     Returns the items as a list, in document order.
     """
     # (item field, XPath relative to a single <li> row)
     row_fields = (
         ('rank_name', './div[@class="vName"]/div/a/text()'),
         ('rank_type', './span[@class="vSort"]/a/text()'),
         ('rank_index', './span[@class="vTotal"]/text()'),
         ('rank_trend', './span[@class="vRank"]/@title'),
     )

     collected = []
     for ranking_list in response.xpath('./ul[@class="rList"]'):
         for row in ranking_list.xpath('./li'):
             entry = SohuItem()
             entry['rank_category'] = category
             for field, query in row_fields:
                 entry[field] = row.xpath(query).extract()
             collected.append(entry)
     return collected
Пример #6
0
 def get_tab1(self, response, category):
     """Build a single SohuItem from the ``li.No1`` entry of ``ul.rList``.

     ``rank_category`` is set to *category*; each other ``rank_*`` field
     holds the list returned by ``extract()`` for its XPath, evaluated
     relative to *response*.

     Returns the populated item.
     """
     # (item field, XPath relative to *response*)
     top_entry_fields = (
         ('rank_name',
          './ul[@class="rList"]/li[@class="No1"]/div[@class="cfix"]/div[@class="vName"]/a/@title'),
         ('rank_type',
          './ul[@class="rList"]/li[@class="No1"]/div[@class="cfix"]/span[@class="vSort"]/a/text()'),
         ('rank_index',
          './ul[@class="rList"]/li[@class="No1"]/div[@class="cfix"]/span[@class="vTotal"]/text()'),
         ('rank_trend',
          './ul[@class="rList"]/li[@class="No1"]/div[@class="cfix"]/span[@class="vRank"]/@title'),
     )

     entry = SohuItem()
     entry['rank_category'] = category
     for field, query in top_entry_fields:
         entry[field] = response.xpath(query).extract()
     return entry
Пример #7
0
 def parse(self, response):
     """Yield a phone-lookup request for every realtor block on the page.

     For each ``div.realtor-info`` a ``SohuItem`` is filled with:

     * ``name``    -- text of the ``a.realtor-info-name`` link;
     * ``company`` -- the second span text under ``div.rcompany``, or
       ``None`` when fewer than two are present;
     * ``region``  -- the ``div.rparks`` text nodes: the first three are
       concatenated, the following ones except the last are joined with
       "/", then newlines and spaces are removed and "//" collapsed.

     The first run of digits (not starting with 0) in the link's href is
     used as the realtor id for the phone API URL. The request carries the
     item in ``meta['item']`` and is handled by ``self.parse_phone``.
     Agents whose href is missing or holds no id are skipped.
     """
     for agent in response.xpath('//div[@class="realtor-info"]'):
         profile_href = agent.xpath(
             './/a[@class="realtor-info-name"]/@href').extract_first()
         # extract_first() returns None when the link is absent; fall back
         # to an empty string so the search simply finds nothing.
         id_match = re.search(r'[1-9]\d*', profile_href or '')
         if id_match is None:
             # No realtor id means the phone request cannot be built.
             continue
         realtor_id = id_match.group(0)

         item = SohuItem()
         item['name'] = agent.xpath(
             './/a[@class="realtor-info-name"]/text()').extract_first()

         company_spans = agent.xpath(
             './/div[@class="rcompany"]//span/text()').extract()
         item['company'] = (
             company_spans[1] if len(company_spans) > 1 else None)

         park_texts = agent.xpath(
             './/div[@class="rparks"]//text()').extract()
         # Slices tolerate short lists; with three or more nodes the result
         # equals the former index-based concatenation.
         region = "".join(park_texts[:3]) + "/".join(park_texts[3:-1])
         item['region'] = (
             region.replace('\n', "").replace(" ", "").replace("//", "/"))

         href = ("http://esf.focus.cn/api/getVirtualPhone?realtorId="
                 + realtor_id + "&ecoCityId=73&call_url=.html")
         yield scrapy.Request(href,
                              meta={'item': item},
                              callback=self.parse_phone)