Example #1
    def country_history_data_parse(self, response):
        # Parse the response body as JSON
        # via response.json()
        data = response.json()
        # Historical epidemic data sits under the 'data' key
        data = data.get('data')

        # Read the country_name value passed through META
        country_name = response.meta['country_name']
        countryShortCode = response.meta['countryShortCode']

        # Loop over the history records and tag each one with country_name,
        # so we know which country every record belongs to
        for d in data:
            d['country_name'] = country_name
            d['countryShortCode'] = countryShortCode

        # file_path = f'datas/countries/{country_name}.json'
        # # Parent directory of the file
        # parent_path = pathlib.PosixPath(file_path).parent
        # # Create the parent directory if it does not exist yet
        # if not parent_path.exists():
        #     # Create the nested directories
        #     parent_path.mkdir(parents=True)
        # # # Save the data
        # with open(file_path, 'w+') as f:
        #     json.dump(data, f, ensure_ascii=False)

        # Hand the data to the pipeline for unified storage
        item = SpiderItem()
        item['is_last_updated'] = False
        item['data'] = data
        yield item
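A minimal pipeline sketch for the two item shapes this spider yields (an is_last_updated flag plus a data list); the class name HistoryDataPipeline and the output paths are assumptions, not part of the original project:

import json
import pathlib

class HistoryDataPipeline:
    # Hypothetical pipeline; enable it via the ITEM_PIPELINES setting
    def process_item(self, item, spider):
        if item['is_last_updated']:
            # Latest snapshot for all countries goes to one file
            file_path = pathlib.Path('datas/last_updated.json')
        else:
            # Per-country history: name the file after the first record's country
            country = item['data'][0].get('country_name', 'unknown')
            file_path = pathlib.Path(f'datas/countries/{country}.json')
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(item['data'], f, ensure_ascii=False)
        return item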
Example #2
    def parse_info(self, response):
        item = SpiderItem()

        soup = BeautifulSoup(response.body, 'html.parser')

        soup_type = soup.find(id='lastfont')
        item['category'] = soup_type.string.strip()

        soup_title = soup.find(id='tdTitle').div
        item['title'] = soup_title.font.b.string.strip()

        soup_title = soup_title.next_sibling.next_sibling
        item['date'] = soup_title.get_text().split('\r\n')[1].strip()

        soup_content = soup.find(id='TDContent')
        item['content'] = soup_content.get_text()

        item['file_urls'] = []
        item['file_names'] = []
        soup_files = soup.find(id='filedown').find_all('a')
        for soup_file in soup_files:
            item['file_urls'].append(response.urljoin(soup_file.attrs['href']))
            item['file_names'].append(soup_file.get_text().strip())

        item['url'] = response.url
        return item
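Examples #2 and #4 assume a SpiderItem with these fields; a sketch of what the corresponding items.py could look like (the original project's definition is not shown, so this is an assumption):

import scrapy

class SpiderItem(scrapy.Item):
    category = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    content = scrapy.Field()
    file_urls = scrapy.Field()   # read by FilesPipeline when it is enabled
    file_names = scrapy.Field()  # scraped display names for the attachments
    files = scrapy.Field()       # filled in by FilesPipeline after download
    url = scrapy.Field()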
Example #3
    def parse(self, response):

        content = response.body.decode('utf-8')

        a = json.loads(content)  # re.search('window\.rawData= (.*)\;\s*\<\/script\>', content)
        if a:
            content = a
            goods_data = SpiderItem()
            content_goods = content['goods']
            goods_data['goods_id'] = content_goods["goods_id"]
            goods_data['mall_id'] = content_goods['mall_id']
            goods_data['goods_type'] = content_goods['goods_type']
            goods_data['category1'] = str(content_goods['cat_id_1'])
            goods_data['category2'] = str(content_goods['cat_id_2'])
            goods_data['category3'] = str(content_goods['cat_id_3'])
            goods_data['goods_name'] = content_goods['goods_name']
            goods_data['market_price'] = float(content_goods['market_price'] / 100)  # unit: yuan (CNY); same below
            goods_data['max_group_price'] = float(content['price']['max_on_sale_group_price'] / 100)
            goods_data['min_group_price'] = float(content['price']['min_on_sale_group_price'] / 100)
            goods_data['max_normal_price'] = float(content['price']['max_on_sale_normal_price'] / 100)
            goods_data['min_normal_price'] = float(content['price']['min_on_sale_normal_price'] / 100)
            goods_data['thumb_url'] = content_goods['thumb_url']
            # goods_data['publish_date'] = goods['created_at']
            goods_data['total_sales'] = int(content_goods['sold_quantity'])  # total sales volume
            goods_data['is_on_sale'] = content_goods['is_onsale']

            # Derive the effective price
            goods_data['price'] = goods_data['min_group_price']
            goods_data['total_amount'] = float(goods_data['total_sales'] * float(goods_data['price']))  # total sales amount
            # print(goods_data)
            yield goods_data
Example #4
    def parse_list(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        soup_list = soup.find(id='MoreInfoList1_tdcontent') or soup.find(
            id='DataGrid1')
        soup_list = soup_list.find_all('a')
        soup_type = soup.find(id='lastfont')

        for i in soup_list:
            if 'infodetail' in i.attrs['href'].lower():
                yield scrapy.Request(url=response.urljoin(i.attrs['href']),
                                     callback=self.parse_info)
            elif 'buyi_list' in i.attrs['href'].lower():
                yield scrapy.Request(url=response.urljoin(i.attrs['href']),
                                     callback=self.parse_parameters)
            else:
                item = SpiderItem()

                item['category'] = soup_type.string.strip()
                item['title'] = i.string.strip()
                item['date'] = i.parent.next_sibling.string.strip().replace(
                    '-', '/')
                item['content'] = ''

                item['file_urls'] = [response.urljoin(i.attrs['href'])]
                item['file_names'] = ['test.txt']

                item['url'] = response.urljoin(i.attrs['href'])

                yield item
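Because these spiders fill both file_urls and file_names, the downloads could be renamed by overriding file_path on Scrapy's stock FilesPipeline (the item keyword argument exists since Scrapy 2.4); NamedFilesPipeline is a made-up name for this sketch:

from scrapy.pipelines.files import FilesPipeline

class NamedFilesPipeline(FilesPipeline):
    # Paths returned here are relative to the FILES_STORE setting
    def file_path(self, request, response=None, info=None, *, item=None):
        try:
            # Reuse the scraped name that sits at the same index as this URL
            index = item['file_urls'].index(request.url)
            return item['file_names'][index]
        except (KeyError, ValueError, IndexError, TypeError):
            # Fall back to the default hashed file name
            return super().file_path(request, response=response, info=info, item=item)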
Example #5
 def parse(self, response):
     driver = webdriver.Chrome()
     reviews = response.xpath(
         "//div[contains(@class,'lister-item mode-detail imdb-user-review  with-spoiler')]"
     )
     driver.get(response.url)
     right_index = response.url.rfind('/')
     left_index = response.url[:right_index].rfind('/')
     movieID = response.url[left_index + 1:right_index]
     while len(reviews) <= 100:
         driver.find_element_by_class_name("ipl-load-more").click()
         html = driver.page_source
         reviews = scrapy.Selector(text=html).xpath(
             "//div[contains(@class,'lister-item mode-detail imdb-user-review  with-spoiler')]"
         )
     # The Selenium session is no longer needed once the reviews are in hand
     driver.quit()
     for review in reviews[:100].extract():
         element = scrapy.Selector(text=review)
         spiderItem = SpiderItem()
         spiderItem['url'] = response.url
         spiderItem['review'] = element.xpath(
             "//div[contains(@class, 'text show-more__control')]/text()"
         ).extract_first()
         spiderItem['movieID'] = movieID
         if len(element.xpath(
                 "//span[contains(@class, 'spoiler-warning')]")) != 0:
             spiderItem['spoiler'] = "true"
         else:
             spiderItem['spoiler'] = "false"
         score = element.xpath(
             "//div[contains(@class,'ipl-ratings-bar')]/span/span/text()")
         if len(score) != 0:
             spiderItem['score'] = score.extract_first()
         else:
             spiderItem['score'] = ""
         yield spiderItem
Example #6
 def parse4(self, response):
     for link in response.xpath(
             '//div/a[contains(@class,"action")]/@href').extract():
         loader = ItemLoader(item=SpiderItem(), selector=link)
         urlabsoluta = response.urljoin(link)
         loader.add_value('file_urls', urlabsoluta)
         yield loader.load_item()
Example #7
    def parse_link(self, response):

        item = SpiderItem()

        unique_name = response.meta['unique_name']
        full_name = response.meta['full_name']
        content = response.meta['content']
        category = response.meta['category']

        post_time = response.meta['post_time']
        file_urls = response.meta['file_urls']
        screen_urls = response.meta['screen_urls']
        image_urls = response.meta['image_urls']
        tag = response.meta['tag']

        print(unique_name)
        print('------')
        sel = Selector(response)

        try:
            link1 = sel.xpath(
                '//div[contains(@class, "downloadlink")]//a/@href')[0].extract(
                )
            link1_text = sel.xpath(
                '//div[contains(@class, "downloadlink")]//a/text()'
            )[0].extract()
        except IndexError:
            link1 = ''
            link1_text = ''
            with open('no_link.html', 'a') as f:
                f.write('%s\n' % full_name)
Example #8
 def parse(self, response):
     reviews = response.xpath(
         "//div[contains(@class,'lister-item mode-detail imdb-user-review  with-spoiler')]"
     )
     group = response.url.split('/')
     movieID = group[4]
     for review in reviews.extract():
         element = scrapy.Selector(text=review)
         spiderItem = SpiderItem()
         spiderItem['url'] = response.url
         spiderItem['review'] = element.xpath(
             "//div[contains(@class, 'text show-more__control')]/text()"
         ).extract_first().replace('\n', ' ')
         spiderItem['movieID'] = movieID
         if len(element.xpath(
                 "//span[contains(@class, 'spoiler-warning')]")) != 0:
             spiderItem['spoiler'] = "true"
         else:
             spiderItem['spoiler'] = "false"
         score = element.xpath(
             "//div[contains(@class,'ipl-ratings-bar')]/span/span/text()")
         if len(score) != 0:
             spiderItem['score'] = score.extract_first()
         else:
             spiderItem['score'] = ""
         yield spiderItem
     if len(self.start_urls) <= 40000:
         loader = response.xpath(
             "//div[contains(@class,'load-more-data')]/@data-key"
         ).extract_first()
         if loader is not None:
             url = 'https://www.imdb.com/title/' + movieID + '/reviews/_ajax?paginationKey=' + loader
             self.start_urls.append(url)
             # Appending to start_urls after the crawl has started does not schedule
             # anything on its own, so request the next review page explicitly as well
             yield scrapy.Request(url, callback=self.parse)
Example #9
    def parse(self, response):
        # The commented print below would emit 40 "==" pairs as a separator
        # response offers both xpath() and css() selectors; xpath is the usual choice
        # print("==" * 40)
        # The first div is the overall container; the inner divs are its children
        # The result is a SelectorList; note that the id used is the outer (container) id
        # contentLeft = response.xpath("//div[@id='content']/div")

        contentLeft = response.xpath("//div[@id='content']/div")
        # Each element is a Selector
        for content in contentLeft:
            # Use .get() when a single record (e.g. the author) sits in a single tag
            # author = content.xpath(".//li/text()").get().strip()
            # author = content.xpath(".//div[@class='line']//text()").getall()
            author = content.xpath(".//div[@class='line']//text()").getall()

            # Join the list of text nodes into one string
            author = "".join(author).strip()
            # print(author)

            # Option 1: yield a plain dict for the pipeline to consume
            # text = {'text': author}
            # yield text

            # Option 2 (recommended): use the Item model from items.py
            item = SpiderItem(author=author)
            # Yield the current item
            yield item
Example #10
File: quotes.py  Project: mkubasz/other
 def scrape_home_page(self, response):
     open_in_browser(response)
     l = ItemLoader(item=SpiderItem(), response=response)
     h1_tag = response.xpath('//h1/a/text()').extract_first()
     tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
     l.add_value('h1_tag', h1_tag)
     l.add_value('tags', tags)
     return l.load_item()
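Examples #6, #10, and #14 build items through ItemLoader; a sketch of how a project-level loader with default processors might look (only the field names come from Example #10, the processor choices are assumptions):

from itemloaders.processors import Identity, MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class SpiderItemLoader(ItemLoader):
    default_output_processor = TakeFirst()  # store single strings instead of one-element lists
    h1_tag_in = MapCompose(str.strip)       # strip whitespace from each extracted value
    tags_out = Identity()                   # keep tags as a list

Example #10 would then create SpiderItemLoader(item=SpiderItem(), response=response) in place of the plain ItemLoader.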
Example #11
File: iduilian.py  Project: fenildf/duilian
 def parse_item(self, response):
     self.logger.info('parse url %s' % response.url)
     content = response.css('.detail').extract_first()
     content = re.sub('<.*?>|\t', '', content)
     duilian = re.findall('(.*?);(.*?)。', content)
     item = SpiderItem()
     item['duilian'] = duilian
     item['content'] = content
     item['url'] = response.url
     return item
Example #12
 def parse(self, response):
     movie = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for m in movie[:10]:
         item = SpiderItem()
         url = m.xpath('./a/@href').extract_first().strip()
         link = 'https://maoyan.com' + url
         item['link'] = link
         yield scrapy.Request(url=link,
                              meta={'item': item},
                              callback=self.parse2)
Example #13
 def parse(self, response):
     items = []
     movies = Selector(response=response).xpath('//*[@id="app"]/div/div/div[1]/dl/dd[2]/div/div/div[1]')
     for movie in movies:
         item = SpiderItem()
         # extract_first() turns each SelectorList into a plain string
         link = movie.xpath('./a/@href').extract_first()
         title = movie.xpath('./a/text()').extract_first()
         time = movie.xpath('//*[@id="app"]/div/div/div[1]/dl/dd[2]/div/div/div[1]/p[3]').extract_first()
         item['title'] = title
         item['link'] = link
         item['time'] = time
         items.append(item)
         yield scrapy.Request(url=response.urljoin(link), meta={'item': item}, callback=self.parse2)
Example #14
 def parse_item(self, response):
     url = response.css('a::attr(href)').extract()
     for i in range(len(url)):
         if "detail" in url[i]:
             #                yield scrapy.Request('https:'+url[i], callback=self.parse,
             #                                    errback=self.errback)
             newres = self.webpage('https:' + url[i])
             dt = ItemLoader(item=SpiderItem(), response=newres)
             dt.add_xpath('name', '//*[@id="J_AttrUL"]/li[1]')
             dt.add_xpath('parse', '//*[@id="J_AttrUL"]/li[3]')
             #   dt.add_xpath('price','//*[@id="J_StrPriceModBox"]/dd/span')
             dt.add_xpath('price', '//*[@id="J_StrPriceModBox"]/dd/span')
             return dt.load_item()
Example #15
File: nmac.py  Project: augustwu/spider
    def parse_product(self, response):
        sel = Selector(response)

        item = SpiderItem()

        category = sel.xpath(
            '//div[contains(@class, "category-list")]//a/text()')[-1].extract(
            )
        tag = sel.xpath(
            '//div[contains(@class,"post-tags-wrapper")]//div[contains(@class,"post-tags")]//a/text()'
        ).extract()

        content = sel.xpath(
            '//div[contains(@class, "the-content")]/*[not(@class="nmac-before-content" or self::a or self::script or @class="nmac-after-content" or @class="adsbygoogle" or @id="aswift_2_expand" or @class="alert fade in alert-error" or @class="wp-image-3333" or @style="text-align: center; width: 40%; margin-left: 30%;" or @style="text-align: center" or @style="text-align: center;" or @class="alert fade in alert-error" or @style="text-align: left;" or @class="alert fade in alert-error " or @style="text-align: center; width: 100%;" or @class="size-full")]'
        ).extract()

        download_url = sel.xpath(
            '//div[contains(@class, "the-content")]//a[contains(@class,"btn-block")]/@href'
        ).extract()
        name = sel.xpath(
            '//div[contains(@class, "main-content")]//h1/text()').extract()

        unique_name = name[0].replace(u'\u2013', '-').split('-')[0].strip()
        full_name = name[0].replace(u'\u2013', '-').strip()
        post_time = sel.xpath(
            '//div[contains(@class,"meta-data")]//span[contains(@class,"date")]/text()'
        )[-1].extract().split('\n')[-1].strip()

        image_urls = sel.xpath(
            '//div[contains(@class, "the-content")]//img[contains(@class,"alignright")]/@src'
        ).extract()

        item['unique_name'] = unique_name
        item['full_name'] = full_name
        item['content'] = content
        item['category'] = [category]

        item['image_urls'] = image_urls

        item['tag'] = tag
        item['post_time'] = post_time

        for index, d_url in enumerate(download_url):
            if index == 0:
                request = Request(d_url,
                                  callback=self.parse_download_link_1,
                                  meta={
                                      "download_url": download_url,
                                      'item': item
                                  })
                yield request
Example #16
 def parse(self, response):
     items = []
     shorts = Selector(response=response).xpath("//span[@class='short']")
     stars = Selector(
         response=response).xpath("//span[starts-with(@class, 'allstar')]")
     votes = Selector(response=response).xpath("//span[@class='votes']")
     for i in range(len(shorts)):
         item = SpiderItem()
         item['short'] = shorts[i].xpath('./text()').extract()[0]
         item['star'] = stars[i].xpath('./@class').extract()[0][7:9]
         item['recommend'] = stars[i].xpath('./@title').extract()[0]
         item['vote'] = votes[i].xpath('./text()').extract()[0]
         items.append(item)
     print(items)
     return items
Example #17
 def parse_item(self, response):
     self.logger.info('A response from %s just arrived!', response.url)
     item = SpiderItem()
     item['url'] = response.url
     title = response.xpath('//div[@id="article"]/h1[@id="title"]/text()').extract_first()
     if title:
         item['title'] = title
     else:
         item['title'] = ''
     text = response.xpath('//div[@id="article"]/div[2]/p/text()').extract()
     if text:
         item['text'] = ' '.join(text)
     else:
         item['text'] = ''
     return item
Example #18
    def parse(self, response):
        # Node list of the scraped records
        node_list = response.xpath("//div[@class='li_txt']")
        for node in node_list:
            item = SpiderItem()
            # .extract() converts the XPath results into Unicode strings
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            # yield: pause the loop once an item is built, hand it to the pipeline, then resume
            yield item
Example #19
    def parse(self, response):

        items = []

        movies = Selector(
            response=response).xpath('//div[@class="movie-hover-info"]')

        for movie in movies:

            item = SpiderItem()

            movie_title = movie.xpath('./div/span/text()').extract_first()
            item['movie_title'] = movie_title

            movie_info_list = movie.xpath(
                './div[@class="movie-hover-title"]/text()').extract()
            movie_info_list_new = []

            for x in movie_info_list:
                x = x.replace('\n', '').replace(' ', '')
                if x != '':
                    movie_info_list_new.append(x)

            movie_type = movie_info_list_new[0]
            item['movie_type'] = movie_type

            movie_time_list = movie.xpath(
                './div[@class="movie-hover-title movie-hover-brief"]/text()'
            ).extract()
            movie_time_list_new = []

            for y in movie_time_list:
                y = y.replace('\n', '').replace(' ', '')
                if y != '':
                    movie_time_list_new.append(y)

            movie_time = ''.join(movie_time_list_new)
            if movie_time == '':
                movie_time = '暂无'

            item['movie_time'] = movie_time

            items.append(item)

        return items
Example #20
 def parse(self, response):
     movies = Selector(response=response).xpath(
         '//div[contains(@class, "movie-hover-info")]')
     print(movies)
     for movie in movies[0:10]:
         name = movie.xpath('./div/span[@class="name "]/text()')
         print(name.extract_first())
         category = movie.xpath(
             './div/span[contains(text(), "类型")]/parent::*/text()')
         print(category.extract()[-1].strip())
         show_time = movie.xpath(
             './div/span[contains(text(), "上映时间")]/parent::*/text()')
         print(show_time.extract()[-1].strip())
         item = SpiderItem()
         item['name'] = name.extract_first()
         item['category'] = category.extract()[-1].strip()
         item['show_time'] = show_time.extract()[-1].strip()
         yield item
Example #21
    def parse(self, response):
        '''Extract the title, content, and url.'''
        print('>' * 50)
        print('response url: ', response.url)
        hxs = HtmlXPathSelector(response)
        print('>>>> response.url: ', response.url)
        #get urls
        content_urls = hxs.select(content_url_format).extract()

        list_urls = hxs.select(list_url_format).extract()
        list_urls = [up.urljoin(response.url, url) for url in list_urls]
        content_urls = [up.urljoin(response.url, url) for url in content_urls]

        print "@" * 60
        time.sleep(self.sleep_time)
        self.start_urls.extend(list_urls)

        for url in list_urls:
            yield Request(url, self.parse)

        #http://www.pcconnection.com/IPA/Shop/Product/Detail.htm?sku=16037879&cac=Result
        content_re = re.compile(
            r'http://www[.]pcconnection[.]com/.*cac=Result')
        for url in content_urls:
            if content_re.match(url):
                if len(self.dic) > 160:
                    self.start_urls = []
                    raise CloseSpider('reach pages limit, end the spider.')

                self.count += 1
                self.dic.add(hash(url))
                #extract data
                item = SpiderItem()
                item['url'] = url
                item['kind'] = self.name
                yield item
            else:
                print "!!!!!!! not match content url:"
                print url
Example #22
    def parse(self, response):
        item = SpiderItem()
        if self.counter < self.goal:
            print(self.counter)
            self.counter += 1
            # h1, h2, h3, h4, h5, h6, li, a, span
            title = response.xpath("//h1/text()").extract()
            p = response.xpath('//p/text()').extract()
            span = response.xpath('//span/text()').extract()
            li = response.xpath('//li/text()').extract()
            a = response.xpath('//a/text()').extract()
            # h1 = response.xpath('//h1/text()').extract()
            h2 = response.xpath('//h2/text()').extract()
            h3 = response.xpath('//h3/text()').extract()
            h4 = response.xpath('//h4/text()').extract()
            h5 = response.xpath('//h5/text()').extract()
            h6 = response.xpath('//h6/text()').extract()
            text = str(p).strip() + str(span).strip() + str(a).strip()\
                    + str(h2).strip() + str(h3).strip()\
                   + str(h4).strip() + str(h5).strip() + str(h6).strip() \
                   + str(li) + str(title)
            text = text.replace('\\r', '').replace('\\n',
                                                   '').replace('\\t', '')
            if str(response.url) not in self.dict_page.keys():
                self.doc_id += 1
                self.dict_page[str(response.url)] = [[self.doc_id], [text]]
            item['text'] = text
            item['title'] = title
        else:
            print("Writing dictionary into file. " + "Dictionary size: " +
                  str(len(self.dict_page)))
            with open(
                    "/Users/lekangdu/Desktop/my_spider/spider/res/ai_res" +
                    str(self.goal) + ".json", 'w') as f:
                json.dump(self.dict_page, f)
            self.crawler.engine.close_spider(self, 'Spider closed.')

        links = response.xpath('.//a/@href').extract()
        for url in links:
            if url.endswith('.html'):
                next_url = response.urljoin(url)
                yield scrapy.Request(next_url, callback=self.parse)
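Example #22 flushes its dictionary inline and stops the engine by hand; Scrapy spiders also get a closed(reason) hook that runs once when the crawl ends, so the same bookkeeping could be written as below (the relative output path is an assumption, the attribute names come from the example):

    def closed(self, reason):
        # Called automatically when the spider closes, whatever the reason
        import json
        print("Writing dictionary into file. Dictionary size: " + str(len(self.dict_page)))
        with open("res/ai_res" + str(self.goal) + ".json", 'w') as f:
            json.dump(self.dict_page, f)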
Example #23
    def parse(self, response):
        content = response.body.decode('utf-8')
        a = re.search(r'window\.rawData= (.*);\s*</script>', content)
        if a:
            content = json.loads(a.group(1))
            print(content)
            if 'goods' not in content.keys():
                return None

            goods = content['goods']

            goods_data = SpiderItem()
            goods_data['goods_id'] = goods['goodsID']
            goods_data['mall_id'] = goods['mallID']
            goods_data['goods_type'] = goods['goodsType']
            goods_data['category1'] = str(goods['catID1'])
            goods_data['category2'] = str(goods['catID2'])
            goods_data['category3'] = str(goods['catID3'])
            goods_data['goods_name'] = goods['goodsName']
            goods_data['market_price'] = goods['marketPrice']
            goods_data['max_group_price'] = goods['maxOnSaleGroupPrice']
            goods_data['min_group_price'] = goods['minOnSaleGroupPrice']
            goods_data['max_normal_price'] = goods['maxOnSaleNormalPrice']
            goods_data['min_normal_price'] = goods['minOnSaleNormalPrice']
            goods_data['thumb_url'] = goods['thumbUrl']
            goods_data['publish_date'] = self.get_goods_publish_date(
                goods['topGallery'], goods['detailGallery'], goods['skus'])
            goods_data['total_sales'] = int(goods['sales'])  # total sales volume

            if goods['isOnSale'] and goods['isGoodsOnSale']:
                goods_data['is_on_sale'] = 1
            else:
                goods_data['is_on_sale'] = 0

            # Derive the effective price
            goods_data['price'] = self.get_goods_price(goods['skus'],
                                                       goods['sales'])
            goods_data['total_amount'] = float(
                goods_data['total_sales'] * float(goods_data['price']))  # total sales amount

            yield goods_data
Example #24
    def parse(self, response):
        '''Extract the title, content, and url.'''
        print('>' * 50)
        print('response url: ', response.url)
        hxs = HtmlXPathSelector(response)
        print('>>>> response.url: ', response.url)
        #get urls
        content_urls = hxs.select(content_url_format).extract()

        list_urls = hxs.select(
            '//ul[contains(@class,"pagination")]/li/a[contains(@href,"query=camera")]/@href'
        ).extract()
        list_urls = [up.urljoin(response.url, url) for url in list_urls]
        content_urls = [up.urljoin(response.url, url) for url in content_urls]

        print "@" * 60
        time.sleep(self.sleep_time)
        self.start_urls.extend(list_urls)

        for url in list_urls:
            yield Request(url, self.parse)

        content_re = re.compile(
            r'http://.*[.]cnet[.]com/.*camera.*/.*[.]html$')
        for url in content_urls:
            if content_re.match(url):
                if self.count > 450:
                    self.start_urls = []
                    raise CloseSpider('reach pages limit, end the spider.')

                self.count += 1
                #extract data
                item = SpiderItem()
                item['url'] = url
                item['kind'] = self.name
                yield item
Example #25
    def parse(self, response):
        movie_list = response.xpath(
            "//div[@class='article']//ol[@class='grid_view']//li")

        # Loop over the movie entries

        for i_item in movie_list:
            # SpiderItem is imported from the items module
            spider_item = SpiderItem()
            # Detailed XPaths used to parse each field
            spider_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()").extract_first()

            spider_item['movie_name'] = i_item.xpath(
                ".//div[@class='info']/div[@class='hd']/a/span[1]/text()"
            ).extract_first()
            content = i_item.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract(
                )
            # Clean up the whitespace in the description text
            for i_content in content:
                content_s = "".join(i_content.split())
                spider_item['introduce'] = content_s

            # print(spider_item['introduce'])
            spider_item['star'] = i_item.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            spider_item['evaluate'] = i_item.xpath(
                ".//div[@class='star']//span[4]/text()").extract_first()
            spider_item['describe'] = i_item.xpath(
                ".//p[@class='quote']/span/text()").extract_first()
            # yield the item so it is handed over to the pipelines
            yield spider_item
        # Parse the next page: grab the href of the "next" link
        next_link = response.xpath("//span[@class='next']/a/@href").extract()
        # Only follow it when a next-page button exists
        if next_link:
            next_link = next_link[0]
            yield scrapy.Request("https://movie.douban.com/top250" + next_link,
                                 callback=self.parse)  # callback for the next page
Example #26
    def parse(self, response):

        # 1. Pull the value out of the response object
        # Two ways to select it:
        # 1.1 response.xpath()
        # data_txt = response.xpath('//script[@id="getListByCountryTypeService2true"]/text()').get()
        # print(data_txt)
        # 1.2 response.css()
        data_txt = response.css(
            '#getListByCountryTypeService2true::text').get()
        # print(f'working directory: {os.getcwd()}')

        # 2. Clean the data
        # 2.1 Use a regex to pull the "[...]" array out of the script text
        data_txt = re.findall(r'\[.+\]', data_txt)[0]
        # 2.2 json.loads turns the string into a Python object, a list in this case
        data = json.loads(data_txt)

        item = SpiderItem()
        item['is_last_updated'] = True
        item['data'] = data
        yield item

        # # 3. Save the data
        # with open('datas/last_updated_dxy_datas.json', 'w+') as f:
        #     json.dump(data, f, ensure_ascii=False)

        # Loop over data to get each country's historical epidemic data URL
        for country_data in data:
            # URL of this country's historical epidemic data
            url = country_data['statisticsData']
            # Country or region name
            country_name = country_data['provinceName']
            countryShortCode = country_data['countryShortCode']
            # Issue the request for the per-country history
            yield scrapy.Request(url,
                                 callback=self.country_history_data_parse,
                                 meta={
                                     'country_name': country_name,
                                     'countryShortCode': countryShortCode
                                 })
Example #27
    def parse(self, response):

        item = SpiderItem()

        pro_char = 'http://www.baidu.com/'
        for num in range(1, 11):
            pro_hot = response.xpath(
                "//div[@class='FYB_RD']/table/tbody[1]/tr[%d]" % num)
            hotname = pro_hot.xpath('td/span/a/text()').extract()[0]
            hoturl = pro_hot.xpath('td/span/a/@href').extract()[0]
            theurl = pro_char + hoturl

            item['bdname'] = hotname
            item['bdurl'] = theurl
            print(hotname, theurl)
        yield scrapy.Request(
            url=
            'https://www.so.com/s?ie=utf-8&fr=none&src=360sou_newhome&q=%E7%83%AD%E7%82%B9',
            meta={'item': item},
            callback=self.get360,
            dont_filter=True)
Example #28
    def parse(self, response):
        '''Extract the title, content, and url.'''
        print('>' * 50)
        print('response url: ', response.url)
        hxs = HtmlXPathSelector(response)
        print('>>>> response.url: ', response.url)
        #get urls
        content_urls = hxs.select(content_url_format).extract()
        list_urls = hxs.select(
            '//span[contains(@class,"pagnLink")]/a[contains(@href,"keywords=notebook")]/@href'
        ).extract()
        list_urls = [up.urljoin(response.url, url) for url in list_urls]
        print "@" * 60
        time.sleep(self.sleep_time)
        self.start_urls.extend(list_urls)

        for url in list_urls:
            yield Request(url, self.parse)

        content_re = re.compile(
            r'http://www.amazon.com/[^s]+.*&keywords=notebook$')
        for url in content_urls:
            if content_re.match(url):
                if len(self.dic) > 450:
                    self.start_urls = []
                    raise CloseSpider('reach pages limit, end the spider.')

                self.count += 1

                self.dic.add(hash(url))
                #extract data
                item = SpiderItem()
                item['url'] = url
                item['kind'] = 'amazon_notebook'
                yield item
Example #29
    def parse(self, response):
        goods_id = response.meta['goods_id']
        self.ssdb_client.hdel(self.hash_name, goods_id)

        content = response.body.decode('utf-8')
        a = json.loads(
            content
        )  # re.search('window\.rawData= (.*)\;\s*\<\/script\>', content)
        if a:
            goods = a
            goods_data = SpiderItem()
            goods_data['goods_id'] = goods['goods_id']
            goods_data['mall_id'] = goods['mall_id']
            goods_data['goods_type'] = goods['goods_type']
            goods_data['category1'] = str(goods['cat_id_1'])
            goods_data['category2'] = str(goods['cat_id_2'])
            goods_data['category3'] = str(goods['cat_id_3'])
            goods_data['goods_name'] = goods['goods_name']
            goods_data['market_price'] = float(goods['market_price'] /
                                               100)  # unit: yuan (CNY); same below
            goods_data['max_group_price'] = float(
                goods['max_on_sale_group_price'] / 100)
            goods_data['min_group_price'] = float(
                goods['min_on_sale_group_price'] / 100)
            goods_data['max_normal_price'] = float(
                goods['max_on_sale_normal_price'] / 100)
            goods_data['min_normal_price'] = float(
                goods['min_on_sale_normal_price'] / 100)
            goods_data['thumb_url'] = goods['thumb_url']
            goods_data['publish_date'] = goods['created_at']
            goods_data['total_sales'] = int(goods['sales'])  # total sales volume
            goods_data['is_on_sale'] = goods['is_onsale']

            # Derive the effective price
            goods_data['price'] = goods_data['min_group_price']
            goods_data['total_amount'] = float(
                goods_data['total_sales'] * float(goods_data['price']))  # total sales amount

            yield goods_data
Example #30
    def parse(self, response):
        selector = Selector(response)
        article = selector.css('.post')
        for arc in article:
            # Build a fresh item for each article rather than reusing one instance
            item = SpiderItem()
            name = arc.xpath(
                './/h2[@class="entry-title"]//text()').extract_first()
            link = arc.css('.entry-title a::attr(href)').extract_first()
            author = arc.xpath(
                './/h5[@class="entry-author"]/a/text()').extract()
            description = arc.xpath(
                './/div[@class="entry-summary"]/p/text()').extract_first()
            authors = ''
            for eachAuthor in author:
                eachAuthor += ' '
                authors += eachAuthor
            item['name'] = name
            item['link'] = link
            item['author'] = authors
            item['description'] = description
            yield item
        url = 'http://www.allitebooks.com/page'
        digits = re.findall(r'/(\d+)/', response.url)
        digit = 0
        if not self.maxnum:
            self.maxnum = selector.css("#content .pagination a:last-child"
                                       ).xpath(".//text()").extract_first()
            self.maxnum = int(self.maxnum)

        print("\nmaxnum:" + str(self.maxnum) + '\n')
        if digits:
            digit = int(digits[0]) + 1
            nextURL = url + '/' + str(digit) + '/?s=python'
        else:
            nextURL = url + '/2/?s=python'
        if digit <= self.maxnum:
            print("\nnextURL:" + nextURL + "\n")
            yield scrapy.Request(nextURL, callback=self.parse)