예제 #1
0
    def parse(self, response: Response):
        """Parse a book listing page.

        For every ``.all-img-list li`` entry this yields a ``Request`` for the
        book's detail page (handled by ``self.parse_info``, with the generated
        ``book_id`` passed through ``meta``) followed by the populated
        ``BookItem``. Afterwards it yields a ``Request`` for the next listing
        page unless the last pager link is a ``javascript`` pseudo-link.

        Responses whose status is not 200 yield nothing.
        """
        if response.status != 200:
            return

        lis = response.css('.all-img-list li')
        print(f'------{len(lis)}-------------')
        for li in lis:
            item = BookItem()
            item['book_id'] = uuid.uuid4().hex
            a = li.xpath('./div[1]/a')
            item['book_url'] = a.xpath('./@href').get()
            item['book_cover'] = a.xpath('./img/@src').get()
            item['book_name'] = li.xpath('./div[2]/h4//text()').get()

            # The first link text is the author, the remaining ones are tags.
            # Index/slice instead of star-unpacking so an entry without any
            # link does not raise and abort the rest of the page.
            names = li.css('.author a::text').extract()
            item['author'] = names[0] if names else None
            item['tags'] = names[1:]
            item['summary'] = li.css('.intro::text').get()

            # Request the book's detail page (only when a link was found;
            # concatenating with None would raise TypeError).
            if item['book_url']:
                yield Request('https:' + item['book_url'],
                              callback=self.parse_info,
                              priority=1,
                              meta={'book_id': item['book_id']})

            yield item

        # Follow the next page
        next_url = response.css('.lbf-pagination-item-list ').xpath(
            './li[last()]/a/@href').get()
        # get() returns None when the pager is missing from the page.
        if next_url and next_url.find('javascript') == -1:
            yield Request('https:' + next_url, priority=100)
예제 #2
0
 def parse(self, response: Response):
     """Walk category navigation links, or hand over to ``self.star_4``.

     When the left navigation contains an ``i.a-star-medium-4`` element the
     same URL is re-requested with ``self.star_4`` as callback. Otherwise
     every link in the browse box is inspected: a ``node`` query value is
     preferred, then ``rh``; links carrying neither are ignored. Each
     resulting search URL is requested with this method as callback.
     """
     log('parse')
     star_filter = response.css('#leftNav a > i.a-star-medium-4')
     if star_filter:
         log('go star')
         yield Request(response.url,
                       self.star_4,
                       dont_filter=True,
                       errback=self.errors('star_4'))
         return

     hrefs = response.css('div.left_nav.browseBox a::attr(href)').getall()
     for href in hrefs:
         node = get_query_val(href, 'node')
         if node:
             url = f'{amazon_url}/s?node={node}'
         else:
             # Fall back to the "rh" query value only when "node" is absent
             rh = get_query_val(href, 'rh')
             if not rh:
                 continue
             url = f'{amazon_url}/s?rh={rh}'
         yield Request(url, self.parse, errback=self.errors('parse'))
예제 #3
0
def _wikidata_info(response: Response):
    """Scrape selected Wikidata properties and yield one ``NWinnerItem``.

    For each property code the first text node under the element with that
    id and class ``wikibase-snakview-value`` is read (from the nested ``<a>``
    for link-valued properties). Properties whose selector matches nothing
    are left out. The collected values are merged with
    ``response.meta['winner']`` to build the item.
    """
    value_css = '.wikibase-snakview-value'
    # (item field, property element id, value is wrapped in a link)
    wanted = (
        ('date_of_birth', 'P569', False),
        ('date_of_death', 'P570', False),
        ('place_of_birth', 'P19', True),
        ('place_of_death', 'P20', True),
        ('gender', 'P21', True),
    )

    collected = {}
    for name, code, is_link in wanted:
        text_suffix = ' a::text' if is_link else '::text'
        matches = response.css(f'#{code} {value_css}{text_suffix}')
        if not matches:
            continue
        # A match whose first text is empty is reported but still stored
        if not matches.get():
            print(f'No field {name} for {response.url}')
        collected[name] = matches.extract_first()

    yield NWinnerItem(**collected, **response.meta['winner'])
예제 #4
0
 def single_category(self, response: Response):
     """Crawl one category's search results sorted by ``review-rank``.

     If the response URL is not already sorted by ``review-rank`` the page is
     re-requested with ``s=review-rank``. Otherwise a ``Request`` handled by
     ``self.single_parse`` is yielded for each result link that contains a
     ``/dp/<id>`` path segment, and the next results page is requested while
     the current ``page`` query value is below ``max_page``.
     """
     log('single')
     if get_query_val(response.url, 's', '') != 'review-rank':
         yield Request(update_query(response.url, s='review-rank'),
                       self.single_category,
                       dont_filter=True,
                       errback=self.errors('single_category'))
         return

     links = response.css(
         'div.s-result-list.s-search-results.sg-row > div[data-asin] h2 > a::attr(href)'
     ).getall()
     for link in links:
         dirs = link.split('/')
         try:
             dp = dirs[dirs.index('dp') + 1]
         except (ValueError, IndexError):
             # No "/dp/<id>" segment in this link: skip it instead of letting
             # the exception abort the remaining links and the pagination.
             continue
         url = f'{amazon_url}/dp/{dp}'
         yield Request(url,
                       self.single_parse,
                       errback=self.errors('single_parse'))

     page = int(get_query_val(response.url, 'page', 1))
     if page < max_page:
         next_link = response.css(
             'ul.a-pagination > li.a-last > a::attr(href)').get()
         if next_link:
             yield Request(remove_query(amazon_url + next_link, 's',
                                        'rh', 'page'),
                           self.single_category,
                           errback=self.errors('single_category'))
예제 #5
0
    def parse_sku(self, response: Response):
        """Yield requests for the other colour variants, then this page's SKU.

        The HKD price is read from the price panel and the product code from
        the ``sku`` key of the last ``application/ld+json`` script on the page.
        """
        swatch_links = response.css('div.color-picker__swatches a')
        variant_urls = [
            self.base_url + link.attrib['href'] for link in swatch_links
        ]
        for variant_url in variant_urls:
            yield Request(variant_url, callback=self.parse_sku)

        price_text = response.css('div.product-info-panel__price::text').get()
        # strip() removes the characters H, K and D from both ends
        amount = price_text.strip('HKD').replace(',', '')
        price = {'hkd': float(amount)}

        ld_json_scripts = response.css(
            'script[type="application/ld+json"]::text').getall()
        product_data = json.loads(ld_json_scripts[-1])

        yield SKU(self.brand_name, '', '', product_data['sku'], '',
                  response.url, price, '', [], [])
예제 #6
0
    def parse(self, response: Response) -> Iterable[Union[Request, Mapping]]:
        """Follow listing/recipe links and emit the recipe found on this page.

        Requests from ``self.follow_pages`` and for every ``a.promo`` link are
        yielded first. If the page contains a ``div.recipe-main-info`` block
        and every parsed ingredient has a non-``None`` ``"url"``, a dict
        describing the recipe is yielded as well.
        """
        yield from self.follow_pages(response)
        for promo_href in response.css("a.promo::attr(href)"):
            yield response.follow(promo_href.get(), callback=self.parse)

        recipe = response.css("div.recipe-main-info")
        if not recipe:
            return

        ingredient_nodes = recipe.css("li.recipe-ingredients__list-item")
        ingredients = [
            self._get_ingredient(response, node) for node in ingredient_nodes
        ]
        # A recipe is only emitted when every ingredient carries a URL
        if any(entry["url"] is None for entry in ingredients):
            return

        name_parts = recipe.css(".chef__name *::text").getall()
        chef_name = name_parts[-1] if name_parts else None
        yield {
            "title": recipe.css("h1::text").get(),
            "url": response.url,
            "chef_name": chef_name,
            "ingredients": ingredients,
            "image_urls": recipe.css(
                ".recipe-media__image img::attr(src)").getall(),
        }
예제 #7
0
    def parse(self, response: Response):
        """Extract book information from a listing page.

        For every ``.all-img-list li`` entry this yields a ``Request`` for the
        book's ``#Catalog`` page (handled by ``self.parse_info``, with the
        generated ``book_id`` in ``meta``) followed by a ``BookItem``. Then it
        yields a ``Request`` for the next listing page unless the last pager
        link is a ``javascript`` pseudo-link.

        Responses whose status is not 200 yield nothing.
        """
        if response.status != 200:
            return

        for li in response.css('.all-img-list li'):
            # Build a fresh item per entry: re-yielding one shared instance
            # makes every yielded item alias the same, last-written data.
            item = BookItem()
            item['book_id'] = uuid.uuid4().hex
            item['book_url'] = li.xpath('./div/a/@href').get()
            item['book_name'] = li.xpath('./div/h4//text()').get()

            # First link text is the author, the rest are tags. Index/slice
            # so an entry without links does not raise ValueError.
            names = li.css('.author a::text').extract()
            item['author'] = names[0] if names else None
            item['tags'] = names[1:]
            item['description'] = li.xpath('./div/p[2]//text()').get()
            item['img'] = li.css('.book-img-box img::attr("src")').get()

            if item['book_url']:
                # Dropping leading slashes keeps "host/path" hrefs unchanged
                # and stops a protocol-relative "//host/path" href from
                # producing a host-less "https:////host/path" URL.
                yield Request(
                    'https://' + item['book_url'].lstrip('/') + '#Catalog',
                    callback=self.parse_info,
                    priority=100,
                    meta={'book_id': item['book_id']})

            yield item

        # Link to the next listing page
        next_url = response.css('.lbf-pagination-item-list').xpath(
            './li[last()]/a/@href').get()

        # get() returns None when the pager is missing from the page.
        if next_url and next_url.find('javascript') == -1:
            # Requests with a higher priority are scheduled first
            yield Request('https:' + next_url, priority=1)
예제 #8
0
 def parse_video_page(self, response: Response):
     """Read basic video data and request the video's statistics.

     Fills a ``VideoItem`` with the uploader name, the title and the ``aid``
     taken from the response URL, then yields a ``Request`` to the
     ``archive_stat/stat`` API. The item travels to the callback
     (``self.parse``) through ``meta["item"]``.
     """
     item = VideoItem()
     item['up_name'] = response.css('.name a::text').extract_first()
     item['title'] = response.css('.tit::text').extract_first()
     # The aid is what remains of the URL after the fixed prefix and slashes
     aid = response.url.replace("https://www.bilibili.com/video/av", "")
     item['aid'] = aid.replace("/", "")
     # Millisecond timestamp sent as the "_" query parameter
     timestamp = int(round(time.time() * 1000))
     header = {
         "Host": "api.bilibili.com",
         # Fixed: the value previously contained spaces ("https: // ..."),
         # which is not a well-formed origin.
         "Origin": "https://www.bilibili.com",
         "Referer": f"https://www.bilibili.com/video/av{item['aid']}/",
         "USER-AGENT":
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
         'Accept-Language': 'en-US,en;q=0.5',
         'Accept': '*/*',
         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
     }
     yield Request(
         url=
         f"http://api.bilibili.com/archive_stat/stat?callback=&aid={item['aid']}&type=json&_={timestamp}",
         dont_filter=False,
         headers=header,
         callback=self.parse,
         # Hand the item over to the next callback
         meta={"item": item})
예제 #9
0
    def parse(self, response: Response):
        """Yield one loaded ``IPItem`` per row of the proxy table.

        The IP is assembled from two obfuscated pieces found in the row's
        inline script: a dotted string that is stored reversed, and a
        hex-escaped string that base64-decodes to the remainder. The port is
        the number found in the script plus a page-wide number read from a
        hidden ``<div>``'s ``data-*`` attribute.
        """
        # Page-wide number that is added to every row's raw port value
        port_offset = response.css('html').re_first(
            r'<div style="display:none" data-[a-zA-Z]*="(\d+)"')

        rows = response.css('table.table-hover tbody tr')
        for row in rows:
            loader = ItemLoader(item=IPItem(), selector=row)
            loader.add_value('source', 'dbproxy')
            loader.add_css('remark', 'td:nth-child(4) div::text')
            loader.add_css('protocol', 'td:nth-child(5)::text')

            # Decode the IP
            script = row.css('td:nth-child(1) script::text')
            reversed_head = script.re_first(r'\'([\d\.]*)\'\.split')
            ip_head = reversed_head[::-1]

            hex_digits = ''.join(script.re(r'\\x([A-Za-z0-9]{2})'))
            encoded_tail = bytearray.fromhex(hex_digits).decode()
            ip_tail = base64.b64decode(encoded_tail).decode()

            loader.add_value('ip', ip_head + ip_tail)

            # Decode the port
            port_base = script.re_first(r'var  pp =  \((\d+) -')
            loader.add_value('port', int(port_base) + int(port_offset))

            yield loader.load_item()
예제 #10
0
    def parse(self, response: Response):
        """Parse a book listing page.

        For every ``.all-img-list li`` entry this yields a ``Request`` for the
        book's page (handled by ``self.parse_info``, with the generated
        ``book_id`` in ``meta``) followed by the populated ``BookItem``. Then
        it yields a ``Request`` for the next listing page unless the last
        pager link is a ``javascript`` pseudo-link.

        Responses whose status is not 200 yield nothing.
        """
        if response.status != 200:
            return

        # Extract the data of each listed book
        for li in response.css('.all-img-list li'):
            item = BookItem()
            item['book_id'] = uuid.uuid4().hex

            # The first div's link carries both the book URL and the cover
            a = li.xpath('./div[1]/a')
            item['book_url'] = a.xpath('./@href').get()
            item['book_cover'] = a.xpath('./img/@src').get()

            item['book_name'] = li.xpath('./div[2]/h4//text()').get()
            # First link text is the author, the rest are tags. Index/slice
            # so an entry without links does not raise ValueError.
            names = li.css('.author a::text').extract()
            item['author'] = names[0] if names else None
            item['tags'] = names[1:]
            item['summary'] = li.css('.intro::text').get()

            # Request the book's content
            if item['book_url']:
                # Dropping leading slashes keeps "host/path" hrefs unchanged
                # and stops a protocol-relative "//host/path" href from
                # producing a host-less "https:////host/path" URL.
                yield Request('https://' + item['book_url'].lstrip('/'),
                              callback=self.parse_info,
                              priority=10,
                              meta={'book_id': item['book_id']})

            yield item

        # Look for the next-page link
        next_url = response.css('.lbf-pagination-item-list').xpath(
            './li[last()]/a/@href').get()
        # get() returns None when the pager is missing; a "javascript" href
        # means there is no further page.
        if next_url and next_url.find('javascript') == -1:
            # The higher the priority value, the earlier it is downloaded
            yield Request('https:' + next_url, priority=100)
예제 #11
0
 def follow_pages(self, response: Response) -> Iterable[Request]:
     """Yield follow-up requests for the A-Z keyboard and pagination links.

     Every yielded request uses ``self.parse`` as its callback; keyboard
     links come first, pagination links second.
     """
     for letter_href in response.css(".az-keyboard ul li a::attr(href)"):
         yield response.follow(  # type: ignore
             letter_href.get(), callback=self.parse)
     for page_href in response.css("ul.pagination__list li a::attr(href)"):
         yield response.follow(  # type: ignore
             page_href.get(), callback=self.parse)
예제 #12
0
    def parse_pcexpect(self, response: Response):
        """Parse the computer-prediction page, then request the results page.

        Adds to the item carried in ``response.meta["item"]``:

        * ``predict_patterns`` - text of each ``.numberSet2_row`` with all
          whitespace removed;
        * ``predict_confidence`` - the digit captured from the ``is-lvN``
          class, or ``None`` when no such class is present;
        * ``racers[i]["predict_mark"]`` - the number in the mark icon's file
          name for the i-th racer, or ``None`` when the cell has no icon.

        Finally yields a request for the sixth tab's link with
        ``self.parse_result`` as callback, passing the item along in ``meta``.
        """
        item = response.meta["item"]
        rows = response.css(".numberSet2_row").xpath("string()").getall()
        # split()/join drops every whitespace character inside the row text
        item["predict_patterns"] = ["".join(row.split()) for row in rows]

        # The raw string avoids the invalid "\d" escape of a plain literal;
        # re_first returns None instead of raising IndexError on no match.
        item["predict_confidence"] = response.css(
            ".state2 .state2_lv::attr('class')").re_first(r"is-lv(\d)")

        marks = response.css(
            ".table1 .is-fs12 tr:first-child td:first-child").getall()
        for i, mark in enumerate(marks):
            match = re.search(r"icon_mark1_(\d+)\.png", mark)
            item["racers"][i]["predict_mark"] = match[1] if match else None

        # Continue with fetching the results
        url = self.get_url(
            response.css("ul.tab3_tabs li:nth-child(6) a::attr('href')").get())
        yield scrapy.Request(
            url=url,
            callback=self.parse_result,
            meta={"item": item},
        )
예제 #13
0
    def parse_odds3t(self, response: Response):
        """Parse trifecta odds, then request the pre-race information page.

        Stores ``item["trifecta"]`` on the item carried in
        ``response.meta["item"]``: a dict keyed by ``"a-b-c"`` patterns whose
        values are the odds as floats (``0`` when a cell is not numeric).
        Finally yields a request for the third tab's link with
        ``self.parse_beforeinfo`` as callback, passing the item in ``meta``.
        """
        def convert(odds):
            """Return ``odds`` as a float, or 0 when it is not a number."""
            try:
                return float(odds)
            except (TypeError, ValueError):
                # Only conversion failures are expected here; a bare except
                # would also swallow KeyboardInterrupt and real bugs.
                return 0

        item = response.meta["item"]

        odds = response.css("td.oddsPoint::text").getall()
        # presumably reorders the 3-permutations of 1..6 to match the page's
        # table layout; `transpose` is defined elsewhere - verify there
        patterns = transpose(permutations(range(1, 7), 3), 20)
        item["trifecta"] = {
            "-".join(map(str, p)): convert(o)
            for p, o in sorted(zip(patterns, odds), key=lambda x: x[0])
        }

        # Continue with fetching the pre-race information
        url = self.get_url(
            response.css("ul.tab3_tabs li:nth-child(3) a::attr('href')").get())
        yield scrapy.Request(
            url=url,
            callback=self.parse_beforeinfo,
            meta={"item": item},
        )
예제 #14
0
    def handle_page(self, response: Response) -> TorrentFileItem:
        """Yield download requests for the torrent attachments on a page.

        Two kinds of links whose text contains "torrent" are handled: direct
        ``forum.php?mod=attachment`` links, and ``imc_attachad-ad.html`` links
        from which the ``aid`` value is extracted to build the direct
        attachment URL. Every request records the page it came from in
        ``meta['from_url']`` and is handled by ``self.handle_item``.

        Note: despite the return annotation, this is a generator that yields
        ``DownloadRequest`` objects.
        """
        direct_hrefs = response.css(
            'a[href^="forum.php?mod=attachment"]:contains("torrent")::attr(href)'
        ).extract()
        ad_page_hrefs = response.css(
            'a[href^="imc_attachad-ad.html"]:contains("torrent")::attr(href)'
        ).extract()
        if not direct_hrefs and not ad_page_hrefs:
            return

        for href in direct_hrefs:
            # urljoin turns the relative href into an absolute URL
            direct_request = DownloadRequest(url=response.urljoin(href),
                                             callback=self.handle_item)
            direct_request.meta['from_url'] = response.url
            yield direct_request

        aid_pattern = re.compile(r'aid=(\w+)')
        for href in ad_page_hrefs:
            found = aid_pattern.search(href)
            if found is None:
                continue
            attachment_id = found.group(1)
            attachment_url = response.urljoin(
                'forum.php?mod=attachment&aid=%s' % attachment_id)
            ad_request = DownloadRequest(url=attachment_url,
                                         callback=self.handle_item,
                                         dont_filter=True)
            ad_request.meta['from_url'] = response.url
            yield ad_request
예제 #15
0
def _country(response: Response):
    """Yield ``(country, ol)`` pairs from the page's ``<h3>`` headings.

    Headings and ``h3+ol`` lists are paired by position; a pair is yielded
    only when the heading contains non-empty ``span.mw-headline`` text.
    """
    headings = response.css('h3')
    ordered_lists = response.css('h3+ol')
    # NOTE(review): pairing is purely positional - an <h3> that is not
    # directly followed by an <ol> would shift all later pairs; confirm
    # against the page structure.
    for heading, ordered_list in zip(headings, ordered_lists):
        name = heading.css('span.mw-headline::text').get()
        if not name:
            continue
        yield (name, ordered_list)
예제 #16
0
파일: zegna_it.py 프로젝트: Houtian17/Learn
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` holding the product code and EUR price of the page."""
        code = response.css('span.infoProduct__button::text').get()

        raw_price = response.css('span.pdpData__price::text').get()
        # Drop surrounding whitespace, the euro sign and thousands commas
        amount = raw_price.strip().strip('€ ').replace(',', '')
        price = {'eur': float(amount)}

        yield SKU(self.brand_name, '', '', code, '', response.url, price, '',
                  [], [])
예제 #17
0
파일: iwc_cn.py 프로젝트: Houtian17/Learn
    def parse_sku(self, response: Response):
        """Yield a fully described ``SKU`` for a product page.

        Collects the reference code, name, description (detail items joined
        with ``<br>``), CNY price from the ``data-tracking-product`` JSON,
        image URLs, and one attribute per non-blank line of four detail
        sections.
        """
        code = response.css('h2.iwc-buying-options-reference::text').get()
        name = response.css('h3.iwc-buying-options-title::text').get().strip()
        description_items = response.css(
            'ul[data-toggle-id="showDetails1"] li.iwc-product-detail-item::text'
        ).getall()
        description = '<br>'.join(description_items)

        tracking_json = response.css(
            'button[data-tracking-product]').attrib['data-tracking-product']
        price = {'cny': float(json.loads(tracking_json)['price'])}

        # Absolute image URL with everything from ".transform" on removed
        image_urls = [
            re.sub(r'\.transform.+', '',
                   self.base_url + thumb.attrib['data-src'])
            for thumb in response.css('div.rcms_productPageThumbnails')
        ]

        # (data-toggle-id, attribute name, also remove inner spaces)
        detail_sections = (
            ('showDetails0', '表壳', False),
            ('showDetails2', '机芯', True),
            ('showDetails3', '表带', False),
            ('showDetails4', '表盘', False),
        )
        attrs = []
        for toggle_id, label, drop_spaces in detail_sections:
            lines = response.css(
                f'ul[data-toggle-id="{toggle_id}"] li::text').getall()
            for line in lines:
                value = line.strip()
                if not value:
                    continue
                value = value.replace('\n', '')
                if drop_spaces:
                    value = value.replace(' ', '')
                attrs.append({'name': label, 'value': value})

        yield SKU(self.brand_name, '', '', code, name, response.url, price,
                  description, image_urls, attrs)
예제 #18
0
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` holding the product code and EUR price of the page."""
        code = response.css(
            'div.itemInfo-modelfabricolor span.value::text').get()

        raw_price = response.css('span.price span.value::text').get()
        # "1.234,56" -> "1234.56": drop thousands dots, comma becomes the
        # decimal point
        amount = raw_price.strip('€ ').replace('.', '').replace(',', '.')
        price = {'eur': float(amount)}

        yield SKU(self.brand_name, '', '', code, '', response.url, price, '',
                  [], [])
예제 #19
0
파일: furla_us.py 프로젝트: Houtian17/Learn
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` holding the product code and, if shown, USD price.

        ``price`` is ``{'usd': <float>}`` when the page shows a price text of
        more than one character, otherwise an empty dict.
        """
        code = response.css('div.product-number div::text').get()

        # Start from an empty mapping so a missing or blank price no longer
        # leaves `price` unbound (UnboundLocalError) when the SKU is built.
        price = {}
        price_usd = response.css('span.price-sales::text').get()
        if price_usd is not None and len(price_usd) > 1:
            price_usd = price_usd.strip('$').replace(',', '')
            price = {
                'usd': float(price_usd),
            }

        sku = SKU(self.brand_name, '', '', code, '', response.url, price, '', {}, {})
        yield sku
예제 #20
0
    def parse(self, response: Response):
        """Request every stadium's race index, then the linked earlier page.

        Links in ``td.is-alignL`` whose href contains ``raceindex`` are
        requested with ``self.parse_stadium``; the link under
        ``li.title2_navsLeft`` is requested with this method again.
        """
        params = parse_query_params(response.url)
        hrefs = response.css("td.is-alignL a::attr('href')")
        stadium_urls = hrefs.re(r".*raceindex.*")
        self.logger.info(f"{params['hd']}: {len(stadium_urls)} stadiums")
        for href in stadium_urls:
            target = self.get_url(href)
            yield scrapy.Request(url=target, callback=self.parse_stadium)

        # Go one more day back
        previous_day = response.css("li.title2_navsLeft a::attr('href')").get()
        yield scrapy.Request(url=self.get_url(previous_day),
                             callback=self.parse)
예제 #21
0
 def handle(self, response: Response) -> Result:
     """Return the product's displayed price and availability.

     ``availability`` is ``True`` only when the add-to-cart button's text
     equals "add to cart" (case-insensitive); a page without that button is
     reported as unavailable. ``price`` is the raw text of the customer
     price, or ``None`` when it is not found.
     """
     price = (
         response.css("div.priceView-hero-price")
         .css(".priceView-customer-price")
         .css("span[aria-hidden=true]::text")
         .get()
     )
     button_text = response.css("button.add-to-cart-button::text").get()
     # get() returns None when the button is absent; calling .lower() on it
     # would raise AttributeError, so fall back to an empty string.
     availability = (button_text or "").lower() == "add to cart"
     return Result(url=response.url, price=price, availability=availability)
예제 #22
0
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` holding the HKD price and reference code of the page.

        The code is taken from the first entry of the details list after
        removing the label characters, colon and surrounding whitespace.
        """
        price_text = response.css('span.sales::text').get()
        amount = price_text.strip().strip('HK$').replace(',', '')
        price = {'hkd': float(amount)}

        details = response.css('ul.list.fs-s.ff-light li::text').getall()
        # Peel the label off step by step: whitespace, label characters,
        # whitespace, colon, whitespace
        labelled_code = details[0].strip().strip('參考編號')
        code = labelled_code.strip().strip(':').strip()

        yield SKU(self.brand_name, '', '', code, '', response.url, price, '',
                  [], [])
예제 #23
0
    def parse_sku(self, response: Response):
        """Yield requests for other colours, then this page's detailed ``SKU``.

        The SKU carries name, CNY price, description, image URLs and a list
        of ``{'name', 'value'}`` attributes: the selected colour, every
        detail line (split at the first ``:`` when present), and the
        available sizes when the page offers a size choice.
        """
        name = response.css('h1.product-purchase_name::text').get()

        price_text = response.css('span.product-purchase_price::text').get()
        price = {'cny': float(price_text.strip('¥').replace(',', ''))}

        selected_color = response.css(
            'li[data-type="colour"] span.product-purchase_selected::text'
        ).get()
        attrs = [{'name': '颜色', 'value': selected_color}]

        # Crawl the other colours
        color_links = response.css(
            'li[data-type="colour"] div.product-purchase_options-labels a')
        color_urls = [self.base_url + link.attrib['href']
                      for link in color_links]
        for color_url in color_urls:
            yield Request(color_url, callback=self.parse_sku)

        description = response.css('div.accordion-tab_content p::text').get()

        detail_lines = response.css(
            'div.accordion-tab_sub-item li::text').getall()
        for line in detail_lines:
            head, colon, tail = line.partition(':')
            if colon:
                # "label:value" line
                attrs.append({'name': head, 'value': tail})
            else:
                # No label present: file the whole line under a generic name
                attrs.append({'name': '参数', 'value': head})

        size_prompt = response.css('span[data-label="选择尺码"]').get()
        if size_prompt is not None:
            sizes = response.css(
                'li[data-type="size"] div.product-purchase_options label::text'
            ).getall()
            attrs.append({'name': '尺码', 'value': ','.join(sizes)})

        images = response.css(
            'div.product-carousel_item noscript picture img')
        image_urls = []
        for image in images:
            image_urls.append('https://' + image.attrib['src'].strip('//'))

        item_number = response.css('p.accordion-tab_item-number::text').get()
        code = item_number.strip("商品 ")

        yield SKU(self.brand_name, '', '', code, name, response.url, price,
                  description, image_urls, attrs)
예제 #24
0
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` with product id and USD price; nothing without an id.

        The SKU's URL field is passed as an empty string.
        """
        code = response.css('span[itemprop="productID"]::text').get()
        if code is None:
            return

        raw_price = response.css('span.price-sales::text').get()
        # Dots are removed and the comma becomes the decimal point
        amount = raw_price.strip('$ ').replace('.', '').replace(',', '.')

        yield SKU(self.brand_name, '', '', code, '', '',
                  {'usd': float(amount)}, '', {}, {})
예제 #25
0
 def handle(self, response: Response) -> Result:
     """Return availability and price for the product page.

     The product counts as available when no ``div#outOfStock`` is present.
     For unavailable products the price is the string "UNKNOWN"; otherwise
     it is the stripped buy-box price, falling back to the first span text
     under ``div#buyNew_noncbb`` (``None`` when neither is found).
     """
     in_stock = response.css("div[id=outOfStock]").get() is None
     if not in_stock:
         return Result(url=response.url, price="UNKNOWN",
                       availability=in_stock)

     # Primary price location
     price = response.css("span[id=price_inside_buybox]::text").get()
     if not price:
         # Fallback price location
         fallback = response.css("div[id=buyNew_noncbb]")
         price = fallback.css("span::text").get()
     if isinstance(price, str):
         price = price.strip()
     return Result(url=response.url, price=price, availability=in_stock)
예제 #26
0
    def single_parse(self, response: Response):
        """Parse one product page into an ``AmazonItem``.

        When the page has no product title the same URL is requested again.
        Otherwise the item is filled with time, title, URL, review count,
        description, shop name/URL and breadcrumb categories. The item is
        yielded directly when the shop is Amazon itself or unknown; for
        third-party sellers a request to the seller page is yielded instead,
        carrying the item in ``meta['product']`` for ``self.shop_parse``.
        """
        self.debug(response)
        title = response.xpath('//h1[@id="title"]/*/text()').get('').strip()
        if not title:
            # This function is a generator, so the retry must be *yielded*:
            # a `return Request(...)` here is discarded by Scrapy and the
            # page would never be re-requested.
            # NOTE(review): the retry is unbounded - a page that never shows
            # a title is re-requested forever; consider capping it.
            yield Request(response.url,
                          self.single_parse,
                          dont_filter=True,
                          errback=self.errors('single_parse'))
            return
        log('product')
        now = dt.now(timezone('Asia/Tokyo'))
        product = AmazonItem()

        product['time'] = now.strftime('%Y-%m-%dT%H-%M-%S')
        product['title'] = title
        product['url'] = response.url
        review = response.css('span#acrCustomerReviewText::text').get('')
        # Drop the last four characters of the review text; presumably a
        # fixed-length suffix after the count - TODO confirm
        product['review_num'] = review[0:-4] if review else 0
        product['description'] = '\n'.join([
            x.strip() for x in response.css(
                '#feature-bullets > ul > li *::text').getall()
            if x.strip() not in ('', 'モデル番号を入力してください', 'これが適合するか確認:')
        ])

        amazon_help_url = 'https://www.amazon.co.jp/gp/help/customer/display.html?nodeId=202008070'
        seller = response.css('a#sellerProfileTriggerId')
        if seller:
            # Third-party seller: build the seller page URL from its id
            shop_name = seller.css('*::text').get('')
            seller_id = get_query_val(seller.attrib['href'], 'seller')
            shop_url = f'{amazon_url}/sp?seller={seller_id}' if seller_id else ''
        elif response.xpath('//*[@id="merchant-info"]/a'):
            # No seller profile link but a merchant-info link: sold by Amazon
            shop_name = 'Amazon.co.jp'
            shop_url = amazon_help_url
        else:
            shop_name = '-'
            shop_url = ''
        product['shop_name'] = shop_name
        product['shop_url'] = shop_url
        product['categories'] = ' > '.join([
            el.get().strip() for el in response.css(
                '#wayfinding-breadcrumbs_feature_div > ul > li > span > a::text'
            )
        ])
        if shop_url == amazon_help_url:
            product['shop_address'] = '〒153-0064 東京都目黒区下目黒1-8-1 日本'
            yield product
        elif shop_url:
            # The shop address is filled in by shop_parse
            yield Request(shop_url,
                          self.shop_parse,
                          meta={'product': product},
                          dont_filter=True,
                          errback=self.errors('single_parse', response.url))
        else:
            product['shop_address'] = '---'
            yield product
예제 #27
0
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` holding the product code and EUR price of the page.

        The code is the model text with a leading ``Modello:`` label and
        surrounding whitespace removed.
        """
        # "1.234,56" -> "1234.56": drop thousands dots, comma becomes the
        # decimal point
        price_eur = response.css('div.itemPrice span.value::text').get().replace('.', '').replace(',', '.')
        price = {
            'eur': float(price_eur)
        }

        # Remove the label as a prefix. str.strip('Modello: ') treats its
        # argument as a set of characters and would also eat matching
        # characters belonging to the code itself (e.g. a leading or trailing
        # "M").
        label = 'Modello:'
        code = response.css('div.item-mfc span.item-mfc-value::text').get().strip()
        if code.startswith(label):
            code = code[len(label):].strip()

        sku = SKU(self.brand_name, '', '', code, '', response.url, price, '', [], [])

        yield sku
예제 #28
0
파일: furla_it.py 프로젝트: Houtian17/Learn
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` holding the product code and, if shown, EUR price.

        ``price`` is ``{'eur': <float>}`` when the page shows a price text of
        more than one character, otherwise an empty dict.
        """
        code = response.css('div.product-number div::text').get()
        price = {}
        price_eur = response.css('span.price-sales::text').get()
        if price_eur is not None and len(price_eur) > 1:
            price_eur = price_eur.strip('€ ')
            if ',' in price_eur:
                # Comma is the decimal mark, so any dots are thousands
                # separators: "1.234,00" must become "1234.00", not
                # "1.234.00" (which float() rejects).
                price_eur = price_eur.replace('.', '').replace(',', '.')
            price = {
                'eur': float(price_eur),
            }

        sku = SKU(self.brand_name, '', '', code, '', response.url, price, '',
                  [], {})
        yield sku
예제 #29
0
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` holding the product code and HKD price of the page.

        The code is the product-id text with a leading ``Product ID:`` label
        and surrounding whitespace removed.
        """
        price_hkd = response.css(
            'div.itemPrice span.value::text').get().replace(',', '')
        price = {'hkd': float(price_hkd)}

        # Remove the label as a prefix. str.strip('Product ID: ') treats its
        # argument as a set of characters and would also eat matching
        # characters belonging to the code itself (e.g. a trailing "D" or
        # "P").
        label = 'Product ID:'
        code = response.css(
            'div.item-mfc span.item-mfc-value::text').get().strip()
        if code.startswith(label):
            code = code[len(label):].strip()

        sku = SKU(self.brand_name, '', '', code, '', response.url, price, '',
                  [], [])

        yield sku
예제 #30
0
파일: iwc_us.py 프로젝트: Houtian17/Learn
    def parse_sku(self, response: Response):
        """Yield a ``SKU`` holding the reference code and USD price of the page.

        The price comes from the ``price`` key of the JSON stored in the
        ``data-tracking-product`` attribute of a button on the page.
        """
        code = response.css('h2.iwc-buying-options-reference::text').get()

        tracking_button = response.css('button[data-tracking-product]')
        tracking_json = tracking_button.attrib['data-tracking-product']
        product_data = json.loads(tracking_json)
        price = {'usd': float(product_data['price'])}

        yield SKU(self.brand_name, '', '', code, '', response.url, price, '',
                  [], [])