Exemplo n.º 1
0
    def test_first_match_unicode_list(self):
        """first_match yields the leading value when extract() is stubbed with unicode items."""
        stubbed_values = [u'one', u'two', u'three', u'four']
        selectors = SelectorList()
        selectors.extract = Mock(return_value=stubbed_values)

        result = Utils.first_match(selectors)

        assert result == u'one', "First item from unicode list incorrect!"
Exemplo n.º 2
0
    def test_first_match_string_list(self):
        """first_match yields the leading value when extract() is stubbed with plain strings."""
        stubbed_values = ['one', 'two', 'three', 'four']
        selectors = SelectorList()
        selectors.extract = Mock(return_value=stubbed_values)

        result = Utils.first_match(selectors)

        assert result == 'one', "First item from string list incorrect!"
Exemplo n.º 3
0
 def parse(self, response):
     """Yield a listing request for each sub-category link in the nav menu.

     When ``self.categories`` is truthy, only the top-level ``li`` entries
     whose ``a/span`` text equals one of the non-empty category names are
     followed; otherwise every top-level entry is used.  Links whose span
     text contains "Accessories" are skipped.  Each request targets
     ``<href>?limit=all`` and is handled by ``self.parse_items``.
     """
     menu = response.css('#nav_custom').xpath('li/ul')
     lis = menu.xpath('li')
     if self.categories:
         # Drop empty names *before* building the XPath: a ``None`` entry
         # would raise TypeError on concatenation, and an empty string
         # would only run a query whose results are thrown away.
         lis = SelectorList([li
                             for cat in self.categories if cat
                             for li in menu.xpath('li[a[span[text() = "' + cat + '"]]]')])
     for url in lis.xpath('ul/li/a[span[not(contains(text(), "Accessories"))]]/@href').extract():
         yield Request("%s?limit=all" % url, callback=self.parse_items)
Exemplo n.º 4
0
 def parse(self, response):
     """Yield a request for each sub-category link under ``#top-menu``.

     When ``self.categories`` is truthy, only the top-level ``li`` entries
     whose ``h3`` text contains one of the non-empty category names are
     followed; otherwise every top-level entry is used.  Links whose text
     contains "All" are skipped.  Responses go to ``self.parse_items``.
     """
     menu = response.css('#top-menu').xpath('ul/li/ul')
     # Keep the selectors themselves: calling .extract() here would turn
     # them into plain strings, and the .xpath() call below would then fail
     # with AttributeError whenever no categories are configured.
     lis = menu.xpath('li')
     if self.categories:
         # Drop empty names before building the XPath so a ``None`` entry
         # cannot raise TypeError on concatenation.
         lis = SelectorList([li
                             for cat in self.categories if cat
                             for li in menu.xpath('li[h3[contains(text(), "' + cat + '")]]')])
     for url in lis.xpath('ul/li/a[not(contains(text(), "All"))]/@href').extract():
         yield Request(url, callback=self.parse_items)
Exemplo n.º 5
0
 def parse(self, response):
     """Yield a request for each category link found in ``#vmenu_69``.

     When ``self.categories`` is truthy, only the ``li`` entries whose
     ``div/a`` text equals one of the non-empty category names are
     followed; otherwise every ``li`` entry is used.  Responses go to
     ``self.parse_items``.
     """
     # NOTE(review): get_extracted is defined elsewhere; the code below
     # assumes it returns something that still supports .xpath() — confirm.
     menu = get_extracted(response.css('#vmenu_69'))
     lis = menu.xpath('li')
     if self.categories:
         # Drop empty names before building the XPath so a ``None`` entry
         # cannot raise TypeError on concatenation.
         lis = SelectorList([li
                             for cat in self.categories if cat
                             for li in menu.xpath('li[div/a[text() = "' + cat + '"]]')])
     for url in lis.xpath('div/a/@href').extract():
         yield Request(url, callback=self.parse_items)
Exemplo n.º 6
0
    def test_trim_list_mixed_list(self):
        """trim_list strips surrounding whitespace from a mix of str and unicode items."""
        raw_values = ['  one  ', u'  two point five ', 'three    ', u'   four']
        selectors = SelectorList()
        selectors.extract = Mock(return_value=raw_values)

        expected = ['one', 'two point five', 'three', 'four']
        actual = Utils.trim_list(selectors)

        assert expected == actual, "Mixed list incorrect trimmed"
Exemplo n.º 7
0
 def select_from(self, selector: SelectorList) -> SelectorList:
     """Apply ``self.string_selector`` (CSS) to ``selector``.

     Returns the matches when there are any.  When nothing matches, either
     raises ``RuntimeError`` (if ``self.raise_on_missed`` is truthy) or logs
     a warning and returns an empty ``SelectorList``.
     """
     found = selector.css(self.string_selector)
     if found:
         return found

     message = 'Not found any "{}" containers.'.format(self.name)
     if self.raise_on_missed:
         raise RuntimeError(message)

     self.logger.warning(message)
     return SelectorList([])
Exemplo n.º 8
0
    def test_trim_list_string_list(self):
        """trim_list strips surrounding whitespace from plain-string items."""
        raw_values = ['  one  ', '  two point five ', 'three    ', '   four']
        selectors = SelectorList()
        selectors.extract = Mock(return_value=raw_values)

        expected = ['one', 'two point five', 'three', 'four']
        actual = Utils.trim_list(selectors)

        # The failure message is built inside the assert so it is only
        # evaluated when the comparison fails.
        assert expected == actual, "String list incorrect trimmed: {%s} vs {%s}" % (
            ', '.join(str(item) for item in expected),
            ', '.join(str(item) for item in actual))
Exemplo n.º 9
0
    def test_trim_list_unicode_list(self):
        """trim_list strips surrounding whitespace from unicode items."""
        raw_values = [u'  one  ', u'  two point five ', u'three    ', u'   four']
        selectors = SelectorList()
        selectors.extract = Mock(return_value=raw_values)

        expected = [u'one', u'two point five', u'three', u'four']
        actual = Utils.trim_list(selectors)

        # The failure message is built inside the assert so it is only
        # evaluated when the comparison fails.
        assert expected == actual, "Unicode list incorrect trimmed: {%s} vs {%s}" % (
            ', '.join(unicode(item) for item in expected),
            ', '.join(unicode(item) for item in actual))
Exemplo n.º 10
0
 def parse(self, response):
     """Yield a request for each sub-category link under ``#top-menu``.

     When ``self.categories`` is truthy, only the top-level ``li`` entries
     whose ``h3`` text contains one of the non-empty category names are
     followed; otherwise every top-level entry is used.  Links whose text
     contains "All" are skipped.  Responses go to ``self.parse_items``.
     """
     menu = response.css('#top-menu').xpath('ul/li/ul')
     # Keep the selectors themselves: calling .extract() here would turn
     # them into plain strings, and the .xpath() call below would then fail
     # with AttributeError whenever no categories are configured.
     lis = menu.xpath('li')
     if self.categories:
         # Drop empty names before building the XPath so a ``None`` entry
         # cannot raise TypeError on concatenation.
         lis = SelectorList([
             li for cat in self.categories if cat
             for li in menu.xpath('li[h3[contains(text(), "' + cat + '")]]')
         ])
     for url in lis.xpath(
             'ul/li/a[not(contains(text(), "All"))]/@href').extract():
         yield Request(url, callback=self.parse_items)
Exemplo n.º 11
0
    def _make_selector_list(self, elems, is_text, text_recurse, attr):
        """Wrap ``elems`` in a ``SelectorList``.

        With ``is_text`` truthy the list holds one ``_TextNode`` per value
        produced by ``self._text_content(elem, text_recurse)``.  Otherwise it
        holds the results of ``self._make_result``, each replaced by a
        ``_NodeAttribute`` for ``attr`` when ``attr`` is truthy.
        """
        # Exact type test: anything that is not a plain ``list`` (list
        # subclasses included) is treated as a single element.
        elements = elems if type(elems) is list else [elems]

        if not is_text:
            nodes = self._make_result(elements)
            if attr:
                nodes = (_NodeAttribute(node.element, attr) for node in nodes)
            return SelectorList(nodes)

        return SelectorList(
            _TextNode(self.webdriver, text)
            for element in elements
            for text in self._text_content(element, text_recurse)
        )
Exemplo n.º 12
0
 def parse_comment(self, response):
     """Yield one ``CommentItem`` per ``div.comment`` block on the page.

     The movie name is the page's first ``h1`` text with '短评' removed and
     surrounding whitespace stripped.  For every comment the short text,
     vote count, star rating and comment time are read from the comment's
     own HTML fragment.  ``stars`` is the first class token of the rating
     span with 'allstar' and every '0' removed, or the integer ``0`` when
     no rating span is present.
     """
     page = Selector(response)
     movie_name = page.xpath(
         '//h1/text()').extract_first().replace('短评', '').strip()
     # Iterate over the extracted HTML strings directly; they are not
     # selectors, so wrapping them in a SelectorList adds nothing.
     for comment_html in page.xpath('//div[@class="comment"]').extract():
         # Parse each fragment once and reuse it for every field instead of
         # re-parsing the same HTML for each lookup.
         comment = Selector(text=comment_html)
         shorts = comment.xpath('//p/span/text()').extract_first()
         votes = comment.xpath(
             '//h3/span[@class="comment-vote"]/span/text()').extract_first()
         stars = comment.xpath(
             '//h3/span[@class="comment-info"]/span[contains(@class,"rating")]/@class'
         ).extract_first()
         if stars:
             stars = stars.split()[0].replace('allstar',
                                              '').strip().replace('0', '')
         else:
             stars = 0
         comment_time = comment.xpath(
             '//h3/span[@class="comment-info"]/span[@class="comment-time "]/text()'
         ).extract_first()
         comment_item = CommentItem()
         comment_item['movie_name'] = movie_name
         comment_item['shorts'] = shorts
         comment_item['stars'] = stars
         comment_item['votes'] = votes
         comment_item['comment_time'] = comment_time
         yield comment_item
Exemplo n.º 13
0
def parse_body(obj_name: str, logger: Logger,
               selectors: SelectorList) -> 'str | None':
    """Join the lines selected by ``selectors`` into one newline-separated string.

    Args:
        obj_name: Label used in log/validation messages.  When it is not
            ``'body'`` and ``selectors`` has no first match, ``None`` is
            returned instead of a string.
        logger: Logger passed through to the validation helpers.
        selectors: One selector per line of the text being parsed.

    Returns:
        The parsed lines joined with ``'\\n'``, or ``None`` as described
        above.
    """
    # The return annotation is quoted: the former ``str or None`` expression
    # evaluates to plain ``str`` and silently dropped the ``None`` case.
    logger.info(
        f'--------------------------parse_body() {obj_name}, selectors[0]: {selectors.get(default="")[:50] or None}'
    )

    if obj_name != 'body' and not selectors.get():
        return None

    def parse_line(selector: Selector) -> str:
        """Convert a single line selector to text."""
        # A line containing <br> is rendered as an empty line.
        if selector.css('br').get():
            return ''
        # A line linking to "mitemin" is delegated to the illustration
        # parser; element [1] of its result is used as the line.
        elif selector.css('a[href*="mitemin"]').get():
            return parse_mitemin_href(f'an illust in {obj_name}', logger,
                                      selector)[1]

        texts = validate(f"a line of {obj_name}", logger,
                         selector.css("*::text").getall())
        if len(texts) == 1:
            return texts[0]

        # Several text nodes: pair each <rb> text with its <rt> text as
        # '|base《ruby》' and interleave those with the collected text nodes.
        larges = validate(f"words in a line of {obj_name}", logger,
                          selector.css('rb::text').getall())
        smalls = validate(f"rubies in a line of {obj_name}", logger,
                          selector.css('rt::text').getall())
        ruby_texts = [
            f'|{large}《{small}》' for large, small in zip(larges, smalls)
        ]
        return ''.join([
            ''.join(pair)
            for pair in zip_longest(texts, ruby_texts, fillvalue='')
        ])

    return '\n'.join(map(parse_line, selectors))
Exemplo n.º 14
0
def childes(
    selector: SelectorList,
    parent_tag: str,
) -> SelectorList:
    """Collect the ``:nth-child`` matches under ``parent_tag``, in order.

    Queries ``selector`` with ``"<parent_tag> > :nth-child(i)"`` for
    i = 1, 2, ... and appends each non-empty result to the returned
    ``SelectorList``; stops at the first index that matches nothing.

    Raises:
        TypeError: if ``parent_tag`` is not a ``str``.
    """
    if not isinstance(parent_tag, str):
        raise TypeError('Given `parent_tag` is not `str` object.')

    query_template = parent_tag + ' > :nth-child({i})'
    collected = SelectorList()

    position = 1
    match = selector.css(query_template.format(i=position))
    while match:
        collected.append(match)
        position += 1
        match = selector.css(query_template.format(i=position))
    return collected
Exemplo n.º 15
0
def parse_href_end_num(obj_name: str, logger: Logger,
                       selector: SelectorList) -> int:
    """Return, as an int, the second-to-last '/'-separated piece of the selector's ``@href``.

    Both the href and the extracted piece are passed through ``validate``
    with messages naming ``obj_name``.
    """
    logger.info('--------------------------parse_href_end_num()')

    href = selector.xpath('@href').get()
    # e.g. 'https://hoge.com/huga/114514/'
    url: str = validate(f'url of {obj_name}', logger, href)

    pieces = url.split('/')
    end_piece = validate(f'{obj_name} in url', logger, safe_get(pieces, -2))
    return int(end_piece)
Exemplo n.º 16
0
    def parse_model(self, listing):
        """Build and yield one ``VehicleInfo`` item from a vehicle detail page.

        Year and make are taken from the last path segment of
        ``listing.url`` (with ``detail-`` removed); the model is the page
        heading with the leading "<year> <make> " removed.  The remaining
        fields are read from fixed XPath/CSS locations on the page.
        ``miles`` and ``veh_state`` are hard-coded to "0" and "new".
        """
        vehicle = VehicleInfo()
        full_name = listing.xpath(
            '//*[@id="vdp-title"]/div/div/div[1]/div[1]/div[1]/h1/text()'
        ).extract_first()
        url_path = listing.url.split('/')[-1].replace('detail-', '')

        vehicle['year'] = re.findall('[0-9]{4}', url_path)[0]
        vehicle['make'] = url_path.split('-')[1].replace('_', ' ').capitalize()
        # Plain substring removal: the prefix comes from URL data and must
        # not be interpreted as a regular expression.
        vehicle['model'] = full_name.replace(
            vehicle['year'] + ' ' + vehicle['make'] + ' ', '')
        vehicle['domain'] = self.domain
        vehicle['trim'] = (
            listing.xpath('//*[@id="vdp-1-toggle"]/div[2]/div/div[2]/text()'
                          ).extract_first()).strip()
        vehicle['ext_color'] = listing.xpath(
            '//*[@id="tab-details"]/div[3]/table/tbody/tr[1]/td[2]/text()'
        ).extract_first().strip()
        vehicle['int_color'] = listing.xpath(
            '//*[@id="tab-details"]/div[3]/table/tbody/tr[2]/td[2]/text()'
        ).extract_first().strip()
        vehicle['stock_no'] = listing.xpath(
            '//*[@id="tab-details"]/div[3]/table/tbody/tr[3]/td[2]/text()'
        ).extract_first().strip()
        vehicle['miles'] = "0"
        vehicle['vin'] = listing.xpath(
            '//*[@id="tab-details"]/div[3]/table/tbody/tr[4]/td[2]/text()'
        ).extract_first()
        vehicle['url'] = listing.url
        vehicle['price'] = listing.xpath(
            '//*[@id="vdp-price"]/div/h4/text()').extract_first().lstrip('$')
        vehicle['veh_state'] = "new"
        vehicle['engine'] = listing.xpath(
            '//*[@id="tab-details"]/div[2]/table/tbody/tr[3]/td[2]/text()'
        ).extract_first()
        vehicle['transmission'] = listing.xpath(
            '//*[@id="tab-details"]/div[2]/table/tbody/tr[4]/td[2]/text()'
        ).extract_first()
        vehicle['drivetrain'] = listing.xpath(
            '//*[@id="tab-details"]/div[2]/table/tbody/tr[5]/td[2]/text()'
        ).extract_first()
        vehicle['body_type'] = listing.xpath(
            '//*[@id="vdp-title"]/div/div/div[1]/div[1]/div[1]/h4/text()'
        ).extract_first().strip()
        vehicle['title'] = listing.xpath(
            '/html/head/title/text()').extract_first()
        image_urls = listing.css(
            '#tab-slideshow-photos .swiper-slide a::attr(href)').extract()

        # The hrefs are joined as a comma-separated list, each prefixed
        # with "http:".
        vehicle['image'] = ','.join('http:' + img for img in image_urls)
        yield vehicle
Exemplo n.º 17
0
 def parse(self, response):
     """Yield ``self.get_record_fields(...)`` for every record selected from ``response``.

     Records are selected with ``self.record_spec['css']`` when that entry
     is truthy, else with ``self.record_spec['xpath']`` when that entry is
     truthy; with neither, nothing is yielded.
     """
     spec = self.record_spec
     if spec.get('css'):
         records = response.css(spec['css'])
     elif spec.get('xpath'):
         records = response.xpath(spec['xpath'])
     # Disabled jsonpath branch, kept for reference:
     # elif self.record_spec.get('jsonpath'):
     #     json_response = json.loads(response.body_as_unicode())
     #     jsonpath_expr = parse(self.record_spec['jsonpath'])
     #     records_raw = [json.dumps(match.value) for match in jsonpath_expr.find(json_response)]
     #     record_selectors = SelectorList([Selector(text=record_raw) for record_raw in records_raw])
     else:
         records = SelectorList()

     for record in records:
         yield self.get_record_fields(record)
Exemplo n.º 18
0
 def parse_article_item(self, response, extra_info):
   """Build a ``CommonNewsItem`` from an article page.

   ``extra_info`` supplies ``scr``, ``cid``, ``media_id`` and optionally
   ``release_time`` and ``abstract``; the remaining fields are scraped from
   ``response``.  Timestamps are stored as integer milliseconds.

   Returns the populated item, or ``None`` when any step of the parsing
   raises (the error and traceback are printed).
   """
   item = CommonNewsItem()
   item['original_url'] = format_url(response.url)
   item['id'] = hash_digest(item.get('original_url'))
   try:
     item['scr'] = extra_info.get('scr')
     item['cid'] = extra_info.get('cid')
     item['media'] = extra_info.get('media_id')
     item['title'] = response.css('div#article div.articlehead h1::text').extract_first().replace('\r\n', '')
     author = response.css('div#article div.articlehead span.author::text').extract_first() or ''
     item['author'] = author.replace('\r\n', '')
     if extra_info.get('release_time'):
       item['release_time'] = extra_info.get('release_time')
     else:
       # Only needed when the caller gave no release time: the date comes
       # from the publication_date <meta> tag, the HH:MM part from the
       # on-page timestamp element.
       date_str = '%s %s' % (
         response.xpath('head/meta[contains(@name,"publication_date")]/@content').extract_first(),
         response.css('div#article.bloc span.timestampUpdatedright').re_first(r'\d+:\d\d'))
       item['release_time'] = int((datetime.strptime(date_str,'%Y-%m-%d %H:%M')).timestamp()*1000)
     item['recom_time'] = int(datetime.now().timestamp()*1000)
     abstract = extra_info.get('abstract') or response.css('div#article div.articlehead span.kicker::text').extract_first() or ''
     item['abstract'] = abstract.replace('\r\n', '')
     item['content_type'] = 0  # default; switched to 1 below when videos are found
     item['url'] = ''
     content_selector = response.css('div#articlebody')
     thumbnail_src = response.xpath('head/meta[@property = "og:image"]/@content').extract_first()
     thumbnail_selector = SelectorList([Selector(text = '<img src="%s">' % thumbnail_src)])
     content, item['img'] = strip_html_imgs(content_selector, thumbnail_selector)
     # extract videos
     html, item['video'] = extract_content_videos(content)
     if item['video']:
       item['content_type'] = 1
     # format html: strip attributes, then drop every tag whose name does
     # not start with 'p' or '!'
     html = strip_html_attrs(html)
     html = re.sub(r'</?[^p!][^>]*>','',html)
     item['content'] = html
     return item
   except Exception as e:
     print('failed to parse_article, url: %s' % response.url, e)
     traceback.print_exc()
     item['recom_time'] = None  # illegal flag
     return None
Exemplo n.º 19
0
import logging
Exemplo n.º 20
0
    def parse_product(self, response):
        """Build an ``MTSGetdataItem`` from a product page and yield it.

        Reads ``price``, ``product_url``, ``item_id`` and ``title`` from
        ``response.meta``.  The item is yielded only when at least one
        colour ends up with a non-empty pricing list.

        NOTE(review): this is Python 2 code (``print`` statements), and the
        ``extract_pic = `` assignment below has no right-hand side in this
        copy, so the function does not parse as shown.
        """
        pic = response.xpath('//ul[@id="J_UlThumb"]/li/a/img/@src').extract()
        detail = response.xpath('//div[@class="attributes" and @id="attributes"]').extract()
        color_pics = response.xpath('//dd/ul[contains(@class,"tm-clear J_TSaleProp tb-img")]/li')
        total_list = response.xpath('//dd/ul[contains(@class,"tm-clear J_TSaleProp")]/li')
        json_getter = response.xpath('//div[@class="tm-clear"]/script[3]').extract()
        # NOTE(review): incomplete statement -- the assigned value is missing
        # and cannot be determined from the visible code.
        extract_pic = 
                
        print len(extract_pic)

        # Entries of total_list whose label text is not one of the colour
        # labels.
        size_list = SelectorList()
        
        for i in total_list:
            if i.xpath('a/span/text()').extract()[0] not in color_pics.xpath('a/span/text()').extract():
                size_list.append(i)
#        cnt = 0
#        sku_type = 0
#        for i in total_list:
#            cnt = cnt + 1
#            if i.xpath('a/span/text()').extract()[0] not in color_pics.xpath('a/span/text()').extract():
#                size_list.append(i)
#                sku_type = cnt
#        if sku_type > len(total_list)-len(color_pics):
#            sku_type = 0
#        else:
#            sku_type = 1
#
#        if len(size_list)==0:
#            sku_type = 3
#
        # cut_line = len(total_list)-len(color_pics)

        # Maps each option's label text to its @data-value attribute.
        str_val_map = {}

        for i in range(len(total_list)):
            v = total_list[i].xpath('@data-value').extract()
            n = total_list[i].xpath('a/span/text()').extract()
            if len(n)>0 and len(v)>0:
                str_val_map[n[0]] = v[0]
            else:
                print "Value Error"
        #size_list = total_list[:cut_line]
        # Everything after 'TShop.Setup(' in the selected <script> is handed
        # to self.python_getter, whose result is indexed like a dict.
        st = json_getter[0].split('TShop.Setup(')[1]
        info_dict = self.python_getter(st)
        skuMap = info_dict["valItemInfo"]["skuMap"]

        # Thumbnail URLs passed through self.resize_pic.
        product_img = []
        for i in pic:
            tmp = self.resize_pic(i)
            product_img.append(tmp)
    
        # color_set = selen.single_page(response.url)
        color_set = []
        for i in color_pics:
            color = {}
            color['color'] = i.xpath('@title').extract()[0]
            try:
                # Slice the image URL out of the inline style: from three
                # characters past the first '(' up to the last ')', then
                # cut at the last '_'.
                tmp_str = i.xpath('a/@style').extract()[0]
                tmp_str = tmp_str[tmp_str.find('(')+3:tmp_str.rfind(')')]
                tmp_str = tmp_str[0:tmp_str.rfind('_')]
                color['image_url'] = tmp_str
            except:
                # Any failure above falls back to the first product image.
                color['image_url'] = product_img[0]

            color['alternative_image_urls'] = product_img
            color['pricing_list'] = self.get_pricing(skuMap, str_val_map, i, size_list, response.meta['price'])
            # Colours without any pricing entry are dropped.
            if len(color['pricing_list'])>0:
                color_set.append(color)
        
        item = MTSGetdataItem()
        item['product_url'] = response.meta['product_url']
        item['item_id'] = response.meta['item_id']
        item['title'] = response.meta['title']
        item['brand'] = 'Midi'
        item['merchant'] = 'Tmall'
        item['product_description'] = ''
        item['product_detail'] = detail[0]
        item['colors'] = color_set
        item['categories'] = info_dict['itemDO']['categoryId']
        if len(item['colors'])>0:
            yield item
Exemplo n.º 21
0
def select(selector: SelectorList, string_selector: str) -> SelectorList:
    """Run the CSS query ``string_selector`` against ``selector`` and return the matches."""
    matches = selector.css(string_selector)
    return matches
Exemplo n.º 22
0
 def select_script(self, script, *args):
     """Run ``script`` through the webdriver and wrap what it returns.

     ``args`` are forwarded to ``execute_script``; the script's result is
     passed through ``self._make_result`` and returned as a ``SelectorList``.
     """
     script_result = self.webdriver.execute_script(script, *args)
     wrapped = self._make_result(script_result)
     return SelectorList(wrapped)
Exemplo n.º 23
0
    def test_first_match_empty_list(self):
        """first_match returns None for an empty SelectorList."""
        empty_selectors = SelectorList([])

        result = Utils.first_match(empty_selectors)

        assert result is None, "First item from empty list incorrect!"
Exemplo n.º 24
0
    def test_trim_list_empty_list(self):
        """trim_list returns None when extract() is stubbed to return nothing."""
        selectors = SelectorList()
        selectors.extract = Mock(return_value=[])

        result = Utils.trim_list(selectors)

        assert result is None, "Empty list incorrect trimmed"