Exemplo n.º 1
0
    def __init__(self, query=None, page=1, region=DEFAULT_REGION, url=None, html=None, html_element=None, products=None):
        """Populate the product list from the most upstream input supplied.

        The inputs form a pipeline: ``query`` -> ``url`` -> ``html`` ->
        ``html_element`` -> ``products``. Every stage that is not ``None``
        is converted into the next one, so a value derived from an earlier
        stage replaces whatever was passed for a later stage.

        Args:
            query: Search query; together with ``page`` and ``region`` it
                is turned into one URL per page via ``build_url``.
            page: A page number or an iterable of page numbers. Only used
                when ``query`` is given.
            region: Forwarded to ``build_url`` when ``query`` is given.
            url: A single URL or an iterable of URLs to download.
            html: A single document (``str`` or ``bytes``) or an iterable
                of documents to parse.
            html_element: Iterable of parsed elements exposing
                ``cssselect``. NOTE(review): elements that are themselves
                iterable (as lxml elements are) would be walked child by
                child if passed bare -- confirm callers wrap a single
                element in a list.
            products: A single product or an iterable of products; each
                must expose ``is_valid()`` and ``_index``.
        """
        def get_iter(it):
            # Text and byte strings are iterable, but here they always
            # stand for one value (a URL or one HTML document).
            if isinstance(it, (str, bytes, bytearray)) or not hasattr(it, '__iter__'):
                return [it]
            return it

        self._products = []
        self._indexes = []
        self._urls = []

        if query is not None:
            page = get_iter(page)
            url = [build_url(query=query, page_num=p, region=region) for p in page]
        if url is not None:
            url = get_iter(url)
            self._urls = url
            html = []
            for u in url:
                # Close every HTTP response as soon as its body is read.
                with request.urlopen(request.Request(build_url(u), **URL_ADDONS)) as response:
                    html.append(response.read())
        if html is not None:
            html = get_iter(html)
            html_element = [html_module.fromstring(h) for h in html]
        if html_element is not None:
            html_element = get_iter(html_element)
            # Accumulate across all pages; rebinding ``products`` inside
            # the loop would keep only the last page's results. ``parsed``
            # stays None when there are no pages at all, so an explicitly
            # passed ``products`` is left untouched in that case.
            parsed = None
            for html_el in html_element:
                if parsed is None:
                    parsed = []
                items = html_el.cssselect('div.s-result-list.s-search-results.sg-row > .s-result-item')
                # Only result items that contain an <h2> are wrapped as products.
                parsed.extend(AmzProduct(elem) for elem in items if elem.cssselect('h2'))
            if parsed is not None:
                products = parsed
        if products is not None:
            products = get_iter(products)
            products = [prod for prod in products if prod.is_valid()]
            self._products += products
            self._indexes += [prod._index for prod in products]
Exemplo n.º 2
0
    def _get_from_html(self, root):
        """Extract product attributes from one search-result element.

        Args:
            root: Parsed element for a single result item; it must support
                ``cssselect`` and ``get``.

        Returns:
            dict: Keys ``title``, ``product_url``, ``image_url``,
            ``rating``, ``prices``, ``extra_attributes`` and ``_index``,
            plus ``subtext`` when any subtext rows are found. Top-level
            string values are stripped of surrounding whitespace.

        Raises:
            IndexError: If ``root`` has no ``<a>`` containing an ``<h2>``,
                or no ``<img>`` with a ``src`` attribute.
        """
        d = {}

        # The title link is the first <a> that wraps at least one <h2>.
        title_root = [a for a in root.cssselect('a') if a.cssselect('h2')][0]
        d['title'] = ''.join(h.text_content() for h in title_root.cssselect('h2'))
        d['product_url'] = build_url(title_root.get('href'))

        rows = title_root.getparent().getparent().cssselect(
            'div[class="a-row a-spacing-none"]')
        for row in rows:
            temp_subtext = ''.join(
                span.text_content()
                for span in row.cssselect('span[class*="a-size-small"]'))
            if temp_subtext:
                d.setdefault('subtext', []).append(temp_subtext)

        d['image_url'] = root.cssselect('img[src]')[0].get('src')
        # A falsy rating object is normalised to None.
        d['rating'] = AmzRating(root) or None

        def looks_like_price(el):
            # Price text has no lowercase letters or hyphens, has a '.' or
            # ',' separator, and contains at least one digit.
            text = str(el.text)
            return (re.match(r'^[^a-z\-]+$', text)
                    and re.search(r'[\.\,]', text)
                    and re.search(r'\d', text))

        d['prices'] = {}
        price_names = root.cssselect('h3[data-attribute]')
        price_elems = [el for el in root.cssselect('span[class^="a"]')
                       if looks_like_price(el)]
        for i, el in enumerate(price_elems):
            if i < len(price_names):
                price_key = price_names[i].text
            else:
                # defaults to a number if no name for price type
                price_key = str(len(d['prices']))
            d['prices'][price_key] = el.text

        extras = root.cssselect(
            'div[class="a-fixed-left-grid-inner"] > div > span')
        extras = [re.sub(r'\s+', ' ', x.text_content().strip()) for x in extras]
        # Spans alternate name, value, name, value, ...; a trailing
        # unpaired span is dropped.
        d['extra_attributes'] = dict(zip(extras[::2], extras[1::2]))

        # _index is not used explicitly in _all_attrs but can be referenced elsewhere
        d['_index'] = root.get('id', '').split('_')[-1]

        # clean up before returning
        return {
            key: value.strip() if isinstance(value, str) else value
            for key, value in d.items()
        }