示例#1
0
def do(queue, string_proxy):
    """Worker loop: record the Amazon search result count per keyword.

    Keywords are taken from ``queue`` one at a time. For each keyword the
    Amazon search page is loaded (and loaded again when ``check_captcha``
    returns a truthy value), the total is parsed from the
    ``s-result-count`` heading, and a ``keyword<TAB>total`` line is written
    to the module-level file object ``f`` and echoed to stdout.

    The loop ends when ``queue.get(timeout=30)`` raises ``Empty``. Any other
    exception puts the current keyword back on the queue and is reported on
    stdout.

    Args:
        queue: source of search keywords; must provide ``get(timeout=...)``
            and ``put(item)``.
        string_proxy: proxy setting forwarded unchanged to ``LRequest``.
    """
    # Example of a generated URL:
    # https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=sheets+silk
    search_url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%%3Daps&field-keywords=%s'

    lr = LRequest(string_proxy=string_proxy)
    while True:
        try:
            category = queue.get(timeout=30)
            url = search_url % urllib.quote_plus(category)

            lr.load(url)
            if check_captcha(lr):
                # A captcha was handled; fetch the real page again.
                lr.load(url)
            heading = lr.xpath('//h2[@id="s-result-count"]')

            # Keep the text before "result", then what follows the last "of",
            # and drop thousands separators.
            before_result = heading.text.split('result', 1)[0]
            total = before_result.split('of')[-1].strip().replace(',', '')

            f.write('%s\t%s\n' % (category, total))
            f.flush()
            print('%s\t%s' % (category, total))

        except Empty:
            print('empty')
            break
        except Exception as e:
            # Give the keyword another chance on a later iteration.
            queue.put(category)
            print('EEEEEEEEE %s' % e)
示例#2
0
def get_codes(delay=.0):  # 20200810: need delay 4s
    """Collect numeric stock codes from the ifeng finance listing pages.

    Each listing URL is loaded and its pagination followed through the
    "下一页" link until no such link is found. The text of every link in the
    first column of the ``tab01`` table is kept when it consists only of
    digits.

    Args:
        delay: forwarded to ``LRequest`` as its ``delay`` argument (the
            inline note on the signature says 4 seconds were needed as of
            2020-08-10).

    Returns:
        list: the codes collected so far, in page order. Crawling is
        best-effort: any exception is logged with its traceback and the
        partial list is returned.
    """
    codes = []
    urls = [
        'http://app.finance.ifeng.com/list/stock.php?t=ha&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=hs&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=sa&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=kcb&f=symbol&o=asc',
    ]

    lr = LRequest(delay=delay)

    try:
        # The entries are plain strings. The previous ``for url, m in urls``
        # tried to unpack each string into two names, raised ValueError on
        # the first iteration and made the function always return [].
        for url in urls:
            # logger.info('Load: %s' % url)
            lr.load(url, isdecode=True)
            while True:
                links = lr.xpaths('//div[@class="tab01"]/table//td[1]/a')
                # The last matching link is skipped, as in the original code
                # (presumably it is not a stock row -- TODO confirm).
                for ele in links[:-1]:
                    # ``text`` can be None for an empty element.
                    code = (ele.text or '').strip()
                    if code.isdigit():
                        codes.append(code)

                next_ele = lr.xpath(u'//a[contains(text(), "下一页")]')
                if next_ele is None:
                    break
                next_url = urljoin(url, next_ele.attrib['href'])
                # logger.info('Load: %s' % next_url)
                lr.load(next_url, isdecode=True)
    except Exception:
        # Best-effort crawl: report the failure and return what was gathered.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        logger.error(traceback.format_exc())
    return codes
示例#3
0
class GoogleSearch(object):
    """Fetch one page of Google web-search results for a query.

    Keyword options read by the constructor (all optional):
        tld: Google top-level domain, default ``'com'``.
        filter: value for the ``filter`` URL parameter, default ``0``.
        lang: value for the ``hl`` URL parameter, default ``'en'``.
        num: results per page, default ``100``.
        page: zero-based page index, default ``0``.
        timeout: forwarded to ``LRequest``, default ``90``.
        string_proxy: forwarded to ``LRequest``, default ``None``.
    """

    search_url = 'https://www.google.%(tld)s/search?q=%(query)s&hl=%(lang)s&filter=%(filter)d&num=%(num)d&start=%(start)s&btnG=Google+Search'

    def __init__(self, query, *args, **kwargs):
        """Store the query and search options and create the HTTP client."""
        option = kwargs.get

        self.query = query

        self._tld = option('tld', 'com')
        self._filter = option('filter', 0)
        self._lang = option('lang', 'en')
        self._num = option('num', 100)
        self._page = option('page', 0)

        self.lr = LRequest(
            timeout=option('timeout', 90),
            string_proxy=option('string_proxy', None),
            handers=[GoogleHTTPErrorProcessor(), ],
        )

    @property
    def page(self):
        """Zero-based index of the result page to request."""
        return self._page

    @page.setter
    def page(self, value):
        self._page = value

    def _get_result(self):
        """Load the current page and return its results as a list of dicts.

        Each dict has the keys ``'title'``, ``'description'`` and ``'url'``.
        The requested URL and the number of results are printed to stdout.
        """
        params = {
            'query': urllib.quote_plus(self.query),
            # Offset of the first result on the requested page.
            'start': self.page * self._num,
            'num': self._num,
            'tld': self._tld,
            'lang': self._lang,
            'filter': self._filter,
        }
        safe_url = self.search_url % params

        print(safe_url)
        self.lr.load(safe_url)

        # NOTE(review): this iterates over the value returned by ``xpath``;
        # confirm that this LRequest version returns a list of matches here.
        results = [
            {
                'title': ''.join(item.xpath('./div/h3//text()')),
                'description': ''.join(
                    item.xpath('./div//span[@class="st"]//text()')),
                'url': ''.join(item.xpath('./div/h3/a/@href')),
            }
            for item in self.lr.xpath('//li[@class="g"]')
        ]

        print(len(results))

        return results

    def get_result(self):
        """Public entry point; returns the result list of ``_get_result``."""
        return self._get_result()
示例#4
0
def do(queue, string_proxy):
    """Worker loop: record result count and average price per keyword.

    Keywords are taken from ``queue`` one at a time. For each keyword the
    Amazon search page is loaded (and loaded again when ``check_captcha``
    returns a truthy value). The prices shown on the page are averaged and
    the total result count is parsed from the ``s-result-count`` heading. A
    ``keyword<TAB>total<TAB>average`` line is written to the module-level
    file object ``f`` and echoed to stdout.

    When the page shows no parsable price the average is reported as
    ``0.00``. Previously ``ave_price`` was left unassigned in that case,
    which raised ``NameError`` on the first keyword and silently reused the
    previous keyword's average afterwards.

    The loop ends when ``queue.get(timeout=30)`` raises ``Empty``. Any other
    exception prints a traceback and puts the current keyword back on the
    queue.

    Args:
        queue: source of search keywords; must provide ``get(timeout=...)``
            and ``put(item)``.
        string_proxy: proxy setting forwarded unchanged to ``LRequest``.
    """
    lr = LRequest(string_proxy=string_proxy)
    while 1:
        try:
            # https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=sheets+silk
            category = queue.get(timeout=30)
            url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%%3Daps&field-keywords=%s' % urllib.quote_plus(
                category)

            lr.load(url)
            if check_captcha(lr):
                lr.load(url)

            prices = []
            price_eles = lr.xpaths(
                '//span[contains(@class, "s-price a-text-bold")]')
            for price_ele in price_eles:  # $49.99
                # For a range such as "$10.00 - $20.00" only the lower bound
                # is used.
                price = price_ele.text.replace('$', '').replace(',', '').split(
                    '-', 1)[0].strip()
                try:
                    prices.append(float(price))
                except ValueError:
                    # Not a number (e.g. empty text): leave it out.
                    continue

            # Always assign the average so that a page without prices can
            # neither fail nor inherit the value of the previous keyword.
            ave_price = sum(prices) / len(prices) if prices else 0.0

            ele = lr.xpath('//h2[@id="s-result-count"]')

            # Keep the text before "result", then what follows the last "of",
            # and drop thousands separators.
            total = ele.text.split('result', 1)[0].split('of')[-1].strip().replace(',', '')

            f.write('%s\t%s\t%.2f\n' % (category, total, ave_price))
            f.flush()
            print('%s\t%s\t%.2f' % (category, total, ave_price))

        except Empty:
            print('empty')
            break
        except Exception as e:
            traceback.print_exc()
            # Give the keyword another chance on a later iteration.
            queue.put(category)
            print('EEEEEEEEE %s' % e)
示例#5
0
class AmazonBase(object):
    """Base class for Amazon scrapers: page loading, captcha, disk caches.

    Pages are fetched through an ``LRequest`` instance (``self.lr``).
    Captcha images are decoded by a ``GsaCaptcha`` client. Page data and
    images are cached on disk under ``<root>/<name[0]>/<name[1]>/<name>``,
    so cache and image names must be at least two characters long.

    Keyword options read by the constructor (all optional):
        string_proxy: forwarded to ``LRequest``, default ``''``.
        gsa_ip / gsa_port: captcha server address, defaults
            ``'192.168.1.188'`` / ``'8000'``.
        cache_page: page cache directory, default ``<CACHE_ROOT>/pages``.
        cache_image: image cache directory, default ``<CACHE_ROOT>/images``.
        domain: stored as ``self.domain``, default ``'amazon.com'``.
        cache_expired_days: stored as ``self.CACHE_EXPIRED_DAYS``,
            default ``15``.
    """

    # Cache directories; these empty defaults are replaced per instance in
    # __init__.
    CACHE_ROOT = ''
    CACHE_PAGES_ROOT = ''
    CACHE_IMAGES_ROOT = ''

    # Not read inside this class (presumably used by the ``cache``
    # decorator -- TODO confirm).
    CACHE_EXPIRED_DAYS = 15

    # Captcha client; a subclass that resets it to None makes
    # check_captcha() raise RuntimeError.
    captcha = None

    # Locates the captcha image inside Amazon's captcha form.
    _CAPTCHA_IMG_XPATH = '//form[contains(@action, "Captcha")]//img[contains(@src, "captcha")]'

    def __init__(self, **kwargs):
        """Create the HTTP and captcha clients and the cache directories."""
        self.lr = LRequest(string_proxy=kwargs.get('string_proxy', ''))

        self.captcha = GsaCaptcha(ip=kwargs.get('gsa_ip', '192.168.1.188'),
                                  port=kwargs.get('gsa_port', '8000'))

        self.CACHE_ROOT = config.AMAZON_CACHE_ROOT
        self.CACHE_PAGES_ROOT = kwargs.get(
            'cache_page', os.path.join(self.CACHE_ROOT, 'pages'))
        self.CACHE_IMAGES_ROOT = kwargs.get(
            'cache_image', os.path.join(self.CACHE_ROOT, 'images'))

        # exist_ok avoids the check-then-create race when several workers
        # start at the same time.
        for directory in (self.CACHE_ROOT, self.CACHE_PAGES_ROOT,
                          self.CACHE_IMAGES_ROOT):
            os.makedirs(directory, exist_ok=True)

        self.domain = kwargs.get('domain', 'amazon.com')

        self.CACHE_EXPIRED_DAYS = kwargs.get('cache_expired_days', 15)

    @staticmethod
    def _shard_dir(root, name):
        """Return the two-level shard directory of ``name`` below ``root``."""
        return os.path.join(root, name[0], name[1])

    def load(self, url, is_xpath=True, is_decode=True):
        """Load ``url`` into ``self.lr``, reloading it after a captcha.

        The URL is percent-encoded first. Only ``:`` and ``/`` (besides the
        characters ``quote`` never escapes) are kept, so ``?``, ``&`` and
        ``=`` are encoded as well.
        NOTE(review): confirm that callers never pass URLs with a query
        string that must survive.

        Args:
            url: address to load.
            is_xpath: forwarded to ``LRequest.load``.
            is_decode: forwarded to ``LRequest.load``.
        """
        # logger.info('Load Url: %s' % url)
        url = urllib.parse.quote(url, safe='https:/')
        self.lr.load(url, is_xpath=is_xpath, is_decode=is_decode)
        if self.check_captcha():
            self.lr.load(url, is_xpath=is_xpath, is_decode=is_decode)

    def check_captcha(self):
        """Solve captcha pages until the current page no longer shows one.

        Returns:
            bool: ``True`` when a captcha was present and has been cleared
            (the caller should reload its page), ``False`` when the current
            page showed no captcha.

        Raises:
            RuntimeError: if ``self.captcha`` is ``None``.
        """
        if self.captcha is None:
            raise RuntimeError('Not Captcha Server...')

        captcha_img_ele = self.lr.xpath(self._CAPTCHA_IMG_XPATH)
        if captcha_img_ele is None:
            return False

        # NOTE(review): there is no retry limit; a persistent failure (for
        # example an unreachable captcha server) keeps this loop running.
        while 1:
            logger.info('Need Captcha')

            try:
                if captcha_img_ele is None:
                    return True

                print('##### %s ' % captcha_img_ele.attrib['src'])
                # Grab the form before loading the image replaces the page.
                form = self.lr.get_forms()[0]
                self.lr.load(captcha_img_ele.attrib['src'])
                cap = self.captcha.decode_stream(self.lr.body)
                logger.info('Captcha: %s' % cap)

                form['field-keywords'] = cap
                self.lr.load(form.click())

                captcha_img_ele = self.lr.xpath(self._CAPTCHA_IMG_XPATH)

            except IndexError:
                # No form was found on the page: reload it and look again.
                self.lr.load(self.lr.current_url)
                captcha_img_ele = self.lr.xpath(self._CAPTCHA_IMG_XPATH)
                if captcha_img_ele is None:
                    return True
            except Exception:
                # KeyboardInterrupt and SystemExit are not Exception
                # subclasses, so they still propagate to the caller.
                # open(os.path.join('I:\\captcha_error_page', '%s.html' % time.time()), 'w').write(self.lr.body)
                logger.error(traceback.format_exc())

    def exists_cache(self, cache_name):
        """Return whether a page cache file exists for ``cache_name``."""
        cache_path = os.path.join(
            self._shard_dir(self.CACHE_PAGES_ROOT, cache_name), cache_name)
        return os.path.exists(cache_path)

    def remove_cache(self, cache_name):
        """Delete the page cache file of ``cache_name`` (best effort)."""
        cache_path = os.path.join(
            self._shard_dir(self.CACHE_PAGES_ROOT, cache_name), cache_name)

        try:
            os.remove(cache_path)
        except OSError:
            # Missing file or a file that cannot be removed: nothing to do.
            pass

    def load_cache(self, cache_name):
        """Return the cached object for ``cache_name``.

        Returns:
            The unpickled object, or ``{}`` when the cache file is missing
            or cannot be read.
        """
        cache_path = os.path.join(
            self._shard_dir(self.CACHE_PAGES_ROOT, cache_name), cache_name)

        if not os.path.exists(cache_path):
            return {}

        try:
            # The context manager closes the file even when reading fails.
            with gzip.open(cache_path, 'rb') as gzip_file:
                # SECURITY: pickle must only be used on cache files written
                # by this program; never point the cache at untrusted data.
                return pickle.loads(gzip_file.read())
        except Exception:
            # An unreadable or corrupt cache entry counts as a cache miss.
            return {}

    def save_cache(self, cache_name, data):
        """Pickle ``data`` into the gzip-compressed cache file of ``cache_name``."""
        cache_dir = self._shard_dir(self.CACHE_PAGES_ROOT, cache_name)
        os.makedirs(cache_dir, exist_ok=True)

        cache_path = os.path.join(cache_dir, cache_name)

        with gzip.open(cache_path, 'wb') as gzip_file:
            gzip_file.write(pickle.dumps(data))

    def exists_image(self, name):
        """Return whether an image file exists for ``name``."""
        image_path = os.path.join(
            self._shard_dir(self.CACHE_IMAGES_ROOT, name), name)
        return os.path.exists(image_path)

    def save_image(self, name, data):
        """Write the bytes ``data`` to the image file of ``name``."""
        image_dir = self._shard_dir(self.CACHE_IMAGES_ROOT, name)
        os.makedirs(image_dir, exist_ok=True)

        image_path = os.path.join(image_dir, name)
        with open(image_path, 'wb') as image_file:
            image_file.write(data)

    @staticmethod
    def wrapped_url(url):
        """Return ``url`` cut off before its first ``/ref`` segment."""
        return url.split('/ref', 1)[0]

    # The decorators below are defined outside this class; presumably each
    # one adds its field to ``product_info`` -- TODO confirm.
    @cache()
    @load_html
    @name
    @price
    @brand
    @merchant
    @sold_by
    @reviews
    @star
    @ranks_str
    @other_seller
    @weight_ounces
    def product_detail(self, asin, is_cache=True, **kwargs):
        """Return the ``product_info`` keyword argument, or ``{}`` if absent."""
        return kwargs.get('product_info', {})

    # Same pattern as product_detail, with the image decorators.
    @cache()
    @load_html
    @image_urls
    @image_data
    def product(self, asin, is_cache=True, **kwargs):
        """Return the ``product_info`` keyword argument, or ``{}`` if absent."""
        return kwargs.get('product_info', {})