# Example no. 1
class Scraper(object):
    """Base HTTP scraper: browser-like headers plus a rotating proxy.

    Each instance pulls a fresh proxy mapping from ``ProxyRotator`` and
    keeps its own ``requests.Session`` configured with ``HEADERS``.
    """

    # Browser-like default headers sent with every request of the session.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.73 Safari/537.36',
                'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Charset':'utf-8;q=0.7,*;q=0.3',
                'Accept-Encoding':'gzip,deflate,sdch',
                'Accept-Language':'it-IT,it;q=0.8,en-US;q=0.6,en;q=0.4',
                'Connection':'keep-alive',
                'Content-Type':'application/x-www-form-urlencoded',
                # 'Referer': '',
                # 'Origin': '',
                # 'Host': ''
                }

    # PROXY = {'http':'http://*****:*****@38.78.197.196:60099'}
    PROXY = {}
    # Placeholder only: a ProxyRotator() here would run (and possibly do
    # network work) at import time and is always shadowed in __init__.
    proxy_obj = None

    def __init__(self, *args, **kwargs):
        self.proxy_obj = ProxyRotator()
        # rotate_proxy() returns a JSON-encoded mapping suitable for the
        # `proxies=` argument of requests — TODO confirm against ProxyRotator.
        self.PROXY = json.loads(self.proxy_obj.rotate_proxy())
        print(self.PROXY)
        # Use a dedicated Session instead of the bare `requests` module:
        # `requests.headers = ...` is ignored by requests.get(), so the
        # custom headers were never actually sent, and mutating the module
        # leaked state across every scraper in the process.
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    def clean_results(self, price):
        """Extract the first numeric token from *price* as a float.

        Commas are treated as thousands separators and stripped
        (e.g. '$1,234.56' -> 1234.56). Returns 0 when no digits are found.
        """
        try:
            return float(re.search(r'[\d\.,]+', price).group(0).replace(',', ''))
        except AttributeError:
            # re.search returned None: no digits in the input.
            return 0
# Example no. 2
class PlccenterScraper(object):
    HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.73 Safari/537.36',
                'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Charset':'utf-8;q=0.7,*;q=0.3',
                'Accept-Encoding':'gzip,deflate,sdch',
                'Accept-Language':'it-IT,it;q=0.8,en-US;q=0.6,en;q=0.4',
                'Connection':'keep-alive',
                'Content-Type':'application/x-www-form-urlencoded',
                # 'Referer': '',
                # 'Origin': '',
                # 'Host': ''
                }

    PROXY = {}
    proxy_obj = ProxyRotator()

    def __init__(self, brand):
        self.brand = brand
        self.proxy_obj = ProxyRotator()
        self.PROXY = json.loads(self.proxy_obj.rotate_proxy())
        print self.PROXY
        self.session = requests
        self.session.headers = self.HEADERS
    
    def clean_results(self, price):
        try:
            return float(re.search('[\d\.,]+', price).group(0).replace(',', ''))
        except AttributeError:
            return 0
    
    def get_price(self, part_num):
        part_num = part_num.strip()
        if self.brand == 'Redlion':
            url = 'http://www.plccenter.com/en-US/Buy/RED LION CONTROLS/%s' % part_num
        else:
            try:
                part_num = '%06d' % int(part_num)
                url = 'http://www.plccenter.com/en-US/Buy/RELIANCE ELECTRIC/%s' % part_num
            except:
                pass
                url = 'http://www.plccenter.com/en-US/Buy/DODGE/%s' % part_num
        print self.proxy_obj
        print self.proxy_obj.rotate_proxy()
        data = self.proxy_obj.rotate_proxy()
        self.PROXY = json.loads(data)

        print url
        try:
            resp = self.session.get(url, proxies=self.PROXY)
            print resp.url
        except Exception as e:
            print e
            return 0
        x = html.fromstring(resp.content)

        try:
            price = x.xpath(".//div[contains(@class,'productDetailPriceRowOurPrice')]/text()")[0].strip()
            if 'Est.' in price:
                price = price.split('Est.')[1]
            print price
        except IndexError as e:
            print e
            print url
            return 0
        return self.clean_results(price)

    def scrap(self, brand):
        pass