Пример #1
0
def update_review(isbn):
    """Fetch every store page for ``isbn`` and extract Flipkart price/rating.

    For each ``(key, value)`` pair in the module-level ``urlset`` mapping the
    request URL is built as ``value + isbn``, recorded in the result under
    ``"<key>_url"``, and fetched. Pages that answer with HTTP 200 are parsed
    with ``pq`` and kept in memory.

    When a ``'flipkart'`` page was fetched, these entries are filled in:

    * ``"flipkart"``    -- ``content`` attribute of ``meta[itemprop="price"]``
    * ``"ratingValue"`` -- float rating, or ``'Not Rated'`` if conversion fails
    * ``"ratingCount"`` -- int rating count, or ``'None'`` if conversion fails

    :param isbn: ISBN string appended to each store's base URL.

    NOTE(review): the visible code neither returns nor persists the collected
    attributes -- confirm against the full source whether more code follows.
    """
    attrs = {}
    pages = {}
    # Tor proxy with protocol and port address.
    # NOTE(review): requests picks a proxy by the scheme of the requested URL
    # (e.g. 'http' / 'https'); a 'socks4' key presumably never matches, so
    # traffic may not go through Tor at all -- confirm and, if so, switch to
    # {'http': 'socks4://127.0.0.1:9050', 'https': 'socks4://127.0.0.1:9050'}
    # (needs the optional SOCKS support for requests).
    proxy = {'socks4':'127.0.0.1:9050'}

    # Dump the responses from all websites into memory.
    for key, value in urlset.items():
        t_url = value + isbn
        attrs[key + '_url'] = t_url
        r = requests.get(t_url, proxies=proxy)
        if r.status_code == 200:
            pages[key] = pq(r.text)

    # Fetch price and rating attributes for the flipkart website.
    fk = pages.get('flipkart')
    if fk is not None:
        attrs["flipkart"] = fk("meta[itemprop=\"price\"]").attr("content")

        # Conversion failures (e.g. tag absent or non-numeric content) fall
        # back to a placeholder string instead of aborting the whole update.
        try:
            attrs["ratingValue"] = float(
                fk("meta[itemprop=\"ratingValue\"]").attr("content"))
        except (TypeError, ValueError):
            attrs["ratingValue"] = 'Not Rated'

        try:
            attrs["ratingCount"] = int(
                fk("span[itemprop=\"ratingCount\"]").text())
        except (TypeError, ValueError):
            attrs["ratingCount"] = 'None'
Пример #2
0
def get_review(isbn):
	attrs = {}
	d = {}
	# compile regex for integer matching
	sankhya = re.compile(r'\d+')
	# Tor proxy with port and protocol
	proxy = {'socks4':'127.0.0.1:9050'}
	attrs['_id'] = isbn
	attrs['date'] = datetime.datetime.utcnow()

	''' requests get html content from
	different website and store them in dictionary
	into the memory directly.'''
	for key, value in urlset.items():
		t_url = value + isbn
		key_url = key + '_url'
		attrs[key_url] = t_url
		r = requests.get(t_url, proxies=proxy)
		if r.status_code == 200:
			d[key] = pq(r.text)

	## for Infibeam website Price

	if (d.get('Infibeam') != None):
		Ib = d.get('Infibeam')
		attrs['Infibeam'] = Ib(
                    "span[class=\"infiPrice amount price\"]").text()
	else:
		attrs['Infibeam'] = 'None'


	## for Crossword website Price

	if (d.get('Crossword') != None):
		try:
			attrs['Crossword'] = d.get('Crossword')(
                        "span[class=\"variant-final-price\"]").text().strip('R')
		except AttributeError:
			attrs['Crossword'] = d.get('Crossword')(
                            "span[class=\"variant-final-price\"]").text()
	else:
		attrs['Crossword'] = 'None'


	## for Homeshop18 website Price

	if (d.get('Homeshop18') != None):
		try:
			attrs['Homeshop18'] = d.get('Homeshop18')(
                            "span[id=\"hs18Price\"]").text().split()[1]
		except AttributeError:
			attrs['Homeshop18'] = d.get('Homeshop18')(
                            "span[id=\"hs18Price\"]").text()
	else:
		attrs['Homeshop18'] = 'None'


	## for Bookadda website Price

	if (d.get('Bookadda') != None):
		try:
			attrs['Bookadda'] = d.get('Bookadda')(
                            "span[class=\"actlprc\"]").text().strip('Rs.')
		except AttributeError:
			attrs['Bookadda'] = d.get('Bookadda')(
                            "span[class=\"actlprc\"]").text()

	else:
		attrs['Bookadda'] = 'None'

    ## for rediff book website

	if (d.get('Rediffbook') != None):
		try:
			attrs['Rediffbook'] = d.get('Rediffbook')(
                        "div[class=\"proddetailinforight\"]").text().split()[2]
		except (IndexError, AttributeError), e:
			attrs['Rediffbook'] = d.get('Rediffbook')(
                            "div[class=\"proddetailinforight\"]").text()