def update_review(isbn):
    """Re-scrape price and rating attributes for *isbn* and return them.

    Fetches every vendor page listed in the module-level ``urlset``
    (``{site_name: base_url}``), parses each 200 response with PyQuery,
    and extracts Flipkart's price / ratingValue / ratingCount from its
    itemprop markup.

    Returns:
        dict: collected attributes (``<site>_url`` entries plus the
        Flipkart fields when that page was fetched).
    """
    attrs = {}
    d = {}
    # `requests` expects proxies keyed by URL *scheme*; the previous
    # {'socks4': '127.0.0.1:9050'} form never matched any request, so
    # traffic silently bypassed Tor. Route both schemes through the
    # local Tor SOCKS4 listener instead.
    proxy = {
        'http': 'socks4://127.0.0.1:9050',
        'https': 'socks4://127.0.0.1:9050',
    }
    # Fetch every vendor page for this ISBN; keep the parsed documents
    # in memory, keyed by site name. Timeout guards against a dead site
    # hanging the whole update.
    for key, value in urlset.items():
        t_url = value + isbn
        attrs[key + '_url'] = t_url
        r = requests.get(t_url, proxies=proxy, timeout=30)
        if r.status_code == 200:
            d[key] = pq(r.text)
    # Flipkart: price plus rating value/count from the itemprop markup.
    fk = d.get('flipkart')
    if fk is not None:
        attrs["flipkart"] = fk("meta[itemprop=\"price\"]").attr("content")
        # .attr() returns None when the element is missing, which makes
        # float()/int() raise TypeError; a malformed value raises
        # ValueError. Either way fall back to the sentinel strings the
        # original code used.
        try:
            attrs["ratingValue"] = float(
                fk("meta[itemprop=\"ratingValue\"]").attr("content"))
        except (TypeError, ValueError):
            attrs["ratingValue"] = 'Not Rated'
        try:
            attrs["ratingCount"] = int(
                fk("span[itemprop=\"ratingCount\"]").text())
        except (TypeError, ValueError):
            attrs["ratingCount"] = 'None'
    # The original built `attrs` and discarded it (no return), so the
    # update could never be observed by callers. Hand it back.
    return attrs
def get_review(isbn):
    """Scrape the current price of *isbn* from several Indian book sites.

    Builds an attribute dict seeded with the ISBN as ``_id`` and a UTC
    timestamp, fetches each vendor page listed in the module-level
    ``urlset`` through a Tor SOCKS proxy, parses 200 responses with
    PyQuery, and records one price string per vendor ('None' when the
    page could not be fetched).

    NOTE(review): this definition appears to continue beyond the visible
    chunk (the Rediffbook branch has no ``else`` and there is no
    ``return`` yet) — review the remainder alongside this part.
    """
    attrs = {}
    d = {}
    # compile regex for integer matching
    # NOTE(review): `sankhya` is unused in the visible portion of this
    # function — presumably used further down; confirm, else remove.
    sankhya = re.compile(r'\d+')
    # Tor proxy with port and protocol
    # NOTE(review): `requests` expects proxies keyed by URL scheme
    # ('http'/'https') with a 'socks4://host:port' value; a 'socks4' key
    # never matches, so requests likely go direct — confirm and fix.
    proxy = {'socks4':'127.0.0.1:9050'}
    attrs['_id'] = isbn
    # Timestamp of this scrape (naive UTC datetime).
    attrs['date'] = datetime.datetime.utcnow()
    ''' requests get html content from different website and store them in dictionary into the memory directly.'''
    # One GET per vendor; successful pages are parsed once and cached in
    # `d` keyed by site name, and the URL used is recorded in `attrs`.
    for key, value in urlset.items():
        t_url = value + isbn
        key_url = key + '_url'
        attrs[key_url] = t_url
        r = requests.get(t_url, proxies=proxy)
        if r.status_code == 200:
            d[key] = pq(r.text)
    ## for Infibeam website Price
    if (d.get('Infibeam') != None):
        Ib = d.get('Infibeam')
        attrs['Infibeam'] = Ib(
            "span[class=\"infiPrice amount price\"]").text()
    else:
        attrs['Infibeam'] = 'None'
    ## for Crossword website Price
    if (d.get('Crossword') != None):
        # strip('R') trims leading/trailing 'R' characters (currency
        # prefix); the except branch keeps the raw text if .text()
        # returned something without .strip() (e.g. None).
        try:
            attrs['Crossword'] = d.get('Crossword')(
                "span[class=\"variant-final-price\"]").text().strip('R')
        except AttributeError:
            attrs['Crossword'] = d.get('Crossword')(
                "span[class=\"variant-final-price\"]").text()
    else:
        attrs['Crossword'] = 'None'
    ## for Homeshop18 website Price
    if (d.get('Homeshop18') != None):
        # split()[1] drops the currency token and keeps the amount;
        # fall back to the raw text on AttributeError.
        try:
            attrs['Homeshop18'] = d.get('Homeshop18')(
                "span[id=\"hs18Price\"]").text().split()[1]
        except AttributeError:
            attrs['Homeshop18'] = d.get('Homeshop18')(
                "span[id=\"hs18Price\"]").text()
    else:
        attrs['Homeshop18'] = 'None'
    ## for Bookadda website Price
    if (d.get('Bookadda') != None):
        # NOTE(review): strip('Rs.') strips the character set
        # {'R','s','.'} from both ends, not the literal prefix "Rs." —
        # could over-strip prices ending in '.'; confirm intended.
        try:
            attrs['Bookadda'] = d.get('Bookadda')(
                "span[class=\"actlprc\"]").text().strip('Rs.')
        except AttributeError:
            attrs['Bookadda'] = d.get('Bookadda')(
                "span[class=\"actlprc\"]").text()
    else:
        attrs['Bookadda'] = 'None'
    ## for rediff book website
    if (d.get('Rediffbook') != None):
        # split()[2] picks the third whitespace-separated token of the
        # product-detail text; IndexError (too few tokens) or
        # AttributeError falls back to the raw text. (Python 2
        # `except ..., e` syntax — `e` is unused.)
        try:
            attrs['Rediffbook'] = d.get('Rediffbook')(
                "div[class=\"proddetailinforight\"]").text().split()[2]
        except (IndexError, AttributeError), e:
            attrs['Rediffbook'] = d.get('Rediffbook')(
                "div[class=\"proddetailinforight\"]").text()