def getNum(url):
    """Return the total result count announced in the page's results heading.

    The page is fetched with ``getSoup``.  The count is taken from the first
    child of the ``<h2>`` results heading: the digits that sit before
    ``' results'`` and, when present, after ``'of '``.  Thousands separators
    are removed before conversion.  (A heading such as
    ``"1-16 of 1,234 results"`` is assumed from the parsing logic -- confirm
    against a live page.)

    Args:
        url: Address of the page to fetch; passed to ``getSoup`` unchanged.

    Returns:
        The count as an ``int``, or ``-1`` when ``getSoup`` reports failure
        (returns ``-1``), when the heading is missing, or when no number can
        be parsed out of it.
    """
    soup = getSoup(url)
    if soup == -1:
        return -1

    heading = soup.find(
        'h2',
        {'class': 'a-size-base a-spacing-small a-spacing-top-small a-text-normal'})
    # find() yields None when the heading is absent; report that with the
    # same -1 sentinel used for fetch failures instead of raising.
    if heading is None or not heading.contents:
        return -1

    raw = heading.contents[0]
    end = raw.find(' results')
    if end == -1:
        return -1

    # Skip past "of " only when it really precedes the count.  A heading with
    # no "of " (presumably a single-page listing like "16 results") is read
    # from its start.
    start = raw.find('of ')
    begin = start + 3 if -1 < start < end else 0

    try:
        return int(raw[begin:end].replace(',', ''))
    except ValueError:
        return -1
def getData(line):
    """Build the offers/reviews/stars record for one whitespace-separated line.

    The line is split on whitespace: field 0 is used as the ASIN, field 1 as
    the review value and field 3 as the star value.  An Amazon search page for
    the ASIN is then fetched with ``getSoup`` to fill in the offer count and,
    when the star value from the line is ``'N'``, the star rating.

    Args:
        line: Text with at least four whitespace-separated fields.

    Returns:
        ``-1`` when ``check`` returns ``0`` for field 1.  Otherwise a dict
        with keys ``'ASIN'``, ``'offers'``, ``'reviews'`` and ``'stars'``.
        If ``getSoup`` returns ``-1`` the dict is returned with only the
        values taken from the line (``'offers'`` stays ``'N'``).
    """
    fields = line.split()
    if check(fields[1]) == 0:
        return -1

    asin = fields[0]
    search_url = (
        'http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords='
        + asin
        + '&rh=i%3Aaps%2Ck%3AB00R8GX5WO'
    )
    record = {'ASIN': asin, 'offers': 'N', 'reviews': fields[1], 'stars': fields[3]}

    soup = getSoup(search_url)
    if soup == -1:
        return record

    # First text node mentioning "offers": its leading token, minus any "(",
    # is stored as the offer count.
    offer_hits = soup.findAll(text=re.compile('offers'))
    if offer_hits:
        record['offers'] = offer_hits[0].split()[0].strip('(')

    # Only look the rating up on the page when the input line had none.
    if record['stars'] == 'N':
        star_hits = soup.findAll(text=re.compile('out of 5 stars'))
        if star_hits:
            record['stars'] = star_hits[0].split()[0]

    return record
def getData(i, interval):
    """Scrape one result page and return a record for every listed item.

    The page address is ``global_url`` formatted with ``interval[0]``,
    ``interval[1]`` and ``i`` (in that order) and is fetched with ``getSoup``.
    Every ``<div class="s-item-container">`` on the page yields one dict.

    Args:
        i: Page number; used in the URL and echoed back in the return value.
        interval: Indexable pair whose first two elements are substituted
            into ``global_url``.

    Returns:
        ``(results, i)`` where ``results`` is a list of dicts with keys
        ``'ASIN'``, ``'offers'``, ``'reviews'`` and ``'stars'``; a field that
        could not be found keeps the placeholder ``'N'``.  ``(-1, i)`` when
        ``getSoup`` returns ``-1``.
    """
    url = global_url.format(interval[0], interval[1], i)
    soup = getSoup(url)
    if soup == -1:
        print("errrrrrrrror\t%s" % url)
        return -1, i

    contents = soup.findAll('div', {'class': "s-item-container"})
    print("page: %d num: %d " % (i, len(contents)))

    results = []
    for item in contents:
        result = {'ASIN': item.parent['data-asin'],
                  'offers': 'N', 'reviews': 'N', 'stars': 'N'}
        try:
            # Offer count: the secondary-colour span next to the bold price.
            new_offer = item.find(
                'span', {'class': 'a-size-base a-color-price a-text-bold'})
            if new_offer:
                offer_num = new_offer.parent.find(
                    'span', {'class': 'a-color-secondary'})
                result['offers'] = offer_num.contents[0].split()[0].strip('(')

            # Reviews: the last matching link, if any.  An item without such
            # a link is a normal case, so it must not go through the error
            # path (which would also skip the stars lookup below).
            review_links = item.findAll(
                'a', {'class': "a-size-small a-link-normal a-text-normal"})
            if review_links and 'Reviews' in review_links[-1].prettify():
                result['reviews'] = review_links[-1].contents[-1]

            # Stars: leading token of the "... out of 5 stars" text.
            stars = item.find(text=re.compile('out of 5 stars'))
            if stars:
                result['stars'] = stars.split()[0]
        except Exception:
            # Best effort: report the item and keep whatever was parsed.
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the scraper.
            print(result['ASIN'])
        results.append(result)
    return results, i
def getData(line):
    """Collect the weight and the first new-offer prices for one product.

    ``check(line)`` supplies the base record (a dict containing ``'ASIN'``).
    The Amazon product page is scraped for a weight, which is stored under
    ``output['weight']``; the offer-listing page is then scraped for prices.
    Offers containing a ``<span class="supersaver">`` go to the first list as
    a bare price; all others go to the second list as ``(price, shipping)``,
    with ``shipping`` equal to ``0`` when no shipping span is present.

    Args:
        line: Input record; passed to ``check`` unchanged.

    Returns:
        ``(output, prime_list, uprime_list)`` on success.
        ``(-2, [], [])`` when ``check`` returns ``-1``.
        ``(-1, [], [])`` when either page cannot be fetched (``getSoup``
        returns ``-1``) or an offer row fails to parse.
    """
    output = check(line)
    if output == -1:
        return (-2, [], [])

    asin = output['ASIN']
    url1 = ('http://www.amazon.com/Revgear-129004-Youth-Boxing-Glove/dp/'
            + asin + '/ref=sr_1_1?ie=UTF8&qid=1450578728&sr=8-1')
    url2 = ('http://www.amazon.com/gp/offer-listing/' + asin
            + '/ref=sr_1_1_olp?ie=UTF8&qid=1449347231&sr=8-1&keywords=B0019CU6T8&condition=new')

    soup = getSoup(url1)
    if soup == -1:
        return (-1, [], [])

    tmp_pounds = soup.findAll(text=re.compile('pounds'))
    tmp_ounces = soup.findAll(text=re.compile('ounces'))
    # Lookups are tried in this exact order until one yields something other
    # than 'N'.  Per the original comments the last argument selects "find
    # parent" (1) versus "find parent's parent" (0) -- getWeight is defined
    # elsewhere, confirm there.
    weight_attempts = (
        (tmp_pounds, 'Shipping Weight', 1),
        (tmp_ounces, 'Shipping Weight', 1),
        (tmp_ounces, 'Item Weight', 1),
        (tmp_pounds, 'Item Weight', 1),
        (tmp_pounds, 'Shipping Weight', 0),
        (tmp_ounces, 'Shipping Weight', 0),
        (tmp_ounces, 'Item Weight', 0),
        (tmp_pounds, 'Item Weight', 0),
    )
    weight = 'N'
    for candidates, label, level in weight_attempts:
        weight = getWeight(candidates, label, level)
        if weight != 'N':
            break
    output['weight'] = weight

    soup = getSoup(url2)
    if soup == -1:
        return (-1, [], [])

    products = soup.findAll('div', {'class': 'a-row a-spacing-mini olpOffer'})
    prime_list, uprime_list = [], []
    for product in products:
        try:
            raw = product.find(
                'span',
                {'class': 'a-size-large a-color-price olpOfferPrice a-text-bold'})
            if not raw:
                continue
            price = float(raw.contents[0].strip().split('$')[1].replace(',', ''))

            # NOTE(review): once a third offer of either kind is seen the
            # whole scan stops, even if the other list holds fewer than two
            # entries.  Kept as in the original; confirm `break` (rather than
            # `continue`) is intended.
            if product.find('span', {'class': 'supersaver'}):
                if len(prime_list) > 1:
                    break
                prime_list.append(price)
            else:
                if len(uprime_list) > 1:
                    break
                shipping_raw = product.find('span', {'class': 'olpShippingPrice'})
                if shipping_raw:
                    # Strip thousands separators, as is done for the price.
                    shipping = float(
                        shipping_raw.contents[0].split('$')[1].replace(',', ''))
                else:
                    shipping = 0
                uprime_list.append((price, shipping))
        except Exception:
            # A row that cannot be parsed invalidates the whole product.
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(product)
            return (-1, [], [])
    return (output, prime_list, uprime_list)