def get_id():
    """Collect Best Buy sku ids from a paginated category search and append them to ./id.txt.

    Iterates pages 1-5 of a fixed Best Buy search URL, extracts every
    ``data-sku-id`` attribute from each result page and appends one id
    per line to ./id.txt.
    """
    # "[i]" is the page-number placeholder in the category search URL.
    base_url = '''http://www.bestbuy.com/site/searchpage.jsp?cp=[i]&searchType=search&_dyncharset=UTF-8&ks=960&sc=Global&list=y&usc=All%20Categories&type=page&id=pcat17071&iht=n&seeAll=&browsedCategory=pcmcat309300050015&st=pcmcat309300050015_categoryid%24abcat0502000&qp=features_facet%3DFeatures~2-in-1%20Design^condition_facet%3DCondition~New'''
    for i in range(1, 6):  # page numbers (5 pages)
        url = base_url.replace("[i]", str(i))
        print(url)
        html = get_html.get_html_src(url)
        id_list = re.findall(r'data-sku-id="(.*?)"', html, re.S)
        print(len(id_list))
        # FIX: "aw" is not a valid open() mode in Python 3 -> "a" (append).
        # Also open the file once per page instead of once per id.
        with open("./id.txt", "a") as f:
            for goods_id in id_list:
                f.write(goods_id + "\n")
                print(goods_id)
def get_asin(base_url, page):
    """Scrape Frys product URLs from `page` pages of a paginated search.

    :param base_url: search URL template containing "[page]" and "[start]"
                     placeholders.
    :param page: number of result pages to fetch (25 items per page).

    Appends each absolute product URL to ./Result/items_url.txt.
    """
    for i in range(0, page):  # page index
        start_num = i * 25  # 25 items per result page
        url = base_url.replace("[page]", str(i)).replace('[start]', str(start_num))
        time.sleep(0.5)  # throttle requests
        html = get_html.get_html_src(url)
        # Narrow to the result cells first, then pull the hrefs out of them.
        url_list_re = re.findall(r'<td colspan="2">(.*?)</td>', html, re.S)
        url_list = re.findall(r'<A HREF="(.*?)">', str(url_list_re), re.S)
        # FIX: "aw" is not a valid open() mode in Python 3 -> "a" (append).
        with open("./Result/items_url.txt", "a") as f:
            for goods_url in url_list:
                f.write('http://www.frys.com/' + goods_url + "\n")
                print(goods_url)
def get_count(price_range, url):
    """Return the Amazon search-result count for `url`, or None on failure.

    Failure cases are recorded in the module-level files: captcha/empty pages
    in captcha_url_file, 404s in not_list_file, fetch errors in
    not_crawl_file, zero-result searches in f_no_product, and urls whose
    count could not be parsed in get_count_fail.txt.
    """
    try:
        html = get_html.get_html_src(url)
        # Captcha / robot-check page (or an empty response).
        if html == '' or -1 != html.find('Sorry, we just need to make sure you'):
            with lock:  # FIX: with-statement releases the lock even if a write fails
                captcha_url_file.write(url + '\n')
                captcha_url_file.flush()
            return
        # Listing no longer exists.
        if html == '404 error':
            with lock:
                not_list_file.write(url + '\n')
                not_list_file.flush()
            return
        # Page was never fetched successfully.
        if html == 'time out or other errors':
            with lock:
                not_crawl_file.write(price_range + '\n')
                not_crawl_file.flush()
            return
        # Multi-page form "1-60 of N results" first, then single-page "N results".
        num = re.search('<h2 id="s-result-count".*?>1-60 of (.*?) result', html)
        if num is None:  # FIX: identity comparison for None
            num = re.search('<h2 id="s-result-count".*?>(.*?) result', html)
        if num is not None:
            return int(num.group(1).replace(',', ''))
        if 'did not match any products.' in html:
            with lock:
                f_no_product.write(url + '\n')
                f_no_product.flush()
        else:
            # FIX: "aw" is not a valid open() mode in Python 3 -> "a".
            with open('get_count_fail.txt', 'a') as f:
                f.write(price_range + '\n')
        return
    except Exception as e:
        print(str(e))
def handle(url):
    """Fetch an Amazon search page and append every data-asin found to f_asins.

    Failures (captcha/empty page, 404, fetch error, no asins) are logged to
    f_fail with a reason; successfully handled urls are appended to f_success.
    """
    try:
        html = get_html.get_html_src(url)
        # Captcha / robot-check page (or an empty response).
        if html == '' or -1 != html.find('Sorry, we just need to make sure you'):
            with lock:  # FIX: with-statement cannot leak the lock on a write error
                f_fail.write(url + '\t空或验证码\n')
                f_fail.flush()
            return
        if html == '404 error':
            with lock:
                f_fail.write(url + '\t404 error\n')
                f_fail.flush()
            return
        if html == 'time out or other errors':
            with lock:
                f_fail.write(url + '\ttime out or other errors\n')
                f_fail.flush()
            return
        tmp_asins = re.findall(r'data-asin="(.*?)"', html)
        print(tmp_asins)
        if tmp_asins:
            # FIX: one lock acquisition for the whole batch instead of
            # acquire/release per asin.
            with lock:
                for asin in tmp_asins:
                    f_asins.write(asin + '\n')
                f_asins.flush()
                f_success.write(url + '\n')
                f_success.flush()
        else:
            with lock:
                f_fail.write(url + '\ttmp_asins为空\n')
                f_fail.flush()
    except Exception as e:
        print(str(e))
def handle(asin):
    """Fetch the amazon.ca /dp/ page for `asin` and record the lowest "New from" price.

    Writes "asin<TAB>price" to result_file and the asin to success_asin_file.
    Robot-check pages (and any parsing/fetch exception) are recorded in
    captcha_url_file; non-existent pages in not_list_file.
    """
    asin = asin.strip()
    try:
        baseurl = "https://www.amazon.ca/dp/[asin]"
        url = baseurl.replace('[asin]', asin)
        print(asin)
        html = get_html.get_html_src(url)
        # Captcha page — record the asin for a retry.
        if re.findall('<title dir="ltr">Robot Check</title>', html):
            with lock:  # FIX: with-statement releases the lock even if a write fails
                captcha_url_file.write(asin + '\n')
                captcha_url_file.flush()
            print("robot_check")
            return
        # Dead product page.
        if re.findall("We're sorry. The Web address you entered is not a functioning page on our site", html):
            with lock:
                not_list_file.write(asin + '\n')
                not_list_file.flush()
            print("not exit")
            return
        buyboxinfo = [asin]
        # First "New from CDN$ ..." price; IndexError here falls through to
        # the except handler, which queues the asin for retry.
        price = re.findall(r'New <span class="olp-from">from</span> CDN\$ (.*?)\n', html)[0]
        buyboxinfo.append(str(price))
        with lock:
            result_file.write("\t".join(buyboxinfo) + "\n")
            result_file.flush()
            success_asin_file.write(asin + '\n')
            success_asin_file.flush()
        print("success")
    except Exception as e:
        print(str(e))
        with lock:
            captcha_url_file.write(asin + '\n')
            captcha_url_file.flush()
        print("error: not html")
def get_info(sku):
    """Look up one Best Buy sku via the products API and write an availability row.

    Reads store-availability JSON from ./test.json (first line), maps each
    watched store's availability to a numeric code, and writes the row through
    the module-level csv_writer.

    :param sku: sku string with a trailing character (newline) that is dropped.
    """
    import json  # local import: replaces eval() for parsing JSON
    # Legacy globals kept so any remaining eval()-based parsers in this
    # module still resolve JSON's true/false/null literals.
    global false, true, null
    false = False
    true = True
    null = None
    file_id = open("./test.json", "r")
    Ids = file_id.readlines()
    url = 'http://api.bestbuy.com/v1/products/' + sku[:-1] + '.json?apiKey=68zbtdy4wmac9dgvnbhwke4e'
    info = get_html.get_html_src(url)
    # FIX: json.loads instead of eval() on an external HTTP response
    # (eval executes arbitrary code; json.loads only parses data).
    items_info = json.loads(info)
    # FIX: dict.has_key() was removed in Python 3 -> "in".
    if 'salePrice' in items_info:
        price = items_info['salePrice']
    else:
        price = 'wrong'
    # FIX: json.loads instead of eval() on the file contents as well.
    store_json = json.loads(Ids[0])
    availabilities = store_json['storeAvailabilities']
    # Watched stores; "0" means "no data / not available" by default.
    myconf = {
        "ISSAQUAH WA": "0",
        "TUKWILA WA": "0",
        "FEDERAL WAY WA": "0",
        "BELLEVUE WA": "0",
        "SOUTH CENTER": "0",
    }
    mydata = {}
    for li in availabilities:
        store_name = li['store']["name"]
        if store_name in myconf:
            if "lowOnStock" in li['skuAvailabilities'][0]:
                mydata[store_name] = "Low"
            else:
                mydata[store_name] = li['skuAvailabilities'][0]["availabilityType"]
    realdata = myconf  # alias: merging scraped data over the defaults
    print(realdata)
    for (k, v) in mydata.items():
        realdata[k] = v
    realli = ["2016-11-24", sku[:-1]]
    realli.append(str(price))
    # Per-store codes: InStore->3, ShipToStore->1, Low->2, anything else->row value
    # unchanged; the summary flags pick the best availability seen.
    state = ''
    low = ''
    donot = ''
    for (k, v) in realdata.items():
        if v == "InStore":
            v = "3"
            state = "1"
        elif v == 'ShipToStore':
            v = '1'
            donot = "0"
        elif v == 'Low':
            v = "2"
            low = "2"
        realli.append(v)
    # Summary column: 1 = in store somewhere, 2 = low stock only,
    # 0 = ship-to-store only, 4 = nothing available.
    if state == "1":
        realli.append(state)
    elif state == '' and low == "2":
        realli.append(low)
    elif state == '' and low == '' and donot == '0':
        realli.append(donot)
    else:
        realli.append("4")
    print(realli)
    csv_writer.writerow(realli)
def get_info(file_name):
    """Build a tab-separated product row for every sku listed in ./Id_last.txt.

    For each id: fetch basic fields (price, shipping, stock, brand, title)
    from the Best Buy products API, scrape the product page for up to five
    carousel images, scrape the search page for up to five short-description
    bullet points, scrape the specifications tab for the "Key Specs" table,
    and append one row to the module-level result_file.

    :param file_name: output file passed to create_titles() for the header row.
    """
    import json  # local import: replaces eval() for parsing API responses
    # Legacy globals kept so any remaining eval()-based parsers in this
    # module still resolve JSON's true/false/null literals.
    global false, true, null
    false = False
    true = True
    null = None
    file_id = open("./Id_last.txt", "r")
    titles = ['itemsId', 'price', 'Original_price', 'ship', 'stock', 'brand',
              'title', 'img1', 'img2', 'img3', 'img4', 'img5', 'detail1',
              'detail2', 'detail3', 'detail4', 'detail5', 'Specification']
    create_titles(file_name, titles)  # write the header row
    Ids = file_id.readlines()
    for goods_id in Ids:
        goods_id = goods_id.split("\n")[0]  # strip the trailing newline
        url = 'http://api.bestbuy.com/v1/products/' + str(goods_id) + '.json?apiKey=68zbtdy4wmac9dgvnbhwke4e'
        info = get_html.get_html_src(url)
        # FIX: parse the API response with json.loads instead of eval()
        # (eval executes arbitrary code from the network).
        items_info = json.loads(info)
        itemsId = items_info['sku']
        goods_info = [str(itemsId)]
        # sale price
        price = items_info['salePrice']
        goods_info.append(str(price))
        # regular price
        Original_price = items_info['regularPrice']
        goods_info.append(str(Original_price))
        # shipping cost ("" from the API becomes the literal string "Null")
        ship = items_info['shippingCost']
        if ship == '':
            ship = "Null"
        goods_info.append(str(ship))
        # online stock flag
        stock_info = items_info['onlineAvailability']
        if stock_info:
            stock = 'in stock'
        else:
            stock = 'out of stock'
        goods_info.append(str(stock))
        # brand — FIX: dict.has_key() was removed in Python 3 -> "in".
        brand = ''
        if 'manufacturer' in items_info:
            brand = items_info['manufacturer']
        goods_info.append(str(brand))
        # title
        title = items_info['name']
        goods_info.append(str(title))
        # images: scrape the product page carousel, pad/trim to exactly 5
        url = 'http://www.bestbuy.com/site/products/' + str(goods_id) + '.p'
        html = get_html.get_html_src(url)
        img_list = re.findall(r'<li data-target="#carousel-main".*?src="(.*?);', html, re.S)
        while len(img_list) < 5:
            img_list.append("")
        goods_info += img_list[:5]
        # details: short-description bullet points from the search page
        detail_url = 'http://www.bestbuy.com/site/searchpage.jsp?st=' + goods_id + '&_dyncharset=UTF-8&id=pcat17071&type=page&sc=Global&cp=1&nrp=&sp=&qp=&list=n&af=true&iht=y&usc=All+Categories&ks=960&keys=keys'
        detail_html = get_html.get_html_src(detail_url)
        detail_list = []
        detail_list_src = re.findall(r'<div class="short-description">(.*?)</div>', detail_html, re.S)
        if detail_list_src == []:
            # Fallback detail when the page carries no short description.
            detail_list.append('Brand Type:' + brand)
        if detail_html:
            for detail_items in detail_list_src:
                detail_list = re.findall(r'<li>(.*?)</li>', detail_items, re.S)
                if detail_list == []:
                    # No <li> bullets: use the raw description blocks instead.
                    detail_list = detail_list_src
        while len(detail_list) < 5:
            detail_list.append("")
        goods_info += detail_list[:5]
        # specification: follow the specifications-tab fragment URL.
        # FIX: default to "" so every row keeps its Specification column even
        # when no fragmentUrl is found on the page.
        spct_dic = ""
        spct_info = re.findall(r'data-tabs=.*?fragmentUrl":"(.*?);', html, re.S)
        if spct_info:
            spct_url = 'http://www.bestbuy.com' + spct_info[0] + ";template=_specificationsTab"
            html1 = get_html.get_html_src(spct_url)
            # Limit the scan to the "Key Specs" group, then pull name/value pairs.
            spct_html_limit = re.findall(
                r'<div class="specification-group key-specs">(.*?)<div class="specification-group">',
                html1, re.S)
            spct_html = re.findall(
                r'<div class="specification-name">(.*?)</div>.*?<div class="specification-value">(.*?)</div>',
                str(spct_html_limit), re.S)
            if spct_html:
                spec_pairs = {}
                for spct_items in spct_html:
                    # FIX: the original passed re.S as re.sub's *count*
                    # argument (silently limiting substitutions to 16);
                    # the pattern has no '.', so no flag is needed.
                    spec_name = re.sub(r'<[^>]+>', '', spct_items[0])
                    spec_pairs[spec_name.replace(' ', '')] = spct_items[1]
                spct_dic = str(spec_pairs)
        goods_info.append(spct_dic)
        # write one tab-separated row
        result_file.write("\t".join(goods_info) + "\n")
        print('=============')
        print(goods_info)
        file_id.flush()
        result_file.flush()
def handle(asin):
    """Fetch the Amazon /dp/ page for `asin` and record its availability message.

    Writes "asin<TAB>availability" to result_file and the asin to
    success_asin_file; captcha pages, dead pages and fetch errors are routed
    to captcha_url_file, not_list_file and not_crawl_file respectively.
    """
    asin = asin.strip()
    try:
        baseurl = "https://www.amazon.com/dp/[asin]"
        url = baseurl.replace('[asin]', asin)
        html = get_html.get_html_src(url)
        print('handling...')
        # Captcha / robot-check page (or an empty response).
        if html == '' or -1 != html.find('Sorry, we just need to make sure you'):
            with lock:  # FIX: with-statement releases the lock even if a write fails
                captcha_url_file.write(asin + '\n')
                captcha_url_file.flush()
            return
        # Listing no longer exists.
        if html == '404 error':
            with lock:
                not_list_file.write(asin + '\n')
                not_list_file.flush()
            return
        # Page was never fetched successfully.
        if html == 'time out or other errors':
            with lock:
                not_crawl_file.write(asin + '\n')
                not_crawl_file.flush()
            return
        buyboxinfo = [asin]
        # The availability text appears under one of several span classes;
        # first match wins (replaces the original 4-level nested if/else).
        stock = ''
        for pattern in (
            r'<span class="a-size-medium a-color-success">\s+(.*?)\.',
            r'<span class="a-size-base a-color-state">\s+(.*?)\.',
            r'<span class="a-color-success a-text-bold">\s+(.*?)\.',
            r'<span class="a-size-medium a-color-price">\s+(.*?)\.',
        ):
            m = re.search(pattern, html, re.S)
            if m:
                stock = m.group(1)
                break
        # FIX: re.sub takes (repl, string); the original had the arguments
        # swapped and substituted into '' — discarding the scraped text.
        buyboxinfo.append(regex_sub_info.sub('', str(stock)))
        print(buyboxinfo)
        with lock:
            result_file.write("\t".join(buyboxinfo) + "\n")
            result_file.flush()
            success_asin_file.write(asin + '\n')
            success_asin_file.flush()
    except Exception as e:
        print(asin, e)
def handle(asin):
    """Scrape the new/Prime-eligible offer listing for `asin`.

    Records "asin<TAB>author<TAB>min_price<TAB>FBA|FBM" in result_file:
    author from the byline, the lowest offer price when any Prime-eligible
    new offer exists (status FBA), otherwise empty price and status FBM.
    Captcha pages, dead pages and fetch errors go to captcha_url_file,
    not_list_file and not_crawl_file respectively.
    """
    asin = asin.strip()
    try:
        baseurl = "http://www.amazon.com/gp/offer-listing/[asin]/ref=olp_f_primeEligible?ie=UTF8&f_new=true&f_primeEligible=true"
        url = baseurl.replace('[asin]', asin)
        html = get_html.get_html_src(url)
        print('handling...')
        # Captcha / robot-check page (or an empty response).
        if html == '' or -1 != html.find('Sorry, we just need to make sure you'):
            with lock:  # FIX: with-statement releases the lock even if a write fails
                captcha_url_file.write(asin + '\n')
                captcha_url_file.flush()
            return
        # Listing no longer exists.
        if html == '404 error':
            with lock:
                not_list_file.write(asin + '\n')
                not_list_file.flush()
            return
        # Page was never fetched successfully.
        if html == 'time out or other errors':
            with lock:
                not_crawl_file.write(asin + '\n')
                not_crawl_file.flush()
            return
        buyboxinfo = [asin]
        author = re.search(
            r'<div id="olpProductByline" class="a-section a-spacing-mini">\s+by (.*?)\s+</div>',
            html, re.S)
        author = author.group(1).strip() if author else ''
        # FIX: re.sub takes (repl, string); the original had the arguments
        # swapped and substituted into '' — discarding the scraped value.
        buyboxinfo.append(regex_sub_info.sub('', str(author)))
        price_list = re.findall(
            r'<span class="a-size-large a-color-price olpOfferPrice a-text-bold">\s+\$(.*?)\s+</span>',
            html)
        if price_list:
            prices = [float(p.strip().replace(',', '')) for p in price_list]
            min_price = min(prices)
            fbastatus = 'FBA'
        else:
            min_price = ''
            fbastatus = 'FBM'
        buyboxinfo.append(regex_sub_info.sub('', str(min_price)))
        buyboxinfo.append(regex_sub_info.sub('', str(fbastatus)))
        print(buyboxinfo)
        with lock:
            result_file.write("\t".join(buyboxinfo) + "\n")
            result_file.flush()
            success_asin_file.write(asin.strip() + '\n')
            success_asin_file.flush()
    except Exception as e:
        print(asin, e)