def __init__(self, site='us'):
    """Open a Chrome session and a MySQL cursor for one marketplace.

    Args:
        site: Marketplace code used as the key into ``self.countryArr``
            and ``self.cookies_list``. Defaults to ``'us'``.
    """
    self.site = site

    options = webdriver.ChromeOptions()
    options.add_argument('lang=en')
    # Presumably turns image loading off to speed up page loads -- confirm.
    prefs = {'profile.managed_default_content_settings.images': 2}
    options.add_experimental_option('prefs', prefs)
    # ``chrome_options=`` is deprecated and was removed in Selenium 4;
    # ``options=`` is the supported keyword.
    self.browser = webdriver.Chrome(options=options)
    self.wait = WebDriverWait(self.browser, 10)

    self.mysql_server = Mysql_server()
    self.cursor = self.mysql_server.get_cursor()

    # Marketplace code -> landing page URL.
    self.countryArr = {
        "de": "https://www.amazon.de/",
        "fr": "https://www.amazon.fr/",
        "uk": "https://www.amazon.co.uk/",
        "jp": "https://www.amazon.co.jp/",
        "us": "https://www.amazon.com/",
        "it": "https://www.amazon.it/",
        "es": "https://www.amazon.es/",
        "ca": "https://www.amazon.ca/",
        "au": "https://www.amazon.com.au/",
    }
    # Marketplace code -> list of raw "name=value; name=value" cookie strings.
    # NOTE(review): this is a hard-coded session cookie committed to source;
    # it should be loaded from configuration or a secret store instead.
    self.cookies_list = {
        'us': [
            'session-id=147-8333591-9758622; ubid-main=130-3884127-5100529; skin=noskin; session-token=FcrgJrciPNhG+AEYp1gHtGkDkqmjWQqYPzqYWTI4fLyOCg4EZSkATF0YUeQDqPHFk/x9DWfgNcrFMGYiPcW6Wrhm7eIkYNcnASlbMs8qTfrMu6umzYA5BsQ+Z9OM97OJtqVPKg8nbrXNeJzTHKaRIU20Tz+frsKMNRmKFBK4XHXP+3FBm+55LU1UN/O0ipTriWa5Fn99lPUjzz9Hb78rA4E1avMFp8KbTJ4hIYJTCdlki74RAFTMzf2nk0RN/xQVSvMjDCnGJ2zcNi7VUzaPledyQwBagbx9; x-main="HlcsLZ4Gq7XzxS5rmZQXTaD5EW7xI73nKL8h9qlfNMNfOE8o035z5EP4?f47Y61J"; at-main=Atza|IwEBIOnOU7--iHDyKsOiltFuLx-dtMCFjNcnMpKmsGzA08j8J2uLYIMsHNMZNR4QdeYFWAAPs8hSTiEjqaptkbHgKExvqG8KghI0rpQVu9_w9179BjIS_Em7IlSjvL--0aAWoYoVlWzGCrUqul5oH0ojYSTwNnQdt4AAe5_sWY9hFGes7-p0Yndm09X7ymSQWQ-d1yFkSVveWRbV09a4AZePywbm; sess-at-main="3QorUsauzcHl3UyVZ4KL5xzYMG027qJ+/OKSxqm6UGI="; sst-main=Sst1|PQHarY1O5hurB5muBtN9Yu5ZCe9uE1YOdn2dkPVUnEA1MOxvpjAgcfuh3n2MiykhdYoe39tM3pduszP9vggv3H84TOHj4JT83gsk97ZXMbOkI0RoFdmWPAFOk-KaOfFLaZTaFaRvMpU_i50BDCLmTF7b4J0XSDZCGXiA87L7t0H24-FtdU5CYyc5Ef7t7J7jaUJ1fMBNWpW0uyfR-OGRae6kRbB6afuJiXZ9vOH-pXiES3QSuf9hd4znAATAskcYlYhmPqid_4NmTcXrzITr3LH8N7kxjggktbkEOzXrQc5cjho; lc-main=en_US; session-id-time=2082787201l; i18n-prefs=USD; sp-cdn="L5Z9:GB"; csm-hit=tb:s-EW60VBXATHNYN8EMCSW5|1600761482564&t:1600761486768&adb:adblk_no',
        ],
        'uk': [],
        'de': [],
        'fr': [],
        'it': [],
        'es': [],
        'jp': [],
    }
class Bsrgata():
    """Helper for organising category data (unfinished, currently unused)."""

    def __init__(self):
        """Open the MongoDB collection and the MySQL cursor."""
        self.mongodb = MongoDBserver()
        self.collection = self.mongodb.get_collection()
        self.mysql = Mysql_server()
        self.cursor = self.mysql.get_cursor()

    def get_category(self, category_id, level):
        """Return the ``amz_category`` rows matching an id and a level.

        Args:
            category_id: Value compared against the ``category_id`` column.
            level: Value compared against the ``level`` column.

        Returns:
            A list of dicts with the keys ``category_name``, ``category_id``,
            ``level`` and ``parent_id``, one per matching row.
        """
        params = (category_id, level)
        sql = """select category_name, category_id, level,parent_id from amz_category where category_id = %s and level=%s"""
        # The placeholders must be bound: the original call omitted ``params``,
        # so the literal "%s" text was sent to the server.
        self.cursor.execute(sql, params)
        columns = ('category_name', 'category_id', 'level', 'parent_id')
        return [
            {column: row[index] for index, column in enumerate(columns)}
            for row in self.cursor.fetchall()
        ]

    def close(self):
        """Close the MySQL connection."""
        self.mysql.close()
class ReviewData():
    """Export stored product reviews to text files."""

    def __init__(self):
        """Create the MySQL server wrapper used by the export methods."""
        self.sql_server = Mysql_server()

    def get_data(self, asin, raiting):
        """Append the reviews of one ASIN with a given rating to a text file.

        Each review is written to ``{asin}_{raiting}.txt`` as
        "title body" on one line, so the file can be fed to a later
        word-frequency step. The MySQL connection is closed afterwards.

        Args:
            asin: Product ASIN to select reviews for.
            raiting: Review rating to select.
        """
        self.cursor = self.sql_server.get_cursor()
        query = """select review_title, review_body from product_reviews where asin=%s and review_raiting=%s"""
        self.cursor.execute(query, (asin, raiting))
        rows = self.cursor.fetchall()
        for row in rows:
            with open(f'{asin}_{raiting}.txt', 'a', encoding='utf-8') as out:
                out.write(row[0])
                out.write(" ")
                out.write(row[-1])
                out.write("\n")
        self.sql_server.close()

    def get_otherdata(self):
        """Unfinished stub: opens a cursor and prepares an empty query."""
        self.cursor = self.sql_server.get_cursor()
        sql = ""
def save_txt(self, sql):
    """Run ``sql`` against MySQL and append the result to a local text file.

    For every returned row the first and last columns are lower-cased and
    appended to ``reviews/WordCount.txt``, each on its own line (the first
    one prefixed with a space).

    Args:
        sql: Complete SQL statement to execute.
    """
    my = Mysql_server()
    try:
        cursor = my.get_cursor()
        try:
            cursor.execute(sql)
            data_list = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        my.close()

    # Open the file once instead of once per row; skip it entirely when
    # there is nothing to write so no empty file is created.
    if data_list:
        with open('reviews/WordCount.txt', 'a', encoding='utf-8') as f:
            for data in data_list:
                f.write(' ')
                f.write(data[0].lower())
                f.write('\n')
                f.write(data[-1].lower())
                f.write('\n')
    print('写入完毕')
class SearchFeature():
    """Search stored product documents for a feature keyword."""

    def __init__(self):
        """Set up the MongoDB collection and the MySQL cursor."""
        self.mongodb = MongoDBserver()
        self.collection = self.mongodb.get_collection()
        self.mysql = Mysql_server()
        self.cursor = self.mysql.get_cursor()

    def get_asin(self):
        """Find US products whose stored document mentions "water".

        ASINs are selected from the MySQL ``product`` table, the matching
        documents are loaded from MongoDB, and the first regex hit of each
        document is appended to ``test.txt`` and dumped through pandas.
        Both database connections are closed before the files are written.
        """
        # Up to five preceding words, the keyword, then text up to the next
        # comma or full stop. Raw string avoids invalid-escape warnings.
        pattern = re.compile(r"(([^\s]+ ){0,5}(water).*? ?(,|\.))")
        result_list = []
        try:
            self.cursor.execute(
                'select asin from product where product="" and state=4 and country="us"'
            )
            for asin in self.cursor.fetchall():
                result = self.collection.find_one({
                    "asin": asin[0],
                    "country": "us"
                })
                if result is None:
                    continue
                matches = pattern.findall(str(result))
                if matches:
                    result_list.append({
                        "asin": result['asin'],
                        "data": matches[0][0]
                    })
        finally:
            # Close the connections even if a query fails.
            self.mongodb.close()
            self.mysql.close()

        if result_list:
            with open('test.txt', 'a', encoding='utf-8') as f:
                for entry in result_list:
                    f.write(str(entry))
                    f.write('\n')
        df = pd.DataFrame(result_list)
        # NOTE(review): this writes CSV text into a file named ``.xlsx``;
        # the name is kept because other code may read it -- confirm.
        df.to_csv('test.xlsx')
class InsertData():
    """Load ASINs from a comma-separated text file into ``us_asins``."""

    def __init__(self, file_path):
        """Open the MySQL wrapper and read every line of ``file_path``.

        Args:
            file_path: Path of a UTF-8 text file; the first line is treated
                as a header by ``insert_data``.
        """
        self.sql_server = Mysql_server()
        with open(file_path, 'r', encoding='utf-8') as f:
            self.data_list = f.readlines()

    def insert_data(self):
        """Insert the first column of every non-header line as an ASIN.

        Each row is committed individually and the connection is closed when
        the method finishes, including on failure. Lines whose first column
        is empty are skipped.
        """
        self.cursor = self.sql_server.get_cursor()
        # The ASIN is bound as a parameter rather than interpolated into the
        # statement, so quotes in the file cannot break or alter the SQL.
        sql = (
            "insert into us_asins (id, asin, keyword, pageNum, positionNum, "
            "ad,state, timestamp) values(0, %s, 'tuoyuanji', 1,1, 0, 5, 1)"
        )
        print(self.data_list)
        try:
            for data in self.data_list[1:]:
                asin = data.replace('\n', '').split(',')[0]
                if not asin:
                    continue
                print(asin, '===')
                print('====', sql, asin)
                self.cursor.execute(sql, (asin,))
                self.sql_server.conn.commit()
        finally:
            self.sql_server.close()
def main():
    """Translate a batch of untranslated product reviews.

    Up to 2000 reviews with ``state = 0`` are selected, marked ``state = 1``
    first, and then translated one by one; the translated title and body are
    stored in ``title_CN`` / ``body_CN``. The cursor and connection are
    closed even when a translation or query fails.
    """
    js = Py4Js()
    my = Mysql_server()
    cursor = my.get_cursor()
    try:
        cursor.execute("""SELECT b.reviewID, b.review_title, b.review_body FROM product as a JOIN product_reviews as b on a.asin=b.asin where a.id>39 and b.state = 0 limit 2000""")
        data = cursor.fetchall()

        # Mark the whole batch as taken before the slow translation step.
        claim_sql = """update product_reviews set state=1 where reviewID=%s"""
        for task in data:
            cursor.execute(claim_sql, (task[0], ))
        my.conn.commit()

        update_sql = """update product_reviews set body_CN=%s, title_CN=%s, state=1 where reviewID=%s"""
        for task in data:
            review_id = task[0]
            title = task[1]
            body = task[-1]
            title_cn = translate(title, js.getTk(title))
            body_cn = translate(body, js.getTk(body))
            cursor.execute(update_sql, (body_cn, title_cn, review_id))
            # Commit per review so finished work survives a later failure.
            my.conn.commit()
    finally:
        cursor.close()
        my.close()
class CategorySupplement(object):
    """Crawl product pages and fill in category rank data.

    URLs are taken from the ``category_supplement`` table, opened in Chrome,
    parsed for rank and listing details, and the rank columns are written
    back to the same table.
    """

    # Site code -> (header text of the rank row, marker preceding each rank).
    _RANK_LABELS = {
        'us': ('Best Sellers Rank', '#'),
        'de': ('Amazon Bestseller-Rang', 'Nr. '),
    }

    def __init__(self, site='us'):
        """Open a Chrome session and a MySQL cursor for one marketplace.

        Args:
            site: Marketplace code used as the key into ``self.countryArr``
                and ``self.cookies_list``. Defaults to ``'us'``.
        """
        self.site = site

        options = webdriver.ChromeOptions()
        options.add_argument('lang=en')
        # Presumably turns image loading off to speed up page loads -- confirm.
        prefs = {'profile.managed_default_content_settings.images': 2}
        options.add_experimental_option('prefs', prefs)
        # ``chrome_options=`` is deprecated and was removed in Selenium 4;
        # ``options=`` is the supported keyword.
        self.browser = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.browser, 10)

        self.mysql_server = Mysql_server()
        self.cursor = self.mysql_server.get_cursor()

        # Marketplace code -> landing page URL.
        self.countryArr = {
            "de": "https://www.amazon.de/",
            "fr": "https://www.amazon.fr/",
            "uk": "https://www.amazon.co.uk/",
            "jp": "https://www.amazon.co.jp/",
            "us": "https://www.amazon.com/",
            "it": "https://www.amazon.it/",
            "es": "https://www.amazon.es/",
            "ca": "https://www.amazon.ca/",
            "au": "https://www.amazon.com.au/",
        }
        # Marketplace code -> list of raw "name=value; name=value" cookie strings.
        # NOTE(review): this is a hard-coded session cookie committed to
        # source; it should come from configuration or a secret store.
        self.cookies_list = {
            'us': [
                'session-id=147-8333591-9758622; ubid-main=130-3884127-5100529; skin=noskin; session-token=FcrgJrciPNhG+AEYp1gHtGkDkqmjWQqYPzqYWTI4fLyOCg4EZSkATF0YUeQDqPHFk/x9DWfgNcrFMGYiPcW6Wrhm7eIkYNcnASlbMs8qTfrMu6umzYA5BsQ+Z9OM97OJtqVPKg8nbrXNeJzTHKaRIU20Tz+frsKMNRmKFBK4XHXP+3FBm+55LU1UN/O0ipTriWa5Fn99lPUjzz9Hb78rA4E1avMFp8KbTJ4hIYJTCdlki74RAFTMzf2nk0RN/xQVSvMjDCnGJ2zcNi7VUzaPledyQwBagbx9; x-main="HlcsLZ4Gq7XzxS5rmZQXTaD5EW7xI73nKL8h9qlfNMNfOE8o035z5EP4?f47Y61J"; at-main=Atza|IwEBIOnOU7--iHDyKsOiltFuLx-dtMCFjNcnMpKmsGzA08j8J2uLYIMsHNMZNR4QdeYFWAAPs8hSTiEjqaptkbHgKExvqG8KghI0rpQVu9_w9179BjIS_Em7IlSjvL--0aAWoYoVlWzGCrUqul5oH0ojYSTwNnQdt4AAe5_sWY9hFGes7-p0Yndm09X7ymSQWQ-d1yFkSVveWRbV09a4AZePywbm; sess-at-main="3QorUsauzcHl3UyVZ4KL5xzYMG027qJ+/OKSxqm6UGI="; sst-main=Sst1|PQHarY1O5hurB5muBtN9Yu5ZCe9uE1YOdn2dkPVUnEA1MOxvpjAgcfuh3n2MiykhdYoe39tM3pduszP9vggv3H84TOHj4JT83gsk97ZXMbOkI0RoFdmWPAFOk-KaOfFLaZTaFaRvMpU_i50BDCLmTF7b4J0XSDZCGXiA87L7t0H24-FtdU5CYyc5Ef7t7J7jaUJ1fMBNWpW0uyfR-OGRae6kRbB6afuJiXZ9vOH-pXiES3QSuf9hd4znAATAskcYlYhmPqid_4NmTcXrzITr3LH8N7kxjggktbkEOzXrQc5cjho; lc-main=en_US; session-id-time=2082787201l; i18n-prefs=USD; sp-cdn="L5Z9:GB"; csm-hit=tb:s-EW60VBXATHNYN8EMCSW5|1600761482564&t:1600761486768&adb:adblk_no',
            ],
            'uk': [],
            'de': [],
            'fr': [],
            'it': [],
            'es': [],
            'jp': [],
        }

    @staticmethod
    def _parse_rank(text, marker):
        """Extract big/small category names and ranks from a rank text.

        Args:
            text: Rank text such as "#1,234 in A (See Top 100 in A) #5 in B".
            marker: Token that precedes every rank number ("#" or "Nr. ").

        Returns:
            Dict with ``bigCategoryName``, ``bigRank``, ``smallCategoryName``
            and ``smallRank``.
        """
        def to_int(number):
            # Drop thousands separators: "," on .com, "." on .de.
            return int(number.replace(',', '').replace('.', ''))

        chunks = text.split(marker)
        big = [part.strip() for part in chunks[1].split(' in ')]
        small = [part.strip() for part in chunks[-1].split(' in ')]
        return {
            'bigCategoryName': big[1].split(' (')[0],
            'bigRank': to_int(big[0]),
            'smallCategoryName': small[-1],
            'smallRank': to_int(small[0]),
        }

    @staticmethod
    def _first_number(text):
        """Return the first decimal number in ``text`` as a string, or ''."""
        # The trailing "|$" guarantees at least one (possibly empty) match.
        return re.findall(r"\d+\.?\d*|$", text)[0]

    def set_cookies(self):
        """Add one randomly chosen cookie set for the site to the browser.

        Does nothing when no cookie string is configured for ``self.site``
        (the original raised on an empty list and fed the literal string
        ``'us'`` to ``random.choice`` for unknown sites).
        """
        candidates = self.cookies_list.get(self.site) or []
        if not candidates:
            return
        for line in random.choice(candidates).split('; '):
            temp = line.split('=', 1)
            self.browser.add_cookie({'name': temp[0], 'value': temp[-1]})

    def get_data(self):
        """Fetch one pending URL for the site and mark it as in progress.

        Returns:
            The rows returned by the select (each row holds one URL).
        """
        select_sql = "select url from category_supplement where state=0 and country=%s limit 1"
        self.cursor.execute(select_sql, (self.site, ))
        data = self.cursor.fetchall()
        update_sql = """update category_supplement set state=1 where url=%s"""
        for record in data:
            # One-element tuple; ``(record[0])`` was just a parenthesised value.
            self.cursor.execute(update_sql, (record[0], ))
            self.mysql_server.conn.commit()
        return data

    def insert_data(self, item):
        """Write the parsed rank fields of ``item`` back to the table.

        Args:
            item: Dict produced by ``spider_parse``.
        """
        params = (item['bigCategoryName'], item['bigRank'],
                  item['smallCategoryName'], item['smallRank'], item['url'])
        update_sql = """update category_supplement set bigCategoryName=%s, bigRank=%s, smallCategoryName=%s, smallRank=%s, state=2 where url=%s"""
        self.cursor.execute(update_sql, params)
        self.mysql_server.conn.commit()
        print('{}更新成功'.format(item['url']))

    def start(self):
        """Open the site, set cookies, then crawl and store pending URLs."""
        try:
            # Fall back to the US landing page, not to the string 'us'.
            self.browser.get(
                self.countryArr.get(self.site, self.countryArr['us']))
            self.set_cookies()
            data = self.get_data()
            for record in data:
                item = self.spider_parse(record[0])
                self.insert_data(item)
                print(item)
        finally:
            self.close_link()

    def spider_parse(self, url):
        """Load a product page and extract rank and listing details.

        Args:
            url: Product page URL; the last path segment is used as the ASIN.

        Returns:
            Dict that always contains ``url``, ``asin`` and the four rank
            fields (empty/zero when no rank was found). When the page parses
            fully it also holds star histogram values, price, rating,
            badge and seller-type fields.
        """
        # Rank defaults are set up front so ``insert_data`` still works when
        # parsing aborts part-way.
        item = {
            'url': url,
            'asin': url.split('/')[-1],
            'bigCategoryName': '',
            'bigRank': 0,
            'smallCategoryName': '',
            'smallRank': 0,
        }
        try:
            self.browser.get(url)
            self.wait.until(
                EC.presence_of_element_located((By.ID, 'navFooter')))
            html = self.browser.page_source
            doc = pq(html)
            sec = etree.HTML(html)

            # Category and rank from the product-details table.
            rank_label = self._RANK_LABELS.get(self.site)
            if rank_label is not None:
                header, marker = rank_label
                rows = doc(
                    '#productDetails_detailBullets_sections1>tbody>tr').items()
                for tr in rows:
                    if tr('th').text() == header:
                        item.update(self._parse_rank(tr('td').text(), marker))

            # Alternative layout: detail bullets (handled for 'de' only).
            if self.site in ['de']:
                for ul in doc('#detailBulletsWrapper_feature_div>ul').items():
                    if ul('li>span>span').text() == 'Amazon Bestseller-Rang:':
                        item.update(
                            self._parse_rank(ul('li').text(), 'Nr. '))

            # Price and lowest offer price.
            price = doc('#priceblock_ourprice').text()
            min_price = doc('.olp-text-box>span:nth-child(3)').text()

            # Rating summary and per-star histogram.
            rating_star = doc('span[data-hook="rating-out-of-text"]').text()
            rating_num = doc('#acrCustomerReviewText').text()
            star_section = doc('#histogramTable>tbody>tr').items()
            stars = ['five', 'four', 'three', 'two', 'one']
            for line, star in zip(star_section, stars):
                item[star] = line('td:nth-child(3)').text()

            # Badge label: the first span names the badge, later spans carry
            # the quoted category.
            label_section = doc('div[cssclass="aok-float-left"]>span').items()
            label = ''
            best_seller = ''
            new_release = ''
            amazon_choice = ''
            for i, line in enumerate(label_section):
                if i == 0:
                    label = line.text()
                else:
                    if label == "Best's Seller":
                        best_seller = line.text().split('"')[1]
                    if label == "New's Release":
                        new_release = line.text().split('"')[1]
                    if label == "Amazon's Choice":
                        amazon_choice = line.text().split('"')[1]

            # Seller type.
            seller1 = ''.join(sec.xpath('//*[@id="buybox-tabular"]//text()'))
            seller2 = ''.join(sec.xpath('//*[@id="merchant-info"]//text()'))
            seller1_url = ''.join(
                sec.xpath(
                    '//*[@id="buybox-tabular"]//a[@id="sellerProfileTriggerId"]/@href'
                ))
            seller2_url = ''.join(
                sec.xpath(
                    '//*[@id="merchant-info"]//a[@id="sellerProfileTriggerId"]/@href'
                ))
            if 'Amazon' in seller1 or 'Amazon' in seller2:
                seller_type = 'AMZ'
            elif 'isAmazonFulfilled' in seller1_url or 'isAmazonFulfilled' in seller2_url:
                seller_type = 'FBA'
            else:
                seller_type = 'MAH'

            item['price'] = float(self._first_number(price) or 0)
            item['min_price'] = float(self._first_number(min_price) or 0)
            item['rating_star'] = float(self._first_number(rating_star) or 0)
            item['rating_num'] = int(
                self._first_number(rating_num.replace(',', '')) or 0)
            item['best_seller'] = best_seller
            item['new_release'] = new_release
            item['amazon_choice'] = amazon_choice
            item['seller_type'] = seller_type
        except Exception as e:
            # Best effort: report the failure and return what was collected.
            print('识别验证码', e)
        return item

    def close_link(self):
        """Shut down the browser session and close the MySQL connection."""
        # ``quit()`` ends the driver session; ``close()`` only closes the
        # current window.
        self.browser.quit()
        self.mysql_server.close()
def __init__(self):
    """Create the MySQL server wrapper and keep it on ``self.sql_server``."""
    self.sql_server = Mysql_server()
def __init__(self, file_path):
    """Create the MySQL wrapper and load every line of ``file_path``.

    Args:
        file_path: Path of a UTF-8 text file; its lines (with line endings)
            are stored in ``self.data_list``.
    """
    self.sql_server = Mysql_server()
    with open(file_path, 'r', encoding='utf-8') as source:
        lines = list(source)
    self.data_list = lines
def __init__(self):
    """Open the MongoDB collection and the MySQL cursor used by the class."""
    # MongoDB wrapper first, then the collection obtained from it.
    self.mongodb = MongoDBserver()
    self.collection = self.mongodb.get_collection()
    # MySQL wrapper and a cursor obtained from it.
    self.mysql = Mysql_server()
    self.cursor = self.mysql.get_cursor()
def __init__(self):
    """Open the database connections and build the static lookup tables.

    Sets ``self.categoryIndexArr`` (site code -> {category name -> numeric
    id}) and ``self.country_site`` (site code -> string such as "co.uk").
    """
    # Set up the database connections.
    self.mongodb = MongoDBserver()
    self.collection = self.mongodb.get_collection()
    self.mysql = Mysql_server()
    self.cursor = self.mysql.get_cursor()
    # Per-site category name -> numeric id (presumably Amazon category /
    # browse-node ids -- confirm against the code that consumes them).
    self.categoryIndexArr = {
        "us": {
            "Home & Kitchen": 1055398, "Office Products": 1064954,
            "Musical Instruments": 11091801, "Amazon Launchpad": 12034488011,
            "Kindle Store": 133140011, "Kitchen & Dining": 13900821,
            "Automotive": 15684181, "Industrial & Scientific": 16310091,
            "Grocery & Gourmet Food": 16310101, "Digital Music": 163856011,
            "Toys & Games": 165793011, "Baby": 165796011,
            "Electronics": 172282, "Gift Cards": 2238192011,
            "Tools & Home Improvement": 228013, "Software": 229534,
            "Cell Phones & Accessories": 2335752011, "Apps & Games": 2350149011,
            "Arts, Crafts & Sewing": 2617941011, "Appliances": 2619525011,
            "Pet Supplies": 2619533011, "Movies & TV": 2625373011,
            "Books": 283155, "Patio, Lawn & Garden": 2972638011,
            "Sports Collectibles": 3250697011, "Sports & Outdoors": 3375251,
            "Health & Household": 3760901, "Beauty & Personal Care": 3760911,
            "Video Games": 468642, "Camera & Photo": 502394,
            "Entertainment Collectibles": 5088769011, "CDs & Vinyl": 5174,
            "Computers & Accessories": 541966, "Magazine Subscriptions": 599858,
            "Clothing, Shoes & Jewelry": 7141123011, "Collectible Coins": 9003130011
        },
        # NOTE(review): the "jp" table is identical to "us"; the
        # Japanese-language names are stored under "xx" below -- confirm
        # this is intentional.
        "jp": {
            "Home & Kitchen": 1055398, "Office Products": 1064954,
            "Musical Instruments": 11091801, "Amazon Launchpad": 12034488011,
            "Kindle Store": 133140011, "Kitchen & Dining": 13900821,
            "Automotive": 15684181, "Industrial & Scientific": 16310091,
            "Grocery & Gourmet Food": 16310101, "Digital Music": 163856011,
            "Toys & Games": 165793011, "Baby": 165796011,
            "Electronics": 172282, "Gift Cards": 2238192011,
            "Tools & Home Improvement": 228013, "Software": 229534,
            "Cell Phones & Accessories": 2335752011, "Apps & Games": 2350149011,
            "Arts, Crafts & Sewing": 2617941011, "Appliances": 2619525011,
            "Pet Supplies": 2619533011, "Movies & TV": 2625373011,
            "Books": 283155, "Patio, Lawn & Garden": 2972638011,
            "Sports Collectibles": 3250697011, "Sports & Outdoors": 3375251,
            "Health & Household": 3760901, "Beauty & Personal Care": 3760911,
            "Video Games": 468642, "Camera & Photo": 502394,
            "Entertainment Collectibles": 5088769011, "CDs & Vinyl": 5174,
            "Computers & Accessories": 541966, "Magazine Subscriptions": 599858,
            "Clothing, Shoes & Jewelry": 7141123011, "Collectible Coins": 9003130011
        },
        "xx": {
            'ビューティー': 52374051, 'jp-stores': 579684, 'DVD': 561958,
            'ゲーム': 637394, '産業・研究開発用品': 3445393051,
            'Prime Video': 2351649051, 'スポーツ&アウトドア': 14304371,
            '洋書': 52033011, '食品・飲料・お酒': 57239051,
            '車&バイク': 2017304051, 'シューズ&バッグ': 2016926051,
            'ジュエリー': 85895051, '腕時計': 324025011, 'ホビー': 2277721051,
            'DIY・工具・ガーデン': 2016929051, 'ペット用品': 2127212051,
            'ホーム&キッチン': 3828871, '文房具・オフィス用品': 86731051,
            '楽器・音響機器': 2123629051, 'ドラッグストア': 160384011,
            'デジタルミュージック': 2128134051, '家電&カメラ': 3210981,
            '大型家電': 2277724051, '服&ファッション小物': 352484011,
            'おもちゃ': 13299531, 'PCソフト': 637392, 'ミュージック': 561956,
            'ベビー&マタニティ': 344845011, 'パソコン・周辺機器': 2127209051,
            'Amazonデバイス・アクセサリ': 4976279051,
            'Androidアプリ': 2381130051, 'Kindleストア': 2250738051,
            'ギフト券': 2351652051, 'ファッション': 2229202051, '本': 465392
        },
        "de": {
            'Haustier': 340852031, 'DVD & Blu-ray': 284266,
            'Küche, Haushalt & Wohnen': 3167641, 'Auto & Motorrad': 78191031,
            'Lebensmittel & Getränke': 340846031, 'Prime Video': 3010075031,
            'Amazon Launchpad': 9418395031, 'Musik-CDs & Vinyl': 255882,
            'Bürobedarf & Schreibwaren': 192416031, 'Beleuchtung': 213083031,
            'Kamera & Foto': 571860, 'Beauty': 84230031,
            'Elektronik & Foto': 562066, 'Spielzeug': 12950651,
            'Schuhe & Handtaschen': 355006011, 'Baby': 355007011,
            'Elektro-Großgeräte': 908823031, 'Bekleidung': 77028031,
            'Gewerbe, Industrie & Wissenschaft': 5866098031,
            'Zeitschriften': 1161658, 'Sport & Freizeit': 16435051,
            'Musikinstrumente & DJ-Equipment': 340849031, 'Bücher': 186606,
            'Amazon-Geräte & Zubehör': 12598632031,
            'Fremdsprachige Bücher': 52044011, 'Games': 300992,
            'Handmade Produkte': 9699311031,
            'Koffer, Rucksäcke & Taschen': 2454118031,
            'Geschenkgutscheine': 1571256031, 'Musik-Downloads': 77195031,
            'Software': 301927, 'Sonstiges': 72921031,
            'Computer & Zubehör': 340843031, 'Apps & Spiele': 1661648031,
            'Drogerie & Körperpflege': 64187031, 'Uhren': 193707031,
            'Schmuck': 327472011, 'Garten': 10925031, 'Baumarkt': 80084031,
            'Kindle-Shop': 530484031
        },
        "uk": {
            'Home & Garden Store': 3146281, 'DVD & Blu-ray': 283920,
            'Shoes & Bags': 355005011,
            'Stationery & Office Supplies': 192413031,
            'Prime Video': 3010085031, 'CDs & Vinyl': 229816,
            'Garden & Outdoors': 11052671, 'Handmade Products': 9699254031,
            'Large Appliances': 908798031, 'Health & Personal Care': 65801031,
            'Musical Instruments & DJ': 340837031, 'Beauty': 117332031,
            'Automotive': 248877031, 'Pet Supplies': 340840031,
            'Apps & Games': 1661657031, 'Gift Cards': 1571304031,
            'Lighting': 213077031, 'Computers & Accessories': 340831031,
            'Sports & Outdoors': 318949011, 'Everything Else': 72911031,
            'Grocery': 340834031, 'Luggage': 2454166031,
            'Amazon Devices & Accessories': 12598575031,
            'Business, Industry & Science': 5866054031,
            'Digital Music': 77197031, 'DIY & Tools': 79903031,
            'Software': 300435, 'Jewellery': 193716031,
            'Electronics & Photo': 560798, 'Baby Products': 59624031,
            'Kindle Store': 341677031, 'Watches': 328228011,
            'Toys & Games': 468292, 'Books': 266239, 'Clothing': 83450031,
            'PC & Video Games': 300703, "Amazon Launchpad": 7212961031,
            "Home & Kitchen": 11052681
        },
        "fr": {
            'Bricolage': 590748031, 'Animalerie': 1571268031,
            'Hygiène et Santé': 197861031, 'Livres': 301061,
            'DVD & Blu-ray': 405322, 'Boutique chèques-cadeaux': 2524127031,
            'Vêtements': 340855031, 'Fournitures de bureau': 192419031,
            'Informatique': 340858031, 'Téléchargement de Musique': 77196031,
            'Chaussures et Sacs': 215934031, 'Montres': 60649031,
            'Appareils Amazon et Accessoires': 12598689031,
            'Bébé & Puériculture': 206617031,
            'Luminaires & Eclairage': 213080031, 'High-Tech': 13921051,
            'Logiciels': 530488, 'Amazon Launchpad': 10525448031,
            'Gros électroménager': 908826031, 'Jeux vidéo': 530490,
            'Beauté et Parfum': 197858031, 'Bijoux': 193710031,
            'Epicerie': 3635788031, 'Auto et Moto': 1571265031,
            'Livres anglais et étrangers': 52042011,
            'Instruments de musique et Sono': 340861031,
            'Jeux et Jouets': 322086011, 'Sports et Loisirs': 325614031,
            'Autres': 72919031, 'Commerce, Industrie et Science': 5866109031,
            'Cuisine & Maison': 57004031, 'CD & Vinyles': 301062,
            'Applis et Jeux': 1661654031, 'Boutique Kindle': 672108031,
            'Bagages': 2454145031, 'Jardin': 3557027031,
            'Produits Handmade': 9699368031
        },
        "it": {
            'Abbigliamento': 2844433031,
            'Alimentari e cura della casa': 6198092031, 'Altro': 425919031,
            'Auto e Moto': 1571280031, 'Bellezza': 6198082031,
            'Cancelleria e prodotti per ufficio': 3606310031,
            'Casa e cucina': 524015031, 'CD e Vinili': 412600031,
            'Commercio, Industria e Scienza': 5866068031,
            'Elettronica': 412609031, 'Fai da te': 2454160031,
            'Film e TV': 412606031, 'Giardino e giardinaggio': 635016031,
            'Giochi e giocattoli': 523997031, 'Gioielli': 2454163031,
            'Illuminazione': 1571292031, 'Informatica': 425916031,
            'Libri': 411663031, 'Libri in altre lingue': 433842031,
            'Musica Digitale': 1748203031, 'Orologi': 524009031,
            'Prima infanzia': 1571286031,
            'Prodotti per animali domestici': 12472499031,
            'Salute e cura della persona': 1571289031,
            'Scarpe e borse': 524006031, 'Sport e tempo libero': 524012031,
            'Strumenti Musicali': 3628629031, 'Valigeria': 2454148031,
            'Videogiochi': 412603031, 'App e Giochi': 1661660031,
            'Buoni regalo': 3557017031,
            'Dispositivi Amazon & Accessori': 12598749031,
            'Grandi elettrodomestici': 14437356031,
            'Kindle Store': 818937031, 'Prodotti Handmade': 9699425031,
            'Software': 412612031
        },
        "es": {
            'Alimentación y bebidas': 6198072031, 'Bebé': 1703495031,
            'Belleza': 6198054031, 'Bricolaje y herramientas': 2454133031,
            'CDs y vinilos': 599373031, 'Coche y moto': 1951051031,
            'Deportes y aire libre': 2454136031, 'Electrónica': 599370031,
            'Equipaje': 2454129031, 'Hogar y cocina': 599391031,
            'Iluminación': 3564289031,
            'Industria, empresas y ciencia': 5866088031,
            'Informática': 667049031, 'Instrumentos musicales': 3628866031,
            'Jardín': 1571259031, 'Joyería': 2454126031,
            'Juguetes y juegos': 599385031, 'Libros': 599364031,
            'Libros en idiomas extranjeros': 599367031,
            'Música Digital': 1748200031, 'Oficina y papelería': 3628728031,
            'Otros Productos': 667040031, 'Películas y TV': 599379031,
            'Productos para mascotas': 12472654031, 'Relojes': 599388031,
            'Ropa': 2846220031, 'Salud y cuidado personal': 3677430031,
            'Videojuegos': 599382031, 'Zapatos y complementos': 1571262031,
            'Grandes electrodomésticos': 4772050031,
            'Productos Handmade': 9699482031, 'Software': 599376031,
            'Tienda Kindle': 818936031, 'Apps y Juegos': 1661649031,
            'Cheques regalo': 3564279031,
            'Dispositivos Amazon y Accesorios': 12598806031
        },
        'in': {
            'Bags, Wallets and Luggage': 2454169031, "Baby": 1571274031,
            "Beauty": 1355016031, "Books": 976389031,
            "Car & Motorbike": 4772060031,
            "Clothing & Accessories": 1571271031,
            "Computers & Accessories": 976392031, "Electronics": 976419031,
            "Grocery & Gourmet Foods": 2454178031,
            "Health & Personal Care": 1350384031,
            "Home & Kitchen": 976442031, "Home Improvement": 3704992031,
            "Industrial & Scientific": 5866078031, "Jewellery": 1951048031,
            "Movies & TV Shows": 976416031, "Music": 976445031,
            "Musical Instruments": 3677697031, "Office Products": 2454172031,
            'Outdoor Living': 2454175031, 'Pet Supplies': 2454181031,
            "Shoes & Handbags": 1571283031,
            'Sports, Fitness & Outdoors': 1984443031,
            "Toys & Games": 1350380031, 'Video Games': 976460031,
            "Watches": 1350387031, "Software": 976451031
        },
        'br': {
            "Apps e Jogos": 6446175011, "Bebê": 17242603011,
            'Beleza': 16194414011, 'Binquedos e Jogos': 16194299011,
            'Alimentos e Bebidas': 18991079011,
            "Jardim e Piscina": 18991021011, 'Casa': 16191000011,
            "CD e Vinil": 7791937011,
            'Computadores e Informática': 16339926011,
            'Cozinha': 16957125011, "DVD e Blu-ray": 7791856011,
            'Eletrodomésticos': 16522082011, 'Eletrônicos': 16209062011,
            'Esporte': 17349396011,
            'Ferramentas e Materiais de Construção': 16957182011,
            'Games e Consoles': 7791985011, 'Livros': 6740748011,
            "Loja Kindle": 5308307011, 'Moda': 17365811011,
            "Papelaria e Escritório": 16957239011, "Saúde": 16215417011,
            "Pet Shop": 18991136011, "Automotivo": 18914209011
        },
        'au': {
            "Alexa Skills": 4931595051, "Apps & Games": 2544160051,
            "Automotive": 4851453051, "Baby": 4851510051,
            "Beauty": 4851567051, "Books": 4851626051,
            "CDs & Vinyl": 4852330051,
            "Clothing, Shoes & Accessories": 4851856051,
            "Computers": 4851683051, "Electronics": 4851799051,
            "Everything Else": 4103126051,
            'Health, Household & Personal Care': 4851917051,
            'Home': 4851975051, "Home Improvement": 4852033051,
            "Kindle Store": 2490359051, "Kitchen & Dining": 4852150051,
            "Lighting": 4852207051, "Movies & TV": 4852264051,
            'Pantry Food & Drinks': 5547635051, 'Pet Supplies': 5514967051,
            'Software': 4852502051, 'Sports, Fitness & Outdoors': 4852559051,
            'Stationery & Office Products': 4852445051,
            'Toys & Games': 4852617051, 'Video Games': 4852675051
        },
        'ca': {
            'Automotive': 6948389011, "Baby": 3561346011,
            "Beauty & Personal Care": 6205124011, "Books": 916520,
            "Clothing & Accessories": 8604903011, "Electronics": 667823011,
            'Everything Else': 2356392011, "Featured Stores": 916516,
            'Grocery & Gourmet Food': 6967215011,
            'Health & Personal Care': 6205177011, "Home": 2206275011,
            'Industrial & Scientific': 11076213011, 'Jewelry': 6205496011,
            'Livres': 916522, 'Luggage & Bags': 6205505011,
            'Movies & TV': 917972, 'Music': 916514,
            'Musical Instruments, Stage & Studio': 6916844011,
            'Office Products': 6205511011,
            'Patio, Lawn & Garden': 6205499011, 'Pet Supplies': 6205514011,
            'Shoes & Handbags': 8604915011, 'Sports & Outdoors': 2242989011,
            'Tools & Home Improvement': 3006902011,
            'Toys & Games': 6205517011, 'Video': 916518,
            'Video Games': 3198031, 'Watches': 2235620011
        },
        'ae': {
            "Appliances": 15149781031, "Automotive": 11498031031,
            "Baby Products": 11498088031, "Beauty": 11497860031,
            "Books": 11497689031, "Computers": 11497746031,
            "Electronics": 11601327031, "Fashion": 11497632031,
            "Grocery": 15150009031, "Health": 11601441031,
            "Home": 16725681031, "Kitchen": 16402718031,
            "Mobile Phones & Communication Products": 12303750031,
            "Office Products": 15150351031, "Pet Supplies": 15150408031,
            "Sporting Goods": 11601213031,
            "Tools & Home Improvement": 11601270031, "Toys": 11497803031,
            "Videogames": 11601384031,
        },
        "mx": {
            "Industria, Empresas y Ciencia": 11076223011,
            "Juguetes y Juegos": 11260442011,
            "Productos para Animales": 11782336011,
            "Ropa, Zapatos y Accesorios": 13848838011,
            "Automotriz y Motocicletas": 13848848011,
            "Instrumentos Musicales": 13848858011,
            "Tienda Kindle": 6446439011, "Libros": 9298576011,
            "Electrónicos": 9482558011, "Hogar y Cocina": 9482593011,
            "Salud, Belleza y Cuidado Personal": 9482610011,
            "Música": 9482620011, "Películas y Series de TV": 9482630011,
            "Videojuegos": 9482640011, "Bebé": 9482650011,
            "Deportes y Aire libre": 9482660011,
            "Herramientas y Mejoras del Hogar": 9482670011,
            "Software": 9482690011, "Oficina y Papelería": 9673844011
        },
        "tr": {
            "Oyuncak": 12467126031, "Spor ve Outdoor": 12467068031,
            "Mutfak": 12466781031, "Ofis ve Kırtasiye": 12467009031,
            "Kitap": 12466380031, "Yapı Market": 12466724031,
            "Kadın Modası": 13546651031, "Ev ve Yaşam": 12466667031,
            "Elektronik": 12466496031, "Erkek Modası": 13546649031,
            "Bebek": 12466208031, "Diğer Her Şey": 12467297031,
            "Bilgisayarlar": 12466439031, "Moda": 12466553031,
            "Video Oyunu ve Konsol": 12467183031
        }
    }
    # Site code -> string such as "com" / "co.uk" (presumably the domain
    # suffix used to build URLs -- confirm).
    # NOTE(review): 'ae', 'mx' and 'tr' have category tables above but no
    # entry here.
    self.country_site = {
        "de": "de",
        "fr": "fr",
        "uk": "co.uk",
        "jp": "co.jp",
        "us": "com",
        "it": "it",
        "es": "es",
        'in': "in",
        'br': 'com.br',
        'au': "com.au",
        'ca': "ca"
    }
class Getdata():
    """Assemble per-keyword listing reports from the scraped MySQL/MongoDB data.

    The instance owns one MySQL cursor and one MongoDB collection.  The usual
    flow is ``get_asin`` -> ``get_data`` -> ``handle_data`` -> ``save_data``
    and finally ``close``.
    """

    def __init__(self):
        # Open the database connections.
        self.mongodb = MongoDBserver()
        self.collection = self.mongodb.get_collection()
        self.mysql = Mysql_server()
        self.cursor = self.mysql.get_cursor()

        # Top-level category name -> numeric id, keyed by site code.
        # get_data() only uses the *names*: a rank row whose category is a
        # key here counts as a top-level category, anything else as a
        # sub-category.  The ids are presumably Amazon browse-node ids.
        us_categories = {
            "Home & Kitchen": 1055398,
            "Office Products": 1064954,
            "Musical Instruments": 11091801,
            "Amazon Launchpad": 12034488011,
            "Kindle Store": 133140011,
            "Kitchen & Dining": 13900821,
            "Automotive": 15684181,
            "Industrial & Scientific": 16310091,
            "Grocery & Gourmet Food": 16310101,
            "Digital Music": 163856011,
            "Toys & Games": 165793011,
            "Baby": 165796011,
            "Electronics": 172282,
            "Gift Cards": 2238192011,
            "Tools & Home Improvement": 228013,
            "Software": 229534,
            "Cell Phones & Accessories": 2335752011,
            "Apps & Games": 2350149011,
            "Arts, Crafts & Sewing": 2617941011,
            "Appliances": 2619525011,
            "Pet Supplies": 2619533011,
            "Movies & TV": 2625373011,
            "Books": 283155,
            "Patio, Lawn & Garden": 2972638011,
            "Sports Collectibles": 3250697011,
            "Sports & Outdoors": 3375251,
            "Health & Household": 3760901,
            "Beauty & Personal Care": 3760911,
            "Video Games": 468642,
            "Camera & Photo": 502394,
            "Entertainment Collectibles": 5088769011,
            "CDs & Vinyl": 5174,
            "Computers & Accessories": 541966,
            "Magazine Subscriptions": 599858,
            "Clothing, Shoes & Jewelry": 7141123011,
            "Collectible Coins": 9003130011,
        }
        self.categoryIndexArr = {
            "us": us_categories,
            # NOTE(review): "jp" carries the same English table as "us",
            # while the Japanese-language names live under "xx" below.
            # Presumably the jp site is scraped in English - confirm.
            "jp": dict(us_categories),
            "xx": {
                'ビューティー': 52374051,
                'jp-stores': 579684,
                'DVD': 561958,
                'ゲーム': 637394,
                '産業・研究開発用品': 3445393051,
                'Prime Video': 2351649051,
                'スポーツ&アウトドア': 14304371,
                '洋書': 52033011,
                '食品・飲料・お酒': 57239051,
                '車&バイク': 2017304051,
                'シューズ&バッグ': 2016926051,
                'ジュエリー': 85895051,
                '腕時計': 324025011,
                'ホビー': 2277721051,
                'DIY・工具・ガーデン': 2016929051,
                'ペット用品': 2127212051,
                'ホーム&キッチン': 3828871,
                '文房具・オフィス用品': 86731051,
                '楽器・音響機器': 2123629051,
                'ドラッグストア': 160384011,
                'デジタルミュージック': 2128134051,
                '家電&カメラ': 3210981,
                '大型家電': 2277724051,
                '服&ファッション小物': 352484011,
                'おもちゃ': 13299531,
                'PCソフト': 637392,
                'ミュージック': 561956,
                'ベビー&マタニティ': 344845011,
                'パソコン・周辺機器': 2127209051,
                'Amazonデバイス・アクセサリ': 4976279051,
                'Androidアプリ': 2381130051,
                'Kindleストア': 2250738051,
                'ギフト券': 2351652051,
                'ファッション': 2229202051,
                '本': 465392,
            },
            "de": {
                'Haustier': 340852031,
                'DVD & Blu-ray': 284266,
                'Küche, Haushalt & Wohnen': 3167641,
                'Auto & Motorrad': 78191031,
                'Lebensmittel & Getränke': 340846031,
                'Prime Video': 3010075031,
                'Amazon Launchpad': 9418395031,
                'Musik-CDs & Vinyl': 255882,
                'Bürobedarf & Schreibwaren': 192416031,
                'Beleuchtung': 213083031,
                'Kamera & Foto': 571860,
                'Beauty': 84230031,
                'Elektronik & Foto': 562066,
                'Spielzeug': 12950651,
                'Schuhe & Handtaschen': 355006011,
                'Baby': 355007011,
                'Elektro-Großgeräte': 908823031,
                'Bekleidung': 77028031,
                'Gewerbe, Industrie & Wissenschaft': 5866098031,
                'Zeitschriften': 1161658,
                'Sport & Freizeit': 16435051,
                'Musikinstrumente & DJ-Equipment': 340849031,
                'Bücher': 186606,
                'Amazon-Geräte & Zubehör': 12598632031,
                'Fremdsprachige Bücher': 52044011,
                'Games': 300992,
                'Handmade Produkte': 9699311031,
                'Koffer, Rucksäcke & Taschen': 2454118031,
                'Geschenkgutscheine': 1571256031,
                'Musik-Downloads': 77195031,
                'Software': 301927,
                'Sonstiges': 72921031,
                'Computer & Zubehör': 340843031,
                'Apps & Spiele': 1661648031,
                'Drogerie & Körperpflege': 64187031,
                'Uhren': 193707031,
                'Schmuck': 327472011,
                'Garten': 10925031,
                'Baumarkt': 80084031,
                'Kindle-Shop': 530484031,
            },
            "uk": {
                'Home & Garden Store': 3146281,
                'DVD & Blu-ray': 283920,
                'Shoes & Bags': 355005011,
                'Stationery & Office Supplies': 192413031,
                'Prime Video': 3010085031,
                'CDs & Vinyl': 229816,
                'Garden & Outdoors': 11052671,
                'Handmade Products': 9699254031,
                'Large Appliances': 908798031,
                'Health & Personal Care': 65801031,
                'Musical Instruments & DJ': 340837031,
                'Beauty': 117332031,
                'Automotive': 248877031,
                'Pet Supplies': 340840031,
                'Apps & Games': 1661657031,
                'Gift Cards': 1571304031,
                'Lighting': 213077031,
                'Computers & Accessories': 340831031,
                'Sports & Outdoors': 318949011,
                'Everything Else': 72911031,
                'Grocery': 340834031,
                'Luggage': 2454166031,
                'Amazon Devices & Accessories': 12598575031,
                'Business, Industry & Science': 5866054031,
                'Digital Music': 77197031,
                'DIY & Tools': 79903031,
                'Software': 300435,
                'Jewellery': 193716031,
                'Electronics & Photo': 560798,
                'Baby Products': 59624031,
                'Kindle Store': 341677031,
                'Watches': 328228011,
                'Toys & Games': 468292,
                'Books': 266239,
                'Clothing': 83450031,
                'PC & Video Games': 300703,
                "Amazon Launchpad": 7212961031,
                "Home & Kitchen": 11052681,
            },
            "fr": {
                'Bricolage': 590748031,
                'Animalerie': 1571268031,
                'Hygiène et Santé': 197861031,
                'Livres': 301061,
                'DVD & Blu-ray': 405322,
                'Boutique chèques-cadeaux': 2524127031,
                'Vêtements': 340855031,
                'Fournitures de bureau': 192419031,
                'Informatique': 340858031,
                'Téléchargement de Musique': 77196031,
                'Chaussures et Sacs': 215934031,
                'Montres': 60649031,
                'Appareils Amazon et Accessoires': 12598689031,
                'Bébé & Puériculture': 206617031,
                'Luminaires & Eclairage': 213080031,
                'High-Tech': 13921051,
                'Logiciels': 530488,
                'Amazon Launchpad': 10525448031,
                'Gros électroménager': 908826031,
                'Jeux vidéo': 530490,
                'Beauté et Parfum': 197858031,
                'Bijoux': 193710031,
                'Epicerie': 3635788031,
                'Auto et Moto': 1571265031,
                'Livres anglais et étrangers': 52042011,
                'Instruments de musique et Sono': 340861031,
                'Jeux et Jouets': 322086011,
                'Sports et Loisirs': 325614031,
                'Autres': 72919031,
                'Commerce, Industrie et Science': 5866109031,
                'Cuisine & Maison': 57004031,
                'CD & Vinyles': 301062,
                'Applis et Jeux': 1661654031,
                'Boutique Kindle': 672108031,
                'Bagages': 2454145031,
                'Jardin': 3557027031,
                'Produits Handmade': 9699368031,
            },
            "it": {
                'Abbigliamento': 2844433031,
                'Alimentari e cura della casa': 6198092031,
                'Altro': 425919031,
                'Auto e Moto': 1571280031,
                'Bellezza': 6198082031,
                'Cancelleria e prodotti per ufficio': 3606310031,
                'Casa e cucina': 524015031,
                'CD e Vinili': 412600031,
                'Commercio, Industria e Scienza': 5866068031,
                'Elettronica': 412609031,
                'Fai da te': 2454160031,
                'Film e TV': 412606031,
                'Giardino e giardinaggio': 635016031,
                'Giochi e giocattoli': 523997031,
                'Gioielli': 2454163031,
                'Illuminazione': 1571292031,
                'Informatica': 425916031,
                'Libri': 411663031,
                'Libri in altre lingue': 433842031,
                'Musica Digitale': 1748203031,
                'Orologi': 524009031,
                'Prima infanzia': 1571286031,
                'Prodotti per animali domestici': 12472499031,
                'Salute e cura della persona': 1571289031,
                'Scarpe e borse': 524006031,
                'Sport e tempo libero': 524012031,
                'Strumenti Musicali': 3628629031,
                'Valigeria': 2454148031,
                'Videogiochi': 412603031,
                'App e Giochi': 1661660031,
                'Buoni regalo': 3557017031,
                'Dispositivi Amazon & Accessori': 12598749031,
                'Grandi elettrodomestici': 14437356031,
                'Kindle Store': 818937031,
                'Prodotti Handmade': 9699425031,
                'Software': 412612031,
            },
            "es": {
                'Alimentación y bebidas': 6198072031,
                'Bebé': 1703495031,
                'Belleza': 6198054031,
                'Bricolaje y herramientas': 2454133031,
                'CDs y vinilos': 599373031,
                'Coche y moto': 1951051031,
                'Deportes y aire libre': 2454136031,
                'Electrónica': 599370031,
                'Equipaje': 2454129031,
                'Hogar y cocina': 599391031,
                'Iluminación': 3564289031,
                'Industria, empresas y ciencia': 5866088031,
                'Informática': 667049031,
                'Instrumentos musicales': 3628866031,
                'Jardín': 1571259031,
                'Joyería': 2454126031,
                'Juguetes y juegos': 599385031,
                'Libros': 599364031,
                'Libros en idiomas extranjeros': 599367031,
                'Música Digital': 1748200031,
                'Oficina y papelería': 3628728031,
                'Otros Productos': 667040031,
                'Películas y TV': 599379031,
                'Productos para mascotas': 12472654031,
                'Relojes': 599388031,
                'Ropa': 2846220031,
                'Salud y cuidado personal': 3677430031,
                'Videojuegos': 599382031,
                'Zapatos y complementos': 1571262031,
                'Grandes electrodomésticos': 4772050031,
                'Productos Handmade': 9699482031,
                'Software': 599376031,
                'Tienda Kindle': 818936031,
                'Apps y Juegos': 1661649031,
                'Cheques regalo': 3564279031,
                'Dispositivos Amazon y Accesorios': 12598806031,
            },
            'in': {
                'Bags, Wallets and Luggage': 2454169031,
                "Baby": 1571274031,
                "Beauty": 1355016031,
                "Books": 976389031,
                "Car & Motorbike": 4772060031,
                "Clothing & Accessories": 1571271031,
                "Computers & Accessories": 976392031,
                "Electronics": 976419031,
                "Grocery & Gourmet Foods": 2454178031,
                "Health & Personal Care": 1350384031,
                "Home & Kitchen": 976442031,
                "Home Improvement": 3704992031,
                "Industrial & Scientific": 5866078031,
                "Jewellery": 1951048031,
                "Movies & TV Shows": 976416031,
                "Music": 976445031,
                "Musical Instruments": 3677697031,
                "Office Products": 2454172031,
                'Outdoor Living': 2454175031,
                'Pet Supplies': 2454181031,
                "Shoes & Handbags": 1571283031,
                'Sports, Fitness & Outdoors': 1984443031,
                "Toys & Games": 1350380031,
                'Video Games': 976460031,
                "Watches": 1350387031,
                "Software": 976451031,
            },
            'br': {
                "Apps e Jogos": 6446175011,
                "Bebê": 17242603011,
                'Beleza': 16194414011,
                # NOTE(review): 'Binquedos' looks like a typo for
                # 'Brinquedos'; left as-is because the key is matched
                # against stored category names - confirm before fixing.
                'Binquedos e Jogos': 16194299011,
                'Alimentos e Bebidas': 18991079011,
                "Jardim e Piscina": 18991021011,
                'Casa': 16191000011,
                "CD e Vinil": 7791937011,
                'Computadores e Informática': 16339926011,
                'Cozinha': 16957125011,
                "DVD e Blu-ray": 7791856011,
                'Eletrodomésticos': 16522082011,
                'Eletrônicos': 16209062011,
                'Esporte': 17349396011,
                'Ferramentas e Materiais de Construção': 16957182011,
                'Games e Consoles': 7791985011,
                'Livros': 6740748011,
                "Loja Kindle": 5308307011,
                'Moda': 17365811011,
                "Papelaria e Escritório": 16957239011,
                "Saúde": 16215417011,
                "Pet Shop": 18991136011,
                "Automotivo": 18914209011,
            },
            'au': {
                "Alexa Skills": 4931595051,
                "Apps & Games": 2544160051,
                "Automotive": 4851453051,
                "Baby": 4851510051,
                "Beauty": 4851567051,
                "Books": 4851626051,
                "CDs & Vinyl": 4852330051,
                "Clothing, Shoes & Accessories": 4851856051,
                "Computers": 4851683051,
                "Electronics": 4851799051,
                "Everything Else": 4103126051,
                'Health, Household & Personal Care': 4851917051,
                'Home': 4851975051,
                "Home Improvement": 4852033051,
                "Kindle Store": 2490359051,
                "Kitchen & Dining": 4852150051,
                "Lighting": 4852207051,
                "Movies & TV": 4852264051,
                'Pantry Food & Drinks': 5547635051,
                'Pet Supplies': 5514967051,
                'Software': 4852502051,
                'Sports, Fitness & Outdoors': 4852559051,
                'Stationery & Office Products': 4852445051,
                'Toys & Games': 4852617051,
                'Video Games': 4852675051,
            },
            'ca': {
                'Automotive': 6948389011,
                "Baby": 3561346011,
                "Beauty & Personal Care": 6205124011,
                "Books": 916520,
                "Clothing & Accessories": 8604903011,
                "Electronics": 667823011,
                'Everything Else': 2356392011,
                "Featured Stores": 916516,
                'Grocery & Gourmet Food': 6967215011,
                'Health & Personal Care': 6205177011,
                "Home": 2206275011,
                'Industrial & Scientific': 11076213011,
                'Jewelry': 6205496011,
                'Livres': 916522,
                'Luggage & Bags': 6205505011,
                'Movies & TV': 917972,
                'Music': 916514,
                'Musical Instruments, Stage & Studio': 6916844011,
                'Office Products': 6205511011,
                'Patio, Lawn & Garden': 6205499011,
                'Pet Supplies': 6205514011,
                'Shoes & Handbags': 8604915011,
                'Sports & Outdoors': 2242989011,
                'Tools & Home Improvement': 3006902011,
                'Toys & Games': 6205517011,
                'Video': 916518,
                'Video Games': 3198031,
                'Watches': 2235620011,
            },
            'ae': {
                "Appliances": 15149781031,
                "Automotive": 11498031031,
                "Baby Products": 11498088031,
                "Beauty": 11497860031,
                "Books": 11497689031,
                "Computers": 11497746031,
                "Electronics": 11601327031,
                "Fashion": 11497632031,
                "Grocery": 15150009031,
                "Health": 11601441031,
                "Home": 16725681031,
                "Kitchen": 16402718031,
                "Mobile Phones & Communication Products": 12303750031,
                "Office Products": 15150351031,
                "Pet Supplies": 15150408031,
                "Sporting Goods": 11601213031,
                "Tools & Home Improvement": 11601270031,
                "Toys": 11497803031,
                "Videogames": 11601384031,
            },
            "mx": {
                "Industria, Empresas y Ciencia": 11076223011,
                "Juguetes y Juegos": 11260442011,
                "Productos para Animales": 11782336011,
                "Ropa, Zapatos y Accesorios": 13848838011,
                "Automotriz y Motocicletas": 13848848011,
                "Instrumentos Musicales": 13848858011,
                "Tienda Kindle": 6446439011,
                "Libros": 9298576011,
                "Electrónicos": 9482558011,
                "Hogar y Cocina": 9482593011,
                "Salud, Belleza y Cuidado Personal": 9482610011,
                "Música": 9482620011,
                "Películas y Series de TV": 9482630011,
                "Videojuegos": 9482640011,
                "Bebé": 9482650011,
                "Deportes y Aire libre": 9482660011,
                "Herramientas y Mejoras del Hogar": 9482670011,
                "Software": 9482690011,
                "Oficina y Papelería": 9673844011,
            },
            "tr": {
                "Oyuncak": 12467126031,
                "Spor ve Outdoor": 12467068031,
                "Mutfak": 12466781031,
                "Ofis ve Kırtasiye": 12467009031,
                "Kitap": 12466380031,
                "Yapı Market": 12466724031,
                "Kadın Modası": 13546651031,
                "Ev ve Yaşam": 12466667031,
                "Elektronik": 12466496031,
                "Erkek Modası": 13546649031,
                "Bebek": 12466208031,
                "Diğer Her Şey": 12467297031,
                "Bilgisayarlar": 12466439031,
                "Moda": 12466553031,
                "Video Oyunu ve Konsol": 12467183031,
            },
        }

        # Site code -> domain suffix used to build product links.
        # NOTE(review): 'ae', 'mx' and 'tr' have category tables above but
        # no entry here, so get_data() raises KeyError for those sites.
        self.country_site = {
            "de": "de",
            "fr": "fr",
            "uk": "co.uk",
            "jp": "co.jp",
            "us": "com",
            "it": "it",
            "es": "es",
            'in': "in",
            'br': 'com.br',
            'au': "com.au",
            'ca': "ca",
        }

    def get_asin(self, country, keyword, ad, timestamp):
        """Return the distinct ASINs recorded for ``keyword`` on one site.

        Rows are read from ``<country>_asins`` with ``state >= 2``, the given
        ``ad`` flag and a ``timestamp`` strictly greater than ``timestamp``.
        Purely numeric ASINs are skipped.  Duplicates are removed while
        keeping first-seen order, so the result is reproducible.
        """
        table_name = country + '_asins'
        # Only the table name is interpolated; every value is bound by the
        # driver so keywords containing quotes cannot break the statement.
        sql = (f"select asin from {table_name} "
               "where keyword=%s and state>=2 and ad=%s and timestamp>%s")
        params = (keyword, ad, timestamp)
        print(sql % params)
        self.cursor.execute(sql, params)
        rows = self.cursor.fetchall()
        return list(dict.fromkeys(
            row[0] for row in rows if not row[0].isnumeric()))

    @staticmethod
    def _top_two_categories(little_rank):
        """Return the two most frequent sub-category names in ``little_rank``.

        With a single distinct category it is returned twice; with none the
        placeholder ``" "`` is returned for both positions.
        """
        from collections import Counter

        counts = Counter(entry['category'] for entry in little_rank)
        ranked = [name for name, _ in counts.most_common(2)]
        if not ranked:
            return " ", " "
        if len(ranked) == 1:
            return ranked[0], ranked[0]
        return ranked[0], ranked[1]

    def get_data(self, asin_list, country, timestamp, ad, keyword,
                 category_name='', category_url='', sales_month='2020-10'):
        """Collect one report row per ASIN and pick the dominant sub-categories.

        Args:
            asin_list: ASINs to look up (typically from ``get_asin``).
            country: site code; selects the ``<country>_asins`` table.
            timestamp: only detail/rank rows newer than this are used.
            ad: ad flag matched against the ``ad`` column.
            keyword: search keyword matched against the ``keyword`` column.
            category_name, category_url: when both are non-empty they are
                copied into every row together with an empty rank column.
            sales_month: value of ``product_sales.date`` whose sales figure
                is reported as the monthly sales (defaults to the previously
                hard-coded ``'2020-10'``).

        Returns:
            ``(data_list, category_first, category_next)`` where
            ``data_list`` holds one dict per usable ASIN and the two names
            are the most frequent sub-categories seen across all ASINs that
            had a MongoDB detail document.
        """
        table_name = country + "_asins"

        # Joined query for the listing row.  Column positions, used below:
        # 0 asin, 1 keyword, 2 page, 3 ranking, 4 price, 5 seller_type,
        # 6 sellerName, 7 seller_num, 8 sellerID, 9 listing_rating,
        # 10 brand, 11 ratings, 12 stock_status, 13 QA_num, 14 time,
        # 15 country, 16 reviews, 17 critical, 18 vp_num, 19 product_style,
        # 20 ad, 21 avg30, 22 actual_reviews.
        detail_sql = f"""select a.asin as 'asin', a.keyword as 'keyword',a.pageNum as 'page', a.positionNum as 'ranking',
            b.price as 'price', b.seller_type as 'seller_type', b.sellerName as 'sellerName',
            b.seller_num as 'seller_num', b.sellerID as 'sellerID', b.listing_rating as 'listing_rating',
            b.brand as 'brand', b.ratings as 'ratings', b.stock_status as 'stock_status',
            b.QA_num as 'QA_num', b.timestamp as 'time', b.country as 'country',
            b.reviews as 'reviews',b.critical as 'critical',b.vp_num as 'vp_num',
            b.product_style as 'product_style' , a.ad as 'ad', b.avg30 as 'avg30',
            b.actual_reviews as 'actual_reviews'
            from {table_name} as a left join product_detail as b on a.asin=b.asin
            where a.asin=%s and country=%s and b.timestamp>%s and a.ad=%s and a.keyword=%s
            order by b.timestamp DESC"""
        rank_sql = """select category, `rank` from product_rankinfo
            where asin = %s and country=%s and timestamp>%s"""
        seller_sql = """select negative_lifetime, count_lifetime from seller_info
            where sellerID=%s order by timestamp DESC"""
        sales_sql = """select sales,date from product_sales where asin=%s"""

        # Label of the "first available" field in the product-info JSON.
        # NOTE(review): 'it' reuses the German label, which looks like a
        # copy-paste - confirm against real amazon.it data.
        first_date = {
            'us': 'Date First Available',
            'jp': 'Date First Available',
            'de': 'Im Angebot von Amazon.de seit',
            'uk': 'Date First Available',
            'fr': 'Date First Available',
            'es': 'Date First Available',
            'it': 'Im Angebot von Amazon.de seit',
        }
        seller_types = {1: "AMZ", 2: "FBA", 3: "MAH"}

        data_list = []
        little_rank = []  # every sub-category rank entry seen so far

        # Walk the ASINs and pull everything known about each one.
        for position, asin in enumerate(asin_list, start=1):
            # Progress output: shows how many ASINs yield no usable row.
            print(asin)
            print(position, '==')
            print(len(data_list), '----')

            self.cursor.execute(
                detail_sql, (asin, country, timestamp, ad, keyword))
            detail_data = self.cursor.fetchone()
            if detail_data is None:
                continue
            site = detail_data[15]

            data = {}
            if category_name != '' and category_url != '':
                data['小类目'] = category_name
                data['类目链接'] = category_url
                data['小类目排名'] = ''
            data['asin'] = detail_data[0]
            data['站点'] = site
            data['产品链接'] = ("https://www.amazon.%s/dp/"
                            % self.country_site[site] + data['asin'])

            result = self.collection.find_one({
                "asin": asin,
                "country": "%s" % site,
                "timestamp": {"$gte": timestamp},
            })
            if result is None:
                print(f"====asin: {asin}, error: 没有抓取到详情页")
                continue

            images = result['img_list']
            data['主图地址'] = (
                "https://images-na.ssl-images-amazon.com/images/I/"
                "%s._AC_SL1000_.jpg" % images[0]
            ) if len(images) > 0 else ''
            data['视频'] = "有" if result['other_info']['video'] else "无"
            data['所在页数'] = detail_data[2]
            data['页面排名'] = detail_data[3]
            data['广告所在页数'] = ''
            data['广告页面排名'] = ''
            data['价格'] = max(detail_data[4], 0)
            data['变体'] = (1 if result['vari_num'] is None
                          else result['vari_num'])
            data['跟卖数'] = detail_data[7]
            data['A+'] = "有" if result['other_info']['aplus'] else "无"
            product_info = json.loads(result['product_info'])
            # Sites without a specific label fall back to the English one
            # instead of raising KeyError.
            data['上架日期'] = product_info.get(
                first_date.get(site, 'Date First Available'), '')

            # Split the rank rows into top-level and sub-category entries.
            self.cursor.execute(rank_sql, (asin, site, timestamp))
            top_level_names = self.categoryIndexArr[site]
            big_rank_info = []
            little_rank_info = []
            for rank_data in self.cursor.fetchall():
                rank_dict = {"category": rank_data[0], "rank": rank_data[-1]}
                if rank_data[0] in top_level_names:
                    big_rank_info.append(rank_dict)
                else:
                    little_rank_info.append(rank_dict)
            data['大类目'] = big_rank_info[0]['category'] if big_rank_info else ""
            data['大类目排名'] = big_rank_info[0]['rank'] if big_rank_info else ""
            data['小类目数据'] = little_rank_info
            # Counted even when the row is dropped below as a virtual item,
            # matching the established statistics.
            little_rank.extend(little_rank_info)

            data['近30天排名'] = '' if detail_data[21] < 1 else detail_data[21]
            data['总评分人数'] = detail_data[11]
            data['listing评分'] = detail_data[9]
            data['总评论数'] = detail_data[16]
            data['差评数'] = detail_data[17]
            data['VP评论数'] = detail_data[18]
            data['QA'] = max(detail_data[13], 0)
            data['品牌'] = detail_data[10].replace('Brand: ', '')
            data['实际评论数'] = detail_data[22]

            sellerID = detail_data[8]
            if sellerID not in ["", '-']:
                # Best effort: a missing or malformed seller row yields 0.
                try:
                    self.cursor.execute(seller_sql, (sellerID,))
                    seller_info = self.cursor.fetchone()
                    data['feedback数'] = seller_info[-1]
                    data['feedback差评率'] = seller_info[0] / 100
                except Exception:
                    print('===')
                    data['feedback数'] = 0
                    data['feedback差评率'] = 0
            else:
                data['feedback数'] = ""
                data['feedback差评率'] = ""

            seller_type = detail_data[5]
            if seller_type == 0:
                # Virtual goods, books and similar items are left out.
                continue
            if seller_type in seller_types:
                data['卖家类型'] = seller_types[seller_type]
            data['店铺名称'] = detail_data[6]

            self.cursor.execute(sales_sql, (asin,))
            monthly_sales = 0
            for sales, date in self.cursor.fetchall():
                if date == sales_month:
                    monthly_sales = sales
            data['月销量'] = monthly_sales

            data_list.append(data)

        category_first, category_next = self._top_two_categories(little_rank)
        return data_list, category_first, category_next

    def save_data(self, data_list, country, ad):
        """Write the rows to ``data_<country>.xls`` (``ad_data_...`` for ads)."""
        df = pd.DataFrame(data_list)
        if ad == 0:
            df.to_excel(f'data_{country}.xls')
        else:
            df.to_excel(f'ad_data_{country}.xls')

    def handle_data(self, data_list, category_first, category_next):
        """Reduce each row's sub-category list to two fixed column pairs.

        ``小类目1``/``小类目1排名`` receive the entry matching
        ``category_first`` and ``小类目2``/``小类目2排名`` the entry matching
        ``category_next``; anything else is blanked.  All four keys are
        written for every row (also when the row has no sub-categories) so
        the exported table has a uniform schema.  The raw ``小类目数据`` list
        is removed.  Rows are modified in place and returned in order.
        """
        result_list = []
        for data in data_list:
            first_name = first_rank = ""
            next_name = next_rank = ""
            for entry in data['小类目数据']:
                if entry['category'] == category_first:
                    first_name, first_rank = entry['category'], entry['rank']
                elif entry['category'] == category_next:
                    next_name, next_rank = entry['category'], entry['rank']
            data['小类目1'] = first_name
            data['小类目1排名'] = first_rank
            data['小类目2'] = next_name
            data['小类目2排名'] = next_rank
            del data['小类目数据']
            result_list.append(data)
        return result_list

    def close(self):
        """Close the MySQL connection wrapper."""
        self.mysql.close()