class SpiderAmazon():
    """Amazon scraper (httplib2 variant).

    Walks Amazon search results for a keyword, collects ASINs, fetches
    each product page for its title and brand, counts trademark hits for
    every brand on USPTO and Trademarkia, and writes a per-keyword CSV.

    HTTP traffic to Amazon goes through an httplib2 client with a disk
    cache; USPTO/Trademarkia lookups reuse the project's LRequests
    helper.  Image captchas are solved through the external GsaCaptcha
    service.
    """

    def __init__(self):
        self.lr = LRequests()
        # httplib2 client with an on-disk response cache in ".cache".
        self.h = httplib2.Http(".cache")
        self.gsa = GsaCaptcha()
        # keyword -> list of ASINs harvested from the search pages.
        self.key_asins = {}
        # Browser-like headers so Amazon serves the normal desktop HTML.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en;q=0.9',
            'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '******',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
        }
        # Cookie jar accumulated from Set-Cookie response headers.
        self.cookies = {}

    def set_body(self, body, resp_headers):
        """Store a response body and parse it into an lxml tree."""
        if (isinstance(body, bytes)):
            body = body.decode('utf-8')
        # httplib2 records the final URL (after redirects) here.
        self.current_url = resp_headers['content-location']
        self.body = body
        # Round-trip through BeautifulSoup to repair broken markup
        # before handing the document to lxml.
        self.tree = html.fromstring(str(BeautifulSoup(self.body, 'lxml')))

    def xpath(self, xpath):
        """Return the first node matching *xpath*, or None."""
        eles = self.tree.xpath(xpath)
        if eles:
            return eles[0]
        return None

    def xpaths(self, xpath):
        """Return all nodes matching *xpath* (possibly an empty list)."""
        return self.tree.xpath(xpath)

    def load_cookies(self, resp_headers):
        """Merge Set-Cookie values into the jar and refresh the Cookie header.

        BUG FIX: the original filter used ``any(not cookie.startswith(e)
        ...)``, which is true for nearly every fragment, so excluded
        cookie attributes (e.g. "path=/") leaked into the jar.  A
        fragment is now kept only when it starts with *none* of the
        excluded prefixes.
        """
        if 'set-cookie' in resp_headers:
            cookies_str = resp_headers['set-cookie']
            # NOTE(review): splitting on ',' also breaks cookies whose
            # "expires" date contains a comma; those fragments are then
            # dropped by the prefix filter below.
            for cookie in re.split(';|,', cookies_str):
                cookie = cookie.strip().lower()
                if cookie.find('=') > 0 and all(
                        not cookie.startswith(e) for e in exclude_cookie):
                    name, value = cookie.split('=', 1)
                    self.cookies[name] = value
        if len(self.cookies.keys()) > 0:
            cookies = []
            for k, v in self.cookies.items():
                cookies.append('%s=%s' % (k, v))
            self.headers['cookie'] = '; '.join(cookies)

    def load_amazon(self, url):
        """GET *url*, solving captcha pages until real content loads."""
        time.sleep(1)
        logger.info('load url: %s' % url)
        (resp_headers, body) = self.h.request(url, method='GET',
                                              headers=self.headers)
        self.set_body(body, resp_headers)
        self.load_cookies(resp_headers)
        # Amazon serves a captcha interstitial containing this phrase;
        # keep solving until the real page comes back.
        while self.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR,
                                            '%s.jpg' % time.time())
                img_url = self.xpath(
                    '//img[contains(@src, "captcha")]').get('src')
                logger.info('load img: %s' % img_url)
                (resp_headers, body) = self.h.request(
                    img_url, method='GET', headers=self.headers)
                with open(captcha_path, 'wb') as f:
                    f.write(body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                # Hidden form fields required by the validation endpoint.
                amzn = self.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                logger.info('load url: %s' % captcha_url)
                (resp_headers, body) = self.h.request(
                    captcha_url, method='GET', headers=self.headers)
                self.load_cookies(resp_headers)
                # Re-request the page we actually wanted.
                (resp_headers, body) = self.h.request(
                    url, method='GET', headers=self.headers)
                self.set_body(body, resp_headers)
                self.load_cookies(resp_headers)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_list(self, keyword):
        """Collect every result-page ASIN for *keyword* into ASIN_DIR/<keyword>.txt."""
        file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if not os.path.exists(file):
            self.key_asins[keyword] = []
            self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            while 1:
                self.fetch_asin(keyword)
                if not self.next_page():
                    break
            if len(self.key_asins[keyword]) > 0:
                open(file, 'w', encoding='utf-8').write('\n'.join(
                    self.key_asins[keyword]))
            else:
                logger.info("empty asin: %s" % keyword)
        else:
            # The keyword file already exists; treat it as done.
            logger.info('pass keyword: %s' % keyword)

    def fetch_asin(self, keyword):
        """Append the ASINs found on the current search page to the keyword list."""
        product_eles = self.xpaths(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]'
        )
        for product_ele in product_eles:
            logger.info('asin: %s' % product_ele.get('data-asin'))
            self.key_asins[keyword].append(product_ele.get('data-asin'))

    def next_page(self):
        """Follow the pagination "next" link; return False when on the last page."""
        next_ele = self.xpath(
            '//div[contains(@class, "s-pagination-container")]//a[contains(@aria-label, "next page")]'
        )
        if next_ele is None:
            return False
        else:
            next_url = urljoin(self.current_url, next_ele.get('href'))
            self.load_amazon(next_url)
            return True

    def fetch_products(self, keyword):
        """Fetch the product page of every ASIN recorded for *keyword*."""
        asins_file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if os.path.exists(asins_file):
            if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
                os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
            for asin in open(asins_file, 'r', encoding='utf-8').readlines():
                try:
                    asin = asin.strip()
                    if not os.path.exists(
                            os.path.join(PRODUCTS_DIR, keyword,
                                         '%s.txt' % asin)):
                        self.fetch_product(keyword, asin)
                    else:
                        logger.info('pass %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title and brand for one ASIN into PRODUCTS_DIR/<keyword>/<asin>.txt."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                # Byline reads either "Visit the <brand> Store" or
                # "Brand: <brand>" -- slice off the fixed wording.
                if text.startswith('visit'):
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    brand = text[6:].strip()
            # Record as "asin|||brand|||title" for the later CSV stage.
            open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin),
                 'w',
                 encoding='utf-8').write('|||'.join([asin, brand, title]))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword, brand=None, asin=None):
        """Count USPTO trademark search hits for every scraped brand of *keyword*.

        The extra parameters keep the signature forward-compatible with
        the queue-based variant and are unused here.
        """
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # Open the "basic word mark search" form to obtain a session.
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
            key_path = os.path.join(PRODUCTS_DIR, keyword)
            if os.path.isdir(key_path):
                for file in os.listdir(key_path):
                    try:
                        asin, brand, title = open(
                            os.path.join(key_path, file), 'r',
                            encoding='utf-8').read().strip().split('|||')
                        uspto_file = os.path.join(uspth_path,
                                                  '%s.txt' % asin)
                        if not os.path.exists(uspto_file):
                            if brand:
                                # TESS requires the per-session "state"
                                # token from the search form.
                                state_ele = self.lr.xpath(
                                    '//input[@name="state"]')
                                state = state_ele.get('value')
                                payload = {
                                    'f': 'toc',
                                    'state': state,
                                    'p_search': 'search',
                                    'p_s_All': '',
                                    'p_s_ALL': brand,
                                    'a_default': 'search',
                                    'a_search': 'Submit',
                                }
                                self.lr.load(
                                    'https://tmsearch.uspto.gov/bin/showfield',
                                    method='POST',
                                    data=payload)
                                eles = self.lr.xpaths(
                                    '//table[@id="searchResultTable"]//tr')
                                # First row is the header, hence > 1.
                                if eles is not None and len(eles) > 1:
                                    logger.info('Brand %s: %s' %
                                                (brand, len(eles)))
                                    open(uspto_file, 'w',
                                         encoding='utf-8').write(
                                             str(len(eles)))
                                else:
                                    logger.info('Brand %s: None' % brand)
                                    open(uspto_file, 'w',
                                         encoding='utf-8').write("0")
                            else:
                                logger.info('Pass Empty Brand %s' % asin)
                        else:
                            logger.info('Pass Empty Uspto %s' % asin)
                    except KeyboardInterrupt:
                        return
                    except Exception as ex:
                        logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword, brand=None, asin=None):
        """Count Trademarkia search hits for every scraped brand of *keyword*.

        The extra parameters keep the signature forward-compatible with
        the queue-based variant and are unused here.
        """
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    asin, brand, title = open(
                        os.path.join(key_path, file), 'r',
                        encoding='utf-8').read().strip().split('|||')
                    trademarkia_file = os.path.join(trademarkia_path,
                                                    '%s.txt' % asin)
                    if not os.path.exists(trademarkia_file):
                        if brand:
                            self.lr.load(
                                'https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                                % quote(brand))
                            eles = self.lr.xpaths(
                                '//table[contains(@class, "tablesaw")]//tr')
                            # First row is the header, hence > 1.
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' %
                                            (brand, len(eles)))
                                open(trademarkia_file, 'w',
                                     encoding='utf-8').write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                open(trademarkia_file, 'w',
                                     encoding='utf-8').write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Trademarkia %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Join product, Trademarkia and USPTO results into CSV_DIR/<keyword>.csv."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                asin, brand, title = open(
                    os.path.join(product_dir, file), 'r',
                    encoding='utf-8').read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, keyword,
                                     '%s.txt' % asin)
                if os.path.exists(t_dir):
                    if int(open(t_dir).read().strip()) > 0:
                        t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    if int(open(u_dir).read().strip()) > 0:
                        u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8',
                      newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)
class SpiderAmazon():
    """Amazon scraper (LRequests variant).

    Same pipeline as the httplib2 version -- search-result ASIN
    harvesting, product title/brand scraping, USPTO and Trademarkia
    trademark lookups, CSV output -- but all HTTP traffic goes through
    the project's LRequests helper.  Contains live debugging output
    (prints / HTML dumps) for Amazon's "Something went wrong" 503 page.
    """

    def __init__(self):
        self.lr = LRequests()
        self.gsa = GsaCaptcha()
        # keyword -> list of ASINs harvested from the search pages.
        self.key_asins = {}

    def load_amazon(self, url):
        """Load *url*; dump/retry on the 503 page and solve captchas."""
        self.lr.load(url)
        # if(url.find('ref=nb_sb_noss') > -1):
        #     open('xxx\\%s.html' % time.time(), 'w', encoding='utf-8').write(self.lr.body)
        if self.lr.body.find('Something went wrong on our end') > 0:
            # Debugging aid: dump every form/input on the error page,
            # retry via the 503 recovery link, then dump again.
            forms = BeautifulSoup(self.lr.body).find_all('form')
            for f in forms:
                print('=======-----')
                print('1111111 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("input"):
                    print('%s - %s' % (input_tag.attrs.get('name'),
                                       input_tag.attrs.get('value')))
                print('2222222 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("select"):
                    print('%s - %s' % (input_tag.attrs.get('name'),
                                       input_tag.attrs.get('value')))
            self.lr.load('https://www.amazon.com/ref=cs_503_link')
            time.sleep(1)
            self.lr.load(url)
            forms = BeautifulSoup(self.lr.body).find_all('form')
            for f in forms:
                print('=======')
                print('444444 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("input"):
                    print('%s - %s' % (input_tag.attrs.get('name'),
                                       input_tag.attrs.get('value')))
            open('xxx\\%s.html' % time.time(), 'w',
                 encoding='utf-8').write(self.lr.body)
        # Captcha interstitial loop: solve until the real page loads.
        while self.lr.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR,
                                            '%s.jpg' % time.time())
                self.lr.load_img(
                    self.lr.xpath('//img[contains(@src, "captcha")]').get(
                        'src'))
                with open(captcha_path, 'wb') as f:
                    f.write(self.lr.body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                # Hidden form fields required by the validation endpoint.
                amzn = self.lr.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.lr.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                # payload = {'amzn': amzn,
                #            'amzn-r': amzn_r,
                #            'field-keywords': code,}
                self.lr.load(captcha_url, method='GET')  #, data=payload)
                # open('xxx\\%s.html' % time.time(), 'w', encoding='utf-8').write(self.lr.body)
                # self.lr.load(url)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_list(self, keyword):
        """Collect every result-page ASIN for *keyword* into ASIN_DIR/<keyword>.txt."""
        file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if not os.path.exists(file):
            self.key_asins[keyword] = []
            # self.lr.load('https://www.amazon.com')
            # self.load_amazon('https://www.amazon.com')
            # self.load_amazon('https://www.amazon.com')
            # self.load_amazon('https://www.amazon.com')
            # open('xxx\\%s.html' % time.time(), 'w').write(self.lr.body)
            # print('cccccccccccccccc %s' % self.lr.body.find('crid'))
            # self.lr.load('https://www.amazon.com/s?k=%s' % quote(keyword))
            # self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            # form = BeautifulSoup(self.lr.body).find_all('form')[0]
            # for f in forms:
            #     print('33333333333333333')
            #     print('444444 %s' % f.attrs.get('action'))
            #     for input_tag in f.find_all("input"):
            #         print('%s - %s' % (input_tag.attrs.get('name'), input_tag.attrs.get('value')))
            #     print('--------------------')
            # s_url = '%s?field-keywords=%s' % (urljoin('https://www.amazon.com/', form.attrs.get('action')), quote(keyword))
            # self.load_amazon('https://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%%3Daps&field-keywords=%s' % quote(keyword))
            # self.load_amazon('https://www.amazon.com/s?field-keywords=%s&ref=cs_503_search' % quote(keyword))
            self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            while 1:
                self.fetch_asin(keyword)
                if not self.next_page():
                    break
            if len(self.key_asins[keyword]) > 0:
                open(file, 'w', encoding='utf-8').write('\n'.join(
                    self.key_asins[keyword]))
            else:
                logger.info("empty asin: %s" % keyword)
        else:
            # The keyword file already exists; treat it as done.
            logger.info('pass keyword: %s' % keyword)

    def fetch_asin(self, keyword):
        """Append the ASINs found on the current search page to the keyword list."""
        product_eles = self.lr.xpaths(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]'
        )
        for product_ele in product_eles:
            logger.info('asin: %s' % product_ele.get('data-asin'))
            self.key_asins[keyword].append(product_ele.get('data-asin'))

    def next_page(self):
        """Follow the pagination "next" link; return False on the last page."""
        next_ele = self.lr.xpath(
            '//div[contains(@class, "s-pagination-container")]//a[contains(@aria-label, "next page")]'
        )
        if next_ele is None:
            return False
        else:
            next_url = urljoin(self.lr.current_url, next_ele.get('href'))
            self.lr.load(next_url)
            return True

    def fetch_products(self, keyword):
        """Fetch the product page of every ASIN recorded for *keyword*."""
        asins_file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if os.path.exists(asins_file):
            if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
                os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
            for asin in open(asins_file, 'r', encoding='utf-8').readlines():
                try:
                    asin = asin.strip()
                    if not os.path.exists(
                            os.path.join(PRODUCTS_DIR, keyword,
                                         '%s.txt' % asin)):
                        self.fetch_product(keyword, asin)
                    else:
                        logger.info('pass %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title and brand for one ASIN into PRODUCTS_DIR/<keyword>/<asin>.txt."""
        try:
            # self.lr.load('https://www.amazon.com/dp/%s' % asin)
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.lr.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.lr.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                # Byline reads either "Visit the <brand> Store" or
                # "Brand: <brand>" -- slice off the fixed wording.
                if text.startswith('visit'):
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    brand = text[6:].strip()
            # Record as "asin|||brand|||title" for the later CSV stage.
            open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin),
                 'w',
                 encoding='utf-8').write('|||'.join([asin, brand, title]))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            # open('xx.html', 'w', encoding='utf-8').write(str(self.lr.body))
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword):
        """Count USPTO trademark search hits for every scraped brand of *keyword*."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # Open the "basic word mark search" form to obtain a session.
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
            # form = self.lr.get_forms()[0]
            # form = self.lr.getForms(urljoin(self.lr.current_url, search_ele.get('href')))[0]
            key_path = os.path.join(PRODUCTS_DIR, keyword)
            if os.path.isdir(key_path):
                for file in os.listdir(key_path):
                    try:
                        asin, brand, title = open(
                            os.path.join(key_path, file), 'r',
                            encoding='utf-8').read().strip().split('|||')
                        uspto_file = os.path.join(uspth_path,
                                                  '%s.txt' % asin)
                        if not os.path.exists(uspto_file):
                            if brand:
                                # TESS requires the per-session "state"
                                # token from the search form.
                                state_ele = self.lr.xpath(
                                    '//input[@name="state"]')
                                state = state_ele.get('value')
                                payload = {
                                    'f': 'toc',
                                    'state': state,
                                    'p_search': 'search',
                                    'p_s_All': '',
                                    'p_s_ALL': brand,
                                    'a_default': 'search',
                                    'a_search': 'Submit',
                                }
                                self.lr.load(
                                    'https://tmsearch.uspto.gov/bin/showfield',
                                    method='POST',
                                    data=payload)
                                eles = self.lr.xpaths(
                                    '//table[@id="searchResultTable"]//tr')
                                # First row is the header, hence > 1.
                                if eles is not None and len(eles) > 1:
                                    logger.info('Brand %s: %s' %
                                                (brand, len(eles)))
                                    open(uspto_file, 'w',
                                         encoding='utf-8').write(
                                             str(len(eles)))
                                else:
                                    logger.info('Brand %s: None' % brand)
                                    open(uspto_file, 'w',
                                         encoding='utf-8').write("0")
                            else:
                                logger.info('Pass Empty Brand %s' % asin)
                        else:
                            logger.info('Pass Empty Uspto %s' % asin)
                    except KeyboardInterrupt:
                        return
                    except Exception as ex:
                        logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword):
        """Count Trademarkia search hits for every scraped brand of *keyword*."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    asin, brand, title = open(
                        os.path.join(key_path, file), 'r',
                        encoding='utf-8').read().strip().split('|||')
                    trademarkia_file = os.path.join(trademarkia_path,
                                                    '%s.txt' % asin)
                    if not os.path.exists(trademarkia_file):
                        if brand:
                            self.lr.load(
                                'https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                                % quote(brand))
                            eles = self.lr.xpaths(
                                '//table[contains(@class, "tablesaw")]//tr')
                            # First row is the header, hence > 1.
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' %
                                            (brand, len(eles)))
                                open(trademarkia_file, 'w',
                                     encoding='utf-8').write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                open(trademarkia_file, 'w',
                                     encoding='utf-8').write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Trademarkia %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Join product, Trademarkia and USPTO results into CSV_DIR/<keyword>.csv."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                asin, brand, title = open(
                    os.path.join(product_dir, file), 'r',
                    encoding='utf-8').read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, keyword,
                                     '%s.txt' % asin)
                if os.path.exists(t_dir):
                    if int(open(t_dir).read().strip()) > 0:
                        t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    if int(open(u_dir).read().strip()) > 0:
                        u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8',
                      newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)
class SpiderAmazon():
    """Queue-driven variant of the Amazon spider.

    Worker loops (do_products / do_records / do_csv) pull work items
    from multiprocessing queues shared with other processes: keywords
    to scrape, "keyword|||asin|||brand" records to trademark-check, and
    "keyword|||asin" entries to flush into per-keyword CSV files.
    """

    def __init__(self, list_queue, product_queue, record_queue, csv_queue):
        # Work queues shared with the other worker processes.
        self.list_queue = list_queue
        self.product_queue = product_queue
        self.record_queue = record_queue
        self.csv_queue = csv_queue
        self.lr = LRequests()
        self.gsa = GsaCaptcha()
        # keyword -> list of ASINs (kept for interface parity).
        self.key_asins = {}

    def load_amazon(self, url):
        """Load *url* through LRequests, solving captcha pages as needed."""
        self.lr.load(url)
        # if(url.find('ref=nb_sb_noss') > -1):
        #     open('xx\\%s.html' % time.time(), 'w', encoding='utf-8').write(self.lr.body)
        while self.lr.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR,
                                            '%s.jpg' % time.time())
                self.lr.load_img(
                    self.lr.xpath('//img[contains(@src, "captcha")]').get(
                        'src'))
                with open(captcha_path, 'wb') as f:
                    f.write(self.lr.body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                # Hidden form fields required by the validation endpoint.
                amzn = self.lr.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.lr.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                self.lr.load(captcha_url, method='GET')
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_products(self, keyword):
        """Fetch the product page of every ASIN recorded for *keyword*."""
        if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
            os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
        for asin in open(os.path.join(ASIN_DIR, '%s.txt' % keyword),
                         'r',
                         encoding='utf-8').readlines():
            try:
                asin = asin.strip()
                if not os.path.exists(
                        os.path.join(PRODUCTS_DIR, keyword,
                                     '%s.txt' % asin)):
                    self.fetch_product(keyword, asin)
                else:
                    logger.info('pass %s' % asin)
            except KeyboardInterrupt:
                return
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape one ASIN's title/brand, persist it, and enqueue a record."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.lr.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.lr.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                # Byline reads either "Visit the <brand> Store" or
                # "Brand: <brand>" -- slice off the fixed wording.
                if text.startswith('visit'):
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    brand = text[6:].strip()
            open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin),
                 'w',
                 encoding='utf-8').write('|||'.join([asin, brand, title]))
            # BUG FIX: was a bare "record_queue.put(...)" (NameError) --
            # the queue is held on the instance.
            self.record_queue.put('%s|||%s|||%s' % (keyword, asin, brand))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword, brand, asin):
        """Count USPTO trademark hits for one (keyword, brand, asin) record."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # Open the "basic word mark search" form to obtain a session.
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
            try:
                uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
                if not os.path.exists(uspto_file):
                    if brand:
                        # TESS requires the per-session "state" token
                        # from the search form.
                        state_ele = self.lr.xpath('//input[@name="state"]')
                        state = state_ele.get('value')
                        payload = {
                            'f': 'toc',
                            'state': state,
                            'p_search': 'search',
                            'p_s_All': '',
                            'p_s_ALL': brand,
                            'a_default': 'search',
                            'a_search': 'Submit',
                        }
                        self.lr.load(
                            'https://tmsearch.uspto.gov/bin/showfield',
                            method='POST',
                            data=payload)
                        eles = self.lr.xpaths(
                            '//table[@id="searchResultTable"]//tr')
                        # First row is the header, hence > 1.
                        if eles is not None and len(eles) > 1:
                            logger.info('Brand %s: %s' % (brand, len(eles)))
                            open(uspto_file, 'w',
                                 encoding='utf-8').write(str(len(eles)))
                        else:
                            logger.info('Brand %s: None' % brand)
                            open(uspto_file, 'w',
                                 encoding='utf-8').write("0")
                    else:
                        logger.info('Pass Empty Brand %s' % asin)
                else:
                    logger.info('Pass Empty Uspto %s' % asin)
            except KeyboardInterrupt:
                return
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword, brand, asin):
        """Count Trademarkia hits for one (keyword, brand, asin) record."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        try:
            trademarkia_file = os.path.join(trademarkia_path,
                                            '%s.txt' % asin)
            if not os.path.exists(trademarkia_file):
                if brand:
                    self.lr.load(
                        'https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                        % quote(brand))
                    eles = self.lr.xpaths(
                        '//table[contains(@class, "tablesaw")]//tr')
                    # First row is the header, hence > 1.
                    if eles is not None and len(eles) > 1:
                        logger.info('Brand %s: %s' % (brand, len(eles)))
                        open(trademarkia_file, 'w',
                             encoding='utf-8').write(str(len(eles)))
                    else:
                        logger.info('Brand %s: None' % brand)
                        open(trademarkia_file, 'w',
                             encoding='utf-8').write("0")
                else:
                    logger.info('Pass Empty Brand %s' % asin)
            else:
                logger.info('Pass Trademarkia %s' % asin)
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def do_products(self):
        """Worker loop: consume keywords and scrape their product pages."""
        while True:
            try:
                keyword = self.product_queue.get(timeout=5)
                self.fetch_products(keyword)
            except queues.Empty:
                logger.info('products empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def do_records(self):
        """Worker loop: consume product records and run trademark lookups.

        BUG FIX: records are enqueued as "keyword|||asin|||brand", but
        fetch_trademarkia/fetch_uspto take (keyword, brand, asin) -- the
        original passed (keyword, asin, brand), swapping the two values.
        """
        while True:
            try:
                keyword, asin, brand = self.record_queue.get(
                    timeout=5).split('|||')
                self.fetch_trademarkia(keyword, brand, asin)
                self.fetch_uspto(keyword, brand, asin)
                self.csv_queue.put('%s|||%s' % (keyword, asin))
            except queues.Empty:
                logger.info('records empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def do_csv(self):
        """Worker loop: consume finished records and refresh the CSVs."""
        while True:
            try:
                keyword, asin = self.csv_queue.get(timeout=5).split('|||')
                self.output_csv(keyword, asin)
            except queues.Empty:
                logger.info('csv empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def output_csv(self, keyword, asin):
        """Rebuild CSV_DIR/<keyword>.csv from every product file of *keyword*.

        *asin* is accepted for queue-protocol compatibility but unused:
        the whole keyword directory is re-read each call.

        BUG FIX: the original wrote header + rows only when the file
        already existed (inverted check) and its append branch used a
        csv writer that was never created (NameError).  Since the rows
        are rebuilt from all product files on every call, the file is
        now simply rewritten in full -- idempotent, no duplicate rows --
        matching the other SpiderAmazon variants.
        """
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        products = []
        for file in os.listdir(product_dir):
            asin, brand, title = open(
                os.path.join(product_dir, file), 'r',
                encoding='utf-8').read().strip().split('|||')
            t_m = '否'
            t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
            if os.path.exists(t_dir):
                if int(open(t_dir).read().strip()) > 0:
                    t_m = '是'
            u_m = '否'
            u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
            if os.path.exists(u_dir):
                if int(open(u_dir).read().strip()) > 0:
                    u_m = '是'
            products.append([title, asin, brand, t_m, u_m])
        fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
        csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
        with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(fields)
            csvwriter.writerows(products)