def parse_other_url(self, dom, params):
    """
    Collect all URLs on the page and push them into dupfilter_task_queue.
    :param dom:
    :param params:
    :return:
    """
    result_list = []
    channel = self.get_channel(dom)
    for e in dom.find('a'):
        sub_url = PyQuery(e).attr('href')
        if sub_url and sub_url.startswith("."):
            sub_url = self.link_analysis.url_join(params["info:url"], sub_url)
        if self.link_analysis.url_legal(sub_url, self.allow_domains):
            if not self.link_filter(sub_url):
                # Push into the redis queue
                _params = dict(
                    params.copy(),
                    **{
                        "info:url": sub_url,
                        "info:channel": channel
                    })
                result_list.extend(
                    [json.dumps(_params), int(_params["info:priority"])])
    self.redis_action.priority_queue_push("dupfilter_task_queue", *result_list)
    return "parse urls"
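# A minimal sketch of what redis_action.priority_queue_push might look like,
# assuming a redis-py client backed by a sorted set; the real implementation
# is not shown in this snippet. Note that result_list above alternates
# (json_payload, priority) pairs, which maps naturally onto ZADD.
import redis

class RedisAction(object):
    def __init__(self, host='localhost', port=6379):
        self.client = redis.Redis(host=host, port=port)

    def priority_queue_push(self, queue_name, *member_score_pairs):
        # Pair up members with their scores: [m1, s1, m2, s2, ...]
        mapping = dict(zip(member_score_pairs[0::2], member_score_pairs[1::2]))
        if mapping:
            self.client.zadd(queue_name, mapping)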
def extract_detail_url(self, html):
    pq = PQ(html)
    div = pq("div[class='question-summary']")
    hrefs = list()
    for a in div('h3 a'):
        href = PQ(a).attr('href')
        if href.startswith('/'):
            href = self.BASE_URL + href
        hrefs.append(href)
    return hrefs
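# A minimal usage sketch for extract_detail_url, assuming a requests-based
# fetch; the spider class name and listing path are hypothetical.
import requests

spider = QuestionSpider()  # hypothetical class exposing BASE_URL and extract_detail_url
html = requests.get(spider.BASE_URL + '/questions', timeout=10).text
for detail_url in spider.extract_detail_url(html):
    print(detail_url)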
def parseProductDetails(self, product_page_content, product_info):
    doc = PyQuery(product_page_content)
    product_info['name'] = doc('h1#div_product_name').text()
    product_info['sku_id'] = doc('span#div_product_itemno').text()
    product_info['price'] = doc('span#div_product_price').text()
    product_info['label_price'] = doc('span#div_retail_price').text()
    product_info['img_url'] = self.merchant.filteruri(
        doc('img#target_img').attr('src'))
    # Extract the number of reviews
    product_info['reviews'] = '0'
    bNodeList = doc('b')
    for item in bNodeList:
        text = PyQuery(item).text()
        if text.startswith("Customer Reviews"):
            product_info['reviews'] = extractNum(text)
            break
    # Extract the category path
    nodeList = doc('a.nav-location')
    if nodeList and PyQuery(nodeList[0]).text().strip() == 'Home':
        nodeList = nodeList[1:]
    for i, node in enumerate(nodeList):
        product_info['level' + str(i + 1) + '_category'] = PyQuery(node).text().strip()
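# extractNum is referenced above but not defined in this snippet; a minimal
# sketch of what it might do, assuming it pulls the first integer out of a
# string such as "Customer Reviews (12)":
import re

def extractNum(text):
    match = re.search(r'\d+', text)
    return match.group(0) if match else '0'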
def parse_item(self, response):
    city = response.meta.get('item')
    # Parse the body once and reuse the document and the company <li> list
    doc = PyQuery(response.body)
    company_li = doc.find('.terminal-company').find('li')
    name = doc.find('.company-name-t').find('a').text()
    size = company_li.eq(0).find('strong').text()
    nature = company_li.eq(1).find('strong').text()
    industry = company_li.eq(2).find('a').text()
    website = company_li.eq(-2).find('a').text()
    if not website.startswith('www'):
        website = None
    address = company_li.eq(-1).find('strong').text().strip()
    item = ZhilianItem(city=city, name=name, size=size, nature=nature,
                       industry=industry, website=website, address=address)
    yield item
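# A plausible ZhilianItem definition inferred from the keyword arguments in the
# constructor call above; the real item class is not shown in this snippet.
import scrapy

class ZhilianItem(scrapy.Item):
    city = scrapy.Field()
    name = scrapy.Field()
    size = scrapy.Field()
    nature = scrapy.Field()
    industry = scrapy.Field()
    website = scrapy.Field()
    address = scrapy.Field()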
def run(self):
    '''
    Parse the page source.
    '''
    time.sleep(random.uniform(1.0, 3.6))
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        # Model name
        name = pq('div.product-brand').text().strip() + ' ' + pq('h1.product-name').text().strip()
        # Colors and sizes: find all size entries
        size_span_list = pq('div.product-sizes__options span.product-sizes__detail')
        size_price_list = []
        for size_span in size_span_list:
            size = PyQuery(size_span).find('span.product-sizes__size').text().strip()
            if 'K' in size or 'k' in size or '-' in size or 'XS' in size:
                continue
            size = re.sub(r'[WwYyCc\*]', '', size)
            # Some sizes are not numeric; skip those as well
            if size in ('S', 'M', 'L', 'XL', 'XXL', 'XXXL', 'OS', ''):
                continue
            price = PyQuery(size_span).find('span.product-sizes__price').text().strip()
            if price.startswith('$'):
                price = price.replace('$', '').replace(',', '')
                size_price_list.append({
                    'size': size,
                    'price': float(price),
                    'isInStock': True
                })
            else:
                size_price_list.append({
                    'size': size,
                    'price': 0.0,
                    'isInStock': False
                })
        if len(size_price_list) < 1:
            return
        # Colorway number
        number = ''
        # Gender
        gender = 0
        # Color
        color_value = ''
        tr_list = pq('table#product-attribute-specs-table tr')
        for tr in tr_list:
            key = PyQuery(tr).find('th').text().strip()
            if key == 'Gender':
                gender_txt = PyQuery(tr).find('td').text().strip()
                if gender_txt == 'Mens':
                    gender = 1
                elif gender_txt == 'Womens':
                    gender = 2
            elif key == 'Colorway':
                color_value = PyQuery(tr).find('td').text().strip()
            elif key == 'Manufacturer Sku':
                number = PyQuery(tr).find('td').text().strip()
        # print(name, number, self.url, size_price_list, gender, color_value)
        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_url = pq('div.product-gallery-image > img')[0].get('src')
            # Download the image
            result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'stadiumgoods', '%s.jpg' % number))
            if result == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu('stadiumgoods', '%s.jpg' % number, './imgs/stadiumgoods/%s.jpg' % number)
                img_downloaded = True
        mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number], gender, color_value, 'stadiumgoods', '5b8f484b299207efc1fb0904', self.crawl_counter, img_downloaded=img_downloaded)
    except Exception:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'stadiumgoods')
        if error_counter < 3:
            self.q.put(self.url)
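# helper.get is an external utility used above; a plausible sketch, assuming it
# fetches the page with requests and wraps the body in PyQuery (it is called as
# pq = helper.get(self.url, myHeaders=self.headers) and then used as a selector).
import requests
from pyquery import PyQuery

def get(url, myHeaders=None):
    resp = requests.get(url, headers=myHeaders, timeout=30)
    resp.raise_for_status()
    return PyQuery(resp.text)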
# Download Pixelmator tutorial videos from vimeo.
# by twinsant
import requests
from pyquery import PyQuery

from savevideo import get_download_links
from savevideo import download_video

if __name__ == '__main__':
    # Get all vimeo urls in the tutorial page
    session = requests.Session()
    r = session.get('http://www.pixelmator.com/tutorials/')
    d = PyQuery(r.text.encode('utf8'))
    hrefs = d('a')
    urls = set()
    for href in hrefs:
        a = PyQuery(href).attr.href
        # Skip anchors without an href (attr.href returns None for those)
        if a and a.startswith('https://vimeo.com'):
            urls.add(a)
    for url in sorted(urls):
        print('Get video links for %s' % url)
        # With help of savevideo.me
        links = get_download_links(url)
        for link in links:
            video_url, profile = link
            # Keep only the standard MP4 version (excludes HD and Mobile)
            if profile.find('(MP4 format)') != -1:
                download_video(video_url)