def parseProductsByCategory(self, category_page_content, category_info):
    """Parse product summaries from a Backcountry category page.

    If the page exposes a 'View all' link, re-fetch that URL first so the
    whole category is listed on one page.
    """
    doc = PyQuery(category_page_content)
    # pyquery injects `this` into the filter callback's globals
    view_all_href = doc('section.main').find('a').filter(
        lambda i: PyQuery(this).text().strip() == 'View all').eq(0).attr('href')
    if view_all_href:
        category_info.url = 'http://www.backcountry.com' + view_all_href
        category_page_content = self.crawler.fetchCategoryPageContent(category_info.url)
    productNodeList = PyQuery(category_page_content)('div#products > div.product')
    productList = []
    for node in productNodeList:
        nodeQ = PyQuery(node)
        productInfo = self.newProduct()
        productInfo['name'] = nodeQ.children('a').attr('title')
        productInfo['product_url'] = 'http://www.backcountry.com' + nodeQ.children('a').attr('href')
        img = nodeQ.children('a > div.ui-pl-img > img[itemprop="image"]')
        # lazy-loaded images keep the real url in data-src
        productInfo['img_url'] = img.attr('src') or img.attr('data-src')
        if productInfo['img_url']:  # FIX: avoid TypeError when neither attr exists
            productInfo['img_url'] = "http:" + productInfo['img_url']
        spanList = nodeQ('div.ui-pl-offers > span.ui-pl-pricing > span')
        if len(spanList) <= 2:  # a price range renders as two spans
            productInfo['price'] = PyQuery(spanList).text().replace(' ', '')
        else:
            productInfo['price'] = PyQuery(spanList).eq(1).text()
            productInfo['label_price'] = PyQuery(spanList).eq(2).text()
        # sku id is embedded in the product url; run the regex only once
        sku_match = re.findall("skid=([\w-]+)&", productInfo['product_url'])
        productInfo['sku_id'] = sku_match[0] if sku_match else ''
        productInfo['reviews'] = nodeQ('div.ui-pl-reviews > span[itemprop="ratingCount"]').text()
        productInfo.set_categories(category_info)
        productList.append(productInfo)
    return productList
def crawl_vvic_category_tree(wb):
    """Fetch vvic.com's home page and write its three-level category tree
    into a new worksheet of the given workbook."""
    h = httplib2.Http()
    response, content = h.request("http://www.vvic.com/")
    ws = wb.add_sheet("vvic品类树")
    ws.write(0, 0, "一级品类")
    ws.write(0, 1, "二级品类")
    ws.write(0, 2, "三级品类")
    row = 0
    doc = PyQuery(content)
    level1NodeList = doc("div.dd-inner > div.item")
    # only sub-item panels 0, 1, 2 and 5 correspond to level-1 entries
    sub_panels = doc('div.sub-items')
    anotherLevel1NodeList = [sub_panels[0], sub_panels[1], sub_panels[2], sub_panels[5]]
    for index, level1Node in enumerate(level1NodeList):
        level1_category = PyQuery(level1Node)('h3 > a').text()
        for level2Node in PyQuery(anotherLevel1NodeList[index]).children('dl'):
            level2NodeQ = PyQuery(level2Node)
            level2_category = level2NodeQ.children('dt > a').text()
            for level3Node in level2NodeQ.children('dd > a'):
                level3_category = PyQuery(level3Node).text()
                row += 1
                ws.write(row, 0, level1_category)
                ws.write(row, 1, level2_category)
                ws.write(row, 2, level3_category)
def parseProductsByCategory(self, category_page_content, category_info):
    """Parse products from one category page, tracking per-category progress.

    self.crawling_category maps category_info -> [pages_parsed, products_crawled];
    the dict is reset whenever a new category is started.
    """
    if category_info not in self.crawling_category:  # FIX: has_key() is Py2-only
        self.crawling_category = {}  # reset: start a new category
        self.crawling_category[category_info] = [1, 0]  # page 1, 0 products so far
    else:
        self.crawling_category[category_info][0] += 1
    doc = PyQuery(category_page_content)
    productList = []
    productNodeList = doc('div#js_proList > ul > li')
    for i, productNode in enumerate(productNodeList):
        productNodeQ = PyQuery(productNode)
        productInfo = self.newProduct()
        part1 = productNodeQ.children('p.pr')
        productInfo['img_url'] = PyQuery(part1).children('a.pic > img').attr('data-original')
        part2 = productNodeQ.children('p.pro_name')
        productInfo['name'] = PyQuery(part2).children('a').text().strip()
        productInfo['product_url'] = PyQuery(part2).children('a').attr('href')
        productInfo['sku_id'] = re.findall('-([\d]+)\.html', productInfo['product_url'])[0]
        part3 = productNodeQ.children('p.pro_price')
        productInfo['price'] = PyQuery(part3).find('strong.my_shop_price').text()
        productInfo.set_categories(category_info)
        productInfo['page_idx'] = str(self.crawling_category[category_info][0])
        productInfo['num_idx'] = str(i + 1)
        productInfo['cate_idx'] = str(self.crawling_category[category_info][1] + 1)
        productList.append(productInfo)
        self.crawling_category[category_info][1] += 1  # one more product crawled
    info('%s has been crawled %d products after parse %d pages。' % (category_info, self.crawling_category[category_info][1], self.crawling_category[category_info][0]))
    return productList
def crawl_vvic_category_tree(wb):
    """Scrape the vvic.com front page and dump the three-level category
    hierarchy into an Excel worksheet of workbook *wb*."""
    http = httplib2.Http()
    _response, content = http.request("http://www.vvic.com/")
    sheet = wb.add_sheet("vvic品类树")
    sheet.write(0, 0, "一级品类")
    sheet.write(0, 1, "二级品类")
    sheet.write(0, 2, "三级品类")
    doc = PyQuery(content)
    top_nodes = doc("div.dd-inner > div.item")
    # panels 0, 1, 2 and 5 are the ones matching the top-level entries
    panels = [doc('div.sub-items')[i] for i in (0, 1, 2, 5)]
    row = 0
    for idx, top_node in enumerate(top_nodes):
        top_name = PyQuery(top_node)('h3 > a').text()
        for dl_node in PyQuery(panels[idx]).children('dl'):
            dl = PyQuery(dl_node)
            mid_name = dl.children('dt > a').text()
            for leaf in dl.children('dd > a'):
                leaf_name = PyQuery(leaf).text()
                row += 1
                sheet.write(row, 0, top_name)
                sheet.write(row, 1, mid_name)
                sheet.write(row, 2, leaf_name)
def test_mark_dirty():
    """Marking a node dirty flags it and its ancestors, not its descendants."""
    tree = PyQuery(
        '<a data-riot-id="0"><b data-riot-id="0.0"><c data-riot-id="0.0.0"></c></b></a>'
    )
    mark_dirty(tree.children('b'))
    # the marked node and its parent carry the flag
    assert tree.attr['data-riot-dirty'] == 'true'
    assert tree.children('b').attr['data-riot-dirty'] == 'true'
    # the child below the marked node stays clean
    assert not tree.children('c').attr['data-riot-dirty']
def parseSupplierContactPage(m):
    """Fill merchant dict *m* from its 1688 contact-info page.

    Example: http://yongjia.1688.com/shop/aofudianqi/page/contactinfo.htm?
    Scrapes medal/grade/biz-type/contact, then satisfaction (JSON endpoint)
    and product counts from the offer-list pages.
    """
    # FIX: the original tested m['url'].find('\?'), i.e. a literal
    # backslash + '?', which never occurs in a url; the intent is the
    # query separator '?'.
    if m['url'].find('?') > 0:
        pass  # url already carries a query part
    else:
        if m['url'].endswith("/"):
            m['url'] = m['url'][:-1]
        m['url'] = m['url'] + '?'
    # build the contact-info page url
    contact_page_url = re.sub("\?.*$", '/page/contactinfo.htm', m['url'])
    content = fetchContent(contact_page_url)
    doc = PyQuery(content)
    # satisfaction is loaded dynamically on the page; fetched via JSON below
    trade_medal = doc('div.detail > div.trade-medal > span.disc > a.image > img').eq(0).attr('alt')
    m['trade_medal'] = trade_medal if trade_medal else ''
    m['supply-grade'] = len(doc('div.detail > div.supply-grade > span.disc > a.image > img'))
    m['biz-type'] = doc('div.detail > div.biz-type > span').text()
    if not m['biz-type']:
        m['biz-type'] = doc('div.detail > div.smt-biz-type > span').text()
    aList = doc('div.contcat-desc > dl')
    bList = []
    for item in aList:
        itemQ = PyQuery(item)
        text = itemQ.children('dt').text()
        if text.find(u"话") > 0:  # phone-like rows ("电 话", "移动电话", ...)
            bList.append(itemQ.children('dd').text())
    m['contact'] = ', '.join(bList)
    # satisfaction comes from a JSON endpoint, e.g.
    # http://rate.1688.com/stat/trade/winport.json?memberId=aofudianqi&sati=1
    member_ids = re.findall('shop/(.*)/page', contact_page_url)
    if member_ids:
        merchantId = member_ids[0]
        stat_url = 'http://rate.1688.com/stat/trade/winport.json?memberId=' + merchantId + '&sati=1'
        content2 = fetchContent(stat_url)
        json_data = json.loads(content2)
        m['satisfication'] = json_data['data']['sati']['satisfaction']
        # total product count vs. products with sales history, e.g.
        # http://yiwu.1688.com/shop/ywzxbh03/page/offerlist.htm?tradenumFilter=true
        all_products_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm?tradenumFilter=true'
        active_product_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm'
        doc3 = PyQuery(fetchContent(all_products_url))
        m['products_count'] = extractNum(doc3('li[class="offer-list-tab-title current"] > a > em').text())
        if m['products_count'] == 0:
            m['products_count'] = doc3('ul[data-sp="paging-a"] > li > em.offer-count').text()
        doc4 = PyQuery(fetchContent(active_product_url))
        m['active_products_count'] = extractNum(doc4('li[class="offer-list-tab-title current"] > a > em').text())
        if m['active_products_count'] == 0:
            m['active_products_count'] = doc4('ul[data-sp="paging-a"] > li > em.offer-count').text()
    else:
        m['satisfication'] = ''
def parse_xml_to_xmljson(node):
    """Recursively convert an XML element into a JSON-friendly dict with
    'tag', optional 'attributes', and either 'children' or 'text'."""
    pq = PyQuery(node)
    element = pq[0]
    result = {'tag': element.tag}
    attributes = dict(element.attrib)
    if attributes:
        result['attributes'] = attributes
    if len(pq.children()) > 0:
        result['children'] = [parse_xml_to_xmljson(child) for child in pq.children()]
    else:
        result['text'] = pq.text()
    return result
def parseCategories():
    # NOTE(review): `doc` and `self` are not defined in this scope — this
    # function looks lifted out of a crawler class and will raise NameError
    # if called as-is. TODO: confirm the intended enclosing scope.
    categoryList = []
    node = doc("ul#categrayAll > li").eq(0)  # only the first level-1 category is crawled
    nodeQ = PyQuery(node)
    level1Name = nodeQ.children('div.li > a > em').text()
    level2NodeList = nodeQ.children('div.sub-list > div > ul.column > li.level1')
    for level2Node in level2NodeList:
        level2NodeQ = PyQuery(level2Node)
        category = self.newCategory()
        category.name = level2NodeQ.children('a').text()
        category.url = level2NodeQ.children('a').attr("href")
        category.parent_categories = [level1Name]
        categoryList.append(category)
    return categoryList
def parseCategories():
    # NOTE(review): references `doc` and `self` without defining them —
    # presumably a method extracted from a crawler class; calling it as a
    # free function raises NameError. Verify against the original class.
    categoryList = []
    node = doc("ul#categrayAll > li").eq(0)  # only the first level-1 category is crawled
    nodeQ = PyQuery(node)
    level1Name = nodeQ.children('div.li > a > em').text()
    level2NodeList = nodeQ.children(
        'div.sub-list > div > ul.column > li.level1')
    for level2Node in level2NodeList:
        level2NodeQ = PyQuery(level2Node)
        category = self.newCategory()
        category.name = level2NodeQ.children('a').text()
        category.url = level2NodeQ.children('a').attr("href")
        category.parent_categories = [level1Name]
        categoryList.append(category)
    return categoryList
def paper_page(self, url):
    """Fetch one paper page and return [title, authors..., abstract] lines;
    the abstract is machine-translated en -> zh-CN."""
    response = self.connect(url)
    container = PyQuery(response.text)('.container')
    lines = []
    lines.append(container.children('h2').text() + '\n')
    authors = container.children('p').text().split('; ')
    for name in authors:
        lines.append('\t' + name + '\n')
    abstract_text = container.children('.row').children('.col-lg-9').children(
        'p').text()
    abstract_text = translate('en', 'zh-CN', abstract_text)
    lines.append('\t' + abstract_text + '\n')
    return lines
def render_svg(image, width=None, height=None, request=None, css_class='', img_class='', alt=''):
    """Render SVG file"""
    # pylint: disable=too-many-arguments
    options = {}
    if width or height:
        # FIX: only emit the dimensions actually supplied; the original
        # produced e.g. "height: None;" when just one of them was given.
        parts = []
        if width:
            parts.append('width: {0}{1};'.format(
                width, 'px' if isinstance(width, int) else ''))
        if height:
            parts.append('height: {0}{1};'.format(
                height, 'px' if isinstance(height, int) else ''))
        options['style'] = ' '.join(parts)
    else:
        options['style'] = ''
    options['css_class'] = css_class
    if alt or img_class:
        svg = PyQuery(image.data)
        if alt:
            # wrap all children in a <g> carrying a <title> (accessibility text)
            group = PyQuery('<g></g>')
            group.append(PyQuery('<title />').text(alt))
            for child in svg.children():
                group.append(child)
            svg.empty().append(group)
        if img_class:
            svg.attr('class', img_class)
        options['svg'] = svg.outer_html()
    else:
        options['svg'] = image.data
    return render('templates/svg-render.pt', options, request)
def search(self, word, limit=None):
    """Query the auction search endpoint for *word* and return the parsed
    listings, or an error payload when nothing matched."""
    response = requests.get(URL, params={
        'p': word,
        'min': 1,
        'price_type': 'bidorbuyprice',
        's1': 'score2',
        'o1': 'a',
    })
    doc = PyQuery(response.text)
    results = []
    for row in doc("div#list01")('table')('tr').not_('.la'):
        row = PyQuery(row)
        if not row.children('td.i'):
            continue  # rows without an image cell are not listings
        anchor = row('td.a1')('a')
        price_text = row('td.pr2').text().replace(',', '').replace('円', '')
        entry = {
            'title': anchor.text(),
            'url': anchor.attr('href'),
            'img': row('td.i')('img').attr('src'),
            'price': int(price_text),
        }
        results.append(entry)
    if results:
        return {"status": 'success', "results": results}
    else:
        return {"status": 'error', "error_detail": "Nothing found."}
def _add_nested(self, k, el):
    """Parse nested element by its children."""
    # NOTE(review): parameter `k` is never used in this body — confirm
    # whether it is required by the caller's interface.
    el = Pq(el)
    tagname = Pq(el)[0].tag
    if tagname in self.invalid_tags:
        return
    id = self._format_id(el.attr('id'))
    classes = self._format_classes(el.attr('class'))
    selector = self._format_selector(el, id, classes)
    children = Pq(el).children()
    if not self._is_root_body_node(el):
        return
    # Add for single nodes only
    if not children:
        self.selectors.add(selector)
    # Build nested css by traversing all child nodes and getting
    # their attributes.
    # NOTE(review): `selector` keeps accumulating across siblings in the
    # inner loop, and `children` is reassigned inside the for-loop, so only
    # the LAST sibling's children drive the next while-iteration —
    # presumably intentional, but worth confirming.
    while children:
        for child in children:
            # 1. Add current
            self.selectors.add(selector)
            # 2. Add child
            child = Pq(child)
            selector += self._add_id_and_classes(child)
            self.selectors.add(selector)
            # # 3. Move to next children
            children = child.children()
def get_inline_snippets(html):
    """Return each inline <code> element found via the 'p code' selector,
    re-wrapped as a '<code>...</code>' string."""
    document = PyQuery(html)
    snippets = []
    for code in document.children("p code").items():
        snippets.append("<code>" + code.text() + "</code>")
    return snippets
def paper_page(self, url):
    """Collect title, author line and translated abstract from one
    proceedings-detail page."""
    req = self.connect(url)
    page_pyquery = PyQuery(req.text)
    post_pyquery = page_pyquery('.container-fluid.proceedings-detail')
    output = []
    paper_header = PyQuery(post_pyquery('.row')[0])
    # NOTE(review): children()[0] yields a bare lxml element, which is then
    # called like a PyQuery object ('h1', 'h2') below — confirm this works
    # with the pyquery/lxml versions in use.
    paper_header = paper_header.children()[0]
    title = paper_header('h1').text()
    output.append(title + '\n')
    author = paper_header('h2').text()
    output.append('\t' + author + '\n')
    contain = PyQuery(post_pyquery('.row')[2])
    # NOTE(review): contain.children()[0] is an lxml element; lxml exposes
    # `.text` as an attribute, not a .text() method — verify this call.
    abstract = contain.children()[0].text()
    abstract = self.translate_en2cn(abstract)
    output.append('\t' + abstract + '\n')
    return output
def parseCategories(self, homepage_content):
    """Build the category list from the home-page department navigation."""
    doc = PyQuery(homepage_content)
    categoryList = []
    topNodes = doc(
        "div#cms_page_922 > div[class='js-template-display js-template dept_wrap ']"
    ).find("div.topnav")
    for topNode in topNodes:
        topQ = PyQuery(topNode)
        topName = topQ.children(
            "div > span[class='js-lego-data lego_text_field '] > a").text().strip()
        # the trailing GIFTS / SALES tabs are not product departments
        if topName.upper() in ['GIFTS', 'SALES']:
            continue
        linkNodes = topQ(
            'div[class="nav_link_block_title nav_link_block_text"] > span > a')
        for linkNode in linkNodes:
            categoryInfo = self.createCategory(PyQuery(linkNode))
            categoryInfo.parent_categories = [topName]
            categoryList.append(categoryInfo)
    return categoryList
def parseProductsAndCategoriesByCategory(self, category_page_content, category_info):
    """Parse a category page into either products (leaf categories, two
    parents deep) or the next level of sub-categories."""
    doc = PyQuery(category_page_content)
    productList, categoryList = [], []
    # a category with two ancestors is a leaf: parse its products directly
    if category_info.parent_categories and len(category_info.parent_categories) == 2:
        productList = self.parseProductsByCategory(category_page_content, category_info)
        return productList, categoryList
    if category_info.name == 'New Arrivals':  # this page has a special layout
        for level2Node in doc.find('div#js_catelist_sec > div.item'):
            level2NodeQ = PyQuery(level2Node)
            level2CateName = level2NodeQ.children('p > a').text()
            for level3Node in level2NodeQ.children('ul > li > a'):
                categoryInfo = self.createCategory(PyQuery(level3Node))
                categoryInfo.parent_categories = [category_info.name, level2CateName]
                categoryList.append(categoryInfo.formalize())
        return productList, categoryList
    if category_info.name == 'Clearance':
        for level2Node in doc('div.catelist > ul.cataUl_list > li > a'):
            categoryInfo = self.createCategory(PyQuery(level2Node))
            categoryInfo.parent_categories = ['Clearance']
            categoryList.append(categoryInfo.formalize())
        return productList, categoryList
    if doc.find('div#js_catelist_sec > div.cur > ul > li'):
        for node in doc.find('div#js_catelist_sec > div.cur > ul > li > a'):
            nodeQ = PyQuery(node)
            categoryInfo = self.newCategory()
            categoryInfo.name = nodeQ.text()
            categoryInfo.url = nodeQ.attr('href')
            categoryInfo.set_categories(category_info)
            categoryList.append(categoryInfo.formalize())
    elif doc.find('div.catelist > ul > li.cur > div.menuList > p'):
        for node in doc.find('div.catelist > ul > li.cur > div.menuList > p > a'):
            nodeQ = PyQuery(node)
            categoryInfo = self.newCategory()
            categoryInfo.name = nodeQ.text()
            categoryInfo.url = nodeQ.attr('href')
            # extend the ancestor chain with the current category's name
            if category_info.parent_categories:
                categoryInfo.parent_categories = category_info.parent_categories + [category_info.name]
            else:
                categoryInfo.parent_categories = [category_info.name]
            categoryList.append(categoryInfo.formalize())
    else:
        # no sub-category navigation found: treat the page as a leaf
        productList = self.parseProductsByCategory(category_page_content, category_info)
    return productList, categoryList
def test_render_each_to_document():
    """An `each` expression expands one child per item of the bound list."""
    document = PyQuery('<custom data-riot-id="0"><button label="{ label }" each="{ items }" data-riot-id="0.0"></button></custom>')
    expressions = parse_document_expressions(document)
    context = {'items': [{'label': 'first'}, {'label': 'second'}]}
    render_document(expressions, context)
    assert document.attr['data-riot-dirty'] == 'true'
    assert len(document.children()) == 2
    buttons = document('button')
    assert buttons.eq(0).attr.label == 'first'
    assert buttons.eq(1).attr.label == 'second'
def tokenize_code(html):
    """Collect the direct text of every element matching 'pre code'."""
    document = PyQuery(html)
    # iterating a PyQuery yields lxml elements; .text is the element's text
    return [snippet.text for snippet in document.children("pre code")]
def parseCategories(self, homepage_content):
    """Parse level-1/level-2 categories from the header navigation menu."""
    doc = PyQuery(homepage_content)
    menuNodes = doc('ul#header-navigation-menu > li.menu-container')
    categoryList = []
    # skip the first three and the last two non-category menu entries
    for menuNode in menuNodes[3:10]:
        menuQ = PyQuery(menuNode)
        topName = menuQ.children('a').text()
        for subNode in menuQ.children('div > ul:first > li.indent-child > span'):
            subQ = PyQuery(subNode)
            categoryInfo = self.newCategory()
            categoryInfo.name = subQ.text()
            categoryInfo.url = subQ.attr('href')
            categoryInfo.parent_categories = [topName]
            categoryList.append(categoryInfo.formalize())
    return categoryList
def parseCategories(self, homepage_content):
    """Collect second-level categories under each top navigation entry."""
    doc = PyQuery(homepage_content)
    allMenuItems = doc('ul#header-navigation-menu > li.menu-container')
    results = []
    # entries 0-2 and 10+ are utility links, not categories
    for item in allMenuItems[3:10]:
        itemQ = PyQuery(item)
        parentName = itemQ.children('a').text()
        childNodes = itemQ.children(
            'div > ul:first > li.indent-child > span')
        for childNode in childNodes:
            childQ = PyQuery(childNode)
            entry = self.newCategory()
            entry.name = childQ.text()
            entry.url = childQ.attr('href')
            entry.parent_categories = [parentName]
            results.append(entry.formalize())
    return results
def __convert_to_ebook(self, book):
    """Map one search-result node onto an Ebook instance.

    Pulls cover image, title, author (text before the first '|') and the
    whole+fraction price parts from the result markup.
    """
    row = PyQuery(book.find('div.sg-row')[1])
    image_wrap = PyQuery(row.children()[0])
    info_wrap = PyQuery(row.children()[1]).find('.sg-row')
    title_author_wrap = PyQuery(info_wrap[0])
    price_wrap = PyQuery(info_wrap[1])
    price = price_wrap.find('.a-price-whole').text() + \
        price_wrap.find('.a-price-fraction').text()
    ebook = Ebook()
    ebook.title = title_author_wrap.find('h2').text()
    ebook.author = title_author_wrap.find('h2')\
        .next().text().split('|')[0].strip()
    ebook.cover = image_wrap\
        .find('[data-component-type="s-product-image"] img')\
        .attr('src')
    # FIX: the original first assigned ebook.price from '.u-price em' and
    # then immediately overwrote it with float(price); the dead store is gone.
    ebook.price = float(price)
    return ebook
def tokenize_HTML(html):
    """Flatten a post's <p> text into one string, replacing inline code
    snippet text with the CODE_ELEMENT_NN placeholder token."""
    sentences = ""
    pq = PyQuery(html)
    for code in pq.children("p code").items():
        snippet = code.text()
        # FIX: guard against empty snippets — str.replace("", token) would
        # inject the token between every character of the document.
        if snippet:
            html = html.replace(snippet, "CODE_ELEMENT_NN")
    # strip the now-empty code tags before re-parsing
    html = html.replace("<code>", "")
    html = html.replace("</code>", "")
    new_pq = PyQuery(html)
    for p in new_pq.children("p").items():
        sentences = sentences + p.text() + " "
    return sentences
def parseCategories(self, homepage_content):
    """Crawl the first six level-1 categories, three levels deep."""
    doc = PyQuery(homepage_content)
    categoryList = []
    for topNode in doc('ul.pet-main-nav > li.pet-main-nav-item-level1')[:6]:
        topQ = PyQuery(topNode)
        topName = topQ.children('a > span').text()
        # hard-coded: only the first two level-2 groups are taken
        for midNode in topQ.children('div > div > ul > li')[:2]:
            midQ = PyQuery(midNode)
            midName = midQ.children('a > span').text()
            for leafNode in midQ.children('ul > li > a'):
                leafQ = PyQuery(leafNode)
                categoryInfo = self.newCategory()
                categoryInfo.name = leafQ.children('span').text()
                categoryInfo.url = leafQ.attr('href')
                categoryInfo.parent_categories = [topName, midName]
                categoryList.append(categoryInfo)
    return categoryList
def parseCategories(self, homepage_content):
    """Extract up to two levels of categories from the home-page nav."""
    doc = PyQuery(homepage_content)
    categoryList = []
    for level1Node in doc('nav#nav > div.w > ul > li'):
        level1NodeQ = PyQuery(level1Node)
        if not level1NodeQ.children('div.sub_menu'):
            # no drop-down: the entry itself is a category
            categoryList.append(self.createCategory(level1NodeQ.children('a')))
            continue
        level1Name = level1NodeQ.children('a').text()
        dlNodes = level1NodeQ.children('div.sub_menu > div.leftWrap > div.leftTitle > dl')
        for dlNode in dlNodes:
            dlQ = PyQuery(dlNode)
            if dlQ.find('dt'):
                # dt rows repeat the level-2 name; skip them
                continue
            elif dlQ.find('dd'):
                categoryInfo = self.createCategory(dlQ.children('dd > a'))
                categoryInfo.parent_categories = [level1Name]
                categoryList.append(categoryInfo)
    return categoryList
def test_render_each_to_document():
    """Rendering `each` over two items yields two sibling buttons."""
    markup = '<custom data-riot-id="0"><button label="{ label }" each="{ items }" data-riot-id="0.0"></button></custom>'
    document = PyQuery(markup)
    expressions = parse_document_expressions(document)
    items = [{'label': 'first'}, {'label': 'second'}]
    render_document(expressions, {'items': items})
    assert document.attr['data-riot-dirty'] == 'true'
    assert len(document.children()) == 2
    assert document('button').eq(0).attr.label == 'first'
    assert document('button').eq(1).attr.label == 'second'
def parseCategories(self, homepage_content):
    """Parse level-1/level-2 categories from the flyout navigation.

    The Gifts and Brands <li> entries carry data-id 0 and 1 and are
    excluded; each remaining tab's flyout panel (matched by data-cat-id)
    supplies the level-2 links.
    """
    doc = PyQuery(homepage_content)
    # CONSISTENCY FIX: both filters now use the explicit (i, this) callback
    # signature; the first one previously relied on pyquery injecting `this`
    # into the lambda's globals while the second spelled it out.
    level1NodeList = doc('ul.js-flyout-nav > li').filter(
        lambda i, this: PyQuery(this).attr('data-id') > '1')
    categoryList = []
    for level1Node in level1NodeList:
        level1NodeQ = PyQuery(level1Node)
        level1Name = level1NodeQ.children('a').text()
        level2NodeList = doc('div').filter(
            lambda i, this: PyQuery(this).attr('data-cat-id') == level1NodeQ.attr('data-id')
        ).children('a')
        for level2Node in level2NodeList:
            level2NodeQ = PyQuery(level2Node)
            # skip placeholder anchors: <a class="" href="" data-title=""/>
            if not level2NodeQ.attr('class') or not level2NodeQ.text():
                continue
            categoryInfo = self.newCategory(
                level2NodeQ.text(),
                'http://www.backcountry.com' + level2NodeQ.attr('href'),
                [level1Name])
            categoryList.append(categoryInfo)
    return categoryList
def parseProducts(category_page_content):
    """Extract hot-product table rows into lists of
    [img, url, name, col2-text, col4-text, col8-text, numeric-id]."""
    doc = PyQuery(category_page_content)
    productList = []
    for node in doc('div#categoryHotProductTable > table.responstable > tbody > tr'):
        cells = PyQuery(node).children('td')
        link = PyQuery(cells[1]).children('a')
        product = [
            PyQuery(cells[0]).children('img').attr('ng-src'),
            link.attr('href'),
            link.text(),
            PyQuery(cells[2]).text(),
            PyQuery(cells[4]).text(),
            PyQuery(cells[8]).text(),
        ]
        # trailing numeric id from the product url; no validation performed
        product.append(re.findall('/([\d]+)$', product[1])[0])
        productList.append(product)
    return productList
def parseProducts(category_page_content):
    """Parse the hot-product table into a list of per-row field lists."""
    doc = PyQuery(category_page_content)
    rows = doc(
        'div#categoryHotProductTable > table.responstable > tbody > tr')
    productList = []
    for row in rows:
        tds = PyQuery(row).children('td')
        record = []
        record.append(PyQuery(tds[0]).children('img').attr('ng-src'))
        record.append(PyQuery(tds[1]).children('a').attr('href'))
        record.append(PyQuery(tds[1]).children('a').text())
        record.append(PyQuery(tds[2]).text())
        record.append(PyQuery(tds[4]).text())
        record.append(PyQuery(tds[8]).text())
        # numeric id at the end of the url — no error checking, as before
        record.append(re.findall('/([\d]+)$', record[1])[0])
        productList.append(record)
    return productList
def parseCategories(self, homepage_content):
    """Parse the full category tree from the home page.

    Menus like TOPS/BOTTOMS expose three levels; the others stop at two.
    WHAT'S NEW and Style Gallery are skipped; DRESSES has a special layout.
    """
    doc = PyQuery(homepage_content)
    categoryList = []
    for level1Node in doc('ul#TS_menu > li.subCatName'):
        level1NodeQ = PyQuery(level1Node)
        level1CateName = level1NodeQ.children('a').text().strip()
        if level1CateName == "WHAT'S NEW" or level1CateName == "Style Gallery":
            continue
        if level1CateName.upper() == 'DRESSES':
            # DRESSES: one <li> holding several <div> column blocks
            for leafNode in level1NodeQ.children('div.Second_ca > ul.loop_ul > li > div').find('li > a'):
                categoryInfo = self.createCategory(PyQuery(leafNode))
                categoryInfo.parent_categories = [level1CateName]
                categoryList.append(categoryInfo)
            continue
        # iterating all <ul> (not just ul.loop_ul) keeps the SALE column
        for colNode in level1NodeQ.children('div.Second_ca > ul'):
            for level2Node in PyQuery(colNode).children('li'):
                level2NodeQ = PyQuery(level2Node)
                if level2NodeQ.children('div'):  # a nested div marks a third level
                    level2CateName = level2NodeQ.children('a').text()
                    for level3Node in level2NodeQ.children('div').find('li > a'):
                        categoryInfo = self.createCategory(PyQuery(level3Node))
                        categoryInfo.parent_categories = [level1CateName, level2CateName]
                        categoryList.append(categoryInfo)
                else:
                    categoryInfo = self.createCategory(PyQuery(level2NodeQ.children('a')))
                    categoryInfo.parent_categories = [level1CateName]
                    categoryList.append(categoryInfo)
    return categoryList
def parseStorePage(product):
    """Scrape the 1688 store contact page for supplier details, caching the
    scraped fields per contact url in the module-level store_info dict."""
    store_url = product['store_url']
    index = store_url.find('1688.com')
    store_url = store_url[:index]
    contact_url = store_url + '1688.com/page/contactinfo.htm'
    if contact_url in store_info:  # FIX: has_key() is Py2-only; `in` works everywhere
        # cache hit: copy previously scraped fields onto this product
        for key in store_info[contact_url].keys():
            product[key] = store_info[contact_url][key]
    else:
        content = fetchContent(contact_url)
        store_info[contact_url] = {}
        doc = PyQuery(content)
        product['supplier_name'] = doc(
            'div.detail > div.contactSeller > span.disc').text()
        if not product['supplier_name']:
            product['supplier_name'] = doc(
                'div.detail > div.contactSeller').remove('label').text()
        store_info[contact_url]['supplier_name'] = product['supplier_name']
        product['city'] = doc('div.detail > div.address > span.disc').text()
        if not product['city']:
            product['city'] = doc('div.detail').find('div.address').remove(
                'label').text()
        store_info[contact_url]['city'] = product['city']
        product['mobile'], product['telephone'], product[
            'store_address'] = '', '', ''
        for node in doc('div.contcat-desc > dl'):
            nodeQ = PyQuery(node)
            # FIX: parenthesized print — valid in both Python 2 and 3
            print(nodeQ.children('dt').text())
            label = nodeQ.children('dt').text().strip()
            if label == u'电 话:':
                product['telephone'] = nodeQ.children('dd').text()
                store_info[contact_url]['telephone'] = product['telephone']
            if label == u'地 址:':
                product['store_address'] = nodeQ.children('dd').text()
                store_info[contact_url]['store_address'] = product[
                    'store_address']
            if label == u'移动电话:':
                product['mobile'] = nodeQ.children('dd').text()
                store_info[contact_url]['mobile'] = product['mobile']
    return product
def __init__(self, url: str, pq_obj: pyquery.PyQuery):
    """Parse one server-list table row (PyQuery of its cells) into fields.

    Assumed column layout (TODO: confirm against the source page):
      0: country flag <img>; 2: sessions / uptime days; 3: bandwidth / ping;
      6: link whose query string carries ip/tcp/udp/sid/hid; 9: score.
    """
    self.url = url
    # country code taken from the flag image filename, e.g. ".../jp.png" -> "jp"
    self.country = pq_obj.children().eq(0)('img').attr('src').split('/')[-1].split('.')[0]
    self.session_number = self.__to_int__(pq_obj.children().eq(2).text().split(self.__nl__)[0].split()[0])
    self.alive_days = self.__to_int__(pq_obj.children().eq(2).text().split(self.__nl__)[1].split()[0])
    self.bandwidth = self.__to_float__(pq_obj.children().eq(3).text().split(self.__nl__)[0].split()[0])
    self.ping = self.__to_int__(pq_obj.children().eq(3).text().split(self.__nl__)[1].split()[1])
    self.score = pq_obj.children().eq(9).text().replace(',', '')  # strip thousands separators
    # the link's query string is split on '&'; fields 1..5 are taken as
    # key=value pairs for ip/tcp/udp/sid/hid (presumed order — verify)
    href_list = pq_obj.children().eq(6)('a').attr('href').split('?')[1].split('&')
    self.ip = href_list[1].split(self.__eq_sign__)[1]
    self.tcp = href_list[2].split(self.__eq_sign__)[1]
    self.udp = href_list[3].split(self.__eq_sign__)[1]
    self.sid = href_list[4].split(self.__eq_sign__)[1]
    self.hid = href_list[5].split(self.__eq_sign__)[1]
    self.link = None  # not computed here; presumably populated elsewhere
def tokenize_HTML(html):
    """Concatenate the text of all elements matching 'p' into one string."""
    sentences = ""
    pq = PyQuery(html)
    for p in pq.children("p").items():
        sentences = sentences + p.text() + " "
    # NOTE: the original also scanned QUALITY_WORDS into a flag/adj_words
    # pair whose only consumer was a commented-out debug print; that dead
    # code has been removed — the return value is unchanged.
    return sentences
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    """Recursively convert an HTML element into a nested content dict.

    Returns {'type', 'attrs', 'layout', 'contents', 'extra'}; 'contents'
    is a list of child dicts when the element has element children,
    otherwise its inner HTML.
    """
    # Find if has children
    elem = PyQuery(elem)
    children = list(elem.contents())
    has_children = len(elem.children()) > 0
    contents = []
    if has_children:
        # Fix unwrapped children
        # (only on the first level — recursion passes already_wrapped=True)
        if not already_wrapped:
            children = fix_unwrapped_text(elem).contents()
        for child in children:
            child_dict = build_dict_from_sane_json(child, already_wrapped=True)
            if child_dict:
                contents.append(child_dict)
    else:
        contents = elem.html()
    extra = {}
    # Only tables need the HTML (to use later for extraction of relevant data)
    if elem.is_("table"):
        extra = {'original_html': str(elem)}
    if 'src' in elem[0].attrib:
        extra['src'] = elem.attr('src')
    if 'href' in elem[0].attrib:
        extra['href'] = elem.attr('href')
    tag_type = list(elem)[0].tag
    # map raw tags to canonical types; PRE_CONTENTS_MATCH may override contents
    tag_type_mapped = PRE_TAG_MATCH.get(tag_type, tag_type)
    contents = PRE_CONTENTS_MATCH.get(tag_type, contents)
    return {
        'type': tag_type_mapped,
        'attrs': [],
        'layout': {},
        'contents': contents,
        'extra': extra
    }
def parseCategories(self, homepage_content):
    """Collect department categories from the home-page top navigation."""
    doc = PyQuery(homepage_content)
    results = []
    deptNodes = doc("div#cms_page_922 > div[class='js-template-display js-template dept_wrap ']").find("div.topnav")
    for deptNode in deptNodes:
        deptQ = PyQuery(deptNode)
        deptName = deptQ.children("div > span[class='js-lego-data lego_text_field '] > a").text().strip()
        # the final GIFTS / SALES tabs are not product departments
        if deptName.upper() in ['GIFTS', 'SALES']:
            continue
        for linkNode in deptQ('div[class="nav_link_block_title nav_link_block_text"] > span > a'):
            entry = self.createCategory(PyQuery(linkNode))
            entry.parent_categories = [deptName]
            results.append(entry)
    return results
def parseStorePage(product):
    """Scrape supplier details from the 1688 store contact page; results
    are cached per contact url in the module-level store_info dict."""
    store_url = product["store_url"]
    index = store_url.find("1688.com")
    store_url = store_url[:index]
    contact_url = store_url + "1688.com/page/contactinfo.htm"
    if contact_url in store_info:  # FIX: dict.has_key() was removed in Python 3
        # cache hit: copy previously scraped fields onto this product
        for key in store_info[contact_url].keys():
            product[key] = store_info[contact_url][key]
    else:
        content = fetchContent(contact_url)
        store_info[contact_url] = {}
        doc = PyQuery(content)
        product["supplier_name"] = doc("div.detail > div.contactSeller > span.disc").text()
        if not product["supplier_name"]:
            product["supplier_name"] = doc("div.detail > div.contactSeller").remove("label").text()
        store_info[contact_url]["supplier_name"] = product["supplier_name"]
        product["city"] = doc("div.detail > div.address > span.disc").text()
        if not product["city"]:
            product["city"] = doc("div.detail").find("div.address").remove("label").text()
        store_info[contact_url]["city"] = product["city"]
        product["mobile"], product["telephone"], product["store_address"] = "", "", ""
        for node in doc("div.contcat-desc > dl"):
            nodeQ = PyQuery(node)
            # FIX: parenthesized print — valid in both Python 2 and 3
            print(nodeQ.children("dt").text())
            label = nodeQ.children("dt").text().strip()
            if label == u"电 话:":
                product["telephone"] = nodeQ.children("dd").text()
                store_info[contact_url]["telephone"] = product["telephone"]
            if label == u"地 址:":
                product["store_address"] = nodeQ.children("dd").text()
                store_info[contact_url]["store_address"] = product["store_address"]
            if label == u"移动电话:":
                product["mobile"] = nodeQ.children("dd").text()
                store_info[contact_url]["mobile"] = product["mobile"]
    return product
def qichacha_search_result(j: PyQuery) -> dict:
    """Extract one company record from a qichacha search-result row."""
    # The third child cell of the row holds all company details.
    cell = j.children().eq(2)
    name = cell.children('a').text()
    first_p = cell.children('p').eq(0)
    legal_rep = first_p.children('a').text()
    # Spans are "label:value" — keep the part after the colon.
    capital = first_p("span:first").text().split(':')[-1].strip('-')
    established = first_p('span:last').text().split(':')[-1]
    contact_p = cell('p').eq(-3)
    # Clone and drop child elements so only the bare text (the e-mail) remains.
    stripped = contact_p.clone()
    stripped.children().remove()
    mail = stripped.text().split(':')[-1].strip('-')
    tel = contact_p.find('span').text().split(':')[-1].strip(' ').strip('-')
    address = cell.find('p').eq(2).text().split(':')[-1]
    return {
        'company_name': name,
        'legal_representative': legal_rep,
        'registered_capital': capital,
        'date_of_establishment': established,
        'email': mail,
        'phone': tel,
        'register_address': address,
    }
def parse(self, content: str):
    """
    Parse html into ``self.parsed_objects`` and return self (fluent).

    If the root has no element children, only the root element itself is
    parsed; otherwise each node in the root's contents is parsed, keeping
    truthy results and silently skipping nodes that fail to parse.

    :param content: raw HTML string
    :return: self
    """
    root = PyQuery(content)
    parsed_items = []
    if not root.children():
        # No child elements: parse the root element directly.
        parsed_items.append(self.__parse__(root[0]))
    else:
        for node in root.contents():
            try:
                item = self.__parse__(node)
            except Exception:
                continue  # best-effort: unparseable nodes are dropped
            if item:
                parsed_items.append(item)
    self.parsed_objects = parsed_items
    return self
def parseCategories(self, homepage_content):
    '''Parse every category path straight off the homepage menu.

    TOPS / BOTTOMS style menus yield three category levels; the other
    top-level menus stop at two.  DRESSES uses a different column layout
    and is handled as a special case.
    '''
    doc = PyQuery(homepage_content)
    categories = []
    for top_node in doc('ul#TS_menu > li.subCatName'):
        top = PyQuery(top_node)
        top_name = top.children('a').text().strip()
        if top_name in ("WHAT'S NEW", "Style Gallery"):
            continue
        if top_name.upper() == 'DRESSES':
            # DRESSES: multiple <div> columns under a single li, links one
            # level deeper than in the generic layout.
            for link in top.children('div.Second_ca > ul.loop_ul > li > div').find('li > a'):
                info = self.createCategory(PyQuery(link))
                info.parent_categories = [top_name]
                categories.append(info)
            continue
        # ul.loop_ul also covers the SALE menu.
        for col_node in top.children('div.Second_ca > ul'):
            for mid_node in PyQuery(col_node).children('li'):
                mid = PyQuery(mid_node)
                if mid.children('div'):
                    # A nested div means this entry has third-level children.
                    mid_name = mid.children('a').text()
                    for leaf in mid.children('div').find('li > a'):
                        info = self.createCategory(PyQuery(leaf))
                        info.parent_categories = [top_name, mid_name]
                        categories.append(info)
                else:
                    info = self.createCategory(PyQuery(mid.children('a')))
                    info.parent_categories = [top_name]
                    categories.append(info)
    return categories
def parseStorePage(product):
    """Fill supplier contact fields on *product* from its 1688 shop contact page.

    Responses are memoized in the module-level ``store_info`` dict keyed by
    the contact-page URL, so each store is fetched at most once.

    Fix: replaced the Python-2-only ``dict.has_key`` with an ``in`` test so
    the function runs under Python 3 (which this file's annotated functions
    require); also hoisted the repeated ``dt`` label lookup.
    """
    store_url = product['store_url']
    # Normalize: drop a trailing slash before appending the contact path.
    if store_url.endswith('/'):
        store_url = store_url[:-1]
    contact_url = store_url + '/page/contactinfo.htm'
    if contact_url in store_info:
        # Cache hit: copy the previously scraped fields onto this product.
        for key, value in store_info[contact_url].items():
            product[key] = value
    else:
        content = fetchContent(contact_url)
        store_info[contact_url] = {}
        doc = PyQuery(content)
        product['supplier_name'] = doc('div.detail > div.contactSeller > span.disc').text()
        if not product['supplier_name']:
            # Fallback layout: strip the <label> and keep the remaining text.
            product['supplier_name'] = doc('div.detail > div.contactSeller').remove('label').text()
        store_info[contact_url]['supplier_name'] = product['supplier_name']
        product['city'] = doc('div.detail > div.address > span.disc').text()
        if not product['city']:
            product['city'] = doc('div.detail').find('div.address').remove('label').text()
        store_info[contact_url]['city'] = product['city']
        product['mobile'], product['telephone'], product['store_address'] = '', '', ''
        # Each <dl> is a label/value pair; match on the (Chinese) label text.
        for node in doc('div.contcat-desc > dl'):
            nodeQ = PyQuery(node)
            label = nodeQ.children('dt').text().strip()
            if label == u'电 话:':  # telephone
                product['telephone'] = nodeQ.children('dd').text()
                store_info[contact_url]['telephone'] = product['telephone']
            elif label == u'地 址:':  # address
                product['store_address'] = nodeQ.children('dd').text()
                store_info[contact_url]['store_address'] = product['store_address']
            elif label == u'移动电话:':  # mobile phone
                product['mobile'] = nodeQ.children('dd').text()
                store_info[contact_url]['mobile'] = product['mobile']
    return product
def parseNextPageUrl(self, category_page_content):
    """Return the href of the "Next" pagination link, or None if absent."""
    page = PyQuery(category_page_content)
    for item in page('div.pagination:first > ul > li'):
        entry = PyQuery(item)
        if not entry.text().strip().startswith("Next"):
            continue
        return entry.children('a').attr('href')
def parseSupplierContactPage(m):
    """Enrich supplier dict *m* from its 1688 contact page.

    Example shop URL: http://yongjia.1688.com/shop/aofudianqi/page/contactinfo.htm?
    Fills the trade medal, supply grade, business type, phone contacts, the
    satisfaction score (via the rate.1688.com JSON endpoint) and the total /
    active product counts from the shop's offer-list pages.

    Fix: the original guard was ``m['url'].find('\\?') > 0`` — ``str.find``
    is not a regex, so it searched for the literal two characters ``\\?``
    and never matched, appending a stray '?' even to URLs that already had
    one.  Replaced with a plain substring test.
    """
    # Normalize the shop URL so it ends with '?' before rewriting it into
    # the contact-page URL below.
    if '?' not in m['url']:
        if m['url'].endswith("/"):
            m['url'] = m['url'][:-1]
        m['url'] = m['url'] + '?'
    # Replace the query part with the contact-page path.
    contact_page_url = re.sub(r"\?.*$", '/page/contactinfo.htm', m['url'])
    content = fetchContent(contact_page_url)
    doc = PyQuery(content)
    # The on-page satisfaction widget is loaded dynamically, so
    # m['satisfication'] is fetched from the JSON endpoint further down.
    medal_alt = doc('div.detail > div.trade-medal > span.disc > a.image > img').eq(0).attr('alt')
    m['trade_medal'] = medal_alt if medal_alt else ''
    m['supply-grade'] = len(doc('div.detail > div.supply-grade > span.disc > a.image > img'))
    m['biz-type'] = doc('div.detail > div.biz-type > span').text()
    if not m['biz-type']:
        m['biz-type'] = doc('div.detail > div.smt-biz-type > span').text()
    # Collect every phone-like entry: labels containing 话 (phone / mobile).
    phones = []
    for item in doc('div.contcat-desc > dl'):
        itemQ = PyQuery(item)
        text = itemQ.children('dt').text()
        # (whitespace inside the label text varies, so substring-match on 话)
        if text.find(u"话") > 0:
            phones.append(itemQ.children('dd').text())
    m['contact'] = ', '.join(phones)
    # Satisfaction comes from the stats JSON, e.g.
    # http://rate.1688.com/stat/trade/winport.json?memberId=aofudianqi&sati=1
    # {"data":{"items":[],"sati":{"satisfactionRate":0,"satisfaction":4.6,...}},"success":true}
    member_match = re.findall('shop/(.*)/page', contact_page_url)
    if member_match:
        merchantId = member_match[0]
        stat_url = 'http://rate.1688.com/stat/trade/winport.json?memberId=' + merchantId + '&sati=1'
        json_data = json.loads(fetchContent(stat_url))
        m['satisfication'] = json_data['data']['sati']['satisfaction']
        # Total vs. actively-selling product counts from the offer list, e.g.
        # http://yiwu.1688.com/shop/ywzxbh03/page/offerlist.htm?tradenumFilter=true
        # NOTE(review): tradenumFilter=true looks like it filters to *traded*
        # products, so the all/active key mapping below may be swapped —
        # preserved as originally written; confirm against the site.
        all_products_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm?tradenumFilter=true'
        active_product_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm'
        doc3 = PyQuery(fetchContent(all_products_url))
        m['products_count'] = extractNum(
            doc3('li[class="offer-list-tab-title current"] > a > em').text())
        if m['products_count'] == 0:
            # Fallback selector used by an alternate page layout.
            m['products_count'] = doc3('ul[data-sp="paging-a"] > li > em.offer-count').text()
        doc4 = PyQuery(fetchContent(active_product_url))
        m['active_products_count'] = extractNum(
            doc4('li[class="offer-list-tab-title current"] > a > em').text())
        if m['active_products_count'] == 0:
            m['active_products_count'] = doc4('ul[data-sp="paging-a"] > li > em.offer-count').text()
    else:
        m['satisfication'] = ''
def search(self, word):
    """Look up *word* on the remote dictionary site and return a result dict.

    Returns ``{"status": "success", "results": [...]}`` where each result has
    ``word``, ``pron``, ``sound`` and ``definitions``; on no match returns
    ``{"status": "error", "error_detail": "Nothing found."}``.
    """
    # print(self.URL.format(word=word))
    # Multi-word queries are hyphenated in the URL.
    response = requests.get(self.URL.format(word=word.replace(' ', '-')),
                            headers=headers)
    text = response.text
    # Local-file debugging stub:
    # f = open('temp.txt')
    # text = f.read()
    # f.close()
    doc = PyQuery(text)
    results = []
    # For single words, require pronunciation blocks too; phrases often
    # lack them, so use the looser selector.
    if ' ' in word:
        divs = doc('section div:has("section.entry-headword")')
    else:
        divs = doc(
            'section div:has("section.entry-headword"):has(".pron-spell-container"):has(".pron-ipa-content")'
        )
    if not divs:
        return {"status": 'error', "error_detail": "Nothing found."}
    for def_div in divs:
        def_div = PyQuery(def_div)
        # --- headword ---
        word = def_div('h1,h2').text()
        # --- pronunciation (IPA) ---
        pron = def_div('.pron-ipa-content').text()
        if pron == '':
            pron = None
        else:
            pron = pron.replace(' ', '').replace('/', '')
        # --- audio ---
        sound = def_div('audio source[type="audio/mpeg"]').attr('src')
        # --- definitions ---
        definitions = []
        meaning_section = def_div('section:not(.entry-headword)')
        for section in meaning_section:
            # debug()
            section = PyQuery(section)
            word_type = section('h3').text()  # part of speech
            meanings = []
            meaning_divs = section(
                '.default-content>div, .expandable-content>div')
            if not meaning_divs:
                # Fallback for the alternate page layout.
                meaning_divs = section.children('div>div')
            for meaning_div in meaning_divs:
                meaning_div = PyQuery(meaning_div)
                # label = meaning_div('.luna-label')
                # if label:
                #     # print('xxx', label.text())
                #     x = label.text()
                #     meaning_div('.luna-labset').replaceWith(x)
                #     # print(meaning_div)
                # a = meaning_div('a')
                # if a:
                #     x = a.text()
                #     meaning_div('a').replaceWith(x)
                # decoration = meaning_div('.italic, .bold')
                # if decoration:
                #     # debug()
                #     for _decoration in decoration:
                #         x = decoration.text()
                #         _decoration.replaceWith(_decoration.text())
                # text = meaning_div.children('span').clone().children().remove().end().text()
                meaning = dict()
                example = meaning_div('.luna-example').text()
                if example:
                    meaning['example'] = example
                sub_lis = meaning_div('li')
                if sub_lis:
                    meaning['subs'] = list(
                        map(lambda x: PyQuery(x).text(), sub_lis))
                # Remove example/sub-list nodes so only the definition text
                # remains (mutates meaning_div in place — order matters).
                meaning_div('.luna-example').remove()
                meaning_div('li').remove()
                # [:-1] drops the trailing character — presumably the final
                # punctuation mark; TODO confirm against live markup.
                text = meaning_div.text()[:-1]
                meaning['text'] = text
                meanings.append(meaning)
            # print(len(meaning_divs))
            definitions.append(dict(word_type=word_type, meanings=meanings))
        # print(len(definitions))
        results.append(
            dict(word=word, pron=pron, sound=sound, definitions=definitions))
    if results:
        return {"status": 'success', "results": results}
    else:
        return {"status": 'error', "error_detail": "Nothing found."}
def extract(dom, param_dict):
    """Extract [date, url, title] rows from *dom* driven by *param_dict*.

    param_dict values are comma-separated navigation "paths": each path
    segment is either an integer (``.eq(n)`` index) or a selector passed to
    ``.children()``.  Keys used: dom_head, sandwich, title, url, date, domain.
    Only rows whose normalized date falls between the module-level
    ``str_today`` and ``end_today`` are returned.
    """
    res = []
    # Navigate to the list container via the dom_head path.
    head_list = str(param_dict['dom_head']).strip().split(',')
    d_divs = dom(head_list[0])
    if len(head_list) > 1:
        for pos in range(1, len(head_list)):
            try:
                # Integer segment -> positional index; otherwise a child selector.
                value = int(head_list[pos])
                d_divs = d_divs.eq(value)
            except:
                d_divs = d_divs.children(head_list[pos])
    for div in d_divs:
        d_div = PyQuery(div)
        # Optional intermediate ("sandwich") path between container and item.
        if param_dict['sandwich'] != 'None':
            sandwich_list = str(param_dict['sandwich']).strip().split(',')
            for sandwich in sandwich_list:
                try:
                    positon = int(sandwich)
                    d_div = d_div.eq(positon)
                except:
                    d_div = d_div.children(sandwich)
        # Skip items missing the first segment of the title path.
        header = str(param_dict['title']).strip().split(',')[0]
        if not d_div.children(header):
            continue
        # --- URL ---
        url_list = str(param_dict['url']).strip().split(',')
        url = d_div.children(url_list[0])
        for pos in range(1, len(url_list)):
            try:
                n_url = int(url_list[pos])
                url = url.eq(n_url)
            except:
                if url_list[pos] == 'href':
                    # Terminal segment: read the attribute and stop walking.
                    url = url.attr('href')
                    break
                else:
                    url = url.children(url_list[pos])
        # Join a relative URL onto the configured domain.
        if 'www' not in url and 'http' not in url:
            match = re.search('^/', url)
            if match:
                url = param_dict['domain'] + url
            else:
                url = param_dict['domain'] + '/' + url
        if 'http://' not in url:
            url = 'http://' + url
        # --- title ---
        title_list = str(param_dict['title']).strip().split(',')
        title = d_div
        for item in title_list:
            try:
                n_title = int(item)
                title = title.eq(n_title)
            except:
                title = title.children(item)
        title = title.text()
        # --- date ---
        date_list = str(param_dict['date']).strip().split(',')
        date = d_div
        is_attr = False
        for item in date_list:
            try:
                n_item = int(item)
                date = date.eq(n_item)
            except:
                if 'attr' not in item:
                    date = date.children(item)
                else:
                    # "attr:name" segment -> read attribute, keep first 20 chars.
                    item = item[:item.find(':')]
                    date = date.attr(item)[:20].strip()
                    is_attr = True
        date = date if is_attr else date.text()
        # Normalize slashes to dashes before pattern matching.
        if ' / ' in date:
            date = date.replace(' / ', '-')
        if '/' in date:
            date = date.replace('/', '-')
        if re.search(u'\d{4}-\d{1,2}-\d{1,2}', date):
            # Keep ASCII-ish chars only, then slice out the date span.
            date = ''.join(x for x in date if ord(x) < 256).strip()
            # NOTE(review): anchoring on '201' assumes years 201x — breaks
            # for 2020+; confirm and widen if still in use.
            start_index = date.rfind('201')  # position of the year
            end_index1 = date.rfind('-')
            end_index2 = date.rfind(':')
            end_index = end_index1 if end_index1 > end_index2 else end_index2
            date = date[start_index:end_index + 3]
            if len(date) == 10:
                # Date only: append the current local time.
                date = '%s %s' % (
                    date, time.strftime("%H:%M", time.localtime(time.time())))
        elif re.search(u'\d{1,2}-\d{1,2}-\d{4} \d{1,2}:\d{1,2}:\d{1,2}', date):
            # D-M-Y style: reorder into Y-M-D.
            arr_time = date.split(' ')
            arr_date = arr_time[0].split('-')
            date = '%s-%s-%s %s' % (arr_date[2], arr_date[0], arr_date[1],
                                    arr_time[1])
        else:
            try:
                # Possibly a unix timestamp (seconds, or milliseconds when
                # longer than 10 digits).
                date_stamp = int(date)
                if date_stamp > 9999999999:
                    date_stamp = int(date[:10])
                x = time.localtime(date_stamp)
                date = time.strftime('%Y-%m-%d %H:%M', x)
            except:
                date = fomate_date_output(date)
        date = format_date_time(date)
        # A fully-normalized date is exactly 16 chars ('YYYY-MM-DD HH:MM').
        if len(date) == 16:
            # NOTE(review): cmp() is a Python-2-only builtin; this function
            # cannot run under Python 3 as written — confirm target runtime.
            if cmp(date, str_today) >= 0 and cmp(
                    date, end_today) <= 0 and len(title) > 0:
                res.append([date, url, title])
    return res
def test_mark_dirty():
    """mark_dirty flags the target element and its ancestor, not descendants."""
    markup = '<a data-riot-id="0"><b data-riot-id="0.0"><c data-riot-id="0.0.0"></c></b></a>'
    root = PyQuery(markup)
    mark_dirty(root.children('b'))
    # The marked node and its parent both carry the dirty flag...
    assert root.attr['data-riot-dirty'] == 'true'
    assert root.children('b').attr['data-riot-dirty'] == 'true'
    # ...but the child below the marked node does not.
    assert not root.children('c').attr['data-riot-dirty']