예제 #1
0
 def parseProductsByCategory(self, category_page_content, category_info):
     """Parse products from a backcountry.com category page.

     If the page carries a 'View all' link, that URL is fetched first so the
     whole category is listed on a single page; ``category_info.url`` is
     updated to point at it.

     :param category_page_content: raw HTML of the category page
     :param category_info: category descriptor; its ``url`` may be rewritten
     :return: list of product-info objects built via ``self.newProduct()``
     """
     # Resolve the optional 'View all' link once instead of running the same
     # expensive document query twice.  The filter callback uses pyquery's
     # documented (index, this) signature instead of relying on pyquery
     # injecting ``this`` into the lambda's globals.
     view_all_href = PyQuery(category_page_content)('section.main').find('a').filter(
         lambda i, this: PyQuery(this).text().strip() == 'View all').eq(0).attr('href')
     if view_all_href:
         category_info.url = 'http://www.backcountry.com' + view_all_href
         category_page_content = self.crawler.fetchCategoryPageContent(category_info.url)
     productNodeList = PyQuery(category_page_content)('div#products > div.product')
     productList = []
     for node in productNodeList:
         nodeQ = PyQuery(node)
         productInfo = self.newProduct()
         productInfo['name'] = nodeQ.children('a').attr('title')
         productInfo['product_url'] = 'http://www.backcountry.com' + nodeQ.children('a').attr('href')
         productInfo['img_url'] = nodeQ.children('a > div.ui-pl-img > img[itemprop="image"]').attr('src')
         if not productInfo['img_url']:
             # Lazy-loaded images keep the real URL in data-src instead of src.
             productInfo['img_url'] = nodeQ.children('a > div.ui-pl-img > img[itemprop="image"]').attr('data-src')
         productInfo['img_url'] = "http:" + productInfo['img_url']
         spanList = nodeQ('div.ui-pl-offers > span.ui-pl-pricing > span')
         if len(spanList) <= 2:  # a plain price (or price range) is at most two spans
             productInfo['price'] = PyQuery(spanList).text().replace(' ', '')
         else:
             # Discounted items add a span: eq(1) is the sale price,
             # eq(2) the original (label) price.
             productInfo['price'] = PyQuery(spanList).eq(1).text()
             productInfo['label_price'] = PyQuery(spanList).eq(2).text()
         # SKU id comes from the query string; run the regex once, not twice.
         sku_match = re.findall(r"skid=([\w-]+)&", productInfo['product_url'])
         productInfo['sku_id'] = sku_match[0] if sku_match else ''
         productInfo['reviews'] = nodeQ('div.ui-pl-reviews > span[itemprop="ratingCount"]').text()
         productInfo.set_categories(category_info)
         productList.append(productInfo)
     return productList
예제 #2
0
def crawl_vvic_category_tree(wb):
    """Fetch the vvic.com homepage and write its 3-level category tree
    into a new worksheet of workbook *wb* (one row per leaf category)."""
    response, content = httplib2.Http().request("http://www.vvic.com/")
    sheet = wb.add_sheet("vvic品类树")
    for col, header in enumerate(("一级品类", "二级品类", "三级品类")):
        sheet.write(0, col, header)
    doc = PyQuery(content)
    top_nodes = doc("div.dd-inner > div.item")
    sub_blocks = doc('div.sub-items')
    # Only these four sub-item blocks line up with the top-level entries.
    matching_subs = [sub_blocks[0], sub_blocks[1], sub_blocks[2], sub_blocks[5]]
    row = 0
    for idx, top_node in enumerate(top_nodes):
        name1 = PyQuery(top_node)('h3 > a').text()
        for mid_node in PyQuery(matching_subs[idx]).children('dl'):
            mid_q = PyQuery(mid_node)
            name2 = mid_q.children('dt > a').text()
            for leaf_node in mid_q.children('dd > a'):
                row += 1
                sheet.write(row, 0, name1)
                sheet.write(row, 1, name2)
                sheet.write(row, 2, PyQuery(leaf_node).text())
예제 #3
0
 def parseProductsByCategory(self, category_page_content, category_info):
     """Parse one category listing page into product-info objects.

     Crawl progress is tracked in ``self.crawling_category`` as
     ``{category_info: [pages_parsed, products_crawled]}``; starting a new
     category resets the tracker.
     """
     # BUG FIX: ``dict.has_key`` was removed in Python 3; the ``in``
     # operator behaves identically on Python 2 and 3.
     if category_info not in self.crawling_category:
         self.crawling_category = {}  # reset: switching to a new category
         # [current page index, products crawled so far]
         self.crawling_category[category_info] = [1, 0]
     else:
         self.crawling_category[category_info][0] += 1
     doc = PyQuery(category_page_content)
     productList = []
     productNodeList = doc('div#js_proList > ul > li')
     for i, productNode in enumerate(productNodeList):
         productNodeQ = PyQuery(productNode)
         productInfo = self.newProduct()
         part1 = productNodeQ.children('p.pr')
         # Images are lazy-loaded; the real URL sits in data-original.
         productInfo['img_url'] = PyQuery(part1).children('a.pic > img').attr('data-original')
         part2 = productNodeQ.children('p.pro_name')
         productInfo['name'] = PyQuery(part2).children('a').text().strip()
         productInfo['product_url'] = PyQuery(part2).children('a').attr('href')
         # SKU id is the numeric tail of the product URL, e.g. "...-12345.html".
         productInfo['sku_id'] = re.findall(r'-([\d]+)\.html', productInfo['product_url'])[0]
         part3 = productNodeQ.children('p.pro_price')
         productInfo['price'] = PyQuery(part3).find('strong.my_shop_price').text()
         productInfo.set_categories(category_info)
         productInfo['page_idx'] = str(self.crawling_category[category_info][0])
         productInfo['num_idx'] = str(i + 1)
         productInfo['cate_idx'] = str(self.crawling_category[category_info][1] + 1)
         productList.append(productInfo)
         self.crawling_category[category_info][1] += 1  # one more product crawled
     info('%s has been crawled %d products after parse %d pages。' %(category_info, self.crawling_category[category_info][1], self.crawling_category[category_info][0]))
     return productList
def crawl_vvic_category_tree(wb):
    """Crawl vvic.com's top navigation and record the category tree in *wb*."""
    http_client = httplib2.Http()
    _, page = http_client.request("http://www.vvic.com/")
    sheet = wb.add_sheet("vvic品类树")
    sheet.write(0, 0, "一级品类")
    sheet.write(0, 1, "二级品类")
    sheet.write(0, 2, "三级品类")
    document = PyQuery(page)
    level1_nodes = document("div.dd-inner > div.item")
    subitem_nodes = document('div.sub-items')
    # Indices 0, 1, 2 and 5 of the sub-item blocks correspond to the
    # level-1 entries, in order.
    paired_subs = [subitem_nodes[i] for i in (0, 1, 2, 5)]
    current_row = 0
    for position, l1 in enumerate(level1_nodes):
        l1_name = PyQuery(l1)('h3 > a').text()
        for l2 in PyQuery(paired_subs[position]).children('dl'):
            l2_query = PyQuery(l2)
            l2_name = l2_query.children('dt > a').text()
            for l3 in l2_query.children('dd > a'):
                current_row += 1
                sheet.write(current_row, 0, l1_name)
                sheet.write(current_row, 1, l2_name)
                sheet.write(current_row, 2, PyQuery(l3).text())
예제 #5
0
def test_mark_dirty():
    """Marking <b> dirty flags it and its ancestor <a>, but not child <c>."""
    root = PyQuery(
        '<a data-riot-id="0"><b data-riot-id="0.0"><c data-riot-id="0.0.0"></c></b></a>'
    )
    mark_dirty(root.children('b'))
    assert root.attr['data-riot-dirty'] == 'true'
    assert root.children('b').attr['data-riot-dirty'] == 'true'
    assert not root.children('c').attr['data-riot-dirty']
예제 #6
0
def parseSupplierContactPage(m):
    """Scrape a 1688 supplier's contact page and satisfaction stats into *m*.

    ``m`` must contain ``m['url']`` (the shop URL); the function adds
    trade-medal, supply-grade, business-type, contact and satisfaction
    fields to the dict in place.
    """
    # Example target: http://yongjia.1688.com/shop/aofudianqi/page/contactinfo.htm?
    # BUG FIX: ``str.find`` matches a literal substring, not a regex, so the
    # original ``m['url'].find('\?')`` never matched and a '?' was appended
    # even to URLs that already carried a query string (harmless only because
    # the re.sub below strips everything from the first '?' anyway).
    if '?' not in m['url']:
        if m['url'].endswith("/"):
            m['url'] = m['url'][:-1]
        m['url'] = m['url'] + '?'
    # Build the contact-page URL by replacing the query part.
    contact_page_url = re.sub(r"\?.*$", '/page/contactinfo.htm', m['url'])
    content = fetchContent(contact_page_url)
    doc = PyQuery(content)
    # Satisfaction rate is loaded dynamically on the page, so it is fetched
    # from the JSON stats endpoint further below instead.
    medal = doc('div.detail > div.trade-medal > span.disc > a.image > img').eq(0).attr('alt')
    m['trade_medal'] = medal if medal else ''
    m['supply-grade'] = len(doc('div.detail > div.supply-grade > span.disc > a.image > img'))
    m['biz-type'] = doc('div.detail > div.biz-type > span').text()
    if not m['biz-type']:
        m['biz-type'] = doc('div.detail > div.smt-biz-type > span').text()
    # Collect every phone-like entry (label containing u'话') from the
    # contact description list.
    phones = []
    for item in doc('div.contcat-desc > dl'):
        itemQ = PyQuery(item)
        text = itemQ.children('dt').text()
        if text.find(u"话") > 0:
            phones.append(itemQ.children('dd').text())
    m['contact'] = ', '.join(phones)
    # Satisfaction comes from a JSON stats endpoint, e.g.
    # http://rate.1688.com/stat/trade/winport.json?memberId=<id>&sati=1
    # -> {"data":{"sati":{"satisfactionRate":0,"satisfaction":4.6,...}},...}
    member_match = re.findall(r'shop/(.*)/page', contact_page_url)
    if member_match:
        merchantId = member_match[0]
        stat_url = 'http://rate.1688.com/stat/trade/winport.json?memberId=' + merchantId + '&sati=1'
        content2 = fetchContent(stat_url)
        json_data = json.loads(content2)
        m['satisfication'] = json_data['data']['sati']['satisfaction']
        # Total product count vs. products with recent sales ("active"):
        # the tradenumFilter=true listing shows only products that sold.
        all_products_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm?tradenumFilter=true'
        active_product_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm'
        doc3 = PyQuery(fetchContent(all_products_url))
        m['products_count'] = extractNum(doc3('li[class="offer-list-tab-title current"] > a > em').text())
        if m['products_count'] == 0:
            m['products_count'] = doc3('ul[data-sp="paging-a"] > li > em.offer-count').text()
        doc4 = PyQuery(fetchContent(active_product_url))
        m['active_products_count'] = extractNum(doc4('li[class="offer-list-tab-title current"] > a > em').text())
        if m['active_products_count'] == 0:
            m['active_products_count'] = doc4('ul[data-sp="paging-a"] > li > em.offer-count').text()
    else:
        m['satisfication'] = ''
예제 #7
0
def parse_xml_to_xmljson(node):
    """Recursively convert an XML node into a nested JSON-style dict
    with 'tag', optional 'attributes', and 'children' or 'text' keys."""
    query = PyQuery(node)
    element = query[0]
    result = {'tag': element.tag}
    if len(element.attrib) > 0:
        result['attributes'] = dict(element.attrib)
    child_nodes = query.children()
    if len(child_nodes) > 0:
        result['children'] = [parse_xml_to_xmljson(c) for c in child_nodes]
    else:
        result['text'] = query.text()
    return result
예제 #8
0
def parse_xml_to_xmljson(node):
    """Translate an XML element (and all descendants) into a dict tree."""
    pq = PyQuery(node)
    root = pq[0]
    json_node = {'tag': root.tag}
    if len(root.attrib) > 0:
        json_node['attributes'] = dict(root.attrib)
    kids = pq.children()
    if len(kids) == 0:
        # Leaf node: keep its text content instead of a child list.
        json_node['text'] = pq.text()
    else:
        json_node['children'] = []
        for kid in kids:
            json_node['children'].append(parse_xml_to_xmljson(kid))
    return json_node
예제 #9
0
def parseCategories():
    """Build level-2 category objects under the first top-level category.

    NOTE(review): relies on a module-level ``doc`` and on ``self`` even
    though this is a plain function — presumably pasted out of a class;
    confirm before reuse.
    """
    first_top = doc("ul#categrayAll > li").eq(0)  # only the first level-1 category
    top_q = PyQuery(first_top)
    top_name = top_q.children('div.li > a > em').text()
    categories = []
    for sub_node in top_q.children('div.sub-list > div > ul.column > li.level1'):
        sub_q = PyQuery(sub_node)
        cat = self.newCategory()
        cat.name = sub_q.children('a').text()
        cat.url = sub_q.children('a').attr("href")
        cat.parent_categories = [top_name]
        categories.append(cat)
    return categories
예제 #10
0
def parseCategories():
    """Collect the second-level categories beneath the first top category."""
    # Only the first level-1 category is crawled.
    top_node_q = PyQuery(doc("ul#categrayAll > li").eq(0))
    level1_name = top_node_q.children('div.li > a > em').text()
    result = []
    child_selector = 'div.sub-list > div > ul.column > li.level1'
    for child in top_node_q.children(child_selector):
        child_q = PyQuery(child)
        entry = self.newCategory()
        entry.name = child_q.children('a').text()
        entry.url = child_q.children('a').attr("href")
        entry.parent_categories = [level1_name]
        result.append(entry)
    return result
예제 #11
0
 def paper_page(self, url):
     """Fetch one paper page; return [title, authors..., abstract] lines."""
     response = self.connect(url)
     container = PyQuery(response.text)('.container')
     lines = [container.children('h2').text() + '\n']
     # Authors are a single '; '-separated string; one indented line each.
     for author in container.children('p').text().split('; '):
         lines.append('\t' + author + '\n')
     abstract_text = container.children('.row').children('.col-lg-9').children(
         'p').text()
     lines.append('\t' + translate('en', 'zh-CN', abstract_text) + '\n')
     return lines
예제 #12
0
def render_svg(image,
               width=None,
               height=None,
               request=None,
               css_class='',
               img_class='',
               alt=''):
    """Render an SVG file into the svg-render template, optionally sizing
    it, adding a CSS class, and injecting an accessibility <title>."""
    # pylint: disable=too-many-arguments
    options = {}
    if not (width or height):
        options['style'] = ''
    else:
        width_unit = 'px' if isinstance(width, int) else ''
        height_unit = 'px' if isinstance(height, int) else ''
        options['style'] = 'width: {0}{1}; height: {2}{3};'.format(
            width, width_unit, height, height_unit)
    options['css_class'] = css_class
    if not alt and not img_class:
        options['svg'] = image.data
    else:
        svg = PyQuery(image.data)
        if alt:
            # Wrap all children in a <g> holding a <title> so assistive
            # technology announces the alt text.
            wrapper = PyQuery('<g></g>')
            wrapper.append(PyQuery('<title />').text(alt))
            for node in svg.children():
                wrapper.append(node)
            svg.empty().append(wrapper)
        if img_class:
            svg.attr('class', img_class)
        options['svg'] = svg.outer_html()
    return render('templates/svg-render.pt', options, request)
예제 #13
0
    def search(self, word, limit=None):
        """Query the listing endpoint for *word* and scrape the result rows.

        Returns ``{'status': 'success', 'results': [...]}`` or an error dict
        when no rows were found.  (``limit`` is currently unused.)
        """
        response = requests.get(URL,
                                params={
                                    'p': word,
                                    'min': 1,
                                    'price_type': 'bidorbuyprice',
                                    's1': 'score2',
                                    'o1': 'a',
                                })
        document = PyQuery(response.text)
        found = []
        for row in document("div#list01")('table')('tr').not_('.la'):
            row_q = PyQuery(row)
            # Rows without an image cell are headers/fillers, not listings.
            if not row_q.children('td.i'):
                continue
            link = row_q('td.a1')('a')
            price_text = row_q('td.pr2').text().replace(',', '').replace('円', '')
            found.append({
                'title': link.text(),
                'url': link.attr('href'),
                'img': row_q('td.i')('img').attr('src'),
                'price': int(price_text),
            })

        if not found:
            return {"status": 'error', "error_detail": "Nothing found."}
        return {"status": 'success', "results": found}
예제 #14
0
 def _add_nested(self, k, el):
     """Parse nested element by its children.

     Builds descendant CSS selectors for *el* and records them in
     ``self.selectors``.  ``k`` is accepted but not used in this method.
     """
     el = Pq(el)
     tagname = Pq(el)[0].tag
     # Skip tags the extractor is configured to ignore.
     if tagname in self.invalid_tags:
         return
     id = self._format_id(el.attr('id'))
     classes = self._format_classes(el.attr('class'))
     selector = self._format_selector(el, id, classes)
     children = Pq(el).children()
     # Only elements directly under the document body are processed.
     if not self._is_root_body_node(el):
         return
     # Add for single nodes only
     if not children:
         self.selectors.add(selector)
     # Build nested css by traversing all child nodes and getting
     # their attributes.
     # NOTE(review): ``children`` is reassigned inside the ``for`` body, so
     # after the first sibling the descent follows that child's subtree while
     # the ``for`` keeps iterating the *previous* sibling list, and
     # ``selector`` keeps accumulating across siblings.  This looks
     # suspicious — confirm whether per-sibling selector paths were intended
     # before changing the traversal.
     while children:
         for child in children:
             # 1. Add current
             self.selectors.add(selector)
             # 2. Add child
             child = Pq(child)
             selector += self._add_id_and_classes(child)
             self.selectors.add(selector)
             # # 3. Move to next children
             children = child.children()
예제 #15
0
def get_inline_snippets(html):
    """Return each inline <code> element under a <p>, re-wrapped in tags."""
    document = PyQuery(html)
    snippets = []
    for code_node in document.children("p code").items():
        snippets.append("<code>" + code_node.text() + "</code>")
    return snippets
예제 #16
0
 def paper_page(self, url):
     """Fetch a proceedings detail page; return [title, author, abstract]
     lines, with the abstract translated to Chinese.
     """
     req = self.connect(url)
     page_pyquery = PyQuery(req.text)
     post_pyquery = page_pyquery('.container-fluid.proceedings-detail')
     output = []
     # BUG FIX: ``.children()[0]`` yields a bare lxml element, which is not
     # callable and has no ``.text()`` method; wrap it back into PyQuery so
     # the ``paper_header('h1')`` / ``.text()`` calls below work.
     paper_header = PyQuery(PyQuery(post_pyquery('.row')[0]).children()[0])
     title = paper_header('h1').text()
     output.append(title + '\n')
     author = paper_header('h2').text()
     output.append('\t' + author + '\n')
     contain = PyQuery(post_pyquery('.row')[2])
     # Same fix: wrap the raw lxml element before calling .text().
     abstract = PyQuery(contain.children()[0]).text()
     abstract = self.translate_en2cn(abstract)
     output.append('\t' + abstract + '\n')
     return output
예제 #17
0
    def parseCategories(self, homepage_content):
        """Parse the two top navigation levels from the homepage HTML.

        The trailing GIFTS and SALES menus are skipped — they are not real
        product categories.
        """
        document = PyQuery(homepage_content)
        categories = []
        top_nodes = document(
            "div#cms_page_922 > div[class='js-template-display js-template  dept_wrap ']"
        ).find("div.topnav")
        for top_node in top_nodes:
            top_q = PyQuery(top_node)
            top_name = top_q.children(
                "div > span[class='js-lego-data lego_text_field '] > a"
            ).text().strip()
            if top_name.upper() in ['GIFTS', 'SALES']:
                continue
            sub_nodes = top_q(
                'div[class="nav_link_block_title nav_link_block_text"] > span > a'
            )
            for sub_node in sub_nodes:
                info = self.createCategory(PyQuery(sub_node))
                info.parent_categories = [top_name]
                categories.append(info)
        return categories
예제 #18
0
 def parseProductsAndCategoriesByCategory(self, category_page_content, category_info):
     """Parse a category page into (productList, categoryList).

     Level-3 categories (two parent names) go straight to product parsing;
     'New Arrivals' and 'Clearance' have bespoke menu layouts; otherwise the
     page's own sidebar markup decides whether the page lists further
     subcategories or products.
     """
     doc = PyQuery(category_page_content)
     productList, categoryList = [], []
     # Already at the third level: nothing below, parse products directly.
     if category_info.parent_categories and len(category_info.parent_categories) == 2:
         productList = self.parseProductsByCategory(category_page_content, category_info)
         return productList, categoryList
     if category_info.name == 'New Arrivals': # special-cased menu layout
         for level2Node in doc.find('div#js_catelist_sec > div.item'):
             level2NodeQ = PyQuery(level2Node)
             level2CateName = level2NodeQ.children('p > a').text()
             for level3Node in level2NodeQ.children('ul > li > a'):
                 categoryInfo = self.createCategory(PyQuery(level3Node))
                 categoryInfo.parent_categories = [category_info.name, level2CateName]
                 categoryList.append(categoryInfo.formalize())
         return  productList, categoryList
     if category_info.name == 'Clearance':
         # Clearance has a flat one-level submenu.
         level2NodeList = doc('div.catelist > ul.cataUl_list > li > a')
         for level2Node in level2NodeList:
             categoryInfo = self.createCategory(PyQuery(level2Node))
             categoryInfo.parent_categories = ['Clearance']
             categoryList.append(categoryInfo.formalize())
         return productList, categoryList
     # Generic pages: detect which sidebar variant is present.
     if doc.find('div#js_catelist_sec > div.cur > ul > li'):
         nodeList = doc.find('div#js_catelist_sec > div.cur > ul > li > a')
         for node in nodeList:
             nodeQ = PyQuery(node)
             categoryInfo = self.newCategory()
             categoryInfo.name = nodeQ.text()
             categoryInfo.url = nodeQ.attr('href')
             categoryInfo.set_categories(category_info)
             categoryList.append(categoryInfo.formalize())
     elif doc.find('div.catelist > ul > li.cur > div.menuList > p'):
         nodeList = doc.find('div.catelist > ul > li.cur > div.menuList > p > a')
         for node in nodeList:
             nodeQ = PyQuery(node)
             categoryInfo = self.newCategory()
             categoryInfo.name = nodeQ.text()
             categoryInfo.url = nodeQ.attr('href')
             # Extend the parent chain with the current category's name.
             if  category_info.parent_categories:
                 result = category_info.parent_categories + [category_info.name]
             else:
                 result = [category_info.name]
             categoryInfo.parent_categories = result
             categoryList.append(categoryInfo.formalize())
     else:
         # No subcategory sidebar found: this page lists products.
         productList = self.parseProductsByCategory(category_page_content, category_info)
     return productList, categoryList
예제 #19
0
def test_render_each_to_document():
    """Rendering an ``each`` expression clones the looped node per item and
    marks the document dirty."""
    document = PyQuery('<custom data-riot-id="0"><button label="{ label }" each="{ items }" data-riot-id="0.0"></button></custom>')
    expressions = parse_document_expressions(document)
    render_document(expressions, {'items': [{'label': 'first'}, {'label': 'second'}]})
    assert document.attr['data-riot-dirty'] == 'true'
    assert len(document.children()) == 2
    assert document('button').eq(0).attr.label == 'first'
    assert document('button').eq(1).attr.label == 'second'
    # NOTE(review): the nested helper below is never called from this test —
    # it looks like a paste artifact from another snippet; confirm before
    # removing it.
    def tokenize_code(html):
        res = []
        pq = PyQuery(html)

        # lxml elements expose their text via the ``.text`` attribute.
        snippets = pq.children("pre code")
        for snippet in snippets:
            res.append(snippet.text)

        return res
예제 #21
0
 def parseCategories(self, homepage_content):
     """Parse level-1/level-2 categories from the header navigation menu."""
     document = PyQuery(homepage_content)
     menu_nodes = document('ul#header-navigation-menu > li.menu-container')
     categories = []
     # Skip the first three and the trailing entries — they are utility
     # menus, not product categories.
     for menu_node in menu_nodes[3:10]:
         menu_q = PyQuery(menu_node)
         top_name = menu_q.children('a').text()
         for sub_node in menu_q.children('div > ul:first > li.indent-child > span'):
             sub_q = PyQuery(sub_node)
             info = self.newCategory()
             info.name = sub_q.text()
             info.url = sub_q.attr('href')
             info.parent_categories = [top_name]
             categories.append(info.formalize())
     return categories
예제 #22
0
 def parseCategories(self, homepage_content):
     """Build the category list from the site's header navigation bar."""
     doc = PyQuery(homepage_content)
     all_menus = doc('ul#header-navigation-menu > li.menu-container')
     # The leading three and trailing menu entries are not product menus.
     product_menus = all_menus[3:10]
     collected = []
     level2_selector = 'div > ul:first > li.indent-child > span'
     for menu in product_menus:
         menu_query = PyQuery(menu)
         parent_name = menu_query.children('a').text()
         for child in menu_query.children(level2_selector):
             child_query = PyQuery(child)
             category = self.newCategory()
             category.name = child_query.text()
             category.url = child_query.attr('href')
             category.parent_categories = [parent_name]
             collected.append(category.formalize())
     return collected
예제 #23
0
    def __convert_to_ebook(self, book):
        """Map one Amazon search-result node *book* to an ``Ebook``.

        Extracts title, author (text before the first '|'), price and cover
        image URL from the result markup.
        """
        row = PyQuery(book.find('div.sg-row')[1])
        image_wrap = PyQuery(row.children()[0])
        info_wrap = PyQuery(row.children()[1]).find('.sg-row')
        title_author_wrap = PyQuery(info_wrap[0])
        price_wrap = PyQuery(info_wrap[1])
        # The price is rendered as separate whole/fraction spans.
        price = price_wrap.find('.a-price-whole').text() + \
            price_wrap.find('.a-price-fraction').text()

        ebook = Ebook()
        ebook.title = title_author_wrap.find('h2').text()
        ebook.author = title_author_wrap.find('h2')\
            .next().text().split('|')[0].strip()
        # BUG FIX: the original first assigned ``ebook.price`` from
        # '.u-price em' and then immediately overwrote it with
        # ``float(price)`` — the first assignment was a dead store and has
        # been removed.
        ebook.price = float(price)
        ebook.cover = image_wrap\
            .find('[data-component-type="s-product-image"] img')\
            .attr('src')

        return ebook
예제 #24
0
def tokenize_HTML(html):
    """Flatten the <p> text of *html* into one string, masking inline code.

    Inline snippet text is replaced by a fixed token so code fragments do
    not pollute the sentence stream; the <code> tags themselves are then
    stripped before re-parsing.
    """
    pq = PyQuery(html)

    for snippet in pq.children("p code").items():
        html = html.replace(snippet.text(), "CODE_ELEMENT_NN")

    html = html.replace("<code>", "").replace("</code>", "")

    sentences = ""
    for paragraph in PyQuery(html).children("p").items():
        sentences = sentences + paragraph.text() + " "

    return sentences
예제 #25
0
 def parseCategories(self, homepage_content):
     """Crawl the first six top-level categories, three levels deep."""
     document = PyQuery(homepage_content)
     categories = []
     for top_node in document('ul.pet-main-nav > li.pet-main-nav-item-level1')[:6]:
         top_q = PyQuery(top_node)
         top_name = top_q.children('a > span').text()
         # Hard-coded: only the first two second-level groups are crawled.
         for mid_node in top_q.children('div > div > ul > li')[:2]:
             mid_q = PyQuery(mid_node)
             mid_name = mid_q.children('a > span').text()
             for leaf_node in mid_q.children('ul > li > a'):
                 leaf_q = PyQuery(leaf_node)
                 info = self.newCategory()
                 info.name = leaf_q.children('span').text()
                 info.url = leaf_q.attr('href')
                 info.parent_categories = [top_name, mid_name]
                 categories.append(info)
     return categories
예제 #26
0
 def parseCategories(self, homepage_content):
     """Parse up to two category levels from the homepage navigation."""
     document = PyQuery(homepage_content)
     categories = []
     for top_node in document('nav#nav > div.w > ul > li'):
         top_q = PyQuery(top_node)
         if not top_q.children('div.sub_menu'):
             # No dropdown: the entry itself is a leaf category.
             categories.append(self.createCategory(top_q.children('a')))
             continue
         top_name = top_q.children('a').text()
         for group_node in top_q.children('div.sub_menu > div.leftWrap > div.leftTitle > dl'):
             group_q = PyQuery(group_node)
             # <dt> entries repeat each level-2 name; skip them.
             if group_q.find('dt'):
                 continue
             if group_q.find('dd'):
                 info = self.createCategory(group_q.children('dd > a'))
                 info.parent_categories = [top_name]
                 categories.append(info)
     return categories
예제 #27
0
def test_render_each_to_document():
    """An ``each`` expression expands the looped template once per item."""
    document = PyQuery(
        '<custom data-riot-id="0"><button label="{ label }" each="{ items }" data-riot-id="0.0"></button></custom>'
    )
    expressions = parse_document_expressions(document)
    context = {'items': [{'label': 'first'}, {'label': 'second'}]}
    render_document(expressions, context)
    buttons = document('button')
    assert document.attr['data-riot-dirty'] == 'true'
    assert len(document.children()) == 2
    assert buttons.eq(0).attr.label == 'first'
    assert buttons.eq(1).attr.label == 'second'
예제 #28
0
 def parseCategories(self, homepage_content):
     """Parse the flyout navigation into level-1/level-2 categories.

     The Gifts and Brands <li> entries (data-id 0 and 1) are skipped.
     """
     doc = PyQuery(homepage_content)
     # CONSISTENCY FIX: pyquery filter callbacks receive (index, this); the
     # original first filter used ``lambda i: PyQuery(this)``, relying on
     # pyquery injecting ``this`` into the lambda's globals, while the second
     # filter already used the documented two-argument form.  Both now use
     # the explicit (i, this) signature.
     # NOTE: data-id is compared as a *string* ('2' > '1', ...), which only
     # works while the menu ids stay single-digit — confirm.
     level1NodeList = doc('ul.js-flyout-nav > li').filter(
         lambda i, this: PyQuery(this).attr('data-id') > '1')
     categoryList = []
     for level1Node in level1NodeList:
         level1NodeQ = PyQuery(level1Node)
         level1Name = level1NodeQ.children('a').text()
         # The flyout panel for a menu is the <div> whose data-cat-id
         # matches the menu's data-id.
         level2NodeList = doc('div').filter(
             lambda i, this: PyQuery(this).attr('data-cat-id') == level1NodeQ.attr('data-id')
         ).children('a')
         for level2Node in level2NodeList:
             level2NodeQ = PyQuery(level2Node)
             # Skip placeholder anchors: <a class="" href="" data-title=""/>
             if not level2NodeQ.attr('class') or not level2NodeQ.text():
                 continue
             categoryInfo = self.newCategory(
                 level2NodeQ.text(),
                 'http://www.backcountry.com' + level2NodeQ.attr('href'),
                 [level1Name])
             categoryList.append(categoryInfo)
     return categoryList
def parseProducts(category_page_content):
    """Scrape the hot-product table rows into lists of field values."""
    document = PyQuery(category_page_content)
    rows = document('div#categoryHotProductTable > table.responstable > tbody > tr')
    products = []
    for row in rows:
        cells = PyQuery(row).children('td')
        link = PyQuery(cells[1]).children('a')
        record = [
            PyQuery(cells[0]).children('img').attr('ng-src'),
            link.attr('href'),
            link.text(),
            PyQuery(cells[2]).text(),
            PyQuery(cells[4]).text(),
            PyQuery(cells[8]).text(),
        ]
        # Trailing numeric id taken from the product URL (unchecked, as
        # in the original).
        record.append(re.findall('/([\d]+)$', record[1])[0])
        products.append(record)
    return products
예제 #30
0
def parseProducts(category_page_content):
    """Extract hot products from the category table; one field list per row."""
    doc = PyQuery(category_page_content)
    productList = []
    row_selector = 'div#categoryHotProductTable > table.responstable > tbody > tr'
    for row_node in doc(row_selector):
        cells = PyQuery(row_node).children('td')
        fields = []
        fields.append(PyQuery(cells[0]).children('img').attr('ng-src'))
        fields.append(PyQuery(cells[1]).children('a').attr('href'))
        fields.append(PyQuery(cells[1]).children('a').text())
        for idx in (2, 4, 8):
            fields.append(PyQuery(cells[idx]).text())
        # Product id from the URL tail; no validation, matching the original.
        fields.append(re.findall('/([\d]+)$', fields[1])[0])
        productList.append(fields)
    return productList
예제 #31
0
 def parseCategories(self, homepage_content):
     '''Parse every category path straight from the homepage menu.

     Some top-level menus (e.g. TOPS, BOTTOMS) expose three levels via
     nested <div>s; the remaining menus only go two levels deep.  DRESSES
     uses its own layout and is special-cased below.
     '''
     doc = PyQuery(homepage_content)
     categoryList = []
     level1NodeList = doc('ul#TS_menu > li.subCatName')
     for level1Node in level1NodeList:
         level1NodeQ = PyQuery(level1Node)
         level1CateName = level1NodeQ.children('a').text().strip()
         # These two menus are not product categories.
         if level1CateName == "WHAT'S NEW" or level1CateName == "Style Gallery":
             continue
         ####################################################################################################
         if level1CateName.upper() == 'DRESSES':
             level2NodeList = level1NodeQ.children(
                 'div.Second_ca > ul.loop_ul > li > div').find(
                     'li > a')  # several <div>s under a single <li>
             for level2Node in level2NodeList:
                 categoryInfo = self.createCategory(PyQuery(level2Node))
                 categoryInfo.parent_categories = [level1CateName]
                 categoryList.append(categoryInfo)
             continue
         ####################################################################################################
         colNodeList = level1NodeQ.children(
             'div.Second_ca > ul')  # NOTE(review): original comment hinted 'ul.loop_ul' relates to the SALE menu — intent unclear, confirm
         for colNode in colNodeList:
             level2NodeList = PyQuery(colNode).children('li')
             for level2Node in level2NodeList:
                 level2NodeQ = PyQuery(level2Node)
                 if level2NodeQ.children('div'):  # a nested <div> means a third level exists
                     level2CateName = level2NodeQ.children('a').text()
                     level3NodeList = level2NodeQ.children('div').find(
                         'li > a')
                     for level3Node in level3NodeList:
                         categoryInfo = self.createCategory(
                             PyQuery(level3Node))
                         categoryInfo.parent_categories = [
                             level1CateName, level2CateName
                         ]
                         categoryList.append(categoryInfo)
                 else:
                     # No third level: the <li>'s own anchor is the leaf.
                     categoryInfo = self.createCategory(
                         PyQuery(level2NodeQ.children('a')))
                     categoryInfo.parent_categories = [level1CateName]
                     categoryList.append(categoryInfo)
     return categoryList
예제 #32
0
def parseStorePage(product):
    """Scrape supplier contact info from a 1688.com store's contact page.

    Derives the contact-page URL from product['store_url'], fetches it
    (using the module-level ``store_info`` dict as a per-URL cache), and
    fills supplier_name, city, telephone, store_address and mobile on
    ``product``. Returns the same ``product`` dict.

    NOTE: Python 2 code (``print`` statement, ``dict.has_key``).
    """
    store_url = product['store_url']
    # Keep everything up to (and excluding) the '1688.com' domain suffix,
    # then re-append it with the contact-page path.
    index = store_url.find('1688.com')
    store_url = store_url[:index]
    contact_url = store_url + '1688.com/page/contactinfo.htm'
    if store_info.has_key(contact_url):
        # Cache hit: copy the previously scraped fields onto this product.
        for key in store_info[contact_url].keys():
            product[key] = store_info[contact_url][key]
    else:
        content = fetchContent(contact_url)
        store_info[contact_url] = {}
        doc = PyQuery(content)
        product['supplier_name'] = doc(
            'div.detail > div.contactSeller > span.disc').text()
        if not product['supplier_name']:
            # Alternate page layout: strip the <label> and take the rest.
            product['supplier_name'] = doc(
                'div.detail > div.contactSeller').remove('label').text()
        store_info[contact_url]['supplier_name'] = product['supplier_name']

        product['city'] = doc('div.detail > div.address > span.disc').text()
        if not product['city']:
            product['city'] = doc('div.detail').find('div.address').remove(
                'label').text()
        store_info[contact_url]['city'] = product['city']

        # Walk the <dl> contact rows; the <dt> label (which contains wide
        # spaces) selects which field the <dd> value belongs to.
        product['mobile'], product['telephone'], product[
            'store_address'] = '', '', ''
        for node in doc('div.contcat-desc > dl'):
            nodeQ = PyQuery(node)
            print nodeQ.children('dt').text()
            if nodeQ.children('dt').text().strip() == u'电      话:':
                product['telephone'] = nodeQ.children('dd').text()
                store_info[contact_url]['telephone'] = product['telephone']
            if nodeQ.children('dt').text().strip() == u'地      址:':
                product['store_address'] = nodeQ.children('dd').text()
                store_info[contact_url]['store_address'] = product[
                    'store_address']
            if nodeQ.children('dt').text().strip() == u'移动电话:':
                product['mobile'] = nodeQ.children('dd').text()
                store_info[contact_url]['mobile'] = product['mobile']
    return product
예제 #33
0
    def __init__(self, url: str, pq_obj: pyquery.PyQuery):
        """Populate server fields from one table row of the listing page.

        Column layout assumed from the indexing below (0-based children of
        the row) -- confirm against the live page:
          0: country flag image, 2: sessions / uptime, 3: bandwidth / ping,
          6: download link (query string carries ip/tcp/udp/sid/hid),
          9: score.
        """
        cells = pq_obj.children()
        self.url = url
        # Country code is the flag image's file stem (".../jp.png" -> "jp").
        flag_src = cells.eq(0)('img').attr('src')
        self.country = flag_src.split('/')[-1].split('.')[0]

        uptime_lines = cells.eq(2).text().split(self.__nl__)
        self.session_number = self.__to_int__(uptime_lines[0].split()[0])
        self.alive_days = self.__to_int__(uptime_lines[1].split()[0])

        speed_lines = cells.eq(3).text().split(self.__nl__)
        self.bandwidth = self.__to_float__(speed_lines[0].split()[0])
        self.ping = self.__to_int__(speed_lines[1].split()[1])

        # Score arrives with thousands separators; keep it as a plain string.
        self.score = cells.eq(9).text().replace(',', '')

        # The download link's query string holds the connection parameters.
        params = cells.eq(6)('a').attr('href').split('?')[1].split('&')
        sep = self.__eq_sign__
        self.ip = params[1].split(sep)[1]
        self.tcp = params[2].split(sep)[1]
        self.udp = params[3].split(sep)[1]
        self.sid = params[4].split(sep)[1]
        self.hid = params[5].split(sep)[1]
        self.link = None
예제 #34
0
def tokenize_HTML(html):
    """Concatenate the text of all direct <p> children of *html*.

    Returns the paragraph texts joined with (and each terminated by) a
    single space; an empty string when there are no paragraphs. Also scans
    the result for QUALITY_WORDS (module-level list) -- retained from the
    original as a currently-disabled debug hook.
    """
    pq = PyQuery(html)
    # str.join instead of the original quadratic += concatenation; each
    # paragraph keeps its original trailing-space separator, so the output
    # is byte-identical.
    sentences = "".join(p.text() + " " for p in pq.children("p").items())

    # Which "quality" adjectives appear in the text (debug hook, disabled).
    adj_words = [word for word in QUALITY_WORDS if word in sentences]
    if adj_words:
        pass
        #print(sentences + "\t" + adj_words.__str__())

    return sentences
예제 #35
0
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    """Recursively convert a PyQuery element into a plain-dict tree.

    Each node becomes {'type', 'attrs', 'layout', 'contents', 'extra'}:
    leaves carry their inner HTML as 'contents', branches carry a list of
    child dicts. Tables additionally keep their raw HTML in 'extra'.
    """
    elem = PyQuery(elem)
    child_nodes = list(elem.contents())

    if len(elem.children()) > 0:
        # Branch node: re-wrap stray text children once per subtree, then
        # recurse, dropping children that convert to nothing.
        if not already_wrapped:
            child_nodes = fix_unwrapped_text(elem).contents()
        contents = []
        for node in child_nodes:
            converted = build_dict_from_sane_json(node, already_wrapped=True)
            if converted:
                contents.append(converted)
    else:
        # Leaf node: keep the inner HTML verbatim.
        contents = elem.html()

    extra = {}
    if elem.is_("table"):
        # Only tables need the HTML (to later extract relevant data).
        extra = {'original_html': str(elem)}
    attrib = elem[0].attrib
    if 'src' in attrib:
        extra['src'] = elem.attr('src')
    if 'href' in attrib:
        extra['href'] = elem.attr('href')

    tag_type = list(elem)[0].tag
    return {
        'type': PRE_TAG_MATCH.get(tag_type, tag_type),
        'attrs': [],
        'layout': {},
        'contents': PRE_CONTENTS_MATCH.get(tag_type, contents),
        'extra': extra,
    }
예제 #36
0
    def parseCategories(self, homepage_content):
        """Parse the two-level category tree from the home page.

        The trailing GIFTS / SALES menus are promotional and skipped; every
        second-level link becomes a category whose parent path is its
        level-1 menu name.
        """
        doc = PyQuery(homepage_content)
        categories = []
        topNodes = doc(
            "div#cms_page_922 > div[class='js-template-display js-template  dept_wrap ']"
        ).find("div.topnav")
        for topNode in topNodes:
            topQ = PyQuery(topNode)
            topName = topQ.children(
                "div > span[class='js-lego-data lego_text_field '] > a"
            ).text().strip()
            # Last two menus are not real departments.
            if topName.upper() in ['GIFTS', 'SALES']:
                continue
            subLinks = topQ(
                'div[class="nav_link_block_title nav_link_block_text"] > span > a')
            for subLink in subLinks:
                cat = self.createCategory(PyQuery(subLink))
                cat.parent_categories = [topName]
                categories.append(cat)
        return categories
예제 #37
0
def parseStorePage(product):
    store_url = product["store_url"]
    index = store_url.find("1688.com")
    store_url = store_url[:index]
    contact_url = store_url + "1688.com/page/contactinfo.htm"
    if store_info.has_key(contact_url):
        for key in store_info[contact_url].keys():
            product[key] = store_info[contact_url][key]
    else:
        content = fetchContent(contact_url)
        store_info[contact_url] = {}
        doc = PyQuery(content)
        product["supplier_name"] = doc("div.detail > div.contactSeller > span.disc").text()
        if not product["supplier_name"]:
            product["supplier_name"] = doc("div.detail > div.contactSeller").remove("label").text()
        store_info[contact_url]["supplier_name"] = product["supplier_name"]

        product["city"] = doc("div.detail > div.address > span.disc").text()
        if not product["city"]:
            product["city"] = doc("div.detail").find("div.address").remove("label").text()
        store_info[contact_url]["city"] = product["city"]

        product["mobile"], product["telephone"], product["store_address"] = "", "", ""
        for node in doc("div.contcat-desc > dl"):
            nodeQ = PyQuery(node)
            print nodeQ.children("dt").text()
            if nodeQ.children("dt").text().strip() == u"电      话:":
                product["telephone"] = nodeQ.children("dd").text()
                store_info[contact_url]["telephone"] = product["telephone"]
            if nodeQ.children("dt").text().strip() == u"地      址:":
                product["store_address"] = nodeQ.children("dd").text()
                store_info[contact_url]["store_address"] = product["store_address"]
            if nodeQ.children("dt").text().strip() == u"移动电话:":
                product["mobile"] = nodeQ.children("dd").text()
                store_info[contact_url]["mobile"] = product["mobile"]
    return product
예제 #38
0
def qichacha_search_result(j: PyQuery) -> dict:
    """Extract one company record from a Qichacha search-result row.

    Returns a dict with company_name, legal_representative,
    registered_capital, date_of_establishment, email, phone and
    register_address, all pulled from the row's third cell.
    """
    cells = j.children()
    info_cell = cells.eq(2)
    company_name = info_cell.children('a').text()

    first_p = info_cell.children('p').eq(0)
    legal_representative = first_p.children('a').text()
    # "label: value" spans -- keep only the value part; '-' marks "missing".
    registered_capital = first_p("span:first").text().split(':')[-1].strip('-')
    date_of_establishment = first_p('span:last').text().split(':')[-1]

    contact_p = info_cell('p').eq(-3)
    # Clone before stripping child tags so the source row is not mutated;
    # the remaining bare text is the email field.
    stripped = contact_p.clone()
    stripped.children().remove()
    email = stripped.text().split(':')[-1].strip('-')
    phone = contact_p.find('span').text().split(':')[-1].strip(' ').strip('-')

    register_address = info_cell.find('p').eq(2).text().split(':')[-1]
    return dict(company_name=company_name,
                legal_representative=legal_representative,
                registered_capital=registered_capital,
                date_of_establishment=date_of_establishment,
                email=email,
                phone=phone,
                register_address=register_address)
예제 #39
0
    def parse(self, content: str):
        """Parse an HTML string into parsed objects stored on ``self``.

        Each top-level node is run through ``self.__parse__``; nodes that
        fail to parse are skipped. Returns ``self`` for chaining.
        """
        doc = PyQuery(content)
        parsed_items = []
        top_level = doc.contents()
        if len(doc.children()) == 0:
            # No element children at all: parse the root element itself.
            parsed_items.append(self.__parse__(doc[0]))
        else:
            for node in top_level:
                try:
                    result = self.__parse__(node)
                    if result:
                        parsed_items.append(result)
                except Exception:
                    # Best-effort: silently skip nodes the parser rejects.
                    pass

        self.parsed_objects = parsed_items
        return self
예제 #40
0
 def parseCategories(self, homepage_content):
     '''Parse all category paths directly from the home page.

     Per the original (translated) note: TOPS / BOTTOMS can be resolved to
     level-3 categories, the remaining level-1 menus only to level 2.
     NOTE(review): the code below special-cases DRESSES, not TOPS/BOTTOMS
     -- docstring and code disagree; confirm against the live site.
     '''
     doc = PyQuery(homepage_content)
     categoryList = []
     level1NodeList = doc('ul#TS_menu > li.subCatName')
     for level1Node in level1NodeList:
         level1NodeQ = PyQuery(level1Node)
         level1CateName = level1NodeQ.children('a').text().strip()
         # Non-product menus -- skip.
         if level1CateName == "WHAT'S NEW" or level1CateName == "Style Gallery":
             continue
         ####################################################################################################
         if level1CateName.upper() == 'DRESSES':
             level2NodeList = level1NodeQ.children('div.Second_ca > ul.loop_ul > li > div').find('li > a') # several <div>s under a single <li>
             for level2Node in level2NodeList:
                 categoryInfo = self.createCategory(PyQuery(level2Node))
                 categoryInfo.parent_categories = [level1CateName]
                 categoryList.append(categoryInfo)
             continue
         ####################################################################################################
         colNodeList = level1NodeQ.children('div.Second_ca > ul') # ul.loop_ul alone would pick up the SALE menu
         for colNode in colNodeList:
             level2NodeList = PyQuery(colNode).children('li')
             for level2Node in level2NodeList:
                 level2NodeQ = PyQuery(level2Node)
                 if level2NodeQ.children('div'): # i.e. does a level-3 category exist under this entry?
                     level2CateName = level2NodeQ.children('a').text()
                     level3NodeList = level2NodeQ.children('div').find('li > a')
                     for level3Node in level3NodeList:
                         categoryInfo = self.createCategory(PyQuery(level3Node))
                         categoryInfo.parent_categories = [level1CateName, level2CateName]
                         categoryList.append(categoryInfo)
                 else:
                     categoryInfo = self.createCategory(PyQuery(level2NodeQ.children('a')))
                     categoryInfo.parent_categories = [level1CateName]
                     categoryList.append(categoryInfo)
     return categoryList
예제 #41
0
def parseStorePage(product):
    """Scrape supplier contact info from a 1688.com store's contact page.

    Builds the contact-page URL from product['store_url'], fetches it
    (using the module-level ``store_info`` dict as a per-URL cache), and
    fills supplier_name, city, telephone, store_address and mobile on
    ``product``. Returns the same ``product`` dict.

    NOTE: Python 2 code (``dict.has_key``); the else branch below uses a
    different (3-space) indent step than the rest -- preserved as-is.
    """
    store_url = product['store_url']
#     index = store_url.find('1688.com')
#    store_url = store_url[:index]
    if store_url.endswith('/'):
        store_url = store_url[:-1]
    contact_url = store_url + '/page/contactinfo.htm'
    if store_info.has_key(contact_url):
        # Cache hit: copy the previously scraped fields onto this product.
        for key in store_info[contact_url].keys():
            product[key] = store_info[contact_url][key]
    else:
       content = fetchContent(contact_url)
       store_info[contact_url] = {}
       doc=PyQuery(content)
       product['supplier_name'] = doc('div.detail > div.contactSeller > span.disc').text()
       if not product['supplier_name']:
           # Alternate page layout: strip the <label> and take the rest.
           product['supplier_name'] = doc('div.detail > div.contactSeller').remove('label').text()
       store_info[contact_url]['supplier_name'] = product['supplier_name']

       product['city'] = doc('div.detail > div.address > span.disc').text()
       if not product['city']:
           product['city'] = doc('div.detail').find('div.address').remove('label').text()
       store_info[contact_url]['city'] = product['city']

       # Walk the <dl> contact rows; the <dt> label (containing wide spaces)
       # selects which field the <dd> value belongs to.
       product['mobile'], product['telephone'], product['store_address'] = '', '', ''
       for node in doc('div.contcat-desc > dl'):
           nodeQ = PyQuery(node)
           #print nodeQ.children('dt').text()
           if nodeQ.children('dt').text().strip() == u'电      话:':
               product['telephone'] = nodeQ.children('dd').text()
               store_info[contact_url]['telephone'] = product['telephone']
           if nodeQ.children('dt').text().strip() == u'地      址:':
               product['store_address'] = nodeQ.children('dd').text()
               store_info[contact_url]['store_address'] = product['store_address']
           if nodeQ.children('dt').text().strip() == u'移动电话:':
               product['mobile'] = nodeQ.children('dd').text()
               store_info[contact_url]['mobile'] = product['mobile']
    return product
예제 #42
0
 def parseNextPageUrl(self, category_page_content):
     """Return the 'Next' pagination link's href, or None when absent."""
     doc = PyQuery(category_page_content)
     for pageItem in doc('div.pagination:first > ul > li'):
         itemQ = PyQuery(pageItem)
         if itemQ.text().strip().startswith("Next"):
             return itemQ.children('a').attr('href')
예제 #43
0
def parseSupplierContactPage(m):
    """Scrape a 1688 supplier's contact page (plus rating-JSON and product
    listing pages) and write the fields onto dict ``m`` in place.

    Example page: http://yongjia.1688.com/shop/aofudianqi/page/contactinfo.htm?
    Fills: trade_medal, supply-grade, biz-type, contact, satisfication,
    products_count, active_products_count.
    """
    # Normalise the url so it ends with '?' (the anchor for re.sub below).
    # BUG FIX: the original tested m['url'].find('\?') > 0, which searches
    # for a literal backslash + '?' and never matches a real URL, so a '?'
    # was appended even when one was already present.
    if '?' not in m['url']:
        if m['url'].endswith("/"):
            m['url'] = m['url'][:-1]
        m['url'] = m['url'] + '?'
    # Build the contact-page url by replacing everything from the query on.
    contact_page_url = re.sub(r"\?.*$", '/page/contactinfo.htm', m['url'])
    content = fetchContent(contact_page_url)
    doc = PyQuery(content)
    # Satisfaction is rendered dynamically on the page, so it is fetched
    # from the rating JSON endpoint further below, not from this HTML.
    medal_img = doc('div.detail > div.trade-medal > span.disc > a.image > img')
    if medal_img.eq(0).attr('alt'):
        m['trade_medal'] = medal_img.eq(0).attr('alt')
    else:
        m['trade_medal'] = ''
    m['supply-grade'] = len(
        doc('div.detail > div.supply-grade > span.disc > a.image > img'))
    m['biz-type'] = doc('div.detail > div.biz-type > span').text()
    if not m['biz-type']:
        m['biz-type'] = doc('div.detail > div.smt-biz-type > span').text()
    # Collect every phone-like <dd> value ("电话" / "移动电话" rows). The
    # <dt> label contains odd whitespace, hence the loose substring match.
    phone_values = []
    for item in doc('div.contcat-desc > dl'):
        itemQ = PyQuery(item)
        label = itemQ.children('dt').text()
        if label.find(u"话") > 0:
            phone_values.append(itemQ.children('dd').text())
    m['contact'] = ', '.join(phone_values)
    # Satisfaction comes from the rating JSON, e.g.
    # http://rate.1688.com/stat/trade/winport.json?memberId=aofudianqi&sati=1
    # -> {"data":{"sati":{"satisfactionRate":0,"satisfaction":4.6,...}},...}
    shop_ids = re.findall(r'shop/(.*)/page', contact_page_url)
    if shop_ids:
        merchantId = shop_ids[0]
        stat_url = ('http://rate.1688.com/stat/trade/winport.json?memberId='
                    + merchantId + '&sati=1')
        json_data = json.loads(fetchContent(stat_url))
        m['satisfication'] = json_data['data']['sati']['satisfaction']
        # Total vs. actively-selling product counts, from the shop's offer
        # list pages (tradenumFilter=true restricts to products with sales).
        all_products_url = ('http://yiwu.1688.com/shop/' + merchantId
                            + '/page/offerlist.htm?tradenumFilter=true')
        active_product_url = ('http://yiwu.1688.com/shop/' + merchantId
                              + '/page/offerlist.htm')
        doc3 = PyQuery(fetchContent(all_products_url))
        m['products_count'] = extractNum(
            doc3('li[class="offer-list-tab-title current"] > a > em').text())
        if m['products_count'] == 0:
            # Alternate listing layout: read the count from the pager.
            m['products_count'] = doc3(
                'ul[data-sp="paging-a"] > li > em.offer-count').text()
        doc4 = PyQuery(fetchContent(active_product_url))
        m['active_products_count'] = extractNum(
            doc4('li[class="offer-list-tab-title current"] > a > em').text())
        if m['active_products_count'] == 0:
            m['active_products_count'] = doc4(
                'ul[data-sp="paging-a"] > li > em.offer-count').text()
    else:
        m['satisfication'] = ''
예제 #44
0
    def search(self, word):
        """Look up *word* on the dictionary site and return a result dict.

        Returns ``{"status": "success", "results": [...]}`` where each
        result carries word, pron, sound and definitions, or
        ``{"status": "error", "error_detail": "Nothing found."}`` when
        nothing matches.
        """
        # print(self.URL.format(word=word))
        response = requests.get(self.URL.format(word=word.replace(' ', '-')),
                                headers=headers)
        text = response.text

        # f = open('temp.txt')
        # text = f.read()
        # f.close()

        doc = PyQuery(text)
        results = []
        if ' ' in word:
            # Multi-word phrases: don't require pronunciation blocks.
            divs = doc('section div:has("section.entry-headword")')
        else:
            divs = doc(
                'section div:has("section.entry-headword"):has(".pron-spell-container"):has(".pron-ipa-content")'
            )
        if not divs:
            return {"status": 'error', "error_detail": "Nothing found."}

        for def_div in divs:
            def_div = PyQuery(def_div)
            # headword (rebinds the *word* parameter to the entry's title)
            word = def_div('h1,h2').text()

            # pronunciation: IPA text with spaces and slashes stripped,
            # or None when the entry has none
            pron = def_div('.pron-ipa-content').text()
            if pron == '':
                pron = None
            else:
                pron = pron.replace(' ', '').replace('/', '')

            # audio clip url (may be None)
            sound = def_div('audio source[type="audio/mpeg"]').attr('src')

            # definitions, grouped by part of speech (<h3>)
            definitions = []
            meaning_section = def_div('section:not(.entry-headword)')

            for section in meaning_section:
                # debug()
                section = PyQuery(section)
                word_type = section('h3').text()
                meanings = []
                meaning_divs = section(
                    '.default-content>div, .expandable-content>div')
                if not meaning_divs:
                    meaning_divs = section.children('div>div')
                for meaning_div in meaning_divs:
                    meaning_div = PyQuery(meaning_div)
                    # label = meaning_div('.luna-label')
                    # if label:
                    #     # print('xxx', label.text())
                    #     x = label.text()
                    #     meaning_div('.luna-labset').replaceWith(x)
                    #     # print(meaning_div)
                    # a = meaning_div('a')
                    # if a:
                    #     x = a.text()
                    #     meaning_div('a').replaceWith(x)
                    # decoration = meaning_div('.italic, .bold')
                    # if decoration:
                    #     # debug()
                    #     for _decoration in decoration:
                    #
                    #         # x = decoration.text()
                    #         _decoration.replaceWith(_decoration.text())
                    # text = meaning_div.children('span').clone().children().remove().end().text()

                    meaning = dict()
                    example = meaning_div('.luna-example').text()
                    if example:
                        meaning['example'] = example
                    sub_lis = meaning_div('li')
                    if sub_lis:
                        meaning['subs'] = list(
                            map(lambda x: PyQuery(x).text(), sub_lis))

                    # Remove examples and sub-items in place; the text left
                    # over is the definition itself (last char dropped --
                    # presumably trailing punctuation; confirm).
                    meaning_div('.luna-example').remove()
                    meaning_div('li').remove()
                    text = meaning_div.text()[:-1]
                    meaning['text'] = text

                    meanings.append(meaning)
                # print(len(meaning_divs))
                definitions.append(dict(word_type=word_type,
                                        meanings=meanings))
            # print(len(definitions))
            results.append(
                dict(word=word,
                     pron=pron,
                     sound=sound,
                     definitions=definitions))
        if results:
            return {"status": 'success', "results": results}
        else:
            return {"status": 'error', "error_detail": "Nothing found."}
예제 #45
0
def extract(dom, param_dict):
    """Extract [date, url, title] triples from *dom* using the selector
    chains configured in *param_dict*.

    Each of dom_head / sandwich / url / title / date is a comma-separated
    chain: a numeric token means ``.eq(index)``, any other token is a
    child selector (for url, the literal token 'href' reads the attribute;
    for date, a token containing 'attr' reads that attribute). Only rows
    whose normalised date falls inside today's window
    [str_today, end_today] (module-level bounds) are returned.

    NOTE: Python 2 code (uses the builtin ``cmp``).
    """
    res = []
    # dom head: first token is the root selector, the rest narrow it down.
    head_list = str(param_dict['dom_head']).strip().split(',')
    d_divs = dom(head_list[0])

    if len(head_list) > 1:
        for pos in range(1, len(head_list)):
            try:
                value = int(head_list[pos])
                d_divs = d_divs.eq(value)
            except:
                d_divs = d_divs.children(head_list[pos])

    for div in d_divs:
        d_div = PyQuery(div)
        # Optional per-row narrowing ('sandwich') before field extraction.
        if param_dict['sandwich'] != 'None':
            sandwich_list = str(param_dict['sandwich']).strip().split(',')
            for sandwich in sandwich_list:
                try:
                    positon = int(sandwich)
                    d_div = d_div.eq(positon)
                except:
                    d_div = d_div.children(sandwich)

        # Skip rows that don't contain the title element at all.
        header = str(param_dict['title']).strip().split(',')[0]
        if not d_div.children(header):
            continue

        # Resolve the url field.
        url_list = str(param_dict['url']).strip().split(',')
        url = d_div.children(url_list[0])
        for pos in range(1, len(url_list)):
            try:
                n_url = int(url_list[pos])
                url = url.eq(n_url)
            except:
                if url_list[pos] == 'href':
                    url = url.attr('href')
                    break
                else:
                    url = url.children(url_list[pos])

        # Absolutise relative urls against the configured domain.
        if 'www' not in url and 'http' not in url:
            match = re.search('^/', url)
            if match:
                url = param_dict['domain'] + url
            else:
                url = param_dict['domain'] + '/' + url

        if 'http://' not in url:
            url = 'http://' + url

        # Resolve the title field.
        title_list = str(param_dict['title']).strip().split(',')
        title = d_div
        for item in title_list:
            try:
                n_title = int(item)
                title = title.eq(n_title)
            except:
                title = title.children(item)

        title = title.text()
        # Resolve the date field; a token containing 'attr' reads an
        # attribute (first 20 chars) instead of element text.
        date_list = str(param_dict['date']).strip().split(',')
        date = d_div
        is_attr = False
        for item in date_list:
            try:
                n_item = int(item)
                date = date.eq(n_item)
            except:
                if 'attr' not in item:
                    date = date.children(item)
                else:
                    item = item[:item.find(':')]
                    date = date.attr(item)[:20].strip()
                    is_attr = True

        date = date if is_attr else date.text()
        if ' / ' in date: date = date.replace(' / ', '-')
        if '/' in date: date = date.replace('/', '-')

        # Normalise the many date formats into 'YYYY-MM-DD HH:MM'.
        if re.search(u'\d{4}-\d{1,2}-\d{1,2}', date):
            date = ''.join(x for x in date if ord(x) < 256).strip()
            # NOTE(review): rfind is the LAST occurrence of '201', although
            # the original comment claimed the first; works only for 201x
            # years -- confirm.
            start_index = date.rfind('201')
            end_index1 = date.rfind('-')
            end_index2 = date.rfind(':')
            end_index = end_index1 if end_index1 > end_index2 else end_index2
            date = date[start_index:end_index + 3]
            if len(date) == 10:
                # Date only: fill in the current local time.
                date = '%s %s' % (
                    date, time.strftime("%H:%M", time.localtime(time.time())))
        elif re.search(u'\d{1,2}-\d{1,2}-\d{4} \d{1,2}:\d{1,2}:\d{1,2}', date):
            # MM-DD-YYYY HH:MM:SS -> YYYY-MM-DD + time.
            arr_time = date.split(' ')
            arr_date = arr_time[0].split('-')
            date = '%s-%s-%s %s' % (arr_date[2], arr_date[0], arr_date[1],
                                    arr_time[1])
        else:
            try:
                # Convert a unix timestamp (seconds or milliseconds) to a
                # local date string.
                date_stamp = int(date)
                if date_stamp > 9999999999:
                    date_stamp = int(date[:10])

                x = time.localtime(date_stamp)
                date = time.strftime('%Y-%m-%d %H:%M', x)
            except:
                date = fomate_date_output(date)

        date = format_date_time(date)
        # Keep only rows dated today (16-char 'YYYY-MM-DD HH:MM' form).
        if len(date) == 16:
            if cmp(date, str_today) >= 0 and cmp(
                    date, end_today) <= 0 and len(title) > 0:
                res.append([date, url, title])

    return res
예제 #46
0
def test_mark_dirty():
    # mark_dirty on <b> should flag the element and its ancestor <a>,
    # but not descendants like <c>.
    node = PyQuery('<a data-riot-id="0"><b data-riot-id="0.0"><c data-riot-id="0.0.0"></c></b></a>')
    mark_dirty(node.children('b'))
    assert node.attr['data-riot-dirty'] == 'true'
    assert node.children('b').attr['data-riot-dirty'] == 'true'
    # NOTE(review): children('c') matches only DIRECT children of <a>; <c>
    # is a grandchild, so this selection is empty, attr returns None, and
    # the assert passes vacuously. find('c') was probably intended -- confirm.
    assert not node.children('c').attr['data-riot-dirty']