import re

from pyquery import PyQuery as PQ

# `net`, `crawler`, `tool` and `log` are project-local helper modules used by the
# crawler snippets below (assumed to be importable here).


def get_album_page(sub_path, page_count):
    album_pagination_url = "http://www.88mmw.com/%s/list_%s_%s.html" % (sub_path, SUB_PATH_LIST[sub_path], page_count)
    album_pagination_response = net.http_request(album_pagination_url, method="GET")
    result = {
        "album_info_list": [],  # info for every album on this page
        "is_over": False,  # whether this is the last pagination page
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    # decode the page (the site serves GBK)
    album_pagination_html = album_pagination_response.data.decode("GBK")
    # extract the album list; two page layouts exist
    album_list_selector = PQ(album_pagination_html).find("div.xxx li a")
    if album_list_selector.length == 0:
        album_list_selector = PQ(album_pagination_html).find("div.yyy li a")
    if album_list_selector.length == 0:
        raise crawler.CrawlerException("failed to extract album list from page\n%s" % album_pagination_html.encode("UTF-8"))
    for album_index in range(0, album_list_selector.length):
        result_album_info = {
            "album_title": "",  # album title
            "page_id": None,  # album page id
        }
        album_selector = album_list_selector.eq(album_index)
        # extract the album id from the link URL
        album_url = album_selector.attr("href")
        if not album_url:
            raise crawler.CrawlerException("failed to extract album URL from album list\n%s" % album_selector.html().encode("UTF-8"))
        album_id = album_url.split("/")[-2]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("failed to extract album id from album URL\n%s" % str(album_url))
        result_album_info["page_id"] = album_id
        # extract the album title, dropping the trailing "_共N张" photo-count suffix if present
        album_title = album_selector.attr("title").encode("UTF-8")
        if len(re.findall("_共\d*张", album_title)) == 1:
            result_album_info["album_title"] = album_title[:album_title.rfind("_共")]
        else:
            result_album_info["album_title"] = album_title
        result["album_info_list"].append(result_album_info)
    # check whether this is the last pagination page
    max_page_info = PQ(album_pagination_html).find("div.page a").eq(-1).text()
    if not max_page_info:
        raise crawler.CrawlerException("failed to extract total page count info\n%s" % album_pagination_html.encode("UTF-8"))
    max_page_count = tool.find_sub_string(max_page_info.encode("UTF-8"), "共", "页")
    if not crawler.is_integer(max_page_count):
        raise crawler.CrawlerException("failed to extract total page count\n%s" % max_page_info.encode("UTF-8"))
    result["is_over"] = page_count >= int(max_page_count)
    return result
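# A minimal usage sketch of the pagination loop above. Hypothetical: the function
# name `get_all_albums` is not part of the original code; it only assumes the
# `is_over` flag returned by get_album_page.
def get_all_albums(sub_path):
    album_info_list = []
    page_count = 1
    while True:
        pagination_result = get_album_page(sub_path, page_count)
        album_info_list.extend(pagination_result["album_info_list"])
        if pagination_result["is_over"]:
            break
        page_count += 1
    return album_info_list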
def get_album_page(album_id):
    album_url = "http://www.ugirls.com/Content/List/Magazine-%s.html" % album_id
    album_response = net.http_request(album_url, method="GET")
    result = {
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
        "model_name": "",  # model name
    }
    if album_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_response.status))
    # the site serves "该页面不存在,或者已经被删除!" for removed albums
    if album_response.data.find("该页面不存在,或者已经被删除!") >= 0:
        result["is_delete"] = True
        return result
    # extract the model name
    model_name = PQ(album_response.data).find("div.ren_head div.ren_head_c a").attr("title")
    if not model_name:
        raise crawler.CrawlerException("failed to extract model name from model info\n%s" % album_response.data)
    result["model_name"] = model_name.encode("UTF-8").strip()
    # extract all image URLs, upgrading the medium-size thumbnails to the large versions
    image_list_selector = PQ(album_response.data).find("ul#myGallery li img")
    if image_list_selector.length == 0:
        raise crawler.CrawlerException("failed to match image URLs in page\n%s" % album_response.data)
    for image_index in range(0, image_list_selector.length):
        image_url = image_list_selector.eq(image_index).attr("src")
        if image_url.find("_magazine_web_m.") == -1:
            raise crawler.CrawlerException("image URL does not match the expected pattern\n%s" % image_url)
        result["image_url_list"].append(image_url.replace("_magazine_web_m.", "_magazine_web_l."))
    return result
import os

from pyquery import PyQuery


def _dump_slide(slide, idx_slide, outputdir):
    # serialize one slide to a numbered NNN.html file, prefixed with the template directive
    html = PyQuery(slide).html()
    slide_name = '%03d.html' % idx_slide
    print "dump slide {} in dir {}".format(idx_slide, outputdir)
    with open(os.path.join(outputdir, slide_name), 'w+') as dump:
        dump.write("@template:content_bare\n")
        dump.write(html.encode('utf-8', 'replace'))
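# A minimal usage sketch for _dump_slide. Hypothetical: the deck file name, the
# `_dump_deck` helper and the "div.slide" selector are assumptions, not taken
# from the original project.
def _dump_deck(deck_path, outputdir):
    with open(deck_path) as deck_file:
        doc = PyQuery(deck_file.read())
    for idx_slide, slide in enumerate(doc('div.slide')):
        _dump_slide(slide, idx_slide, outputdir)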
def getData(self, selector):
    """Return all the text in the area delimited by the selector."""
    tags = self.sorceCode.find(selector)
    text = PQ(tags.html()).text()
    text = text.encode(self.encoding, 'xmlcharrefreplace')
    return text
def get_album_page(album_id):
    page_count = max_page_count = 1
    result = {
        "album_title": "",  # album title
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
    }
    while page_count <= max_page_count:
        album_pagination_url = "http://www.youzi4.cc/mm/%s/%s_%s.html" % (album_id, album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url, method="GET")
        # a 404 on the first page means the album has been deleted
        if album_pagination_response.status == 404 and page_count == 1:
            result["is_delete"] = True
            return result
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException("page %s " % page_count + crawler.request_failre(album_pagination_response.status))
        if page_count == 1:
            # extract the album title
            album_title = PQ(album_pagination_response.data.decode("UTF-8")).find("meta[name='description']").attr("content")
            if not album_title:
                raise crawler.CrawlerException("failed to extract title from page\n%s" % album_pagination_response.data)
            result["album_title"] = album_title.encode("UTF-8")
        # extract the image URLs on this page
        image_list_selector = PQ(album_pagination_response.data).find("div.articleV4Body a img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException("page %s: failed to match image URLs in page\n%s" % (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(str(image_list_selector.eq(image_index).attr("src")))
        # extract the total page count from the pagination links
        pagination_list_selector = PQ(album_pagination_response.data).find("ul.articleV4Page a.page-a")
        if pagination_list_selector.length > 0:
            for pagination_index in range(0, pagination_list_selector.length):
                temp_page_count = pagination_list_selector.eq(pagination_index).html()
                if crawler.is_integer(temp_page_count):
                    max_page_count = max(int(temp_page_count), max_page_count)
        else:
            # single-page albums have no pagination bar; on later pages it must exist
            if page_count > 1:
                raise crawler.CrawlerException("page %s: failed to match pagination info in page\n%s" % (page_count, album_pagination_response.data))
        page_count += 1
    return result
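# A minimal download-loop sketch consuming the result above. Hypothetical:
# `download_album` is not part of the original code, and it assumes that
# `net.http_request` returns the raw image bytes in `.data` for image URLs too.
import os

def download_album(album_id, outputdir):
    album_result = get_album_page(album_id)
    if album_result["is_delete"]:
        return
    for image_index, image_url in enumerate(album_result["image_url_list"]):
        image_response = net.http_request(image_url, method="GET")
        if image_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            continue
        with open(os.path.join(outputdir, "%03d.jpg" % (image_index + 1)), "wb") as image_file:
            image_file.write(image_response.data)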
# Excerpt of a downloader class method; assumes `import re, logging, requests`
# and `from pyquery import PyQuery` at module level.
def qidian(self):
    # self.url may be a list of book page URLs
    if isinstance(self.url, list):
        for url in self.url:
            try:
                # extract the book title from the page and build the epub file name
                bookname = PyQuery(requests.get(url).content)('h1 > em').text().strip().replace(" ", "")
                name = bookname.encode("utf-8") + ".epub"
                self.exists(name)
                # the book id in the page URL is reused in the epub download URL
                bookid = re.search("\d+", url).group()
                download_url = "http://download.qidian.com/epub/%s.epub" % (bookid)
                content = requests.get(download_url).content
                self.save(name, content)
            except Exception, e:
                logging.warning("download error [%s]: %s" % (url, e))
    # tail of send_mail(): deliver the message and close the SMTP session
    server.sendmail(mailFrom, rcptToList, message.as_string())
    server.quit()


if '__main__' == __name__:
    configFile = 'config.cfg'
    novels = PyQuery(filename=configFile)
    message = ''
    for novel in novels('novel'):
        name = PyQuery(novel)('name').text()
        url = PyQuery(novel)('url').text()
        prefix = PyQuery(novel)('prefix').text()
        next = int(PyQuery(novel)('next').text())
        rcptToList = []
        for addr in PyQuery(novel)('emails>email'):
            rcptToList.append(PyQuery(addr).text())
        print rcptToList
        html = PyQuery(url=url)
        nextUrl = None
        # find the forum thread whose title contains the next chapter number
        for i in html('div.threadlist_title.pull_left.j_th_tit.member_thread_title_frs > a.j_th_tit'):
            if i.text.find(number2chinese(next)) != -1:
                nextUrl = prefix + PyQuery(i).attr('href')
                break
        if nextUrl:
            next += 1
            PyQuery(novel)('next').text(str(next))
            # pull the chapter body and convert <br/> tags back to newlines
            text = PyQuery(url=nextUrl)('cc:first > div:first').html()
            text = text.replace(u'<br/>', '\n').strip()
            # subject reads "<name> 第N章" ("Chapter N")
            subject = name + u' ' + u'第' + unicode(str(next)) + u'章'
            send_mail('*****@*****.**', rcptToList, subject.encode('utf8'), text.encode('utf8'))
    # persist the updated chapter counters back to the config file
    open(configFile, 'wt').write(str(novels))
def get_album_page(album_id):
    page_count = max_page_count = 1
    image_count = 0
    result = {
        "album_title": "",  # album title
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
    }
    while page_count <= max_page_count:
        album_pagination_url = "https://www.nvshens.com/g/%s/%s.html" % (album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url, method="GET")
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException("page %s " % page_count + crawler.request_failre(album_pagination_response.status))
        if page_count == 1:
            # check whether the album has been deleted (the site serves a "page not found" title)
            result["is_delete"] = album_pagination_response.data.find("<title>该页面未找到-宅男女神</title>") >= 0
            if result["is_delete"]:
                return result
            # extract the total image count, e.g. "20张照片"
            album_info = PQ(album_pagination_response.data).find("#dinfo span").text()
            if not album_info or album_info.encode("UTF-8").find("张照片") == -1:
                raise crawler.CrawlerException("failed to extract image count info from page\n%s" % album_pagination_response.data)
            image_count = album_info.encode("UTF-8").replace("张照片", "")
            if not crawler.is_integer(image_count):
                raise crawler.CrawlerException("failed to extract image count from page\n%s" % album_pagination_response.data)
            image_count = int(image_count)
            if image_count == 0:
                result["is_delete"] = True
                return result
            # extract the album title
            result["album_title"] = str(tool.find_sub_string(album_pagination_response.data, '<h1 id="htilte">', "</h1>")).strip()
            if not result["album_title"]:
                raise crawler.CrawlerException("failed to extract title from page\n%s" % album_pagination_response.data)
        # extract the image URLs; two page layouts exist
        image_list_selector = PQ(album_pagination_response.data).find("#hgallery img")
        if image_list_selector.length == 0:
            image_list_selector = PQ(album_pagination_response.data).find("#pgallery img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException("page %s: failed to match image URLs in page\n%s" % (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(str(image_list_selector.eq(image_index).attr("src")))
        # extract the total page count from the pagination links
        pagination_html = PQ(album_pagination_response.data).find("#pages").html()
        if pagination_html:
            page_count_find = re.findall('/g/' + str(album_id) + '/([\d]*).html', pagination_html)
            if len(page_count_find) != 0:
                max_page_count = max(map(int, page_count_find))
            else:
                log.error("album %s page %s: unexpected pagination" % (album_id, page_count))
        page_count += 1
    # make sure the advertised total matches the number of URLs actually collected
    if image_count != len(result["image_url_list"]):
        raise crawler.CrawlerException("advertised image count %s does not match the %s URLs collected" % (image_count, len(result["image_url_list"])))
    return result
import sys

from pyquery import PyQuery


def prn_tbl_sec(index, node):
    global node_id, curr_dep, last_dep, depth, opTyp
    if index != 0:
        print >>sys.stderr, "...Start of PART, depth=" + str(depth)
    ce = PyQuery(node)
    # print the part heading as the containing node, alternating left/right placement
    partLst = ce.prevAll('h3')
    partTxt = PyQuery(partLst[len(partLst) - 1]).text()
    if index % 2 == 0:
        print '<node CREATED="1347382439772" ID="PartID_' + str(index) + '" POSITION="left" MODIFIED="1347382510988" TEXT="' + partTxt.encode('utf-8') + '">'
    else:
        print '<node CREATED="1347382439772" ID="PartID_' + str(index) + '" POSITION="right" MODIFIED="1347382510988" TEXT="' + partTxt.encode('utf-8') + '">'
    rows = ce('tr')
    rows.each(prn_mm_for_sec)
    # print the closing tags for this table
    print >>sys.stderr, "...End of PART, depth=" + str(depth)
    for i in range(0, depth):
        print '</node>'
    print '</node>'  # for the part-heading containing node
    depth = 0
    last_dep = 3
def prn_mm_for_sec(index, node):
    global last_rowTxt, node_id, curr_dep, last_dep, depth
    ce = PyQuery(node)
    rowTxt = ce.text()
    cols = ce('td')
    curr_dep = len(cols)
    # first close the previous node if required
    if curr_dep == 1:
        # a blank line ends a section or sub-section
        print >>sys.stderr, "...Blank line: End of NODE, depth=" + str(depth)
        print >>sys.stderr, "......Last Row Text:" + last_rowTxt
        for i in range(0, depth):
            print '</node>'
        depth = 0
    elif curr_dep == (last_dep + 1):
        # a new nesting level starts: just increase the depth
        depth = depth + 1
        if index == 0:
            print >>sys.stderr, "...Start of new level-2 node: " + rowTxt
    elif (curr_dep + 1) == last_dep:
        # a nesting level has ended: decrease the depth and print two closing tags
        depth = depth - 1
        print '</node>'
        print '</node>'
    elif curr_dep == last_dep:
        # same level: just close the previous node tag
        print '</node>'
    elif curr_dep >= 3 and last_dep == 1:
        # start of a new level-1 node: nothing to close
        print >>sys.stderr, "...Start of new level-2 node: " + rowTxt
        depth = 1
    else:
        print >>sys.stderr, "...Curr dep. is neither one more nor less than prev. depth"
        print >>sys.stderr, "......Curr. dep:" + str(curr_dep) + " last dep:" + str(last_dep)
        print >>sys.stderr, "......Last Row Text:" + last_rowTxt
        print >>sys.stderr, "......Curr. Row Text:" + rowTxt
    # next print the text for the current node if the row is not empty
    if curr_dep >= 2:
        nodeTxt = PyQuery(cols[curr_dep - 2]).text() + " " + PyQuery(cols[curr_dep - 1]).text()
        print '<node CREATED="1347382439772" ID="ID_' + str(node_id) + '" MODIFIED="1347382510988" TEXT="' + nodeTxt.encode('utf-8') + '">'
    last_dep = curr_dep
    last_rowTxt = rowTxt
    node_id = node_id + 1
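# A minimal driver sketch for the two exporters above. Hypothetical: the input
# file name, the top-level "table" selector, the initial values of the shared
# counters and the FreeMind <map> wrapper are assumptions, not taken from the
# original script; only the two-argument .each() callback style mirrors the
# usage above.
if __name__ == '__main__':
    # module-level state shared by prn_tbl_sec/prn_mm_for_sec via `global`
    node_id, curr_dep, last_dep, depth, last_rowTxt = 0, 0, 3, 0, ''
    doc = PyQuery(open('outline.html').read())
    print '<map version="0.9.0">'
    doc('table').each(prn_tbl_sec)
    print '</map>'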