import re

from pyquery import PyQuery as PQ

# `net`, `crawler`, `tool` and `log` are project-local helper modules used by the
# crawler snippets below (assumed to be importable here).


def get_album_page(sub_path, page_count):
    album_pagination_url = "http://www.88mmw.com/%s/list_%s_%s.html" % (sub_path, SUB_PATH_LIST[sub_path], page_count)
    album_pagination_response = net.http_request(album_pagination_url, method="GET")
    result = {
        "album_info_list": [],  # info for every album on this page
        "is_over": False,  # whether this is the last pagination page
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    # decode the page (the site serves GBK)
    album_pagination_html = album_pagination_response.data.decode("GBK")
    # extract the album list; two page layouts exist
    album_list_selector = PQ(album_pagination_html).find("div.xxx li a")
    if album_list_selector.length == 0:
        album_list_selector = PQ(album_pagination_html).find("div.yyy li a")
    if album_list_selector.length == 0:
        raise crawler.CrawlerException("failed to extract album list from page\n%s" % album_pagination_html.encode("UTF-8"))
    for album_index in range(0, album_list_selector.length):
        result_album_info = {
            "album_title": "",  # album title
            "page_id": None,  # album page id
        }
        album_selector = album_list_selector.eq(album_index)
        # extract the album id from the link URL
        album_url = album_selector.attr("href")
        if not album_url:
            raise crawler.CrawlerException("failed to extract album URL from album list\n%s" % album_selector.html().encode("UTF-8"))
        album_id = album_url.split("/")[-2]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("failed to extract album id from album URL\n%s" % str(album_url))
        result_album_info["page_id"] = album_id
        # extract the album title, dropping the trailing "_共N张" photo-count suffix if present
        album_title = album_selector.attr("title").encode("UTF-8")
        if len(re.findall("_共\d*张", album_title)) == 1:
            result_album_info["album_title"] = album_title[:album_title.rfind("_共")]
        else:
            result_album_info["album_title"] = album_title
        result["album_info_list"].append(result_album_info)
    # check whether this is the last pagination page
    max_page_info = PQ(album_pagination_html).find("div.page a").eq(-1).text()
    if not max_page_info:
        raise crawler.CrawlerException("failed to extract total page count info\n%s" % album_pagination_html.encode("UTF-8"))
    max_page_count = tool.find_sub_string(max_page_info.encode("UTF-8"), "共", "页")
    if not crawler.is_integer(max_page_count):
        raise crawler.CrawlerException("failed to extract total page count\n%s" % max_page_info.encode("UTF-8"))
    result["is_over"] = page_count >= int(max_page_count)
    return result
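# A minimal usage sketch of the pagination loop above. Hypothetical: the function
# name `get_all_albums` is not part of the original code; it only assumes the
# `is_over` flag returned by get_album_page.
def get_all_albums(sub_path):
    album_info_list = []
    page_count = 1
    while True:
        pagination_result = get_album_page(sub_path, page_count)
        album_info_list.extend(pagination_result["album_info_list"])
        if pagination_result["is_over"]:
            break
        page_count += 1
    return album_info_list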
def get_album_page(album_id):
    album_url = "http://www.ugirls.com/Content/List/Magazine-%s.html" % album_id
    album_response = net.http_request(album_url, method="GET")
    result = {
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
        "model_name": "",  # model name
    }
    if album_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_response.status))
    # the site serves "该页面不存在,或者已经被删除!" for removed albums
    if album_response.data.find("该页面不存在,或者已经被删除!") >= 0:
        result["is_delete"] = True
        return result
    # extract the model name
    model_name = PQ(album_response.data).find("div.ren_head div.ren_head_c a").attr("title")
    if not model_name:
        raise crawler.CrawlerException("failed to extract model name from model info\n%s" % album_response.data)
    result["model_name"] = model_name.encode("UTF-8").strip()
    # extract all image URLs, upgrading the medium-size thumbnails to the large versions
    image_list_selector = PQ(album_response.data).find("ul#myGallery li img")
    if image_list_selector.length == 0:
        raise crawler.CrawlerException("failed to match image URLs in page\n%s" % album_response.data)
    for image_index in range(0, image_list_selector.length):
        image_url = image_list_selector.eq(image_index).attr("src")
        if image_url.find("_magazine_web_m.") == -1:
            raise crawler.CrawlerException("image URL does not match the expected pattern\n%s" % image_url)
        result["image_url_list"].append(image_url.replace("_magazine_web_m.", "_magazine_web_l."))
    return result
import os

from pyquery import PyQuery


def _dump_slide(slide, idx_slide, outputdir):
    # serialize one slide to a numbered NNN.html file, prefixed with the template directive
    html = PyQuery(slide).html()
    slide_name = '%03d.html' % idx_slide
    print "dump slide {} in dir {}".format(idx_slide, outputdir)
    with open(os.path.join(outputdir, slide_name), 'w+') as dump:
        dump.write("@template:content_bare\n")
        dump.write(html.encode('utf-8', 'replace'))
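# A minimal usage sketch for _dump_slide. Hypothetical: the deck file name, the
# `_dump_deck` helper and the "div.slide" selector are assumptions, not taken
# from the original project.
def _dump_deck(deck_path, outputdir):
    with open(deck_path) as deck_file:
        doc = PyQuery(deck_file.read())
    for idx_slide, slide in enumerate(doc('div.slide')):
        _dump_slide(slide, idx_slide, outputdir)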
def getData(self, selector):
    """Return all the text in the area delimited by the selector."""
    tags = self.sorceCode.find(selector)
    text = PQ(tags.html()).text()
    text = text.encode(self.encoding, 'xmlcharrefreplace')
    return text
def get_album_page(album_id):
    page_count = max_page_count = 1
    result = {
        "album_title": "",  # album title
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
    }
    while page_count <= max_page_count:
        album_pagination_url = "http://www.youzi4.cc/mm/%s/%s_%s.html" % (album_id, album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url, method="GET")
        # a 404 on the first page means the album has been deleted
        if album_pagination_response.status == 404 and page_count == 1:
            result["is_delete"] = True
            return result
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException("page %s " % page_count + crawler.request_failre(album_pagination_response.status))
        if page_count == 1:
            # extract the album title
            album_title = PQ(album_pagination_response.data.decode("UTF-8")).find("meta[name='description']").attr("content")
            if not album_title:
                raise crawler.CrawlerException("failed to extract title from page\n%s" % album_pagination_response.data)
            result["album_title"] = album_title.encode("UTF-8")
        # extract the image URLs on this page
        image_list_selector = PQ(album_pagination_response.data).find("div.articleV4Body a img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException("page %s: failed to match image URLs in page\n%s" % (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(str(image_list_selector.eq(image_index).attr("src")))
        # extract the total page count from the pagination links
        pagination_list_selector = PQ(album_pagination_response.data).find("ul.articleV4Page a.page-a")
        if pagination_list_selector.length > 0:
            for pagination_index in range(0, pagination_list_selector.length):
                temp_page_count = pagination_list_selector.eq(pagination_index).html()
                if crawler.is_integer(temp_page_count):
                    max_page_count = max(int(temp_page_count), max_page_count)
        else:
            # single-page albums have no pagination bar; on later pages it must exist
            if page_count > 1:
                raise crawler.CrawlerException("page %s: failed to match pagination info in page\n%s" % (page_count, album_pagination_response.data))
        page_count += 1
    return result
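# A minimal download-loop sketch consuming the result above. Hypothetical:
# `download_album` is not part of the original code, and it assumes that
# `net.http_request` returns the raw image bytes in `.data` for image URLs too.
import os

def download_album(album_id, outputdir):
    album_result = get_album_page(album_id)
    if album_result["is_delete"]:
        return
    for image_index, image_url in enumerate(album_result["image_url_list"]):
        image_response = net.http_request(image_url, method="GET")
        if image_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            continue
        with open(os.path.join(outputdir, "%03d.jpg" % (image_index + 1)), "wb") as image_file:
            image_file.write(image_response.data)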
# Excerpt of a downloader class method; assumes `import re, logging, requests`
# and `from pyquery import PyQuery` at module level.
def qidian(self):
    # self.url may be a list of book page URLs
    if isinstance(self.url, list):
        for url in self.url:
            try:
                # extract the book title from the page and build the epub file name
                bookname = PyQuery(requests.get(url).content)('h1 > em').text().strip().replace(" ", "")
                name = bookname.encode("utf-8") + ".epub"
                self.exists(name)
                # the book id in the page URL is reused in the epub download URL
                bookid = re.search("\d+", url).group()
                download_url = "http://download.qidian.com/epub/%s.epub" % (bookid)
                content = requests.get(download_url).content
                self.save(name, content)
            except Exception, e:
                logging.warning("download error [%s]: %s" % (url, e))
    # tail of send_mail(): deliver the message and close the SMTP session
    server.sendmail(mailFrom, rcptToList, message.as_string())
    server.quit()


if '__main__' == __name__:
    configFile = 'config.cfg'
    novels = PyQuery(filename=configFile)
    message = ''
    for novel in novels('novel'):
        name = PyQuery(novel)('name').text()
        url = PyQuery(novel)('url').text()
        prefix = PyQuery(novel)('prefix').text()
        next = int(PyQuery(novel)('next').text())
        rcptToList = []
        for addr in PyQuery(novel)('emails>email'):
            rcptToList.append(PyQuery(addr).text())
        print rcptToList
        html = PyQuery(url=url)
        nextUrl = None
        # find the forum thread whose title contains the next chapter number
        for i in html('div.threadlist_title.pull_left.j_th_tit.member_thread_title_frs > a.j_th_tit'):
            if i.text.find(number2chinese(next)) != -1:
                nextUrl = prefix + PyQuery(i).attr('href')
                break
        if nextUrl:
            next += 1
            PyQuery(novel)('next').text(str(next))
            # pull the chapter body and convert <br/> tags back to newlines
            text = PyQuery(url=nextUrl)('cc:first > div:first').html()
            text = text.replace(u'<br/>', '\n').strip()
            # subject reads "<name> 第N章" ("Chapter N")
            subject = name + u' ' + u'第' + unicode(str(next)) + u'章'
            send_mail('*****@*****.**', rcptToList, subject.encode('utf8'), text.encode('utf8'))
    # persist the updated chapter counters back to the config file
    open(configFile, 'wt').write(str(novels))
def get_album_page(album_id):
    page_count = max_page_count = 1
    image_count = 0
    result = {
        "album_title": "",  # album title
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
    }
    while page_count <= max_page_count:
        album_pagination_url = "https://www.nvshens.com/g/%s/%s.html" % (album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url, method="GET")
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException("page %s " % page_count + crawler.request_failre(album_pagination_response.status))
        if page_count == 1:
            # check whether the album has been deleted (the site serves a "page not found" title)
            result["is_delete"] = album_pagination_response.data.find("<title>该页面未找到-宅男女神</title>") >= 0
            if result["is_delete"]:
                return result
            # extract the total image count, e.g. "20张照片"
            album_info = PQ(album_pagination_response.data).find("#dinfo span").text()
            if not album_info or album_info.encode("UTF-8").find("张照片") == -1:
                raise crawler.CrawlerException("failed to extract image count info from page\n%s" % album_pagination_response.data)
            image_count = album_info.encode("UTF-8").replace("张照片", "")
            if not crawler.is_integer(image_count):
                raise crawler.CrawlerException("failed to extract image count from page\n%s" % album_pagination_response.data)
            image_count = int(image_count)
            if image_count == 0:
                result["is_delete"] = True
                return result
            # extract the album title
            result["album_title"] = str(tool.find_sub_string(album_pagination_response.data, '<h1 id="htilte">', "</h1>")).strip()
            if not result["album_title"]:
                raise crawler.CrawlerException("failed to extract title from page\n%s" % album_pagination_response.data)
        # extract the image URLs; two page layouts exist
        image_list_selector = PQ(album_pagination_response.data).find("#hgallery img")
        if image_list_selector.length == 0:
            image_list_selector = PQ(album_pagination_response.data).find("#pgallery img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException("page %s: failed to match image URLs in page\n%s" % (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(str(image_list_selector.eq(image_index).attr("src")))
        # extract the total page count from the pagination links
        pagination_html = PQ(album_pagination_response.data).find("#pages").html()
        if pagination_html:
            page_count_find = re.findall('/g/' + str(album_id) + '/([\d]*).html', pagination_html)
            if len(page_count_find) != 0:
                max_page_count = max(map(int, page_count_find))
            else:
                log.error("album %s page %s: unexpected pagination" % (album_id, page_count))
        page_count += 1
    # make sure the advertised total matches the number of URLs actually collected
    if image_count != len(result["image_url_list"]):
        raise crawler.CrawlerException("advertised image count %s does not match the %s URLs collected" % (image_count, len(result["image_url_list"])))
    return result
import sys

from pyquery import PyQuery


def prn_tbl_sec(index, node):
    global node_id, curr_dep, last_dep, depth, opTyp
    if index != 0:
        print >>sys.stderr, "...Start of PART, depth=" + str(depth)
    ce = PyQuery(node)
    # print the part heading as the containing node, alternating left/right placement
    partLst = ce.prevAll('h3')
    partTxt = PyQuery(partLst[len(partLst) - 1]).text()
    if index % 2 == 0:
        print '<node CREATED="1347382439772" ID="PartID_' + str(index) + '" POSITION="left" MODIFIED="1347382510988" TEXT="' + partTxt.encode('utf-8') + '">'
    else:
        print '<node CREATED="1347382439772" ID="PartID_' + str(index) + '" POSITION="right" MODIFIED="1347382510988" TEXT="' + partTxt.encode('utf-8') + '">'
    rows = ce('tr')
    rows.each(prn_mm_for_sec)
    # print the closing tags for this table
    print >>sys.stderr, "...End of PART, depth=" + str(depth)
    for i in range(0, depth):
        print '</node>'
    print '</node>'  # for the part-heading containing node
    depth = 0
    last_dep = 3
def prn_mm_for_sec(index, node):
    global last_rowTxt, node_id, curr_dep, last_dep, depth
    ce = PyQuery(node)
    rowTxt = ce.text()
    cols = ce('td')
    curr_dep = len(cols)
    # first close the previous node if required
    if curr_dep == 1:
        # a blank line ends a section or sub-section
        print >>sys.stderr, "...Blank line: End of NODE, depth=" + str(depth)
        print >>sys.stderr, "......Last Row Text:" + last_rowTxt
        for i in range(0, depth):
            print '</node>'
        depth = 0
    elif curr_dep == (last_dep + 1):
        # a new nesting level starts: just increase the depth
        depth = depth + 1
        if index == 0:
            print >>sys.stderr, "...Start of new level-2 node: " + rowTxt
    elif (curr_dep + 1) == last_dep:
        # a nesting level has ended: decrease the depth and print two closing tags
        depth = depth - 1
        print '</node>'
        print '</node>'
    elif curr_dep == last_dep:
        # same level: just close the previous node tag
        print '</node>'
    elif curr_dep >= 3 and last_dep == 1:
        # start of a new level-1 node: nothing to close
        print >>sys.stderr, "...Start of new level-2 node: " + rowTxt
        depth = 1
    else:
        print >>sys.stderr, "...Curr dep. is neither one more nor less than prev. depth"
        print >>sys.stderr, "......Curr. dep:" + str(curr_dep) + " last dep:" + str(last_dep)
        print >>sys.stderr, "......Last Row Text:" + last_rowTxt
        print >>sys.stderr, "......Curr. Row Text:" + rowTxt
    # next print the text for the current node if the row is not empty
    if curr_dep >= 2:
        nodeTxt = PyQuery(cols[curr_dep - 2]).text() + " " + PyQuery(cols[curr_dep - 1]).text()
        print '<node CREATED="1347382439772" ID="ID_' + str(node_id) + '" MODIFIED="1347382510988" TEXT="' + nodeTxt.encode('utf-8') + '">'
    last_dep = curr_dep
    last_rowTxt = rowTxt
    node_id = node_id + 1
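# A minimal driver sketch for the two exporters above. Hypothetical: the input
# file name, the top-level "table" selector, the initial values of the shared
# counters and the FreeMind <map> wrapper are assumptions, not taken from the
# original script; only the two-argument .each() callback style mirrors the
# usage above.
if __name__ == '__main__':
    # module-level state shared by prn_tbl_sec/prn_mm_for_sec via `global`
    node_id, curr_dep, last_dep, depth, last_rowTxt = 0, 0, 3, 0, ''
    doc = PyQuery(open('outline.html').read())
    print '<map version="0.9.0">'
    doc('table').each(prn_tbl_sec)
    print '</map>'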