def get_change_info_detail(self, match_feature, detail):
    before = ''
    after = ''
    if detail is None:
        return before, after

    def get_change_list_detail(table):
        # Join every "name:value" pair found in the table rows.
        tr_list = table.find('tr').items()
        position_list = []
        for tr in tr_list:
            td_list = tr.find('td')
            if len(td_list) < 3:
                continue
            position_list.append(td_list.eq(1).text().strip('*').strip() + ':' + td_list.eq(2).text())
        return ','.join(position_list)

    for item in detail:
        feature = item.get('match_feature')
        if feature is None:
            continue
        if feature.strip() not in match_feature and match_feature not in feature:
            continue
        text = item.get('text')
        if text is None or text.strip() == '':
            return before, after
        # Tables at index 1 and 2 hold the before/after change details.
        table_list = PyQuery(text, parser='html').find('.table-result')
        before = get_change_list_detail(table_list.eq(1))
        after = get_change_list_detail(table_list.eq(2))
        break
    return before, after
def search(self, word):
    response = requests.get(self.URL.format(word=word))
    text = response.text
    doc = PyQuery(text)
    results = []
    table = doc("table.school-course")
    if table:
        table = PyQuery(table)
        for tr in table('tr'):
            tr = PyQuery(tr)
            if tr('th'):  # skip the header row
                continue
            td = PyQuery(tr('td'))
            result = {
                # rstrip() also covers the old trailing-space check without
                # raising IndexError on an empty cell
                'word': td.eq(0).text().split('(')[0].rstrip(),
                'meaning': td.eq(1).text()
            }
            results.append(result)
    if results:
        return {"status": 'success', "results": results}
    else:
        return {"status": 'error', "error_detail": "Nothing found."}
def get_market_game_trade_card_price(game_id, login_cookie):
    cookies_list = {"steamLogin": login_cookie}
    market_search_url = "http://steamcommunity.com/market/search/render/"
    market_search_url += "?query=&count=20&appid=753&category_753_Game[0]=tag_app_%s&category_753_cardborder[0]=tag_cardborder_0" % game_id
    market_search_response = net.http_request(market_search_url, method="GET", cookies_list=cookies_list, json_decode=True)
    if market_search_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(market_search_response.status))
    market_item_list = {}
    if not crawler.check_sub_key(("success", "results_html"), market_search_response.json_data):
        raise crawler.CrawlerException("response is missing the 'success' or 'results_html' field\n%s" % market_search_response.json_data)
    if market_search_response.json_data["success"] is not True:
        raise crawler.CrawlerException("unexpected value for the 'success' field\n%s" % market_search_response.json_data)
    card_selector = PQ(market_search_response.json_data["results_html"]).find(".market_listing_row_link")
    for index in range(0, card_selector.length):
        card_name = card_selector.eq(index).find(".market_listing_item_name").text()
        card_min_price = card_selector.eq(index).find("span.normal_price span.normal_price").text().encode("UTF-8").replace("¥ ", "")
        market_item_list[card_name] = card_min_price
    # e.g. {'Pamu': '1.77', 'Fumi (Trading Card)': '2.14', 'Mio (Trading Card)': '1.33', ...}
    return market_item_list
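# Usage sketch for the Steam market lookup above: both arguments are
# placeholders (a real appid whose trading cards you want, and the value of
# your steamLogin cookie). The returned dict maps card name -> lowest price.
card_prices = get_market_game_trade_card_price("459820", "<steamLogin cookie value>")
for card_name in card_prices:
    print(card_name + ": " + card_prices[card_name])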
def get_album_page(album_id):
    page_count = max_page_count = 1
    result = {
        "album_title": "",  # album title
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
    }
    while page_count <= max_page_count:
        album_pagination_url = "http://www.youzi4.cc/mm/%s/%s_%s.html" % (album_id, album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url, method="GET")
        # a 404 on the first page means the album has been deleted
        if album_pagination_response.status == 404 and page_count == 1:
            result["is_delete"] = True
            return result
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException("page %s " % page_count + crawler.request_failre(album_pagination_response.status))
        if page_count == 1:
            # get the album title
            album_title = PQ(album_pagination_response.data.decode("UTF-8")).find("meta[name='description']").attr("content")
            if not album_title:
                raise crawler.CrawlerException("failed to extract the album title from the page\n%s" % album_pagination_response.data)
            result["album_title"] = album_title.encode("UTF-8")
        # get the image URLs
        image_list_selector = PQ(album_pagination_response.data).find("div.articleV4Body a img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException("page %s: failed to match image URLs\n%s" % (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(str(image_list_selector.eq(image_index).attr("src")))
        # get the total page count
        pagination_list_selector = PQ(album_pagination_response.data).find("ul.articleV4Page a.page-a")
        if pagination_list_selector.length > 0:
            for pagination_index in range(0, pagination_list_selector.length):
                temp_page_count = pagination_list_selector.eq(pagination_index).html()
                if crawler.is_integer(temp_page_count):
                    max_page_count = max(int(temp_page_count), max_page_count)
        else:
            if page_count > 1:
                raise crawler.CrawlerException("page %s: failed to match pagination info\n%s" % (page_count, album_pagination_response.data))
        page_count += 1
    return result
def get_user_events(data):
    user_events = {}
    query = PyQuery(data)("#platnosci")("table")("tr")
    for index, row in enumerate(query):
        row_data = {}
        row_data["title"] = str(query.eq(index)("td").eq(0)("a").html()).replace(r"<br />", ";")
        row_data["sign_in_url"] = query.eq(index)("td").eq(0)("a").attr("href")
        row_data["edit_url"] = query.eq(index)("td").eq(2)("a").attr("href")
        row_data["state"] = query.eq(index)("td").eq(1).text()
        user_events[index] = row_data
    return user_events
def define_external_type(url: str) -> None:
    """
    Used for adding types from ``schema.org`` domain to ``types``.

    Fetches ``url`` and looks for parents, which are also recursively
    added to ``types``.

    Parameters
    ----------
    url : str
        URL of the type. Should be from ``schema.org`` domain.
    """
    global types
    if url in types:
        return
    types[url] = {
        'label': url[url.rfind('/') + 1:],
        'description': '',
        'parents': []
    }
    candidates = PyQuery(url)('link')
    for i in range(len(candidates)):
        link = candidates.eq(i)
        if link.attr('property') == 'rdfs:subClassOf':
            parent = link.attr('href')
            if len(parent) > 0:
                if parent not in types[url]['parents']:
                    types[url]['parents'].append(parent)
                    define_external_type(parent)
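# Usage sketch: seed the module-level ``types`` registry from a single
# schema.org type URL. This performs live HTTP requests, so the parents
# discovered depend on the current schema.org markup.
types = {}
define_external_type('https://schema.org/Librarian')
for type_url, info in types.items():
    print(info['label'] + ' -> ' + str(info['parents']))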
def get_self_account_badges(account_id, login_cookie):
    # first page of badges
    badges_index_url = "http://steamcommunity.com/profiles/%s/badges/" % account_id
    cookies_list = {"steamLogin": login_cookie}
    badges_index_response = net.http_request(badges_index_url, method="GET", cookies_list=cookies_list)
    if badges_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(badges_index_response.status))
    badges_detail_url_list = []
    # badge divs
    badges_selector = PQ(badges_index_response.data).find(".maincontent .badges_sheet .badge_row")
    for index in range(0, badges_selector.length):
        badge_html = badges_selector.eq(index).html().encode("UTF-8")
        # badges whose card drops are exhausted ("无剩余卡牌掉落" is the
        # literal text shown by the Chinese Steam UI)
        if badge_html.find("无剩余卡牌掉落") >= 0:
            # URL of the badge detail page
            badge_detail_url = tool.find_sub_string(badge_html, '<a class="badge_row_overlay" href="', '"/>')
            if not badge_detail_url:
                raise crawler.CrawlerException("failed to extract the badge detail page URL from the badge info\n%s" % badge_html)
            badges_detail_url_list.append(badge_detail_url)
    # e.g. ['http://steamcommunity.com/profiles/76561198172925593/gamecards/459820/', ...]
    return badges_detail_url_list
def _find_url(self):
    a = PQ(self.html).find('a')
    for i in range(len(a)):
        url = a.eq(i).attr('href')
        if url is not None:
            self._distribute_url(url)
    return None
def __process_item(self):
    all_td = PyQuery(self.__getattribute__('__el')).find('td')
    a_tag = all_td.eq(6).find('a[href^="do_openvpn.aspx?"]')
    if a_tag.length == 0:
        return
    href = a_tag.attr('href').replace('do_openvpn.aspx?', '')
    items = href.split('&')
    server = [''] * 17
    for item in items:
        props = item.split('=')
        if len(props) < 2:
            continue
        if props[0] == 'fqdn':
            server[0] = props[1].replace('.opengw.net', '')
        elif props[0] == 'ip':
            server[1] = props[1]
        elif props[0] == 'tcp':
            server[15] = props[1]
        elif props[0] == 'udp':
            server[16] = props[1]
    server = self.__fill_other_value(all_td, server)
    # OpenVPN_ConfigData_Base64
    server[14] = self.__get_openvpn_config_base64(items)
    if server[14] is None:
        return  # openvpn_config_base64 is None, skip this item
    if self.__getattribute__('__sleep_time') > 0:
        time.sleep(self.__getattribute__('__sleep_time'))
    self.lock.acquire()
    self.__getattribute__('__list_server').append(server)
    self.lock.release()
def get_album_page(album_id):
    album_url = "http://www.ugirls.com/Content/List/Magazine-%s.html" % album_id
    album_response = net.http_request(album_url, method="GET")
    result = {
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
        "model_name": "",  # model name
    }
    if album_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_response.status))
    # the page shows this literal text when the album has been deleted
    if album_response.data.find("该页面不存在,或者已经被删除!") >= 0:
        result["is_delete"] = True
        return result
    # get the model name
    model_name = PQ(album_response.data).find("div.ren_head div.ren_head_c a").attr("title")
    if not model_name:
        raise crawler.CrawlerException("failed to extract the model name\n%s" % album_response.data)
    result["model_name"] = model_name.encode("UTF-8").strip()
    # get all image URLs
    image_list_selector = PQ(album_response.data).find("ul#myGallery li img")
    if image_list_selector.length == 0:
        raise crawler.CrawlerException("failed to match image URLs on the page\n%s" % album_response.data)
    for image_index in range(0, image_list_selector.length):
        image_url = image_list_selector.eq(image_index).attr("src")
        if image_url.find("_magazine_web_m.") == -1:
            raise crawler.CrawlerException("image URL does not match the expected pattern\n%s" % image_url)
        # swap the medium-size suffix for the large-size one
        result["image_url_list"].append(image_url.replace("_magazine_web_m.", "_magazine_web_l."))
    return result
def crawl_infected_person_okayama(db: Session = Depends(get_db), is_update: bool = False):
    try:
        response = requests.get('https://fight-okayama.jp/attribute/')
        response.encoding = response.apparent_encoding
        doc = PyQuery(response.text.encode('utf-8'))
    except Exception as e:
        return {"exception": e.args}
    for tr_node in doc.find('tbody').children('tr'):
        td_nodes = PyQuery(tr_node)('tr').find('td')
        valid_values = validate_crawled_data(
            **takeout_and_processing_nodes(td_nodes, ("number", "date", "residence", "age", "sex")))
        if not valid_values:
            # store rows that fail validation for later inspection
            if crud.get_mistaken_data_by_number(db=db, number_str=td_nodes.eq(0).text()) is None:
                mistaken_data_dict = takeout_and_processing_nodes(
                    td_nodes, ("number_str", "date_str", "residence_str", "age_str", "sex_str"))
                create_mistaken_data(data=models.MistakenData(**mistaken_data_dict), db=db)
            continue
        # if the record already exists, UPDATE it when is_update is True
        if crud.get_data_by_number(db=db, number=valid_values.number) is not None:
            if is_update:
                update_infected_data(data=valid_values, db=db)
            else:
                return "the crawled data is existing"
        # otherwise save the new record
        else:
            create_infected_data(data=valid_values, db=db)
def get_one_page_account(page_count):
    account_pagination_url = "http://jigadori.fkoji.com/users"
    query_data = {"p": page_count}
    account_pagination_response = net.http_request(account_pagination_url, method="GET", fields=query_data)
    pagination_account_list = {}
    if account_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        # the original built the exception without raising it
        raise crawler.CrawlerException(crawler.request_failre(account_pagination_response.status))
    account_list_selector = PQ(account_pagination_response.data.decode("UTF-8")).find(".users-list li")
    for account_index in range(0, account_list_selector.length):
        account_selector = account_list_selector.eq(account_index)
        # get the member name (fall back to an empty string if missing)
        account_name = account_selector.find(".profile-name").eq(0).text()
        if not account_name:
            account_name = ""
        else:
            account_name = account_name.strip().encode("UTF-8")
        # get the twitter account
        account_id = account_selector.find(".screen-name a").text()
        if not account_id:
            raise crawler.CrawlerException("failed to extract the twitter account from the member info\n%s" % account_selector.html().encode("UTF-8"))
        account_id = account_id.strip().replace("@", "")
        pagination_account_list[account_id] = account_name
    return pagination_account_list
def get_album_page(sub_path, page_count):
    album_pagination_url = "http://www.88mmw.com/%s/list_%s_%s.html" % (sub_path, SUB_PATH_LIST[sub_path], page_count)
    album_pagination_response = net.http_request(album_pagination_url, method="GET")
    result = {
        "album_info_list": [],  # all album info
        "is_over": False,  # whether this is the last album page
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    # page encoding
    album_pagination_html = album_pagination_response.data.decode("GBK")
    # get album info; two page styles exist
    album_list_selector = PQ(album_pagination_html).find("div.xxx li a")
    if album_list_selector.length == 0:
        album_list_selector = PQ(album_pagination_html).find("div.yyy li a")
    if album_list_selector.length == 0:
        raise crawler.CrawlerException("failed to extract the album list from the page\n%s" % album_pagination_html.encode("UTF-8"))
    for album_index in range(0, album_list_selector.length):
        result_album_info = {
            "album_title": "",  # album title
            "page_id": None,  # album page id
        }
        album_selector = album_list_selector.eq(album_index)
        # get the album id
        album_url = album_selector.attr("href")
        if not album_url:
            raise crawler.CrawlerException("failed to extract the album URL from the album list\n%s" % album_selector.html().encode("UTF-8"))
        album_id = album_url.split("/")[-2]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("failed to extract the album id from the album URL\n%s" % str(album_url))
        result_album_info["page_id"] = album_id
        # get the album title, dropping a trailing "_共N张" (image count) suffix
        album_title = album_selector.attr("title").encode("UTF-8")
        if len(re.findall("_共\d*张", album_title)) == 1:
            result_album_info["album_title"] = album_title[:album_title.rfind("_共")]
        else:
            result_album_info["album_title"] = album_title
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    max_page_info = PQ(album_pagination_html).find("div.page a").eq(-1).text()
    if not max_page_info:
        raise crawler.CrawlerException("failed to extract the total page count info\n%s" % album_pagination_html.encode("UTF-8"))
    max_page_count = tool.find_sub_string(max_page_info.encode("UTF-8"), "共", "页")
    if not crawler.is_integer(max_page_count):
        raise crawler.CrawlerException("failed to extract the total page count\n%s" % max_page_info.encode("UTF-8"))
    result["is_over"] = page_count >= int(max_page_count)
    return result
def takeout_and_processing_nodes(td_nodes: PyQuery,
                                 keys_name: tuple = ("number", "date", "residence", "age", "sex")):
    nodes_dict = {}
    for i in range(len(keys_name)):
        takeout_node = td_nodes.eq(i).text()
        # collapse all whitespace inside the cell text
        processed_node = ''.join(takeout_node.split())
        nodes_dict[keys_name[i]] = processed_node
    return nodes_dict
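# A minimal usage sketch for takeout_and_processing_nodes, with inline
# hypothetical HTML standing in for a crawled table row; the
# ''.join(split()) step collapses any whitespace inside a cell.
from pyquery import PyQuery

sample_row = PyQuery('<table><tr><td>12</td><td>2021/ 01/ 05</td>'
                     '<td>Okayama</td><td>40s</td><td>male</td></tr></table>')
print(takeout_and_processing_nodes(sample_row.find('td')))
# -> {'number': '12', 'date': '2021/01/05', 'residence': 'Okayama', 'age': '40s', 'sex': 'male'}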
def getNearby(self, xml):
    locations = {'names': [], 'types': []}
    placeData = PyQuery(xml.encode('utf-8'))('result')
    for place in placeData:
        place = PyQuery(place)
        name = PyQuery(place('name'))
        types = PyQuery(place('type'))
        locations['names'].append(name.text())
        locations['types'].append(types.eq(0).text())
    return locations
def find_task(db, which):
    count = 0
    # equity pledge
    equity_pledged_info = u'equity_pledged_info'
    source_table = "online_crawl_gansu_new"
    for item in db.traverse_batch(source_table):
        data_list = item.get('datalist')
        company = item.get('_id')
        count += 1
        if not isinstance(data_list, dict):
            log.error("{which} table: no datalist, company = {company}".format(company=company, which=which))
            continue
        if equity_pledged_info not in data_list:
            continue
        value = data_list.get(equity_pledged_info)
        if value is None:
            continue
        if 'detail' in value:
            log.info("{which} table: {equity} company = {company} have detail".format(
                equity=equity_pledged_info, company=company, which=which))
            continue
        if 'list' not in value:
            continue
        list_array = value.get('list')
        if not isinstance(list_array, list) or len(list_array) <= 0:
            continue
        for item0 in list_array:
            text = item0.get('text')
            if text is None:
                continue
            tr_list = PyQuery(text, parser='html').find('#stockTab').find('tr')
            if tr_list.length > 2:
                log.info("{which} table: {equity} company = {company} have list".format(
                    equity=equity_pledged_info, company=company, which=which))
                break
            if tr_list.length == 2 and tr_list.eq(1).find('td').length > 5:
                log.info("{which} table: {equity} company = {company} have list".format(
                    equity=equity_pledged_info, company=company, which=which))
                break
    log.info("search finished: {which} count = {count}".format(which=which, count=count))
def pullSubmissions(subredditName):
    html = urllib2.urlopen("http://reddit.com/r/%s" % subredditName).read()
    storyObjects = PyQuery(html)(".entry")
    for storyObject in [storyObjects.eq(i) for i in range(len(storyObjects))]:
        title = storyObject.find("a.title").html()
        url = storyObject.find("a.title").attr.href
        redditURL = storyObject.find("a.comments").attr.href
        # advertisement submissions have no comments page and thus the
        # property is None (NOT TRUE ANYMORE //FIXME)
        if redditURL:
            yield (title, url, redditURL)
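# Usage sketch: iterate one page of /r/<subreddit> submissions. Reddit may
# throttle or redirect bare urllib2 requests, so treat this as illustrative
# rather than a robust client.
for title, url, reddit_url in pullSubmissions("python"):
    print(title + " -> " + str(reddit_url))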
def get_one_page_photo(page_count):
    photo_pagination_url = "http://kelagirls.com/bizhi!findForIndexMore.action"
    query_data = {"page": page_count}
    photo_pagination_response = net.http_request(photo_pagination_url, method="GET", fields=query_data)
    result = {
        "image_info_list": [],  # all image info
        "is_over": False,  # whether this is the last wallpaper page
    }
    if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(photo_pagination_response.status))
    photo_list_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find(".bizhinmore .bizhi")
    if photo_list_selector.length == 0:
        raise crawler.CrawlerException("failed to match the image list on the page\n%s" % photo_pagination_response.data)
    for photo_index in range(0, photo_list_selector.length):
        result_image_info = {
            "image_id": None,  # image id
            "image_url": None,  # image URL
            "model_name": "",  # model name
        }
        # get the image id (the element id looks like "big12345")
        image_id = photo_list_selector.eq(photo_index).find(".bizhibigwrap").attr("id")
        if not image_id:
            raise crawler.CrawlerException("failed to match the image id in the image list\n%s" % photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        if not (image_id[0:3] == "big" and crawler.is_integer(image_id[3:])):
            raise crawler.CrawlerException("matched image id has an unexpected format\n%s" % photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["image_id"] = str(image_id[3:])
        # get the image URL
        image_path = photo_list_selector.eq(photo_index).find(".bizhibig img").eq(1).attr("src")
        if not image_path:
            raise crawler.CrawlerException("failed to match the image URL in the image list\n%s" % photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["image_url"] = "http://kelagirls.com/" + str(image_path.encode("UTF-8"))
        # get the model name
        model_name = photo_list_selector.eq(photo_index).find(".bzwdown span").eq(0).text().encode("UTF-8")
        if not model_name:
            raise crawler.CrawlerException("failed to match the model name in the image list\n%s" % photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["model_name"] = str(model_name)
        result["image_info_list"].append(result_image_info)
    # check whether this is the last page
    pagination_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find(".pageBottom div")
    max_page_count = page_count
    for pagination_index in range(0, pagination_selector.length):
        if crawler.is_integer(pagination_selector.eq(pagination_index).text()):
            max_page_count = max(max_page_count, int(pagination_selector.eq(pagination_index).text()))
    result["is_over"] = page_count >= max_page_count
    return result
def get_one_page_blog(account_id, page_count):
    # http://blog.sina.com.cn/s/articlelist_<account_id>_0_<page>.html
    blog_pagination_url = "http://blog.sina.com.cn/s/articlelist_%s_0_%s.html" % (account_id, page_count)
    blog_pagination_response = net.http_request(blog_pagination_url, method="GET")
    result = {
        "blog_info_list": [],  # all blog info
        "is_over": False,  # whether this is the last page
    }
    if blog_pagination_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        # the page shows this literal text when the account does not exist
        if page_count == 1 and blog_pagination_response.data.find("抱歉,您要访问的页面不存在或被删除!") >= 0:
            raise crawler.CrawlerException("account does not exist")
        article_list_selector = PQ(blog_pagination_response.data.decode("UTF-8")).find(".articleList .articleCell")
        if article_list_selector.size() == 0:
            raise crawler.CrawlerException("failed to extract the blog list from the page\n%s" % blog_pagination_response.data)
        for article_index in range(article_list_selector.size()):
            result_blog_info = {
                "blog_url": None,  # blog URL
                "blog_time": None,  # blog time
                "blog_title": "",  # blog title
            }
            article_selector = article_list_selector.eq(article_index)
            # get the blog URL
            blog_url = article_selector.find("span.atc_title a").attr("href")
            if not blog_url:
                raise crawler.CrawlerException("failed to parse the blog URL from the blog list\n%s" % article_selector.html().encode("UTF-8"))
            result_blog_info["blog_url"] = str(blog_url)
            # get the blog title
            blog_title = article_selector.find("span.atc_title a").text().encode("UTF-8")
            if not blog_title:
                raise crawler.CrawlerException("failed to parse the blog title from the blog list\n%s" % article_selector.html().encode("UTF-8"))
            result_blog_info["blog_title"] = str(blog_title)
            # get the blog time
            blog_time = article_selector.find("span.atc_tm").text()
            if not blog_time:
                raise crawler.CrawlerException("failed to parse the blog time from the blog list\n%s" % article_selector.html().encode("UTF-8"))
            try:
                result_blog_info["blog_time"] = int(time.mktime(time.strptime(blog_time, "%Y-%m-%d %H:%M")))
            except ValueError:
                raise crawler.CrawlerException("blog time has an unexpected format\n%s" % blog_time)
            result["blog_info_list"].append(result_blog_info)
        # get pagination info
        pagination_html = tool.find_sub_string(blog_pagination_response.data, '<div class="SG_page">', '</div>')
        if not pagination_html:
            result["is_over"] = True
        else:
            max_page_count = tool.find_sub_string(pagination_html, "共", "页")
            if not crawler.is_integer(max_page_count):
                raise crawler.CrawlerException("failed to extract the total page count from the pagination info\n%s" % pagination_html)
            result["is_over"] = page_count >= int(max_page_count)
    else:
        raise crawler.CrawlerException(crawler.request_failre(blog_pagination_response.status))
    return result
def extractCourse(commentsHTML):
    c = Course()
    table = PyQuery(commentsHTML).find("table.plaintable").children()
    c.id = table.eq(0).text().split()[-1]
    c.subject_code, c.crse = table.eq(1).text().split()[0].split('-')
    c.title = ' '.join(table.eq(1).text().split()[1:])
    c.description = table.eq(2).children().eq(1).text()
    if c.description == "Description Not Found":
        c.description = None
    mtimesPQ = table.eq(3).children().eq(1).children().children()
    if mtimesPQ.eq(1).children().length > 2:
        mtime = MeetingTime()
        mtime.days = mtimesPQ.eq(1).children().eq(1).text().split()
        mtime.begin = mtimesPQ.eq(1).children().eq(2).text()
        mtime.end = mtimesPQ.eq(1).children().eq(3).text()
        mtime.location = mtimesPQ.eq(1).children().eq(4).text()
        c.exam = mtimesPQ.eq(1).children().eq(5).text()
        if mtime.days == ['(ARR)']:
            # "(ARR)" rows have no fixed days/times, so the columns shift left
            mtime.days = None
            mtime.begin = None
            c.exam = mtime.location
            mtime.location = mtime.end
            mtime.end = None
        c.meeting_times.append(mtime)
        for i in xrange(mtimesPQ.length - 2):  # get additional times
            mtime = MeetingTime()
            mtime.days = mtimesPQ.eq(i).children().eq(5).text().split()
            mtime.begin = mtimesPQ.eq(i).children().eq(6).text()
            mtime.end = mtimesPQ.eq(i).children().eq(7).text()
            mtime.location = mtimesPQ.eq(i).children().eq(8).text()
            c.meeting_times.append(mtime)
    sectionInfoPQ = table.eq(4).find("table").children().eq(2).children()
    c.instructor = sectionInfoPQ.eq(0).text()
    c.type = sectionInfoPQ.eq(1).text()
    c.status = sectionInfoPQ.eq(2).text()
    c.capacity = sectionInfoPQ.eq(3).text()
    comments = table.eq(5).children().eq(1).text()
    if comments == 'None':
        c.comments = None
    else:
        c.comments = comments
    return c
def replace_image(self, target, image_name):
    elements = self.html_obj('*').filter('[dzid="' + target + '"]')
    location = self.location + urllib.quote_plus(image_name)
    for e in elements:
        pq = PyQuery(e)
        if pq.eq(0).is_('img'):
            pq.attr('src', location)
        else:
            pq.css('background-image', 'url("' + location + '");')
        return location
    return None
def get_one_page_photo(page_count):
    photo_pagination_url = "http://jigadori.fkoji.com/"
    query_data = {"p": page_count}
    photo_pagination_response = net.http_request(photo_pagination_url, method="GET", fields=query_data)
    result = {
        "image_info_list": [],  # all image info
    }
    if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(photo_pagination_response.status))
    photo_list_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find("#wrapper .row .photo")
    for photo_index in range(0, photo_list_selector.size()):
        photo_selector = photo_list_selector.eq(photo_index)
        photo_selector_html = photo_selector.html().encode("UTF-8")
        result_photo_info = {
            "account_name": "",  # twitter account
            "image_url_list": [],  # image URLs
            "tweet_id": None,  # tweet id
            "tweet_time": None,  # tweet publish time
        }
        # get the tweet id
        tweet_url = photo_selector.find(".photo-link-outer a").eq(0).attr("href")
        if not tweet_url:
            raise crawler.CrawlerException("failed to extract the tweet URL from the image info\n%s" % photo_selector_html)
        tweet_id = tool.find_sub_string(tweet_url.strip(), "status/")
        if not crawler.is_integer(tweet_id):
            raise crawler.CrawlerException("failed to extract the tweet id from the tweet URL\n%s" % tweet_url)
        result_photo_info["tweet_id"] = int(tweet_id)
        # get the twitter account
        account_name = photo_selector.find(".user-info .user-name .screen-name").text()
        if not account_name:
            raise crawler.CrawlerException("failed to extract the twitter account from the image info\n%s" % photo_selector_html)
        result_photo_info["account_name"] = str(account_name).strip().replace("@", "")
        # get the tweet publish time
        tweet_time = photo_selector.find(".tweet-text .tweet-created-at").text().strip()
        if not tweet_time:
            raise crawler.CrawlerException("failed to extract the tweet publish time from the image info\n%s" % photo_selector_html)
        try:
            result_photo_info["tweet_time"] = int(time.mktime(time.strptime(str(tweet_time).strip(), "%Y-%m-%d %H:%M:%S")))
        except ValueError:
            raise crawler.CrawlerException("tweet publish time has an unexpected format\n%s" % tweet_time)
        # get the image URLs
        image_list_selector = photo_selector.find(".photo-link-outer a img")
        for image_index in range(0, image_list_selector.size()):
            image_url = image_list_selector.eq(image_index).attr("src")
            if not image_url:
                raise crawler.CrawlerException("failed to extract an image URL from the image list\n%s" % image_list_selector.eq(image_index).html())
            result_photo_info["image_url_list"].append(str(image_url).strip())
        result["image_info_list"].append(result_photo_info)
    return result
def get_one_page_album(account_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    album_pagination_url = "http://bcy.net/u/%s/post/cos" % account_id
    query_data = {"p": page_count}
    album_pagination_response = net.http_request(album_pagination_url, method="GET", fields=query_data)
    result = {
        "album_info_list": [],  # all album info
        "coser_id": None,  # coser id
        "is_over": False,  # whether this is the last album page
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    # the page shows this literal text when the account does not exist
    if page_count == 1 and album_pagination_response.data.find("<h2>用户不存在</h2>") >= 0:
        raise crawler.CrawlerException("account does not exist")
    # get the coser id
    coser_id_find = re.findall('<a href="/coser/detail/([\d]+)/\$\{post.rp_id\}', album_pagination_response.data)
    if len(coser_id_find) != 1:
        raise crawler.CrawlerException("failed to extract the coser id from the page\n%s" % album_pagination_response.data)
    if not crawler.is_integer(coser_id_find[0]):
        raise crawler.CrawlerException("extracted coser id has an unexpected type\n%s" % album_pagination_response.data)
    result["coser_id"] = coser_id_find[0]
    # get album info
    album_list_selector = PQ(album_pagination_response.data.decode("UTF-8")).find("ul.l-grid__inner li.l-grid__item")
    for album_index in range(0, album_list_selector.size()):
        album_selector = album_list_selector.eq(album_index)
        result_album_info = {
            "album_id": None,  # album id
            "album_title": None,  # album title
        }
        # get the album id
        album_url = album_selector.find(".postWorkCard__img a.postWorkCard__link").attr("href")
        if not album_url:
            raise crawler.CrawlerException("failed to extract the album URL from the album info\n%s" % album_selector.html().encode("UTF-8"))
        album_id = str(album_url).split("/")[-1]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("failed to extract the album id from the album URL %s\n%s" % (album_url, album_selector.html().encode("UTF-8")))
        result_album_info['album_id'] = album_id
        # get the album title
        album_title = album_selector.find(".postWorkCard__img img").attr("alt")
        result_album_info["album_title"] = str(album_title.encode("UTF-8"))
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    last_pagination_selector = PQ(album_pagination_response.data).find("#js-showPagination ul.pager li:last a")
    if last_pagination_selector.size() == 1:
        max_page_count = int(last_pagination_selector.attr("href").strip().split("&p=")[-1])
        result["is_over"] = page_count >= max_page_count
    else:
        result["is_over"] = True
    return result
def scrape_grants_for_fy(year):
    b.open(PAST_GRANTS_URL)
    try:
        b.select_form(name="Form1")
        # fiscal year: 1 April <year> to 31 March <year + 1>
        b["oUcStartDate$ddlDay"] = ["1"]
        b["oUcStartDate$ddlMonth"] = ["4"]
        b["oUcStartDate$ddlYear"] = [str(year)]
        b["oUcEndDate$ddlDay"] = ["31"]
        b["oUcEndDate$ddlMonth"] = ["3"]
        b["oUcEndDate$ddlYear"] = [str(year + 1)]
        resp = b.submit()
    except mechanize._form.ItemNotFoundError:
        print("ERROR: could not submit form. This usually means you're "
              "trying to scrape for a year that doesn't exist "
              "on the GOTW website.", file=sys.stderr)
        raise
    page = PyQuery(resp.read())
    for r in page("table tr:not(.GridHeader)"):
        grant = {}
        anchors = PyQuery(r).find('a')
        grant['id'] = anchors.eq(0).attr.title
        grant['title'] = anchors.eq(0).text()
        grant['pi'] = pi = {}
        pi['id'] = util.extract_id(anchors.eq(1).attr.href, 'Person')
        pi['name'] = anchors.eq(1).text()
        grant['organisation'] = org = {}
        org['id'] = util.extract_id(anchors.eq(2).attr.href, 'Organisation')
        org['name'] = anchors.eq(2).text()
        grant['department'] = dept = {}
        dept['id'] = util.extract_id(anchors.eq(3).attr.href, 'Department')
        dept['name'] = anchors.eq(3).text()
        value = PyQuery(r).find('span').eq(0).attr.title
        grant['value'] = util.extract_monetary_value(value)
        yield grant
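# Usage sketch: stream one fiscal year of grants, assuming the module-level
# mechanize browser ``b`` and PAST_GRANTS_URL are configured as the function
# above expects. Each yielded dict carries the ids and names parsed out of
# the GOTW results table.
for grant in scrape_grants_for_fy(2014):
    print(grant['id'] + ' ' + grant['title'])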
def query_wubi(char):
    url = 'http://www.chaiwubi.com/bmcx/'
    data = {'wz': char, 'select_value': '查单字'}
    r = requests.post(url, data=data)
    h = html.fromstring(r.text)
    tb = h.cssselect('.dw-bmcx')[0]
    # rows: Wubi Wangma 86 edition, then the unified "new century" edition
    d = defaultdict(list)
    trs = tb.cssselect('tr')
    # take the first three rows
    tr86 = trs[0]
    tds = PyQuery(tr86).children('td')
    d['86'] = [
        tds.eq(2).text().strip() or None,
        tds.eq(3).text().strip() or None,
        tds.eq(4).text().strip() or None,
        tds.eq(5).text().strip() or None,
    ]
    for tr in trs[1:3]:
        tds = PyQuery(tr).children('td')
        d[tds.eq(0).text()] = [
            tds.eq(1).text().strip() or None,
            tds.eq(2).text().strip() or None,
            tds.eq(3).text().strip() or None,
            tds.eq(4).text().strip() or None,
        ]
    return d
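# Usage sketch: look up the Wubi codes for one character; the '86' key is
# always present, and the other keys are whatever scheme names the site's
# result table returns.
codes = query_wubi(u'你')
print(codes['86'])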
def get_album_photo(sub_path, page_id):
    page_count = 1
    result = {
        "image_url_list": [],  # all image URLs
    }
    while True:
        if page_count == 1:
            photo_pagination_url = "http://www.88mmw.com/%s/%s" % (sub_path, page_id)
        else:
            photo_pagination_url = "http://www.88mmw.com/%s/%s/index_%s.html" % (sub_path, page_id, page_count)
        photo_pagination_response = net.http_request(photo_pagination_url, method="GET")
        if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException("page %s " % page_count + crawler.request_failre(photo_pagination_response.status))
        # page encoding
        photo_pagination_html = photo_pagination_response.data.decode("GBK")
        # get image URLs
        image_list_selector = PQ(photo_pagination_html).find("div.zzz li img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException("page %s: failed to match image URLs\n%s" % (page_count, photo_pagination_html.encode("UTF-8")))
        for image_index in range(0, image_list_selector.length):
            # drop the "-lp" thumbnail suffix to get the full-size image
            result["image_url_list"].append("http://www.88mmw.com" + str(image_list_selector.eq(image_index).attr("src")).replace("-lp", ""))
        # check whether this is the last page
        is_over = False
        max_page_selector = PQ(photo_pagination_html).find("div.page").eq(0).find("span strong").text()
        if not max_page_selector:
            is_over = True
        elif crawler.is_integer(max_page_selector):
            is_over = page_count >= int(max_page_selector)
        if is_over:
            break
        else:
            page_count += 1
    return result
def get_account_talks(account_id, account_name, talk_list):
    account_index = "https://7gogo.jp/users/%s" % account_id
    account_index_response = net.http_request(account_index, method="GET")
    if account_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(account_index_response.status))
    talk_list_selector = PQ(account_index_response.data.decode("UTF-8")).find(".UserTalkWrapper .UserTalk")
    for talk_index in range(0, talk_list_selector.size()):
        talk_selector = talk_list_selector.eq(talk_index)
        # get the talk URL (.html was missing its call parentheses)
        talk_url_path = talk_selector.attr("href")
        if not talk_url_path:
            raise crawler.CrawlerException("failed to extract the talk URL from the talk info\n%s" % talk_selector.html().encode("UTF-8"))
        talk_id = str(talk_url_path.replace("/", ""))
        if not talk_id:
            raise crawler.CrawlerException("failed to extract the talk id from the talk URL\n%s" % talk_url_path)
        # get the talk name
        talk_name = talk_selector.find(".UserTalk__talkname").text()
        if not talk_name:
            raise crawler.CrawlerException("failed to extract the talk name from the talk info\n%s" % talk_selector.html().encode("UTF-8"))
        talk_name = crawler.filter_emoji(str(talk_name.encode("UTF-8")).strip())
        # get the talk description
        talk_description = crawler.filter_emoji(talk_selector.find(".UserTalk__description").text())
        if talk_description:
            talk_description = crawler.filter_emoji(str(talk_description.encode("UTF-8")).strip())
        else:
            talk_description = ""
        if talk_id in talk_list:
            talk_list[talk_id]["account_list"].append(account_name)
        else:
            talk_list[talk_id] = {
                "account_list": [account_name],
                "talk_name": talk_name,
                "talk_description": talk_description,
            }
        output.print_msg(account_id + ": " + talk_name + ", " + talk_description)
def searchStatus(self, keyword, max_length=20):
    url = 'http://browse.renren.com/s/status?offset=0&sort=1&range=0&q=%s&l=%d' % (keyword, max_length)
    r = self.session.get(url, timeout=5)
    status_elements = PyQuery(r.text)('.list_status .status_content')
    id_pattern = re.compile("forwardDoing\('(\d+)','(\d+)'\)")
    results = []
    for index, _ in enumerate(status_elements):
        status_element = status_elements.eq(index)
        # skip reposted statuses
        if status_element('.status_root_msg'):
            continue
        status_element = status_element('.status_content_footer')
        status_time = status_element('span').text()
        m = id_pattern.search(status_element('.share_status').attr('onclick'))
        status_id, user_id = m.groups()
        results.append((int(user_id), int(status_id), status_time))
    return results
def qichacha_search_result(j: PyQuery) -> dict:
    j = j.children()
    td_row = j.eq(2)
    company_name = td_row.children('a').text()
    p_first = td_row.children('p').eq(0)
    legal_representative = p_first.children('a').text()
    # labels and values are separated by a colon, so take the part after it
    span_m_l = p_first("span:first").text().split(':')
    registered_capital = span_m_l[-1].strip('-')
    span_m_ls = p_first('span:last').text().split(':')
    date_of_establishment = span_m_ls[-1]
    p_two = td_row('p').eq(-3)
    # clone the <p> and drop its children to keep only the direct text (email)
    p_obj = p_two.clone()
    p_obj.children().remove()
    email = p_obj.text().split(':')[-1].strip('-')
    phone = p_two.find('span').text().split(':')[-1].strip(' ').strip('-')
    register_address = td_row.find('p').eq(2).text().split(':')[-1]
    return dict(company_name=company_name,
                legal_representative=legal_representative,
                registered_capital=registered_capital,
                date_of_establishment=date_of_establishment,
                email=email,
                phone=phone,
                register_address=register_address)
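# Usage sketch: feed each result row of a Qichacha search listing to the
# parser above. Both the URL pattern and the '#search-result table tr'
# selector are assumptions about the live site, not confirmed here.
import requests
from pyquery import PyQuery

html = requests.get('https://www.qichacha.com/search?key=example').text
for tr in PyQuery(html)('#search-result table tr').items():  # hypothetical selector
    print(qichacha_search_result(tr))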
def get_account_index_page(account_name):
    account_index_url = "http://%s.pp.163.com/" % account_name
    account_index_response = net.http_request(account_index_url, method="GET")
    result = {
        "album_url_list": [],  # all album URLs
    }
    if account_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(account_index_response.status))
    # page encoding
    account_index_html = account_index_response.data.decode("GBK").encode("UTF-8")
    # the page title contains this literal text when the account does not exist
    if account_index_html.find("<title>该页面不存在</title>") >= 0:
        raise crawler.CrawlerException("account does not exist")
    # get all album URLs
    album_result_selector = PQ(account_index_html).find("#p_contents li")
    if album_result_selector.size() == 0:
        raise crawler.CrawlerException("failed to match the album list on the page\n%s" % account_index_html)
    for album_index in range(0, album_result_selector.size()):
        result["album_url_list"].append(str(album_result_selector.eq(album_index).find("a.detail").attr("href")))
    return result
def extract(dom, param_dict):
    res = []
    # dom head: a comma-separated chain of selectors and .eq() positions
    head_list = str(param_dict['dom_head']).strip().split(',')
    d_divs = dom(head_list[0])
    if len(head_list) > 1:
        for pos in range(1, len(head_list)):
            try:
                value = int(head_list[pos])
                d_divs = d_divs.eq(value)
            except:
                d_divs = d_divs.children(head_list[pos])
    for div in d_divs:
        d_div = PyQuery(div)
        if param_dict['sandwich'] != 'None':
            sandwich_list = str(param_dict['sandwich']).strip().split(',')
            for sandwich in sandwich_list:
                try:
                    positon = int(sandwich)
                    d_div = d_div.eq(positon)
                except:
                    d_div = d_div.children(sandwich)
        header = str(param_dict['title']).strip().split(',')[0]
        if not d_div.children(header):
            continue
        # extract the URL
        url_list = str(param_dict['url']).strip().split(',')
        url = d_div.children(url_list[0])
        for pos in range(1, len(url_list)):
            try:
                n_url = int(url_list[pos])
                url = url.eq(n_url)
            except:
                if url_list[pos] == 'href':
                    url = url.attr('href')
                    break
                else:
                    url = url.children(url_list[pos])
        # join url
        if 'www' not in url and 'http' not in url:
            match = re.search('^/', url)
            if match:
                url = param_dict['domain'] + url
            else:
                url = param_dict['domain'] + '/' + url
        if 'http://' not in url:
            url = 'http://' + url
        # extract the title
        title_list = str(param_dict['title']).strip().split(',')
        title = d_div
        for item in title_list:
            try:
                n_title = int(item)
                title = title.eq(n_title)
            except:
                title = title.children(item)
        title = title.text()
        # extract the date
        date_list = str(param_dict['date']).strip().split(',')
        date = d_div
        is_attr = False
        for item in date_list:
            try:
                n_item = int(item)
                date = date.eq(n_item)
            except:
                if 'attr' not in item:
                    date = date.children(item)
                else:
                    item = item[:item.find(':')]
                    date = date.attr(item)[:20].strip()
                    is_attr = True
        date = date if is_attr else date.text()
        if ' / ' in date:
            date = date.replace(' / ', '-')
        if '/' in date:
            date = date.replace('/', '-')
        if re.search(u'\d{4}-\d{1,2}-\d{1,2}', date):
            date = ''.join(x for x in date if ord(x) < 256).strip()
            start_index = date.rfind('201')  # last occurrence of the year prefix
            end_index1 = date.rfind('-')
            end_index2 = date.rfind(':')
            end_index = end_index1 if end_index1 > end_index2 else end_index2
            date = date[start_index:end_index + 3]
            if len(date) == 10:
                # date only: append the current time
                date = '%s %s' % (date, time.strftime("%H:%M", time.localtime(time.time())))
        elif re.search(u'\d{1,2}-\d{1,2}-\d{4} \d{1,2}:\d{1,2}:\d{1,2}', date):
            arr_time = date.split(' ')
            arr_date = arr_time[0].split('-')
            date = '%s-%s-%s %s' % (arr_date[2], arr_date[0], arr_date[1], arr_time[1])
        else:
            try:
                # convert a unix timestamp to a date
                date_stamp = int(date)
                if date_stamp > 9999999999:
                    date_stamp = int(date[:10])
                x = time.localtime(date_stamp)
                date = time.strftime('%Y-%m-%d %H:%M', x)
            except:
                date = fomate_date_output(date)
        date = format_date_time(date)
        if len(date) == 16:
            if cmp(date, str_today) >= 0 and cmp(date, end_today) <= 0 and len(title) > 0:
                res.append([date, url, title])
    return res
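# A hedged usage sketch of the param_dict convention extract() parses: each
# value is a comma-separated chain where integers become .eq() positions,
# names become child selectors, 'href' reads the link attribute, and
# 'attr:<name>' reads an arbitrary attribute for the date. All selector
# values below are hypothetical, and extract() also relies on the
# module-level str_today/end_today bounds being defined.
param_dict = {
    'dom_head': 'div.news-list,ul',
    'sandwich': 'None',
    'title': 'a,0',
    'url': 'a,0,href',
    'date': 'span.time,0',
    'domain': 'http://example.com',
}
rows = extract(PyQuery(page_html), param_dict)  # page_html: fetched page source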
#!/usr/bin/env python
import requests
from pyquery import PyQuery

USER_AGENT = 'Mozilla/5.0'
CG_URL = 'http://www.cordobaguias.com.ar/cotizacion-dolar-en-cordoba.html'
PDB_URL = 'http://www.preciodolarblue.com.ar'
AMBITO_URL = 'http://www.ambito.com/economia/mercados/monedas/dolar/info/?ric=ARSB=C'

headers = {'User-Agent': USER_AGENT}
cg = PyQuery(requests.get(CG_URL, headers=headers).content).find('.cuadroPrecioD').text().replace(' pesos', '')
pdb_tds = PyQuery(requests.get(PDB_URL, headers=headers).content).find('td')
pdb = (pdb_tds.eq(3).text(), pdb_tds.eq(4).text())
ambito = PyQuery(requests.get(AMBITO_URL, headers=headers).content)

print 'cordobaguias | preciodolarblue'
print '-' * 30
print '%s | %s' % (cg, ' / '.join(pdb))
print 'Cueva (Ambito): %.2f | %.2f' % (float(ambito.find('#compra>big').text().replace(',', '.')),
                                       float(ambito.find('#venta>big').text().replace(',', '.')))
def get_one_page_audio(account_id, page_count):
    # http://www.ximalaya.com/1014267/index_tracks?page=2
    audit_pagination_url = "http://www.ximalaya.com/%s/index_tracks" % account_id
    query_data = {"page": page_count}
    audit_pagination_response = net.http_request(audit_pagination_url, method="GET", fields=query_data, json_decode=True)
    result = {
        "audio_info_list": [],  # audio info parsed from the page
        "is_over": False,  # whether this is the last page
    }
    if audit_pagination_response.status == 404:
        raise crawler.CrawlerException("account does not exist")
    elif audit_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(audit_pagination_response.status))
    if not crawler.check_sub_key(("res", "html"), audit_pagination_response.json_data):
        raise crawler.CrawlerException("response is missing the 'res' or 'html' field\n%s" % audit_pagination_response.json_data)
    if audit_pagination_response.json_data["res"] is not True:
        raise crawler.CrawlerException("unexpected value for the 'res' field\n%s" % audit_pagination_response.json_data)
    # get audio info
    audio_list_selector = PQ(audit_pagination_response.json_data["html"]).find("ul.body_list li.item")
    for audio_index in range(0, audio_list_selector.size()):
        audio_info = {
            "audio_id": None,  # audio id parsed from the page
            "audio_title": "",  # audio title parsed from the page
        }
        audio_selector = audio_list_selector.eq(audio_index)
        # get the audio id
        audio_id = audio_selector.find(".content_wrap").attr("sound_id")
        if not crawler.is_integer(audio_id):
            raise crawler.CrawlerException("failed to match the audio id\n%s" % audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_id"] = str(audio_id)
        # get the audio title
        audio_title = audio_selector.find(".sound_title").attr("title")
        if not audio_title:
            raise crawler.CrawlerException("failed to match the audio title\n%s" % audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_title"] = str(audio_title.encode("UTF-8").strip())
        result["audio_info_list"].append(audio_info)
    # check whether this is the last page
    max_page_count = 1
    pagination_list_selector = PQ(audit_pagination_response.json_data["html"]).find(".pagingBar_wrapper a.pagingBar_page")
    for pagination_index in range(0, pagination_list_selector.size()):
        pagination_selector = pagination_list_selector.eq(pagination_index)
        data_page = pagination_selector.attr("data-page")
        if data_page is None:
            continue
        if not crawler.is_integer(data_page):
            raise crawler.CrawlerException("failed to match pagination info\n%s" % audio_list_selector.html().encode("UTF-8"))
        max_page_count = max(max_page_count, int(data_page))
    result["is_over"] = page_count >= max_page_count
    return result
def get_one_page_favorite(page_count):
    # http://www.weibo.com/fav?page=1
    favorite_pagination_url = "http://www.weibo.com/fav"
    query_data = {"page": page_count}
    cookies_list = {"SUB": COOKIE_INFO["SUB"]}
    favorite_pagination_response = net.http_request(favorite_pagination_url, method="GET", fields=query_data, cookies_list=cookies_list)
    result = {
        "blog_info_list": [],  # all weibo info
        "is_error": False,  # whether the data has an unexpected format
        "is_over": False,  # whether this is the last favorites page
    }
    if favorite_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(favorite_pagination_response.status))
    favorite_data_html = tool.find_sub_string(favorite_pagination_response.data, '"ns":"pl.content.favoriteFeed.index"', '"})</script>', 2)
    favorite_data_html = tool.find_sub_string(favorite_data_html, '"html":"', '"})')
    if not favorite_data_html:
        raise crawler.CrawlerException("failed to extract the favorites info from the page\n%s" % favorite_data_html)
    # unescape all escaped backslashes and drop useless line breaks etc.
    html_data = favorite_data_html.replace("\\\\", chr(1))
    for replace_string in ["\\n", "\\r", "\\t", "\\"]:
        html_data = html_data.replace(replace_string, "")
    html_data = html_data.replace(chr(1), "\\")
    # parse the page
    children_selector = PQ(html_data.decode("UTF-8")).find('div.WB_feed').children()
    if children_selector.length == 0:
        raise crawler.CrawlerException("failed to match the favorites info\n%s" % favorite_data_html)
    if children_selector.length == 1:
        raise crawler.CrawlerException("no favorites left")
    # parse blog ids and image URLs
    for i in range(0, children_selector.length - 1):
        feed_selector = children_selector.eq(i)
        # skip deleted weibo
        if not feed_selector.has_class("WB_feed_type"):
            continue
        result_blog_info = {
            "blog_id": None,  # blog id (mid)
            "image_url_list": [],  # all image URLs
        }
        # parse the blog id
        blog_id = feed_selector.attr("mid")
        if not crawler.is_integer(blog_id):
            raise crawler.CrawlerException("failed to parse the weibo id from the favorites info\n%s" % feed_selector.html().encode("UTF-8"))
        result_blog_info["blog_id"] = str(blog_id)
        # WB_text: weibo text
        # WB_media_wrap: weibo media (images)
        # .WB_feed_expand .WB_expand: a reposted weibo, containing the same WB_text / WB_media_wrap structure
        if feed_selector.find(".WB_feed_expand .WB_expand").length == 0:
            media_selector = feed_selector.find(".WB_media_wrap")
        else:
            media_selector = feed_selector.find(".WB_feed_expand .WB_expand .WB_media_wrap")
        # if media exists
        if media_selector.length == 1:
            thumb_image_url_list = re.findall('<img src="([^"]*)"/>', media_selector.html())
            if len(thumb_image_url_list) > 0:
                image_url_list = []
                for image_url in thumb_image_url_list:
                    # swap the thumbnail path segment for the large-size one
                    temp_list = image_url.split("/")
                    temp_list[3] = "large"
                    image_url_list.append("http:" + str("/".join(temp_list)))
                result_blog_info["image_url_list"] = image_url_list
        if len(result_blog_info["image_url_list"]) > 0:
            result["blog_info_list"].append(result_blog_info)
    # the last feed entry holds the pagination info
    page_selector = children_selector.eq(children_selector.length - 1)
    # check whether this is the last page
    page_count_find = re.findall("第(\d*)页", page_selector.html().encode("UTF-8"))
    if len(page_count_find) > 0:
        page_count_find = map(int, page_count_find)
        result["is_over"] = page_count >= max(page_count_find)
    else:
        result["is_over"] = True
    return result
def get_chattel_mortgage_info_detail(self, onclick, detail_list):
    result = dict()
    if onclick is None or onclick.strip() == '':
        return result
    temp_list = onclick.split(u'\'')
    if temp_list is None or len(temp_list) < 2:
        return result
    temp_list = temp_list[1].split(u'\'')
    if temp_list is None or len(temp_list) <= 0:
        return result
    morreg_id = temp_list[0]
    # walk every detail page until the one matching this registration id
    for detail in detail_list:
        url = detail.get('url')
        if not isinstance(url, basestring):
            continue
        if morreg_id not in url:
            continue
        text = detail.get('text')
        if not isinstance(text, basestring) or text.strip() == u'':
            continue
        table_list = PyQuery(text, parser='html').find('.detailsList')
        if table_list is None or table_list.length < 5:
            raise FieldMissError
        # chattel mortgage registration info
        td_list = table_list.eq(0).find('td')
        cm_dict = dict()
        result[GsModel.ChattelMortgageInfo.ChattelDetail.CHATTEL_MORTGAGE] = cm_dict
        cm_dict[GsModel.ChattelMortgageInfo.ChattelDetail.ChattelMortgage.REGISTER_NUM] = td_list.eq(0).text()
        cm_dict[GsModel.ChattelMortgageInfo.ChattelDetail.ChattelMortgage.REGISTER_DATE] = td_list.eq(1).text()
        cm_dict[GsModel.ChattelMortgageInfo.ChattelDetail.ChattelMortgage.REGISTER_OFFICE] = td_list.eq(2).text()
        # mortgagee overview info
        tr_list = table_list.eq(1).find('tr').items()
        mps_list = list()
        result[GsModel.ChattelMortgageInfo.ChattelDetail.MORTGAGE_PERSON_STATUS] = mps_list
        for tr in tr_list:
            td_list = tr.find('td')
            if td_list is None or td_list.length < 5:
                continue
            item = dict()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.MortgagePersonStatus.MORTGAGE_PERSON_NAME] = td_list.eq(1).text()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.MortgagePersonStatus.CERTIFICATE_TYPE] = td_list.eq(2).text()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.MortgagePersonStatus.CERTIFICATE_NUM] = td_list.eq(3).text()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.MortgagePersonStatus.ADDRESS] = td_list.eq(4).text()
            mps_list.append(item)
        # secured claim overview info
        td_list = table_list.eq(2).find('td')
        gps_dict = dict()
        result[GsModel.ChattelMortgageInfo.ChattelDetail.GUARANTEED_PERSON_STATUS] = gps_dict
        gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteedPersonStatus.KIND] = td_list.eq(0).text()
        gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteedPersonStatus.AMOUNT] = td_list.eq(1).text()
        gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteedPersonStatus.SCOPE] = td_list.eq(2).text()
        gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteedPersonStatus.PERIOD] = td_list.eq(3).text()
        gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteedPersonStatus.REMARK] = td_list.eq(4).text()
        # collateral overview info
        tr_list = table_list.eq(3).find('tr').items()
        gs_list = list()
        result[GsModel.ChattelMortgageInfo.ChattelDetail.GUARANTEE_STATUS] = gs_list
        for tr in tr_list:
            td_list = tr.find('td')
            if td_list is None or td_list.length < 5:
                continue
            item = dict()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteeStatus.NAME] = td_list.eq(1).text()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteeStatus.AFFILIATION] = td_list.eq(2).text()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteeStatus.SITUATION] = td_list.eq(3).text()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteeStatus.REMARK] = td_list.eq(4).text()
            gs_list.append(item)
        # change info
        tr_list = table_list.eq(4).find('tr').items()
        change_list = list()
        result[GsModel.ChattelMortgageInfo.ChattelDetail.CHANGE_INFO] = change_list
        for tr in tr_list:
            td_list = tr.find('td')
            if td_list is None or td_list.length < 3:
                continue
            item = dict()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.ChangeInfo.CHANGE_DATE] = td_list.eq(1).text()
            item[GsModel.ChattelMortgageInfo.ChattelDetail.ChangeInfo.CHANGE_CONTENT] = td_list.eq(2).text()
            # the original built item without appending it, which discarded every row
            change_list.append(item)
        break
    return result
batch_mode = True

cfg = configparser.RawConfigParser()
cfg.read(os.path.expanduser('~/secured/myukrsib.cfg'))
smtp_host = cfg.get('default', 'smtp_host')
smtp_user = cfg.get('default', 'smtp_user')
smtp_secret = cfg.get('default', 'smtp_secret')

q = PyQuery(sys.stdin.read())
tbls = PyQuery(q('form#cardAccountInfoForm').children('table'))

t = tbls.eq(0)('td').eq(1).text().split(':')
available_amount = t[2]
global_own_amount = t[1].split()[0]
t = tbls.eq(2)('td').eq(0).text().split(':')
overdraft = t[1]
t = tbls.eq(2)('td').eq(1).text().split(':')
replenishment = t[1]
t = tbls.eq(2)('td').eq(4).text().split(':')
own_amount = t[1]
t = tbls.eq(2)('td').eq(5).text().split(':')
withdrawal = t[1]

account_ops = []
card_ops = []
holds = []
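# Usage sketch: the script above parses a saved card-account page from stdin,
# e.g. ``python myukrsib_parse.py < card_account_page.html`` (both filenames
# hypothetical), and expects ~/secured/myukrsib.cfg to provide the smtp_*
# settings read at the top.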