def get_fund_mapping(self):
    # sub_code, sub_name, main_code, main_name
    sub_to_main_mapping = []
    html = requests.get(self._url, timeout=30.0).content
    document = PyQuery(unicode(html, 'utf-8'))
    fund_blocks = [document.items('.aa'), document.items('.dd')]
    for each_block in fund_blocks:
        for class_tag in each_block:
            items_list = [item.text() for item in class_tag.items('td')]
            sub_to_main_mapping.append((items_list[1], items_list[3]))
    return dict(sub_to_main_mapping)
def find_indicacoes(html: str) -> List[Indicacao]:
    d = PyQuery(html)
    indicacoes = []
    current_index = 0
    for heading in d.items('div[class="w3-col w3-left-align"] > p > b'):
        indicacoes.append({
            "heading": heading.text(),
        })
    for content in d.items('div#Demo1'):
        autor = content('div[class="w3-col w3-left w3-twothird"]').remove(
            'label').remove('br').text()
        protocolo = content('div[class="w3-col w3-left w3-third"]').remove(
            'label').remove('br').text()
        tramites = []
        tramite_idx = 0
        for content_row in content.items('div[class="w3-row-padding"]'):
            for col in content_row.items(
                    'div[class="w3-col w3-left w3-quarter"]'):
                text = col.text()
                if text not in [
                        'Data de Trâmite', 'Hora de Trâmite', 'Trâmite',
                        'Anotações'
                ]:
                    if tramite_idx == 0:
                        tramites.append([text])
                    else:
                        tramites[-1].append(text)
                    tramite_idx += 1
                    if tramite_idx == 4:
                        tramite_idx = 0
        indicacoes[current_index] = {
            **indicacoes[current_index],
            "autor": autor,
            "protocolo": protocolo,
            "tramites": [{
                "data": tramite[0],
                "hora": tramite[1],
                "texto": tramite[2],
                "anotacoes": tramite[3]
            } for tramite in tramites],
        }
        current_index += 1
    return indicacoes
def __init__(self, data: PyQuery, pages: List[PyQuery], index: int):
    self.origin: PyQuery = data  # raw data
    # parsed result items
    self.raw: List[GoogleItem] = [GoogleItem(i) for i in data.items()]
    self.index: int = index  # current page index
    self.page: int = len(pages)  # total number of pages
    self.pages: List[PyQuery] = pages  # page sources
def getResTb():
    html = fileworker.getHTML()
    pq = PyQuery(html)
    result = dict()
    blocks = list()
    for i in pq.items('.row.result'):
        blocks.append(i)
def Parse(self, html_data):
    parser_result = ParserResult()
    product_list = parser_result.product_list
    doc = PyQuery(html_data)
    for productEl in doc.items(self._item_query):
        link_el = productEl.find(self._link_el_query)
        if not link_el:
            logging.info('Failed to find link element!')
            continue
        url = _GetLinkHref(link_el[0])
        if not url:
            logging.info('Failed to find href!')
            continue
        name_el = productEl.find(self._name_el_query)
        if not name_el:
            logging.info('Failed to find name element!')
            continue
        name = _GetText(name_el[0])
        image_el = productEl.find(self._image_el_query)
        if not image_el:
            logging.info('Failed to find image element!')
            continue
        image_url = _GetImageSrc(image_el[0])
        if not image_url:
            logging.info('Failed to find image src!')
            continue
        product = product_list.product.add()
        product.url = url.strip()
        product.name = name
        product.image_url = image_url.strip()
    return parser_result
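# The _GetLinkHref, _GetText and _GetImageSrc helpers used by Parse() are not
# included in this snippet. A minimal sketch of what they might look like,
# assuming link_el[0], name_el[0] and image_el[0] are plain lxml elements
# (which is what indexing a PyQuery object returns); these names and bodies
# are assumptions, not the original implementation:
def _GetLinkHref(element):
    # href attribute of an <a> element, or None if it is missing.
    return element.get('href')


def _GetText(element):
    # Concatenated text content of the element and its descendants.
    return element.text_content()


def _GetImageSrc(element):
    # src attribute of an <img> element, or None if it is missing.
    return element.get('src')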
def extract_urls(html):
    # Extracting URLs is CPU-bound work, so there is no need to turn it into a coroutine.
    '''Parse URLs out of the article page and put the unseen ones into the waiting queue.'''
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        # Skip URLs that were already crawled; queue the rest for crawling.
        if url and url.startswith("http") and url not in seen_urls:
            waitting_urls.put_nowait(url)
def FetchAndSave():
    # URL of the Baidu buzz ranking page (lists 50 trending news keywords).
    fengyunbang_url = 'http://top.baidu.com/buzz?b=1'
    resp = requests.get(fengyunbang_url)
    resp.encoding = 'gb2312'
    # Create a CSV file to store the data.
    csvf = open('data.csv', 'a+', encoding='gbk', newline='')
    writer = csv.writer(csvf)
    writer.writerow(('news_content', 'keyword'))
    # Parse the keyword and link fields out of the HTML.
    doc = PyQuery(resp.text)
    for item in doc.items('.keyword'):
        keyword = item('a').text().split(' ')[0]
        keyword_link = item('a').attr.href
        news_links = get_keywords_news_links(keyword_link)
        for news_link in news_links:
            try:
                content = get_news_content(news_link)
                if content:
                    print(keyword, content[0:20])
                    writer.writerow((content, keyword))
            except:
                print(news_link)
def all_sub_div(partial):
    """Helper function that returns all the direct sub divs of the given partial.

    Params:
        partial (str): an HTML partial to analyse; it must be a div element.

    Returns:
        list(str): content of all the direct `div` child elements in the
            partial. Empty list if the root element is not a div or if no
            direct div child exists.

    Examples:
        >>> all_sub_div('<div></div>')
        []
        >>> all_sub_div('<section></section>')
        []
        >>> all_sub_div('<div><div>toto</div><div>titi</div></div>')
        ['<div>toto</div>', '<div>titi</div>']
        >>> all_sub_div('<div><div>toto</div></div>')
        ['<div>toto</div>']
        >>> all_sub_div('<section><div>toto</div></section>')
        []
    """
    d = PyQuery(partial)
    if not d(":root").is_("div"):
        return []
    return [elem.outer_html() for elem in d.items(":root>div")]
def extract_urls(html):
    """Parse new URLs out of the returned HTML."""
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            waitting_urls.append(url)
def get_organization():
    url = 'https://summerofcode.withgoogle.com/archive/2017/organizations/'
    response = open_url(url)
    soup = PyQuery(response)
    soup = soup('.organization-card__link')
    for each in soup.items():
        yield 'https://summerofcode.withgoogle.com' + each.attr('href')
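# open_url is referenced here and in get_technologies below but not defined in
# these snippets. A minimal sketch, assuming it simply fetches the page body
# with requests (name and behaviour are assumptions):
import requests


def open_url(url):
    # Return the decoded response body so PyQuery can parse it.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text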
def video_download(url, title):
    response = requests.get(url, headers=headers).text
    doc = PyQuery(response)
    # The article body sits inside <div class="Post-RichTextContainer">; each
    # embedded video is a VideoCard whose <span class="z-ico-video"> wraps a
    # link of the form https://www.zhihu.com/video/<video_id>.
    for item in doc.items('#root .Post-RichTextContainer div'):
        # for url in item.items('.z-ico-video '):
        # The video id starts at offset 55 of the serialized span (27 + 28 = 55).
        videourl = str(item.find('a span .z-ico-video'))[55:]
        # print(videourl)
        # URL of the JSON document that describes the video streams.
        videos = 'https://lens.zhihu.com/api/v4/videos/' + videourl
        resp = requests.get(videos, headers=headers).text
        urls = json.loads(resp)
        # Pick the low-definition play URL.
        videourl = urls['playlist']['LD']['play_url']
        print(videourl)
        videourl = videourl.replace('&amp;', '&').replace(' ', '')
        # Download the video.
        videofile = requests.get(videourl, headers=headers)
        with open(title + '.mp4', 'wb') as file:
            file.write(videofile.content)
        print('Finished downloading: ' + title + '.mp4')
        time.sleep(1)
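# The headers dict used throughout video_download is defined elsewhere. A
# typical, assumed value would simply carry a browser User-Agent so the
# requests are not rejected:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
}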
def gain_data():
    """Scrape the crawler-developer job listings from 51job into the data dict and save them as CSV."""
    for i in range(1, PAGE_NUM + 1):
        time.sleep(random.randint(5, 9))
        url = "https://jobs.51job.com/pachongkaifa/p" + str(i)
        r = requests.get(url, headers=headers)
        r.encoding = "gbk"
        content = PyQuery(r.text)(".detlist.gbox").children("div")
        for d in content.items():
            info = d(".info span").text().split(" ")
            data["Job"].append(d(".info .title").text())
            data["Company"].append(d(".info a").text().split(" ")[-1])
            data["Location"].append(info[-3])
            data["Salary"].append(info[-2])
            data["Date"].append(info[-1])
            order = d(".order").text().split("|")
            data["Education"].append(list_helper(order[0]))
            data["Experience"].append(list_helper(order[1]))
            data["Type"].append(list_helper(order[2]))
            data["Scope"].append(list_helper(order[3]))
            data["Detail"].append(d(".text").text().replace(" ", "", -1))
            data["Url"].append(d(".info span a").attr("href"))
    pandas.DataFrame(data).to_csv("../resource/qcwy/data.csv")
def extract_urls(html):
    # urls = []
    doc = PyQuery(html)
    for link in doc.items("a"):  # every <a> tag
        url = link.attr("href")  # the href attribute of the <a> tag
        if url and url.startswith("https") and url not in seen_urls:
            # urls.append(url)  # if detail pages contain further URLs (the start page does),
            #                   # they could be collected here and crawled iteratively
            waitting_urls.append(url)  # queue the URL for crawling
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waiting_urls.append(url)
def get_technologies(url):
    l = []
    response = open_url(url)
    soup = PyQuery(response)
    soup = soup('.organization__tag')
    for each in soup.items():
        l.append(each.text())
    return l
def get_ingredients():
    pages = send_requests()
    ingredients = []
    for page in pages:
        pq = PyQuery(page)(".grid-view")
        ig = [i.attr("id") for i in pq.items("li")]
        ig = [i.replace("_", " ") for i in ig]
        ingredients.extend(ig)
    return ingredients
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls
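# A small, self-contained illustration of the PyQuery calls the extract_urls
# variants above rely on: .items("a") yields one PyQuery wrapper per <a> tag
# and .attr("href") reads its href attribute (None when the attribute is
# missing). The sample markup is made up for the example.
from pyquery import PyQuery

sample = '<div><a href="http://example.com/page">abs</a><a href="/local">rel</a><a>none</a></div>'
for link in PyQuery(sample).items("a"):
    print(link.attr("href"))
# -> http://example.com/page
# -> /local
# -> None   (filtered out by the startswith("http") check above)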
def get_flight_ids(html_text):
    __pq = PyQuery(html_text).find("tbody").find("td.inputselect").find(
        "div.content").find("input")
    ids = []
    for __node in __pq.items():
        if __node.hasClass("radio-ajax"):
            __matched = regex("[A-Z]{2}.*[A-Z]", str(__node.attr("value")))
            ids.extend(__matched)
    return ids
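# regex() is not defined in the snippet above; since its result is extended
# into a list, it presumably behaves like re.findall. A guessed stand-in:
import re


def regex(pattern, text):
    # Return every non-overlapping match of pattern in text.
    return re.findall(pattern, text)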
def get_movies(self, html):
    doc = PyQuery(html)
    for item in doc.items('.board-item-content'):
        return {
            'name': item.find('.name').text(),
            'stars': item.find('.star').text().split(':')[1],
            'time': item.find('.releasetime').text().split(':')[1],
            'score': item.find('.score').text()
        }
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls
def extract_urls(html):
    # Extract every URL from the HTML.
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls
def decode_html(html_text):
    '''Parse the movie data out of the page.'''
    doc = PyQuery(html_text)
    for item in doc.items('.board-wrapper dd'):
        yield {
            'name': item.find('.name').text(),
            'actors': item.find('.star').text(),
            'time': item.find('.releasetime').text(),
            'score': item.find('.score').text()
        }
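# A usage sketch for the decode_html generator above. The markup is a made-up
# stand-in for the real board page; only the class names matter here.
sample_board = '''
<div>
  <dl class="board-wrapper">
    <dd>
      <p class="name">Some Movie</p>
      <p class="star">Starring: A, B</p>
      <p class="releasetime">Release date: 2019-01-01</p>
      <p class="score">9.1</p>
    </dd>
  </dl>
</div>
'''
for movie in decode_html(sample_board):
    print(movie['name'], movie['score'])  # -> Some Movie 9.1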
def extrack_urls(html):
    urls = []
    if html:
        pq = PyQuery(html)
        for link in pq.items('a'):
            url = link.attr('href')
            # Filter out URLs that do not qualify (non-http or already seen).
            if url and url.startswith('http') and url not in seen_urls:
                urls.append(url)
                waitting_urls.append(url)
    return urls
def extract_url(html):
    urls = []
    pq = PyQuery(html)
    # Iterate over every <a> link on the page.
    for link in pq.items("a"):
        # Read the href attribute.
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:  # skip duplicates and non-http URLs
            urls.append(url)
            waitting_url.append(url)
    return urls
def extract_urls(html):
    print("extract_html", html)
    urls = []
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    print(waitting_urls)
    return urls
async def main():
    loop = asyncio.get_running_loop()
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, "http://www.biquge.cm/12/12097/")
        pq = PyQuery(html)
        for item in pq.items("dd a"):
            title = item.text()
            text = await get_text(session, item.attr("href"))
            # Run the blocking legacy save() in an executor for compatibility.
            await loop.run_in_executor(None, save, title, text)
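# fetch() and get_text() are not shown above. A minimal sketch of fetch with
# aiohttp (an assumption, not the original helper); get_text would follow the
# same pattern and then extract the chapter body:
async def fetch(session, url):
    # GET the page and return its decoded body.
    async with session.get(url) as resp:
        return await resp.text()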
async def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    # Collect every URL on the page.
    for link in pq.items('a'):
        url = link.attr('href')
        # Filter out URLs that were already seen.
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls
def getItemsByKeyword(
    keyword,
    encoding='GBK',
    page=1,
    price=None,
):
    if not PyQuery:
        return []
    items = []
    domain = \
        'http://s.taobao.com/search?q=$Q$&style=list&bcoffset=1&tab=all&cd=false&v=auction&sort=sale-desc&s={0}'
    domain = domain.replace(
        '$Q$',
        repr(keyword.decode(encoding).encode('GBK')).replace('\\x', '%').upper()[1:-1])
    domain = domain.replace(' ', '%20')

    # price interval
    if price:
        domain = domain \
            + '&filter=reserve_price%5B{0}%2C{1}%5D'.format(price[0], price[1])
    # price interval end

    for i in range(page):
        url = domain.format(i * 40)
        r = __import__('requests').get(url)
        encoding = r.encoding
        py = PyQuery(r.content.decode(encoding))
        page_num = 0
        for i, meta_item in enumerate(py.items("div[nid]")):
            page_num += 1
            try:
                info = {
                    'rank': i + 1,
                    'keyword': keyword,
                    'itemName': meta_item('h3 a').attr('title').encode('utf-8'),
                    'itemId': meta_item.attr('nid'),
                    'itemPic': meta_item('img').attr('src'),
                    'wangwang': meta_item('.seller a').html().encode('utf-8').strip(),
                    'userNumId': re.findall(r'id\=(\d+)',
                                            meta_item('.seller a').attr('href'))[0],
                    'price': re.findall(r'\d+\.\d*',
                                        meta_item('.price').html().encode('utf-8'))[0],
                    'location': (meta_item('.loc div').html() or '').encode('utf-8'),
                    'tradeNum': re.findall(r'\d+',
                                           meta_item('.dealing div').html() or '0')[-1],
                    'rateNum': re.findall(r'\d+',
                                          (meta_item('.count a').html() or '0').encode('utf-8'))[0],
                }
                items.append(info)
            except:
                traceback.print_exc()
                # continue
        # if page_num < 40:
        #     break
    return items
def _get_other_urls(self, data: PyQuery) -> None:
    for link in data.items():
        if link.attr("href") == "#":
            continue
        if link.text() == "SauceNao":
            self.saucenao_url = "https:" + link.attr("href")
        elif link.text() == "ascii2d.net":
            self.ascii2d_url = link.attr("href")
        elif link.text() == "Google Images":
            self.google_url = "https:" + link.attr("href")
        elif link.text() == "TinEye":
            self.tineye_url = "https:" + link.attr("href")
def extract_links(source):
    '''Extract the links to the detail pages.'''
    pq = PyQuery(source)
    for link in pq.items("a"):
        _url = link.attr("href")
        # Keep only detail-page URLs that belong to the city's lianjia.com domain.
        if _url and re.match(r'https://.*?/\d+.html', _url) and \
                '{}.lianjia.com'.format(city) in _url:
            links_detail.add(_url)
    print(links_detail)
def decode_html(html_text):
    # print(html_text)
    doc = PyQuery(html_text)
    # print(doc)
    for item in doc.items('.board-wrapper dd'):
        # print(item)
        yield {
            'name': item.find('.name').text(),
            'actors': item.find('.star').text(),
            'time': item.find('.releasetime').text(),
            'score': item.find('.score').text(),
        }
def get_contributive_info(self, session, text, i_d, data):
    # Work out how many pages there are.
    total_page_num = self.get_page_num(text, '#paging') + 1
    if total_page_num == 1:
        total_page_num += 1
    for page in xrange(1, total_page_num):
        try:
            url = 'http://{host}/business/QueryInvList.jspx?pno={page}&order=0&mainId={i_d}'.format(
                host=self.host, page=page, i_d=i_d)
            r = self.filter_request(session, session.get, url)
            if r is None:
                self.append_model(data, Model.contributive_info, url, '',
                                  status=self.STATUS_FAIL)
                continue
            self.append_model(data, Model.contributive_info, url, r.text)
            index = 0
            tr_list = PyQuery(
                r.text, parser='html').find('.detailsListGDCZ').find('tr')
            item_list = tr_list.items()
            for item in item_list:
                index += 1
                onclick = item.find('a').attr('onclick')
                if onclick is None or onclick == '':
                    continue
                invest_list = self.invest_search_obj.findall(
                    onclick.encode('utf-8'))
                for invest_id in invest_list:
                    url = 'http://{host}/queryInvDetailAction.jspx?invId={i_d}'.format(
                        host=self.host, i_d=invest_id)
                    r = self.filter_request(session, session.get, url)
                    if r is None:
                        self.append_model(data, Model.contributive_info, url, '',
                                          status=self.STATUS_FAIL,
                                          classify=Model.type_detail)
                        continue
                    self.append_model(data, Model.contributive_info, url, r.text,
                                      classify=Model.type_detail)
        except Exception as e:
            self.log.exception(e)
def unpack_events(html, args):
    query = PyQuery(html)
    logger.debug("Source encoding: {}".format(query.encoding))
    rows = query.items('tr')
    offset = 0
    events = []
    for row in rows:
        parsed_row = __parse_row(row)
        event_id = offset
        events.append(parsed_row)
        offset += 1
    logger.debug("Parsed {} entries".format(offset))
    return events
def process_html_file(fi):
    f = open(fi, 'r')
    d = PyQuery(f.read())
    f.close()
    year = None
    links = {}
    for a in d.items('.toc a'):
        if len(a.text()) == 4:
            year = a.text()
            links[year] = []
        else:
            if year:
                links[year].append((a.attr('href'), a.text()))
            else:
                if 'NONE' not in links:
                    links['NONE'] = []
                links['NONE'].append((a.attr('href'), a.text()))
        logger.debug('Retrieved data %s %s' % (a.text(), a.attr('href')))
    data = []
    for year in links.keys():
        for link_id, link_name in links[year]:
            logger.info('Getting text at %s' % link_id)
            title = date = text = ''
            for x in d(link_id).parents('p').nextAll().items():
                logger.debug('X: %s' % x.outerHtml())
                if '<a' in x.outerHtml():
                    break
                elif 'End of the Project Gutenberg' in x.text():
                    break
                elif '<h2' in x.outerHtml():
                    title = x.text()
                elif '<h3' in x.outerHtml():
                    date = x.text()
                elif '<p' in x.outerHtml():
                    # Normalize whitespace, then collapse anything RE_CLEAN_TEXT matches.
                    text += RE_CLEAN_TEXT.sub(
                        ' ', x.text().replace('\n', ' ').replace('\r', ' '))
                else:
                    logger.error('Unrecognized tag: %s' % x.outerHtml())
            if 'Gutenberg' in text:
                logger.error('%s\n%s' % (title, text))
            logger.debug('\nTitle: %s\nDate: %s\nText: %s' % (title, date, text))
            data.append((year, date, title, text))
    logger.info('Retrieved %d pieces' % len(data))
    return data
async def Start():
    timestamp = time.time()
    parser = argparse.ArgumentParser(description="BioTC by Bioruebe (https://bioruebe.com), 2014-2019, Version 3.0.0, released under a BSD 3-clause style license.\n\nBioTC is a small application to simplify trading Steam Trading Cards with the SteamCardExchange bot by comparing the user's Steam inventory with the available cards on steamcardexchange.net")
    parser.add_argument("-n", "--name", action="store", type=str, default=None,
                        help="Use specified Steam ID instead of reading it from " + STEAM_ID_FILE_NAME)
    parser.add_argument("-l", "--limit", action="store", type=int, default=-1,
                        help="Stop searching after n sets have been found")
    args = parser.parse_args()
    parser.print_help()
    print("\n-----------------------------------------------------------------------------\n")

    if args.name is None:
        try:
            f = open(STEAM_ID_FILE_NAME)
            args.name = f.read()
        except:
            pass

    if args.name is None:
        sys.exit("Error: Could not read SteamID from file. Make sure the file '" +
                 STEAM_ID_FILE_NAME + "' contains a valid SteamID.")

    result = {
        "sets": [],
        "steamID": args.name,
        "cardsCount": 0,
        "gameCount": 0,
        "completeSets": 0,
        "processingTime": 0,
        "time": 0
    }

    async with aiohttp.ClientSession() as session:
        print("Loading Steam inventory")
        url = "https://steamcommunity.com/id/" + args.name + "/inventory/json/753/6"
        raw_json = await fetch(session, url)
        cardData = json.loads(raw_json)
        # print(cardData)
        if cardData is None or not cardData["success"]:
            sys.exit("Invalid JSON data received. Aborting.")

        for key, card in cardData["rgDescriptions"].items():
            # Ignore emoticons, backgrounds
            if "Trading Card" not in card["type"]:
                # print(card["name"] + " is not a trading card.")
                continue
            # print(card)
            appid = card["market_fee_app"]
            try:
                game_cards = card_requests[appid]
                game_cards.append(card)
            except KeyError:
                card_requests[appid] = [card]

        i = 0
        result["gameCount"] = len(card_requests)
        for appid, inventory in card_requests.items():
            print("Processing " + appid)
            url = "https://www.steamcardexchange.net/index.php?inventorygame-appid-" + appid
            resp = await fetch(session, url)
            time.sleep(0.5)
            dom = PyQuery(resp)
            game_name = dom("h2").text()
            card_items = dom.items(".inventory-game-card-item")
            card_set = Set(appid, game_name)
            # print(inventory)

            for item in card_items:
                card = Card(item.find(".card-name").text().strip())
                if card.name == "":
                    # print("[Warning] Invalid card name: " + card.name)
                    continue

                # available = item.find(".green, .orange")
                # if not available:
                #     continue

                stock = filter_card_stock_value(item.find(".card-amount").text())
                card.bot_inventory = stock[0]
                if len(stock) > 1:
                    card.bot_inventory_pending = stock[1]

                try:
                    card.price = int("".join(filter(str.isdigit, item.find(".card-price").eq(1).text())))
                    if card_set.standard_price < 1 and card.bot_inventory > 1:
                        card_set.standard_price = card.price
                except ValueError:
                    pass

                card.trade_url = item.find(".button-blue").attr("href")
                card.user_inventory = get_card_amount_in_inventory(cardData, inventory, card.name)
                card_set.cards.append(card)

            card_set.update_complete_sets()
            card_set.calculate_total_cost()
            card_set.set_progress_class()
            card_set.set_card_classes()
            card_set.cards.sort(key=lambda c: (c.user_inventory, 10 - c.bot_inventory))
            result["completeSets"] += card_set.complete_sets

            if card_set.user_inventory_is_empty():
                print("User has " + str(card_set.complete_sets) +
                      " complete sets, but no surplus cards in inventory")
                continue
            if card_set.bot_inventory_is_empty():
                print("Bot has no unowned cards (at normal price) for this set")
                continue

            print(card_set)
            result["sets"].append(card_set)
            i += 1
            if args.limit > 0 and i >= args.limit:
                break

    env = Environment(loader=FileSystemLoader("."))
    template = env.get_template('template.html')
    result["cardCount"] = sum(len(list(filter(lambda c: c.user_inventory < 1, s.cards)))
                              for s in result["sets"])
    result["processingTime"] = "{:.1f}".format(time.time() - timestamp)
    result["time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    html = template.render(result)
    file = open("Cards.html", "w", encoding="utf-8")
    file.write(html)
    file.close()
    os.startfile("Cards.html")
def getItemsByKeyword(
    keyword,
    encoding='utf-8',
    page=1,
    price=None,
    sort='sale-desc'
):
    """
    @params sort: renqi-desc/sale-desc/credit-desc/old_starts/price-asc/price-desc
    """
    if not PyQuery:
        return []
    items = []
    params = {
        'q': '$Q$',
        'style': 'grid',
        # 'bcoffset': 1,
        'tab': 'all',
        # 'cd': 'false',
        # 'v': 'auction',
        'sort': sort,
        's': '{0}',
        # 'p4poffset': '4',
        # 'bcoffset': '-4',
        'btab': 0
    }
    domain = 'http://s.taobao.com/search?' + \
        '&'.join(['%s=%s' % (k, v) for k, v in params.items()])
    domain = domain.replace(
        '$Q$',
        repr(keyword.decode(encoding).encode('GBK')).replace('\\x', '%').upper()[1:-1])
    domain = domain.replace(' ', '%20')

    # price interval
    if price:
        domain = domain \
            + '&filter=reserve_price%5B{0}%2C{1}%5D'.format(price[0], price[1])
    # price interval end

    for i in range(page):
        url = domain.format(i * 40)
        r = requests.get(url, timeout=5, allow_redirects=True)
        # encoding = r.encoding
        content = eval(repr(r.content).replace('\\x86"', '"').replace('\\x90"', '"'))
        py = PyQuery(content.decode(r.encoding))
        page_num = 0
        for j, meta_item in enumerate(py.items("div[nid]")):
            page_num += 1
            try:
                info = {
                    'rank': i * 40 + j + 1,
                    'keyword': keyword.decode(encoding).encode('utf-8'),
                    'itemName': meta_item('h3 a').attr('title').encode('utf-8'),
                    'itemId': meta_item.attr('nid').encode('utf-8'),
                    'itemPic': meta_item('img').attr('src').encode('utf-8'),
                    'wangwang': meta_item('.seller a').html().encode('utf-8').strip(),
                    'userNumId': re.findall(r'id\=(\d+)',
                                            meta_item('.seller a').attr('href'))[0].encode('utf-8'),
                    'price': re.findall(r'\d+\.\d*',
                                        meta_item('.price').html().encode('utf-8'))[0],
                    'location': (meta_item('.loc div').html() or '').encode('utf-8'),
                    'tradeNum': re.findall(r'\d+',
                                           meta_item('.dealing div').html() or '0')[-1].encode('utf-8'),
                    'rateNum': re.findall(r'\d+',
                                          (meta_item('.count a').html() or '0').encode('utf-8'))[0],
                }
                items.append(info)
            except:
                traceback.print_exc()
                # continue
        # if page_num < 40:
        #     break
    return items