Example #1
    def get_fund_mapping(self):
        # sub_code, sub_name, main_code, main_name
        sub_to_main_mapping = []
        html = requests.get(self._url, timeout=30.0).content
        document = PyQuery(html.decode('utf-8'))  # decode the response bytes before parsing

        fund_blocks = [document.items('.aa'), document.items('.dd')]
        for each_block in fund_blocks:
            for class_tag in each_block:
                items_list = [item.text() for item in class_tag.items('td')]
                sub_to_main_mapping.append((items_list[1], items_list[3]))
        return dict(sub_to_main_mapping)
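
get_fund_mapping() is written as a method, so running it needs an object that carries the page URL in self._url. A minimal, hypothetical wrapper class for trying it out (everything here except the method itself is an assumption):

import requests
from pyquery import PyQuery


class FundMappingScraper:
    """Hypothetical container; only get_fund_mapping() above comes from the original."""

    def __init__(self, url):
        self._url = url

    # get_fund_mapping() from Example #1 would be pasted here unchanged.


# mapping = FundMappingScraper("https://example.com/fund-list").get_fund_mapping()
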
Example #2
def find_indicacoes(html: str) -> List[Indicacao]:
    d = PyQuery(html)
    indicacoes = []
    current_index = 0

    for heading in d.items('div[class="w3-col w3-left-align"] > p > b'):
        indicacoes.append({
            "heading": heading.text(),
        })

    for content in d.items('div#Demo1'):
        autor = content('div[class="w3-col w3-left w3-twothird"]').remove(
            'label').remove('br').text()
        protocolo = content('div[class="w3-col w3-left w3-third"]').remove(
            'label').remove('br').text()
        tramites = []
        tramite_idx = 0
        for content_row in content.items('div[class="w3-row-padding"]'):
            for col in content_row.items(
                    'div[class="w3-col w3-left w3-quarter"]'):
                text = col.text()
                if text not in [
                        'Data de Trâmite', 'Hora de Trâmite', 'Trâmite',
                        'Anotações'
                ]:
                    if tramite_idx == 0:
                        tramites.append([text])
                    else:
                        tramites[-1].append(text)
                    tramite_idx += 1
                    if tramite_idx == 4:
                        tramite_idx = 0

        indicacoes[current_index] = {
            **indicacoes[current_index],
            "autor": autor,
            "protocolo": protocolo,
            "tramites": [{
                "data": tramite[0],
                "hora": tramite[1],
                "texto": tramite[2],
                "anotacoes": tramite[3]
            } for tramite in tramites],
        }
        current_index += 1

    return indicacoes
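
Indicacao is never defined in the snippet; since the function actually builds plain dictionaries, a type alias along these lines (an assumption, not from the original project) keeps the annotation consistent:

from typing import Any, Dict

# Hypothetical alias: each "indicacao" is returned as a plain dict.
Indicacao = Dict[str, Any]
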
Example #3
 def __init__(self, data: PyQuery, pages: List[PyQuery], index: int):
     self.origin: PyQuery = data  # raw data
     # parsed results
     self.raw: List[GoogleItem] = [GoogleItem(i) for i in data.items()]
     self.index: int = index  # current page number
     self.page: int = len(pages)  # total number of pages
     self.pages: List[PyQuery] = pages  # page sources
Example #4
def getResTb():
    html = fileworker.getHTML()
    pq = PyQuery(html)
    result = dict()
    blocks = list()
    for i in pq.items('.row.result'):
        blocks.append(i)
Example #5
	def Parse(self, html_data):
		parser_result = ParserResult()
		product_list = parser_result.product_list
		doc = PyQuery(html_data)
		for productEl in doc.items(self._item_query):
			link_el = productEl.find(self._link_el_query)
			if not link_el:
				logging.info('Failed to find link element!')
				continue
			url = _GetLinkHref(link_el[0])
			if not url:
				logging.info('Failed to find href!')
				continue

			name_el = productEl.find(self._name_el_query)
			if not name_el:
				logging.info('Failed to find name element!')
				continue
			name = _GetText(name_el[0])

			image_el = productEl.find(self._image_el_query)
			if not image_el:
				logging.info('Failed to find image element!')
				continue
			image_url = _GetImageSrc(image_el[0])
			if not image_url:
				logging.info('Failed to find image src!')
				continue

			product = product_list.product.add()
			product.url = url.strip()
			product.name = name
			product.image_url = image_url.strip()
		return parser_result
Example #6
def extract_urls(html):  # URL extraction is CPU-bound, so there is no need to turn it into a coroutine
    '''Parse URLs out of the article and put them on the waiting queue'''
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")  #去掉已经爬取过的url,放入到等待爬取的列表
        if url and url.startswith("http") and url not in seen_urls:
            waitting_urls.put_nowait(url)
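
Several of the extract_urls variants in this listing rely on module-level state that the snippets never show. For this asyncio-queue flavour, a plausible setup might look like the sketch below (the names are taken from the snippet; the consumer loop is purely illustrative):

import asyncio

seen_urls = set()                # URLs that have already been crawled
waitting_urls = asyncio.Queue()  # URLs waiting to be crawled (spelling kept from the snippet)

async def consume():
    # Illustrative consumer: pull URLs off the queue and mark them as seen.
    while True:
        url = await waitting_urls.get()
        if url in seen_urls:
            continue
        seen_urls.add(url)
        # ...fetch the page here and feed the HTML back into extract_urls()...
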
Example #7
def FetchAndSave():
    # URL of the Baidu trending ("Fengyunbang") page, which lists 50 hot-news keywords
    fengyunbang_url = 'http://top.baidu.com/buzz?b=1'
    resp = requests.get(fengyunbang_url)
    resp.encoding='gb2312'

    # Create a CSV file to store the data.
    csvf = open('data.csv', 'a+', encoding='gbk', newline='')
    writer = csv.writer(csvf)
    writer.writerow(('news_content', 'keyword'))

    # Parse the keyword and link fields out of the HTML
    doc = PyQuery(resp.text)
    for item in doc.items('.keyword'):
        keyword = item('a').text().split(' ')[0]
        keyword_link = item('a').attr.href
        news_links = get_keywords_news_links(keyword_link)
        for news_link in news_links:
            try:
                content = get_news_content(news_link)
                if content:
                    print(keyword, content[0:20])
                    writer.writerow((content, keyword))
            except:
                print(news_link)
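
FetchAndSave() depends on two helpers that are not shown anywhere in this listing. The sketches below are guesses at their intent (fetch the keyword's result page, collect the news links on it, then pull the article text); the selectors are illustrative, not taken from the original project:

import requests
from pyquery import PyQuery

def get_keywords_news_links(keyword_link):
    """Fetch the keyword's Baidu result page and return the news links found on it."""
    resp = requests.get(keyword_link, timeout=10)
    doc = PyQuery(resp.text)
    return [a.attr('href') for a in doc.items('a')
            if a.attr('href') and a.attr('href').startswith('http')]

def get_news_content(news_link):
    """Fetch one news page and return its visible paragraph text."""
    resp = requests.get(news_link, timeout=10)
    resp.encoding = resp.apparent_encoding
    return PyQuery(resp.text)('p').text()
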
Example #9
def all_sub_div(partial):
    """Helper function that returns all the direct sub div of the given partial.

    Params:
        partial (str): an HTML partial to analyse it must be a div element.

    Returns:
        list(str): List of the content of all the direct `div` child elements
            in the partial.
            Empty list if the root element is not a div or if not direct div child exist.

    Examples:
        >>> all_sub_div('<div></div>')
        []

        >>> all_sub_div('<section></section>')
        []

        >>> all_sub_div('<div><div>toto</div><div>titi</div></div>')
        ['<div>toto</div>', '<div>titi</div>']

        >>> all_sub_div('<div><div>toto</div></div>')
        ['<div>toto</div>']

        >>> all_sub_div('<section><div>toto</div></section>')
        []
    """
    d = PyQuery(partial)

    if not d(":root").is_("div"):
        return []

    return [elem.outer_html() for elem in d.items(":root>div")]
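
The docstring above is doctest-ready, so its examples double as a small regression test. Assuming the function lives in a module named partials.py (a hypothetical name), they can be run like this:

import doctest

import partials  # hypothetical module that defines all_sub_div

doctest.testmod(partials, verbose=True)
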
Example #10
def extract_urls(html):
    """从返回的html中及解析出新的url"""
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            waitting_urls.append(url)
Example #11
def get_organization():
    url = 'https://summerofcode.withgoogle.com/archive/2017/organizations/'
    response = open_url(url)
    soup = PyQuery(response)
    soup = soup('.organization-card__link')
    for each in soup.items():
        yield 'https://summerofcode.withgoogle.com' + each.attr('href')
Example #12
File: 每日一字.py  Project: wifi-user/-
def video_download(url,title):
    response = requests.get(url,headers=headers).text
    doc = PyQuery(response)
    #<div class="RichText ztext Post-RichText __reader_view_article_wrap_786807993615199__"><p>知乎里我添加了一个新专栏,是关于硬笔行书的。</p><p><br>行书是介于楷书与草书之间的一种书体,它不像草书那样潦草,也不像楷书那样规矩板正。它就好比正在行走的人,既有动感,速度也不会很快。草书虽然潇洒,楷书虽然俊美,但都不利于日常书写与交流。行走的汉字才是最适合在日常生活中书写的。</p><p><br>这个系列每期会写一个单字,有单字静图、动图还有书写视频。</p><p><br>对本人硬笔行书感兴趣的书友可以关注。有问题的也欢迎在评论区与我交流。我也会加倍勤奋争取把这个系列做到日更,争取让更多硬笔爱好者每天都能学到新字。</p><hr><p><b>这是本系列第 192 次更新。</b></p><div><div class="RichText-video" data-za-detail-view-path-module="VideoItem" data-za-extra-module="{&quot;card&quot;:{&quot;content&quot;:{&quot;type&quot;:&quot;Video&quot;,&quot;sub_type&quot;:&quot;SelfHosted&quot;,&quot;video_id&quot;:&quot;1161414208179904512&quot;,&quot;is_playable&quot;:true}}}"><div class="VideoCard VideoCard--interactive"><div class="VideoCard-layout"><div class="VideoCard-video"><div class="VideoCard-video-content"><div class="VideoCard-player"><iframe frameborder="0" allowfullscreen="" src="https://www.zhihu.com/video/1161414208179904512?autoplay=false&amp;useMSE="></iframe></div></div></div><div class="VideoCard-content"><div class="VideoCard-title">硬笔行书每日一字 · 可</div></div></div><div class="VideoCard-mask"></div></div></div></div><figure data-size="normal"><noscript><img src="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_b.jpg" data-caption="" data-size="normal" data-rawwidth="2110" data-rawheight="2110" class="origin_image zh-lightbox-thumb" width="2110" data-original="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_r.jpg"/></noscript><img src="https://pic3.zhimg.com/80/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_hd.jpg" data-caption="" data-size="normal" data-rawwidth="2110" data-rawheight="2110" class="origin_image zh-lightbox-thumb lazy" width="2110" data-original="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_r.jpg" data-actualsrc="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_b.jpg" data-lazy-status="ok"></figure><figure data-size="normal"><noscript><img src="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.gif" data-caption="" data-size="normal" data-rawwidth="400" data-rawheight="400" data-thumbnail="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" class="content_image" width="400"/></noscript><div class="RichText-gifPlaceholder"><div class="GifPlayer" data-size="normal" data-za-detail-view-path-module="GifItem"><img class="ztext-gif" role="presentation" src="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" data-thumbnail="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" data-size="normal"><svg width="60" height="60" viewBox="0 0 60 60" class="GifPlayer-icon"><g fill="none" fill-rule="evenodd"><ellipse fill="#000" opacity="0.45" cx="30" cy="30" rx="30" ry="30"></ellipse><ellipse stroke="#FFF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4,1,4" cx="30" cy="30" rx="26" ry="26"></ellipse><svg x="16" y="18.5" class="GifPlayer-icon"><path d="M12.842 12.981V11.4H7.64v1.653h3.27v.272c-.018 1.881-1.442 3.147-3.516 3.147-2.382 0-3.876-1.846-3.876-4.834 0-2.936 1.485-4.79 3.832-4.79 1.732 0 2.936.835 3.428 2.364h1.977c-.43-2.566-2.522-4.201-5.405-4.201-3.55 0-5.845 2.601-5.845 6.644 0 4.096 2.268 6.654 5.863 6.654 3.322 0 5.475-2.083 5.475-5.327zM17.518 18V5.317H15.55V18h1.97zm5.142 0v-5.256h5.449v-1.74h-5.45V7.11h5.95V5.317h-7.918V18h1.969z" fill="#fff"></path></svg></g></svg></div></div></figure><p></p></div>V<div class="RichText ztext Post-RichText 
__reader_view_article_wrap_786807993615199__"><p>知乎里我添加了一个新专栏,是关于硬笔行书的。</p><p><br>行书是介于楷书与草书之间的一种书体,它不像草书那样潦草,也不像楷书那样规矩板正。它就好比正在行走的人,既有动感,速度也不会很快。草书虽然潇洒,楷书虽然俊美,但都不利于日常书写与交流。行走的汉字才是最适合在日常生活中书写的。</p><p><br>这个系列每期会写一个单字,有单字静图、动图还有书写视频。</p><p><br>对本人硬笔行书感兴趣的书友可以关注。有问题的也欢迎在评论区与我交流。我也会加倍勤奋争取把这个系列做到日更,争取让更多硬笔爱好者每天都能学到新字。</p><hr><p><b>这是本系列第 192 次更新。</b></p><div><div class="RichText-video" data-za-detail-view-path-module="VideoItem" data-za-extra-module="{&quot;card&quot;:{&quot;content&quot;:{&quot;type&quot;:&quot;Video&quot;,&quot;sub_type&quot;:&quot;SelfHosted&quot;,&quot;video_id&quot;:&quot;1161414208179904512&quot;,&quot;is_playable&quot;:true}}}"><div class="VideoCard VideoCard--interactive"><div class="VideoCard-layout"><div class="VideoCard-video"><div class="VideoCard-video-content"><div class="VideoCard-player"><iframe frameborder="0" allowfullscreen="" src="https://www.zhihu.com/video/1161414208179904512?autoplay=false&amp;useMSE="></iframe></div></div></div><div class="VideoCard-content"><div class="VideoCard-title">硬笔行书每日一字 · 可</div></div></div><div class="VideoCard-mask"></div></div></div></div><figure data-size="normal"><noscript><img src="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_b.jpg" data-caption="" data-size="normal" data-rawwidth="2110" data-rawheight="2110" class="origin_image zh-lightbox-thumb" width="2110" data-original="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_r.jpg"/></noscript><img src="https://pic3.zhimg.com/80/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_hd.jpg" data-caption="" data-size="normal" data-rawwidth="2110" data-rawheight="2110" class="origin_image zh-lightbox-thumb lazy" width="2110" data-original="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_r.jpg" data-actualsrc="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_b.jpg" data-lazy-status="ok"></figure><figure data-size="normal"><noscript><img src="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.gif" data-caption="" data-size="normal" data-rawwidth="400" data-rawheight="400" data-thumbnail="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" class="content_image" width="400"/></noscript><div class="RichText-gifPlaceholder"><div class="GifPlayer" data-size="normal" data-za-detail-view-path-module="GifItem"><img class="ztext-gif" role="presentation" src="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" data-thumbnail="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" data-size="normal"><svg width="60" height="60" viewBox="0 0 60 60" class="GifPlayer-icon"><g fill="none" fill-rule="evenodd"><ellipse fill="#000" opacity="0.45" cx="30" cy="30" rx="30" ry="30"></ellipse><ellipse stroke="#FFF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4,1,4" cx="30" cy="30" rx="26" ry="26"></ellipse><svg x="16" y="18.5" class="GifPlayer-icon"><path d="M12.842 12.981V11.4H7.64v1.653h3.27v.272c-.018 1.881-1.442 3.147-3.516 3.147-2.382 0-3.876-1.846-3.876-4.834 0-2.936 1.485-4.79 3.832-4.79 1.732 0 2.936.835 3.428 2.364h1.977c-.43-2.566-2.522-4.201-5.405-4.201-3.55 0-5.845 2.601-5.845 6.644 0 4.096 2.268 6.654 5.863 6.654 3.322 0 5.475-2.083 5.475-5.327zM17.518 18V5.317H15.55V18h1.97zm5.142 0v-5.256h5.449v-1.74h-5.45V7.11h5.95V5.317h-7.918V18h1.969z" fill="#fff"></path></svg></g></svg></div></div></figure><p></p></div>
    #<div class="Post-RichTextContainer"><div class="RichText ztext Post-RichText __reader_view_article_wrap_786807993615199__"><p>知乎里我添加了一个新专栏,是关于硬笔行书的。</p><p><br>行书是介于楷书与草书之间的一种书体,它不像草书那样潦草,也不像楷书那样规矩板正。它就好比正在行走的人,既有动感,速度也不会很快。草书虽然潇洒,楷书虽然俊美,但都不利于日常书写与交流。行走的汉字才是最适合在日常生活中书写的。</p><p><br>这个系列每期会写一个单字,有单字静图、动图还有书写视频。</p><p><br>对本人硬笔行书感兴趣的书友可以关注。有问题的也欢迎在评论区与我交流。我也会加倍勤奋争取把这个系列做到日更,争取让更多硬笔爱好者每天都能学到新字。</p><hr><p><b>这是本系列第 192 次更新。</b></p><div><div class="RichText-video" data-za-detail-view-path-module="VideoItem" data-za-extra-module="{&quot;card&quot;:{&quot;content&quot;:{&quot;type&quot;:&quot;Video&quot;,&quot;sub_type&quot;:&quot;SelfHosted&quot;,&quot;video_id&quot;:&quot;1161414208179904512&quot;,&quot;is_playable&quot;:true}}}"><div class="VideoCard VideoCard--interactive"><div class="VideoCard-layout"><div class="VideoCard-video"><div class="VideoCard-video-content"><div class="VideoCard-player"><iframe frameborder="0" allowfullscreen="" src="https://www.zhihu.com/video/1161414208179904512?autoplay=false&amp;useMSE="></iframe></div></div></div><div class="VideoCard-content"><div class="VideoCard-title">硬笔行书每日一字 · 可</div></div></div><div class="VideoCard-mask"></div></div></div></div><figure data-size="normal"><noscript><img src="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_b.jpg" data-caption="" data-size="normal" data-rawwidth="2110" data-rawheight="2110" class="origin_image zh-lightbox-thumb" width="2110" data-original="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_r.jpg"/></noscript><img src="https://pic3.zhimg.com/80/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_hd.jpg" data-caption="" data-size="normal" data-rawwidth="2110" data-rawheight="2110" class="origin_image zh-lightbox-thumb lazy" width="2110" data-original="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_r.jpg" data-actualsrc="https://pic3.zhimg.com/v2-0b5f6fb5b9905f4e89a3081b9c5c4ade_b.jpg" data-lazy-status="ok"></figure><figure data-size="normal"><noscript><img src="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.gif" data-caption="" data-size="normal" data-rawwidth="400" data-rawheight="400" data-thumbnail="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" class="content_image" width="400"/></noscript><div class="RichText-gifPlaceholder"><div class="GifPlayer" data-size="normal" data-za-detail-view-path-module="GifItem"><img class="ztext-gif" role="presentation" src="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" data-thumbnail="https://pic2.zhimg.com/v2-4c94b4f2c6ddadaecead094ee1042b09_b.jpg" data-size="normal"><svg width="60" height="60" viewBox="0 0 60 60" class="GifPlayer-icon"><g fill="none" fill-rule="evenodd"><ellipse fill="#000" opacity="0.45" cx="30" cy="30" rx="30" ry="30"></ellipse><ellipse stroke="#FFF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4,1,4" cx="30" cy="30" rx="26" ry="26"></ellipse><svg x="16" y="18.5" class="GifPlayer-icon"><path d="M12.842 12.981V11.4H7.64v1.653h3.27v.272c-.018 1.881-1.442 3.147-3.516 3.147-2.382 0-3.876-1.846-3.876-4.834 0-2.936 1.485-4.79 3.832-4.79 1.732 0 2.936.835 3.428 2.364h1.977c-.43-2.566-2.522-4.201-5.405-4.201-3.55 0-5.845 2.601-5.845 6.644 0 4.096 2.268 6.654 5.863 6.654 3.322 0 5.475-2.083 5.475-5.327zM17.518 18V5.317H15.55V18h1.97zm5.142 0v-5.256h5.449v-1.74h-5.45V7.11h5.95V5.317h-7.918V18h1.969z" fill="#fff"></path></svg></g></svg></div></div></figure><p></p></div></div>
    for item in doc.items('#root .Post-RichTextContainer div'):
        #<span class="z-ico-video"/>https://www.zhihu.com/video/1161414208179904512</span></span>
        #for url in item.items('.z-ico-video '):
        #<span class="z-ico-video"/>**************27****28******和等于55
        videourl = str(item.find('a span .z-ico-video'))[55:]
        #print(videourl)
        # build the URL of the JSON document that stores the video links
        videos = 'https://lens.zhihu.com/api/v4/videos/'+videourl
        resp = requests.get(videos,headers=headers).text
        urls = json.loads(resp)
        # pull the playable video URL out of the JSON
        videourl = urls['playlist']['LD']['play_url']
        print(videourl)
        videourl = videourl.replace('&amp;', '&').replace(' ', '')  # un-escape the HTML entity in the URL
        # download the video
        videofile = requests.get(videourl,headers=headers)
        with open(title+'.mp4','wb') as file:
            file.write(videofile.content)
        print('Finished downloading: ' + title + '.mp4')
    time.sleep(1)
Example #13
def gain_data():
    """

    :return:
    """
    for i in range(1, PAGE_NUM + 1):
        time.sleep(random.randint(5, 9))
        url = "https://jobs.51job.com/pachongkaifa/p" + str(i)
        r = requests.get(url, headers=headers)
        r.encoding = "gbk"
        content = PyQuery(r.text)(".detlist.gbox").children("div")
        for d in content.items():
            info = d(".info span").text().split(" ")
            data["Job"].append(d(".info .title").text())
            data["Company"].append(d(".info a").text().split(" ")[-1])
            data["Location"].append(info[-3])
            data["Salary"].append(info[-2])
            data["Date"].append(info[-1])
            order = d(".order").text().split("|")
            data["Education"].append(list_helper(order[0]))
            data["Experience"].append(list_helper(order[1]))
            data["Type"].append(list_helper(order[2]))
            data["Scope"].append(list_helper(order[3]))
            data["Detail"].append(d(".text").text().replace(" ", "", -1))
            data["Url"].append(d(".info span a").attr("href"))
    pandas.DataFrame(data).to_csv("../resource/qcwy/data.csv")
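
gain_data() also leans on several module-level names that are not part of the snippet. A plausible setup is sketched below (PAGE_NUM, the headers, and the list_helper body are assumptions):

import random
import time

import pandas
import requests
from pyquery import PyQuery

PAGE_NUM = 5  # number of listing pages to scrape (illustrative)
headers = {"User-Agent": "Mozilla/5.0"}
data = {key: [] for key in (
    "Job", "Company", "Location", "Salary", "Date",
    "Education", "Experience", "Type", "Scope", "Detail", "Url")}

def list_helper(value):
    # Assumption: normalise one scraped field; the exact behaviour is not shown in the original.
    return value.strip() if value else ""
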
Example #14
def extract_urls(html):
    # urls = []
    doc = PyQuery(html)
    for link in doc.items("a"):  # 获取a标签
        url = link.attr("href")  # a标签中的href属性
        if url and url.startswith("https") and url not in seen_urls:
            # urls.append(url)   # if the detail pages also contain URLs (the start page does), they could be collected here and crawled iteratively
            waitting_urls.append(url)  # queue the URL up for crawling
Example #15
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waiting_urls.append(url)
Example #16
def get_technologies(url):
    l = []
    response = open_url(url)
    soup = PyQuery(response)
    soup = soup('.organization__tag')
    for each in soup.items():
        l.append(each.text())
    return l
Example #17
def get_ingredients():
    pages = send_requests()
    ingredients = []
    for page in pages:
        pq = PyQuery(page)(".grid-view")
        ig = [i.attr("id") for i in pq.items("li")]
        ig = [i.replace("_", " ") for i in ig]
        ingredients.extend(ig)
    return ingredients
Example #18
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls
Example #19
def get_flight_ids(html_text):
    __pq = PyQuery(html_text).find("tbody").find("td.inputselect").find(
        "div.content").find("input")
    ids = []
    for __node in __pq.items():
        if __node.hasClass("radio-ajax"):
            __matched = regex("[A-Z]{2}.*[A-Z]", str(__node.attr("value")))
            ids.extend(__matched)
    return ids
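
The regex() call above is not a standard-library function; it reads like a thin wrapper around re.findall. A hypothetical definition consistent with how it is used (it must return a list that can be passed to ids.extend) might be:

import re

def regex(pattern, text):
    """Return every match of pattern in text (assumed helper, not from the original)."""
    return re.findall(pattern, text)
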
Example #20
 def get_movies(self, html):
     doc = PyQuery(html)
     for item in doc.items('.board-item-content'):
         # yield (instead of return) so that every movie on the page is emitted,
         # not just the first one
         yield {
             'name': item.find('.name').text(),
             'stars': item.find('.star').text().split(':')[1],
             'time': item.find('.releasetime').text().split(':')[1],
             'score': item.find('.score').text()
         }
Example #21
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls
Example #22
def extract_urls(html):  # extract all URLs from the html
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls
Example #23
def decode_html(html_text):
    '''Parse the movie data out of the page'''
    doc = PyQuery(html_text)
    for item in doc.items('.board-wrapper dd'):
        yield {
            'name': item.find('.name').text(),
            'actors': item.find('.star').text(),
            'time': item.find('.releasetime').text(),
            'score': item.find('.score').text()
        }
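
decode_html() is a generator, so its results are consumed by iterating over it. A minimal usage sketch (the markup string is a placeholder, not real Maoyan HTML):

sample = "<div class='board-wrapper'><dd><p class='name'>...</p></dd></div>"
for movie in decode_html(sample):
    print(movie['name'], movie['score'])
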
Example #24
def extrack_urls(html):
    urls = []
    if html:
        pq = PyQuery(html)
        for link in pq.items('a'):
            url = link.attr('href')
            # validate the URL and drop the ones that do not qualify
            if url and url.startswith('http') and url not in seen_urls:
                urls.append(url)
                waitting_urls.append(url)
        return urls
Example #25
def extract_url(html):
    urls = []
    pq = PyQuery(html)
    # collect all of the <a> links
    for link in pq.items("a"):
        # read the href attribute
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:  # skip duplicates and non-http URLs
            urls.append(url)
            waitting_url.append(url)
    return urls
Example #26
def extract_urls(html):
    print("extract_html", html)
    urls = []
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    print(waitting_urls)
    return urls
Example #27
async def main():
    loop = asyncio.get_running_loop()

    async with aiohttp.ClientSession() as session:
        html = await fetch(session, "http://www.biquge.cm/12/12097/")
        pq = PyQuery(html)
        for item in pq.items("dd a"):
            title = item.text()
            text = await get_text(session, item.attr("href"))
            # run the blocking legacy save() in an executor for compatibility
            await loop.run_in_executor(None, save, title, text)
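
The coroutine above assumes three helpers that the snippet does not define. The sketches below are guesses at their intent (fetch/get_text retrieve pages with aiohttp; save() is a plain blocking function, which is why it is pushed into an executor); only the names come from the original:

import aiohttp

async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    async with session.get(url) as resp:
        return await resp.text()

async def get_text(session: aiohttp.ClientSession, url: str) -> str:
    # The chapter body would be extracted from the page here; returning the raw
    # HTML keeps the sketch minimal.
    return await fetch(session, url)

def save(title: str, text: str) -> None:
    with open(title + ".txt", "w", encoding="utf-8") as f:
        f.write(text)
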
Example #28
async def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    # collect every URL on the page
    for link in pq.items('a'):
        url = link.attr('href')
        # filter out duplicate URLs
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls
Example #29
def getItemsByKeyword(
    keyword,
    encoding='GBK',
    page=1,
    price=None,
):
    if not PyQuery:
        return []
    items = []
    domain = \
        'http://s.taobao.com/search?q=$Q$&style=list&bcoffset=1&tab=all&cd=false&v=auction&sort=sale-desc&s={0}'
    domain = domain.replace('$Q$',
                            repr(keyword.decode(encoding).
                                 encode('GBK')).
                            replace('\\x', '%').upper()[1:-1])
    domain = domain.replace(' ', '%20')

    # price interval

    if price:
        domain = domain \
            + '&filter=reserve_price%5B{0}%2C{1}%5D'.format(price[0], price[1])

    # price interval end

    for i in range(page):
        url = domain.format(i * 40)
        r = __import__('requests').get(url)
        encoding = r.encoding
        py = PyQuery(r.content.decode(encoding))
        page_num = 0
        for i, meta_item in enumerate(py.items("div[nid]")):
            page_num += 1
            try:
                info = {
                    'rank': i + 1,
                    'keyword': keyword,
                    'itemName': meta_item('h3 a').attr('title').encode('utf-8'),
                    'itemId': meta_item.attr('nid'),
                    'itemPic': meta_item('img').attr('src'),
                    'wangwang': meta_item('.seller a').html().encode('utf-8').strip(),
                    'userNumId': re.findall(r'id\=(\d+)', meta_item('.seller a').attr('href'))[0],
                    'price': re.findall(r'\d+\.\d*', meta_item('.price').html().encode('utf-8'))[0],
                    'location': (meta_item('.loc div').html() or '').encode('utf-8'),
                    'tradeNum': re.findall(r'\d+', meta_item('.dealing div').html() or '0')[-1],
                    'rateNum': re.findall(r'\d+', (meta_item('.count a').html() or '0').encode('utf-8'))[0],
                }
                items.append(info)
            except:
                traceback.print_exc()
                # continue
        # if page_num < 40:
            # break
    return items
Example #30
 def _get_other_urls(self, data: PyQuery) -> None:
     for link in data.items():
         if link.attr("href") == "#":
             continue
         if link.text() == "SauceNao":
             self.saucenao_url = "https:" + link.attr("href")
         elif link.text() == "ascii2d.net":
             self.ascii2d_url = link.attr("href")
         elif link.text() == "Google Images":
             self.google_url = "https:" + link.attr("href")
         elif link.text() == "TinEye":
             self.tineye_url = "https:" + link.attr("href")
Example #31
def extract_links(source):
    '''
    Extract the links to the detail pages
    '''
    pq = PyQuery(source)
    for link in pq.items("a"):
        _url = link.attr("href")
        if _url and re.match(r'https://.*?/\d+\.html', _url) and _url.find(
                '{}.lianjia.com'.format(city)) != -1:
            links_detail.add(_url)

    print(links_detail)
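
extract_links() relies on two module-level names that are not shown; a plausible setup (the city value is illustrative):

import re

city = "bj"            # Lianjia city sub-domain, e.g. bj.lianjia.com
links_detail = set()   # de-duplicated detail-page links collected so far
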
Example #32
 def decode_html(html_text):
     # print(html_text)
     doc = PyQuery(html_text)
     # print(doc)
     for item in doc.items('.board-wrapper dd'):
         # print(item)
         yield {
             'name': item.find('.name').text(),
             'actors': item.find('.star').text(),
             'time': item.find('.releasetime').text(),
             'score': item.find('.score').text(),
         }
Example #33
    def get_contributive_info(self, session, text, i_d, data):
        # get the page-count information
        total_page_num = self.get_page_num(text, '#paging') + 1
        if total_page_num == 1:
            total_page_num += 1

        for page in xrange(1, total_page_num):
            try:
                url = 'http://{host}/business/QueryInvList.jspx?pno={page}&order=0&mainId={i_d}'.format(
                    host=self.host, page=page, i_d=i_d)

                r = self.filter_request(session, session.get, url)
                if r is None:
                    self.append_model(data,
                                      Model.contributive_info,
                                      url,
                                      '',
                                      status=self.STATUS_FAIL)
                    continue

                self.append_model(data, Model.contributive_info, url, r.text)

                index = 0
                tr_list = PyQuery(
                    r.text, parser='html').find('.detailsListGDCZ').find('tr')
                item_list = tr_list.items()
                for item in item_list:
                    index += 1
                    onclick = item.find('a').attr('onclick')
                    if onclick is None or onclick == '':
                        continue
                    invest_list = self.invest_search_obj.findall(
                        onclick.encode('utf-8'))
                    for invest_id in invest_list:
                        url = 'http://{host}/queryInvDetailAction.jspx?invId={i_d}'.format(
                            host=self.host, i_d=invest_id)
                        r = self.filter_request(session, session.get, url)
                        if r is None:
                            self.append_model(data,
                                              Model.contributive_info,
                                              url,
                                              '',
                                              status=self.STATUS_FAIL,
                                              classify=Model.type_detail)
                            continue

                        self.append_model(data,
                                          Model.contributive_info,
                                          url,
                                          r.text,
                                          classify=Model.type_detail)
            except Exception as e:
                self.log.exception(e)
Example #34
def unpack_events(html, args):
    query = PyQuery(html)
    logger.debug("Source encoding: {} ".format(query.encoding))
    rows = query.items('tr')
    offset = 0
    events = []
    for row in rows:
        parsed_row = __parse_row(row)
        events.append(parsed_row)
        offset += 1

    logger.debug("Parsed {} entries".format(offset))
    return events
Example #35
def process_html_file(fi):
    f = open(fi,'r')
    d = PyQuery(f.read())
    f.close()

    year = None
    links = {}
    for a in d.items('.toc a'):
        if len(a.text()) == 4:
            year = a.text()
            links[year] = []
        else:
            if year:
                links[year].append( (a.attr('href'), a.text()) )
            else:
                if 'NONE' not in links:
                    links['NONE'] = []
                links['NONE'].append( (a.attr('href'), a.text()) )
        logger.debug('Retrieved data %s %s' % (a.text(), a.attr('href')))

    data = []
    for year in links.keys():
        for link_id, link_name in links[year]:
            logger.info('Getting text at %s' % link_id)
            title = date = text = ''
            for x in d(link_id).parents('p').nextAll().items():
                logger.debug('X: %s' % x.outerHtml())
                if '<a' in x.outerHtml():
                    break
                elif 'End of the Project Gutenberg' in x.text():
                    break
                elif '<h2' in x.outerHtml():
                    title = x.text()
                elif '<h3' in x.outerHtml():
                    date = x.text()
                elif '<p' in x.outerHtml():
                    text += RE_CLEAN_TEXT.sub(' ', x.text().replace('\n', ' ').replace('&#13;', '').replace('\r', ' '))
                else:
                    logger.error('Unrecognized tag: %s' % x.outerHtml())

            if 'Gutenberg' in text:
                logger.error('%s\n%s' % (title,text))
            logger.debug('\nTitle: %s\nDate: %s\nText: %s' % (title, date, text))
            data.append((year, date, title, text))
    logger.info('Retrieved %d pieces' % len(data))
    return data
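
process_html_file() assumes two module-level objects that the snippet never defines. A plausible reconstruction (the exact pattern behind RE_CLEAN_TEXT is an assumption):

import logging
import re

logger = logging.getLogger(__name__)
RE_CLEAN_TEXT = re.compile(r'\s{2,}')  # assumption: collapse runs of whitespace in the extracted text
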
Example #36
async def Start():
	timestamp = time.time()

	parser = argparse.ArgumentParser(description="BioTC by Bioruebe (https://bioruebe.com), 2014-2019, Version 3.0.0, released under a BSD 3-clause style license.\n\nBioTC is a small application to simplify trading Steam Trading Cards with the SteamCardExchange bot by comparing the user's Steam inventory with the available cards on steamcardexchange.net")
	parser.add_argument("-n", "--name", action="store", type=str, default=None, help="Use specified Steam ID instead of reading it from " + STEAM_ID_FILE_NAME)
	parser.add_argument("-l", "--limit", action="store", type=int, default=-1, help="Stop searching after n sets have been found")
	args = parser.parse_args()

	parser.print_help()
	print("\n-----------------------------------------------------------------------------\n")

	if args.name is None:
		try:
			f = open(STEAM_ID_FILE_NAME)
			args.name = f.read()
		except:
			pass
	if args.name is None:
		sys.exit("Error: Could not read SteamID from file. Make sure the file '" + STEAM_ID_FILE_NAME + "' contains a valid SteamID.")

	result = {
		"sets": [],
		"steamID": args.name,
		"cardsCount": 0,
		"gameCount": 0,
		"completeSets": 0,
		"processingTime": 0,
		"time": 0
	}

	async with aiohttp.ClientSession() as session:
		print("Loading Steam inventory")
		url = "https://steamcommunity.com/id/" + args.name + "/inventory/json/753/6"
		raw_json = await fetch(session, url)
		cardData = json.loads(raw_json)
		# print(cardData)
		if cardData is None or not cardData["success"]:
			sys.exit("Invalid JSON data received. Aborting.")

		for key, card in cardData["rgDescriptions"].items():
			# Ignore emoticons, backgrounds
			if "Trading Card" not in card["type"]:
				# print(card["name"] + " is not a trading card.")
				continue
			# print(card)

			appid = card["market_fee_app"]
			try:
				game_cards = card_requests[appid]
				game_cards.append(card)
			except KeyError:
				card_requests[appid] = [card]

		i = 0
		result["gameCount"] = len(card_requests)
		for appid, inventory in card_requests.items():
			print("Processing " + appid)
			url = "https://www.steamcardexchange.net/index.php?inventorygame-appid-" + appid
			resp = await fetch(session, url)
			await asyncio.sleep(0.5)  # throttle politely without blocking the event loop
			dom = PyQuery(resp)
			game_name = dom("h2").text()
			card_items = dom.items(".inventory-game-card-item")
			card_set = Set(appid, game_name)
			# print(inventory)
			for item in card_items:
				card = Card(item.find(".card-name").text().strip())
				if card.name == "":
					# print("[Warning] Invalid card name: " + card.name)
					continue

				# available = item.find(".green, .orange")
				# if not available:
				# 	continue
				stock = filter_card_stock_value(item.find(".card-amount").text())
				card.bot_inventory = stock[0]
				if len(stock) > 1:
					card.bot_inventory_pending = stock[1]

				try:
					card.price = int("".join(filter(str.isdigit, item.find(".card-price").eq(1).text())))
					if card_set.standard_price < 1 and card.bot_inventory > 1:
						card_set.standard_price = card.price
				except ValueError:
					pass

				card.trade_url = item.find(".button-blue").attr("href")
				card.user_inventory = get_card_amount_in_inventory(cardData, inventory, card.name)
				card_set.cards.append(card)

			card_set.update_complete_sets()
			card_set.calculate_total_cost()
			card_set.set_progress_class()
			card_set.set_card_classes()
			card_set.cards.sort(key=lambda c: (c.user_inventory, 10 - c.bot_inventory))

			result["completeSets"] += card_set.complete_sets
			if card_set.user_inventory_is_empty():
				print("User has " + str(card_set.complete_sets) + " complete sets, but no surplus cards in inventory")
				continue

			if card_set.bot_inventory_is_empty():
				print("Bot has no unowned cards (at normal price) for this set")
				continue

			print(card_set)
			result["sets"].append(card_set)

			i += 1
			if args.limit > 0 and i >= args.limit:
				break

		env = Environment(loader=FileSystemLoader("."))
		template = env.get_template('template.html')

		result["cardCount"] = sum(len(list(filter(lambda c: c.user_inventory < 1, s.cards))) for s in result["sets"])
		result["processingTime"] = "{:.1f}".format(time.time() - timestamp)
		result["time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

		html = template.render(result)

		file = open("Cards.html", "w", encoding="utf-8")
		file.write(html)
		file.close()
		os.startfile("Cards.html")
Example #37
def getItemsByKeyword(
    keyword,
    encoding='utf-8',
    page=1,
    price=None,
    sort='sale-desc'
):
    """
    @params sort: renqi-desc/sale-desc/credit-desc/old_starts/price-asc/price-desc
    """
    if not PyQuery:
        return []
    items = []
    params = {
        'q': '$Q$',
        'style': 'grid',
        # 'bcoffset': 1,
        'tab': 'all',
        # 'cd': 'false',
        # 'v': 'auction',
        'sort': sort,
        's': '{0}',
        # 'p4poffset': '4',
        # 'bcoffset': '-4',
        'btab': 0
    }
    domain = 'http://s.taobao.com/search?' + \
        '&'.join(['%s=%s' % (k, v) for k, v in params.items()])
    domain = domain.replace('$Q$',
                            repr(keyword.decode(encoding).
                                 encode('GBK')).
                            replace('\\x', '%').upper()[1:-1])
    domain = domain.replace(' ', '%20')

    # price interval

    if price:
        domain = domain \
            + '&filter=reserve_price%5B{0}%2C{1}%5D'.format(price[0], price[1])

    # price interval end

    for i in range(page):
        url = domain.format(i * 40)
        r = requests.get(url, timeout=5, allow_redirects=True)
        # encoding = r.encoding
        content = eval(repr(r.content).replace('\\x86"', '"').replace('\\x90"', '"'))
        py = PyQuery(content.decode(r.encoding))
        page_num = 0
        for j, meta_item in enumerate(py.items("div[nid]")):
            page_num += 1
            try:
                info = {
                    'rank': i * 40 + j + 1,
                    'keyword': keyword.decode(encoding).encode('utf-8'),
                    'itemName': meta_item('h3 a').attr('title').encode('utf-8'),
                    'itemId': meta_item.attr('nid').encode('utf-8'),
                    'itemPic': meta_item('img').attr('src').encode('utf-8'),
                    'wangwang': meta_item('.seller a').html().encode('utf-8').strip(),
                    'userNumId': re.findall(r'id\=(\d+)', meta_item('.seller a').attr('href'))[0].encode('utf-8'),
                    'price': re.findall(r'\d+\.\d*', meta_item('.price').html().encode('utf-8'))[0],
                    'location': (meta_item('.loc div').html() or '').encode('utf-8'),
                    'tradeNum': re.findall(r'\d+', meta_item('.dealing div').html() or '0')[-1].encode('utf-8'),
                    'rateNum': re.findall(r'\d+', (meta_item('.count a').html() or '0').encode('utf-8'))[0],
                }
                items.append(info)
            except:
                traceback.print_exc()
                # continue
        # if page_num < 40:
            # break
    return items