# Self-teaching day 11 homework: HTML practice and email practice
import getpass  # for reading the password safely, without echoing it
import requests  # HTTP client
import yagmail  # simple SMTP mail client
from pyquery import PyQuery  # HTML parsing
from mymodule import stats_word as counts  # word-frequency functions from stats_word.py

r = requests.get('https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')  # fetch the WeChat public-account article
r.encoding = 'utf-8'  # decode the response text as UTF-8
response = r.text  # the returned document

document = PyQuery(response)  # parse the page
text = document('#js_content').text()

l1 = counts.stats_text_cn(text, 100)  # the top-100 most frequent words
stats_string_result = ''.join(str(i) for i in l1)  # render the statistics as a string
print("Top 100 high-frequency words in Zhang Xiaolong's speech:", stats_string_result)

sender = input('Sender email address: ')
password = getpass.getpass('Sender email password (paste is allowed): ')
recipients = input('Recipient email address: ')

yag = yagmail.SMTP(sender, password, 'smtp.139.com')  # user=sender, password=password, host='smtp.139.com'
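The stats_word module imported above is the author's own and is not shown. A minimal sketch of what its stats_text_cn helper could look like, assuming jieba for Chinese word segmentation; this is a guess at the shape of the function, not the author's confirmed implementation:

# stats_word.py -- hypothetical sketch of the word-frequency helper used above
from collections import Counter

import jieba  # assumed segmentation library, not confirmed by the original


def stats_text_cn(text, count):
    # keep only multi-character Chinese words; single characters are mostly noise
    words = [w for w in jieba.lcut(text)
             if len(w) > 1 and '\u4e00' <= w[0] <= '\u9fff']
    return Counter(words).most_common(count)  # list of (word, frequency) pairs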
import getpass
import json

import matplotlib.pyplot as plt
import numpy as np
import requests
import yagmail
from matplotlib.font_manager import FontProperties
from pyquery import PyQuery

import stats_word

# Fetch the page content
response = requests.get('https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA',
                        auth=('user', 'pass'))
document = PyQuery(response.text)
content = document('#js_content').text()  # extract the article text

count = 10  # caps how many elements are returned
try:
    text_list = stats_word.stats_text(content, count)  # count the Chinese words
    text_list2 = []  # holds the words
    text_list3 = []  # holds the word frequencies
    for i in text_list:
        text_list2.append(i[0])  # pull each word out of text_list
        text_list3.append(i[1])  # pull each count out of text_list
except ValueError as identifier:
    print('Please pass in a string:', identifier)

# the tail of the statistics is always noise such as a Twitter handle; drop it
del text_list2[-2:]
del text_list3[-2:]

# Convert the data for plotting
words = tuple(text_list2)  # as a tuple
y_pos = np.arange(len(words))  # one y-axis position per word
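The snippet stops after preparing words and y_pos. A minimal sketch of the horizontal bar chart this setup implies, with sample data and an assumed font path standing in for the real values:

# Hypothetical continuation: plot the word frequencies as a horizontal bar chart.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties

words = ('微信', '产品', '用户')  # sample data standing in for text_list2
freqs = (25, 18, 12)             # sample data standing in for text_list3
y_pos = np.arange(len(words))

# A CJK-capable font is needed for Chinese labels; this path is an assumption.
font = FontProperties(fname='/usr/share/fonts/truetype/wqy/wqy-microhei.ttc')

plt.barh(y_pos, freqs, align='center')
plt.yticks(y_pos, words, fontproperties=font)
plt.xlabel('Frequency')
plt.show()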
def detail_by_www(self, url):
    try:
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        # Off the shelf (page gone)
        if status_code == 404:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Any other error
        if status_code != 200:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation
        area = pqhtml('#main #product_container')
        pdata = self.get_pdata(pqhtml)
        roter, goodsData = self.get_goodsData(pqhtml)

        # Sold out
        if roter['soldOut']:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        detail = dict()

        # Brand
        detail['brand'] = pdata['sc_brandENG']

        # Name
        detail['name'] = area('.product-titles').text()

        # Currency, tied to pdata['sc_priceRMB']
        currency = 'CNY'
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices
        price, listPrice = self.get_all_price(roter)
        detail['price'] = price
        detail['listPrice'] = listPrice or price

        # Product ID
        productId = pdata['sc_prdSKU'] or roter['goods_id']
        detail['productId'] = productId

        # Color
        detail['color'] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = productId

        # Image gallery
        imgs = [
            a.attr('href')
            for a in pqhtml('.product-album-thumb .thumbnail a').items()
        ]
        detail['img'] = imgs[0]
        detail['imgs'] = imgs

        # Sizes
        detail['sizes'] = self.get_sizes(roter)

        # Description
        detail['descr'] = pqhtml('.product-attributes').text() + pqhtml(
            '.product_detail').text()

        # Details
        detail['detail'] = pqhtml('.product_detail').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Status
        detail['status'] = self.cfg.STATUS_SALE

        # Back link
        detail['backUrl'] = resp.url

        # Responding IP and port
        if resp.raw._original_response.peer:
            detail['ip_port'] = ':'.join(
                map(lambda x: str(x), resp.raw._original_response.peer))

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))
        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)

    except Exception:
        raise
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None):
    refreshCursor = ''

    results = []
    resultsAux = []
    cookieJar = http.cookiejar.CookieJar()

    active = True

    while active:
        json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                           cookieJar, proxy)
        if len(json['items_html'].strip()) == 0:
            break

        refreshCursor = json['min_position']

        scrapedTweets = PyQuery(json['items_html'])
        # Remove incomplete tweets withheld by Twitter Guidelines
        scrapedTweets.remove('div.withheld-tweet')
        tweets = scrapedTweets('div.js-stream-tweet')

        if len(tweets) == 0:
            break

        for tweetHTML in tweets:
            tweetPQ = PyQuery(tweetHTML)
            tweet = models.Tweet()

            img = tweetPQ("div.AdaptiveMedia-photoContainer").attr('data-image-url')
            usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
            txt = re.sub(
                r"\s+", " ",
                tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
            retweets = int(
                tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount")
                .attr("data-tweet-stat-count").replace(",", ""))
            favorites = int(
                tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount")
                .attr("data-tweet-stat-count").replace(",", ""))
            dateSec = int(
                tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
            id = tweetPQ.attr("data-tweet-id")
            permalink = tweetPQ.attr("data-permalink-path")
            user_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id"))

            geo = ''
            geoSpan = tweetPQ('span.Tweet-geo')
            if len(geoSpan) > 0:
                geo = geoSpan.attr('title')

            urls = []
            for link in tweetPQ("a"):
                try:
                    urls.append((link.attrib["data-expanded-url"]))
                except KeyError:
                    pass

            tweet.id = id
            tweet.permalink = 'https://twitter.com' + permalink
            tweet.username = usernameTweet
            tweet.text = txt
            tweet.date = datetime.datetime.fromtimestamp(dateSec)
            tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec).strftime(
                "%a %b %d %X +0000 %Y")
            tweet.retweets = retweets
            tweet.favorites = favorites
            tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
            tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
            tweet.geo = geo
            tweet.urls = ",".join(urls)
            tweet.author_id = user_id
            tweet.img = img

            results.append(tweet)
            resultsAux.append(tweet)

            if receiveBuffer and len(resultsAux) >= bufferLength:
                receiveBuffer(resultsAux)
                resultsAux = []

            if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
                active = False
                break

    if receiveBuffer and len(resultsAux) > 0:
        receiveBuffer(resultsAux)

    return results
def test_parse_document_expressions(html, result):
    assert parse_document_expressions(PyQuery(html)) == result
import urllib3
import pymysql
from pyquery import PyQuery

db_conn = pymysql.connect("localhost", "root", "lw1001", "IR_db", charset='utf8')
cursor = db_conn.cursor()
# cursor.execute('select * from books_cn')

url = 'http://book.km.com/shuku/165352.html'
jq = PyQuery(url)  # a string that looks like a URL makes PyQuery fetch the page itself

for i in range(jq('.commentTxtList')('dl')('dd').length):
    print('---------------')
    print(jq('.commentTxtList')('dl')('dd').eq(i))
def find_title(self, url=None):
    pq = PyQuery(url=url)
    return pq('title').text().replace(u'Volver a ver vídeos de', '')
from datetime import datetime

import pandas as pd
import requests
from pyquery import PyQuery


def getData(search_kw):
    base_url = ("http://www.104.com.tw/jobbank/joblist/auto_joblist.cfm"
                "?auto=1&jobsource=n104bank1&ro=0&keyword={skw}"
                "&order=1&asc=0&page={page}&psl=N_B")

    def parse_job(i, e):
        # Extract one job posting from a .j_cont element.
        return {
            "name": PyQuery(e)(".job_name").text(),
            "url": PyQuery(e)("a").attr("href"),
            "meta": dict(
                PyQuery(e)("meta").map(lambda ii, ee: (
                    PyQuery(ee).attr("itemprop"), PyQuery(ee).attr("content")))),
            "area": PyQuery(e)(".area_summary").text(),
            "company_name": PyQuery(e)(".compname_summary").text(),
            "company_meta": PyQuery(e)(".compname_summary span").attr("title"),
            "candidates_summary": PyQuery(e)(".candidates_summary").text(),
            "requirement": PyQuery(e)(".requirement").text(),
            "joblist_summary": PyQuery(e)(".joblist_summary").text(),
            "searched_keyword": search_kw,
            "crawledAt": datetime.utcnow(),
        }

    # The first page also tells us how many pages there are in total.
    res = requests.get(base_url.format(skw=search_kw.replace(" ", "+"), page=1))
    S = PyQuery(res.text)
    max_pages = int(PyQuery(S("#box_page_bottom_2 li > a")[-1]).text())

    data = []
    data.extend(S(".j_cont").map(parse_job))

    for page in range(2, max_pages + 1):
        res = requests.get(base_url.format(skw=search_kw.replace(" ", "+"), page=page))
        S = PyQuery(res.text)
        data.extend(S(".j_cont").map(parse_job))

    df = pd.DataFrame(data)
    return df
def ensure_pq(elm):
    if isinstance(elm, PyQuery):
        return elm
    else:
        return PyQuery(elm)
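A quick usage sketch for ensure_pq (the markup is illustrative): it returns PyQuery inputs unchanged and wraps anything else that PyQuery's constructor accepts, such as an HTML string or an lxml element.

from pyquery import PyQuery

html = '<div><p>hello</p></div>'
wrapped = ensure_pq(html)             # a raw HTML string gets wrapped
assert isinstance(wrapped, PyQuery)
already = PyQuery(html)
assert ensure_pq(already) is already  # an existing PyQuery passes through untouched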
wr.writerow([
    'URL', 'CIK', 'FileName', 'RecordType', 'securityTitle',
    'conversionOrExercisePrice', 'transactionDate', 'deemedExecutionDate',
    'transactionFormType', 'transactionCode', 'equitySwapInvolved',
    'transactionTimeliness', 'transactionShares', 'transactionTotalValue',
    'transactionPricePerShare', 'transactionAcquiredDisposedCode',
    'exerciseDate', 'expirationDate', 'underlyingSecurityTitle',
    'underlyingSecurityShares', 'underlyingSecurityValue',
    'sharesOwnedFollowingTransaction', 'valueOwnedFollowingTransaction',
    'directOrIndirectOwnership', 'natureOfOwnership'
])

for url in input_urls:
    print(url)
    r = requests.get(url).text
    pq = PyQuery(r)
    j = json.loads(
        json.dumps(
            xmltodict.parse(str(pq('XML').children()).replace('??>', '?>'))))
    try:
        nonDerivativeHolding = j.get('ownershipdocument', {}).get(
            'nonderivativetable', {}).get('nonderivativeholding')
    except Exception:
        pass
    try:
        derivativeHolding = j.get('ownershipdocument', {}).get(
            'derivativetable', {}).get('derivativeholding')
    except Exception:
        pass
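The xmltodict.parse round-trip above flattens the filing's XML into plain dicts keyed by tag name; a self-contained sketch of that conversion on a made-up document:

import json

import xmltodict

xml = '<ownershipDocument><issuer><issuerCik>0000320193</issuerCik></issuer></ownershipDocument>'
parsed = xmltodict.parse(xml)  # nested dicts keyed by tag name
print(json.dumps(parsed))
print(parsed['ownershipDocument']['issuer']['issuerCik'])  # -> 0000320193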
def detail(self, url):
    try:
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        # Off the shelf (page gone)
        if status_code == 404:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Any other error
        if status_code != 200:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Page-level error
        if len(pqhtml('.error_message')) >= 1:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get('SAKERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation
        area = pqhtml('#pdp-content-area')
        pdata = self.get_pdata(area)

        # Sold out
        if pdata['sold_out_message']['enabled'] or \
                pdata['intl_shipping_restriction']['enabled']:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        detail = dict()

        # Brand
        brand = pdata['brand_name']['label'] if pdata['brand_name']['enabled'] else ''
        detail['brand'] = brand

        # Name
        detail['name'] = pdata['short_description']

        # Currency
        currency = pdata['price']['list_price']['local_currency_code']
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices
        price, listPrice = self.get_all_price(pdata)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Colors
        color = dict([(clor['id'], clor['label'])
                      for clor in pdata['colors']['colors']])
        colorId = dict([(clor['id'], clor['id'])
                        for clor in pdata['colors']['colors']])
        detail['color'] = color
        detail['colorId'] = colorId

        # Image gallery
        imgs = self.get_imgs(pdata)
        detail['img'] = imgs[0] if isinstance(imgs, list) else dict(
            [(cid, Arr[0]) for cid, Arr in imgs.items()])
        detail['imgs'] = imgs

        # Keys
        detail['keys'] = list(color.keys())

        # Product ID
        productId = pdata['product_code']
        detail['productId'] = productId

        # Sizes
        detail['sizes'] = self.get_sizes(pdata)

        # Description
        detail['descr'] = PyQuery(pdata['description']).text()

        # Returns policy
        detail['returns'] = pdata['simple_shipping_statement']['message']

        # HTTP status code
        detail['status_code'] = status_code

        # Status
        detail['status'] = self.cfg.STATUS_SALE

        # Back link
        detail['backUrl'] = url

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))
        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)

    except Exception:
        raise
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None,
              debug=False):
    """Get tweets that match the tweetCriteria parameter. A static method.

    Parameters
    ----------
    tweetCriteria : tweetCriteria, an object that specifies a match criteria
    receiveBuffer : callable, a function that will be called upon getting the next
                    `bufferLength` tweets
    bufferLength : int, the number of tweets to pass to the `receiveBuffer` function
    proxy : str, a proxy server to use
    debug : bool, output debug information
    """
    results = []
    resultsAux = []
    cookieJar = http.cookiejar.CookieJar()
    user_agent = random.choice(TweetManager.user_agents)

    all_usernames = []
    usernames_per_batch = 20

    if hasattr(tweetCriteria, 'username'):
        if type(tweetCriteria.username) == str or not hasattr(
                tweetCriteria.username, '__iter__'):
            tweetCriteria.username = [tweetCriteria.username]

        usernames_ = [u.lstrip('@') for u in tweetCriteria.username if u]
        all_usernames = sorted({u.lower() for u in usernames_ if u})
        n_usernames = len(all_usernames)
        n_batches = n_usernames // usernames_per_batch + (
            n_usernames % usernames_per_batch > 0)
    else:
        n_batches = 1

    for batch in range(n_batches):  # process all_usernames in batches
        refreshCursor = ''
        batch_cnt_results = 0

        if all_usernames:  # a username in the criteria?
            tweetCriteria.username = all_usernames[
                batch * usernames_per_batch:
                batch * usernames_per_batch + usernames_per_batch]

        active = True
        while active:
            json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                               cookieJar, proxy, user_agent,
                                               debug=debug)
            if len(json['items_html'].strip()) == 0:
                break

            refreshCursor = json['min_position']
            scrapedTweets = PyQuery(json['items_html'])
            # Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()

                usernames = tweetPQ("span.username.u-dir b").text().split()
                tweet.username = usernames[0]
                tweet.to = usernames[1] if len(usernames) == 2 else None
                tweet.text = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) \
                    .replace('# ', '#').replace('@ ', '@').replace('$ ', '$')
                tweet.retweets = int(
                    tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount")
                    .attr("data-tweet-stat-count").replace(",", ""))
                tweet.favorites = int(
                    tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount")
                    .attr("data-tweet-stat-count").replace(",", ""))
                tweet.id = tweetPQ.attr("data-tweet-id")
                tweet.permalink = 'https://twitter.com' + tweetPQ.attr(
                    "data-permalink-path")
                tweet.author_id = int(
                    tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
                tweet.date = datetime.datetime.fromtimestamp(
                    dateSec, tz=datetime.timezone.utc)
                tweet.formatted_date = datetime.datetime.fromtimestamp(
                    dateSec, tz=datetime.timezone.utc).strftime("%a %b %d %X +0000 %Y")
                tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))

                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    tweet.geo = geoSpan.attr('title')
                else:
                    tweet.geo = ''

                urls = []
                for link in tweetPQ("a"):
                    try:
                        urls.append((link.attrib["data-expanded-url"]))
                    except KeyError:
                        pass
                tweet.urls = ",".join(urls)

                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                batch_cnt_results += 1
                if tweetCriteria.maxTweets > 0 and \
                        batch_cnt_results >= tweetCriteria.maxTweets:
                    active = False
                    break

    if receiveBuffer and len(resultsAux) > 0:
        receiveBuffer(resultsAux)
        resultsAux = []

    return results
def detail(self, url):
    try:
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        # Off the shelf (page gone)
        if status_code == 404:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        if status_code != 200:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation
        Jtxt = pqhtml('script').text()
        area = pqhtml('div.body-wrap>div#page-container section.product-area')
        varea = pqhtml(
            'div.body-wrap>div#page-container section.product-large-view-container')
        self.imgPrefix = re.search(r"productImagePrefix:\s*'(.*?)'", Jtxt,
                                   re.DOTALL).groups()[0]

        # Sold out
        if resp.url[resp.url.rfind('/'):] != url[url.rfind('/'):] or \
                pqhtml('.cat-button').hasClass('soldout'):
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        detail = dict()

        # Name
        detail['name'] = area('h1.product-title').text()

        # Brand
        detail['brand'] = self.get_brand(area)

        # Currency
        currency = area('meta[itemprop="priceCurrency"]').attr('content')
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Description
        detail['descr'] = area('div.product-info').text() + area(
            'div.product-more-details').text()

        # Product ID
        productId = area('.buying-area input.buy').attr('value') or area(
            '.buynow').attr('href').split('buy=')[-1]
        detail['productId'] = productId

        # Returns policy
        detail['returns'] = area('div.product-delivery-returns').text()

        # Image gallery
        imgsTmp = self.get_imgs(varea)

        # Color
        color = self.get_color(area)
        detail['color'] = color

        # Keys
        if isinstance(color, dict):
            detail['keys'] = list(color.keys())
            detail['colorId'] = dict([(Id, Id) for Id in color.keys()])
        else:
            detail['colorId'] = productId

        # Sizes, images and prices
        sizes, imgs, price, listPrice = self.get_info(area, varea, url)
        detail['img'] = imgs[0] if isinstance(imgs, list) else dict(
            [(cid, arr[0]) for cid, arr in imgs.items()])
        detail['imgs'] = imgs
        detail['sizes'] = sizes
        detail['price'] = price
        detail['listPrice'] = listPrice

        # HTTP status code
        detail['status_code'] = status_code

        # Status
        detail['status'] = self.cfg.STATUS_SALE

        # Back link
        detail['backUrl'] = url

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))
        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)

    except Exception:
        raise
def get_article():
    r = requests.get('https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')
    document = PyQuery(r.text)
    return document('#js_content').text()
def parse(self, response):
    item = EfinancialcareersItem()
    pq = PyQuery(response.body_as_unicode())

    item['source_post_id'] = response.url.split('.')[-1]
    try:
        item['url'] = response.url
    except Exception:
        item['url'] = ''
    try:
        item['posted_date'] = self.current_day
    except Exception:
        item['posted_date'] = ''
    try:
        item['job_title_raw'] = pq('[itemprop="title"]').text()
    except Exception:
        item['job_title_raw'] = ''

    job_category = ''
    for jc in pq('[class="breadcrumb"] li a'):
        job_category = jc.text
    try:
        item['job_category'] = job_category
    except Exception:
        item['job_category'] = ''
    try:
        item['job_type'] = pq('[itemprop="employmentType"]').text()
    except Exception:
        item['job_type'] = ''
    try:
        item['description'] = pq('.description').text()
    except Exception:
        item['description'] = ''
    try:
        item['company_raw'] = pq('.brandInfo h2').text()
    except Exception:
        item['company_raw'] = ''

    if 'our client is' in pq('[itemprop="description"].description').text().lower():
        item['company_ad_type'] = 'Recruitment agency'
    else:
        item['company_ad_type'] = 'Direct employer'

    try:
        item['country'] = 'SG'
    except Exception:
        item['country'] = ''
    try:
        item['location'] = pq('[itemprop="addressLocality"]').text()
    except Exception:
        item['location'] = ''

    if 'isJobExpired=true' not in response.url.split('.')[-1]:
        yield item
from __future__ import print_function

import re
import sys

from pyquery import PyQuery

filename = sys.argv[1]
print("Processing TOC for " + filename + "...")

with open(filename, 'r+') as f:
    text = f.read()

with open(filename, 'w') as f:
    f.write('{% extends "books/pfe/base.html" %}\n\n')

    pattern = re.compile(r"{% block toc %}(.*){% endblock %}", re.DOTALL)
    matches = re.findall(pattern, text)

    d = PyQuery(matches[0])
    # first ul gets 'right'
    d('ul').eq(0).addClass('right')
    # first li gets dropdown
    d('li').eq(0).addClass('has-dropdown')
    # second ul is dropdown
    d('ul').eq(1).addClass('dropdown')

    toc = "{% block toc %}\n" + str(d) + "\n{% endblock %}"
    newtext = re.sub(pattern, toc, text)
    f.write(newtext)
def login(self):
    # SSO login for installed apps to get an auth code
    user = test_data['g10f_user']
    client = test_data['g10f_oidc_web_client']
    headers = {"referer": client['redirect_uri']}
    response = self.client.get(
        self.get_authentication_uri(client),
        headers=headers,
        proxies=self.proxies,
        name=urlsplit(self.openid_configuration['authorization_endpoint']).path)
    pq = PyQuery(response.content)
    data = {
        "username": user['username'],
        "password": user['password'],
        "csrfmiddlewaretoken": pq("input[name='csrfmiddlewaretoken']").val(),
        "login_form_key": "login_form",
        "next": pq("input[name='next']").val()
    }
    path_url = response.request.path_url
    headers = {"referer": response.request.url}
    response = self.client.post(path_url,
                                data=data,
                                headers=headers,
                                proxies=self.proxies,
                                allow_redirects=False,
                                name=urlsplit(path_url).path)
    if response.next is None:
        print(response.text)
        return

    response = self.client.get(response.next.url,
                               headers=headers,
                               proxies=self.proxies,
                               allow_redirects=False,
                               name=urlsplit(response.next.url).path)
    query = parse_qs(urlsplit(response.next.url).query)

    # Log the user out: we now have an auth code to authenticate with
    self.client.get(self.openid_configuration['end_session_endpoint'],
                    proxies=self.proxies)

    headers = {
        'content-type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    client_id = client['client_id']
    client_secret = client['client_secret']
    data = {
        'grant_type': 'authorization_code',
        'code': query['code'][0],
        'redirect_uri': client['redirect_uri']
    }
    # if code_verifier:
    #     data['code_verifier'] = code_verifier
    if client_secret:
        auth = b"%s:%s" % (client_id.encode(), client_secret.encode())
        headers['authorization'] = '%s %s' % ('Basic',
                                              b64encode(auth).decode("ascii"))

    response = self.client.post(self.openid_configuration['token_endpoint'],
                                data=data,
                                headers=headers,
                                proxies=self.proxies)
    content = response.json()

    headers = {
        'accept': 'application/json',
        'authorization': '%s %s' % (content['token_type'],
                                    content['access_token'])
    }
    self.client.get(self.openid_configuration['userinfo_endpoint'],
                    headers=headers,
                    proxies=self.proxies)
def rastrea(self):
    equipo_local = Util.equipo_equivalente_local(
        self.html('#marcador .equipo1 b').attr('title'))
    equipo_visitante = Util.equipo_equivalente_local(
        self.html('#marcador .equipo2 b').attr('title'))
    self.modelo['local'] = equipo_local
    self.modelo['visitante'] = equipo_visitante

    goles_local = int(self.html('.resultado span').eq(0).text())
    goles_visitante = int(self.html('.resultado span').eq(1).text())
    fecha = self.html('.jor-date').attr('content').replace(
        'T', ' ').replace('+02:00', '').replace('+01:00', '')
    arbitro = self.html('.referee').eq(0).text()
    # arbitro = texto_arbitro[texto_arbitro.find(':')+1:].strip()
    arbitro_var = self.html('.referee').eq(4).text()
    # arbitro_var = texto_arbitro[texto_arbitro.find(':')+1:].strip()

    texto_asistencia = self.html('.as>span').text()
    if texto_asistencia != '':
        asistencia = int(
            texto_asistencia[texto_asistencia.find(':') + 1:]
            .strip().split(' ')[0].replace('.', ''))
    else:
        asistencia = 0

    print(fecha, "->", equipo_local, goles_local, "-",
          goles_visitante, equipo_visitante)
    """
    print(arbitro, '| VAR:', arbitro_var)
    print(asistencia, 'espectadores')
    """

    eventos = []
    cambio = {'tipo': 'cambio'}
    events = self.html('.evento>.event-content')
    for content in events:
        evento = PyQuery(content)
        imagen_jugador = evento('img.event-avatar').attr('src')
        url_jugador = evento('.name>a').attr('href')
        jugador = Util.get_id_jugador(imagen_jugador, url_jugador)
        minuto = int(
            evento('.minutos').text().replace('\'', '').replace('minuto', '').strip())

        if evento.find('.event_1'):
            tipo = 'gol'
        elif evento.find('.event_12'):
            tipo = 'gol_pp'
        elif evento.find('.event_11'):
            tipo = 'gol_penalti'
        elif evento.find('.event_8'):
            tipo = 'tarjeta_amarilla'
        elif evento.find('.event_9'):
            tipo = 'tarjeta_roja'
        elif evento.find('.event_6'):
            tipo = 'sale'
        elif evento.find('.event_7'):
            tipo = 'entra'
        else:
            continue

        if tipo not in ('sale', 'entra'):
            eventos.append({'tipo': tipo, 'jugador': jugador, 'minuto': minuto})
        elif tipo == 'entra':
            cambio['entra'] = jugador
            cambio['minuto'] = minuto
        elif tipo == 'sale':
            cambio['sale'] = jugador
            cambio['minuto'] = minuto

        if 'entra' in cambio.keys() and 'sale' in cambio.keys():
            eventos.append(cambio)
            cambio = {'tipo': 'cambio'}

    self.modelo['data'] = {
        'fecha': fecha,
        'goles_local': goles_local,
        'goles_visitante': goles_visitante,
        'arbitro': arbitro,
        'arbitro_var': arbitro_var,
        'asistencia': asistencia,
        'eventos': eventos
    }

    with open('log.json', 'w') as file:
        file.write(json.dumps(self.modelo, indent=4))
def _get_pyquery(self, raw_html):
    return PyQuery(raw_html.decode('cp932'))
def reply_my_friend(msg):
    response = requests.get(msg.url)
    document = PyQuery(response.text)
    contents = document('#js_content').text()
    result = str(stats_word.stats_text(contents, 100))
    msg.reply(result)
def extractEventDetailsFromURL(url):
    # Get plain text from HTML
    pq = PyQuery(url=url)
    textContent = pq('body').text()
    return extractEventDetails(textContent)
def parsed_xml(self):
    if 'Content-Type' not in self.headers:
        return None
    if not self.headers['Content-Type'].endswith('+xml'):
        return None
    return PyQuery(self.payload).remove_namespaces()
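remove_namespaces is what lets plain CSS selectors work on namespaced XML payloads like the one above; a small sketch with an illustrative document:

from pyquery import PyQuery

xml = '<root xmlns="http://example.com/ns"><item>hi</item></root>'
doc = PyQuery(xml, parser='xml')
print(len(doc('item')))                        # 0: the default namespace blocks the CSS match
print(doc.remove_namespaces()('item').text())  # 'hi': tags are plain names again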
def test_identify_document(document, result):
    root = PyQuery(document)
    identify_document(root)
    assert root.outer_html() == result
from flask import Flask
import requests
from pyquery import PyQuery

try:
    # old: import pandas.io.data as web
    import pandas_datareader.data as web
except Exception:
    module_import = 'import failed'

app = Flask(__name__)

from flaskext.mysql import MySQL

mysql = MySQL()
mysql.init_app(app)

resp_ssl = requests.get('https://www.sslproxies.org', verify=False).text
pq_ssl = PyQuery(resp_ssl)
proxy_list = []
for p in pq_ssl('table tr'):
    proxy_list.append(
        pq_ssl(p)('td:nth-child(1)').text() + ':' +
        pq_ssl(p)('td:nth-child(2)').text())

proxy_count = 0


def get_new_proxy():
    global proxy_count
    proxy_count = proxy_count + 1
    get_proxy = proxy_list[proxy_count]
    # print(get_proxy)
def test_parse_markup_expression(text, result):
    assert parse_markup_expression(PyQuery(text)) == result
import getpass

import requests
from IPython import embed
from pyquery import PyQuery
from wxpy import Bot, SHARING

import stats_word

bot = Bot(cache_path=True)

# The bot's own account
myself = bot.self
my_friend = bot.friends()
msg_history = bot.messages


@bot.register(my_friend, SHARING)
def get_msgurl(msg):
    print(msg.url)
    embed()
    # Use the shared article's link to fetch the page
    r = requests.get(msg.url)
    # Extract the body text
    d = PyQuery(r.text)
    c = d('#js_content').text().replace(",", "").replace("。", "").replace("\n", "")
    c1 = stats_word.stats_text_cn(c)
    # Convert to a string
    c2 = str(c1)
from pyquery import PyQuery

doc = PyQuery(filename='./movie.html')
print(doc('title'))
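Besides filename=, PyQuery's constructor also accepts a markup string or a url= keyword; a short sketch of the three forms (the file path and URL here are illustrative):

from pyquery import PyQuery

d1 = PyQuery('<p>inline markup</p>')      # parse a string of HTML
d2 = PyQuery(filename='./movie.html')     # parse a local file (illustrative path)
d3 = PyQuery(url='https://example.com/')  # fetch and parse a URL
print(d1('p').text(), d2('title'), d3('title').text())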
import io

import cfscrape
import requests
from PIL import Image
from pyquery import PyQuery
from pytesseract import image_to_string  # assuming pytesseract is the OCR used here

# HOST is defined elsewhere in the module.


def crack(md5, auto=True):
    scraper = cfscrape.create_scraper()
    response = scraper.get(HOST + '/md5-decrypter.aspx')

    # Save headers and cookies, to be used in the next request
    session = requests.session()
    session.headers = response.headers
    session.cookies = response.cookies

    query = PyQuery(response.content)
    image_path = query("#content1_imgCaptcha").attr("src")
    image_content = scraper.get(HOST + image_path).content

    # Try to solve the captcha
    captcha_image = Image.open(io.BytesIO(image_content))
    if auto:
        img = captcha_image.load()
        pix = captcha_image.size
        for x in range(pix[0]):
            for y in range(pix[1]):
                # threshold: dark pixels to solid black, the rest to transparent white
                if img[x, y][0] < 107 or img[x, y][1] < 4:
                    img[x, y] = (0, 0, 0, 255)
                if img[x, y][2] > 0:
                    img[x, y] = (255, 255, 255, 0)
        captcha = image_to_string(captcha_image)
        captcha = ''.join(filter(str.isalnum, captcha)).upper()
    else:
        captcha_image.show()
        captcha = input("[+] Input captcha: ")

    if len(captcha) != 6:
        return False

    scraper = cfscrape.create_scraper(sess=scraper)
    response = scraper.post(
        HOST + '/md5-decrypter.aspx',
        data={
            'ctl00$ScriptMan1': 'ctl00$content1$updDecrypt|ctl00$content1$btnSubmit',
            'ctl00$content1$txtInput': md5,
            'ctl00$content1$txtCaptcha': captcha,
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': query("#__VIEWSTATE").attr("value"),
            '__EVENTVALIDATION': query("#__EVENTVALIDATION").attr("value"),
            '__ASYNCPOST': 'true',
            'ctl00$content1$btnSubmit': 'Submit',
            query('#content1_pnlStatus input').attr('name'):
                query('#content1_pnlStatus input').attr('value')
        })

    response = PyQuery(response.content)
    status = response('#content1_lblStatus').text()
    result = response('#content1_lblResults .text-green').text()
    return status, result
def detail_by_hk(self, url):
    try:
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        # Off the shelf (page gone)
        if status_code == 404:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Any other error
        if status_code != 200:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation
        area = pqhtml('#Main')
        # Expected session cookies look like:
        # {'currency': '2', 'location': '3', 'T7_JSESSIONID': '308B6C0438C94F52A4048588D1E9D551', 'previous1': '10093665'}

        # Sold out ('缺货' means out of stock)
        if u'缺货' in area('.detail-size form font').text():
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        detail = dict()

        # Brand
        brand = area('.detail-info .right span[itemprop="brand"]').text()
        detail['brand'] = brand

        # Name
        detail['name'] = brand + ' ' + area(
            '.detail-info .right span[itemprop="name"]').text()

        # Currency; matches the value set in the cookie, the web1 domain serves HK
        currency = 'HKD'
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices
        price, listPrice = self.get_price_by_hk(area)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Product ID
        productId = self.get_prdId_by_hk(area)
        detail['productId'] = productId

        # Color
        detail['color'] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = productId

        # Image gallery
        imgs = [a.attr('src') for a in pqhtml('#big_pic img').items()]
        detail['img'] = imgs[0]
        detail['imgs'] = imgs

        # Sizes
        detail['sizes'] = [
            dict(name=self.cfg.DEFAULT_ONE_SIZE,
                 inventory=self.cfg.DEFAULT_STOCK_NUMBER,
                 sku=productId,
                 id=productId)
        ]

        # Description
        detail['descr'] = area('.detail-item').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Status
        detail['status'] = self.cfg.STATUS_SALE

        # Back link
        detail['backUrl'] = resp.url

        # Responding IP and port
        if resp.raw._original_response.peer:
            detail['ip_port'] = ':'.join(
                map(lambda x: str(x), resp.raw._original_response.peer))

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))
        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)

    except Exception:
        raise
def get_version_identifiers(node):
    pqi = PyQuery(node)
    return (pqi.attr('id_string'), pqi.attr('version'))