Example #1
# Self-teaching day 11 homework: email practice and HTML practice

import requests  # HTTP client

r = requests.get(
    'https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')  # fetch the article from the WeChat official account
r.encoding = 'utf-8'  # decode the response text as UTF-8
response = r.text  # the returned document

from pyquery import PyQuery  # HTML parser
document = PyQuery(response)
text = document('#js_content').text()

from mymodule import stats_word as counts  # import the stats_word module from mymodule

l1 = counts.stats_text_cn(text, 100)  # the 100 most frequent words

stats_string_result = ''.join(str(i) for i in l1)  # join the stats into a single string
print("Top 100 high-frequency words in Zhang Xiaolong's speech", stats_string_result)

import getpass  # read the password without echoing it to the screen
sender = input('Sender email address: ')
password = getpass.getpass('Sender email password (paste is fine): ')
recipients = input('Recipient email address: ')

import yagmail  # SMTP client for sending mail
yag = yagmail.SMTP(
    sender, password,
    'smtp.139.com')  # user, password, host
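
# A minimal sketch of the send step the snippet stops short of, assuming the
# word-frequency string built above is the message body; the subject line is
# illustrative, while yagmail's send(to, subject, contents) API is real.
yag.send(to=recipients,
         subject='Top 100 high-frequency words',
         contents=stats_string_result)
yag.close()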
Example #2
import stats_word
import json
import requests
from pyquery import PyQuery
import yagmail
import getpass
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties

# fetch the page content
response = requests.get('https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA',
                        auth=('user', 'pass'))
document = PyQuery(response.text)
content = document('#js_content').text()  # extract the text
count = 10  # how many items to output
text_list2 = []  # the words (initialized first so the del calls below cannot hit unbound names)
text_list3 = []  # the word frequencies
try:
    text_list = stats_word.stats_text(content, count)  # word-frequency stats for the text
    for i in text_list:
        text_list2.append(i[0])  # pull the words out of text_list
        text_list3.append(i[1])  # pull the counts out of text_list
except ValueError as identifier:
    print('Please pass in a string:', identifier)
del text_list2[-2:]  # the tail of the stats is always noise such as an English Twitter handle; drop it
del text_list3[-2:]

# data conversion
words = tuple(text_list2)  # convert to a tuple
y_pos = np.arange(len(words))  # one y position per word
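
# A sketch of the plotting step the snippet ends before, using the matplotlib
# and FontProperties imports above; the font file path is an assumption (any
# font that can render Chinese labels will do).
font = FontProperties(fname='simhei.ttf')
plt.barh(y_pos, text_list3, align='center')
plt.yticks(y_pos, words, fontproperties=font)
plt.xlabel('Frequency')
plt.title('Top words')
plt.show()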
Example #3
    def detail_by_www(self, url):
        try:
            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')
            # sold out (404)
            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # other errors
            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # preparation
            area = pqhtml('#main #product_container')
            pdata = self.get_pdata(pqhtml)
            roter, goodsData = self.get_goodsData(pqhtml)

            # print pqhtml.outerHtml()
            # print area.outerHtml()
            # exit()

            # sold out
            if roter['soldOut']:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            detail = dict()

            # brand
            detail['brand'] = pdata['sc_brandENG']

            # name
            detail['name'] = area('.product-titles').text()

            # currency, tied to pdata['sc_priceRMB']
            currency = 'CNY'
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # prices
            price, listPrice = self.get_all_price(roter)
            detail['price'] = price
            detail['listPrice'] = listPrice or price

            # product ID
            productId = pdata['sc_prdSKU'] or roter['goods_id']
            detail['productId'] = productId

            # color
            # color = self.get_color(area)
            detail['color'] = self.cfg.DEFAULT_ONE_COLOR
            detail['colorId'] = productId

            # image set
            imgs = [
                a.attr('href')
                for a in pqhtml('.product-album-thumb .thumbnail a').items()
            ]
            detail['img'] = imgs[0]
            detail['imgs'] = imgs

            # sizes
            detail['sizes'] = self.get_sizes(roter)

            # description
            detail['descr'] = pqhtml('.product-attributes').text() + pqhtml(
                '.product_detail').text()

            # details
            detail['detail'] = pqhtml('.product_detail').text()

            # HTTP status code
            detail['status_code'] = status_code

            # status
            detail['status'] = self.cfg.STATUS_SALE

            # back URL
            detail['backUrl'] = resp.url

            # responding IP and port
            if resp.raw._original_response.peer:
                detail['ip_port'] = ':'.join(
                    map(str, resp.raw._original_response.peer))

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception:
            raise
Example #4
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()

        active = True

        while active:
            response_json = TweetManager.getJsonReponse(
                tweetCriteria, refreshCursor, cookieJar, proxy)
            if len(response_json['items_html'].strip()) == 0:
                break
            refreshCursor = response_json['min_position']
            scrapedTweets = PyQuery(response_json['items_html'])
            #Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:

                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()

                img = tweetPQ("div.AdaptiveMedia-photoContainer").attr(
                    'data-image-url')
                usernameTweet = tweetPQ(
                    "span.username.js-action-profile-name b").text()
                txt = re.sub(
                    r"\s+", " ",
                    tweetPQ("p.js-tweet-text").text().replace('# ',
                                                              '#').replace(
                                                                  '@ ', '@'))
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                tweet_id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")
                user_id = int(
                    tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')
                urls = []
                for link in tweetPQ("a"):
                    try:
                        urls.append((link.attrib["data-expanded-url"]))
                    except KeyError:
                        pass
                tweet.id = tweet_id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet

                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.formatted_date = datetime.datetime.fromtimestamp(
                    dateSec).strftime("%a %b %d %X +0000 %Y")
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(
                    re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo
                tweet.urls = ",".join(urls)
                tweet.author_id = user_id
                tweet.img = img

                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results
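
    # A usage sketch of the buffering contract above: receiveBuffer is invoked
    # with each batch of bufferLength tweets, and any remainder is flushed at
    # the end. The TweetCriteria builder line is hypothetical and depends on
    # the surrounding library.
    #
    # def print_batch(tweets):
    #     for t in tweets:
    #         print(t.date, t.username, t.text)
    #
    # criteria = TweetCriteria().setQuerySearch('pyquery').setMaxTweets(250)
    # results = TweetManager.getTweets(criteria, receiveBuffer=print_batch,
    #                                  bufferLength=100)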
Example #5
def test_parse_document_expressions(html, result):
    assert parse_document_expressions(PyQuery(html)) == result
Example #6
import urllib3
from pyquery import PyQuery
import pymysql

db_conn = pymysql.connect(host="localhost", user="root", password="lw1001",
                          database="IR_db", charset="utf8")
cursor = db_conn.cursor()

#cursor.execute('select * from books_cn')

url = 'http://book.km.com/shuku/165352.html'

jq = PyQuery(url)

for i in range(jq('.commentTxtList')('dl')('dd').length):
    print('---------------')
    print(jq('.commentTxtList')('dl')('dd').eq(i))
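
# A sketch of feeding the scraped comments into the otherwise-unused cursor
# above; the comments_cn(text) table is hypothetical.
dd = jq('.commentTxtList')('dl')('dd')
for i in range(dd.length):
    cursor.execute("INSERT INTO comments_cn (text) VALUES (%s)",
                   (dd.eq(i).text(),))
db_conn.commit()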

Example #7
 def find_title(self, url=None):
     pq = PyQuery(url=url)
     return pq('title').text().replace(u'Volver a ver vídeos de', '')
Example #8
def getData(search_kw):
    url_tpl = ("http://www.104.com.tw/jobbank/joblist/auto_joblist.cfm"
               "?auto=1&jobsource=n104bank1&ro=0&keyword={skw}"
               "&order=1&asc=0&page={page}&psl=N_B")

    def parse_job(i, e):
        # map one .j_cont node to a flat record
        q = PyQuery(e)
        return {
            "name": q(".job_name").text(),
            "url": q("a").attr("href"),
            "meta": dict(
                q("meta").map(lambda ii, ee: (PyQuery(ee).attr("itemprop"),
                                              PyQuery(ee).attr("content")))),
            "area": q(".area_summary").text(),
            "company_name": q(".compname_summary").text(),
            "company_meta": q(".compname_summary span").attr("title"),
            "candidates_summary": q(".candidates_summary").text(),
            "requirement": q(".requirement").text(),
            "joblist_summary": q(".joblist_summary").text(),
            "searched_keyword": search_kw,
            "crawledAt": datetime.utcnow(),
        }

    skw = search_kw.replace(" ", "+")
    res = requests.get(url_tpl.format(skw=skw, page=1))
    S = PyQuery(res.text)
    max_pages = int(PyQuery(S("#box_page_bottom_2 li > a")[-1]).text())
    data = []
    data.extend(S(".j_cont").map(parse_job))

    for page in range(2, max_pages + 1):
        res = requests.get(url_tpl.format(skw=skw, page=page))
        S = PyQuery(res.text)
        data.extend(S(".j_cont").map(parse_job))

    df = pd.DataFrame(data)
    return df
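
# A quick usage sketch; the search keyword is illustrative.
df = getData("python")
print(df[["name", "company_name", "area"]].head())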
Example #9
def ensure_pq(elm):
    if isinstance(elm, PyQuery):
        return elm
    else:
        return PyQuery(elm)
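
# Usage is idempotent: strings and lxml elements get wrapped, while an
# existing PyQuery object is returned unchanged.
doc = PyQuery('<p>hi</p>')
assert ensure_pq(doc) is doc
assert isinstance(ensure_pq('<p>hi</p>'), PyQuery)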
Example #10
wr.writerow([
    'URL', 'CIK', 'FileName', 'RecordType', 'securityTitle',
    'conversionOrExercisePrice', 'transactionDate', 'deemedExecutionDate',
    'transactionFormType', 'transactionCode', 'equitySwapInvolved',
    'transactionTimeliness', 'transactionShares', 'transactionTotalValue',
    'transactionPricePerShare', 'transactionAcquiredDisposedCode',
    'exerciseDate', 'expirationDate', 'underlyingSecurityTitle',
    'underlyingSecurityShares', 'underlyingSecurityValue',
    'sharesOwnedFollowingTransaction', 'valueOwnedFollowingTransaction',
    'directOrIndirectOwnership', 'natureOfOwnership'
])

for url in input_urls:
    print(url)
    r = requests.get(url).text
    pq = PyQuery(r)

    # round-trip through json to turn xmltodict's OrderedDicts into plain dicts
    j = json.loads(
        json.dumps(
            xmltodict.parse(str(pq('XML').children()).replace('??>', '?>'))))
    #print (json.dumps(j))
    try:
        nonDerivativeHolding = j.get('ownershipdocument',
                                     {}).get('nonderivativetable',
                                             {}).get('nonderivativeholding')
    except AttributeError:  # empty tables are parsed as None
        pass

    try:
        derivativeHolding = j.get('ownershipdocument',
                                  {}).get('derivativetable',
                                          {}).get('derivativeholding')
    except AttributeError:
        pass
Example #11
    def detail(self, url):
        try:
            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')
            # sold out (404)
            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            # other errors
            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # error page
            if len(pqhtml('.error_message')) >= 1:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SAKERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            # preparation
            area = pqhtml('#pdp-content-area')
            pdata = self.get_pdata(area)

            # print json.dumps(pdata)
            # exit()

            # sold out
            if pdata['sold_out_message']['enabled'] or pdata[
                    'intl_shipping_restriction']['enabled']:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            detail = dict()

            # brand
            brand = pdata['brand_name']['label'] if pdata['brand_name'][
                'enabled'] else ''
            detail['brand'] = brand

            # name
            detail['name'] = pdata['short_description']

            # currency
            currency = pdata['price']['list_price']['local_currency_code']
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # prices
            price, listPrice = self.get_all_price(pdata)
            detail['price'] = price
            detail['listPrice'] = listPrice

            # colors
            color = dict([(clor['id'], clor['label'])
                          for clor in pdata['colors']['colors']])
            colorId = dict([(clor['id'], clor['id'])
                            for clor in pdata['colors']['colors']])
            detail['color'] = color
            detail['colorId'] = colorId

            # image set
            imgs = self.get_imgs(pdata)
            detail['img'] = imgs[0] if isinstance(imgs, list) else dict(
                [(cid, Arr[0]) for cid, Arr in imgs.items()])
            detail['imgs'] = imgs

            # color keys
            detail['keys'] = color.keys()

            # product ID
            productId = pdata['product_code']
            detail['productId'] = productId

            # sizes
            detail['sizes'] = self.get_sizes(pdata)

            # description
            detail['descr'] = PyQuery(pdata['description']).text()

            # returns policy
            detail['returns'] = pdata['simple_shipping_statement']['message']

            # HTTP status code
            detail['status_code'] = status_code

            # status
            detail['status'] = self.cfg.STATUS_SALE

            # back URL
            detail['backUrl'] = url

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception:
            raise
Example #12
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None,
                  debug=False):
        """Get tweets that match the tweetCriteria parameter
        A static method.

        Parameters
        ----------
        tweetCriteria : tweetCriteria, an object that specifies a match criteria
        receiveBuffer : callable, a function that will be called upon a getting next `bufferLength' tweets
        bufferLength: int, the number of tweets to pass to `receiveBuffer' function
        proxy: str, a proxy server to use
        debug: bool, output debug information
        """
        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()
        user_agent = random.choice(TweetManager.user_agents)

        all_usernames = []
        usernames_per_batch = 20

        if hasattr(tweetCriteria, 'username'):
            if type(tweetCriteria.username) == str or not hasattr(
                    tweetCriteria.username, '__iter__'):
                tweetCriteria.username = [tweetCriteria.username]

            usernames_ = [u.lstrip('@') for u in tweetCriteria.username if u]
            all_usernames = sorted({u.lower() for u in usernames_ if u})
            n_usernames = len(all_usernames)
            n_batches = n_usernames // usernames_per_batch + (
                n_usernames % usernames_per_batch > 0)
        else:
            n_batches = 1

        for batch in range(n_batches):  # process all_usernames by batches
            refreshCursor = ''
            batch_cnt_results = 0

            if all_usernames:  # a username in the criteria?
                tweetCriteria.username = all_usernames[
                    batch * usernames_per_batch:batch * usernames_per_batch +
                    usernames_per_batch]

            active = True
            while active:
                json = TweetManager.getJsonReponse(tweetCriteria,
                                                   refreshCursor,
                                                   cookieJar,
                                                   proxy,
                                                   user_agent,
                                                   debug=debug)
                if len(json['items_html'].strip()) == 0:
                    break

                refreshCursor = json['min_position']
                scrapedTweets = PyQuery(json['items_html'])
                #Remove incomplete tweets withheld by Twitter Guidelines
                scrapedTweets.remove('div.withheld-tweet')
                tweets = scrapedTweets('div.js-stream-tweet')

                if len(tweets) == 0:
                    break

                for tweetHTML in tweets:
                    tweetPQ = PyQuery(tweetHTML)
                    tweet = models.Tweet()

                    usernames = tweetPQ("span.username.u-dir b").text().split()
                    tweet.username = usernames[0]
                    tweet.to = usernames[1] if len(usernames) == 2 else None
                    tweet.text = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text())\
                        .replace('# ', '#').replace('@ ', '@').replace('$ ', '$')
                    tweet.retweets = int(
                        tweetPQ(
                            "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    tweet.favorites = int(
                        tweetPQ(
                            "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    tweet.id = tweetPQ.attr("data-tweet-id")
                    tweet.permalink = 'https://twitter.com' + tweetPQ.attr(
                        "data-permalink-path")
                    tweet.author_id = int(
                        tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                    dateSec = int(
                        tweetPQ("small.time span.js-short-timestamp").attr(
                            "data-time"))
                    tweet.date = datetime.datetime.fromtimestamp(
                        dateSec, tz=datetime.timezone.utc)
                    tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec, tz=datetime.timezone.utc)\
                                                            .strftime("%a %b %d %X +0000 %Y")
                    tweet.mentions = " ".join(
                        re.compile('(@\\w*)').findall(tweet.text))
                    tweet.hashtags = " ".join(
                        re.compile('(#\\w*)').findall(tweet.text))

                    geoSpan = tweetPQ('span.Tweet-geo')
                    if len(geoSpan) > 0:
                        tweet.geo = geoSpan.attr('title')
                    else:
                        tweet.geo = ''

                    urls = []
                    for link in tweetPQ("a"):
                        try:
                            urls.append((link.attrib["data-expanded-url"]))
                        except KeyError:
                            pass

                    tweet.urls = ",".join(urls)

                    results.append(tweet)
                    resultsAux.append(tweet)

                    if receiveBuffer and len(resultsAux) >= bufferLength:
                        receiveBuffer(resultsAux)
                        resultsAux = []

                    batch_cnt_results += 1
                    if tweetCriteria.maxTweets > 0 and batch_cnt_results >= tweetCriteria.maxTweets:
                        active = False
                        break

            if receiveBuffer and len(resultsAux) > 0:
                receiveBuffer(resultsAux)
                resultsAux = []

        return results
Example #13
File: thehut.py  Project: hellowac/drag
    def detail(self, url):
        try:

            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')

            # sold out (404)
            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # preparation
            Jtxt = pqhtml('script').text()
            area = pqhtml(
                'div.body-wrap>div#page-container section.product-area')
            varea = pqhtml(
                'div.body-wrap>div#page-container section.product-large-view-container'
            )
            self.imgPrefix = re.search(r"productImagePrefix:\s*'(.*?)'", Jtxt,
                                       re.DOTALL).groups()[0]

            # print area.outerHtml().encode('utf-8')
            # exit()

            # sold out (redirected, or marked with the soldout class)
            if resp.url[resp.url.rfind('/'):] != url[url.rfind(
                    '/'):] or pqhtml('.cat-button').hasClass('soldout'):

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            detail = dict()

            # name
            detail['name'] = area('h1.product-title').text()

            # brand
            detail['brand'] = self.get_brand(area)

            # currency
            currency = area('meta[itemprop="priceCurrency"]').attr('content')
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # description
            detail['descr'] = area('div.product-info').text() + area(
                'div.product-more-details').text()

            # product ID
            productId = area('.buying-area input.buy').attr('value') or area(
                '.buynow').attr('href').split('buy=')[-1]
            detail['productId'] = productId

            # returns policy
            detail['returns'] = area('div.product-delivery-returns').text()

            # image set
            imgsTmp = self.get_imgs(varea)

            # colors
            color = self.get_color(area)

            detail['color'] = color

            # color keys
            if isinstance(color, dict):
                detail['keys'] = color.keys()
                detail['colorId'] = dict([(Id, Id) for Id in color.keys()])
            else:
                detail['colorId'] = productId

            # sizes, images and prices
            sizes, imgs, price, listPrice = self.get_info(area, varea, url)

            detail['img'] = imgs[0] if isinstance(imgs, list) else dict(
                [(cid, arr[0]) for cid, arr in imgs.items()])
            detail['imgs'] = imgs
            detail['sizes'] = sizes
            detail['price'] = price
            detail['listPrice'] = listPrice

            # HTTP status code
            detail['status_code'] = status_code

            # status
            detail['status'] = self.cfg.STATUS_SALE

            # back URL
            detail['backUrl'] = url

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception:
            raise
Example #14
def get_article():
    r = requests.get('https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')
    document = PyQuery(r.text)
    return document('#js_content').text()
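
# A usage sketch pairing the article text with the stats_word helper that the
# other snippets in this collection use; assumes stats_word is importable, and
# the call shape follows Example #1.
text = get_article()
print(stats_word.stats_text_cn(text, 100))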
Example #15
    def parse(self, response):
        item = EfinancialcareersItem()

        pq = PyQuery(response.body_as_unicode())

        item['source_post_id'] = response.url.split('.')[-1]

        try:
            item['url'] = response.url
        except Exception:
            item['url'] = ''

        try:
            item['posted_date'] = self.current_day
        except Exception:
            item['posted_date'] = ''

        try:
            item['job_title_raw'] = pq('[itemprop="title"]').text()
        except Exception:
            item['job_title_raw'] = ''

        # the last breadcrumb link is the job category
        job_category = ''
        for jc in pq('[class="breadcrumb"] li a'):
            job_category = jc.text

        try:
            item['job_category'] = job_category
        except Exception:
            item['job_category'] = ''

        try:
            item['job_type'] = pq('[itemprop="employmentType"]').text()
        except Exception:
            item['job_type'] = ''

        try:
            item['description'] = pq('.description').text()
        except Exception:
            item['description'] = ''

        try:
            item['company_raw'] = pq('.brandInfo h2').text()
        except Exception:
            item['company_raw'] = ''

        if 'our client is' in pq('[itemprop="description"].description').text().lower():
            item['company_ad_type'] = 'Recruitment agency'
        else:
            item['company_ad_type'] = 'Direct employer'

        item['country'] = 'SG'

        try:
            item['location'] = pq('[itemprop="addressLocality"]').text()
        except Exception:
            item['location'] = ''

        if 'isJobExpired=true' not in response.url.split('.')[-1]:
            yield item
Example #16
from __future__ import print_function
import sys
from pyquery import PyQuery
import re

filename = sys.argv[1]

print("Processing TOC for " + filename + "...")

with open(filename, 'r') as f:
    text = f.read()

with open(filename, 'w') as f:   
    f.write('{% extends "books/pfe/base.html" %}\n\n')
    pattern = re.compile(r"{% block toc %}(.*){% endblock %}", re.DOTALL)
    matches = re.findall(pattern, text)
    d = PyQuery(matches[0])
    # first ul gets 'right'
    d('ul').eq(0).addClass('right')
    # first li gets dropdown
    d('li').eq(0).addClass('has-dropdown')
    # second ul is dropdown
    d('ul').eq(1).addClass('dropdown')

    toc = "{% block toc %}\n" + str(d) + "\n{% endblock %}"
    newtext = re.sub(pattern, toc, text)
    f.write(newtext)
Example #17
    def login(self):
        # SSO login for installed apps to get an auth code
        user = test_data['g10f_user']
        client = test_data['g10f_oidc_web_client']
        headers = {"referer": client['redirect_uri']}
        response = self.client.get(
            self.get_authentication_uri(client),
            headers=headers,
            proxies=self.proxies,
            name=urlsplit(
                self.openid_configuration['authorization_endpoint']).path)
        pq = PyQuery(response.content)
        data = {
            "username": user['username'],
            "password": user['password'],
            "csrfmiddlewaretoken":
            pq("input[name='csrfmiddlewaretoken']").val(),
            "login_form_key": "login_form",
            "next": pq("input[name='next']").val()
        }
        path_url = response.request.path_url
        headers = {"referer": response.request.url}
        response = self.client.post(path_url,
                                    data=data,
                                    headers=headers,
                                    proxies=self.proxies,
                                    allow_redirects=False,
                                    name=urlsplit(path_url).path)
        if response.next is None:
            print(response.text)
            return
        response = self.client.get(response.next.url,
                                   headers=headers,
                                   proxies=self.proxies,
                                   allow_redirects=False,
                                   name=urlsplit(response.next.url).path)
        query = parse_qs(urlsplit(response.next.url).query)

        # user logout, because we now have an auth code to authenticate with
        self.client.get(self.openid_configuration['end_session_endpoint'],
                        proxies=self.proxies)

        headers = {
            'content-type': 'application/x-www-form-urlencoded',
            'Accept': 'application/json'
        }
        client_id = client['client_id']
        client_secret = client['client_secret']
        data = {
            'grant_type': 'authorization_code',
            'code': query['code'][0],
            'redirect_uri': client['redirect_uri']
        }
        # if code_verifier:
        #    data['code_verifier'] = code_verifier
        if client_secret:
            auth = b"%s:%s" % (client_id.encode(), client_secret.encode())
            headers['authorization'] = '%s %s' % (
                'Basic', b64encode(auth).decode("ascii"))

        response = self.client.post(
            self.openid_configuration['token_endpoint'],
            data=data,
            headers=headers,
            proxies=self.proxies)

        content = response.json()
        headers = {
            'accept':
            'application/json',
            'authorization':
            '%s %s' % (content['token_type'], content['access_token'])
        }
        self.client.get(self.openid_configuration['userinfo_endpoint'],
                        headers=headers,
                        proxies=self.proxies)
Example #18
    def rastrea(self):
        equipo_local = Util.equipo_equivalente_local(
            self.html('#marcador .equipo1 b').attr('title'))
        equipo_visitante = Util.equipo_equivalente_local(
            self.html('#marcador .equipo2 b').attr('title'))

        self.modelo['local'] = equipo_local
        self.modelo['visitante'] = equipo_visitante

        goles_local = int(self.html('.resultado span').eq(0).text())
        goles_visitante = int(self.html('.resultado span').eq(1).text())

        fecha = self.html('.jor-date').attr('content').replace(
            'T', ' ').replace('+02:00', '').replace('+01:00', '')

        arbitro = self.html('.referee').eq(0).text()
        # arbitro = texto_arbitro[texto_arbitro.find(':')+1:].strip()

        arbitro_var = self.html('.referee').eq(4).text()
        # arbitro_var = texto_arbitro[texto_arbitro.find(':')+1:].strip()

        texto_asistencia = self.html('.as>span').text()
        if texto_asistencia != '':
            asistencia = int(
                texto_asistencia[texto_asistencia.find(':') +
                                 1:].strip().split(' ')[0].replace('.', ''))
        else:
            asistencia = 0

        print(fecha, "->", equipo_local, goles_local, "-", goles_visitante,
              equipo_visitante)
        """
        print(arbitro, '| VAR:', arbitro_var)
        print(asistencia, 'espectadores')
        """

        eventos = []
        cambio = {'tipo': 'cambio'}

        events = self.html('.evento>.event-content')
        for content in events:
            evento = PyQuery(content)

            imagen_jugador = evento('img.event-avatar').attr('src')
            url_jugador = evento('.name>a').attr('href')
            jugador = Util.get_id_jugador(imagen_jugador, url_jugador)

            minuto = int(
                evento('.minutos').text().replace('\'',
                                                  '').replace('minuto',
                                                              '').strip())

            if evento.find('.event_1'):
                tipo = 'gol'
            elif evento.find('.event_12'):
                tipo = 'gol_pp'
            elif evento.find('.event_11'):
                tipo = 'gol_penalti'
            elif evento.find('.event_8'):
                tipo = 'tarjeta_amarilla'
            elif evento.find('.event_9'):
                tipo = 'tarjeta_roja'
            elif evento.find('.event_6'):
                tipo = 'sale'
            elif evento.find('.event_7'):
                tipo = 'entra'
            else:
                continue

            if tipo not in ('sale', 'entra'):
                eventos.append({
                    'tipo': tipo,
                    'jugador': jugador,
                    'minuto': minuto
                })
            elif tipo == 'entra':
                cambio['entra'] = jugador
                cambio['minuto'] = minuto
            elif tipo == 'sale':
                cambio['sale'] = jugador
                cambio['minuto'] = minuto

            if 'entra' in cambio.keys() and 'sale' in cambio.keys():
                eventos.append(cambio)
                cambio = {'tipo': 'cambio'}

        self.modelo['data'] = {
            'fecha': fecha,
            'goles_local': goles_local,
            'goles_visitante': goles_visitante,
            'arbitro': arbitro,
            'arbitro_var': arbitro_var,
            'asistencia': asistencia,
            'eventos': eventos
        }

        with open('log.json', 'w') as file:
            file.write(json.dumps(self.modelo, indent=4))
Example #19
File: eagate.py  Project: 7kry/iidxdjdata
 def _get_pyquery(self, raw_html):
   return PyQuery(raw_html.decode('cp932'))
Example #20
def reply_my_friend(msg):
    response = requests.get(msg.url)
    document = PyQuery(response.text)
    contents = document('#js_content').text()
    result = str(stats_word.stats_text(contents, 100))
    msg.reply(result)
Example #21
def extractEventDetailsFromURL(url):
	# Get plain text from HTML
	pq = PyQuery(url=url)
	textContent = pq('body').text()
	return extractEventDetails(textContent)
Example #22
 def parsed_xml(self):
     if 'Content-Type' not in self.headers:
         return None
     if not self.headers['Content-Type'].endswith('+xml'):
         return None
     return PyQuery(self.payload).remove_namespaces()
Example #23
def test_identify_document(document, result):
    root = PyQuery(document)
    identify_document(root)
    assert root.outer_html() == result
Example #24
    #old# import pandas.io.data as web
    import pandas_datareader.data as web

except Exception:
    module_import = 'import failed'

app = Flask(__name__)

from flaskext.mysql import MySQL

mysql = MySQL()

mysql.init_app(app)

resp_ssl = requests.get('https://www.sslproxies.org', verify=False).text
pq_ssl = PyQuery(resp_ssl)

proxy_list = []
for p in pq_ssl('table tr'):
    proxy_list.append(
        pq_ssl(p)('td:nth-child(1)').text() + ':' +
        pq_ssl(p)('td:nth-child(2)').text())

proxy_count = 0


def get_new_proxy():
    global proxy_count
    proxy_count = proxy_count + 1
    get_proxy = proxy_list[proxy_count]
    #print get_proxy
Example #25
def test_parse_markup_expression(text, result):
    assert parse_markup_expression(PyQuery(text)) == result
Example #26
import stats_word
import getpass
import requests
from pyquery import PyQuery
from wxpy import Bot, SHARING  # WeChat bot; SHARING marks shared-link messages
from IPython import embed

bot = Bot(cache_path=True)

# the bot's own account
myself = bot.self

my_friend = bot.friends()
msg = bot.messages


@bot.register(my_friend, SHARING)
def get_msgurl(msg):
    print(msg.url)
    return msg.url  # return the link so the call below gets a value


embed()

# fetch the shared article via its link
r = requests.get(get_msgurl(msg))

# extract the body text
d = PyQuery(r.text)
c = d('#js_content').text().replace(",", "").replace("。", "").replace("\n", "")

c1 = stats_word.stats_text_cn(c)
# convert to a string
c2 = str(c1)
Example #27
from pyquery import PyQuery


doc = PyQuery(filename='./movie.html')
print(doc('title'))
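
# PyQuery accepts several sources; all three construction styles below appear
# across this collection (note that the url= form performs a network fetch).
doc_from_markup = PyQuery('<title>movie</title>')
doc_from_file = PyQuery(filename='./movie.html')
doc_from_url = PyQuery(url='http://example.com/')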
Example #28
def crack(md5, auto=True):
    scraper = cfscrape.create_scraper()
    response = scraper.get(HOST + '/md5-decrypter.aspx')

    # Save headers and cookies, to be used in next request
    session = requests.session()
    session.headers = response.headers
    session.cookies = response.cookies

    query = PyQuery(response.content)
    image_path = query("#content1_imgCaptcha").attr("src")
    image_content = scraper.get(HOST + image_path).content

    # Trying to decaptcha image
    captcha_image = Image.open(StringIO.StringIO(image_content))

    if auto:
        img = captcha_image.load()
        pix = captcha_image.size

        for x in xrange(pix[0]):
            for y in xrange(pix[1]):
                if img[x, y][0] < 107 or img[x, y][1] < 4:
                    img[x, y] = (0, 0, 0, 255)
                if img[x, y][2] > 0:
                    img[x, y] = (255, 255, 255, 0)

        captcha = image_to_string(captcha_image)
        captcha = filter(str.isalnum, captcha).upper()
    else:
        captcha_image.show()
        captcha = raw_input("[+] Input captcha: ")

    if len(captcha) != 6:
        return False

    scraper = cfscrape.create_scraper(sess=scraper)
    response = scraper.post(
        HOST + '/md5-decrypter.aspx',
        data={
            'ctl00$ScriptMan1':
            'ctl00$content1$updDecrypt|ctl00$content1$btnSubmit',
            'ctl00$content1$txtInput':
            md5,
            'ctl00$content1$txtCaptcha':
            captcha,
            '__EVENTTARGET':
            '',
            '__EVENTARGUMENT':
            '',
            '__VIEWSTATE':
            query("#__VIEWSTATE").attr("value"),
            '__EVENTVALIDATION':
            query("#__EVENTVALIDATION").attr("value"),
            '__ASYNCPOST':
            'true',
            'ctl00$content1$btnSubmit':
            'Submit',
            query('#content1_pnlStatus input').attr('name'):
            query('#content1_pnlStatus input').attr('value')
        })
    response = PyQuery(response.content)
    status = response('#content1_lblStatus').text()
    result = response('#content1_lblResults .text-green').text()

    return status, result
Example #29
    def detail_by_hk(self, url):
        try:
            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')
            # sold out (404)
            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # other errors
            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # preparation
            area = pqhtml('#Main')
            # pdata = self.get_pdata(pqhtml)
            # roter,goodsData = self.get_goodsData(pqhtml)

            # print area.outerHtml().encode('utf-8')
            # print self.session.cookies.get_dict()
            # {'currency': '2', 'location': '3', 'T7_JSESSIONID': '308B6C0438C94F52A4048588D1E9D551', 'previous1': '10093665'}
            # print area.outerHtml()
            # exit()

            # sold out ('缺货' in the page text means "out of stock")
            if u'缺货' in area('.detail-size form font').text():

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)
                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            detail = dict()

            # brand
            brand = area('.detail-info .right span[itemprop="brand"]').text()
            detail['brand'] = brand

            # name
            detail['name'] = brand + ' ' + area(
                '.detail-info .right span[itemprop="name"]').text()

            # currency; matches the value set in the cookie, and the web1 domain is HK
            currency = 'HKD'
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # prices
            price, listPrice = self.get_price_by_hk(area)
            detail['price'] = price
            detail['listPrice'] = listPrice

            # product ID
            productId = self.get_prdId_by_hk(area)
            detail['productId'] = productId

            # color
            # color = self.get_color(area)
            detail['color'] = self.cfg.DEFAULT_ONE_COLOR
            detail['colorId'] = productId

            # image set
            imgs = [a.attr('src') for a in pqhtml('#big_pic img').items()]
            detail['img'] = imgs[0]
            detail['imgs'] = imgs

            # sizes
            detail['sizes'] = [
                dict(name=self.cfg.DEFAULT_ONE_SIZE,
                     inventory=self.cfg.DEFAULT_STOCK_NUMBER,
                     sku=productId,
                     id=productId)
            ]

            # description
            detail['descr'] = area('.detail-item').text()

            # HTTP status code
            detail['status_code'] = status_code

            # status
            detail['status'] = self.cfg.STATUS_SALE

            # back URL
            detail['backUrl'] = resp.url

            # responding IP and port
            if resp.raw._original_response.peer:
                detail['ip_port'] = ':'.join(
                    map(str, resp.raw._original_response.peer))

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception:
            raise
Example #30
def get_version_identifiers(node):
    pqi = PyQuery(node)
    return (pqi.attr('id_string'), pqi.attr('version'))
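
# An illustrative call on a minimal element carrying both attributes;
# expected output: ('form_a', '3').
node = PyQuery('<instance id_string="form_a" version="3"/>')
print(get_version_identifiers(node))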