def testRemoteInfoDrilldownValues(self):
    header, body = getRequest(port=self.httpPort, path='/remote/info/drilldownvalues', arguments=dict(path='untokenized.field2', name='main'), parse=False)
    self.assertFalse('Traceback' in body, body)
    bodyLxml = HTML(body)
    self.assertEquals(
        set(['value1', 'value0', 'value9', 'value8', 'value7', 'value6', 'value5', 'value4', 'value3', 'othervalue2', 'value2']),
        set(bodyLxml.xpath('//ul/li/a/text()')))
import re

import requests
from lxml.etree import HTML

response = requests.get('http://www.debian.org/releases/stable/')
root = HTML(response.content)
title_text = root.find('head').find('title').text
release = re.search('\u201c(.*)\u201d', title_text).group(1)
p_text = root.xpath('//div[@id="content"]/p[1]')[0].text
version = p_text.split()[1]
print('Codename: {}\nVersion: {}'.format(release, version))
def parseData(urlList):
    urlW = open("/usr/caizhuang/jiajiemao/url.txt", 'a')
    for u in urlList:
        url = u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        try:
            dTxt = h.xpath('//h3')
            name = dTxt[0].text.strip().split()[0] + " " + dTxt[0].text.strip().split()[1]  # product name
            brand = dTxt[0].text.strip().split()[0]  # brand
        except Exception:
            errorTxt.write(url)
        # print brand
        # print name
        try:
            pCpgg = h.xpath('//p[@class="pCpgg"]')
            td = h.xpath('//td[@class="td2"]')
        except Exception:
            errorTxt.write(url)
        try:
            if td:
                price = list(td[0].itertext())[1].strip()
            else:
                price = list(pCpgg[0].itertext())[1].strip()  # price
            # print price
        except Exception:
            errorTxt.write(url)
        try:
            norms = list(pCpgg[-1].itertext())[1].strip()  # specification
            # print norms
        except Exception:
            errorTxt.write(url)
        try:
            spePs = h.xpath('//p[@class="speP"]/a')
            effect = ''
            for speP in spePs:
                effect += speP.text.strip() + " "  # effects
            # print effect
        except Exception:
            errorTxt.write(url)
        try:
            awrap = h.xpath('//div[@class="Awrap"]/ul/li/a')
            imgUrl = awrap[0].find("img").attrib.get("src")  # image URL
            # print imgUrl
        except Exception:
            errorTxt.write(url)
        try:
            troCon = h.xpath('//div[@class="troCon"]')
            des = list(troCon[0].itertext())
            description = ''
            for d in des:
                if len(d.strip()) > 20:
                    description += d.strip() + ""  # product description
            # print description
        except Exception:
            errorTxt.write(url)
        try:
            dTxt = h.xpath('//div[@class="dTxt"]/p/a')
            series = dTxt[1].text.strip()  # series
        except Exception:
            errorTxt.write(url)
        # print series
        insertData(name, brand, price, norms, effect, imgUrl, description, series)
import time

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from lxml.etree import HTML

START_URL = 'https://free-proxy-list.net/'
HEADERS = ['"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"']
PATH = 'ip_address/'

options = ChromeOptions()
options.add_argument(HEADERS[0])
browser = webdriver.Chrome(options=options)

if __name__ == '__main__':
    f = open(PATH + 'ip.csv', 'w')
    browser.get(START_URL)
    for i in range(15):
        content = browser.page_source
        html = HTML(content)
        ip_address = html.xpath('//tr//td[1]//text()')[:20]
        port = html.xpath('//tr//td[2]//text()')[:20]
        anonimity = html.xpath('//tr//td[5]//text()')[:20]
        https = html.xpath('//tr//td[@class="hx"]//text()')[:20]
        for i in range(len(port)):
            line = ','.join([ip_address[i], port[i], anonimity[i], https[i]])
            f.write(line)
            f.write('\n')
        time.sleep(2)
        button = browser.find_element_by_xpath('//*[@id="proxylisttable_next"]/a')
        button.click()
    f.close()
def start():
    num = 1
    for i in range(0, 10):
        print('当前页:' + str(i))
        pageToken = i * 25
        start_url = 'https://movie.douban.com/top250?start={pageToken}&filter='.format(pageToken=pageToken)
        print(start_url)
        response = requests.get(start_url, headers=headers)
        # print(response.text)
        html = HTML(response.text)
        urls = html.xpath('//ol[@class="grid_view"]/li//div[@class="hd"]/a/@href')
        for url in urls:
            print(url)
            movieId = re.search('https://movie.douban.com/subject/(\d+)/', url).group(1)
            response = requests.get(url, headers=headers)
            html = HTML(response.text)
            MovieNameStr = html.xpath('string(//h1/span/text())')
            MovieName = MovieNameStr.split(' ')[0].replace('\'', '"')
            EnglishName = ' '.join(MovieNameStr.split(' ')[1:]).replace('\'', '"')
            pattern_all_zh = r'([\u4e00-\u9fa5])'
            text_cn_split = re.findall(pattern_all_zh, EnglishName, re.S)
            if text_cn_split:
                EnglishName = ''
                MovieName = MovieNameStr.replace('\'', '"').strip()
            jsonStr = re.search('<script type="application.*?">(.*?)</script>', response.text, re.S).group(1).replace('\n', '').strip()
            # print(response.text)
            print(jsonStr)
            json_obj = json.loads(jsonStr)
            OtherName = ''
            OtherNameStr = re.search('又名:</span>(.*?)<br/>', response.text)
            if OtherNameStr:
                OtherName = OtherNameStr.group(1).strip().replace('\'', '"')
            DirectorList = []
            for dire in json_obj['director']:
                DirectorList.append(dire['name'])
            Director = '|'.join(DirectorList).replace('\'', '"')
            ActorsList = []
            for dire in json_obj['actor']:
                ActorsList.append(dire['name'])
            Actors = '|'.join(ActorsList).replace('\'', '"')
            Year = json_obj['datePublished']
            Country = re.search('制片国家/地区:</span>(.*?)<br/>', response.text).group(1).replace('\n', '').replace('\'', '"').strip()
            timeLong = re.search('片长:</span> <span property="v:runtime" content="(\d+)"', response.text).group(1).replace('\n', '').replace('\'', '"').strip()
            language = re.search('语言:</span>(.*?)<br/>', response.text).group(1).replace('\n', '').replace('\'', '"').strip()
            Grenre = '|'.join(json_obj['genre'])
            Rating = json_obj['aggregateRating']['ratingValue']
            RatingNum = json_obj['aggregateRating']['ratingCount']
            Description = json_obj['description']
            # print(movieId)
            # print(MovieName)
            # print(EnglishName)
            # print(Director)
            # print(Actors)
            # print(Year)
            # print(Country)
            # print(Grenre)
            # print(Rating)
            # print(RatingNum)
            # print(Description)
            sql = "insert into info(movieId,num,MovieName,EnglishName,OtherName,Director,Actors,Year,Country,Grenre,Rating,RatingNum,Description,timeLong,language) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') " % (
                movieId, num, MovieName, EnglishName, OtherName, Director, Actors, Year, Country, Grenre, Rating, RatingNum, Description, timeLong, language)
            num += 1
            print(sql)
            dbCli.save(sql)
def start():
    date_list = []
    with open('date.txt') as f:
        results = f.readlines()
        for res in results:
            date_list.append(res.strip())
    print(date_list)
    item_list = []
    with open('fujian_id1.txt') as f:
        results = f.readlines()
        for res in results:
            url = res.split(',')[0]
            title = res.split(',')[1].strip()
            obj = {
                'url': url,
                'title': title,
            }
            item_list.append(obj)
    print(len(item_list))
    print(item_list[:10])
    for itemObj in item_list:
        print(itemObj)
        try:
            for date in date_list:
                print('当前日期:' + date)
                start_url = itemObj['url']
                title = itemObj['title']
                body = '__VIEWSTATE=%2FwEPDwUJNDk2MTM2Mzc5ZGTd37nDAAZ8HMoQ9C6MjYnecXynQQ%3D%3D&__EVENTVALIDATION=%2FwEWAwKKm%2FSpBwKnpoOOCwKY7%2B%2FtCc29g5gXa%2BvZaoWCWvhGPER39rFI&right%24l_date={date}&right%24Button1=%CB%D1%CB%F7'
                data = body.format(date=date)
                try:
                    response = requests.post(start_url, data=data, headers=headers, timeout=10)
                except:
                    continue
                # print(response.text)
                html = HTML(response.text)
                tr_list = html.xpath('//form[@id="aspnetForm"]//div[@class="table3"]//tr')
                if len(tr_list) == 1:
                    print('无数据')
                for item in tr_list[1:]:
                    try:
                        td_list = item.xpath('./td')
                        if len(td_list) == 8:
                            jiancedianName = item.xpath('string(./td[1])')
                            jianceTime = item.xpath('string(./td[2])')
                            jianceProject = item.xpath('string(./td[3])')
                            jianceValue = item.xpath('string(./td[4])')
                            biaozhunValue = item.xpath('string(./td[5])')
                            shifoudabiao = item.xpath('string(./td[6])')
                            chaobiaobenshu = item.xpath('string(./td[7])')
                            shifoutingchan = item.xpath('string(./td[8])')
                        elif len(td_list) == 6:
                            jianceProject = item.xpath('string(./td[1])')
                            jianceValue = item.xpath('string(./td[2])')
                            biaozhunValue = item.xpath('string(./td[3])')
                            shifoudabiao = item.xpath('string(./td[4])')
                            chaobiaobenshu = item.xpath('string(./td[5])')
                            shifoutingchan = item.xpath('string(./td[6])')
                        # fields: company name, pollution source type (wastewater / waste gas), monitoring point name,
                        # monitoring time, monitored item, monitored value, standard value, whether compliant,
                        # exceedance multiple, whether production was halted
                        print(title, jiancedianName, jianceTime, jianceProject, jianceValue, biaozhunValue, shifoudabiao, chaobiaobenshu, shifoutingchan)
                        sql = "insert into fujian(title,jiancedianName,jianceTime,jianceProject,jianceValue,biaozhunValue,shifoudabiao,chaobiaobenshu,shifoutingchan)" \
                              " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                              % (title, jiancedianName, jianceTime, jianceProject, jianceValue, biaozhunValue, shifoudabiao, chaobiaobenshu, shifoutingchan)
                        dbclient.save(sql)
                    except:
                        continue
        except:
            continue
def start():
    # fetch page 1
    pageToken = 1
    start_url = 'https://forum.cyberctm.com/home.php?mod=space&uid=503430&do=thread&view=me&order=dateline&page=' + str(pageToken)
    if USE_PROXY:
        response = requests.get(start_url, headers=start_headers, proxies=proxies)
    else:
        response = requests.get(start_url, headers=start_headers)
    # print(response.text)
    html = HTML(response.text)
    url_list = html.xpath('//ul[@id="waterfall"]/li/div/h2/a/@href')
    title_list = html.xpath('//ul[@id="waterfall"]/li/div/h2/a/text()')
    num = 1
    item_list = []
    for url, title in zip(url_list, title_list):
        # request the detail page
        print(str(num) + '. ' + title)
        link = 'https://forum.cyberctm.com/' + url
        # print(link)
        obj = {'numkey': str(num), 'link': link, 'title': title}
        item_list.append(obj)
        num += 1
    # fetch page 2
    pageToken = 2
    start_url = 'https://forum.cyberctm.com/home.php?mod=space&uid=503430&do=thread&view=me&order=dateline&page=' + str(pageToken)
    if USE_PROXY:
        response = requests.get(start_url, headers=start_headers, proxies=proxies)
    else:
        response = requests.get(start_url, headers=start_headers)
    # print(response.text)
    html = HTML(response.text)
    url_list = html.xpath('//ul[@id="waterfall"]/li/div/h2/a/@href')
    title_list = html.xpath('//ul[@id="waterfall"]/li/div/h2/a/text()')
    for url, title in zip(url_list, title_list):
        # request the detail page
        print(str(num) + '. ' + title)
        link = 'https://forum.cyberctm.com/' + url
        # print(link)
        obj = {'numkey': str(num), 'link': link, 'title': title}
        item_list.append(obj)
        num += 1
    # start posting replies
    num_input_listStr = input('\n请输入要发布评论的帖子的序号:')
    sleepTime = input('\n请输入多少分钟后循环发布:')
    while True:
        num_input_list = num_input_listStr.split('.')
        with open('评论内容.txt') as f:
            mycomment = f.read().strip()
        print('当前评论内容是:' + mycomment)
        for num_input in num_input_list:
            for item in item_list:
                if item['numkey'] == num_input:
                    print('\n正在评论:' + str(item['numkey']))
                    try:
                        setRes = setComment(item['link'], mycomment)
                        if setRes:
                            pass
                        else:
                            setRes = setComment(item['link'], mycomment)
                    except:
                        print('未知错误')
                        break
        sleepTimeMin = 60 * int(sleepTime)
        print('\n当前时间:' + str(time.strftime('%Y-%m-%d %H:%M:%S')))
        print('等待下一轮:' + sleepTime + '分钟后重新启动。。。')
        time.sleep(sleepTimeMin)
import requests
from lxml.etree import HTML
import re

# site URL
url = 'https://www.whu.edu.cn/'
# send the request
response = requests.get(url)
# set the encoding
response.encoding = 'utf8'
# parse the response with lxml
html = HTML(response.text)
# pull the data out with xpath
lis = html.xpath('//a/@href')
titles = html.xpath('//a/text()')
for li, title in zip(lis, titles):
    # drop entries with no usable title
    title = title.strip()
    if title != '':
        # filter with a regular expression
        if re.match('http://news.*?|info', li):
            # only keep news content
            if li[:4] == 'info':
                link = 'https://www.whu.edu.cn/' + li
            else:
                link = li
            print(link, title)
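# Sketch (not part of the original snippet): the link handling above concatenates strings
# to build absolute URLs; urllib.parse.urljoin does the same job and also copes with
# relative paths. The URL and xpath mirror the example above; the filtering is simplified.
from urllib.parse import urljoin

import requests
from lxml.etree import HTML

base = 'https://www.whu.edu.cn/'
page = requests.get(base)
page.encoding = 'utf8'
doc = HTML(page.text)
for a in doc.xpath('//a[@href]'):
    title = (a.text or '').strip()
    if title:
        # urljoin leaves absolute links untouched and resolves relative ones against the base
        print(urljoin(base, a.get('href')), title)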
def parse_item(self, response):
    title = response.meta['title']
    describe = response.meta['describe']
    publishedDate = response.meta['publish']
    pic_url = response.meta['pic']
    app_name = '搜狐新闻'
    author = ''
    home_url = 'https://api.k.sohu.com/'
    crawlTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    publishedDate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(publishedDate) / 1000))
    category = '要闻'
    data = json.loads(response.body)
    content = data['content']
    selector = HTML(content)
    content = selector.xpath('//text()')
    content = ''.join(content)
    content = content.replace('\t', '').replace('\n', '').replace('\r', '')
    pic_more_url = data['photos']
    pic = []
    for i in range(len(pic_more_url)):
        pic.append(str(pic_more_url[i]['pic']))
    pic_more_url = pic
    print "app名称", app_name
    print "主图片url", pic_url
    print "子图片url", pic_more_url
    print "作者", author
    print "详情页地址", response.url
    print "所属类型", category
    print "标题", title
    print "描述", describe
    print "内容", content
    print "主url", home_url
    print "发布时间", publishedDate
    print "爬取时间", crawlTime
    self.count += 1
    url = response.url
    item = NewsItem()
    item['app_name'] = app_name
    item['pic_url'] = pic_url
    item['pic_more_url'] = pic_more_url
    item['author'] = author
    item['url'] = url
    item['category'] = category
    item['title'] = title
    item['describe'] = describe
    item['content'] = content
    item['home_url'] = home_url
    item['publishedDate'] = publishedDate
    item['crawlTime'] = crawlTime
    item['count'] = self.count
    timeArray = time.strptime(publishedDate, "%Y-%m-%d %H:%M:%S")
    timeStamp = int(time.mktime(timeArray))
    if timeStamp >= self.timeStamp:
        numappName = self.readjson()
        if len(numappName) == 0:
            items = {'title': title}
            with open('souhuxinwen.json', 'a+') as fp:
                line = json.dumps(dict(items), ensure_ascii=False) + '\n'
                fp.write(line)
            yield item
        else:
            for i in range(len(numappName)):
                if numappName[i]['title'] == item['title']:
                    return
                else:
                    items = {'title': item['title']}
                    with open('souhuxinwen.json', 'a+') as fp:
                        line = json.dumps(dict(items), ensure_ascii=False) + '\n'
                        fp.write(line)
                    yield item
def xpath(self, html, *tags, exclude=None):
    xhtml = HTML(html)
    exclude = '[not(name()={})]'.format(exclude) if exclude else ''
    LogControl.info("//" + "//".join(tags) + exclude) if self.debug else ''
    for item in xhtml.xpath("//" + "//".join(tags) + exclude):
        yield item
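# A minimal, hypothetical harness for the generator method above: the Scraper class,
# its debug flag, the stub LogControl logger, and the sample markup are stand-ins,
# not part of the original project. The method body is copied verbatim from the snippet.
from lxml.etree import HTML


class LogControl:
    @staticmethod
    def info(msg):
        print('[xpath]', msg)


class Scraper:
    debug = True

    def xpath(self, html, *tags, exclude=None):
        xhtml = HTML(html)
        exclude = '[not(name()={})]'.format(exclude) if exclude else ''
        LogControl.info("//" + "//".join(tags) + exclude) if self.debug else ''
        for item in xhtml.xpath("//" + "//".join(tags) + exclude):
            yield item


doc = '<div><span>a</span><span>b</span><script>x()</script></div>'
# tags are joined with '//', so ('div', '*') builds //div//*; exclude needs its own quotes
for el in Scraper().xpath(doc, 'div', '*', exclude='"script"'):
    print(el.tag, el.text)  # prints the two <span> elements; the <script> is filtered out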
def _get_total_drug_page(self, url):
    response = requests.get(url)
    sel = HTML(response.content)
    total_pages = int(sel.xpath('//span[@class="p-skip"]/em/b/text()')[0])
    return total_pages
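# Sketch of a paging loop driven by a total-page lookup like _get_total_drug_page above.
# LIST_URL and parse_page are hypothetical placeholders, not part of the original code.
import requests
from lxml.etree import HTML

LIST_URL = 'https://example.com/drugs?page={page}'


def get_total_pages(url):
    response = requests.get(url)
    sel = HTML(response.content)
    # assumes the pager shows the last page number as <span class="p-skip"><em><b>N</b></em></span>
    return int(sel.xpath('//span[@class="p-skip"]/em/b/text()')[0])


def crawl_all(parse_page):
    total = get_total_pages(LIST_URL.format(page=1))
    for page in range(1, total + 1):
        parse_page(requests.get(LIST_URL.format(page=page)).text)

# usage (needs a real listing page behind LIST_URL):
# crawl_all(lambda body: print(len(body)))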
def parse(obj):
    detail_url = 'https://forum.app.autohome.com.cn/forum_v9.8.0/forum/club/topiccontent-a2-pm2-t{id}-o0-p1-s20-c1-nt0-fs0-sp0-al0-cw360-i0-ct0-mid0-abX-isar1.json'
    start_url = detail_url.format(id=obj['id'])
    print(start_url)
    headers = {
        'User-Agent': "Android6.0 autohome9.8.5 Android",
        'sample': "0",
        'reqid': "863100032895926/1547540160089/346",
        'apisign': "2|863100032895926|autohomebrush|1547540157|11D6CEBC22A1C9729C5C683E24F5D6AE",
        'Host': "forum.app.autohome.com.cn",
        'Connection': "Keep-Alive",
        'Accept-Encoding': "gzip",
        'cache-control': "no-cache",
        'Postman-Token': "8ca13309-a73f-4acb-8414-921c52aa0665"
    }
    response = requests.get(start_url, headers=headers, verify=False)
    # print(response.text)
    responseRrStr = response.text.replace('<span class="hs_kw0_mainpl"></span>', ',') \
        .replace('<span class="hs_kw1_mainpl"></span>', '了') \
        .replace('<span class="hs_kw2_mainpl"></span>', '是') \
        .replace('<span class="hs_kw3_mainpl"></span>', '的') \
        .replace('<span class="hs_kw4_mainpl"></span>', '不') \
        .replace('<span class="hs_kw6_mainpl"></span>', '?') \
        .replace('<span class="hs_kw5_mainpl"></span>', '。')
    html = HTML(responseRrStr)
    id = obj['id']
    url = start_url
    if obj['topictype'] == '精':
        jinghua = '1'
    else:
        jinghua = '0'
    userName = obj['userName']
    title = obj['title']
    publishDate = obj['publishDate']
    replyCount = obj['replyCount']
    clickCount = html.xpath('string(//span[@class="view"])').replace('浏览', '')
    content = html.xpath('string(//div[@class="tz-paragraph"])')
    save_res = id + '||' + url + '||' + userName + '||' + title + '||' + jinghua + '||' + clickCount + '||' + replyCount + '||' + content + '||' + publishDate
    save_res = save_res.replace(',', ',').replace(' ', '').replace('\n', ' ').replace('\r', ' ').replace('||', ',').strip() + '\n'
    print(save_res)
    with open('post.csv', 'a', encoding='utf8', errors='ignore') as f:
        f.write(save_res)
    commentEtreeList = html.xpath('//ul[@class="post-flow"]/li')
    for eachEtree in commentEtreeList:
        try:
            commentStr = etree.tostring(eachEtree)
            comment_html = HTML(commentStr)
            comment_list = comment_html.xpath('//div[@class="yy_reply_cont"]//text()')
            commentContent = ''.join(comment_list)
            if commentContent == '':
                continue
            with open('comment.csv', 'a') as f:
                commentRes = id + ',' + commentContent.replace(',', ',').replace('\n', ' ').strip() + '\n'
                f.write(commentRes)
        except:
            continue
def date_xpath(self, resp, id):
    items = {}
    etre = HTML(resp)
    items["_id"] = id
    items["name"] = "".join(etre.xpath('//div[@class="zbxq_name"]/text()'))
    items["company"] = "".join(etre.xpath('//div[@class="zbxq_time"]/a[1]/text()'))
    items["userID"] = "".join(etre.xpath('//div[@class="zbxq_time"]/span//label/text()'))
    cont = etre.xpath('//div[@class="see"]/table')
    if len(cont) > 1:
        # certificate name / practice seal number / registered major / registration number / validity period
        trlt = etre.xpath('//div[@class="see"]/table[1]//tr')
        first = []
        k = 0
        v = 3
        for x in range(len(trlt) + 1):
            if v == x:
                y = trlt[k:v]
                first.append(y)
                k += 3
                v += 3
        for _ in range(len(first)):
            items[f"certificate_name_{str(_)}"] = "".join(first[_][0].xpath('.//td[2]//text()'))
            items[f"practice_seal_{str(_)}"] = "".join(first[_][0].xpath('.//td[4]//text()'))
            items[f"reg_major_{str(_)}"] = "".join(first[_][1].xpath('.//td[2]//text()'))
            items[f"reg_number_{str(_)}"] = "".join(first[_][1].xpath('.//td[4]//text()'))
            items[f"validity_time_{str(_)}"] = "".join(first[_][2].xpath('.//td[2]/span/span[2]/text()'))
        tr_2_lt = etre.xpath('//div[@class="see"]/table[2]//tr')
        second = []
        k = 0
        v = 2
        for x in range(len(tr_2_lt) + 1):
            if v == x:
                y = tr_2_lt[k:v]
                second.append(y)
                k += 2
                v += 2
        for _ in range(len(second)):
            items[f"certificate_category_{str(_)}"] = "".join(second[_][0].xpath('.//td[2]//text()'))
            items[f"certificate_num_{str(_)}"] = "".join(second[_][1].xpath('.//td[2]//text()'))
            items[f"cer_validity_time_{str(_)}"] = "".join(second[_][1].xpath('.//td[4]/span/span[2]/text()'))
        # certificate type / certificate number / certificate name / practice seal number /
        # registered major / registration number / validity period
    else:
        trlt = etre.xpath('//div[@class="see"]/table[1]//tr')
        first = []
        k = 0
        v = 3
        for x in range(len(trlt) + 1):
            if v == x:
                y = trlt[k:v]
                first.append(y)
                k += 3
                v += 3
        for _ in range(len(first)):
            items[f"certificate_name_{str(_)}"] = "".join(first[_][0].xpath('.//td[2]//text()'))
            items[f"practice_seal_{str(_)}"] = "".join(first[_][0].xpath('.//td[4]//text()'))
            items[f"reg_major_{str(_)}"] = "".join(first[_][1].xpath('.//td[2]//text()'))
            items[f"reg_number_{str(_)}"] = "".join(first[_][1].xpath('.//td[4]//text()'))
            items[f"validity_time_{str(_)}"] = "".join(first[_][2].xpath('.//td[2]/span/span[2]/text()'))
    JSK_date.save(items)
    self.log.info(f"数据{id}存入成功")
    return None


def get_num(x):
    return int(re.search('[0-9]+$', x).group(0))


if __name__ == '__main__':
    args = apr.parse_args()
    h = HTML(open(args.file).read().replace('<br>', ''))
    key_var = None
    for key in get_keys(h):
        print '[*] testing key:', key
        stream = ''
        txt = None
        for el in h.xpath('//*[@id or @ui or @di]'):
            if el.text:
                txt = decode_page(el.text, key)
                # print txt
            if not txt:
                continue
            if 'cryptKey' in txt:
                key_var = re.findall('var cryptKey = ([_a-z0-9]+(\[\s*[0-9]+\s*\])?),', txt, re.I)[0][0]
                key_var = re.sub('\s+', '', key_var)
                print '[+] found key_var', key_var
        #txt = method_3(stream,key)
        #print txt
def get_objects(keyword, pageToken):
    # kw = keyword + '%20site:news.163.com'
    kw = keyword
    SEARCH_URL = 'https://www.baidu.com/s?wd={kw}&pn={pageToken}&rn=10&oq={kw}'
    url = SEARCH_URL.format(kw=kw, pageToken=pageToken)
    print(url)
    try:
        search_response = requests.get(url, headers=headers, verify=False)
    except:
        return
    # print(search_response.text)
    html = HTML(search_response.text)
    # with open('aa.txt') as f:
    #     aa = f.read()
    # html = HTML(aa)
    div_list = html.xpath('//div[@id="content_left"]/div')
    rank = 1
    for div in div_list:
        if re.search('广告', etree.tostring(div, encoding='utf8').decode('utf8')):
            if re.search('class="EC_newppim', etree.tostring(div, encoding='utf8').decode('utf8')):
                eachItem_list = div.xpath('./div')
                for eachItem in eachItem_list:
                    deal(eachItem, rank)
                    rank += 1
        else:
            deal(div, rank)
            rank += 1
    # # get the real URLs
    # for site in site_list:
    #     t = site.xpath('h3/a')[0]
    #     link = t.get("href")
    #
    #     title = site.xpath('h3/a//text()')
    #     title = ''.join(title)
    #
    #     publishDateStr = site.xpath('string(div//span[@class=" newTimeFactor_before_abs m"])').replace('-','').strip()
    #
    #     save_res = title+'||'+link+'||'+publishDateStr
    #     save_res = save_res.replace('\n','').replace('\r','').replace(',',',').replace('||',',') + '\n'
    #     print(save_res)
    #     with open('结果.csv','a',encoding='gbk',errors='ignore') as f:
    #         f.write(save_res)
    page = html.xpath('//div[@id="page"]')
    if page:
        if u"下一页" in etree.tostring(page[0], encoding="utf-8", method="text").decode("utf-8"):
            pageToken = int(pageToken) + 10
        else:
            pageToken = False
    return pageToken
async def main():
    async with aiohttp.ClientSession() as session:
        mysql_cli = db.MysqlClient()
        item_list = []
        with open('zhihu_id.txt') as f:
            results = f.readlines()
            for res in results:
                id = res.split(',')[0]
                question = res.split(',')[1]
                obj = {
                    'id': id,
                    'question': question,
                }
                item_list.append(obj)
        for obj in item_list:
            print(obj['id'])
            url = 'https://www.zhihu.com/question/' + obj['id']
            print(url)
            response = await fetch(session, url)
            jsonStr = re.search('<script id="js-initialData".*?>(.*?)</script>', response).group(1)
            json_obj = json.loads(jsonStr)
            print(json.dumps(json_obj))
            for data in json_obj['initialState']['entities']['questions']:
                questionAuthor = json_obj['initialState']['entities']['questions'][data]['author']['name']
                questionAuthorId = json_obj['initialState']['entities']['questions'][data]['author']['urlToken']
                questionAuthor_hashId = json_obj['initialState']['entities']['questions'][data]['author']['id']
                save_re = questionAuthor.replace(',', ',') + ',' + questionAuthorId + ',' + questionAuthor_hashId + '\n'
                with open('author.txt', 'a') as f:
                    f.write(save_re)
            for data in json_obj['initialState']['entities']['answers']:
                question = obj['question']
                answerId = str(json_obj['initialState']['entities']['answers'][data]['id'])
                answer = json_obj['initialState']['entities']['answers'][data]['content']
                html = HTML(answer)
                content_list = html.xpath('//text()')
                answer = ''.join(content_list)
                answerAuthor = json_obj['initialState']['entities']['answers'][data]['author']['name']
                answerAuthorId = json_obj['initialState']['entities']['answers'][data]['author']['urlToken']
                answerAuthor_hashId = json_obj['initialState']['entities']['answers'][data]['author']['id']
                commentCount = str(json_obj['initialState']['entities']['answers'][data]['commentCount'])
                likeCount = str(json_obj['initialState']['entities']['answers'][data]['voteupCount'])
                # print(question)
                # print(answerId)
                # print(answer)
                # print(answerAuthor)
                # print(answerAuthorId)
                # print(answerAuthor_hashId)
                # print(commentCount)
                # print(likeCount)
                save_re = answerAuthor.replace(',', ',') + ',' + answerAuthorId + ',' + answerAuthor_hashId + '\n'
                with open('author.txt', 'a') as f:
                    f.write(save_re)
                sql = "insert into questionDetail(question,answerId,answer,answerAuthor,answerAuthorId,answerAuthor_hashId,commentCount,likeCount)" \
                      " VALUES ('%s', '%s', '%s','%s', '%s', '%s','%s', '%s')" \
                      % (question, answerId, answer, answerAuthor, answerAuthorId, answerAuthor_hashId, commentCount, likeCount)
                print(sql)
                mysql_cli.save(sql)
def oldhome(self, response):
    # if response.status==
    sel = scrapy.Selector(response)
    # passed down from the previous callback
    item = response.meta
    province = item['province']
    city = item['city']
    city_href = item['city_href']
    county = item['county']
    county_href = item['county_href']
    oldhome_href = item['oldhome_href']
    url = response.url
    item = HouseItem()  # item filled in by this callback
    item['province'] = province
    item['city'] = city
    item['city_href'] = city_href
    item['county'] = county
    item['county_href'] = county_href
    item['oldhome_href'] = oldhome_href
    item['date_before'] = self.date_before
    item['building'] = '二手房'
    item['ProgramStarttime'] = self.ProgramStarttime
    # the block that holds the residential-complex info
    detail_table = sel.xpath(".//div[@class='l-c']/div[@class='gary-detail pdd-5']/table[@class='ha_detail_table mt']")
    # the first link passed in sometimes returns an incomplete page
    # when re-fetching with lxml's HTML, extract() is unavailable, so the two branches cannot be merged
    if detail_table == []:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        urls = requests.get(url, headers=headers).text
        html = HTML(urls)
        # the part of this page that holds the complex info
        detail_table = html.xpath(".//div[@class='l-c']/div[@class='gary-detail pdd-5']/table[@class='ha_detail_table mt']")[0]
        # list of all complexes in this county/district
        detail = detail_table.xpath(".//tr[@height='25px;']")
        # extract each complex
        for d in detail:
            # complex name
            item['house'] = d.xpath(".//a[@class='c_blue']/text()")[0]
            # last month's price (average unit price)
            item['price'] = d.xpath(".//td[4]/span/text()")[0]
            # month-on-month change
            rate = d.xpath(".//td[5]/span/text()")[0]
            if '--' not in rate:
                if rate[0] == '-':
                    item['rate_m_unit'] = '下降'
                    item['rate_m'] = rate[1:]
                elif rate[0] == '+':
                    item['rate_m_unit'] = '上升'
                    item['rate_m'] = rate[1:]
                else:
                    item['rate_m_unit'] = None
                    item['rate_m'] = rate
            else:
                item['rate_m_unit'] = None
                item['rate_m'] = rate
            yield item
    else:
        # list of all complexes in this county/district
        detail = detail_table.xpath(".//tr[@height='25px;']")
        # extract each complex
        for d in detail:
            # complex name
            item['house'] = d.xpath(".//a[@class='c_blue']/text()").extract()[0]
            # last month's price (average unit price)
            item['price'] = d.xpath(".//td[4]/span/text()").extract()[0]
            # month-on-month change
            rate = d.xpath(".//td[5]/span/text()").extract()[0]
            if '--' not in rate:
                if rate[0] == '-':
                    item['rate_m_unit'] = '下降'
                    item['rate_m'] = rate[1:]
                elif rate[0] == '+':
                    item['rate_m_unit'] = '上升'
                    item['rate_m'] = rate[1:]
                else:
                    item['rate_m_unit'] = None
                    item['rate_m'] = rate
            else:
                item['rate_m_unit'] = None
                item['rate_m'] = rate
            yield item
def _parse_detail(self, li):
    item = dict()
    item['_id'] = li.xpath('./div[@class="div05"]/h2/a/@rjs8').pop()
    item['url'] = urljoin(self.base_url, li.xpath('./div[@class="div05"]/h2/a[1]/@href').pop())
    item['title'] = li.xpath('./div[@class="div05"]/h2/a[1]/@title').pop()
    response = self._get_response(item['url'])
    text = self._get_text(response)
    detail_html = HTML(text)
    # regulation document number
    item['fgwh'] = detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[2]/p/text()') if len(
        detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[2]/p/text()')) > 0 else None
    # publication date
    item['fbrq'] = detail_html.xpath('//p[@id="tdat"]/text()') if len(
        detail_html.xpath('//p[@id="tdat"]/text()')) > 0 else None
    # effective date
    item['ssrq'] = detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[4]/p/text()') if len(
        detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[4]/p/text()')) > 0 else None
    # issuing department
    item['fbbm'] = detail_html.xpath('//p[@id="tdpt"]/text()') if len(
        detail_html.xpath('//p[@id="tdpt"]/text()')) > 0 else None
    # level of legal effect
    item['xldj'] = detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[6]/p/text()') if len(
        detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[6]/p/text()')) > 0 else None
    # main text
    item['maintext'] = detail_html.xpath('//div[@id="maintext"]/text()') if len(
        detail_html.xpath('//div[@id="maintext"]/text()')) > 0 else None
    return item
def newhome(self, response):
    # passed down from the previous callback
    sel = scrapy.Selector(response)
    item = response.meta
    province = item['province']
    city = item['city']
    city_href = item['city_href']
    county = item['county']
    # intermediate variable (not returned in the yielded item)
    cpage = item['cpage']  # current page
    county_href = item['county_href']
    newhome_href = item['newhome_href']
    # intermediate variable (not returned in the yielded item)
    newhome_fweb = item['newhome_fweb']  # first-page link (used later to build pagination links)
    url = response.url
    item = HouseItem()  # item filled in by this callback
    item['province'] = province
    item['city'] = city
    item['city_href'] = city_href
    item['county'] = county
    item['county_href'] = county_href
    item['newhome_href'] = newhome_href
    item['building'] = '新楼盘'
    item['date_before'] = self.date_before
    item['ProgramStarttime'] = self.ProgramStarttime
    boxs = sel.xpath(".//div[@id='content']/div[@class='halistbox']")
    # when re-fetching with lxml's HTML, extract() is unavailable, so the two branches cannot be merged
    if boxs == []:  # the page sometimes comes back incomplete
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        urls = requests.get(url, headers=headers).text
        html = HTML(urls)
        # the part of this page that holds all the complex info
        boxs = html.xpath(".//div[@id='content']/div[@class='halistbox']")[0]
        # list of complex entries
        box = boxs.xpath(".//div[@class='halist clearfix']")
        # each complex
        for b in box:
            # complex name
            item['house'] = b.xpath(".//div[@class='title mb5 clearfix']/h4[@class='tit fl mr']/a/text()")[0]
            # text = ['均价:', '元/㎡', '(2017-06-12)'] or []
            text = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/*/text()")
            if text:
                try:
                    # price type
                    item['price_type'] = text[0][:-1]
                except:
                    item['price_type'] = None
                try:
                    # price publish date
                    item['time'] = text[2][1:-1]
                except:
                    item['time'] = None
            # price_info = ['25,000'] or []
            price_info = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/span/*/text()")
            if price_info:
                # price
                item['price'] = price_info[0]
            yield item
        # "共**页" (total number of pages)
        try:
            pages = boxs.xpath(".//div[@class='page1 mb5 clearfix']/span[@class='page_p']/text()")[0]
            page = int(re.findall("共(.*?)页", pages)[0])
        except:
            page = None
    else:
        # list of complex entries
        box = boxs.xpath(".//div[@class='halist clearfix']")
        # each complex
        for b in box:
            # complex name
            item['house'] = b.xpath(".//div[@class='title mb5 clearfix']/h4[@class='tit fl mr']/a/text()").extract()[0]
            # text = ['均价:', '元/㎡', '(2017-06-12)'] or []
            text = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/*/text()").extract()
            if text:
                try:
                    # price type (average or starting price)
                    item['price_type'] = text[0][:-1]
                except:
                    item['price_type'] = None
                try:
                    # price update date
                    item['time'] = text[2][1:-1]
                except:
                    item['time'] = None
            # price_info = ['25,000'] or []
            price_info = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/span/*/text()").extract()
            if price_info:
                item['price'] = price_info[0]
            yield item
        # "共**页" (total number of pages)
        try:
            pages = boxs.xpath(".//div[@class='page1 mb5 clearfix']/span/text()").extract()[0]
            page = int(re.findall("共(.*?)页", pages)[0])
        except:
            page = None
    # pagination
    if page:
        if cpage < page:
            # build the next-page link from the county's first-page link
            newhome_href = newhome_fweb[:-1] + "-pg" + str(cpage + 1) + "/"
            item['cpage'] = cpage + 1
            item['newhome_fweb'] = newhome_fweb
            item['newhome_href'] = newhome_href
            yield scrapy.Request(url=newhome_href, callback=self.newhome, meta=item, dont_filter=True)
def get_profile(uid, deepNum, preName):
    # fields: user ID, user tag (maker or maker mentor), user location (e.g. Gaomi No.5 Middle School),
    # signature, following count, follower count, works count, likes received this month, total visits,
    # medal-wall count and names, featured-works count, public works count, excellent works count, all work tags
    # userId userType address
    url = 'http://www.i3done.com/u/{uid}'
    start_url = url.format(uid=uid)
    response = down.get_html(start_url, headers=headers)
    if response:
        # print(response.text)
        html = HTML(response.text)
        userName = html.xpath('string(//div[@class="zw-banner-user"]/span/text()|//strong[@class="hide_text"]/text())')
        userType = html.xpath('string(//a[@class="maker"]/@title|//a[@class="tutor"]/@title|//a[@class="teacher"]/@title)')
        address = html.xpath('string(//p[@class="school-name"]/text()|//p[@class="jsle"]/a/@title)')
        description = html.xpath('string(//div[@class="zw-banner-sign"]/span/text()|//p[@class="zxpent"]/text())').strip()
        followCount = html.xpath('string(//div[@class="zw-zone-box zw-right-data"]//a[@data-id="focus"]/text())').strip()
        fansCount = html.xpath('string(//div[@class="zw-zone-box zw-right-data"]//a[@data-id="fans"]/text()|//span[@id="cares"]/text())').strip()
        worksCount = html.xpath('string(//div[@class="zw-zone-box zw-right-data"]//a[@data-tabid="tuzhi"]/text())')
        likeCount = html.xpath('string(//b[@id="likes"]/text()|//font[@id="likes"])')
        createDate = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        clickCount = html.xpath('string(//div[@class="zw-right-fans"]/i/em/text()|//div[@class="zan_03"]/span/text())')
        clickCount = re.search('^(\d+)人访问', clickCount)
        if clickCount:
            clickCount = clickCount.group(1)
        else:
            clickCount = '0'
        honorCount = html.xpath('string(//i[@class="text_red"]/text())')
        level = html.xpath('string(//div[@class="zw-zone-des"]/span/text())').replace('\n', '').replace('\r', '').replace('\t', '').strip()
        try:
            level = re.search('等级:L(\d+)', level).group(1)
        except:
            level = ''
        rank = html.xpath('string(//div[@class="zw-user-rank"]/span/text())').replace('全网排名:', '')
        verifyTime = html.xpath('string(//img[@class="cp"]/@src | //div[@class="tips"]/text())')
        if '年' in verifyTime:
            try:
                verifyTime = re.search('(\d+年\d+月\d+日)', verifyTime).group(1)
            except:
                print('verifyTime error..')
        # fetch honors
        get_honor(uid)
        # fetch works
        publicWorkCount, excellentWorkCount, tags, category_list = get_works_first(uid)
        createDate = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        sql = "insert into user(userId,userName,userType,address,description,followCount,fansCount,worksCount,likeCount,clickCount,honorCount,publicWorkCount,excellentWorkCount,tags,level,rank,verifyTime,deepNum,preName,createDate) VALUES ('%s','%s','%s', '%s', '%s', '%s','%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s')" % (
            uid, userName, userType, address, description, followCount, fansCount, worksCount, likeCount, clickCount, honorCount, publicWorkCount, excellentWorkCount, tags, level, rank, verifyTime, deepNum, preName, createDate) \
            + "ON DUPLICATE KEY UPDATE followCount='%s', fansCount='%s',deepNum='%s'" % (followCount, fansCount, deepNum)
        print(sql)
        mysqlCli.save(sql)
        return category_list
# CSV header row
save = '序号,标题,链接,作者,时间,点赞数,评论数,转发数\n'
with open('res.csv', 'w') as f:
    f.write(save)

account = 1
# paginate
for page in range(1, 14):
    url = URL + str(page)
    response = requests.get(url, headers=headers)
    json_obj = json.loads(response.text)
    html_str = json_obj['data']
    html = HTML(html_str)
    # parse with lxml's xpath
    titles = html.xpath('//div[@class="UG_list_b"]//h3[@class="list_title_b"]/a/text()')
    hrefs = html.xpath('//div[@class="UG_list_b"]//h3[@class="list_title_b"]/a/@href')
    authors = html.xpath('//div[@class="UG_list_b"]//div[@class="subinfo_box clearfix"]/a[2]/span[1]/text()')
    times = html.xpath('//div[@class="UG_list_b"]//div[@class="subinfo_box clearfix"]/span[1]/text()')
    likes = html.xpath('//div[@class="UG_list_b"]//div[@class="subinfo_box clearfix"]/span[2]/em[2]/text()')
    comments = html.xpath('//div[@class="UG_list_b"]//div[@class="subinfo_box clearfix"]/span[4]/em[2]/text()')
    zhufas = html.xpath(
import requests
import re
from lxml.etree import HTML

url = 'https://product.suning.com/0000000000/694729819.html'
response_text = requests.get(url).text
html = HTML(response_text)
try:
    p_Name = re.findall('"itemDisplayName":"(.*?)"', response_text)[0]
except:
    p_Name = None
xxx = html.xpath(".//div[@class='imgzoom-main']/a[@id='bigImg']/img/@alt")[0]
print(xxx)
print(p_Name)
print(len(p_Name))
def lvmama_poi_detail(url):
    with requests.session() as sess:
        response = sess.get(url)
        html = HTML(response.text)
        item = LvmamaPoiDetailItem()
        item['raw'] = {'html': str(lzma.compress(response.content))}
        if 'sight' in url:
            item['head'] = get_text_by_xpath(html, './/span[@class="crumbs_nav"]/span//text()')
            item['title'] = get_text_by_xpath(html, './/div[@class="vtop-name-box"]/h2[@class="title"]/text()')
            item['title_en'] = get_text_by_xpath(html, './/div[@class="vtop-name-box"]/span[@class="title-eng"]/text()')
            item['vcomon'] = get_text_by_xpath(html, './/div[@class="vtop-name-box"]/i[@class="vcomon-icon"]/text()')
            # item['country'] = response.request.meta.get('country')
            dls = html.xpath('.//dl[@class="poi_bordernone"]')
            for dl in dls:
                dt = get_text_by_xpath(dl, './/dt//text()')
                dd = get_text_by_xpath(dl, './/dd//text()')
                if '简介' in dt:
                    item['poi_brief'] = dd
                elif '景点导览' in dt:
                    item['poi_detail'] = dd
                elif '交通信息' in dt:
                    item['traffic'] = dd
                elif '小贴士' in dt:
                    item['poi_tip_content'] = dd
            dts = html.xpath('.//div[@class="vtop-comment-box fl"]/dl/dt')
            dds = html.xpath('.//div[@class="vtop-comment-box fl"]/dl/dd')
            for dt, dd in zip(dts, dds):
                dt = get_text_by_xpath(dt, './/text()')
                dd = get_text_by_xpath(dd, './/text()')
                if '地 址' in dt:
                    item['address'] = dd
                elif '游玩时间' in dt:
                    item['playtime'] = dd
                elif '联系电话' in dt:
                    item['phone'] = dd
                elif '门票' in dt:
                    item['ticket'] = dd
                elif '开放时间' in dt:
                    item['open_time'] = dd
                elif '网址' in dt:
                    item['website'] = dd
        elif 'zone' in url:
            item['head'] = get_text_by_xpath(html, './/div[@class="nav clearfix"]/span[@class="crumbs_nav fl"]//text()')
            item['title'] = get_text_by_xpath(html, './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/h1/text()')
            item['title_en'] = get_text_by_xpath(html, './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/h1/span/text()')
            item['active'] = get_text_by_xpath(html, './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/p[@class="active"]/text()')
            dls = html.xpath('.//div[@class="city_viewBox"]/div[@class="city_view_model"]/div/dl')
            for dl in dls:
                dt = get_text_by_xpath(dl, './/dt//text()')
                dd = get_text_by_xpath(dl, './/dd//text()')
                if '简介' in dt:
                    item['poi_brief'] = dd
                elif '景点导览' in dt:
                    item['poi_detail'] = dd
                elif '交通信息' in dt:
                    item['traffic'] = dd
                elif '小贴士' in dt:
                    item['poi_tip_content'] = dd
            divs = html.xpath('.//dl[@class="city_mapList clearfix"]/dd/div')
            for div in divs:
                dt = get_text_by_xpath(div, './/p[1]//text()')
                dd = get_text_by_xpath(div, './/p[2]//text()')
                if '地址' in dt.replace(' ', ''):
                    item['address'] = dd
                elif '游玩时间' in dt:
                    item['playtime'] = dd
                elif '联系电话' in dt:
                    item['phone'] = dd
                elif '门票' in dt:
                    item['ticket'] = dd
                elif '开放时间' in dt:
                    item['open_time'] = dd
                elif '网址' in dt:
                    item['website'] = dd
        # item['url'] = response.request.url
        return item
driver.switch_to.window(handles[1])
t1 = threading.Thread(target=deletAsin)
t1.start()
while True:
    for link in links:
        time.sleep(0.4)
        driver.get(link)
        wait = WebDriverWait(driver, 10, 0.2)
        wait.until(lambda driver: driver.find_element_by_xpath("//div[@data-index='9']"))
        text = driver.page_source
        html = HTML(text)
        now_asins = {}
        try:
            now_asins = set(html.xpath("//div/@data-asin")) - {'" data-index=', ''}
        except:
            pass
        for count in range(2):
            if links.index(link) == 0:
                countlist = countlist1
            else:
                countlist = countlist2
            while now_asins.__len__() not in countlist:
                # print(str(links.index(link)) + ': ' + str(now_asins.__len__()))
                time.sleep(0.2)
                text = driver.page_source
                html = HTML(text)
                try:
                    now_asins = set(html.xpath("//div/@data-asin")) - {
                        '" data-index=', ''
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# imports
import requests
from lxml.etree import HTML

# initial URL template
URL = 'http://sousuo.gov.cn/column/40123/{page}.htm'

# paginate
for i in range(0, 11):
    start_url = URL.format(page=i)
    print(start_url)
    # send the request
    response = requests.get(start_url)
    # parse the response with lxml
    html = HTML(response.text)
    # extract the URLs and titles
    urls = html.xpath('//ul[@class="listTxt"]/li/h4/a/@href')
    titles = html.xpath('//ul[@class="listTxt"]/li/h4/a/text()')
    # pair up and save the results
    for url, title in zip(urls, titles):
        print(url, title)
        with open('结果.txt', 'a') as f:
            f.write(url + ',' + title + '\n')
def trade_info(self):
    """
    Scrape the trademark detail page.
    :return:
    """
    self.s.headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
    })
    data = red_cli.srandmember("BKID")
    url = eval(data)["url"]
    # url = 'https://www.tmkoo.com/detail/24ff6eaa997007f2967541ee3bb13223/11/'
    resp = self.get_req(url)
    etre = HTML(resp.text)
    regist_num = "".join(etre.xpath('//td[contains(text(),"注册号")]/following-sibling::td[1]/font/text()'))
    international_class = "".join(etre.xpath('//td[contains(text(),"注册号")]/following-sibling::td[2]/font/text()'))
    regist_time = "".join(etre.xpath('//td[contains(text(),"申请日期")]/following-sibling::td[1]//text()'))
    registrant_chinese_name = "".join(etre.xpath('//td[contains(text(),"申请人名称(中文)")]/following-sibling::td[1]/div/text()'))
    registrant_foreign_name = "".join(etre.xpath('//td[contains(text(),"申请人名称(英文)")]/following-sibling::td[1]//text()'))
    registrant_foreign_address = "".join(etre.xpath('//td[contains(text(),"申请人地址(英文)")]/following-sibling::td[1]//text()'))
    image_url = "".join(etre.xpath('//td[@align="center"]/img/@src'))
    preliminary_notice_num = "".join(etre.xpath('//td[contains(text(),"初审公告期号")]/following-sibling::td[1]//text()'))
    regist_notice_num = "".join(etre.xpath('//td[contains(text(),"注册公告期号")]/following-sibling::td[1]//text()'))
    preliminary_notice_time = "".join(etre.xpath('//td[contains(text(),"初审公告日期")]/following-sibling::td[1]//text()'))
    regist_notice_time = "".join(etre.xpath('//td[contains(text(),"注册公告日期")]/following-sibling::td[1]//text()'))
    special_period_effective_time = "".join(etre.xpath('//td[contains(text(),"专用权期限")]/following-sibling::td[1]//text()'))
    is_co_regist = "".join(etre.xpath('//td[contains(text(),"是否共有商标")]/following-sibling::td[1]//text()'))
    international_later_time = "".join(etre.xpath('//td[contains(text(),"后期指定日期")]/following-sibling::td[1]//text()'))
    international_regist_time = "".join(etre.xpath('//td[contains(text(),"国际注册日期")]/following-sibling::td[1]//text()'))
    priority_date = "".join(etre.xpath('//td[contains(text(),"优先权日期")]/following-sibling::td[1]//text()'))
    agent_name = "".join(etre.xpath('//td[contains(text(),"代理人名称")]/following-sibling::td[1]//text()'))
    color_indication = "".join(etre.xpath('//td[contains(text(),"指定颜色")]/following-sibling::td[1]//text()'))
    trademark_type = "".join(etre.xpath('//td[contains(text(),"商标类型")]/following-sibling::td[1]//text()'))
    form = "".join(etre.xpath('//td[contains(text(),"商标状态")]/following-sibling::td[1]//text()'))
    commodity_num = etre.xpath('//a[contains(text(),"具体核准商品/服务以商标公告为准,点击查看!")]/ancestor::td//table//tr/td[@align="right"]/text()')
    commodity_chinese_name = etre.xpath('//a[contains(text(),"具体核准商品/服务以商标公告为准,点击查看!")]/ancestor::td//table//tr/td[3]/text()')
    lt = []
    for _ in range(len(commodity_num)):
        i = {}
        i["commodity_num"] = commodity_num[_]
        i["commodity_chinese_name"] = commodity_chinese_name[_]
        lt.append(i)
    item = {
        "_id": eval(data)["_id"],
        "regist_num": regist_num,
        "international_class": international_class,
        "regist_time": regist_time,
        "registrant_chinese_name": registrant_chinese_name,
        "image_url": image_url,
        "preliminary_notice_num": preliminary_notice_num,
        "regist_notice_num": regist_notice_num,
        "preliminary_notice_time": preliminary_notice_time,
        "regist_notice_time": regist_notice_time,
        "special_period_effective_time": special_period_effective_time,
        "international_later_time": international_later_time,
        "international_regist_time": international_regist_time,
        "priority_date": priority_date,
        "color_indication": color_indication,
        "trademark_type": trademark_type,
        "form": form,
        "is_co_regist": is_co_regist,
        "agent_name": agent_name,
        "trademark_commodity_server_info": lt,
        "registrant_foreign_name": registrant_foreign_name,
        "registrant_foreign_address": registrant_foreign_address,
    }
    item = BK.item_clear(item=item)
    BK_DATA_info.save(item)
    log.info("数据存入成功。。。。。。。。。。")
def start():
    for i in range(1, 400):
        try:
            print('当前页:' + str(i))
            start_url = 'https://z.jd.com/bigger/search.html'
            body = 'status=&sort=&categoryId=&parentCategoryId=&sceneEnd=&productEnd=&keyword=&page=' + str(i)
            response = requests.post(start_url, headers=headers, data=body, timeout=10)
            # print(response.text)
            html = HTML(response.text)
            urls = html.xpath('//div[@class="l-result"]//li/a/@href')
            print(len(urls))
            for url in urls:
                link = 'https://z.jd.com' + url
                print(link)
                try:
                    response = requests.get(link, headers=detail_headers, timeout=10)
                    id = re.search('https://z.jd.com/project/details/(\d+).html', link).group(1)
                    html = HTML(response.text)
                    title = html.xpath('string(//h1)').replace(',', ',').strip()
                    price = html.xpath('string(//p[@class="p-num"]/text())')
                    yu_price = html.xpath('string(//p[@id="projectMessage"]/span[2]/text())')
                    faqiNum = html.xpath('string(//div[@class="promoters-num"]/div[@class="fl start"]/span[@class="num"])')
                    address = html.xpath('string(//div[@class="box-content"]/ul[@class="contact-box"]/li[2]/div[@class="val"])')
                    dangciList = html.xpath('//div[@class="details-right-fixed-box"]/div[@class="box-grade"]//div[@class="t-price"]/span | //div[@class="details-right-fixed-box"]/div[@class="box-grade "]//div[@class="t-price"]/span | //div[@class="details-right-fixed-box"]/div[@class="box-grade "]//div[@class="t-price "]/span | //div[@class="details-right-fixed-box"]/div[@class="box-grade "]//div[@class="t-price"]/span')
                    dangciNum = str(len(dangciList))
                    if 'video' in response.text:
                        has_video = '是'
                    else:
                        has_video = '否'
                    imgXpath = html.xpath('//div[@class="tab-div tab-current"]//p/img')
                    img_len = str(len(imgXpath))
                    # like = html.xpath('string(//span[@id="praisCount"])').replace('(','').replace(')','')
                    # guanzhu = html.xpath('string(//span[@id="focusCount"])').replace('(','').replace(')','')
                    contentlist = html.xpath('//div[@id="proList"]//text()')
                    content = ''.join(contentlist)
                    content = content.replace(' ', '').replace('\t', '').replace('\n', ' ').replace('\r', ' ').replace(',', ',').strip()
                    count_url = 'https://sq.jr.jd.com/cm/getCount?key=1000&systemId=' + id
                    count_response = requests.get(count_url, headers=count_headers, timeout=10)
                    print(count_response.text)
                    json_obj = json.loads(count_response.text.replace('(', '').replace(')', ''))
                    like = str(json_obj['data']['praise'])
                    guanzhu = str(json_obj['data']['focus'])
                    save_res = id + ',' + link + ',' + title + ',' + price + ',' + yu_price + ',' + has_video + ',' + img_len + ',' + content + ',' + like + ',' + guanzhu + ',' + faqiNum + ',' + address + ',' + dangciNum + '\n'
                    print(save_res)
                    with open('结果.csv', 'a', encoding='gbk', errors='ignore') as f:
                        f.write(save_res)
                except:
                    continue
        except:
            continue
        cursor.execute('SET NAMES utf8;')
        cursor.execute('SET CHARACTER SET utf8;')
        cursor.execute('SET character_set_connection=utf8;')
        cursor.execute(sql)
        db.commit()
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    cursor.close()
    db.close()


urlHtml = getHtml("http://cosme.pclady.com.cn/products_list/br0_bs0_bi2_sm68_ef0_pb0_pe0_or0.html")
html = HTML(urlHtml.decode('gbk'))
urlList = html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
parseData(urlList)
for i in range(32, 40):
    if i < 10:
        i = "0" + str(i)
    else:
        i = str(i)
    print i
    htmls = "http://cosme.pclady.com.cn/products_list/br0_bs0_bi2_sm68_ef0_pb0_pe0_or0_p" + i + ".html#productList"
    urlHtml = getHtml(htmls)
    try:
        html = HTML(urlHtml.decode('gbk'))
        urlList = html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
        parseData(urlList)
    except Exception:
        errorTxt.write("\n")
def parse(self, item):
    selector = HTML(item.text)
    href = selector.xpath("//h4/a/@href")
    for url in href:
        self.article_queue.put(urljoin(BASE_URL, url))
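# A minimal, hypothetical harness for a queue-based link collector like parse() above.
# The ArticleCollector class, BASE_URL value, and FakeResponse object are illustrative
# stand-ins, not part of the original project.
import queue
from urllib.parse import urljoin

from lxml.etree import HTML

BASE_URL = 'https://example.com/'  # placeholder base URL


class ArticleCollector:
    def __init__(self):
        self.article_queue = queue.Queue()

    def parse(self, item):
        # item is any object with a .text attribute holding the HTML body
        selector = HTML(item.text)
        for url in selector.xpath("//h4/a/@href"):
            self.article_queue.put(urljoin(BASE_URL, url))


class FakeResponse:
    text = '<div><h4><a href="/post/1">one</a></h4><h4><a href="/post/2">two</a></h4></div>'


collector = ArticleCollector()
collector.parse(FakeResponse())
while not collector.article_queue.empty():
    # prints https://example.com/post/1 and https://example.com/post/2
    print(collector.article_queue.get())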