Example #1
    def getClassNames(self,dic, html_content, t=''):
        '''
        Find the characters that were replaced by the SVG-font obfuscation.
        :param dic: mapping of fake tag name -> {'url': svg_url, 'className': ...}
        :param html_content: page HTML containing the fake tags
        :return: html_content with the fake tags replaced by real characters
        '''

        for itemskey, itemsvalue in dic.items():

            tagList = re.findall('<' + itemskey + ' class="(.*?)"></' + itemskey + '>', html_content)

            svg_url = itemsvalue.get('url', None)
            svg_classValue = itemsvalue.get('className', None)

            svgcont = requests.get(svg_url).text

            # t=time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())

            os.makedirs('./html', exist_ok=True)
            svg_name = './html/' + itemskey + str(t) + '.svg'

            with open(svg_name, 'w') as f:
                f.write(svgcont)

            defs = re.findall('<defs>', svgcont)
            if len(defs) > 0:
                '''
                    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
                    <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
                    <svg xmlns="http://www.w3.org/2000/svg" version="1.1" xmlns:xlink="http://www.w3.org/1999/xlink" width="650px" height="322.0px">

                    <style>text {font-family:PingFangSC-Regular, Microsoft YaHei, 'Hiragino Sans GB', Helvetica; ;font-size:14px;fill:#666;}</style>
                    <defs><path id="1" d="M0 38 H600"/><path id="2" d="M0 75 H600"/><path id="3" d="M0 102 H600"/><path id="4" d="M0 132 H600"/><path id="5" d="M0 174 H600"/><path id="6" d="M0 202 H600"/><path id="7" d="M0 237 H600"/><path id="8" d="M0 276 H600"/></defs>
                    <text lengthAdjust="spacing">
                    <textPath xlink:href="#1" textLength="336">健关农街泉银肥宾津化信县淄民工迁孝盐安平光京乌振</textPath>
                    <textPath xlink:href="#2" textLength="308">淮青连康人红甘定公上汕二烟哈鞍胜合无大三主朝</textPath>
                    <textPath xlink:href="#3" textLength="588">衡园蒙常向夏府乐衢层台生头齐杭锡黑心岛苏十治山南海皇金云郑藏绍晋前石福清襄曙庆华鲁站</textPath>
                    <textPath xlink:href="#4" textLength="448">泰旗六太龙惠才陕湾体遵洛富中沙建肃楼机四绵一明徽沈迎家宿远嘉昌谐</textPath>
                    <textPath xlink:href="#5" textLength="532">育辽潍七木温天源友疆博圳幸九宜通号利团浙爱创邢东道梅花德庄兴港汉茂莞文学佛年</textPath>
                    <textPath xlink:href="#6" textLength="574">封昆汾春冈锦乡阳波广湛永谊内吉市古八弄贵廊湖解教祥充感省拥林环威沿风城保开桂肇西村</textPath>
                    <textPath xlink:href="#7" textLength="448">门黄尔珠凰赣军徐北长场放韶厦结和五重义交武香成名隆深设扬凤宁区坊</textPath>
                    <textPath xlink:href="#8" textLength="210">江川济都进业澳岳新河滨临路镇州</textPath>
                    </text></svg>     
                 '''
                # index: list of (path id, y) tuples
                index = re.findall(r'<path id="(\d+)" d="M0 (\d+) H\d+"/>', svgcont)
                textPath = re.findall('">(.*?)</textPath>', svgcont)
                values = [list(textPath[v]) for v in range(len(textPath))]

            else:
                # Format 1
                '''
                    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
                    <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
                    <svg xmlns="http://www.w3.org/2000/svg" version="1.1" xmlns:xlink="http://www.w3.org/1999/xlink" width="650px" height="64.0px">

                    <style>text {font-family:PingFangSC-Regular, Microsoft YaHei, 'Hiragino Sans GB', Helvetica; ;font-size:14px;fill:#666;}</style>
                        <text x="14 28 42 56 70 84 98 112 126 140 " y="41">4598036127</text>
                    </svg>
                '''
                # Format 2
                '''
                    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
                    <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
                    <svg xmlns="http://www.w3.org/2000/svg" version="1.1" xmlns:xlink="http://www.w3.org/1999/xlink" width="650px" height="274.0px">               
                    <style>text {font-family:PingFangSC-Regular, Microsoft YaHei, 'Hiragino Sans GB', Helvetica; ;font-size:14px;fill:#666;}</style>
                        <text x="0" y="26">宾吉汾化民西工襄前庆朝开路市感邢福曙新合友泰金家黑衡设隆常迎通沿富太层庄才川贵头园清</text>
                        <text x="0" y="63">谊无团晋佛城七主定辽台四鞍广莞乐嘉教州交公安蒙云湛遵尔关郑向源弄昌成三夏华黄徽苏梅济</text>
                        <text x="0" y="98">银惠锡泉港军康东滨山保宁鲁藏廊八肃年学重五宿皇宜结京茂楼名大人南岛乌都生圳淮区青沈文</text>
                        <text x="0" y="132">幸昆治津爱长内淄珠春甘凤澳远坊旗古香武中浙湾省林十六桂站深赣杭迁洛心兴业祥德孝烟红湖</text>
                        <text x="0" y="156">威创河齐一厦花博利场号健扬村韶九放阳石镇门永临明冈汕振府谐拥波解县建陕盐光哈上封农汉</text>
                        <text x="0" y="192">胜疆龙肇温二风木义肥江北衢进连绵徐信育沙和平秦街天绍机充锦环岳凰乡潍体海道</text>
                    </svg>
                '''
                # posxydata: list of (x, y, text) tuples
                posxydata = re.findall('<text x="(.*?)" y="(.*?)">(.*?)</text>', svgcont)

                if len(posxydata) > 1:
                    if posxydata[0][0] == posxydata[1][0]:
                        index = re.findall(r'<text x="(\d+)" y="(\d+)">', svgcont)
                        textPath = re.findall('">(.*?)</text>', svgcont)

                        values = [list(textPath[v]) for v in range(len(textPath))]
                else:
                    xs = posxydata[0][0].strip().split(' ')
                    index = [(int(i), float(posxydata[0][1])) for i in xs]
                    values = [list(posxydata[0][2]) for i in xs]

            for tag in tagList:
                pos = itemsvalue.get(svg_classValue, {}).get(tag)

                posxy = (float(pos[0]), float(pos[1]))
                fontvalue = self.getvalue(posxy, index, values)

                html_content = html_content.replace('<' + itemskey + ' class="' + tag + '"></' + itemskey + '>',
                                                    fontvalue)

        # t=time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
        selector = parsel.Selector(html_content)
        shopname = selector.css('.shop-name::text').get()

        name = './html/替换之后的-' + shopname + str(t) + '.html'
        with open(name, 'w', encoding='utf-8') as f:
            f.write(html_content)
        return html_content
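
The method above calls a getvalue helper that is not included here; below is a minimal sketch of what it might look like, consistent with the fragment shown in Example #7 and assuming the <defs> layout, where index holds (path id, y) pairs and the glyph width is the 14px font-size declared in the SVG's <style>.

    def getvalue(self, posXY, index, values):
        # find the first SVG row whose y coordinate is at or past the target y
        xindex = 1  # fallback: first row
        for yindex in range(len(index)):
            if posXY[1] <= float(index[yindex][1]):
                xindex = int(index[yindex][0])
                break
        # pick that row of characters, then index into it by x / glyph width
        value = values[xindex - 1]
        xyindex = int(posXY[0] / 14)
        return value[xyindex]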
Example #2
def login(username, password, domain):
    # NOTE: username and password are accepted but unused here; this stub only
    # fetches the page and returns it as a parsel Selector.
    session = requests.Session()
    response = session.get(domain)
    return parsel.Selector(text=response.text)
Example #3
import parsel
from selenium import webdriver

driver = webdriver.Chrome('chromedriver.exe')  # create the browser object

driver.get('http://datanews.caixin.com/interactive/2020/us-president-election/'
           )  # open the target URL

data = driver.page_source  # get the page source after JavaScript rendering

html = parsel.Selector(data)  # turn the rendered source into a selector tree

MuBiao = html.xpath(
    '//div[@class="us-detail homepage"]/div[@class="chart"]/svg/g')[
        -1]  # XPath for the last <g> tag (a list is returned, so [-1] works)

ZhiChi_list = MuBiao.xpath('./rect/@value')  # values inside the last <g> tag
# print(ZhiChi_list)

for i in ZhiChi_list:
    # print(i.get())                              # check that this is the data we want

    with open('民调.csv', 'a', encoding='utf8') as f:  # append the data to a local file
        f.write(i.get())
        f.write('\n')
Example #4
def menu_search_download(url, num, text, window, flag):  # download via menu category or via keyword search
    global photos_url
    for page in range(1, num + 1):
        # https://www.fabiaoqing.com/bqb/lists/type/doutu/page/2.html
        URL = str(url).split('.html')[0] + '/page/{}.html'.format(
            page)  # page URL for this category
        # requests expects lowercase scheme keys, so cover both http and https
        proxy = {'http': random.choice(proxies), 'https': random.choice(proxies)}
        response = requests.get(URL, proxies=proxy)
        time.sleep(random.random())
        if flag == 1:
            photos_url = parsel.Selector(response.text).xpath(
                '//div[@class="right floated left aligned twelve wide column"]/a/@href'
            ).extract()  # links to image sets from the menu page
        elif flag == 0:
            photos_url = parsel.Selector(response.text).xpath(
                '//div[@class="ui segment imghover"]/a/@href').extract(
                )  # links to image sets from search results
        for photo_url in photos_url:
            # https://www.fabiaoqing.com/bqb/detail/id/9825.html
            URL1 = 'https://www.fabiaoqing.com' + photo_url  # full URL of the image set
            response1 = requests.get(URL1, proxies=proxy)
            time.sleep(random.random())
            image_urls = parsel.Selector(response1.text).xpath(
                '//div[@class="swiper-slide swiper-slide-active bqpp"]/a/@href'
            ).extract()  # link to each image page
            image_name = parsel.Selector(response1.text).xpath(
                '//div[@class="ui segment imghover"]/h1/text()').extract()[
                    0]  # name of the image set
            for image_url in image_urls:
                # https://www.fabiaoqing.com/biaoqing/detail/id/149344.html
                URL2 = 'https://www.fabiaoqing.com' + image_url
                response2 = requests.get(URL2, proxies=proxy)
                time.sleep(random.random())
                images_info = parsel.Selector(response2.text).xpath(
                    '//div[@class="swiper-slide swiper-slide-active"]/img'
                ).extract()  # info for each image
                for image_info in images_info:
                    soup = BeautifulSoup(image_info, 'html.parser')
                    name1 = str(soup.img.attrs['title'])
                    image = requests.get(soup.img.attrs['src'], proxies=proxy)
                    NAME = name1.split('-')[0].strip().replace(
                        '/', '').replace('\\', '').replace(':', '').replace(
                            ':', '').replace('"', '').replace('*', '').replace(
                                '?', '').replace('?',
                                                 '').replace('|', '').replace(
                                                     '<', '').replace('>', '')
                    PATH = path + image_name + '/'
                    if os.path.exists(PATH):
                        pass
                    else:
                        os.makedirs(PATH)
                    with open(
                            PATH + NAME + '.' +
                            str(soup.img.attrs['src']).split('.')[-1],
                            'wb') as f:
                        f.write(image.content)
                    text.insert(
                        'end', '正在下载:' + NAME[:20] + '.' +
                        str(soup.img.attrs['src']).split('.')[-1] + '\n')
                    window.update()
                    text.see(tk.END)
                text.insert(tk.END, '\n*********下载完成!********\n')
                window.update()
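Example #5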
    'mldm': '08',
    'mlmc': '',
    'yjxkdm': '0812',
    'zymc': '',
    'xxfs': '1',
    'pageno': str(first_post_page),
}

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}

response = requests.post(url=url, data=data, headers=headers)
html_data = response.text
selector = parsel.Selector(html_data)

max_page_num = int(
    selector.css('.zsml-page-box li:nth-last-child(3) a::text').get())
print(max_page_num)

for page in range(1, max_page_num + 1):
    print(page)
    data = {
        'ssdm': '',
        'dwmc': '',
        'mldm': '08',
        'mlmc': '',
        'yjxkdm': '0801',
        'zymc': '',
        'xxfs': '',
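Example #6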
def extract_df(in_loteca_htm):
    """Preprocess the data in the loteca file

    Returns:
        A DataFrame with all the rounds present in the Loteca file
    """
    # load file

    with open(in_loteca_htm, mode='rb') as fp:
        body = fp.read()
        body = body.decode('windows-1252')
    selector = parsel.Selector(body)

    # core
    rows = selector.css('tr')
    header = rows[0]
    rows = rows[1:]
    columns = header.css('font::text').extract()

    rounds = []
    for row in rows:
        tds = row.css('td')
        td_cnt = len(tds)
        if td_cnt == 28:
            # round row
            data = [td.css('::text').extract_first() for td in tds]
            rounds.append(data)
        elif td_cnt == 2:
            # state row
            continue
        else:
            raise ValueError("Loteca file row with different number of cells")

    # create DataFrame
    df = pd.DataFrame.from_records(rounds, columns=columns)

    # remove columns
    df = df.drop(['Cidade', 'UF'], axis=1)
    df = df.drop(['Jogo_%s' % i for i in range(1, 15)], axis=1)

    # rename columns
    df.columns = ['roundno', 'date', 'winners14', 'shared14', 'accumulated',
                  'accumulated14', 'winners13', 'shared13', 'winners12',
                  'shared12', 'total_revenue', 'prize_estimative']

    # convert types
    df['roundno'] = df.roundno.apply(_read_int)
    df['date'] = pd.to_datetime(df.date, dayfirst=True)
    df['winners14'] = df.winners14.apply(_read_int)
    df['winners13'] = df.winners13.apply(_read_int)
    df['winners12'] = df.winners12.apply(_read_int)
    df['shared14'] = df.shared14.apply(_read_float)
    df['shared13'] = df.shared13.apply(_read_float)
    df['shared12'] = df.shared12.apply(_read_float)
    df['accumulated'] = df.accumulated.apply(lambda x: x == 'SIM')
    df['accumulated14'] = df.accumulated14.apply(_read_float)
    df['total_revenue'] = df.total_revenue.apply(_read_float)
    df['prize_estimative'] = df.prize_estimative.apply(_read_float)

    # set index
    df = df.set_index('roundno')

    return df
Example #7
            xindex = int(index[yindex][0])
            break
    # the exact row/column has been located

    value = values[xindex - 1]

    xyindex = int(posXY[0] / 14)
    return value[xyindex]


html_content = ''

with open('dazhong1.html', 'r') as f:
    html_content = f.read()

selector = parsel.Selector(html_content)

dic = {
    'bb': {
        'rt': {
            'rt03h': (448.0, 87.0),
            'rt0dm': (112.0, 159.0),
            'rt0ff': (280.0, 87.0),
            'rt0j3': (308.0, 187.0),
            'rt0ny': (112.0, 261.0),
            'rt119': (252.0, 222.0),
            'rt11b': (210.0, 222.0),
            'rt17d': (448.0, 159.0),
            'rt1ts': (420.0, 222.0),
            'rt1z1': (56.0, 23.0),
            'rt25z': (434.0, 117.0),
Example #8
def parse(data):
    return parsel.Selector(text=data)
Example #9
import re
import parsel
from urllib import request

url = "https://www.phei.com.cn/gywm/cbsjj/2010-11-19/47.shtml"
with request.urlopen(url) as req:
    text = req.read().decode("utf8")
    title = re.search("<h1>(.*)</h1>", text).group(1)
    sel = parsel.Selector(text)
    content = "\n".join(
        sel.css(".column_content_inner p font::text").extract())
    with open("about.txt", "a") as file:
        file.write(title)
        file.write("\n")
        file.write(content)
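Example #10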
# ['Albert Einstein', 'J.K. Rowling', 'Albert Einstein', 'Jane Austen', 'Marilyn Monroe', 'Albert Einstein', 'André Gide', 'Thomas A. Edison', 'Eleanor Roosevelt', 'Steve Martin']

print(res.headers["Content-Type"])  # text/html; charset=utf-8
""" terminal python3 """

import requests
import parsel

# import the helper from the spider_quote module
from spider_quote import fetch_content

# target site
page_content = fetch_content("https://quotes.toscrape.com/")

# build a selector from the page content
sel = parsel.Selector(page_content)

# select by CSS class
quotes = sel.css("div.quote")

# show what was selected
print(quotes)

# show the authors
sel.css("div.quote small.author").getall()

# get only the text
sel.css("div.quote small.author::text").getall()

# --------------------------------------------------------------------------- #
# - > LIVE CLASS - 34.3 ----- <--- / END ------------------------------------ //
Example #11
proxies_list = []
for page in range(1, 5):
    print('=============正在获取第{}数据============'.format(page))
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(str(page))
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.113 Safari/537.36'
    }
    response = requests.get(base_url, headers=headers)
    # print(response.request.headers)
    data = response.text
    # print(data)
    # convert the response text into a Selector
    html_data = parsel.Selector(data)
    # parse the proxy table rows
    parse_list = html_data.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')

    # iterate over each row
    for tr in parse_list:
        dict_proxies = {}
        http_type = tr.xpath('./td[4]/text()').extract_first()  # protocol type
        ip_num = tr.xpath('./td[1]/text()').extract_first()  # IP address
        ip_port = tr.xpath('./td[2]/text()').extract_first()  # port
        # print(http_type,ip_num,ip_port)
        # build the proxy dict
        dict_proxies[http_type] = ip_num + ':' + ip_port
        # print(dict_proxies)
        proxies_list.append(dict_proxies)
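
A short usage sketch for the list built above (the test URL is only a stand-in): requests looks proxies up by lowercase scheme keys, so the key scraped from the page ("HTTP") is normalised first.

import random

# pick one scraped proxy at random and try it (httpbin.org used only as a test endpoint)
proxy = {k.lower(): v for k, v in random.choice(proxies_list).items()}
try:
    resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
    print(proxy, resp.status_code)
except requests.RequestException:
    print(proxy, 'failed')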
Example #12
import requests
import parsel


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

chapter_url = 'http://www.shuquge.com/txt/8659/31165742.html'

response = requests.get(url=chapter_url, headers=headers)

response.encoding = response.apparent_encoding
html = response.text

selector = parsel.Selector(html)
h1 = selector.css('h1::text').getall()
print(h1)
# do not put a space between combined selectors on the same tag
content = selector.css('div#content.showtxt::text').getall()
list_content = []
for c in content:
    list_content.append(c.strip())
    print(c.strip())
print(",".join(list_content))
Example #13
# print(css_response.text)
# print(len(css_response.text))
pattern = re.compile(r'.(\w+){background:-(\d+\.\d+)px -(\d+\.\d+)px;}')
class_map = re.findall(pattern, css_response.text)
print(class_map)

coord = class_map[0]

if coord:
    coord_name, coord_x, coord_y = coord
    coord_x, coord_y = float(coord_x), float(coord_y)

import parsel

print(svg_response.text)
svg_data = parsel.Selector(svg_response.text)

texts = svg_data.xpath('//text')
# use the class's background position to work out y (which row)
print(coord_y)
axis = []
# look up which SVG <text> row the known y falls into
for text in texts:
    if coord_y <= int(text.attrib.get('y')):
        axis.append(text.attrib.get('y'))

# axis = [text.attrib.get('y') for text in texts if coord_y <= int(text.attrib.get('y'))]
print('axis', axis)
axis_y = axis[0]

print(axis_y)
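
A hedged sketch of the remaining step, assuming the 14px font-size declared in the SVG's <style> is the per-character step (the same assumption Examples #1 and #7 rely on): the x offset divided by the glyph width indexes into the chosen row.

# take the <text> row whose y matched, then index into it by x / glyph width
row = texts[[t.attrib.get('y') for t in texts].index(axis_y)]
row_text = row.xpath('string(.)').get()
char_index = int(coord_x / 14)
print(row_text[char_index])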
Example #14
import os
import urllib.request

import requests
import parsel
from fontTools.ttLib import TTFont

font_dir = os.path.join(os.path.curdir, "fonts")

if not os.path.isdir(font_dir):
    os.mkdir(font_dir)

headers = {
    'Referer':
    "https://maoyan.com/films/1212",
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}
url = 'https://maoyan.com/films/1212'

#
r = requests.get(url, headers=headers)
selector = parsel.Selector(r.text)
woff = selector.re_first("url\('(.+?\.woff)'\)")
# os.path.basename takes the file name out of the woff URL, e.g. from
# url('//vfile.meituan.net/colorstone/d8b92513098c90cbadf06d2779d686492080.woff')
# it extracts d8b92513098c90cbadf06d2779d686492080.woff
download_font_path = os.path.join(font_dir, os.path.basename(woff))

if not os.path.isfile(download_font_path):
    urllib.request.urlretrieve('https:%s' % woff, download_font_path)

# parse the font file used by the current page
font = TTFont(download_font_path)
# map each glyph's outline bytes to its glyph name (character code)
hex2u = {
    font['glyf'][u].coordinates.array.tobytes().hex(): u
    for u in font.getGlyphOrder()[2:]
}
Example #15
def get_detail_params(url, page_source, c):
    selector = parsel.Selector(page_source)
    seller_id = selector.xpath(
        "//form[@id='J_FrmBid']/input[@name='seller_id']/@value").get()
    photo_url = selector.xpath(
        "//form[@id='J_FrmBid']/input[@name='photo_url']/@value").get()
    rootCatId = selector.xpath(
        "//form[@id='J_FrmBid']/input[@name='rootCatId']/@value").get()
    allow_quantity = re.findall("\"quantity\":(\d+),", page_source)[0]
    param = re.findall("id=(\d+).*&skuId=(\d+)", url)[0]
    buy_param = param[0] + "_" + "1" + "_" + param[1]
    _tb_token_ = c.get("_tb_token_")
    skuId = param[1]
    item_id_num = param[0]
    item_id = param[0]
    auction_id = param[0]
    buy_now = re.findall("\"price\":\"(\d+\.\d+)\",", page_source)[0]
    current_price = buy_now
    seller_num_id = selector.xpath("//*[@id=\"dsr-userid\"]/@value").get()
    data = {
        'title': '(unable to decode value)',
        'x_id': '',
        'seller_id': seller_id,
        'seller_nickname': '(unable to decode value)',
        'who_pay_ship': '(unable to decode value)',
        'photo_url': photo_url,
        'region': '(unable to decode value)',
        'auto_post': 'false',
        'etm': 'post',
        'virtual': 'false',
        'rootCatId': rootCatId,
        'auto_post1': '',
        'buyer_from': 'ecity',
        'root_refer': '',
        'item_url_refer': 'https%3A%2F%2Fs.taobao.com%2F',
        'allow_quantity': allow_quantity,
        'buy_param': buy_param,
        'quantity': '1',
        '_tb_token_': _tb_token_,
        'skuInfo': '(unable to decode value)',
        'use_cod': 'false',
        '_input_charset': 'UTF-8',
        'destination': '350100',
        'skuId': skuId,
        'bankfrom': '',
        'from_etao': '',
        'item_id_num': item_id_num,
        'item_id': item_id,
        'auction_id': auction_id,
        'seller_rank': '0',
        'seller_rate_sum': '0',
        'is_orginal': 'no',
        'point_price': 'false',
        'secure_pay': 'true',
        'pay_method': '(unable to decode value)',
        'from': 'item_detail',
        'buy_now': buy_now,
        'current_price': current_price,
        'auction_type': 'b',
        'seller_num_id': seller_num_id,
        'activity': '',
        'chargeTypeId': '',
    }
    return data, param[0]
Example #16
import parsel as pr  # assumption: pr is the parsel alias used below


def get_suburls(html_content, logger):
    sel = pr.Selector(html_content)

    links = sel.xpath('//a[contains(@href, "teampages")]/@href').extract()
    return ['http://www.dailymail.co.uk' + link for link in links]
Example #17
# Use XPath to scrape Lianjia second-hand housing listings
import requests
import parsel

url = 'https://cs.lianjia.com/ershoufang/pg1/'
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
}
res = requests.get(url=url, headers=headers)
html_data = res.text
selector = parsel.Selector(html_data)
lis = selector.xpath('//ul[@class="sellListContent"]/li')

for li in lis:
    title = li.xpath('.//div[@class="title"]/a/text()').get()
    place = li.xpath('.//div[@class="positionInfo"]/a/text()').getall()
    place = "- ".join(place)
    value = li.xpath('.//div[@class="totalPrice"]/span/text()').get() + "W"
    unitPrice = li.xpath('.//div[@class="unitPrice"]/span/text()').get()
    houseInfo = li.xpath('.//div[@class="houseInfo"]/text()').get()
    print(title, place, value, unitPrice, houseInfo, sep=" | ")
Example #18
def extract_headlines(html_content, modifier, logger):
    '''
    Returns a dictionary with the key aspects of BBC headlines
    We have two types - the team pages and the confirmed transfers
    In the team pages - we have an image which also has a title (could be useful). 
    Each article is separated into "article" blocks and the URLs are non-absolute with a numeric code
    Could potentially improve timing here as the loops might be excessive
    '''

    sel = pr.Selector(html_content)
    articles_info = {}

    if modifier:
        # Get the article information - write an initial search and then find ANYTHING inside
        search = '//article[@class = "clearfix faux-block-link lakeside lakeside--auto lakeside--has-media"]'
        articles = sel.xpath(search)

        # Need to loop because we want all the info to match up - even if not present
        for i, article in enumerate(articles):

            # Step through each article and extract_first to only return value and not list
            article_title = article.xpath(
                './/span[@class = "lakeside__title-text"]/text()'
            ).extract_first()
            article_link = article.xpath(
                './/a[@class = "faux-block-link__overlay"]/@href'
            ).extract_first()
            article_summary = article.xpath('.//p/text()').extract_first()
            article_image = article.xpath('.//img/@alt').extract_first()
            article_date = article.xpath(
                './/span[@class = "timestamp"]/time/text()').extract_first()

            if article_summary:
                article_summary = article_summary.strip()

            if 'http://' not in article_link and 'https://' not in article_link:
                article_link = 'http://www.bbc.com' + article_link

            article_info = {
                'article_title': article_title.strip(),
                'article_link': article_link,
                'article_summary': article_summary,
                'article_image': article_image,
                'article_date': article_date
            }

            articles_info['article_{}'.format(i + 1)] = article_info

    else:
        article_titles = sel.xpath('//p/a/text()').extract()
        article_links = sel.xpath('//p/a/@href').extract()
        article_summaries = sel.xpath('//p/text()').extract()
        article_images = ''
        article_dates = ''

        # Now combine into dictionaries like above
        for i, title in enumerate(article_titles):

            if article_summaries[i]:
                article_summaries[i] = article_summaries[i].strip()

            if 'http://' not in article_links[
                    i] and 'https://' not in article_links[i]:
                article_links[i] = 'http://www.bbc.com' + article_links[i]

            article_info = {
                'article_title': title.strip(),
                'article_link': article_links[i],
                'article_summary': article_summaries[i],
                'article_image': article_images,
                'article_date': article_dates
            }

            articles_info['article_{}'.format(i + 1)] = article_info

    return articles_info
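Example #19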
def get_html(fours_url):
    try:
        print('正在爬取的连接', fours_url)
        resp_html = requests.get(fours_url, headers=headers)
        resp_html.encoding = 'gbk'
        resp_data = parsel.Selector(resp_html.text)
        data_list = resp_data.xpath(
            '//ul[@class="list-box"]/li[@class="list-item"]')
        for li in data_list:
            print(fours_url)
            """fetch the store detail page"""
            # get the detail page link
            store_name_link = 'https:' + li.xpath(
                './ul[@class="info-wrap"]/li[1]/a/@href').get()
            # request the detail page
            detail_resp = requests.get(headers=headers, url=store_name_link)
            detail_resp.encoding = 'gbk'
            # parse the response
            detail_data = parsel.Selector(detail_resp.text)

            # dealership (4S store) name
            store_name = detail_data.xpath(
                '//div[@id="breadnav"]/p/span[2]/text()').get()
            # print(store_name)

            # landline phone number
            telephone_number = detail_data.xpath(
                '//div[@id="400set"]/span[@class="dealer-api"]/span/text()'
            ).get()
            # print(telephone_number)

            # address
            address = detail_data.xpath(
                '//div[@id="dealerposi"]/div[@class="allagency-cont"]/p/@title'
            ).get()
            print(address)
            if address == '':
                continue
            # print(address)

            # city name
            city_name = detail_data.xpath(
                '//div[@id="breadnav"]/p/a/text()').get()
            # print('城市',city_name)

            # city id
            sql2 = "select city_id from national_cities where city_name ='{}'".format(
                city_name)
            cursor.execute(sql2)
            city_id = cursor.fetchone()[0]

            last_sync_time = datetime.datetime.now().strftime(
                "%Y-%m-28 23:35:23")

            # main brands carried
            brand_list = detail_data.xpath(
                '//div[@class="brandtree"]/div/p[@class="text"]/text()'
            ).getall()
            print(brand_list)

            brand_name = ''
            for bname in brand_list:
                brand_name = bname
                if brand_name == '阿尔法·罗密欧':
                    brand_name = '阿尔法・罗密欧'

                if brand_name == '阿斯顿·马丁':
                    brand_name = '阿斯顿・马丁'
                # brand id
                sql2 = "select brand_id from t_car_brand where brand_name='{}'".format(
                    brand_name)
                cursor.execute(sql2)
                brand_id = cursor.fetchone()[0]

                data = (brand_name, brand_id, store_name, telephone_number,
                        address, city_name, city_id, last_sync_time)
                print(data)
                # t_mbr_role_storehouse
                # store the data
                # check by model id and city id whether the record already exists;
                # the timestamp ensures only this month's data is kept in the table
                sql = "select * from t_mbr_role_storehouse where brand_name='{}'and brand_id='{}'and store_name ='{}' and telephone_number ='{}' and address='{}'and city_name='{}'".format(
                    data[0], data[1], data[2], data[3], data[4], data[5])
                cursor.execute(sql)
                many = cursor.fetchone()
                if many:
                    print('此数据表中已存在')
                else:
                    insert_sql = "insert into t_mbr_role_storehouse(brand_name,brand_id,store_name,telephone_number,address,city_name,city_id,last_sync_time)values (%s,%s,%s,%s,%s,%s,%s,%s)"
                    cursor.execute(insert_sql, data)
                    conn.commit()  # commit the data

                    print('数据提交完成')
    except TypeError:
        pass
Example #20
conn = pymysql.connect(host="112.126.89.134", user="******", password="******", port=3306, db="jgcproddb",charset="utf8")
# conn = pymysql.connect(host="localhost", user="******", password="", port=3306, db="jdbc", charset="utf8")
cursor = conn.cursor()  # create a cursor


headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
        'Connection': 'close'
    }


url = "https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId"

resp = requests.get(url=url,headers=headers)

response = parsel.Selector(resp.text)



# brand ids
ids = response.xpath('//body/ul/li/@id').getall()

base_url = 'https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId'

for var in ids:
    id = var.replace('b', '=')
    base_url= url + id
    base_response = requests.get(base_url,headers=headers)
    base_response.encoding='gbk'
    base_resp = parsel.Selector(base_response.text)
    # print(base_resp)
Example #21
def choice_fun31(ra2, e2, window4):
    global keyword
    flag = 0
    choice = ra2.get()
    keyword = e2.get()
    if choice == 1:
        PATH = path + keyword + '/'
        if os.path.exists(PATH):
            pass
        else:
            os.makedirs(PATH)
        # https://www.fabiaoqing.com/search/search/keyword/%E5%B0%8F%E9%BB%84%E9%B8%AD/type/bq.html
        url2 = 'https://www.fabiaoqing.com/search/search/keyword/' + keyword + '/type/bq.html'
        page_num = \
            parsel.Selector(request_url(url2).text).xpath('//div[@class="ui pagination menu"]/a/text()').extract()[
                -2].strip()
        tk.Label(window4, text="请输入下载页数:", font=('Arial', 10),
                 width=20).grid(column=2)
        tk.Label(window4,
                 text="(每页 45 张,共 " + page_num + " 页。)",
                 font=('Arial', 8),
                 width=30).grid(column=2)
        e1 = tk.Entry(window4, font=('Arial', 10), width=10)
        e1.grid(column=2)
        tk.Button(window4,
                  text='确认',
                  font=('Arial', 9),
                  width=6,
                  height=1,
                  command=lambda: check_download1(e1, PATH, window4, text2,
                                                  flag)).grid(column=2)
        tk.Label(window4, text=" ", width=8).grid()
        tk.Label(window4, text="下载情况:").grid()
        tk.Label(window4, text=" ", width=8).grid()
        text2 = ScrolledText(window4,
                             font=('微软雅黑', 10),
                             width=53,
                             height=12,
                             fg='blue')
        text2.grid()
    elif choice == 2:
        url3 = 'https://www.fabiaoqing.com/search/search/keyword/' + keyword + '/type/bqb.html'
        page_num = parsel.Selector(request_url(url3).text).xpath('//div[@class="ui pagination menu"]/a/text()') \
            .extract()[-2].strip()
        tk.Label(window4, text=" ", width=8).grid()
        tk.Label(window4, text="请输入下载页数:", font=('Arial', 10), width=20).grid()
        tk.Label(window4,
                 text="(每页 8 套,共 " + page_num + " 页。)",
                 font=('Arial', 8),
                 width=30).grid()
        e1 = tk.Entry(window4, font=('Arial', 10), width=10)
        e1.grid()
        tk.Button(window4,
                  text='确认',
                  font=('Arial', 9),
                  width=6,
                  height=1,
                  command=lambda: check_download2(url3, e1, text3, window4, 0)
                  ).grid()
        tk.Label(window4, text=" ", width=8).grid()
        tk.Label(window4, text="下载情况:").grid()
        tk.Label(window4, text=" ", width=8).grid()
        text3 = ScrolledText(window4,
                             font=('微软雅黑', 10),
                             width=53,
                             height=12,
                             fg='blue')
        text3.grid()
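Example #22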
async def get_html(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            html = await response.text()  # the response could also be read as bytes directly
            resp = parsel.Selector(html)
            tr_list = resp.xpath(
                '//table[@class="xl-table-def xl-table-a"]//tr[position()>1]')
            for tr in tr_list:
                # ranking = tr.xpath('./td[@class="xl-td-t1"]/text()').get()
                # # print('排名',ranking)

                series_name = tr.xpath('./td[2]/a/text()').get()
                if series_name == '宏光MINI EV':
                    series_name = '宏光MINIEV'
                # print('车系名称',series_name)
                """link to this series' sales page on xl.16888.com"""
                series_link = 'https://xl.16888.com' + tr.xpath(
                    './td[2]/a/@href').get()
                series_link_list.append(series_link)
                # print(series_link)
            """monthly sales data"""
            for url in series_link_list:
                detail_resp = requests.get(url, headers=headers).text
                detail_data = parsel.Selector(detail_resp)
                # print(detail_resp)
                # car series
                series_name = detail_data.xpath(
                    '//div[@class="xl-level-head clr"]/span/text()').get(
                    ).replace('销量详情', '')
                if series_name == '宏光MINI EV':
                    series_name = '宏光MINIEV'
                """look up the series id and manufacturer id"""
                cursor.execute(
                    "select series_id,car_series_id,car_series,car_brand_id from t_car_category where category_fullname='{}'"
                    .format(series_name))
                id_list = cursor.fetchall()
                # series id
                series_id = ''
                # manufacturer id
                car_series_id = ''
                # manufacturer name
                car_series = ''
                # brand id
                brand_id = ''
                for id in id_list:
                    series_id = id[0]
                    car_series_id = id[1]
                    car_series = id[2]
                    brand_id = id[3]
                """look up the brand name"""
                cursor.execute(
                    "select brand_name from t_car_brand where brand_id='{}'".
                    format(brand_id))
                brand_name = cursor.fetchone()[0]

                tr_list = detail_data.xpath(
                    '//table[@class="xl-table-def xl-table-a"]/tr[position()>1][position()<24]'
                )
                # print(tr_list)
                for tr in tr_list:
                    # month
                    sale_time = tr.xpath('./td[1]/text()').get()
                    # print(sale_time)

                    # monthly sales
                    monthly_sales = tr.xpath('./td[2]/text()').get()
                    # print(monthly_sales)

                    # current overall sales ranking
                    now_monthly_sales = tr.xpath('./td[3]/a/text()').get()
                    # print(now_monthly_sales)

                    # share of the manufacturer's sales
                    share_of_manufacturers = tr.xpath('./td[4]/text()').get()
                    # print(share_of_manufacturers)

                    # ranking within the manufacturer
                    ranking_among_manufacturers = tr.xpath(
                        './td[5]/a/text()').get()
                    # print(ranking_among_manufacturers)

                    # ranking among SUVs
                    ranking_in_suv = tr.xpath('./td[6]/a/text()').get()
                    # print(ranking_in_suv)

                    # update time
                    last_sync_time = datetime.datetime.now().strftime(
                        "%Y-%m-28 23:35:23")
                    data = (series_id, series_name, car_series_id, car_series,
                            brand_id, brand_name, sale_time, monthly_sales,
                            now_monthly_sales, share_of_manufacturers,
                            ranking_among_manufacturers, ranking_in_suv,
                            last_sync_time)
                    data_list.append(data)
Example #23
File: yiwan.py  Project: 506780892/spider
import requests
import parsel

with open('易玩角色扮演.csv', 'a', encoding='utf-8') as f:
    biaoti = f'{"名字"},{"id"},{"标签"},{"简介"},{"下载地址"}'
    f.write(biaoti)
    f.write('\n')
for pag in range(1, 101):
    url = f'http://www.yiwan.com/az/3_0_new_{pag}/'

    response = requests.get(url=url, verify=False)
    selector = parsel.Selector(response.text)
    listli = selector.xpath("//div[@class='r-content softlist']/ul/li")
    for li in listli:
        name = li.xpath(
            "./div[@class='softlist-t']/h3[@class='softlist-t2']/a/text()"
        ).get()
        id = li.xpath("./div[@class='softlist-download']/a/@href").re(
            "game/(.*?)/")[0]
        xqlj = "http://www.yiwan.com/" + li.xpath(
            "./div[@class='softlist-download']/a/@href").get()
        res = requests.get(url=xqlj)
        sel = parsel.Selector(res.text)
        downurl = sel.xpath("//div[@class='gi_r']/a/@href").get().strip("\n")
        if downurl == "javascript:;":
            downurl = ""
        tag = li.xpath(
            "./div[@class='softlist-t']/p[@class='softlist-t4']//a/text()"
        ).getall()
        # tag = []
        # for tag in tags:
Example #24
import requests
import re
from fake_useragent import UserAgent
import parsel

url='https://tieba.baidu.com/f?kw=%CA%AF%D4%AD%C0%EF%C3%C0&fr=ala0&loc=rec'
headers = {'User-Agent': UserAgent().chrome}
response = requests.get(url, headers=headers).text
#print(response)

# parse
html=parsel.Selector(response)
print(html)
title=html.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href').getall()
print(title)

# join base URL and thread path
furl='https://tieba.baidu.com'


for tit in title:
    ur=furl+tit
    #print('当前地址为',ur)
    # request the thread page
    response2 = requests.get(ur, headers=headers).text

    f_img=parsel.Selector(response2)
    # parse again
    img_data=f_img.xpath('//cc/div/img[@class="BDE_Image"]/@src').getall()
    print(img_data)
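    # a minimal save step (sketch): the file names below are made up and repeat
    # across threads, so a real script would want something unique per image
    for num, img_url in enumerate(img_data):
        img_bytes = requests.get(img_url, headers=headers).content
        with open('tieba_{}.jpg'.format(num), mode='wb') as f:
            f.write(img_bytes)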
Example #25
# coding = utf-8
import requests
import parsel

url = 'http://www.win4000.com/zt/yingxionglianmeng.html'
response = requests.get(url=url).text
html = parsel.Selector(response)
img_url_a = html.xpath("//div[@class='tab_box']//a/@href").getall()
print(img_url_a)
i = 1
for url in img_url_a:
    response1 = requests.get(url).text
    html1 = parsel.Selector(response1)
    jpg_url = html1.xpath("//div[@class='pic-meinv']//img/@src").get()
    print(jpg_url)
    jpg = requests.get(jpg_url).content


    print(i)
    with open('图片{}.jpg'.format(i), 'wb') as fp:
        fp.write(jpg)
        print('保存成功', i)
    i += 1
Example #26
def parse_single_league_page(html, league_name):
    selector = parsel.Selector(html)
    club_names = selector.xpath('./body/tbody/tr/td/a/text()').extract()
    return [{'club': club, 'league': league_name} for club in club_names]
Example #27
import parsel
import os
import requests

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

for page in range(1, 310):  # Total 309pages

    print(f'======= Scraping data from page {page} =======')

    url = f'https://www.bikeexif.com/page/{page}'

    response = requests.get(url, headers=headers)
    selector = parsel.Selector(response.text)

    containers = selector.xpath(
        '//div[@class="container"]/div/article[@class="smallhalf"]')

    for v in containers:

        old_title = v.xpath(
            './/div[2]/h2/a/text()').get()  #.replace(':', ' -')
        if old_title is not None:
            title = old_title.replace(':', ' -')

        title_url = v.xpath('.//div[2]/h2/a/@href').get()
        print(title, title_url)

        os.makedirs(os.path.join('img', title), exist_ok=True)
Example #28
import parsel
import requests

url = "https://www.138u.cn/"
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43'
}
respond = requests.get(url=url, headers=headers)
respond.raise_for_status()
respond.encoding = "UTF-8"
data = respond.text

select1 = parsel.Selector(data)
title = select1.xpath('//div[@class="card-deck"]//div/a/img/@alt').getall()

select2 = parsel.Selector(data)
photo_url = select2.xpath(
    '//div[@class="card-deck"]//div/a/img/@data-original').getall()

for each, i in zip(photo_url, title):
    MyPhoto = requests.get(url=each, headers=headers, timeout=50).content

    with open('../辅助爬取/Photoes/' + i + '.jpg', mode="wb") as f:
        f.write(MyPhoto)
        print(i + "成功了")
print("------finished------")
Example #29
def extract_headlines(html_content, modifier, logger):
    '''
    Extract headlines from Skysports pages
    The modifier points to regional articles or not
    '''

    sel = pr.Selector(html_content)
    articles_info = {}

    if modifier:
        # For the regional articles, would like to tag on the source
        sources = sel.xpath('//div[@class = "paper-stories"]')
        article_titles = []
        for source in sources:
            source_name = source.xpath('.//p/text()').extract_first()
            article_titles.extend([
                source_name + ' - ' + title
                for title in source.xpath('.//li//text()').extract()
            ])  # Skybet means need // for text

        # Fill in rest with blanks or filler data (links - for title of file)
        article_links = [
            '/fake_link/article_' + str(i)
            for i in range(0, len(article_titles))
        ]
        article_summaries = [''] * len(article_titles)
        article_images = [''] * len(article_titles)
        article_dates = [''] * len(article_titles)

    else:
        # Declare an extra condition because sometimes it would link to a generic article that caused bugs
        extra_condition = './div[@class = "figure span1/3 -spr0-5"]/a/@href = "http://www.skysports.com/transfer-centre"'

        # Note that there is a "show more" section that cannot load HTML for and three different types of article in general
        transfer_headlines = sel.xpath(
            '//div[@class = "box media -vertical -bp20-horizontal"]')
        transfer_sublines = sel.xpath(
            '//div[@class = "box media -bp30-vertical" and not({})]'.format(
                extra_condition))
        transfer_sublinks = sel.xpath('//ul[@class = "list -bullet text-s"]')

        # For main headlines
        article_titles = transfer_headlines.xpath(
            './/a[@class = "-a-block -clear"]/h2/text()').extract()
        article_links = transfer_headlines.xpath(
            './/a[@class = "-a-block -clear"]/@href').extract()
        article_summaries = transfer_headlines.xpath(
            './/a[@class = "-a-block -clear"]/p/text()').extract()
        article_images = transfer_headlines.xpath('.//img/@data-src').re(
            '[^\/](\/[^\./]*\.[A-z]*|#)'
        )  # Found one with a ? in the middle - first [] removes things after ://
        article_dates = [''] * len(article_titles)

        # For subheadlines
        article_titles.extend(
            transfer_sublines.xpath('.//h2/text()').extract())
        article_links.extend(
            transfer_sublines.xpath('.//a[not(@class)]/@href').extract())
        article_summaries.extend(
            transfer_sublines.xpath('.//a[not(@class)]/p/text()').extract())
        article_images.extend(
            transfer_sublines.xpath('.//img/@alt | .//img/@data-src').extract(
            ))  # Not sure how to extract one and re the other
        article_dates.extend(
            transfer_sublines.xpath(
                './/h5[@class = "caption"]/text()').extract())

        # Sublinks in headlines
        sublink_titles = transfer_sublinks.xpath('./li/a/text()').extract(
        )  # get the titles to fill in blanks later
        article_titles.extend(sublink_titles)
        article_links.extend(transfer_sublinks.xpath('./li/a/@href').extract())
        article_summaries.extend([''] * len(sublink_titles))
        article_images.extend([''] * len(sublink_titles))
        article_dates.extend([''] * len(sublink_titles))

    # Now combine into dictionaries
    for i, title in enumerate(article_titles):
        if article_summaries[i]:
            article_summaries[i] = article_summaries[i].strip()

        if article_links[i] != '' and 'http://' not in article_links[
                i] and 'https://' not in article_links[i]:
            article_links[i] = 'http://www.skysports.com' + article_links[i]

        article_info = {
            'article_title': title.strip(),
            'article_link': article_links[i],
            'article_summary': article_summaries[i],
            'article_image': article_images[i],
            'article_date': article_dates[i]
        }

        articles_info['article_{}'.format(i + 1)] = article_info

    return articles_info
Example #30
        """
        108
        120
        132
        step of 12 between offsets
        """
        one_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
            'Referer':'https://www.pearvideo.com/category_31'
        }

        response = requests.get(url,headers=one_headers,verify=False).text
        # print(response)
        # parse the response into a Selector
        resp_html = parsel.Selector(response)
        # parse out the list of <li> nodes
        li_list = resp_html.xpath('//body/li')
        # print(li_list)

        for li in li_list:
            detail_link = 'https://www.pearvideo.com/'+li.xpath('.//a/@href').get()
            # print(detail_link)

            # request the detail page
            detail_page_data = requests.get(detail_link,verify=False).text
            detail_data = parsel.Selector(detail_page_data)

            contID = detail_link.split('_')[1]

            video_date = f'https://www.pearvideo.com/videoStatus.jsp?contId={contID}&mrd=0.7890920916276076'