# Example #1
#!/usr/local/python3/bin/python3

from lxml import etree
import os

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='lxml.log')

html = '''
<html>
<title> This is Title </title>
<body>
	<h1> This is h1 </h1>
	<div> This is fisrt div </div>
	<div id="divid">
		<img src="1111.png"/>
		<span id="sp1"> desc 1111.png </span>

		<img src="2222.png"/>
		<span id="sp2"> desc 2222.png </span>

		<p>
			<a href="http://www.xxxxx.com/"> link-of-xxxxxx </a>
		</p>

		<a href="http://www.yyyyyyy.com/"> link-of-yyyyyyyyy </a>
		<br/>
		<a href="http://www.zzzzzzz.com/"> link-of-zzzzzzzzz </a>

	</div>
import os
import time

from selenium import webdriver
import selenium.common

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name="play_tengxun.log")


def play_tengxun_video():
    index_url = "https://v.qq.com/"
    # 设置无界面
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(chrome_options=options)

    # debug timeout的情况下再试一次
    n = 1
    while n < 3:
        try:
            driver.get(index_url)

            # 当前窗口
            index_win = driver.current_window_handle

            # 进入动漫频道
# Example #3
# encoding:utf-8
import urllib.request
import os

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='check_proxy.log')
'''
httpproxy=urllib2.ProxyHandler({"http":"10.36.132.41:808"})#代理无需账号
opener=urllib2.build_opener(httpproxy)#创建一个打开器
request=urllib2.Request("http://www.baidu.com") #访问百度
response=opener.open(request)#打开网页,内置代理服务器
print response.read()
'''


# 测试代理是否可用
#  测试个多代理
def check_proxys(proxy_dict_list):
    if not isinstance(proxy_dict_list, list) and not isinstance(
            proxy_dict_list[0], dict):
        logger.debug("请输入正确的代理")
    i = 0
    for proxy_dict in proxy_dict_list:
        proxy = urllib.request.ProxyHandler(proxy_dict)
        # nohttpproxy=urllib.request.ProxyHandler({}) #空代理
        opener = urllib.request.build_opener(proxy)
        request = urllib.request.Request(
            "http://www.baidu.com/")  #代理访问,URL必须完整,
        try:
            response = opener.open(request, timeout=4)
import os

import selenium.webdriver
from selenium.webdriver.chrome.options import Options

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='chrome.log')


def use_chrome_withNonhead():
    """Load a blog page with headless Chrome and print its HTML source."""
    opts = Options()
    # Flags required for a stable headless run (order kept from original).
    for flag in (
            "--no-sandbox",                        # required when running as root
            "--headless",                          # no visible browser window
            "window-size=1920x3000",               # fixed browser resolution
            "--disable-gpu",                       # works around a Chrome headless bug
            "--hide-scrollbars",                   # hide scrollbars on odd pages
            "blink-settings=imagesEnabled=false"): # skip image loading
        opts.add_argument(flag)

    browser = selenium.webdriver.Chrome(chrome_options=opts)
    browser.get('https://www.cnblogs.com/z-x-y/p/9026226.html')

    print(browser.page_source)


if __name__ == '__main__':
    use_chrome_withNonhead()
# Example #5
import selenium.webdriver
import os
import time

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='qq_screen.log')


def get_qq_screen():
    """Open the Baidu front page in PhantomJS and save a screenshot of it."""
    browser = selenium.webdriver.PhantomJS()
    browser.get('http://www.baidu.com/')  # https://www.baidu.com/
    # time.sleep(5)
    # Dump the driver's attribute list, then save the screenshot
    # (original filename typo "biadu" preserved on purpose).
    print(dir(browser))
    browser.save_screenshot(os.getcwd() + '/biadu_screen.png')

    browser.close()


if __name__ == '__main__':
    get_qq_screen()
# Example #6
#!/usr/local/python3/bin/python3
'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=1'
import selenium
import selenium.webdriver
import urllib.parse
import os
import re
import urllib.request

from Tools.tools import debug_log

logger = debug_log(os.getcwd())


def get_pages():
    """Placeholder: page-count extraction is not implemented yet."""
    return None


def get_url(addr='深圳', search_word='python'):
    data = {'jl': addr, 'kw': search_word}
    data = urllib.parse.urlencode(data)
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + data + '&sm=0&p=1'
    logger.debug(url)
    driver = selenium.webdriver.PhantomJS()
    driver.get(url)

    page_source = driver.page_source

    # logger.debug(page_source)
    restr = 'href=(\s\S*?)'
    src_pattern = re.compile(restr)
import time
import os
import re

from selenium import webdriver


from Tools.tools import debug_log


logger = debug_log(os.getcwd(), name='windows.log')
def handle_windowns():
    index_url = "https://baike.baidu.com/item/Java/85979?fr=aladdin"

    # 设置无界面
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(chrome_options=options)
    driver.get(index_url)

    # 主窗口
    index_win = driver.current_window_handle

    # 进行登陆
    login_elem = driver.find_element_by_link_text("登录")
    login_elem.click()
    login_win = driver.current_window_handle
    time.sleep(3)
import time
import os

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='solve_some_tags.log')


def solve_some_tags():
    url = "https://www.baidu.com/"

    # 设置无界面
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)

    # 进入设置页面
    setting_elem = driver.find_element_by_link_text("设置")
    ActionChains(driver).move_to_element(setting_elem).perform()
    search_settring_elem = driver.find_element_by_link_text("搜索设置")
    search_settring_elem.click()

    # 进行select标签的设置
# Example #9
import os

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='lovepython.log')


# join 函数
def test_join():
    """Demonstrate str.join: put a space between each character of a word.

    Prints "p y t h o n".  Fixes: removed the unused local that shadowed
    the builtin ``str``.
    """
    word = 'python'
    spaced = ' '.join(word)  # join iterates the string character by character

    print(spaced)


# is 和 == 的区别
def test_is():
    """Show identity vs equality: CPython caches small ints, so two
    variables bound to 100 refer to the same object and ``is`` is True."""
    first, second, other = 100, 100, 50
    print(id(first))
    print(id(second))
    print(id(other))
    print(first is second)
    # Unlike Java, a Python variable is a reference to an object, so equal
    # small integers share one cached object.


# 深度拷贝和浅拷贝
def test_deepcopy():
# Example #10
import selenium.webdriver
import os
import time

from Tools.tools import debug_log
from selenium.webdriver.support.select import Select

logger = debug_log(os.getcwd(), name='login_qq_mailbox.log')


def login_qq_mailbox():
    driver = selenium.webdriver.PhantomJS()
    driver.get('https://qzone.qq.com/')
    # driver.get('https://blog.csdn.net/linlu_home/article/details/78799878')
    time.sleep(3)

    #login user u and p
    '''
                element = driver.switch_to.active_element
                alert = driver.switch_to.alert
                driver.switch_to.default_content()
                driver.switch_to.frame('frame_name')
                driver.switch_to.frame(1)
                driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
                driver.switch_to.parent_frame()
                driver.switch_to.window('main')
    '''
    # driver.switch_to.frame('login_frame')
    driver.switch_to_frame('login_frame')
    # driver.switch_to.frame(driver.find_element_by_xpath("//div[@id='login_div']/iframe"))
    print(driver.page_source)
# Example #11
import requests
import os


from Tools.tools import debug_log


logger = debug_log(os.getcwd(), name='requests_session.log')

# reqiests.session进行登陆
def login_withsession():
    """Log in to the demo OA site with one requests.Session and print the
    landing page.

    A single session is used so the login cookie from the POST is sent
    automatically on the follow-up GET.
    """
    session = requests.session()
    params = {
        'emp_no':	'admin',
        'password':	'******',
    }
    login_url = 'http://demo.smeoa.com/index.php?m=&c=public&a=check_login'
    response = session.post(login_url, params)

    responsenex = session.get('http://demo.smeoa.com/index.php?m=&c=index&a=index')

    # BUG FIX: RequestsCookieJar.get() requires a cookie *name*; calling it
    # with no arguments raised TypeError.  Fetch the whole jar as a dict
    # instead (the duplicate unused `cookies2` was dropped).
    cookies = response.cookies.get_dict()

    print(responsenex.text)


# 使用requests.post登陆
def login_oswith_request_post():
    data = {
# Example #12
import subprocess
import os

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='identify_picture_code.log')


def identify_picture_code():
    """Run the tesseract OCR binary on ``3.png`` and log the first line
    of the recognized text.

    Requires ``tesseract`` on PATH; it writes its result to ``3.png.txt``,
    which is then read back.
    """
    # Run the OCR tool as a subprocess and wait for it to finish.
    p = subprocess.Popen(['tesseract', '3.png', '3.png'],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    p.wait()

    # BUG FIX: the original opened the result file without ever closing it;
    # `with` guarantees the handle is released.
    with open('3.png.txt', 'r') as test_file:
        line = test_file.readline()
    line = line.replace('\n', '')

    logger.debug(type(line))
    logger.debug(line)


if __name__ == '__main__':
    identify_picture_code()
    pass
# Example #13
import http.cookiejar
import urllib.request
import os
import re
import urllib.parse
import random

# 使用xpath获取数据不成功
# from lxml import etree

from Tools.check_proxy import check_proxy
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='login_csdnwithcookie.log')


def login_csdnwithcookie():
    # 验证并使用代理
    '''
    HTTP
    浙江省温州市 电信
    117.87.178.31
    HTTP
    江苏省徐州市 电信
    115.223.234.116
    HTTP
    浙江省温州市 电信
    101.71.226.188

    '''
    if_proxy = False  # 是否开启代理
# Example #14
import selenium.webdriver
import selenium.webdriver.common.keys
import os
import time

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='selenium_baidu.log')


def click_baidu():
    """Search Baidu for "python" with PhantomJS, screenshotting before and
    after clicking the search button."""
    driver = selenium.webdriver.PhantomJS()
    driver.get("https://www.baidu.com/")
    time.sleep(2)
    keyword_elem = driver.find_element_by_id('kw')  # the search input box

    keyword_elem.send_keys("python")
    # BUG FIX: os.getcwd() has no trailing separator, so
    # `os.getcwd() + "baidu1.png"` wrote e.g. "/tmp/projbaidu1.png" next to
    # (not inside) the working directory.  Join the path properly.
    driver.save_screenshot(os.path.join(os.getcwd(), "baidu1.png"))

    # Click the search button and capture the result page.
    click_elem = driver.find_element_by_id("su")
    click_elem.click()
    driver.save_screenshot(os.path.join(os.getcwd(), "baidu2.png"))


if __name__ == '__main__':
    click_baidu()
# Example #15
import os
import re
import time

from selenium import webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='login_jd.log')


# 未成功, 元素没有加载
def login_jd():
    login_url = "https://plogin.m.jd.com/user/login.action?appid=100&kpkey=&returnurl=http%3A%2F%2Fhome.m.jd.com%2FmyJd%2Fhome.action%3Fsid%3D583ee9874b9874ddf1515a4ada050e44"
    aim_url = ""
    # 无界面设置
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(chrome_options=options)
    driver.get(login_url)

    # 登陆
    username_elem = driver.find_element_by_id("username")
    password_elem = driver.find_element_by_id("password")
    login_elem = driver.find_element_by_id("loginBtn")

    time.sleep(4)
    username_elem.send_keys('17688166224')
# Example #16
import os
import time

from selenium import webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name="dowload.log")


def dowload_file():
    index_url = "https://pypi.org/project/selenium/"
    current_dir = os.getcwd()
    # 设置无界面
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")

    # 配置下载文件点击下载   # 测试失败
    # prefs = {"profile.default_content_settings.popups":0, "download.default_directory": r"/workspace/sofeware/spider/pro2/12 selenium设置文件点击自动下载/"}
    prefs = {
        'profile.default_content_settings.popups': 0,
        'download.default_directory': current_dir
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(chrome_options=options)
    driver.get(index_url)

    # 进入下载页面
# Example #17
import time
import os

import selenium.webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='oa_login.log')


# 判断是否登陆
def if_logined(username=None, passwd=None):
    try:
        login_url = 'http://demo.smeoa.com/index.php?m=&c=public&a=login'
        driver = selenium.webdriver.PhantomJS()
        driver.get(login_url)
        time.sleep(2)

        # 登陆
        username_elem = driver.find_element_by_id('emp_no')
        passwd_elem = driver.find_element_by_id('password')
        login_elem = driver.find_element_by_id('login_btn')

        username_elem.send_keys(username)
        passwd_elem.send_keys(passwd)
        login_elem.click()
        time.sleep(2)

        # 获取网页代码
        person_index_url = 'http://demo.smeoa.com/index.php?m=&c=index&a=index'
        driver.get(person_index_url)
# Example #18
import urllib.parse
import urllib.request
import os
import json
import ssl


from Tools.tools import debug_log
logger = debug_log(os.getcwd(),name='debug.log')
# 使用伪装浏览器以及标准url规范爬取智联招聘
def get_zhilian(addr='深圳', position='python'):
    """Fetch a Zhaopin job-search result page and print its HTML.

    Builds the search URL from *addr* (city) and *position* (keyword),
    requests it with an iPhone User-Agent, prints the decoded body and
    returns the open response object.
    """
    # Example target:
    # http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=1
    city_query = urllib.parse.urlencode({'jl': addr})
    keyword_query = urllib.parse.urlencode({'kw': position})
    url = ('http://sou.zhaopin.com/jobs/searchresult.ashx?'
           + city_query + '&' + keyword_query + '&sm=0&p=1')

    headers = {
        'User-Agent': "User-Agent:Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"
    }

    response = urllib.request.urlopen(
        urllib.request.Request(url=url, headers=headers))

    print(response.read().decode())

    return response
# Example #19
import os
import time

from selenium import webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='shucheng.log')


# 创建书本章节对象
class Book():
    """A scraped book: its index, title, chapter URLs and chapter objects."""

    def __init__(self, booke_num=0, book_name="未知"):
        self.booke_num = booke_num    # book index (int)
        self.book_name = book_name    # book title; default "未知" = unknown
        self.capter_url_list = []     # URL of each chapter page, in order
        self.capters = []             # Capter objects collected so far


# 内容存储对象
# 章节对象
class Capter():
    """One chapter of a Book: ordinal number, title and body text."""

    def __init__(self, capter_num, capter_title, content):
        # capter_num is the chapter's ordinal position (int).
        self.capter_num = capter_num
        self.capter_title = capter_title
        self.content = content


# 创建driver
def create_driver(url):
    # 设置无界面
# Example #20
#!/usr/local/python3/bin/python3
import os
import urllib.request
import http.cookiejar
import urllib.parse
import gzip

import selenium.webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='myprogram2.log')


# 使用open容器捉取网页cookie
def read_cookie():
    """Fetch baidu.com through a proxy, collect the cookies the server
    sets, log them and return them as one "name=value;\\r\\n" string."""
    # Route the request through a proxy IP.
    proxy_hander = urllib.request.ProxyHandler({'https': '123.57.207.2'})
    jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(jar), proxy_hander)

    opener.open('http://www.baidu.com')

    # Flatten the jar into a single cookie-header-style string.
    cookies = ''.join(c.name + '=' + c.value + ';\r\n' for c in jar)

    logger.debug(cookies)
    return cookies
# Example #21
import time
import re
import os

from selenium import webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name="proxy.log")


# 设计存储对象
class Proxy():
    """A scraped proxy record: IP, port, protocol type and region."""

    def __init__(self, ip=None, port=0, type=None, addr=None):
        # NOTE: the parameter name `type` shadows the builtin inside
        # __init__ only; it is kept for keyword-caller compatibility.
        self.ip = ip
        self.port = port
        self.type = type
        self.addr = addr


def create_driver(url):
    # 设置无界面
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(chrome_options=options)

    driver.get(url)
# Example #22
import os
import time

from selenium import webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name="play_video.log")


def play_video():
    """Open videojs.com in headless Chrome, click the big play button,
    wait five seconds and save a screenshot of the playing video."""
    url = "http://videojs.com/"
    # Headless Chrome configuration.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(url)

        # Click the player's start button.
        # video_elem = driver.find_element_by_id("preview-player")
        video_emem = driver.find_element_by_class_name("vjs-big-play-button")
        video_emem.click()

        # Screenshot after the video has had time to start playing.
        time.sleep(5)
        driver.save_screenshot(os.getcwd() + "/play_video.png")
    finally:
        # BUG FIX: quit even when an element lookup raises, otherwise the
        # headless Chrome process leaks.
        driver.quit()
# Example #23
import urllib.request
import os
import re

from lxml import etree

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='jiaobenzhijia.log')


# 获取网页代码
def get_page_source(url=None):
    """Return the GB2312-decoded HTML body of *url*, or None on a bad URL.

    A ValueError from a malformed/missing URL is logged and swallowed.
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
    }
    try:
        page = urllib.request.urlopen(
            urllib.request.Request(url=url, headers=headers))
        return page.read().decode('gb2312')
    except ValueError as e:
        logger.error(e)
        logger.debug('参数不正确, 路由错误')


# 获取网页中需要的url
def get_info_url(url):
    page_source = get_page_source(url)
    xpath = "//div[@class='artlist clearfix']/dl/dt/a/@href"
    xpath2 = "//div[@class='artlist clearfix']/dl/dt/a/@title"
    html = etree.HTML(page_source)
# Example #24
import re
import os

from selenium import webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='04 dowload_meinv.log')


def create_driver(url=None):
    """Start a headless Chrome, navigate to *url* and return the driver."""
    # Headless configuration flags (same set, applied in the same order).
    opts = webdriver.ChromeOptions()
    for flag in ("--no-sandbox", "--headless", "--disable-gpu"):
        opts.add_argument(flag)

    browser = webdriver.Chrome(chrome_options=opts)

    browser.get(url)
    return browser


# all_page_urls
# def get_all_page_urls():
#     url = 'http://m.umei.cc/p/gaoqing/rihan/1.htm'
#     # url = 'http://m.umei.cc/p/gaoqing/rihan/'
#     driver = create_driver(url)
#
#     res_page_num = "<strong id='pagelist_all'>(\d+)</strong>"
#     page_num = re.findall(res_page_num, driver.page_source)
# Example #25
import random
import urllib.request
import urllib.parse
import os
import re
import time


from lxml import etree


from Tools.tools import debug_log


logger = debug_log(os.getcwd(), name='etree.log')
def get_page_source(url=None):
    """Return the UTF-8-decoded body of *url* fetched with a mobile
    User-Agent; log and return None when the URL is malformed."""
    headers = {
        "User-Agent": "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
    }
    # Guard against an empty or invalid url.
    try:
        reply = urllib.request.urlopen(
            urllib.request.Request(url=url, headers=headers))
        return reply.read().decode('utf-8')
    except ValueError:
        logger.debug('url格式有误')


def get_page_urls():
    '''
    'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python&isadv=0&sg=39e470246d7e4727944af8c5e9417893&p=4'
# Example #26
import urllib.request
import os

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='get_picture.log')


def get_picture():
    """Fetch a WeChat article page with urllib and print the response's
    attribute list followed by its UTF-8 body."""
    article = urllib.request.urlopen(
        "https://mp.weixin.qq.com/s/MevhhvosfM3Q9SRaUtpN4Q")
    print(dir(article))
    print(article.read().decode('utf-8'))


def get_picture_with_requests():
    """Fetch the same WeChat article with requests (TLS verification off)."""
    import requests
    article_url = 'https://mp.weixin.qq.com/s/MevhhvosfM3Q9SRaUtpN4Q'
    print(requests.get(article_url, verify=False).text)


# 使用selenium
def get_picture_with_selenim():
    import selenium.webdriver

    url = 'https://mp.weixin.qq.com/s/MevhhvosfM3Q9SRaUtpN4Q'
    driver = selenium.webdriver.PhantomJS()
    driver.get(url)

    # picture_span_elems = driver.find_elements_by_xpath("//span[@style='font-size: 15px;']//img/")
# Example #27
import os
import time

from selenium import webdriver

from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='login_taobao.log')


# 网页端无登陆按钮,直接抓取
def login_taobao():
    login_url = ""
    aim_url = ""
    # chrome无界面配置
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(chrome_options=options)
    driver.get(login_url)

    print(driver.page_source)