Пример #1
0
    def get_html(self, url):
        """
        Fetch ``url`` through a randomly chosen proxy and parse the page.

        ``get_ips()`` is unpacked into two parallel lists: ``ps`` (used as
        the keys of the proxy mapping) and ``ips`` (used as its values).
        A random entry is tried with a 3 second timeout.  When the attempt
        fails, the error and the offending proxy are printed, the entry is
        removed from both lists, and another entry is tried.

        :param url: address of the page to download.
        :return: the page body, decoded as UTF-8 and parsed by
            ``BeautifulSoup`` with the ``html.parser`` backend.
        :raises IndexError: when every proxy has failed or none was supplied.
        """
        ps, ips = get_ips()

        # Retry in a loop rather than by recursion, so a long list of dead
        # proxies cannot exhaust the interpreter's recursion limit.
        resp = None
        while resp is None:
            if not ps:
                # Picking from an empty range already failed at this point
                # before; fail the same way, but say why.
                raise IndexError(f'no working proxy left to fetch {url}')

            index = choice(range(len(ps)))
            try:
                proxy = request.ProxyHandler({ps[index]: ips[index]})
                request.install_opener(request.build_opener(proxy))
                req = Request(url, headers=self.HEADERS)
                resp = request.urlopen(req, timeout=3)
            except Exception as e:
                print(e, f'{ps[index]}://{ips[index]}')
                # Drop the failed proxy so it is not picked again.
                ps.pop(index)
                ips.pop(index)

        # Close the connection as soon as the body has been read.
        with resp:
            html = resp.read().decode('utf8')  # body arrives as bytes

        return BeautifulSoup(html, 'html.parser')
Пример #2
0
def test_rpc(net_type):
    """Launch ``rpc_basic.py`` as server and client on every machine.

    The machine list comes from ``utils.get_ips`` applied to the file named
    by ``DIST_DGL_TEST_IP_CONFIG`` (default ``ip_config.txt``).  One server
    and one client are started per machine through ``utils.execute_remote``;
    every launched process must finish with exit code 0.

    :param net_type: value forwarded as ``DIST_DGL_TEST_NET_TYPE``.
    """
    num_servers = 1
    num_clients = 1

    ip_config = os.environ.get('DIST_DGL_TEST_IP_CONFIG', 'ip_config.txt')
    ips = utils.get_ips(ip_config)
    num_machines = len(ips)

    bin_dir = os.environ.get('DIST_DGL_TEST_PY_BIN_DIR', '.')
    test_bin = os.path.join(bin_dir, 'rpc_basic.py')
    launch_suffix = " python3 " + test_bin

    # Environment shared by every process, servers and clients alike.
    base_envs = dgl_envs + (
        " DGL_DIST_MODE=distributed"
        f" DIST_DGL_TEST_IP_CONFIG={ip_config}"
        f" DIST_DGL_TEST_NUM_SERVERS={num_servers}"
        f" DIST_DGL_TEST_NET_TYPE={net_type} "
    )

    procs = []

    # Servers: ids are assigned consecutively, machine by machine.
    total_clients = num_clients * num_machines
    for machine_idx, ip in enumerate(ips):
        for local_idx in range(num_servers):
            server_id = machine_idx * num_servers + local_idx
            role_envs = (
                " DIST_DGL_TEST_ROLE=server"
                f" DIST_DGL_TEST_SERVER_ID={server_id}"
                f" DIST_DGL_TEST_NUM_CLIENTS={total_clients} "
            )
            procs.append(
                utils.execute_remote(base_envs + role_envs + launch_suffix,
                                     ip))

    # Clients: all of them share the same environment and command line.
    client_cmd = (base_envs
                  + " DIST_DGL_TEST_ROLE=client DIST_DGL_TEST_GROUP_ID=0 "
                  + launch_suffix)
    procs.extend(
        utils.execute_remote(client_cmd, ip)
        for ip in ips
        for _ in range(num_clients))

    for proc in procs:
        proc.join()
        assert proc.exitcode == 0
Пример #3
0
def test_dist_objects(target, net_type, num_servers, num_clients, hetero,
                      shared_mem):
    """Run ``run_dist_objects.py`` as servers and clients on every machine.

    One graph partition is used per machine listed by ``utils.get_ips``.
    The graph is created with ``create_graph`` when its directory does not
    exist yet under the workspace.  Every launched process must finish with
    exit code 0.

    :param target: forwarded to clients as ``DIST_DGL_TEST_OBJECT_TYPE``.
    :param net_type: forwarded as ``DIST_DGL_TEST_NET_TYPE``.
    :param num_servers: number of server processes started per machine.
    :param num_clients: number of client processes started per machine.
    :param hetero: selects the ``hetero_dist_graph`` directory instead of
        ``dist_graph`` and is passed on to ``create_graph``.
    :param shared_mem: forwarded to servers as ``DIST_DGL_TEST_SHARED_MEM``
        (as ``0``/``1``); the test is skipped when it is false and more than
        one server per machine is requested.
    """
    if num_servers > 1 and not shared_mem:
        pytest.skip(
            "Backup servers are not supported when shared memory is disabled")

    def render_envs(pairs):
        # Render (name, value) pairs as "NAME=value " chunks, in order.
        return "".join(f"{name}={value} " for name, value in pairs)

    ip_config = os.environ.get('DIST_DGL_TEST_IP_CONFIG', 'ip_config.txt')
    workspace = os.environ.get('DIST_DGL_TEST_WORKSPACE',
                               '/shared_workspace/dgl_dist_tensor_test/')

    ips = utils.get_ips(ip_config)
    num_part = len(ips)

    bin_dir = os.environ.get('DIST_DGL_TEST_PY_BIN_DIR', '.')
    test_bin = os.path.join(bin_dir, 'run_dist_objects.py')

    graph_dir_name = 'hetero_dist_graph' if hetero else 'dist_graph'
    dist_graph_path = os.path.join(workspace, graph_dir_name)
    if not os.path.isdir(dist_graph_path):
        create_graph(num_part, dist_graph_path, hetero)

    # Environment shared by every server and client process.
    base_envs = render_envs([
        ("DIST_DGL_TEST_WORKSPACE", workspace),
        ("DIST_DGL_TEST_NUM_PART", num_part),
        ("DIST_DGL_TEST_NUM_SERVER", num_servers),
        ("DIST_DGL_TEST_NUM_CLIENT", num_clients),
        ("DIST_DGL_TEST_NET_TYPE", net_type),
        ("DIST_DGL_TEST_GRAPH_PATH", dist_graph_path),
        ("DIST_DGL_TEST_IP_CONFIG", ip_config),
    ])

    procs = []

    # Servers: ids are assigned consecutively, machine by machine.
    for part_id, ip in enumerate(ips):
        for local_idx in range(num_servers):
            server_id = part_id * num_servers + local_idx
            cmd_envs = base_envs + render_envs([
                ("DIST_DGL_TEST_SERVER_ID", server_id),
                ("DIST_DGL_TEST_PART_ID", part_id),
                ("DIST_DGL_TEST_SHARED_MEM", int(shared_mem)),
                ("DIST_DGL_TEST_MODE", "server"),
            ])
            procs.append(
                utils.execute_remote(f"{cmd_envs} python3 {test_bin}", ip))

    # Clients: same command on each machine, differing only by partition id.
    for part_id, ip in enumerate(ips):
        cmd_envs = base_envs + render_envs([
            ("DIST_DGL_TEST_PART_ID", part_id),
            ("DIST_DGL_TEST_OBJECT_TYPE", target),
            ("DIST_DGL_TEST_MODE", "client"),
        ])
        for _ in range(num_clients):
            procs.append(
                utils.execute_remote(f"{cmd_envs} python3 {test_bin}", ip))

    for proc in procs:
        proc.join()
        assert proc.exitcode == 0
Пример #4
0
def test_tensorpipe_comm():
    """Run the ``rpc_server`` and ``rpc_client`` binaries on every machine.

    Binaries are looked up in ``DIST_DGL_TEST_CPP_BIN_DIR`` (default ``.``)
    and the machine list is read with ``get_ips`` from the file named by
    ``DIST_DGL_TEST_IP_CONFIG`` (default ``ip_config.txt``).  All servers
    are launched first, then all clients; every process must finish with
    exit code 0.
    """
    base_dir = os.environ.get('DIST_DGL_TEST_CPP_BIN_DIR', '.')
    ip_config = os.environ.get('DIST_DGL_TEST_IP_CONFIG', 'ip_config.txt')
    server_bin = os.path.join(base_dir, 'rpc_server')
    client_bin = os.path.join(base_dir, 'rpc_client')

    ips = get_ips(ip_config)
    machine_count = str(len(ips))

    # Server command line: "<binary> <number of machines> <own ip>".
    procs = [
        execute_remote(" ".join((server_bin, machine_count, ip)), ip)
        for ip in ips
    ]
    # Client command line: "<binary> <ip config file>".
    client_cmd = " ".join((client_bin, ip_config))
    procs += [execute_remote(client_cmd, ip) for ip in ips]

    for proc in procs:
        proc.join()
        assert proc.exitcode == 0
Пример #5
0
def main():
    """Redraw an e-paper status screen every 30 seconds, forever.

    Each frame shows the value of ``utils.get_ips(network_adaptor)`` in the
    small font at the top and the current local date and time (minute
    resolution) in the large font below it.  The function never returns.
    """
    font_path = './lib/Font.ttc'

    display = epd2in7.EPD()
    display.init()
    display.Clear(0xFF)

    large_font = ImageFont.truetype(font_path, 24)
    small_font = ImageFont.truetype(font_path, 10)

    while True:
        # Fresh 1-bit canvas filled with 255 (cleared frame); height and
        # width are passed swapped relative to the panel attributes.
        frame_size = (display.height, display.width)
        frame = Image.new('1', frame_size, 255)
        pen = ImageDraw.Draw(frame)

        address_text = utils.get_ips(network_adaptor)
        pen.text((10, 0), address_text, font=small_font, fill=0)

        clock_text = datetime.now().strftime('%Y-%m-%d %H:%M')
        pen.text((60, 20), clock_text, font=large_font, fill=0)

        display.display(display.getbuffer(frame))

        time.sleep(30)
Пример #6
0
def spider_href(job, citys, num_page):
    """Collect job-posting links from liepin.com search result pages.

    For every city the search result pages are downloaded one by one, each
    request using a random User-Agent and a random entry of the proxy pool
    returned by ``get_ips()``.  The link of the first ``<a>`` inside every
    ``<h3>`` that carries a ``title`` attribute is written, one per line, to
    ``./data/hrefs/href_<job>.txt``; relative links are prefixed with
    ``https://www.liepin.com``.  Visited URLs and found links are printed.

    :param job: search keyword; also used in the output file name.
    :param citys: mapping whose keys are iterated as cities and whose values
        are sent as the ``dqs`` (work area) query parameter.
    :param num_page: page budget used to derive how many pages are fetched
        per city (see the review note inside the loop).
    """
    site_root = 'https://www.liepin.com'

    # Fake User-Agent generator.
    ua = UserAgent()
    # Proxy pool.
    ip_pool = get_ips()
    out_path = './data/hrefs/href_%s.txt' % (job)

    # The context manager closes the file even when a request or the
    # parsing of a page raises half-way through the crawl.
    with open(out_path, 'w', encoding='utf-8') as fp:
        for city in citys:
            # NOTE(review): len(city) is the length of the city key itself,
            # not the number of cities; len(citys) may have been intended.
            # Page numbering also stops one short of the computed bound.
            # Left unchanged - confirm the intended page count.
            for page in range(1, int(num_page / len(city))):
                # Search URL: keyword, work-area code and page number.
                url = "https://www.liepin.com/zhaopin/?key=%s&dqs=%s&curPage=%s" % (job, citys[city], str(page))
                # NOTE(review): only an 'http' proxy is configured while the
                # URL is https, so the proxy is presumably not used for this
                # request - confirm against the requests documentation.
                proxy = {'http': random.choice(ip_pool)}
                html = requests.get(url=url, proxies=proxy, headers={"User-Agent": ua.random})
                # Show the URL that was actually fetched.
                print(html.url)

                # Job titles and their links live in <h3> elements.
                headings = BeautifulSoup(html.text, "html.parser").find_all("h3")
                for heading in headings:
                    # Some <h3> elements carry no job information.
                    if not heading.has_attr("title"):
                        continue
                    # Skip headings without a usable link instead of
                    # aborting the whole crawl on one malformed entry.
                    anchor = heading.find("a")
                    if anchor is None or not anchor.has_attr("href"):
                        continue
                    href = anchor["href"]
                    if site_root in href:
                        fp.write(href + '\n')
                    else:
                        fp.write(site_root + href + '\n')
                    print(href)
            # Persist what was collected for this city before moving on.
            fp.flush()
Пример #7
0
import os
from fake_useragent import UserAgent
import utils

# Proxy pool. An earlier revision built it by hand from "verified.txt",
# joining the second and third '|'-separated fields of every line into an
# "ip:port" string; it is now obtained from utils.get_ips().
ip_pool = utils.get_ips()

# Links read from href.txt, one per line, surrounding whitespace removed.
with open("href.txt", "r", encoding='utf-8') as f:
    hrefs = [raw_line.strip() for raw_line in f]

# Disguise: generator of User-Agent strings.
ua = UserAgent()
# Output text file, opened for writing and left open for the code that follows.
demand = open('demand.txt', 'w', encoding='utf-8')
# Counter initialised to zero.
error = 0
Пример #8
0
# -*- coding: utf-8 -*-
from flask import Flask, render_template, request

from config import my_port, source_page_path
from utils import get_qr_code, get_ips, spider_data, get_dir, get_dir_name, get_zip_file

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)
# Runs once at import time with the configured source-page path
# (presumably makes sure that directory exists - confirm in utils.get_dir).
get_dir(source_page_path)
# Result of utils.get_ips(), computed once at import time.
my_ip = get_ips()


@app.route('/')
def login():
    """Render ``index.html``, passing ``get_qr_code()`` as ``image_data``."""
    qr_image = get_qr_code()
    return render_template('index.html', image_data=qr_image)


@app.route('/show_time')
def show_time():
    """Render ``show_time.html`` with the names found under the source path.

    The template receives ``get_dir_name(source_page_path)`` as ``name_l``.
    """
    names = get_dir_name(source_page_path)
    return render_template('show_time.html', name_l=names)


@app.route('/spider')
def spider():
    try:
        args = request.args
        path_prefix, gzh_name = f"{source_page_path}/{args['name']}", args[
            'gzh_name']
        start_page, end_page = int(args['start_page']), int(args['end_page'])
        return spider_data(path_prefix, gzh_name, start_page, end_page)
    except Exception: