def get_html(self, url):
    """Fetch ``url`` through a randomly chosen proxy and parse the page.

    ``get_ips()`` is unpacked into two parallel lists: ``ps`` supplies the
    key and ``ips`` the value of the mapping handed to ``ProxyHandler``
    (presumably the proxy scheme and the proxy address -- confirm against
    ``get_ips``). A random entry is tried; when opening the URL fails, that
    entry is removed from both lists and another one is picked.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8 and parsed by ``BeautifulSoup``
        with the ``'html.parser'`` backend.

    Raises:
        IndexError: If the proxy pool is empty or every proxy has failed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    ps, ips = get_ips()

    # Retry in a loop instead of recursing, so a long run of dead proxies
    # cannot exhaust the interpreter's recursion limit.
    while ps:
        index = choice(range(len(ps)))
        try:
            proxy = request.ProxyHandler({ps[index]: ips[index]})
            # NOTE: install_opener() replaces the process-wide default opener.
            request.install_opener(request.build_opener(proxy))
            req = Request(url, headers=self.HEADERS)
            response = request.urlopen(req, timeout=3)
        except Exception as e:
            # Report the failing proxy, drop it and try another one.
            print(e, f'{ps[index]}://{ips[index]}')
            ps.pop(index)
            ips.pop(index)
            continue

        # Close the connection as soon as the body has been read.
        with response:
            raw = response.read()
        # read() returns bytes; decode them as UTF-8 before parsing.
        return BeautifulSoup(raw.decode('utf8'), 'html.parser')

    raise IndexError(f'no working proxy left to fetch {url}')
def test_rpc(net_type):
    """Run ``rpc_basic.py`` as servers and clients on every configured machine.

    The machine list is read with ``utils.get_ips`` from the file named by
    the ``DIST_DGL_TEST_IP_CONFIG`` environment variable (default
    ``ip_config.txt``). One server and one client are started per machine via
    ``utils.execute_remote``; every returned handle is joined and must report
    exit code 0.

    Args:
        net_type: Value exported to the launched scripts as
            ``DIST_DGL_TEST_NET_TYPE``.
    """
    num_servers = 1
    num_clients = 1

    ip_config = os.environ.get('DIST_DGL_TEST_IP_CONFIG', 'ip_config.txt')
    ips = utils.get_ips(ip_config)
    num_machines = len(ips)

    script_dir = os.environ.get('DIST_DGL_TEST_PY_BIN_DIR', '.')
    launch_suffix = " python3 " + os.path.join(script_dir, 'rpc_basic.py')

    # Environment assignments shared by every launched process.
    shared_envs = dgl_envs + \
        f" DGL_DIST_MODE=distributed DIST_DGL_TEST_IP_CONFIG={ip_config} DIST_DGL_TEST_NUM_SERVERS={num_servers} DIST_DGL_TEST_NET_TYPE={net_type} "

    handles = []

    # Servers are launched first; ids are consecutive across machines.
    for machine_idx, ip in enumerate(ips):
        for slot in range(num_servers):
            server_id = machine_idx * num_servers + slot
            role_envs = \
                f" DIST_DGL_TEST_ROLE=server DIST_DGL_TEST_SERVER_ID={server_id} DIST_DGL_TEST_NUM_CLIENTS={num_clients * num_machines} "
            handles.append(
                utils.execute_remote(shared_envs + role_envs + launch_suffix, ip))

    # Every client runs the same command line, so build it once.
    client_cmd = shared_envs + \
        " DIST_DGL_TEST_ROLE=client DIST_DGL_TEST_GROUP_ID=0 " + launch_suffix
    for ip in ips:
        handles.extend(
            utils.execute_remote(client_cmd, ip) for _ in range(num_clients))

    # Wait for each process in launch order and fail on the first bad exit.
    for handle in handles:
        handle.join()
        assert handle.exitcode == 0
def test_dist_objects(target, net_type, num_servers, num_clients, hetero, shared_mem):
    """Run ``run_dist_objects.py`` as servers and clients on every machine.

    The graph is created with ``create_graph`` when its directory under the
    workspace does not exist yet, with one partition per entry returned by
    ``utils.get_ips``. ``num_servers`` servers and ``num_clients`` clients are
    then started per machine through ``utils.execute_remote``; every returned
    handle is joined and must report exit code 0.

    Args:
        target: Exported to clients as ``DIST_DGL_TEST_OBJECT_TYPE``.
        net_type: Exported as ``DIST_DGL_TEST_NET_TYPE``.
        num_servers: Number of servers started per machine.
        num_clients: Number of clients started per machine.
        hetero: Selects the ``hetero_dist_graph`` directory instead of
            ``dist_graph`` and is forwarded to ``create_graph``.
        shared_mem: Exported to servers as ``DIST_DGL_TEST_SHARED_MEM``
            after conversion through ``int``.
    """
    if num_servers > 1 and not shared_mem:
        pytest.skip(
            "Backup servers are not supported when shared memory is disabled")

    env = os.environ
    ip_config = env.get('DIST_DGL_TEST_IP_CONFIG', 'ip_config.txt')
    workspace = env.get('DIST_DGL_TEST_WORKSPACE',
                        '/shared_workspace/dgl_dist_tensor_test/')

    ips = utils.get_ips(ip_config)
    num_part = len(ips)

    test_bin = os.path.join(env.get('DIST_DGL_TEST_PY_BIN_DIR', '.'),
                            'run_dist_objects.py')

    graph_dir_name = 'hetero_dist_graph' if hetero else 'dist_graph'
    dist_graph_path = os.path.join(workspace, graph_dir_name)
    if not os.path.isdir(dist_graph_path):
        create_graph(num_part, dist_graph_path, hetero)

    # Environment assignments shared by servers and clients.
    base_envs = "".join([
        f"DIST_DGL_TEST_WORKSPACE={workspace} ",
        f"DIST_DGL_TEST_NUM_PART={num_part} ",
        f"DIST_DGL_TEST_NUM_SERVER={num_servers} ",
        f"DIST_DGL_TEST_NUM_CLIENT={num_clients} ",
        f"DIST_DGL_TEST_NET_TYPE={net_type} ",
        f"DIST_DGL_TEST_GRAPH_PATH={dist_graph_path} ",
        f"DIST_DGL_TEST_IP_CONFIG={ip_config} ",
    ])

    def launch(role_envs, ip):
        # Prefix the script invocation with the shared and role-specific envs.
        cmd_envs = base_envs + role_envs
        return utils.execute_remote(f"{cmd_envs} python3 {test_bin}", ip)

    handles = []

    # Servers first; ids are consecutive across partitions.
    for part_id, ip in enumerate(ips):
        for slot in range(num_servers):
            server_id = part_id * num_servers + slot
            handles.append(launch(
                f"DIST_DGL_TEST_SERVER_ID={server_id} "
                f"DIST_DGL_TEST_PART_ID={part_id} "
                f"DIST_DGL_TEST_SHARED_MEM={str(int(shared_mem))} "
                f"DIST_DGL_TEST_MODE=server ",
                ip))

    # Then the clients, num_clients per machine.
    for part_id, ip in enumerate(ips):
        for _ in range(num_clients):
            handles.append(launch(
                f"DIST_DGL_TEST_PART_ID={part_id} "
                f"DIST_DGL_TEST_OBJECT_TYPE={target} "
                f"DIST_DGL_TEST_MODE=client ",
                ip))

    # Wait for each process in launch order and fail on the first bad exit.
    for handle in handles:
        handle.join()
        assert handle.exitcode == 0
def test_tensorpipe_comm():
    """Run the ``rpc_server`` and ``rpc_client`` binaries on every machine.

    Binaries are looked up in ``DIST_DGL_TEST_CPP_BIN_DIR`` (default ``.``)
    and the machine list is read with ``get_ips`` from the file named by
    ``DIST_DGL_TEST_IP_CONFIG`` (default ``ip_config.txt``). One server is
    started per machine with the machine count and its own address as
    arguments, followed by one client per machine given the config path.
    Every handle returned by ``execute_remote`` is joined and must report
    exit code 0.
    """
    ip_config = os.environ.get('DIST_DGL_TEST_IP_CONFIG', 'ip_config.txt')
    bin_dir = os.environ.get('DIST_DGL_TEST_CPP_BIN_DIR', '.')
    server_bin = os.path.join(bin_dir, 'rpc_server')
    client_bin = os.path.join(bin_dir, 'rpc_client')

    ips = get_ips(ip_config)
    machine_count = str(len(ips))

    # All servers are launched before any client.
    handles = [
        execute_remote(" ".join((server_bin, machine_count, ip)), ip)
        for ip in ips
    ]
    client_cmd = " ".join((client_bin, ip_config))
    handles += [execute_remote(client_cmd, ip) for ip in ips]

    # Wait for each process in launch order and fail on the first bad exit.
    for handle in handles:
        handle.join()
        assert handle.exitcode == 0
def main():
    """Refresh the e-paper display forever with the IP text and current time.

    The panel is initialised and cleared once. Every 30 seconds a new frame
    is drawn with the result of ``utils.get_ips(network_adaptor)`` in the
    small font and the current local time (``YYYY-MM-DD HH:MM``) in the
    large font, and pushed to the display. This function never returns.
    """
    panel = epd2in7.EPD()
    panel.init()
    panel.Clear(0xFF)

    font_file = './lib/Font.ttc'
    clock_font = ImageFont.truetype(font_file, 24)
    ip_font = ImageFont.truetype(font_file, 10)

    while True:
        # Mode '1' image sized (panel.height, panel.width), filled with 255
        # so each refresh starts from a cleared frame.
        frame_size = (panel.height, panel.width)
        frame = Image.new('1', frame_size, 255)
        pen = ImageDraw.Draw(frame)

        ip_text = utils.get_ips(network_adaptor)
        pen.text((10, 0), ip_text, font=ip_font, fill=0)

        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M')
        pen.text((60, 20), timestamp, font=clock_font, fill=0)

        panel.display(panel.getbuffer(frame))
        time.sleep(30)
def spider_href(job, citys, num_page):
    """Collect job-posting links from liepin.com search results into a file.

    For every city the search result pages for ``job`` are requested through
    a randomly chosen proxy with a random User-Agent. Each ``<h3>`` element
    carrying a ``title`` attribute is taken to hold a posting; the ``href``
    of its first ``<a>`` is written, one per line and made absolute when
    needed, to ``./data/hrefs/href_<job>.txt``.

    Args:
        job: Search keyword; also used in the output file name.
        citys: Mapping whose values are the region codes sent as the ``dqs``
            query parameter (keys are presumably city names).
        num_page: Total page budget, split evenly across the cities.

    Returns:
        None. Links are written to the output file and echoed to stdout.
    """
    # Source of random User-Agent strings, one per request.
    ua = UserAgent()
    # Proxy pool to pick from at random.
    ip_pool = get_ips()
    out_path = './data/hrefs/href_%s.txt' % (job)
    site_root = 'https://www.liepin.com'

    # Split the page budget by the NUMBER of cities. Dividing by len(city),
    # the length of a single key, made the page count depend on how long the
    # city's name happens to be.
    pages_per_city = int(num_page / len(citys)) if citys else 0

    # The context manager closes the file even if a request or parse fails.
    with open(out_path, 'w', encoding='utf-8') as fp:
        for city in citys:
            for page in range(1, pages_per_city):
                # key = search keyword, dqs = liepin region code,
                # curPage = result page number.
                url = "https://www.liepin.com/zhaopin/?key=%s&dqs=%s&curPage=%s" % (job, citys[city], str(page))
                # NOTE(review): requests selects a proxy by URL scheme, so an
                # 'http'-only mapping is not applied to this https URL.
                # Confirm whether an 'https' entry is wanted.
                proxy = {'http': random.choice(ip_pool)}
                html = requests.get(url=url, proxies=proxy,
                                    headers={"User-Agent": ua.random})
                # Show the URL that was actually fetched.
                print(html.url)

                soup = BeautifulSoup(html.text, "html.parser")
                for heading in soup.find_all("h3"):
                    # Some <h3> tags carry no posting; they lack a title.
                    if not heading.has_attr("title"):
                        continue
                    anchors = heading.find_all("a")
                    # Skip headings without a usable link instead of crashing.
                    if not anchors or not anchors[0].has_attr("href"):
                        continue
                    href = anchors[0]["href"]
                    if site_root in href:
                        fp.write(href + '\n')
                    else:
                        fp.write(site_root + href + '\n')
                    print(href)
                # Persist each page's links before requesting the next page.
                fp.flush()
import os

from fake_useragent import UserAgent

import utils

# Proxy pool supplied by the project's utils module.
ip_pool = utils.get_ips()

# Load the links from href.txt, one per line, stripping surrounding
# whitespace.
with open("href.txt", "r", encoding='utf-8') as f:
    hrefs = [entry.strip() for entry in f]

# Random User-Agent source used to disguise requests.
ua = UserAgent()

# Output file; left open here, presumably written by code further down.
demand = open('demand.txt', 'w', encoding='utf-8')
# Counter initialised to zero (presumably failures -- confirm at its use).
error = 0
# -*- coding: utf-8 -*- from flask import Flask, render_template, request from config import my_port, source_page_path from utils import get_qr_code, get_ips, spider_data, get_dir, get_dir_name, get_zip_file app = Flask(__name__) get_dir(source_page_path) my_ip = get_ips() @app.route('/') def login(): return render_template('index.html', image_data=get_qr_code()) @app.route('/show_time') def show_time(): return render_template('show_time.html', name_l=get_dir_name(source_page_path)) @app.route('/spider') def spider(): try: args = request.args path_prefix, gzh_name = f"{source_page_path}/{args['name']}", args[ 'gzh_name'] start_page, end_page = int(args['start_page']), int(args['end_page']) return spider_data(path_prefix, gzh_name, start_page, end_page) except Exception: