import requests
from requests.exceptions import ConnectionError
from my_fake_useragent import UserAgent


def get_page(url, options=None):
    ua = UserAgent()
    base_headers = {
        'User-Agent': ua.random(),
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    # Merge caller-supplied headers over the defaults
    headers = dict(base_headers, **(options or {}))
    print('Getting', url)
    try:
        r = requests.get(url, headers=headers)
        print('Getting result', url, r.status_code)
        if r.status_code == 200:
            return r.text
    except ConnectionError:
        print('Crawling Failed', url)
        return None
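# A minimal usage sketch for get_page() above; the URL and the extra Referer
# header are placeholder values.
if __name__ == '__main__':
    html = get_page('https://example.com', {'Referer': 'https://example.com/'})
    print(html is not None)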
import re
from urllib import request

from my_fake_useragent import UserAgent


def gen_news():
    ua = UserAgent()
    user_agent = ua.random()
    referer = 'https://tushare.pro/login?next=%2Fnews%2Fnews_sina'
    headers = {
        'User-Agent': user_agent,
        'Host': 'tushare.pro',
        'Origin': 'https://tushare.pro',
        'Referer': referer
    }
    stockPageRequest = request.urlopen('http://finance.eastmoney.com/news/cdfsd.html')
    htmlTitleContent = str(stockPageRequest.read(), 'utf-8')
    # Match titles with a regular expression
    titlePattern = re.compile('<span class="l3 a3">title="(.*?)"</span>', re.S)
    p_title = 'title="(.*?)"(.*?)'
    title = re.findall(p_title, htmlTitleContent)
    # Keep only titles that start with '【' (str.find() returns 0 there, so `not 0` is True)
    title = [t[0] for t in title if not t[0].find('【')]
    news = []
    for t in title:
        a = t.find('【')
        b = t.find('】')
        news.append({'title': t[a + 1:b], 'content': t[b + 1:]})
    # news = News.objects.all()
    return news
def test():
    ua = UserAgent(family='chrome', os_family='linux')
    for i in range(100):
        res = ua.random()
        print(res)
def _build_chrome_options(self, headless=True, random_user=False):
    chrome_options = Options()
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--verbose")
    chrome_options.add_argument("--window-size=1920,1080")  # Chrome expects comma-separated width,height
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing_for_trusted_sources_enabled": False,
            "safebrowsing.enabled": False,
        },
    )
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-software-rasterizer")
    if headless:
        chrome_options.add_argument("--headless")
    if random_user:
        ua = UserAgent(family="chrome")
        randomua = ua.random()
        chrome_options.add_argument(f"user-agent={randomua}")
    return chrome_options
from selenium import webdriver
from my_fake_useragent import UserAgent


def set_options():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-setuid-sandbox")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")  # Chrome expects comma-separated width,height
    ua = UserAgent(family='chrome')
    randomua = ua.random()
    chrome_options.add_argument(f'user-agent={randomua}')
    print(randomua)
    return chrome_options
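# A minimal usage sketch for set_options() above, assuming Selenium and a matching
# chromedriver are installed; the URL is a placeholder.
from selenium import webdriver

driver = webdriver.Chrome(options=set_options())
driver.get('https://example.com')
print(driver.title)
driver.quit()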
def youtube_scrapper(self, query, number_results=2):
    """Scrape YouTube results via a Google site: search."""
    query = urllib.parse.quote_plus(query)  # Format into URL encoding
    ua = UserAgent(family='chrome')
    assert isinstance(query, str)           # Search term must be a string
    assert isinstance(number_results, int)  # Number of results must be an integer
    escaped_search_term = query.replace(' ', '+')  # (unused)
    google_url = "https://www.google.com/search?q={}&num={}".format(
        query + "+site:youtube.com", 1)
    # print(google_url)
    response = requests.get(google_url, headers={"User-Agent": ua.random()})
    soup = BeautifulSoup(response.text, "html.parser")
    result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})
    self.Links = []
    self.Titles = []
    for r in result_div:
        # Checks if each element is present, else, raise exception
        try:
            link = r.find('a', href=True)
            title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
            # Check to make sure everything is present before appending
            if link != '' and title != '':
                self.Links.append(link['href'])
                self.Titles.append(title)
                if len(self.Links) == number_results:
                    break
        # Next loop if one element is not present
        except Exception:
            continue
    for i in range(0, len(self.Links)):
        self.Links[i] = self.Links[i].replace("/url?q=", "")
    for i in range(0, len(self.Links)):
        if self.Links[i].find("watch") != -1:
            self.Links[i] = self.Links[i].replace("%3F", "?")
            self.Links[i] = self.Links[i].replace("%3D", "=")
            self.Links[i] = self.Links[i].split("&")[0]
        else:
            continue
    if len(self.Links) == 0:
        return
    for i in range(0, len(self.Links)):
        d = dict()
        d["title"] = self.Titles[i]
        d["linktopage"] = self.Links[i]
        self.youtube_result.append(d)
import requests
from my_fake_useragent import UserAgent


class DownloadImg():
    def __init__(self):
        self.ua = UserAgent()

    def download_one_img(self, img_url, saved_path):
        # Download a single image
        header = {
            "User-Agent": "{}".format(self.ua.random().strip()),
            'Connection': 'close'}
        r = requests.get(img_url, headers=header, stream=True)
        print("Image request status code: {}".format(r.status_code))
        if r.status_code == 200:
            # Write the image to disk
            with open(saved_path, mode="wb") as f:
                f.write(r.content)
            print("download {} success!".format(saved_path))
        del r
        return saved_path
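# A minimal usage sketch for DownloadImg above; the image URL and save path are
# placeholder values.
downloader = DownloadImg()
downloader.download_one_img('https://example.com/sample.jpg', './sample.jpg')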
import requests
from my_fake_useragent import UserAgent

URL = 'https://pastr.io/login'

client = requests.Session()
ua = UserAgent()
print(ua.random())

header = {'User-Agent': str(ua.random())}
login_payload = {
    "email": "*****@*****.**",
    "password": "******",
    "remember": False,
}

r = client.post(URL, data=login_payload, headers=header)
print(r)
def getRandomUserAgent():
    ua = UserAgent()
    return ua.random()
import re

import requests
from lxml import etree
from my_fake_useragent import UserAgent
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', port=3306, user='******', passwd='123',
                       db='yunyun', charset='utf8')
cursor = conn.cursor()

a = UserAgent()
p = a.random()
headers = {
    'User-Agent': p,
    # 'cookie': '__cfduid=dce1ed34975ff71acb9b22d4959d0263b1563521810; ASP.NET_SessionId=1oj0zvk0wttwcudymxjeftpt; UM_distinctid=16c0928d2b2448-03463007e150d9-e343166-144000-16c0928d2b32f6; CNZZDATA1255263807=653621382-1563520703-%7C1563520703; ViewHistory_4=1oj0zvk0wttwcudymxjeftpt; .ynzpauth=869D169A9273686FE3F281194E66EAF796DA177B8799BC0686C9AFD983575676620178F545B8CC60F7FEAA6886B258DF06E4D0E13BBE33ABBA3DCF46FB3A659EE847BBE2696F2256B15111D8D1BDD642178E9567CF7161BDEA9BC44159707D7DF2F8D7D349B8397F87AA820265CC36F284BFECA0EF6E38D76411703DA70E1B5EB03806C9211CD2EC6C800D8E4E9CC840A8734ACC7E31910E493DCF0B2D859E27; viewedResume=2088560%2C1515707%2C727002%2C1218946%2C1623681%2C2131167%2C2121066'
}

for i in range(2957, 10000):
    url = 'http://www.bole.com.cn/resume/resume-show.php?id=' + str(i)
    # print(url)
    try:
        with requests.session() as s:
            a = s.get(url, headers=headers)
            pr = a.text
            # print(pr)
            pattern = re.compile('<div class="personal_info_item">(.*?)</div>')
            rev1 = pattern.findall(pr)
            # print(rev1)
from bs4 import BeautifulSoup
import requests
import csv
from my_fake_useragent import UserAgent

# Mimic the access to the website like a browser
ua = UserAgent(family='chrome')
BrowserUserAgent = ua.random()

# Define URL and Requests object
f = csv.writer(open('drug-names.csv', 'w'))
f.writerow(['Name'])

pages = []
# Send the random user agent as a proper User-Agent header
headers = {'User-Agent': BrowserUserAgent}
firstAlphaNumeric = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0-9', ''
]
# secondAlphaNumeric = firstAlphaNumeric
finalList = []

for first in firstAlphaNumeric:
    for second in firstAlphaNumeric:
        url = 'https://www.drugs.com/alpha/' + str(first) + str(second) + '.html'
        pages.append(url)

for item in pages:
    page = requests.get(item, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
import json
import os
import random
import telnetlib
import time

import redis
import requests
from lxml import etree
from my_fake_useragent import UserAgent


class IpPool:
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random()}
        # Proxy IP API
        self.ipurl = 'http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pack=51811&port=11&lb=1&pb=4&regions='
        # Redis database
        self.redi = redis.Redis(host='127.0.0.1', port=6379, db=0,
                                decode_responses=True, password='******')
        # Counter for failed API requests
        self.count = 0

    # Fetch a proxy IP from the API
    def get_ip(self):
        try:
            res = requests.get(url=self.ipurl, headers=self.headers, timeout=10)
            print(res.status_code)
            print('Fetched at: {}'.format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))),
                res.text)
            if res.status_code != 200:
                self.count += 1
            else:
                self.count -= 1
            # Sample API response:
            # {"code":0,"data":[{"ip":"223.241.61.18","port":"4336"}],"msg":"0","success":true}
            json_obj = res.json()
            if res.status_code == 200 and json_obj['data'][0]:
                if self.proxyip(json_obj['data'][0]['ip']):
                    return json_obj['data'][0]
                # return {'ip': '127.0.0.1', 'port': '1234'}
        except Exception:
            self.count += 1

    # Store an IP in the pool
    def set_ip(self, ip):
        print('Storing:', ip)
        self.redi.lpush('ip:iplist', json.dumps(ip))

    # Check whether an IP is still reachable
    def test_ip(self, item):
        item = json.loads(item)
        try:
            telnetlib.Telnet(item['ip'], port=item['port'], timeout=10)
        except Exception:
            return False
        else:
            return True

    def proxyip(self, ip):
        url = 'https://iphunter.net/ip/{}'.format(ip)
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        e = etree.HTML(res.text)
        data = ''.join(e.xpath('/html/body/article/script[3]/text()'))
        # The page flags known IPs with '代理' (proxy) or '爬虫' (crawler)
        if '代理' not in data and '爬虫' not in data:
            return True
        else:
            return False

    # Maintenance engine
    def engine(self):
        while True:
            if self.redi.llen('ip:iplist') >= 19:
                for item in self.redi.lrange('ip:iplist', 0, -1):
                    print('Checked at: {}'.format(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))),
                        item)
                    if item is None:
                        print(None)
                        # Remove the invalid IP
                        self.redi.lrem('ip:iplist', 1, item)
                        # Replenish with a valid IP
                        time.sleep(2)
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
                    if not self.test_ip(item):
                        print(self.test_ip(item))
                        # Remove the invalid IP
                        self.redi.lrem('ip:iplist', 1, item)
                        # Replenish with a valid IP
                        time.sleep(2)
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
            else:
                for i in range(20):
                    time.sleep(2)
                    if self.redi.llen('ip:iplist') <= 20:
                        print('Fewer than 20 IPs in the pool')
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
            time.sleep(30)

    # Pick a random proxy IP for clients
    def random_ip(self):
        try:
            iplist = self.redi.lrange('ip:iplist', 0, -1)
        except Exception:
            iplist = []
        if iplist:
            while True:
                ip = random.choice(iplist)
                if ip:
                    ip = json.loads(ip)
                    # ip_info = '183.166.164.209:4370'
                    ip_info = ip['ip'] + ':' + ip['port']
                    proxies = {'https': ip_info}
                    return ip_info
                    # proxies = {'https': '119.5.74.242:4385'}
        else:
            return None

    # Run
    def run(self):
        pid = str(os.getpid())
        self.redi.set('pid:ip_pool', pid)
        self.engine()
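# A minimal usage sketch for IpPool above, assuming a local Redis instance and the
# proxy API configured in __init__ are reachable.
pool = IpPool()
proxy = pool.get_ip()        # e.g. {'ip': '223.241.61.18', 'port': '4336'}
if proxy:
    pool.set_ip(proxy)       # push it into the Redis-backed pool
print(pool.random_ip())      # 'ip:port' string usable as a requests proxy
# pool.run()                 # or start the maintenance loop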
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Cedar
# @Date  : 2021/3/22
# @Desc  :
from my_fake_useragent import UserAgent

ua = UserAgent(phone=True)
print(ua.random())
def crawl(self):
    ua = UserAgent()
    headers = {'User-Agent': '{}'.format(ua.random())}
    print(self.spider_name, 'now crawling', self.url_key)
    try:
        raw_contents = requests.get(self.url, headers=headers).text
        match_pattern = r'<td(.*?)</td>'
        level_1_soup_list = re.findall(match_pattern, raw_contents, re.S | re.M)
        level_2_soup_list = []
        for level_1_soup in level_1_soup_list:
            level_2_soup = level_1_soup.split('>')[1]
            level_2_soup_list.append(level_2_soup)
        project_name = level_2_soup_list[1]
        project_number = level_2_soup_list[3]
        project_intro = level_2_soup_list[5]
        project_link = level_2_soup_list[7].split('"')[1]  # Special Design
        project_purpose = level_2_soup_list[9]
        project_size = level_2_soup_list[11]
        project_duration = level_2_soup_list[13]
        project_apr = level_2_soup_list[15]
        project_repay_start = level_2_soup_list[17]
        project_repay_method = level_2_soup_list[19].strip()  # Special Design
        project_repay_details = level_2_soup_list[21]
        project_status = level_2_soup_list[23].strip()  # Special Design
        project_raise_start = level_2_soup_list[25]
        project_guarantee = level_2_soup_list[27]
        project_repay_source = level_2_soup_list[29]
        project_risk = level_2_soup_list[31]
        project_expense = level_2_soup_list[33]
        project_template_number = level_2_soup_list[35]
        project_lender_notice = level_2_soup_list[37]
        project_borrower_type = level_2_soup_list[39].strip()  # Special Design
        project_borrower_name = level_2_soup_list[43]
        project_document_type = level_2_soup_list[45].strip()  # Special Design
        project_document_number = level_2_soup_list[47]
        project_borrower_job = level_2_soup_list[49]
        project_borrower_other_info = level_2_soup_list[51]
        project_borrower_credit = level_2_soup_list[53]
        project_borrower_default_times = level_2_soup_list[55]
        project_borrower_default_amounts = level_2_soup_list[57]
        project_borrower_income_and_debt = level_2_soup_list[59]
        self.list_of_attribute = [
            self.url_key, project_name, project_number, project_intro,
            project_link, project_purpose, project_size, project_duration,
            project_apr, project_repay_start, project_repay_method,
            project_repay_details, project_status, project_raise_start,
            project_guarantee, project_repay_source, project_risk,
            project_expense, project_template_number, project_lender_notice,
            project_borrower_type, project_borrower_name, project_document_type,
            project_document_number, project_borrower_job,
            project_borrower_other_info, project_borrower_credit,
            project_borrower_default_times, project_borrower_default_amounts,
            project_borrower_income_and_debt
        ]
        print(self.spider_name, 'has finished the crawling from', self.url_key)
    except Exception:
        # On failure, mark the url_key placeholder and all 29 attributes as "FAIL"
        self.list_of_attribute = ["FAIL"] * 30
        print(self.spider_name, "has failed and gives", self.url_key, "to another spider")
import requests
from my_fake_useragent import UserAgent
import json
from pymongo import MongoClient
from pyquery import PyQuery as pq
import random
import time

ua = UserAgent()
headers = {"User-Agent": ua.random()}
client = MongoClient(host="localhost", port=27017)
# Database "发改委" (NDRC), collection "辽宁1" (Liaoning 1)
collection = client["发改委"]['辽宁1']


def parse_detail(html, url):
    ret = {}
    doc = pq(html)
    ret['url'] = url
    ret['title'] = doc(".news-content-main h1").text()
    ret['sourceTime'] = doc(".news-info").text()
    ret['content'] = doc('#ContTextSize').text()
    ret['contentUrl'] = doc("#ContTextSize a").attr("href")
    print(ret)
    collection.insert_one(ret)


def parse_index(html):
    doc = pq(html)
    items = doc(".mod-body2 ul li").items()
    for item in items:
def __init__(self):
    self.proxies = []  # Proxy list
    ua = UserAgent()   # Use a random User-Agent
    self.headers = {"User-Agent": ua.random()}  # header key must be "User-Agent"
from my_fake_useragent import UserAgent


def get_request_headers():
    ua = UserAgent()
    return {"User-Agent": ua.random()}
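# A minimal usage sketch for get_request_headers() above; the URL is a placeholder.
import requests

resp = requests.get('https://example.com', headers=get_request_headers())
print(resp.status_code)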