def Random_agent_generator():
    r_agent = Randomize()
    agent = r_agent.random_agent('desktop', 'windows')  # returns a random desktop/Windows user-agent string
    return agent
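Every example in this collection uses the same package; the import, written out explicitly in the later snippets, is:

from random_useragent.random_useragent import Randomize

r_agent = Randomize()
print(r_agent.random_agent('desktop', 'windows'))  # prints a random desktop/Windows UA string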
def __init__(self, showWindow=True):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    # Pick a random proxy from proxy.txt for this browser session.
    with open("proxy.txt") as f:
        lines = f.readlines()
    global pr
    pr = random.choice(lines)
    options.add_argument('--proxy-server=%s' % pr)
    # Spoof a random desktop/Windows user agent.
    r_agent = Randomize()
    useragent = r_agent.random_agent('desktop', 'windows')
    options.add_argument(f'user-agent={useragent}')
    # Block geolocation prompts.
    prefs = {"profile.default_content_setting_values.geolocation": 2}
    options.add_experimental_option("prefs", prefs)
    if not showWindow:
        options.set_headless(headless=True)
    # Pick the bundled chromedriver binary for the current platform.
    if sys.platform == 'linux' or sys.platform == 'linux2':
        driverfilename = 'chrome_linux'
    elif sys.platform == 'win32':
        driverfilename = 'chrome_windows.exe'
    elif sys.platform == 'darwin':
        driverfilename = 'chrome_mac'
    driverpath = os.path.join(os.path.split(__file__)[0],
                              'drivers{0}{1}'.format(os.path.sep, driverfilename))
    os.chmod(driverpath, 0o755)
    self.driver = webdriver.Chrome(executable_path=driverpath, chrome_options=options)
    self.Key = Keys
    self.errors = list()
    # Re-export a subset of the underlying driver's methods on this wrapper.
    for function in ['add_cookie', 'delete_all_cookies', 'delete_cookie',
                     'execute_script', 'execute_async_script', 'fullscreen_window',
                     'get_cookie', 'get_cookies', 'get_log', 'get_network_conditions',
                     'get_screenshot_as_base64', 'get_screenshot_as_file',
                     'get_screenshot_as_png', 'get_window_position', 'get_window_rect',
                     'get_window_size', 'maximize_window', 'minimize_window',
                     'implicitly_wait', 'quit', 'refresh', 'save_screenshot',
                     'set_network_conditions', 'set_page_load_timeout',
                     'set_script_timeout', 'set_window_position', 'set_window_rect',
                     'start_client', 'start_session', 'stop_client', 'switch_to_alert']:
        setattr(self, function, getattr(self.driver, function))
def start_requests(self):
    r_agent = Randomize()
    firstrequest_headers = {
        "X-FORWARDED-FOR": "2.16.167.33",
        "Host": "www.assetmanagement.hsbc.co.uk",
        "User-Agent": r_agent.random_agent('desktop', 'windows'),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br"
    }
    firsturl = "https://hsbcbankglobal.sc.omtrdc.net/b/ss/hsbc-amg-uk,hsbc-amg-global-rollup/1/JS-2.0.0/s78194187988030?AQB=1&ndh=1&pf=1&t=7%2F10%2F2019%2011%3A45%3A31%204%20-480&sdid=5D631A30BB0D4398-0DF62F7812D9139F&mid=64095394369463805659018752405389641208&ce=UTF-8&ns=hsbcbankglobal&pageName=Global%20High%20Income%20Bond%20-%20HSBC%20Global%20Asset%20Management%20UK&g=https%3A%2F%2Fwww.assetmanagement.hsbc.co.uk%2Fen%2Fintermediary%2Finvestment-expertise%2Ffixed-income%2Fglobal-high-income-bond&cc=USD&server=www.assetmanagement.hsbc.co.uk&events=event27&v1=Global%20High%20Income%20Bond%20-%20HSBC%20Global%20Asset%20Management%20UK&v2=High%20Income%20Bond%20-%20HSBC%20Global%20Asset%20Management%20UK&v3=www.assetmanagement.hsbc.co.uk%2Fen%2Fintermediary%2Finvestment-expertise%2Ffixed-income%2Fglobal-high-income-bond&c6=hsbc-amg-uk%2Chsbc-amg-global-rollup&c7=11%3A45%20AM%7CThursday&c13=accept&v15=11%3A45%20AM%7CThursday&v16=hsbc-amg-uk%2Chsbc-amg-global-rollup&c17=uk-gam&v17=uk-gam&v96=content&v98=Terms%20and%20conditions&v99=accept&pe=lnk_o&pev2=no%20link_name&pid=Intermediary%20%7C%20Investment%20Expertise%20%7C%20Fixed%20Income%20%7C%20Global%20High%20Income%20Bond&pidt=1&oid=https%3A%2F%2Fwww.assetmanagement.hsbc.co.uk%2Fen%2Fintermediary%2Finvestment-expertise%2Ffixed-income%2Fglobal-high&ot=A&s=1920x1080&c=24&j=1.6&v=N&k=Y&bw=1835&bh=634&AQE=1"
    response = requests.get(firsturl, headers=firstrequest_headers)
    self.log("Http code,reason:%s,%s" % (response.status_code, response.reason))
    referer = response.request.headers.get('Referer', None)
    # dict.setdefault() returns the value, not the dict, so set the key
    # first and then pass the full header dict along.
    firstrequest_headers.setdefault("Referer", referer)
    headers = firstrequest_headers
    self.log("headers:%s" % headers)
    urls_filepath = os.path.join("./resources/", "urls.txt")
    with open(urls_filepath, mode='r') as handler:
        self.start_urls = handler.readlines()
    # Recreate a dated staging folder for this run.
    self.subfoldername = "../staging" + "/" + date.today().strftime("%m-%d-%Y")
    subfolderpath = os.path.normpath(os.path.join(os.getcwd(), self.subfoldername))
    self.log("subfolderpath:%s" % subfolderpath)
    if os.path.exists(subfolderpath):
        shutil.rmtree(subfolderpath, ignore_errors=True)
    os.makedirs(subfolderpath)
    for url in self.start_urls:
        # readlines() keeps the trailing newline; strip it before requesting.
        yield scrapy.Request(url=url.strip(), headers=headers, callback=self.parse)
def user_a():
    m = randint(0, 6)  # index into the seven agents collected below
    agents = []
    r_agent = Randomize()
    # Get aspect ratio list; returns ['3:2', '4:3', '5:3', '5:4', '16:9', '16:10'].
    r_agent.get_aspect_ratio_list()
    # Takes 2 arguments (self, aspect_ratio); returns a screen resolution.
    r_agent.random_resolution('3:2')
    # random_agent takes 3 arguments (self, device_type, os).
    agents.append(r_agent.random_agent('desktop', 'linux'))       # Desktop / Linux
    agents.append(r_agent.random_agent('desktop', 'mac'))         # Desktop / Macintosh
    agents.append(r_agent.random_agent('desktop', 'windows'))     # Desktop / Windows
    agents.append(r_agent.random_agent('tablet', 'android'))      # Tablet / Android
    agents.append(r_agent.random_agent('tablet', 'ios'))          # Tablet / iOS
    agents.append(r_agent.random_agent('smartphone', 'android'))  # Smartphone / Android
    agents.append(r_agent.random_agent('smartphone', 'ios'))      # Smartphone / iOS
    return agents[m]
class RandomUserAgentMiddleware:
    """Random user-agent middleware: set a random UA for every request."""

    def __init__(self):
        self.r_agent = Randomize()
        self.platform = ['windows', 'mac', 'linux']

    def process_request(self, request, spider):
        random_user_agent = self.r_agent.random_agent('desktop',
                                                      random.choice(self.platform))
        request.headers['User-Agent'] = random_user_agent
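A Scrapy downloader middleware like this only runs once it is registered in the project settings. A minimal sketch, assuming the class lives in a hypothetical myproject/middlewares.py (the priority 400 is an arbitrary illustrative value):

# settings.py -- module path and priority are assumptions for illustration
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
}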
class UserAgentDownloaderMiddleware(object):
    """Middleware that rotates the User-Agent automatically."""

    def __init__(self):
        self.r_agent = Randomize()

    def process_request(self, request, spider):
        # Rotate only when the spider opts in via user_agent_flag and the
        # individual request hasn't opted out via meta["dont_user_agent"].
        if (hasattr(spider, "user_agent_flag") and spider.user_agent_flag
                and not request.meta.get("dont_user_agent", False)):
            request.headers["User-Agent"] = self.r_agent.random_agent(
                'desktop', 'windows')
def sayfaalt(variable, vari):
    git = "http://www.pinterest.com/" + variable + "/" + vari + "/"
    adres = request.remote_addr
    try:
        # Serve the template if it has already been generated.
        return render_template(variable + "-" + vari + ".html")
    except Exception:
        if adres != "127.0.0.1":
            return render_template("404.html")
        # Only localhost may trigger generation of a missing template.
        r_agent = Randomize()
        useragent = r_agent.random_agent('desktop', 'windows')
        with open("proxy.txt") as f:
            lines = f.readlines()
        pr = random.choice(lines)
        http_proxy = "http://" + chomp(pr)
        https_proxy = "https://" + chomp(pr)
        proxyDict = {"http": http_proxy, "https": https_proxy}
        headers = {'User-Agent': useragent,
                   'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7'}
        r = requests.get(git, headers=headers, proxies=proxyDict)
        soup = BeautifulSoup(r.text, "html.parser")
        # Extract the Open Graph title and description of the scraped page.
        for tag in soup.find_all("meta"):
            if tag.get("property", None) == "og:title":
                titlecek = tag.get("content", None)
            elif tag.get("property", None) == "og:description":
                descek = tag.get("content", None)
        with open('sablonalt.txt', 'r') as file:
            filedata = file.read()
        # Substitute the scraped metadata into the template placeholders.
        filedata = filedata.replace('<title>Clean Blog </title>',
                                    '<title>' + titlecek + '</title>')
        filedata = filedata.replace('<meta name="description" content="desc">',
                                    '<meta name="description" content="' + descek + '">')
        filedata = filedata.replace('<a target="_blank" href="adresss">title</a></p>',
                                    '<a target="_blank" href="' + git + '">' + titlecek + '</a></p>')
        filedata = filedata.replace('<a target="_blank" href="adress"><i class="next">',
                                    '<a target="_blank" href="' + git + '"><i class="next"> ')
        with open(app.root_path + '/templates/' + variable + "-" + vari + '.html', 'w') as file:
            file.write(filedata)
        # Record the new page in the URL list.
        with open("urllist.txt", "a") as liste:
            print(titlecek + ":" + "/" + variable + "/" + vari + "/", file=liste)
        return "ok" + adres
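For sayfaalt() to receive variable and vari, Flask needs a matching URL rule; the snippet itself doesn't show one, so the decorator below is an inference from the '/<variable>/<vari>/' paths the view constructs:

# Hypothetical route registration (inferred, not from the original snippet).
@app.route('/<variable>/<vari>/')
def sayfaalt(variable, vari):
    ...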
import gzip
import json
import re  # needed for re.sub() below
import requests
from datetime import datetime, date
from time import strptime
import dateutil.parser

# specific dependency modules next
from bs4 import BeautifulSoup
from tqdm import tqdm
from random_useragent.random_useragent import Randomize

r_agent_agent = Randomize()
rm_agent = r_agent_agent.random_agent('desktop', 'linux')
agent = {"User-Agent": rm_agent}


class AWS:
    @staticmethod
    def get_all_images(soup):
        # Collect card thumbnails and rewrite resized filenames back to
        # the original .jpg.
        meta = soup.find_all('img', attrs={'class': 'card-img-top'})
        urls = []
        for data in meta:
            mystr = data["src"]
            newstr = re.sub(r"_hu.*$", '.jpg', mystr)
            urls.append(newstr)
        return urls
class EventGenerator:
    '''Generates a set of synthetic behavioral events, with timestamps
    constrained to a particular date.
    '''

    def __init__(self, start_date):
        self.faker = Faker()
        self.faker.add_provider(internet)
        self.start_date = start_date
        self.ua_generator = Randomize()

    def _gen_user_agent(self):
        devices = [
            ('desktop', 'mac'),
            ('desktop', 'windows'),
            ('tablet', 'ios'),
            ('smartphone', 'ios'),
            ('smartphone', 'android'),
        ]
        return self.ua_generator.random_agent(*random.choice(devices))

    def _gen_event_type(self):
        '''Creates an event type like "io.dagster.page_view".'''
        event_types = [
            'page_view',
            'button_click',
            'reload',
            'user_create',
            'user_delete',
            'signup',
        ]
        return 'io.dagster.{}'.format(random.choice(event_types))

    def _gen_timestamp(self):
        # Uniformly random second within the given UTC day.
        midnight = datetime.datetime.combine(
            self.start_date, datetime.time.min, tzinfo=datetime.timezone.utc
        ).timestamp()
        return midnight + random.randint(0, 86400 - 1)

    def __iter__(self):
        return self

    def __next__(self):  # pylint: disable=no-member
        return json.dumps({
            'environment': 'production',
            'method': 'GET',
            # Nested dicts
            'cookies': {
                'session': secrets.token_urlsafe(16),
                'persistent': secrets.token_urlsafe(16),
            },
            'run_id': self.faker.uuid4(),
            'type': self._gen_event_type(),
            'user_agent': self._gen_user_agent(),
            'ip_address': self.faker.ipv4_public(),
            'timestamp': self._gen_timestamp(),
            'url': '/' + self.faker.uri_path(),
            # like any good production system, we throw some random PII
            # in our behavioral events
            'name': self.faker.name(),
            'email': self.faker.ascii_email(),
            # Nested lists
            'location': list(self.faker.location_on_land(coords_only=False)),
        })
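Because EventGenerator implements __iter__/__next__ without ever raising StopIteration, it yields events forever; consumers must bound it themselves. A small usage sketch (the date and event count are arbitrary):

import datetime
import itertools

gen = EventGenerator(start_date=datetime.date(2020, 1, 1))
for event_json in itertools.islice(gen, 5):  # take five synthetic events
    print(event_json)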
def random_pick():
    # device_types and os are assumed to be module-level lists of valid
    # choices, e.g. ['desktop', 'tablet', 'smartphone'] and
    # ['windows', 'mac', 'linux'].
    r_agent = Randomize()
    return r_agent.random_agent(device_type=random.choice(device_types),
                                os=random.choice(os))
class SlavesAPI:
    def __init__(self, app_auth: str) -> None:
        self.app_auth = app_auth
        self.user_agent = Randomize()
        self.me: Optional["User"] = None
        self.slaves: Optional[List["User"]] = None
        self._error_handler = ErrorHandler
        self._log = logging.getLogger("vkslaves")

    async def accept_duel(self, id: int, rps_type: RpsTypes) -> DuelAcceptResponse:
        """Accept duel request (rock-paper-scissors game)

        :param int id: Duel request id
        :param RpsTypes rps_type: Your move
        :return DuelAcceptResponse: Game result
        """
        req = await self.request("GET", "acceptDuel", {"id": id, "rps_type": rps_type})
        return DuelAcceptResponse(**req)

    async def buy_fetter(self, slave_id: int) -> User:
        """Buy fetter for your slave

        :param int slave_id: Id of your slave
        :return User: Slave data
        """
        self._log.debug(f"Buying fetter for {slave_id}")
        req = await self.request("POST", "buyFetter", {"slave_id": slave_id})
        return User(**req)

    async def buy_slave(self, slave_id: int) -> User:
        """Buy slave

        :param int slave_id: ID of the user you want to buy
        :return User: Your data
        """
        self._log.debug(f"Buying {slave_id}")
        req = await self.request("POST", "buySlave", {"slave_id": slave_id})
        return User(**req)

    async def create_duel(self, user_id: int, amount: int, rps_type: RpsTypes) -> Duel:
        """Create duel request (rock-paper-scissors game)

        :param int user_id: Opponent id
        :param int amount: Bet
        :param RpsTypes rps_type: Your move
        :return Duel: Game object
        """
        req = await self.request(
            "GET", "createDuel",
            {"user_id": user_id, "amount": amount, "rps_type": rps_type},
        )
        return Duel(**req)

    async def groups_as_slaves(self) -> List[User]:
        """Doesn't work yet

        :return List[User]: List of user objects
        """
        req = await self.request("GET", "groupAsSlaves")
        return [User(**item) for item in req["slaves"]]

    async def job_slave(self, name: str, slave_id: int) -> User:
        """Give a job to a slave

        :param str name: Job name
        :param int slave_id: Id of your slave
        :return User: Slave data
        """
        self._log.debug(f"Setting job {name} for {slave_id}")
        req = await self.request("POST", "jobSlave", {"name": name, "slave_id": slave_id})
        return User(**req["slave"])

    async def reject_duel(self, id: int) -> DuelRejectResponse:
        """Reject duel request (rock-paper-scissors game)

        :param int id: Duel request id
        :return DuelRejectResponse:
        """
        req = await self.request("POST", "rejectDuel", {"id": id})
        return DuelRejectResponse(**req["slave"])

    async def slave_list(self, id: int) -> List[User]:
        """Get a list of a user's slaves

        :param int id: User id
        :return List[User]: List of the user's slaves
        """
        req = await self.request("GET", "slaveList", {"id": id})
        return [User(**item) for item in req["slaves"]]

    async def sell_slave(self, slave_id: int) -> BalanceResponse:
        """Sell your slave

        :param int slave_id: ID of the slave you want to sell
        :return BalanceResponse:
        """
        self._log.debug(f"Selling {slave_id}")
        req = await self.request("POST", "saleSlave", {"slave_id": slave_id})
        return BalanceResponse(**req)

    async def start(self, post=0) -> StartResponse:
        """Start app request

        :param int post: Referral id
        :return StartResponse:
        """
        self._log.debug("Updating data")
        req = StartResponse(**(await self.request("GET", "start", {"post": post})))
        self.me = req.me
        self.slaves = req.slaves
        return req

    async def top_friends(self, ids: List[int]) -> List[TopResponseItem]:
        """Get top of your friends

        :param List[int] ids: Your friends' ids
        :return List[TopResponseItem]:
        """
        req = await self.request("POST", "topFriends", {"ids": ids})
        return [TopResponseItem(**item) for item in req["list"]]

    async def top_users(self) -> List[TopResponseItem]:
        """Get top of all users

        :return List[TopResponseItem]:
        """
        req = await self.request("GET", "topUsers")
        return [TopResponseItem(**item) for item in req["list"]]

    async def transactions(self) -> List[Transaction]:
        """Get your transactions

        :return List[Transaction]:
        """
        req = await self.request("GET", "transactions")
        return [Transaction(**item) for item in req["list"]]

    async def transfer_money(self, id: int, amount: int) -> BalanceResponse:
        """Give your money to another user

        :param int id: User id
        :param int amount: Amount to transfer
        :return BalanceResponse: Your balance
        """
        req = await self.request("POST", "user", {"id": id, "amount": amount})
        return BalanceResponse(**req)

    async def user(self, id: int) -> User:
        """Get info about a user

        :param int id: User id
        :return User: User data
        """
        req = await self.request("GET", "user", {"id": id})
        return User(**req)

    async def users(self, ids: List[int]) -> List[User]:
        """Get info about users (max 5000)

        :param List[int] ids: IDs of users
        :return List[User]: List of user data
        """
        req = await self.request("POST", "user", {"ids": ids})
        return [User(**item) for item in req["users"]]

    async def request(self, method: str, path: str, data: dict = None) -> Optional[dict]:
        params = {"params": data} if method == "GET" else {"json": data}
        headers = {
            "authorization": "Bearer " + self.app_auth,
            "content-type": "application/json",
            # A fresh random desktop/Windows UA on every call.
            "user-agent": self.user_agent.random_agent("desktop", "windows"),
            "origin": PROD_SERVER,
            "referer": PROD_SERVER,
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            # Preflight the endpoint, then issue the real request.
            async with session.request("OPTIONS", API_URL + path):
                async with session.request(method, API_URL + path, **params) as response:
                    return self._error_handler.check(await response.text())
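All of SlavesAPI's methods are coroutines, so they must run inside an event loop. A minimal usage sketch (the bearer token is a placeholder):

import asyncio

async def main():
    api = SlavesAPI(app_auth="YOUR_APP_AUTH")  # placeholder token
    state = await api.start()
    print(state.me)

asyncio.run(main())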
class Scraper(object):
    def __init__(self, proxies):
        self.proxies = proxies
        self.ip_check_urls = ["https://wtfismyip.com/json"]
        self.r_agent = Randomize()
        self.sessions = self.prepare_sessions()

    def get_ip_details(self, session):
        # Sanity-check which IP address the proxy exposes.
        for url in self.ip_check_urls:
            check_ip = session.get(url)
            if check_ip.status_code == 200:
                print(check_ip.text)
                break

    def prepare_sessions(self):
        print("Initializing Scraper and Preparing Sessions")
        sessions = []
        for each_proxy in self.proxies:
            proxy = {
                "http": "http://{}".format(each_proxy),
                "https": "https://{}".format(each_proxy),
            }
            _session = requests.Session()
            _session.headers["User-Agent"] = self.r_agent.random_agent(
                'desktop', 'windows')
            _session.proxies = proxy
            # The request counter is useful for assessing proxy quality;
            # `active` is checked later by get_best_session().
            _session.requests_count = 0
            _session.active = True
            sessions.append(_session)
        return sessions

    def get_best_session(self):
        # Least-used active session wins.
        filtered_sessions = list(filter(lambda x: x.active, self.sessions))
        return min(filtered_sessions, key=lambda session: session.requests_count)

    def make_request(self, url, method="GET", headers={}, data={}, request_error=False):
        _response = None
        current_session = self.get_best_session()
        print(current_session.requests_count, url)
        current_session.requests_count += 1
        try:
            if method == "GET":
                _response = current_session.get(
                    url, timeout=5, headers=headers,
                    auth=HTTPProxyAuth('bbercaw10', 'RU9EFHLx'))
            elif method == "POST":
                _response = current_session.post(
                    url, timeout=5, headers=headers,
                    auth=HTTPProxyAuth('bbercaw10', 'RU9EFHLx'))
            if _response:
                if _response.status_code == 503:
                    # Throttled: change the user agent, pause, then retry.
                    # pause() is assumed to be attached to the session elsewhere.
                    current_session.headers["User-Agent"] = \
                        self.r_agent.random_agent('desktop', 'windows')
                    current_session.pause()
                    return self.make_request(url, method, headers, data)
                if _response.status_code == 407:
                    # Proxy authentication error: stop the bot.
                    raise SystemExit
        except ConnectionError as ce:
            if (isinstance(ce.args[0], MaxRetryError)
                    and isinstance(ce.args[0].reason, ProxyError)):
                print("Could not connect to Proxy, removing the current session")
                self.sessions.remove(current_session)
        except Exception as e:
            print("Error occurred")
            print(current_session.proxies, url, e)
            if not request_error:
                print("Retrying request")
                return self.make_request(url, method, headers, data,
                                         request_error=True)
        return _response
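Driving the Scraper is just construction plus make_request(); a sketch with a placeholder proxy (203.0.113.10 is a reserved documentation address):

scraper = Scraper(proxies=["203.0.113.10:8080"])  # placeholder host:port
response = scraper.make_request("https://wtfismyip.com/json")
if response is not None:
    print(response.status_code)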
from zipcodes import l  # local module providing the list of zip codes
import time
import requests
from tqdm import tqdm
from random_useragent.random_useragent import Randomize
import random

r_agent = Randomize()
prefix = "https://www.redfin.com"

for zipcode in tqdm(sorted(l)):
    # Set up agent and timer for each iteration.
    zipcode_str = str(zipcode).zfill(5)
    ua = r_agent.random_agent('desktop', 'windows')

    # Get webpage.
    address = prefix + "/zipcode/" + zipcode_str
    res = requests.get(address, headers={'User-Agent': ua}).content
    time.sleep(1 + random.uniform(1, 3))

    # Parse webpage and get csv address.
    webpage = str(res)
    end = webpage.find('" class="downloadLink"')
    begin = webpage.find('/stingray/api/gis-csv?')
    csv_address = prefix + webpage[begin:end]
    # Un-escape the HTML-encoded query string ('&amp;' -> '&').
    csv_address = csv_address.replace('\n', '').replace('&amp;', '&')
    # Download csv
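    # Hedged sketch of the download step; the per-zip-code filename scheme
    # below is an assumption, not part of the original script.
    csv_res = requests.get(csv_address, headers={'User-Agent': ua})
    if csv_res.status_code == 200:
        with open(zipcode_str + ".csv", "wb") as handle:
            handle.write(csv_res.content)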