def open_browser(self):
    """Open ``self.url`` in ``self.browser`` using a randomized user agent.

    A random Chrome user agent string (Windows or Linux) is generated and,
    when the driver exposes Chrome DevTools commands, installed with
    ``Network.setUserAgentOverride`` before navigating. Per the original
    author's note, the target site appears to reject Selenium/automation
    tools, hence the obscured user agent.

    Raises:
        AssertionError: if 'Alamo' is not part of the tab title once the
            page has been requested.
    """
    user_agent_rotator = UserAgent(
        software_names=[SoftwareName.CHROME.value],
        operating_systems=[
            OperatingSystem.WINDOWS.value,
            OperatingSystem.LINUX.value,
        ],
        limit=100,
    )
    user_agent = user_agent_rotator.get_random_user_agent()

    # Install the override before navigating so the very first request
    # already carries it. Drivers without CDP support are left untouched.
    if hasattr(self.browser, 'execute_cdp_cmd'):
        self.browser.execute_cdp_cmd('Network.setUserAgentOverride',
                                     {'userAgent': user_agent})

    # Open the browser to the designated URL.
    self.browser.get(self.url)

    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if 'Alamo' not in self.browser.title:
        raise AssertionError(
            "expected 'Alamo' in the tab title, got %r" % (self.browser.title,))
def get_user_agents():
    """Return a small header dict built around a random user agent string.

    A random Chrome-on-Linux user agent is drawn from a pool of up to 100
    candidates.

    Returns:
        dict: ``{'userAgent': 'python 3.7.5', 'platform': <random UA string>}``.
    """
    user_agent_rotator = UserAgent(
        software_names=[SoftwareName.CHROME.value],
        operating_systems=[OperatingSystem.LINUX.value],
        limit=100,
    )
    user_agent = user_agent_rotator.get_random_user_agent()

    # NOTE(review): the fixed string sits under 'userAgent' and the random
    # user agent under 'platform'. This looks swapped; confirm against the
    # consumer of this dict before changing it.
    return {'userAgent': 'python 3.7.5', 'platform': user_agent}
class RandomUserAgent:
    """Pool of Chrome user agents (Windows/Linux) with random selection."""

    def __init__(self):
        """Load up to 100 matching user agent records into ``self.user_agents``."""
        self.software_names = [SoftwareName.CHROME.value]
        self.os = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
        # The keyword must be `operating_systems` (plural); a misspelt
        # keyword is ignored, which leaves the OS filter switched off.
        self.user_agent_rotator = UserAgent(
            software_names=self.software_names,
            operating_systems=self.os,
            limit=100,
        )
        self.user_agents = self.user_agent_rotator.get_user_agents()

    def get_random_number(self):
        """Return a random valid index into ``self.user_agents``.

        Raises:
            ValueError: if ``self.user_agents`` is empty.
        """
        # Bounded by the real pool size, not a fixed 0..99 range, so a pool
        # with fewer than 100 entries cannot yield an out-of-range index.
        return random.randrange(len(self.user_agents))

    def get_random_user_agents(self):
        """Return the ``"user_agent"`` string of one randomly chosen record."""
        return self.user_agents[self.get_random_number()]["user_agent"]
def getRandomUserAgent(self):
    """Return a random user agent string and log it at debug level.

    The string is drawn from up to 100 candidates covering Chrome, Firefox,
    Edge, Internet Explorer and Android software on Windows or Linux.
    """
    user_agent_rotator = UserAgent(
        software_names=[
            SoftwareName.CHROME.value,
            SoftwareName.FIREFOX.value,
            SoftwareName.EDGE.value,
            SoftwareName.INTERNET_EXPLORER.value,
            SoftwareName.ANDROID.value,
        ],
        operating_systems=[
            OperatingSystem.WINDOWS.value,
            OperatingSystem.LINUX.value,
        ],
        limit=100,
    )
    user_agent = user_agent_rotator.get_random_user_agent()
    # Lazy %-formatting: the message is only built when debug is enabled.
    self.__log__.debug('using user agent: %s', user_agent)
    return user_agent
def get_user_agent_driver():
    """Create a headless Chrome WebDriver that uses a random user agent.

    Returns:
        The ``webdriver.Chrome`` instance started from
        ``../../data/chromedriver.exe`` with the configured options.
    """
    user_agent_rotator = UserAgent(
        software_names=[SoftwareName.CHROME.value],
        operating_systems=[
            OperatingSystem.WINDOWS.value,
            OperatingSystem.LINUX.value,
        ],
        limit=100,
    )
    user_agent1 = user_agent_rotator.get_random_user_agent()

    options = Options()
    # Switches must begin with two ASCII hyphens. An em dash followed by a
    # hyphen is not recognised by Chrome, so the browser would start with a
    # visible window and none of these settings applied.
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1420,1080')
    options.add_argument(f'user-agent={user_agent1}')

    driver = webdriver.Chrome('../../data/chromedriver.exe', options=options)
    return driver
print('start ' + str(datetime.now())) # In[71]: # you can also import SoftwareEngine, HardwareType, SoftwareType, Popularity from random_user_agent.params # you can also set number of user agents required by providing `limit` as parameter software_names = [SoftwareName.CHROME.value] operating_systems = [ OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value ] user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=100) # Get list of user agents. user_agents = user_agent_rotator.get_user_agents() user_agent = user_agent_rotator.get_random_user_agent() # Get Random User Agent String. def new_agent(): return user_agent_rotator.get_random_user_agent() # In[72]: chromedriver = "/Applications/chromedriver" # path to the chromedriver executable os.environ["webdriver.chrome.driver"] = chromedriver reviews = "https://apps.apple.com/us/app/eastmeeteast-1-asian-dating/id890664813#see-all/reviews" #user_agent = {'User-agent': user_agent} driver = webdriver.Chrome(chromedriver)
"download.default_directory": f"{path_zip}", "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing_for_trusted_sources_enabled": False, "safebrowsing.enabled": False }) options.add_argument('--disable-gpu') options.add_argument('--disable-software-rasterizer') software_names = [SoftwareName.CHROME.value] operating_systems = [ OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value ] user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=100) user_agents = user_agent_rotator.get_user_agents() # Get list of user agents. user_agent = user_agent_rotator.get_random_user_agent( ) # Get Random User Agent String. driver = webdriver.Chrome(chrome_options=options, executable_path=r'chromedriver.exe') driver.execute_cdp_cmd( f'Network.setUserAgentOverride', {"userAgent": f'{user_agent_rotator.get_random_user_agent()}'}) print(driver.execute_script("return navigator.userAgent;") ) # This would change the user string of the driver try: driver.get('https://github.com/' + repo + '/archive/master.zip') time_to_wait = 1200 time_counter = 0 while not os.path.exists(path_zip + repo.split('/')[1] + "-master.zip"):
def main(image_id, image, excel):
    """Copy one obd-memorial.ru document to a new Google Drive folder.

    A Drive folder named ``"<image_id> <timestamp>"`` is created under
    ``id_root_folder``. Depending on the flags, the document's scans are
    downloaded and uploaded as ``<id>.jpg`` and/or the per-person rows
    returned by ``get_info`` are collected into one workbook that is
    uploaded as ``<last item id>.xlsx``.

    Args:
        image_id: Document id used to build the info and image-list URLs.
            ``None`` returns immediately without any request.
        image: Truthy to download and upload the scans.
        excel: Truthy to build and upload the workbook.

    Returns:
        A 2-tuple of strings:
        * ``('Ссылка на каталог -', '')`` when ``image_id`` is ``None``;
        * ``('no folder', 'Запись сводного документа не найдена')`` when the
          site did not set the expected session cookie (the Drive folder is
          deleted again);
        * ``(web_link, name_folder_save)`` when Drive lists a file whose name
          contains the folder name;
        * ``('no folder', 'records not found')`` otherwise.
    """
    if image_id is None:
        return 'Ссылка на каталог -', ''

    user_agent_rotator = UserAgent(
        software_names=[SoftwareName.CHROME.value],
        operating_systems=[
            OperatingSystem.WINDOWS.value,
            OperatingSystem.LINUX.value,
        ],
        limit=100,
    )

    info_url = 'https://obd-memorial.ru/html/info.htm?id={}'.format(image_id)
    img_info = 'https://obd-memorial.ru/html/getimageinfo?id={}'.format(
        image_id)
    print(info_url)
    res1 = requests.get(info_url, allow_redirects=True)

    # NOTE(review): this temporary directory is never removed; consider
    # cleaning it up once the uploads have finished.
    dirpath = tempfile.mkdtemp()
    print('dirpath = ' + dirpath)

    # Create the Drive folder up front, once per call.
    name_folder_save = (str(image_id) + ' ' +
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print('name_folder_save = ' + name_folder_save)
    file_metadata = {
        'name': name_folder_save,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [id_root_folder]
    }
    result = service.files().create(body=file_metadata,
                                    fields='id,webViewLink').execute()
    id_folder_save = result['id']  # id of the folder to save into
    web_link = result['webViewLink']  # link to the folder

    if res1.status_code == 307:
        print(res1.status_code)
        print('*****************')

    session_cookie = '3fbe47cd30daea60fc16041479413da2'
    if session_cookie not in res1.cookies:
        # Without the session cookie there is nothing to fetch: remove the
        # folder that was just created.
        result = service.files().delete(fileId=id_folder_save).execute()
        print('*****************************************')
        print(' delete catalog = ' + name_folder_save)
        pp.pprint(result)
        return 'no folder', 'Запись сводного документа не найдена'

    cookies = {
        session_cookie: res1.cookies[session_cookie],
        'JSESSIONID': res1.cookies['JSESSIONID'],
    }

    # Load the list of image ids that make up the document.
    response = requests.get(img_info, cookies=cookies)
    response_dict = json.loads(response.text)
    print('response_dict = ' + str(len(response_dict)))

    if excel:
        row_num = 1
        workbook = Workbook()
        worksheet = workbook.active  # active worksheet/tab
        worksheet.title = 'Person'
        for col_num, column_title in enumerate(cols, 1):
            worksheet.cell(row=row_num, column=col_num).value = column_title

    # Walk the scan ids. The last id seen names the workbook below.
    last_item_id = None
    for item in response_dict:
        item_id = str(item['id'])
        last_item_id = item_id

        if excel:
            for person_id in item['mapData']:
                row_num += 1
                row = get_info(item['id'], person_id, cookies)
                for col_num, cell_value in enumerate(row, 1):
                    worksheet.cell(row=row_num,
                                   column=col_num).value = cell_value

        if not image:
            continue

        id_hash = getStringHash(item['id'])
        img_url = ("https://obd-memorial.ru/html/images3?id=" + item_id +
                   "&id1=" + id_hash + "&path=" + item['img'])
        headers_302 = parse_file(BASE_DIR + '/header_302.txt')
        headers_302['Cookie'] = make_str_cookie(cookies)
        headers_302['Referer'] = info_url
        req302 = requests.get(img_url,
                              headers=headers_302,
                              cookies=cookies,
                              allow_redirects=False)
        if req302.status_code != 302:
            continue

        params = {'id': item_id, 'id1': id_hash, 'path': item['img']}
        headers_img = parse_file(BASE_DIR + '/header_img.txt')
        # `headers_img` is the dict that is sent below; writing to the
        # undefined name `header_img` raised NameError.
        headers_img['User-Agent'] = user_agent_rotator.get_random_user_agent()
        headers_img['Referer'] = info_url
        req_img = requests.get("https://cdn.obd-memorial.ru/html/images3",
                               headers=headers_img,
                               params=params,
                               cookies=cookies,
                               stream=True,
                               allow_redirects=False)
        if req_img.status_code != 200:
            continue

        location = os.path.join(dirpath, item_id + '.jpg')
        with open(location, 'wb') as image_file:
            image_file.write(req_img.content)

        file_metadata = {'name': item_id + '.jpg', 'parents': [id_folder_save]}
        try:
            media = MediaFileUpload(location,
                                    resumable=True,
                                    chunksize=-1,
                                    mimetype='image/jpg')
            service.files().create(body=file_metadata,
                                   media_body=media,
                                   fields='id').execute()
        except HttpError as e:
            # Best effort: a failed upload is reported and the loop goes on.
            print('ERROR *************************')
            print(e)
            if e.resp.status in [404]:
                # Candidate for restarting the upload from scratch.
                print("ERROR404 ********")
            elif e.resp.status in [500, 502, 503, 504]:
                # Candidate for a retry with exponential backoff.
                print("ERROR 50* ********")
            else:
                # Do not retry.
                # NOTE(review): 'OK' is printed for an unhandled error
                # status. That looks unintended; confirm.
                print('OK')
            print('ERROR *************************')

    # The workbook holds the rows of every item; it is uploaded once, named
    # after the last item. With an empty image list there is no id to name
    # it by, so the upload is skipped.
    if excel and last_item_id is not None:
        book_path = os.path.join(dirpath, last_item_id + '_book.xlsx')
        file_metadata = {
            'name': last_item_id + '.xlsx',
            'parents': [id_folder_save]
        }
        workbook.save(filename=book_path)
        media = MediaFileUpload(
            book_path,
            resumable=True,
            chunksize=-1,
            mimetype=
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )
        service.files().create(body=file_metadata,
                               media_body=media,
                               fields='id').execute()

    # Check whether anything carrying the folder name is visible on Drive.
    result = service.files().list(
        pageSize=1000,
        fields="nextPageToken, files(id, name, mimeType,webViewLink)",
        q=Template("name contains '$name_folder_save'").safe_substitute(
            name_folder_save=name_folder_save)).execute()
    if result['files']:
        return web_link, name_folder_save
    return 'no folder', 'records not found'