def run(self):
    print("Generating... ", end="")
    ua = shadow_useragent.ShadowUserAgent()
    userAgent = ua.percent(0.05)
    url = 'https://texttospeech.googleapis.com/v1beta1/text:synthesize?key=AIzaSyCpE0t4v_h4NTJbTSEIaAuLuV0FmzahJD0'
    headers = {
        'Host': 'texttospeech.googleapis.com',
        'User-Agent': userAgent,
        'Referer': 'https://www.voicebooking.com/ttsfr-v5/'
    }
    data = ('{"input":{"text":"' + self.text + '"},'
            '"voice":{"name":"' + self.voice + '","languageCode":"' + self.languageCode + '"},'
            '"audioConfig":{"audioEncoding":"LINEAR16","speakingRate":' + str(self.rate) +
            ',"pitch":' + str(self.pitch) + '}}')
    data = data.encode('utf-8')
    res = requests.post(url=url, data=data, headers=headers)
    if res.status_code == 200:  # ok
        content = json.loads(res.content.decode('utf-8'))
        binaryAudio = content["audioContent"]
        decode_string = base64.b64decode(binaryAudio)
        with open(self.fileName, "wb") as wav_file:
            wav_file.write(decode_string)
        print("Done!")
        if self.play:
            print("Playing...", end="")
            playsound(self.fileName)
            print("Done!")
    else:
        print("Failed!")
def get_driver(self):
    self.display.start()
    user_agent = shadow_useragent.ShadowUserAgent()
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--user-agent=%s' % user_agent.most_common)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # chrome_options.add_extension(self.proxy_extension())
    driver = webdriver.Chrome(options=chrome_options)
    driver.implicitly_wait(15)
    return driver
def scrape_single_url(url, proxies, user_agent, date_scraped):
    ua = shadow_useragent.ShadowUserAgent()
    agent = ua.percent(0.05)
    headers = {'user-agent': user_agent}
    proxies = {'http': random.choice(PROXIE_LIST)}
    try:
        req = requests.get(url=url, proxies=proxies, headers=headers, timeout=6)
        req.encoding = 'ISO-8859-1'
        html = req.text
        if req.url != url:
            print("Course Removed")
            none = ["None"] * len(new_cols)
            return dict(zip(new_cols, none))
        if req.status_code == 200:
            data = scrape_html(html, date_scraped)
            return data
        elif req.status_code == 403:
            print(req.status_code)
            time.sleep(1800)
            return scrape_single_url(url, proxies, agent, date_scraped)
        else:
            print(req.status_code)
            time.sleep(3)
            return scrape_single_url(url, proxies, agent, date_scraped)
    except Exception as e:
        print(e)
        time.sleep(2)
        return scrape_single_url(url, proxies, agent, date_scraped)
def pull_html(url, user_agent):
    # ugent = shadow_useragent.ShadowUserAgent()
    # user_agent = ugent.percent(0.05)
    driver = build_driver(user_agent)
    driver.get(url)
    element = WebDriverWait(driver, 6).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='course-price-text price-text--base-price__discount--1J7vF price-text--black--1qJbH price-text--medium--2clK9 price-text--bold--ldWad']"))
    )
    category = WebDriverWait(driver, 6).until(
        EC.presence_of_element_located((By.XPATH, "//a[@class='btn btn-quaternary btn-xs']"))
    )
    html = driver.page_source
    test_soup = bs.BeautifulSoup(html, 'lxml')
    test_search = test_soup.find('div', attrs={'class': re.compile('curriculum-course-card--container*')})
    if test_search is None:
        print("BLOCKED ON " + url)
        driver.quit()
        ugent = shadow_useragent.ShadowUserAgent()
        user_agent = ugent.percent(0.05)
        return pull_html(url, user_agent)
    return driver, html
def task(csv, proxies):
    ua = shadow_useragent.ShadowUserAgent()
    agent = ua.percent(0.05)
    today = date.today()
    date_string = today.strftime("%m/%d/%y")
    df = pull_info(csv, proxies, agent, date_string)
import requests
from bs4 import BeautifulSoup
import shadow_useragent

ua = shadow_useragent.ShadowUserAgent()
my_user_agent = ua.percent(0.05)

protocols = ["HTTPS", "HTTP", "SOCKS5"]

headers = {
    'User-Agent': '{}'.format(my_user_agent),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

url = "https://spys.one"
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
proxies = []


def listOfProxies(n=0):
    for line in soup.find_all('tr', {"class": "spy1xx"}):
        for k, td in enumerate(line.find_all('td')[:2]):
            # Assumption: collect the text of the first two cells of each
            # proxy row into the proxies list.
            proxies.append(td.get_text(strip=True))
def __init__(self):
    # Get available useragents
    self.ua = shadow_useragent.ShadowUserAgent()
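The snippets in this collection rely on a handful of ShadowUserAgent accessors; a minimal standalone sketch of the ones they use (percent, most_common, random), assuming they behave as shown above and below, is:

import shadow_useragent

# Sketch of the shadow_useragent accessors used throughout these snippets.
ua = shadow_useragent.ShadowUserAgent()

print(ua.percent(0.05))  # a user agent seen in at least 5% of observed traffic
print(ua.most_common)    # the single most common user agent
print(ua.random)         # a user agent chosen at random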
async def get_car_list(url=None, is_all_pages=False):
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            # proxy={"server": "", "username": "", "password": "",},
            args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
                "--disable-accelerated-2d-canvas",
                "--no-first-run",
                "--no-zygote",
                "--single-process",
                "--disable-gpu",
            ],
        )
        user_agent = shadow_useragent.ShadowUserAgent()
        context = await browser.newContext(
            userAgent=user_agent.random,
            ignoreHTTPSErrors=True,
            # viewport={"width": 1920, "height": 1080},
        )
        page = await context.newPage()
        await page.goto(url)
        try:
            await page.selectOption(".top [name='serverSideDataTable_length']", "100")
            await page.waitForFunction(
                "document.querySelector('#serverSideDataTable_processing').style.cssText == 'display: none;'"
            )
            if page.url != "https://www.copart.com/notfound-error":
                next_status = await page.evaluate(
                    """document.querySelector('#serverSideDataTable_next').getAttribute('class')"""
                )
                car_list = []
                if is_all_pages and next_status != "paginate_button next disabled":
                    page_numbers = await page.evaluate(
                        """document.querySelector('#serverSideDataTable_last>a').getAttribute('data-dt-idx')"""
                    )
                    for i in range(int(page_numbers) - 3):
                        car_list = await get_row_data(page, car_list)
                        if (await page.evaluate(
                                """document.querySelector('#serverSideDataTable_next').getAttribute('class')"""
                        ) != "paginate_button next disabled"):
                            await page.click("#serverSideDataTable_next>a")
                            await page.waitForFunction(
                                "document.querySelector('#serverSideDataTable_processing').style.cssText == 'display: none;'"
                            )
                    return car_list
                else:
                    car_list = await get_row_data(page, car_list)
                    return car_list
            else:
                return 404
        except Exception:
            return 404
        finally:
            await browser.close()
async def get_car_info(lot_id, member=False):
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            # proxy={"server": "", "username": "", "password": "",},
            args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
                "--disable-accelerated-2d-canvas",
                "--no-first-run",
                "--no-zygote",
                "--single-process",
                "--disable-gpu",
            ],
        )
        user_agent = shadow_useragent.ShadowUserAgent()
        context = await browser.newContext(
            userAgent=user_agent.random,
            ignoreHTTPSErrors=True,
            # viewport={"width": 1920, "height": 1080},
        )
        page = await context.newPage()

        async def login(page):
            await page.goto("https://www.copart.com/login/")
            await page.type("input#username", "")
            await page.type("input#password", "")
            await page.click(".loginfloatright.margin15")
            await page.waitForSelector(".welcomeMsg")

        if member:
            await login(page)

        url = f"https://www.copart.com/lot/{lot_id}"
        await page.goto(url)
        if page.url != "https://www.copart.com/notfound-error":
            keys = list(
                filter(
                    lambda x: x != "Notes:",
                    [
                        await page.evaluate(
                            "(elem) => elem.innerText.replace(':', '')", v)
                        for v in await page.querySelectorAll(
                            ".lot-detail-section label")
                    ],
                ))
            values = [
                await page.evaluate("(elem) => elem.innerText", v)
                for v in await page.querySelectorAll(".lot-detail-section label+span")
            ]
            car_info = dict(zip(keys, values))
            car_info["Bid Price"] = await get_text(page, ".bid-price")
            car_info["Sale Location"] = await get_text(
                page,
                ".panel.clr [data-uname='lotdetailSaleinformationlocationvalue']",
            )
            car_info["Sale Date"] = await get_text(
                page,
                "[data-uname='lotdetailSaleinformationsaledatevalue'] [ng-if^='validateDate']",
            )
            response = car_info
        else:
            response = 404
        await browser.close()
        return response
#!/usr/bin/env python3
import requests
import shadow_useragent

URL = "https://launchpad.binance.com/gateway-api/v1/public/launchpool/project/list"  # nopep8

data = requests.get(url=URL, headers={
    "Accept": "application/json",
    "User-Agent": shadow_useragent.ShadowUserAgent().most_common
}).json()

# Filter projects that are available (aka "MINING"), with an annual rate (APY).
projects = filter(
    lambda elem: elem["status"] == "MINING" and elem["annualRate"] is not None,
    data["data"]["tracking"]["list"])

# Get assets for which a Launchpool is currently in progress.
assetsDict = dict()
assetsSet = set()
for project in projects:
    assetsDict[project["projectId"]] = dict(project)
    assetsSet.add(project["asset"])

ASSET_INPUT = ""
while ASSET_INPUT.upper() not in assetsSet:
    # Fetch user input for asset, and set default if none.
    ASSET_INPUT = input("Select your asset ({}): ".format(", ".join(
        sorted(assetsSet)))) or "BNB"
def task(topic, job_list):
    ugent = shadow_useragent.ShadowUserAgent()
    user_agent = ugent.percent(0.05)
    url = topics[topic]
    drv, dframe = scrape(topic, url, user_agent)
    dframe.to_csv('./out/' + topic + '.csv', index=None, header=True, encoding='utf-8')
class LogicImmoScraping(scrapy.Spider):
    """ Scraper logic-immo.com """
    offers_scrap_nb = 0
    offers_already_listed = 0
    name = 'logic-immo'
    name_csv = "logic_immo.csv"
    start_scrap_time = time.time()

    # Define User Agent
    ua = shadow_useragent.ShadowUserAgent()
    my_user_agent = ua.percent(0.03)
    headers = {'User-Agent': '{}'.format(my_user_agent)}

    fieldnames = ['id', 'area', 'rooms', 'district', 'price']
    if path.isfile(name_csv):
        mode = 'a'
        csv_file = open(name_csv, mode, newline='')
        writer = csv.DictWriter(csv_file, fieldnames)
    else:
        mode = 'w'
        csv_file = open(name_csv, mode, newline='')
        writer = csv.DictWriter(csv_file, fieldnames)
        writer.writeheader()

    def start_requests(self):
        url = "https://www.logic-immo.com/appartement-paris/vente-appartement-paris-75-100_1.html"
        yield scrapy.http.Request(url, headers=self.headers)

    def parse(self, response):
        list_urls = []
        # retrieving number of the last page
        total_pages = response.xpath('//div[@class="numbers"]')
        last_page_nb = total_pages.xpath('.//a[last()]/text()').get()
        # add url of the first page
        list_urls.append(
            'https://www.logic-immo.com/appartement-paris/vente-appartement-paris-75-100_1.html'
        )
        # add url of each page from page 2 to the latest
        for i in range(2, int(last_page_nb)):
            list_urls.append(
                f'https://www.logic-immo.com/appartement-paris/vente-appartement-paris-75-100_1-{i}.html'
            )
        for url in list_urls:
            yield response.follow(url,
                                  callback=self.parse_page,
                                  headers=self.headers,
                                  dont_filter=True)

    def parse_page(self, response):
        id_offers_list = []
        format_prices = []
        districts = []
        links_offers = response.xpath(
            '//div[starts-with(@id,"header-offer")]/@id').extract()
        offers_prices = response.xpath(
            '//p[@class="offer-price"]/span/text()').extract()
        areas = response.xpath(
            '//span[@class="offer-area-number"]/text()').extract()
        rooms = response.xpath(
            '//span[@class="offer-details-caracteristik--rooms"]/span["offer-rooms-number"]/text()'
        ).extract()
        bedrooms = response.xpath(
            '//span[@class="offer-details-caracteristik--bedrooms"]/span["offer-rooms-number"]/text()'
        ).extract()
        postal_codes = response.xpath('//div[@class="offer-details-location"]')

        for link_offer in links_offers:
            id_offer = link_offer.replace('header-offer-', '')
            id_offers_list.append(id_offer)

        for price in offers_prices:
            format_prices.append(price.replace('€', '').replace(' ', ''))

        for postal_code_str in postal_codes:
            soup = BeautifulSoup(postal_code_str.get(), 'html.parser')
            text = soup.get_text().replace('\n', ' ')
            postal_code = re.findall('[0-9]{5}', text)
            # keep 2 last numbers to get district
            districts.append(postal_code[0][-2:])

        for id_offer, area, nb_rooms, district, price in zip(
                id_offers_list, areas, rooms, districts, format_prices):
            scraped_info = {
                'id': id_offer,
                'area': int(area),
                'rooms': int(nb_rooms),
                'district': district,
                'price': int(price)
            }
            # Create a new csv
            if self.mode == 'w':
                self.writer.writerow(scraped_info)
                self.offers_scrap_nb += 1
            else:
                # Open csv with scraped data
                df = pd.read_csv(self.name_csv)
                # Check if offer is already listed in our dataset
                if any(df['id'] == scraped_info['id']):
                    self.offers_already_listed += 1
                else:
                    self.writer.writerow(scraped_info)
                    self.offers_scrap_nb += 1

    def closed(self, response):
        interval = time.time() - self.start_scrap_time
        print("End of Scraping")
        print(f"{self.offers_scrap_nb} offers added")
        print(f"{self.offers_already_listed} offers already listed in .csv")
        print(f'Elapsed time for scraping : {round(interval, 2)} seconds')