Example #1
File: main.py Project: NeilSCGH/TTS
    def run(self):
        print("Generating... ", end="")

        ua = shadow_useragent.ShadowUserAgent()
        userAgent = ua.percent(0.05)

        url = 'https://texttospeech.googleapis.com/v1beta1/text:synthesize?key=AIzaSyCpE0t4v_h4NTJbTSEIaAuLuV0FmzahJD0'
        headers = {
            'Host': 'texttospeech.googleapis.com',
            'User-Agent': userAgent,
            'Referer': 'https://www.voicebooking.com/ttsfr-v5/'
        }

        # json.dumps escapes any quotes in self.text; float() keeps rate/pitch as JSON numbers
        payload = {"input": {"text": self.text},
                   "voice": {"name": self.voice, "languageCode": self.languageCode},
                   "audioConfig": {"audioEncoding": "LINEAR16",
                                   "speakingRate": float(self.rate), "pitch": float(self.pitch)}}
        data = json.dumps(payload).encode('utf-8')
        res = requests.post(url=url, data=data, headers=headers)

        if res.status_code == 200:  #ok
            content = res.content
            content = json.loads(content.decode('utf-8'))
            binaryAudio = content["audioContent"]
            decode_string = base64.b64decode(binaryAudio)

            # Write the decoded audio bytes to disk
            with open(self.fileName, "wb") as wav_file:
                wav_file.write(decode_string)
            print("Done!")

            if self.play:
                print("Playing...", end="")
                playsound(self.fileName)
                print("Done!")
        else:
            print("Failed!")
Example #2
    def get_driver(self):
        self.display.start()
        user_agent = shadow_useragent.ShadowUserAgent()

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--user-agent=%s' % user_agent.most_common)
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        # chrome_options.add_extension(self.proxy_extension())

        driver = webdriver.Chrome(options=chrome_options)
        driver.implicitly_wait(15)

        return driver
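get_driver references self.display and webdriver without showing where they come from; here is a minimal sketch of the surrounding setup, assuming pyvirtualdisplay provides the display (the class layout and size values are placeholders, not from the original project):

# Hypothetical context for get_driver(); Display and its arguments are assumptions
import shadow_useragent
from pyvirtualdisplay import Display
from selenium import webdriver

class Scraper:
    def __init__(self):
        # Virtual X display so Chrome can run on a server without a real screen
        self.display = Display(visible=0, size=(1366, 768))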
Example #3
def scrape_single_url(url, proxies, user_agent, date_scraped):
    ua = shadow_useragent.ShadowUserAgent()
    agent = ua.percent(0.05)
    headers = {'user-agent': user_agent}
        
    proxies = { 'http' : random.choice(PROXIE_LIST) }
    
    try:
        ua = shadow_useragent.ShadowUserAgent()
        agent = ua.percent(0.05)
        headers = {'user-agent': user_agent}
        
        proxies = { 'http' : random.choice(PROXIE_LIST) }
        
        req = requests.get(url=url, proxies=proxies, headers=headers, timeout=6)
        req.encoding = 'ISO-8859-1'
        html = req.text
        if (req.url != url):
            print "Course Removed"
            none = ["None"] * len(new_cols)
            return dict(zip(new_cols,none))
        
        if (req.status_code == 200): 
            data = scrape_html(html, date_scraped)
            return data
        elif req.status_code == 403:
            print(req.status_code)
            time.sleep(1800)
            return scrape_single_url(url, proxies, agent, date_scraped)
        else:
            print(req.status_code)
            time.sleep(3)
            return scrape_single_url(url, proxies, agent, date_scraped)
    except Exception as e:
        print(e)
        time.sleep(2)
        return scrape_single_url(url, proxies, agent, date_scraped)
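The 403 branch above retries by recursing with no depth limit, so a long block can exhaust the stack. A bounded, iterative version of the same retry/back-off pattern is sketched below; max_retries and the function name are assumptions rather than part of the original project, and the redirect check is omitted for brevity:

# Sketch: same status-code handling as scrape_single_url, but with a retry cap (names are assumptions)
import time
import requests

def scrape_with_retries(url, proxies, user_agent, date_scraped, max_retries=5):
    headers = {'user-agent': user_agent}
    for _ in range(max_retries):
        try:
            req = requests.get(url=url, proxies=proxies, headers=headers, timeout=6)
            if req.status_code == 200:
                return scrape_html(req.text, date_scraped)  # scrape_html comes from the example above
            # A 403 suggests the proxy/user agent is blocked, so wait much longer before retrying
            time.sleep(1800 if req.status_code == 403 else 3)
        except Exception:
            time.sleep(2)
    return None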
Example #4
def pull_html(url, user_agent):
#     ugent = shadow_useragent.ShadowUserAgent()
#     user_agent = ugent.percent(0.05)
    driver = build_driver(user_agent)
    driver.get(url)
    element = WebDriverWait(driver,6).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='course-price-text price-text--base-price__discount--1J7vF price-text--black--1qJbH price-text--medium--2clK9 price-text--bold--ldWad']"))
    )
    category = WebDriverWait(driver,6).until(
        EC.presence_of_element_located((By.XPATH, "//a[@class='btn btn-quaternary btn-xs']"))
    )
    html = driver.page_source
    test_soup = bs.BeautifulSoup(html, 'lxml')
    test_search = test_soup.find('div', attrs = {'class': re.compile('curriculum-course-card--container*')})
    if test_search is None:
        print "BLOCKED ON " + url
        driver.quit()
        ugent = shadow_useragent.ShadowUserAgent()
        user_agent = ugent.percent(0.05)
        pull_html(url,user_agent)
    driver.quit()
    return driver, html
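build_driver is not shown in this excerpt; a minimal sketch that mirrors the get_driver example earlier on this page follows (only the function name comes from the call above, the options are assumptions, and the WebDriverWait/EC/By/bs4/re imports are likewise assumed to exist at module level):

# Hypothetical build_driver(); options mirror the earlier get_driver example
from selenium import webdriver

def build_driver(user_agent):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--user-agent=%s' % user_agent)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    driver.implicitly_wait(15)
    return driver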
Example #5
def task(csv, proxies):
    ua = shadow_useragent.ShadowUserAgent()
    agent = ua.percent(0.05)
    today = date.today()
    date_string = today.strftime("%m/%d/%y")
    df = pull_info(csv, proxies, agent, date_string)
Example #6
import requests
from bs4 import BeautifulSoup
import shadow_useragent

ua = shadow_useragent.ShadowUserAgent()
my_user_agent = ua.percent(0.05)

protocols = ["HTTPS", "HTTP", "SOCKS5"]

headers = {
    'User-Agent': '{}'.format(my_user_agent),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

url = "https://spys.one"

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')

proxies = []


def listOfProxies(n=0):
    for line in soup.find_all('tr', {"class": "spy1xx"}):
        for k, td in enumerate(line.find_all('td')[:2]):
Example #7
    def __init__(self):
        # Get available useragents
        self.ua = shadow_useragent.ShadowUserAgent()
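The instance stored here exposes a few selection helpers; the sketch below only uses accessors that appear in the other examples on this listing (most_common, random, and percent), and the comments paraphrase how those examples use them:

# Accessors of ShadowUserAgent seen elsewhere on this page
import shadow_useragent

ua = shadow_useragent.ShadowUserAgent()
print(ua.most_common)    # the most commonly observed user agent string
print(ua.random)         # a random user agent string
print(ua.percent(0.05))  # a user agent drawn from those above the given usage-share threshold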
Example #8
async def get_car_list(url=None, is_all_pages=False):
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            # proxy={"server": "", "username": "", "password": "",},
            args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
                "--disable-accelerated-2d-canvas",
                "--no-first-run",
                "--no-zygote",
                "--single-process",
                "--disable-gpu",
            ],
        )

        user_agent = shadow_useragent.ShadowUserAgent()
        context = await browser.newContext(
            userAgent=user_agent.random,
            ignoreHTTPSErrors=True,
            # viewport={"width": 1920, "height": 1080},
        )

        page = await context.newPage()
        await page.goto(url)

        try:
            await page.selectOption(".top [name='serverSideDataTable_length']",
                                    "100")
            await page.waitForFunction(
                "document.querySelector('#serverSideDataTable_processing').style.cssText == 'display: none;'"
            )

            if page.url != "https://www.copart.com/notfound-error":
                next_status = await page.evaluate(
                    """document.querySelector('#serverSideDataTable_next').getAttribute('class')"""
                )
                car_list = []
                if is_all_pages and next_status != "paginate_button next disabled":
                    page_numbers = await page.evaluate(
                        """document.querySelector('#serverSideDataTable_last>a').getAttribute('data-dt-idx')"""
                    )

                    for i in range(int(page_numbers) - 3):
                        car_list = await get_row_data(page, car_list)
                        if (await page.evaluate(
                                """document.querySelector('#serverSideDataTable_next').getAttribute('class')"""
                        ) != "paginate_button next disabled"):
                            await page.click("#serverSideDataTable_next>a")
                            await page.waitForFunction(
                                "document.querySelector('#serverSideDataTable_processing').style.cssText == 'display: none;'"
                            )
                    return car_list
                else:
                    car_list = await get_row_data(page, car_list)
                    return car_list
            else:
                return 404
        except Exception:
            return 404
        finally:
            await browser.close()
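For context, the coroutine above could be invoked as follows; the search URL is a placeholder rather than a path taken from the original project, and asyncio.run requires Python 3.7+:

# Hypothetical invocation of get_car_list(); the URL below is a placeholder
import asyncio

cars = asyncio.run(get_car_list(url="https://www.copart.com/lotSearchResults", is_all_pages=False))
print("lot list not found" if cars == 404 else cars)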
Example #9
async def get_car_info(lot_id, member=False):
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            # proxy={"server": "", "username": "", "password": "",},
            args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
                "--disable-accelerated-2d-canvas",
                "--no-first-run",
                "--no-zygote",
                "--single-process",
                "--disable-gpu",
            ],
        )

        user_agent = shadow_useragent.ShadowUserAgent()
        context = await browser.newContext(
            userAgent=user_agent.random,
            ignoreHTTPSErrors=True,
            # viewport={"width": 1920, "height": 1080},
        )

        page = await context.newPage()

        async def login(page):
            await page.goto("https://www.copart.com/login/")
            await page.type("input#username", "")
            await page.type("input#password", "")
            await page.click(".loginfloatright.margin15")
            await page.waitForSelector(".welcomeMsg")

        if member:
            await login(page)

        url = f"https://www.copart.com/lot/{lot_id}"
        await page.goto(url)

        if page.url != "https://www.copart.com/notfound-error":
            keys = list(
                filter(
                    lambda x: x != "Notes:",
                    [
                        await page.evaluate(
                            "(elem) => elem.innerText.replace(':', '')", v)
                        for v in await page.querySelectorAll(
                            ".lot-detail-section label")
                    ],
                ))

            values = [
                await page.evaluate("(elem) => elem.innerText", v) for v in
                await page.querySelectorAll(".lot-detail-section label+span")
            ]

            car_info = {}
            car_info = dict(zip(keys, values))
            car_info["Bid Price"] = await get_text(page, ".bid-price")
            car_info["Sale Location"] = await get_text(
                page,
                ".panel.clr [data-uname='lotdetailSaleinformationlocationvalue']",
            )
            car_info["Sale Date"] = await get_text(
                page,
                "[data-uname='lotdetailSaleinformationsaledatevalue'] [ng-if^='validateDate']",
            )
            response = car_info
        else:
            response = 404

        await browser.close()

        return response
Example #10
#!/usr/bin/env python3

import requests
import shadow_useragent

URL = "https://launchpad.binance.com/gateway-api/v1/public/launchpool/project/list"  # nopep8
data = requests.get(
    url=URL,
    headers={
        "Accept": "application/json",
        "User-Agent": shadow_useragent.ShadowUserAgent().most_common,
    },
).json()

# Filter projects that are available (aka "MINING"), with an annual rate (APY).
projects = filter(
    lambda elem: elem["status"] == "MINING" and elem["annualRate"] is not None,
    data["data"]["tracking"]["list"])

# Get assets for which a Launchpool is currently in progress.
assetsDict = dict()
assetsSet = set()
for project in projects:
    assetsDict[project["projectId"]] = dict(project)
    assetsSet.add(project["asset"])

ASSET_INPUT = ""
while ASSET_INPUT.upper() not in assetsSet:
    # Fetch user input for asset, and set default if none.
    ASSET_INPUT = input("Select your asset ({}): ".format(", ".join(
        sorted(assetsSet)))) or "BNB"
Example #11
def task(topic, job_list):
    ugent = shadow_useragent.ShadowUserAgent()
    user_agent = ugent.percent(0.05)
    url = topics[topic]
    drv, dframe = scrape(topic, url, user_agent)
    dframe.to_csv('./out/' + topic + '.csv', index=None, header=True, encoding='utf-8')
Example #12
class LogicImmoScraping(scrapy.Spider):
    """ Scraper logic-immo.com """

    offers_scrap_nb = 0
    offers_already_listed = 0

    name = 'logic-immo'
    name_csv = "logic_immo.csv"
    start_scrap_time = time.time()

    # Define User Agent
    ua = shadow_useragent.ShadowUserAgent()
    my_user_agent = ua.percent(0.03)
    headers = {'User-Agent': '{}'.format(my_user_agent)}

    fieldnames = ['id', 'area', 'rooms', 'district', 'price']

    if path.isfile(name_csv):
        mode = 'a'
        csv_file = open(name_csv, mode, newline='')
        writer = csv.DictWriter(csv_file, fieldnames)
    else:
        mode = 'w'
        csv_file = open(name_csv, mode, newline='')
        writer = csv.DictWriter(csv_file, fieldnames)
        writer.writeheader()

    def start_requests(self):

        url = "https://www.logic-immo.com/appartement-paris/vente-appartement-paris-75-100_1.html"
        yield scrapy.http.Request(url, headers=self.headers)

    def parse(self, response):

        list_urls = []

        # retrieving number of the last page
        total_pages = response.xpath('//div[@class="numbers"]')
        last_page_nb = total_pages.xpath('.//a[last()]/text()').get()

        # add url of the first page
        list_urls.append(
            'https://www.logic-immo.com/appartement-paris/vente-appartement-paris-75-100_1.html'
        )

        # add url of each page from page 2 to the latest
        for i in range(2, int(last_page_nb) + 1):
            list_urls.append(
                f'https://www.logic-immo.com/appartement-paris/vente-appartement-paris-75-100_1-{i}.html'
            )

        for url in list_urls:
            yield response.follow(url,
                                  callback=self.parse_page,
                                  headers=self.headers,
                                  dont_filter=True)

    def parse_page(self, response):

        id_offers_list = []
        format_prices = []
        districts = []

        links_offers = response.xpath(
            '//div[starts-with(@id,"header-offer")]/@id').extract()
        offers_prices = response.xpath(
            '//p[@class="offer-price"]/span/text()').extract()
        areas = response.xpath(
            '//span[@class="offer-area-number"]/text()').extract()
        rooms = response.xpath(
            '//span[@class="offer-details-caracteristik--rooms"]/span["offer-rooms-number"]/text()'
        ).extract()
        bedrooms = response.xpath(
            '//span[@class="offer-details-caracteristik--bedrooms"]/span["offer-rooms-number"]/text()'
        ).extract()
        postal_codes = response.xpath('//div[@class="offer-details-location"]')

        for link_offer in links_offers:
            id_offer = link_offer.replace('header-offer-', '')
            id_offers_list.append(id_offer)

        for price in offers_prices:
            format_prices.append(price.replace('€', '').replace(' ', ''))

        for postal_code_str in postal_codes:
            soup = BeautifulSoup(postal_code_str.get(), 'html.parser')
            text = soup.get_text().replace('\n', ' ')
            postal_code = re.findall('[0-9]{5}', text)
            # keep 2 last numbers to get district
            districts.append(postal_code[0][-2:])

        for id_offer, area, nb_rooms, district, price in zip(
                id_offers_list, areas, rooms, districts, format_prices):
            scraped_info = {
                'id': id_offer,
                'area': int(area),
                'rooms': int(nb_rooms),
                'district': district,
                'price': int(price)
            }

            # Create a new csv
            if self.mode == 'w':
                self.writer.writerow(scraped_info)
                self.offers_scrap_nb += 1
            else:
                # Open csv with scraped data
                df = pd.read_csv(self.name_csv)
                # Check if offer is already listed in our dataset
                if any(df['id'] == scraped_info['id']):
                    self.offers_already_listed += 1
                else:
                    self.writer.writerow(scraped_info)
                    self.offers_scrap_nb += 1

    def closed(self, reason):
        interval = time.time() - self.start_scrap_time
        print("End of Scraping")
        print(f"{self.offers_scrap_nb} offers added")
        print(f"{self.offers_already_listed} offers already listed in .csv")
        print(f'Elapsed time for scraping : {round(interval,2)} seconds')
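To run this spider programmatically rather than through the scrapy CLI, a CrawlerProcess works; this is only a sketch and assumes the class above is importable along with its usual module-level imports (scrapy, csv, time, pandas, BeautifulSoup, re, os.path, shadow_useragent):

# Sketch: running LogicImmoScraping with Scrapy's CrawlerProcess
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(LogicImmoScraping)
process.start()  # blocks until the crawl finishes and closed() prints its summary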